[Intel-gfx] [PATCH 09/15] drm/i915: Extend i915 perf framework for collecting timestamps on all gpu engines

Fri Nov 4 09:30:38 UTC 2016

From: Sourab Gupta <sourab.gupta at intel.com>

This patch extends the i915  perf framework to handle the perf sample
collection for any given gpu engine. Particularly, the support
for collecting timestamp sample type is added, which can be requested for
any engine.
With this, for RCS, timestamps and OA reports can be collected together,
and provided to userspace in separate sample fields. For other engines,
the capabilility to collect timestamps is added.

The thing to note is that, still only a single stream instance can be
opened at any particular time. Though that stream may now be opened for any
gpu engine, for collection of timestamp samples.

So, this patch doesn't add the support to open multiple concurrent streams,
as yet. Though it lays the groundwork for this support to be added
susequently. Part of this groundwork involves having separate command
stream buffers, per engine, for holding the samples generated.
Likewise for a few other data structures maintaining per-engine state.

Signed-off-by: Sourab Gupta <sourab.gupta at intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h  |  35 ++-
 drivers/gpu/drm/i915/i915_perf.c | 635 +++++++++++++++++++++++++--------------
 drivers/gpu/drm/i915/i915_reg.h  |   2 +
 include/uapi/drm/i915_drm.h      |   7 +
 4 files changed, 445 insertions(+), 234 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 0f171f8..a05335a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1814,7 +1814,8 @@ struct i915_perf_stream_ops {
 	 * Routine to emit the commands in the command streamer associated
 	 * with the corresponding gpu engine.
 	 */
-	void (*command_stream_hook)(struct drm_i915_gem_request *req, u32 tag);
+	void (*command_stream_hook)(struct i915_perf_stream *stream,
+				struct drm_i915_gem_request *req, u32 tag);
 };
 
 enum i915_perf_stream_state {
@@ -1839,6 +1840,9 @@ struct i915_perf_stream {
 	/* Whether command stream based data collection is enabled */
 	bool cs_mode;
 
+	/* Whether the OA unit is in use */
+	bool using_oa;
+
 	const struct i915_perf_stream_ops *ops;
 };
 
@@ -1870,7 +1874,16 @@ struct i915_oa_ops {
 struct i915_perf_cs_data_node {
 	struct list_head link;
 	struct drm_i915_gem_request *request;
-	u32 offset;
+
+	/* Offsets into the GEM obj holding the data */
+	u32 start_offset;
+	u32 oa_offset;
+	u32 ts_offset;
+
+	/* buffer size corresponding to this entry */
+	u32 size;
+
+	/* Other metadata */
 	u32 ctx_id;
 	u32 pid;
 	u32 tag;
@@ -2189,14 +2202,14 @@ struct drm_i915_private {
 
 		spinlock_t hook_lock;
 
-		struct {
-			struct i915_perf_stream *exclusive_stream;
 
-			u32 specific_ctx_id;
+		struct hrtimer poll_check_timer;
+		struct i915_perf_stream *exclusive_stream;
+		wait_queue_head_t poll_wq[I915_NUM_ENGINES];
+		atomic_t pollin[I915_NUM_ENGINES];
 
-			struct hrtimer poll_check_timer;
-			wait_queue_head_t poll_wq;
-			atomic_t pollin;
+		struct {
+			u32 specific_ctx_id;
 
 			bool periodic;
 			int period_exponent;
@@ -2241,13 +2254,13 @@ struct drm_i915_private {
 			u8 *addr;
 #define I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW (1<<0)
 			u32 status;
-		} command_stream_buf;
+		} command_stream_buf[I915_NUM_ENGINES];
 
 		u32 last_ctx_id;
 		u32 last_pid;
 		u32 last_tag;
-		struct list_head node_list;
-		spinlock_t node_list_lock;
+		struct list_head node_list[I915_NUM_ENGINES];
+		spinlock_t node_list_lock[I915_NUM_ENGINES];
 	} perf;
 
 	/* Abstract the submission mechanism (legacy ringbuffer or execlists) away */
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index ca523b1..516fd54 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -250,12 +250,17 @@ static u32 i915_perf_stream_paranoid = true;
 /* For determining the behavior on overflow of command stream samples */
 #define CMD_STREAM_BUF_OVERFLOW_ALLOWED
 
-/* Data common to periodic and RCS based samples */
-struct oa_sample_data {
+#define OA_ADDR_ALIGN 64
+#define TS_ADDR_ALIGN 8
+#define I915_PERF_TS_SAMPLE_SIZE 8
+
+/* Data common to all samples (periodic OA / CS based OA / Timestamps) */
+struct sample_data {
 	u32 source;
 	u32 ctx_id;
 	u32 pid;
 	u32 tag;
+	u64 ts;
 	const u8 *report;
 };
 
@@ -313,6 +318,7 @@ static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = {
 #define SAMPLE_CTX_ID		(1<<2)
 #define SAMPLE_PID		(1<<3)
 #define SAMPLE_TAG		(1<<4)
+#define SAMPLE_TS		(1<<5)
 
 struct perf_open_properties {
 	u32 sample_flags;
@@ -350,8 +356,9 @@ void i915_perf_command_stream_hook(struct drm_i915_gem_request *request,
 	mutex_lock(&dev_priv->perf.streams_lock);
 	list_for_each_entry(stream, &dev_priv->perf.streams, link) {
 		if ((stream->state == I915_PERF_STREAM_ENABLED) &&
-					stream->cs_mode)
-			stream->ops->command_stream_hook(request, tag);
+				stream->cs_mode &&
+				(stream->engine == engine->id))
+			stream->ops->command_stream_hook(stream, request, tag);
 	}
 	mutex_unlock(&dev_priv->perf.streams_lock);
 }
@@ -365,16 +372,15 @@ void i915_perf_command_stream_hook(struct drm_i915_gem_request *request,
  * eventually, when the request associated with new entry completes.
  */
 static void release_some_perf_entries(struct drm_i915_private *dev_priv,
-					u32 target_size)
+				enum intel_engine_id id, u32 target_size)
 {
 	struct i915_perf_cs_data_node *entry, *next;
-	u32 entry_size = dev_priv->perf.oa.oa_buffer.format_size;
 	u32 size = 0;
 
 	list_for_each_entry_safe
-		(entry, next, &dev_priv->perf.node_list, link) {
+		(entry, next, &dev_priv->perf.node_list[id], link) {
 
-		size += entry_size;
+		size += entry->size;
 		i915_gem_request_put(entry->request);
 		list_del(&entry->link);
 		kfree(entry);
@@ -392,43 +398,61 @@ static void release_some_perf_entries(struct drm_i915_private *dev_priv,
  * appropriate status flag is set, and function returns -ENOSPC.
  */
 static int insert_perf_entry(struct drm_i915_private *dev_priv,
+				struct i915_perf_stream *stream,
 				struct i915_perf_cs_data_node *entry)
 {
 	struct i915_perf_cs_data_node *first_entry, *last_entry;
-	int max_offset = dev_priv->perf.command_stream_buf.obj->base.size;
-	u32 entry_size = dev_priv->perf.oa.oa_buffer.format_size;
+	u32 sample_flags = stream->sample_flags;
+	enum intel_engine_id id = stream->engine;
+	int max_offset = dev_priv->perf.command_stream_buf[id].obj->base.size;
+	u32 offset, entry_size = 0;
+	bool sample_ts = false;
 	int ret = 0;
 
-	spin_lock(&dev_priv->perf.node_list_lock);
-	if (list_empty(&dev_priv->perf.node_list)) {
-		entry->offset = 0;
+	if (stream->sample_flags & SAMPLE_OA_REPORT)
+		entry_size += dev_priv->perf.oa.oa_buffer.format_size;
+	else if (sample_flags & SAMPLE_TS) {
+		/*
+		 * XXX: Since TS data can anyways be derived from OA report, so
+		 * no need to capture it for RCS engine, if capture oa data is
+		 * called already.
+		 */
+		entry_size += I915_PERF_TS_SAMPLE_SIZE;
+		sample_ts = true;
+	}
+
+	spin_lock(&dev_priv->perf.node_list_lock[id]);
+	if (list_empty(&dev_priv->perf.node_list[id])) {
+		offset = 0;
 		goto out;
 	}
 
-	first_entry = list_first_entry(&dev_priv->perf.node_list,
+	first_entry = list_first_entry(&dev_priv->perf.node_list[id],
 				       typeof(*first_entry), link);
-	last_entry = list_last_entry(&dev_priv->perf.node_list,
+	last_entry = list_last_entry(&dev_priv->perf.node_list[id],
 				     typeof(*last_entry), link);
 
-	if (last_entry->offset >= first_entry->offset) {
+	if (last_entry->start_offset >= first_entry->start_offset) {
 		/* Sufficient space available at the end of buffer? */
-		if (last_entry->offset + 2*entry_size < max_offset)
-			entry->offset = last_entry->offset + entry_size;
+		if (last_entry->start_offset + last_entry->size + entry_size
+							< max_offset)
+			offset = last_entry->start_offset + last_entry->size;
 		/*
 		 * Wraparound condition. Is sufficient space available at
 		 * beginning of buffer?
 		 */
-		else if (entry_size < first_entry->offset)
-			entry->offset = 0;
+		else if (entry_size < first_entry->start_offset)
+			offset = 0;
 		/* Insufficient space */
 		else {
 #ifdef CMD_STREAM_BUF_OVERFLOW_ALLOWED
-			u32 target_size = entry_size - first_entry->offset;
+			u32 target_size = entry_size -
+						first_entry->start_offset;
 
-			release_some_perf_entries(dev_priv, target_size);
-			entry->offset = 0;
+			release_some_perf_entries(dev_priv, id, target_size);
+			offset = 0;
 #else
-			dev_priv->perf.command_stream_buf.status |=
+			dev_priv->perf.command_stream_buf[id].status |=
 				I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW;
 			ret = -ENOSPC;
 			goto out_unlock;
@@ -436,19 +460,21 @@ static int insert_perf_entry(struct drm_i915_private *dev_priv,
 		}
 	} else {
 		/* Sufficient space available? */
-		if (last_entry->offset + 2*entry_size < first_entry->offset)
-			entry->offset = last_entry->offset + entry_size;
+		if (last_entry->start_offset + last_entry->size + entry_size
+						< first_entry->start_offset)
+			offset = last_entry->start_offset + last_entry->size;
 		/* Insufficient space */
 		else {
 #ifdef CMD_STREAM_BUF_OVERFLOW_ALLOWED
 			u32 target_size = entry_size -
-				(first_entry->offset - last_entry->offset -
-				entry_size);
+				(first_entry->start_offset -
+					last_entry->start_offset -
+					last_entry->size);
 
-			release_some_perf_entries(dev_priv, target_size);
-			entry->offset = last_entry->offset + entry_size;
+			release_some_perf_entries(dev_priv, id, target_size);
+			offset = last_entry->start_offset + last_entry->size;
 #else
-			dev_priv->perf.command_stream_buf.status |=
+			dev_priv->perf.command_stream_buf[id].status |=
 				I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW;
 			ret = -ENOSPC;
 			goto out_unlock;
@@ -457,45 +483,43 @@ static int insert_perf_entry(struct drm_i915_private *dev_priv,
 	}
 
 out:
-	list_add_tail(&entry->link, &dev_priv->perf.node_list);
+	entry->start_offset = offset;
+	entry->size = entry_size;
+	if (stream->sample_flags & SAMPLE_OA_REPORT) {
+		entry->oa_offset = offset;
+		/* Ensure 64 byte alignment of oa_offset */
+		entry->oa_offset = ALIGN(entry->oa_offset, OA_ADDR_ALIGN);
+		offset = entry->oa_offset +
+				dev_priv->perf.oa.oa_buffer.format_size;
+	}
+	if (sample_ts) {
+		entry->ts_offset = offset;
+		/* Ensure 8 byte alignment of ts_offset */
+		entry->ts_offset = ALIGN(entry->ts_offset, TS_ADDR_ALIGN);
+		offset = entry->ts_offset + I915_PERF_TS_SAMPLE_SIZE;
+	}
+
+	list_add_tail(&entry->link, &dev_priv->perf.node_list[id]);
 #ifndef CMD_STREAM_BUF_OVERFLOW_ALLOWED
 out_unlock:
 #endif
-	spin_unlock(&dev_priv->perf.node_list_lock);
+	spin_unlock(&dev_priv->perf.node_list_lock[id]);
 	return ret;
 }
 
-static void i915_perf_command_stream_hook_oa(struct drm_i915_gem_request *req,
-						u32 tag)
+static int i915_ring_stream_capture_oa(struct drm_i915_gem_request *req,
+				u32 offset)
 {
 	struct drm_i915_private *dev_priv = req->i915;
 	struct intel_ring *ring = req->ring;
-	struct i915_gem_context *ctx = req->ctx;
-	struct i915_perf_cs_data_node *entry;
 	u32 addr = 0;
 	int ret;
 
-	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
-	if (entry == NULL) {
-		DRM_ERROR("alloc failed\n");
-		return;
-	}
-
-	ret = insert_perf_entry(dev_priv, entry);
-	if (ret)
-		goto out_free;
-
 	ret = intel_ring_begin(req, 4);
 	if (ret)
-		goto out;
-
-	entry->ctx_id = ctx->hw_id;
-	entry->pid = current->pid;
-	entry->tag = tag;
-	i915_gem_request_assign(&entry->request, req);
+		return ret;
 
-	addr = dev_priv->perf.command_stream_buf.vma->node.start +
-		entry->offset;
+	addr = dev_priv->perf.command_stream_buf[RCS].vma->node.start + offset;
 
 	/* addr should be 64 byte aligned */
 	BUG_ON(addr & 0x3f);
@@ -512,18 +536,124 @@ static void i915_perf_command_stream_hook_oa(struct drm_i915_gem_request *req,
 		intel_ring_emit(ring, MI_NOOP);
 	}
 	intel_ring_advance(ring);
-	i915_vma_move_to_active(dev_priv->perf.command_stream_buf.vma, req,
+	return 0;
+}
+
+static int i915_ring_stream_capture_ts(struct drm_i915_gem_request *req,
+						u32 offset)
+{
+	struct drm_i915_private *dev_priv = req->i915;
+	enum intel_engine_id id = req->engine->id;
+	struct intel_ring *ring = req->ring;
+	u32 addr = 0;
+	int ret;
+
+	ret = intel_ring_begin(req, 6);
+	if (ret)
+		return ret;
+
+	addr = dev_priv->perf.command_stream_buf[id].vma->node.start + offset;
+
+	if (id == RCS) {
+		if (INTEL_INFO(dev_priv)->gen >= 8)
+			intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
+		else
+			intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5));
+		intel_ring_emit(ring,
+				PIPE_CONTROL_GLOBAL_GTT_IVB |
+				PIPE_CONTROL_TIMESTAMP_WRITE);
+		intel_ring_emit(ring, addr | PIPE_CONTROL_GLOBAL_GTT);
+		intel_ring_emit(ring, 0);
+		if (INTEL_INFO(dev_priv)->gen >= 8) {
+			intel_ring_emit(ring, 0);
+			intel_ring_emit(ring, 0);
+		} else {
+			intel_ring_emit(ring, 0);
+			intel_ring_emit(ring, MI_NOOP);
+		}
+	} else {
+		uint32_t cmd;
+
+		cmd = MI_FLUSH_DW + 1;
+		if (INTEL_INFO(dev_priv)->gen >= 8)
+			cmd += 1;
+
+		cmd |= MI_FLUSH_DW_OP_STAMP;
+
+		intel_ring_emit(ring, cmd);
+		intel_ring_emit(ring, addr | MI_FLUSH_DW_USE_GTT);
+		if (INTEL_INFO(dev_priv)->gen >= 8) {
+			intel_ring_emit(ring, 0);
+			intel_ring_emit(ring, 0);
+			intel_ring_emit(ring, 0);
+		} else {
+			intel_ring_emit(ring, 0);
+			intel_ring_emit(ring, 0);
+			intel_ring_emit(ring, MI_NOOP);
+		}
+		intel_ring_emit(ring, MI_NOOP);
+	}
+	intel_ring_advance(ring);
+
+	return 0;
+}
+
+static void i915_ring_stream_cs_hook(struct i915_perf_stream *stream,
+				struct drm_i915_gem_request *req, u32 tag)
+{
+	struct drm_i915_private *dev_priv = req->i915;
+	struct i915_gem_context *ctx = req->ctx;
+	enum intel_engine_id id = stream->engine;
+	u32 sample_flags = stream->sample_flags;
+	struct i915_perf_cs_data_node *entry;
+	int ret = 0;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (entry == NULL) {
+		DRM_ERROR("alloc failed\n");
+		return;
+	}
+
+	ret = insert_perf_entry(dev_priv, stream, entry);
+	if (ret)
+		goto err_free;
+
+	entry->ctx_id = ctx->hw_id;
+	entry->pid = current->pid;
+	entry->tag = tag;
+	i915_gem_request_assign(&entry->request, req);
+
+	if (sample_flags & SAMPLE_OA_REPORT) {
+		ret = i915_ring_stream_capture_oa(req, entry->oa_offset);
+		if (ret)
+			goto err_unref;
+	} else if (sample_flags & SAMPLE_TS) {
+		/*
+		 * XXX: Since TS data can anyways be derived from OA report, so
+		 * no need to capture it for RCS engine, if capture oa data is
+		 * called already.
+		 */
+		ret = i915_ring_stream_capture_ts(req, entry->ts_offset);
+		if (ret)
+			goto err_unref;
+	}
+
+
+	i915_vma_move_to_active(dev_priv->perf.command_stream_buf[id].vma, req,
 					EXEC_OBJECT_WRITE);
 	return;
-out:
-	spin_lock(&dev_priv->perf.node_list_lock);
+
+err_unref:
+	i915_gem_request_put(entry->request);
+	spin_lock(&dev_priv->perf.node_list_lock[id]);
 	list_del(&entry->link);
-	spin_unlock(&dev_priv->perf.node_list_lock);
-out_free:
+	spin_unlock(&dev_priv->perf.node_list_lock[id]);
+err_free:
 	kfree(entry);
 }
 
-static int i915_oa_rcs_wait_gpu(struct drm_i915_private *dev_priv)
+static int i915_ring_stream_wait_gpu(struct drm_i915_private *dev_priv,
+				enum intel_engine_id id)
 {
 	struct i915_perf_cs_data_node *last_entry = NULL;
 	struct drm_i915_gem_request *req = NULL;
@@ -534,14 +664,14 @@ static int i915_oa_rcs_wait_gpu(struct drm_i915_private *dev_priv)
 	 * implicitly wait for the prior submitted requests. The refcount
 	 * of the requests is not decremented here.
 	 */
-	spin_lock(&dev_priv->perf.node_list_lock);
+	spin_lock(&dev_priv->perf.node_list_lock[id]);
 
-	if (!list_empty(&dev_priv->perf.node_list)) {
-		last_entry = list_last_entry(&dev_priv->perf.node_list,
+	if (!list_empty(&dev_priv->perf.node_list[id])) {
+		last_entry = list_last_entry(&dev_priv->perf.node_list[id],
 			struct i915_perf_cs_data_node, link);
 		req = last_entry->request;
 	}
-	spin_unlock(&dev_priv->perf.node_list_lock);
+	spin_unlock(&dev_priv->perf.node_list_lock[id]);
 
 	if (!req)
 		return 0;
@@ -554,17 +684,18 @@ static int i915_oa_rcs_wait_gpu(struct drm_i915_private *dev_priv)
 	return 0;
 }
 
-static void i915_oa_rcs_free_requests(struct drm_i915_private *dev_priv)
+static void i915_ring_stream_free_requests(struct drm_i915_private *dev_priv,
+				enum intel_engine_id id)
 {
 	struct i915_perf_cs_data_node *entry, *next;
 
 	list_for_each_entry_safe
-		(entry, next, &dev_priv->perf.node_list, link) {
+		(entry, next, &dev_priv->perf.node_list[id], link) {
 		i915_gem_request_put(entry->request);
 
-		spin_lock(&dev_priv->perf.node_list_lock);
+		spin_lock(&dev_priv->perf.node_list_lock[id]);
 		list_del(&entry->link);
-		spin_unlock(&dev_priv->perf.node_list_lock);
+		spin_unlock(&dev_priv->perf.node_list_lock[id]);
 		kfree(entry);
 	}
 }
@@ -708,11 +839,11 @@ static int append_oa_status(struct i915_perf_stream *stream,
 }
 
 /**
- * Copies single OA report into userspace read() buffer.
+ * Copies single sample into userspace read() buffer.
  */
-static int append_oa_sample(struct i915_perf_stream *stream,
+static int append_sample(struct i915_perf_stream *stream,
 			    char __user *buf, size_t count,
-			    size_t *offset, struct oa_sample_data *data)
+			    size_t *offset, struct sample_data *data)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -755,6 +886,12 @@ static int append_oa_sample(struct i915_perf_stream *stream,
 		buf += 4;
 	}
 
+	if (sample_flags & SAMPLE_TS) {
+		if (copy_to_user(buf, &data->ts, I915_PERF_TS_SAMPLE_SIZE))
+			return -EFAULT;
+		buf += I915_PERF_TS_SAMPLE_SIZE;
+	}
+
 	if (sample_flags & SAMPLE_OA_REPORT) {
 		if (copy_to_user(buf, data->report, report_size))
 			return -EFAULT;
@@ -772,7 +909,7 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream,
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	u32 sample_flags = stream->sample_flags;
-	struct oa_sample_data data = { 0 };
+	struct sample_data data = { 0 };
 
 	if (sample_flags & SAMPLE_OA_SOURCE_INFO) {
 		enum drm_i915_perf_oa_event_source source;
@@ -803,10 +940,15 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream,
 	if (sample_flags & SAMPLE_TAG)
 		data.tag = dev_priv->perf.last_tag;
 
+	/* Derive timestamp from OA report, after scaling with the ts base */
+#warning "FIXME: append_oa_buffer_sample: derive the timestamp from OA report"
+	if (sample_flags & SAMPLE_TS)
+		data.ts = 0;
+
 	if (sample_flags & SAMPLE_OA_REPORT)
 		data.report = report;
 
-	return append_oa_sample(stream, buf, count, offset, &data);
+	return append_sample(stream, buf, count, offset, &data);
 }
 
 /**
@@ -927,7 +1069,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 		 * an invalid ID. It could be good to annotate these
 		 * reports with a _CTX_SWITCH_AWAY reason later.
 		 */
-		if (!dev_priv->perf.oa.exclusive_stream->ctx ||
+		if (!dev_priv->perf.exclusive_stream->ctx ||
 		    dev_priv->perf.oa.specific_ctx_id == ctx_id ||
 		    dev_priv->perf.oa.oa_buffer.last_ctx_id == ctx_id) {
 
@@ -938,7 +1080,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 			 * the switch-away reports with an invalid
 			 * context id to be recognisable by userspace.
 			 */
-			if (dev_priv->perf.oa.exclusive_stream->ctx &&
+			if (dev_priv->perf.exclusive_stream->ctx &&
 			    dev_priv->perf.oa.specific_ctx_id != ctx_id)
 				report32[2] = 0xffffffff;
 
@@ -1284,32 +1426,40 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
 }
 
 /**
- * Copies a command stream OA report into userspace read() buffer, while also
- * forwarding the periodic OA reports with timestamp lower than CS report.
+ * Copy one command stream report into userspace read() buffer.
+ * For OA reports, also forward the periodic OA reports with timestamp
+ * lower than current CS OA sample.
  *
  * NB: some data may be successfully copied to the userspace buffer
  * even if an error is returned, and this is reflected in the
  * updated @read_state.
  */
-static int append_oa_rcs_sample(struct i915_perf_stream *stream,
+static int append_one_cs_sample(struct i915_perf_stream *stream,
 				char __user *buf, size_t count,
 				size_t *offset,
 				struct i915_perf_cs_data_node *node)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
-	struct oa_sample_data data = { 0 };
-	const u8 *report = dev_priv->perf.command_stream_buf.addr +
-				node->offset;
+	enum intel_engine_id id = stream->engine;
+	struct sample_data data = { 0 };
 	u32 sample_flags = stream->sample_flags;
-	u32 report_ts;
-	int ret;
+	int ret = 0;
 
-	/* First, append the periodic OA samples having lower timestamps */
-	report_ts = *(u32 *)(report + 4);
-	ret = dev_priv->perf.oa.ops.read(stream, buf, count, offset,
-					report_ts, U32_MAX);
-	if (ret)
-		return ret;
+	if (sample_flags & SAMPLE_OA_REPORT) {
+		const u8 *report = dev_priv->perf.command_stream_buf[id].addr +
+				   node->oa_offset;
+		u32 sample_ts = *(u32 *)(report + 4);
+
+		data.report = report;
+
+		/* First, append the periodic OA samples having lower
+		 * timestamp values
+		 */
+		ret = dev_priv->perf.oa.ops.read(stream, buf, count, offset,
+						sample_ts, U32_MAX);
+		if (ret)
+			return ret;
+	}
 
 	if (sample_flags & SAMPLE_OA_SOURCE_INFO)
 		data.source = I915_PERF_OA_EVENT_SOURCE_RCS;
@@ -1329,25 +1479,38 @@ static int append_oa_rcs_sample(struct i915_perf_stream *stream,
 		dev_priv->perf.last_tag = node->tag;
 	}
 
-	if (sample_flags & SAMPLE_OA_REPORT)
-		data.report = report;
+	if (sample_flags & SAMPLE_TS) {
+		/* For RCS, if OA samples are also being collected, derive the
+		 * timestamp from OA report, after scaling with the TS base.
+		 * Else, forward the timestamp collected via command stream.
+		 */
+#warning "FIXME: append_one_cs_sample: derive the timestamp from OA report"
+		if (sample_flags & SAMPLE_OA_REPORT)
+			data.ts = 0;
+		else
+			data.ts = *(u64 *)
+				(dev_priv->perf.command_stream_buf[id].addr +
+					node->ts_offset);
+	}
+
 
-	return append_oa_sample(stream, buf, count, offset, &data);
+	return append_sample(stream, buf, count, offset, &data);
 }
 
 /**
- * Copies all OA reports into userspace read() buffer. This includes command
- * stream as well as periodic OA reports.
+ * Copies all samples into userspace read() buffer. This includes command
+ * stream samples as well as periodic OA reports (if enabled).
  *
  * NB: some data may be successfully copied to the userspace buffer
  * even if an error is returned, and this is reflected in the
  * updated @read_state.
  */
-static int oa_rcs_append_reports(struct i915_perf_stream *stream,
+static int append_command_stream_samples(struct i915_perf_stream *stream,
 				char __user *buf, size_t count, size_t *offset)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	struct i915_perf_cs_data_node *entry, *next;
+	enum intel_engine_id id = stream->engine;
 	LIST_HEAD(free_list);
 	int ret = 0;
 #ifndef CMD_STREAM_BUF_OVERFLOW_ALLOWED
@@ -1364,24 +1527,24 @@ static int oa_rcs_append_reports(struct i915_perf_stream *stream,
 				~I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW;
 	}
 #endif
-	spin_lock(&dev_priv->perf.node_list_lock);
-	if (list_empty(&dev_priv->perf.node_list)) {
-		spin_unlock(&dev_priv->perf.node_list_lock);
+	spin_lock(&dev_priv->perf.node_list_lock[id]);
+	if (list_empty(&dev_priv->perf.node_list[id])) {
+		spin_unlock(&dev_priv->perf.node_list_lock[id]);
 		goto pending_periodic;
 	}
 	list_for_each_entry_safe(entry, next,
-				 &dev_priv->perf.node_list, link) {
+				 &dev_priv->perf.node_list[id], link) {
 		if (!i915_gem_request_completed(entry->request))
 			break;
 		list_move_tail(&entry->link, &free_list);
 	}
-	spin_unlock(&dev_priv->perf.node_list_lock);
+	spin_unlock(&dev_priv->perf.node_list_lock[id]);
 
 	if (list_empty(&free_list))
 		goto pending_periodic;
 
 	list_for_each_entry_safe(entry, next, &free_list, link) {
-		ret = append_oa_rcs_sample(stream, buf, count, offset, entry);
+		ret = append_one_cs_sample(stream, buf, count, offset, entry);
 		if (ret)
 			break;
 
@@ -1391,14 +1554,15 @@ static int oa_rcs_append_reports(struct i915_perf_stream *stream,
 	}
 
 	/* Don't discard remaining entries, keep them for next read */
-	spin_lock(&dev_priv->perf.node_list_lock);
-	list_splice(&free_list, &dev_priv->perf.node_list);
-	spin_unlock(&dev_priv->perf.node_list_lock);
+	spin_lock(&dev_priv->perf.node_list_lock[id]);
+	list_splice(&free_list, &dev_priv->perf.node_list[id]);
+	spin_unlock(&dev_priv->perf.node_list_lock[id]);
 
 	return ret;
 
 pending_periodic:
-	if (!dev_priv->perf.oa.n_pending_periodic_samples)
+	if (!((stream->sample_flags & SAMPLE_OA_REPORT) &&
+			dev_priv->perf.oa.n_pending_periodic_samples))
 		return 0;
 
 	ret = dev_priv->perf.oa.ops.read(stream, buf, count, offset,
@@ -1427,15 +1591,16 @@ static enum cs_buf_data_state command_stream_buf_state(
 				struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	enum intel_engine_id id = stream->engine;
 	struct i915_perf_cs_data_node *entry = NULL;
 	struct drm_i915_gem_request *request = NULL;
 
-	spin_lock(&dev_priv->perf.node_list_lock);
-	entry = list_first_entry_or_null(&dev_priv->perf.node_list,
+	spin_lock(&dev_priv->perf.node_list_lock[id]);
+	entry = list_first_entry_or_null(&dev_priv->perf.node_list[id],
 			struct i915_perf_cs_data_node, link);
 	if (entry)
 		request = entry->request;
-	spin_unlock(&dev_priv->perf.node_list_lock);
+	spin_unlock(&dev_priv->perf.node_list_lock[id]);
 
 	if (!entry)
 		return CS_BUF_EMPTY;
@@ -1453,23 +1618,23 @@ static enum cs_buf_data_state command_stream_buf_state(
 static bool stream_have_data__unlocked(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
-	enum cs_buf_data_state cs_buf_state;
-	u32 num_samples, last_ts = 0;
-
-	/* Note: oa_buffer_num_samples() is ok to run unlocked as it just
-	 * performs mmio reads of the OA buffer head + tail pointers and
-	 * it's assumed we're handling some operation that implies the stream
-	 * can't be destroyed until completion (such as a read()) that ensures
-	 * the device + OA buffer can't disappear
-	 */
-	dev_priv->perf.oa.n_pending_periodic_samples = 0;
-	dev_priv->perf.oa.pending_periodic_ts = 0;
-	num_samples = dev_priv->perf.oa.ops.oa_buffer_num_samples(dev_priv,
-								&last_ts);
-	if (stream->cs_mode)
+	enum cs_buf_data_state cs_buf_state = CS_BUF_EMPTY;
+	u32 num_samples = 0, last_ts = 0;
+
+	if (stream->sample_flags & SAMPLE_OA_REPORT) {
+		/* Note: oa_buffer_num_samples() is ok to run unlocked as it
+		 * just performs mmio reads of the OA buffer head + tail
+		 * pointers and it's assumed we're handling some operation that
+		 * implies the stream can't be destroyed until completion (such
+		 * as a read()) that ensures the device + OA buffer can't
+		 * disappear
+		 */
+		dev_priv->perf.oa.n_pending_periodic_samples = 0;
+		dev_priv->perf.oa.pending_periodic_ts = 0;
+		num_samples = dev_priv->perf.oa.ops.oa_buffer_num_samples(
+							dev_priv, &last_ts);
+	} else if (stream->cs_mode)
 		cs_buf_state = command_stream_buf_state(stream);
-	else
-		cs_buf_state = CS_BUF_EMPTY;
 
 	/*
 	 * Note: We can safely forward the periodic OA samples in the case we
@@ -1481,9 +1646,13 @@ static bool stream_have_data__unlocked(struct i915_perf_stream *stream)
 	 */
 	switch (cs_buf_state) {
 	case CS_BUF_EMPTY:
-		dev_priv->perf.oa.n_pending_periodic_samples = num_samples;
-		dev_priv->perf.oa.pending_periodic_ts = last_ts;
-		return (num_samples != 0);
+		if (stream->sample_flags & SAMPLE_OA_REPORT) {
+			dev_priv->perf.oa.n_pending_periodic_samples =
+								num_samples;
+			dev_priv->perf.oa.pending_periodic_ts = last_ts;
+			return (num_samples != 0);
+		} else
+			return false;
 
 	case CS_BUF_HAVE_DATA:
 		return true;
@@ -1494,9 +1663,10 @@ static bool stream_have_data__unlocked(struct i915_perf_stream *stream)
 	}
 }
 
-static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
+static int i915_ring_stream_wait_unlocked(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	enum intel_engine_id id = stream->engine;
 	int ret;
 
 	/* We would wait indefinitly if periodic sampling is not enabled */
@@ -1504,25 +1674,25 @@ static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
 		return -EIO;
 
 	if (stream->cs_mode) {
-		ret = i915_oa_rcs_wait_gpu(dev_priv);
+		ret = i915_ring_stream_wait_gpu(dev_priv, id);
 		if (ret)
 			return ret;
 	}
 
-	return wait_event_interruptible(dev_priv->perf.oa.poll_wq,
+	return wait_event_interruptible(dev_priv->perf.poll_wq[id],
 					stream_have_data__unlocked(stream));
 }
 
-static void i915_oa_poll_wait(struct i915_perf_stream *stream,
+static void i915_ring_stream_poll_wait(struct i915_perf_stream *stream,
 			      struct file *file,
 			      poll_table *wait)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
-	poll_wait(file, &dev_priv->perf.oa.poll_wq, wait);
+	poll_wait(file, &dev_priv->perf.poll_wq[stream->engine], wait);
 }
 
-static int i915_oa_read(struct i915_perf_stream *stream,
+static int i915_ring_stream_read(struct i915_perf_stream *stream,
 			char __user *buf,
 			size_t count,
 			size_t *offset)
@@ -1530,24 +1700,27 @@ static int i915_oa_read(struct i915_perf_stream *stream,
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
 	if (stream->cs_mode)
-		return oa_rcs_append_reports(stream, buf, count, offset);
-	else
+		return append_command_stream_samples(stream, buf, count, offset);
+	else if (stream->sample_flags & SAMPLE_OA_REPORT)
 		return dev_priv->perf.oa.ops.read(stream, buf, count, offset,
 						U32_MAX, U32_MAX);
+	else
+		return -EINVAL;
 }
 
 static void
-free_command_stream_buf(struct drm_i915_private *dev_priv)
+free_command_stream_buf(struct drm_i915_private *dev_priv,
+				enum intel_engine_id id)
 {
 	mutex_lock(&dev_priv->drm.struct_mutex);
 
-	i915_gem_object_unpin_map(dev_priv->perf.command_stream_buf.obj);
-	__i915_vma_unpin(dev_priv->perf.command_stream_buf.vma);
-	i915_gem_object_put(dev_priv->perf.command_stream_buf.obj);
+	i915_gem_object_unpin_map(dev_priv->perf.command_stream_buf[id].obj);
+	__i915_vma_unpin(dev_priv->perf.command_stream_buf[id].vma);
+	i915_gem_object_put(dev_priv->perf.command_stream_buf[id].obj);
 
-	dev_priv->perf.command_stream_buf.obj = NULL;
-	dev_priv->perf.command_stream_buf.vma = NULL;
-	dev_priv->perf.command_stream_buf.addr = NULL;
+	dev_priv->perf.command_stream_buf[id].obj = NULL;
+	dev_priv->perf.command_stream_buf[id].vma = NULL;
+	dev_priv->perf.command_stream_buf[id].addr = NULL;
 
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 }
@@ -1569,16 +1742,13 @@ free_oa_buffer(struct drm_i915_private *i915)
 	mutex_unlock(&i915->drm.struct_mutex);
 }
 
-static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
+static void i915_ring_stream_destroy(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
-	BUG_ON(stream != dev_priv->perf.oa.exclusive_stream);
-
-	if (stream->cs_mode)
-		free_command_stream_buf(dev_priv);
+	BUG_ON(stream != dev_priv->perf.exclusive_stream);
 
-	if (dev_priv->perf.oa.oa_buffer.obj) {
+	if (stream->using_oa) {
 		dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
 
 		free_oa_buffer(dev_priv);
@@ -1587,7 +1757,10 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 		intel_runtime_pm_put(dev_priv);
 	}
 
-	dev_priv->perf.oa.exclusive_stream = NULL;
+	if (stream->cs_mode)
+		free_command_stream_buf(dev_priv, stream->engine);
+
+	dev_priv->perf.exclusive_stream = NULL;
 }
 
 static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
@@ -1620,9 +1793,7 @@ static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
 	 */
 	memset(dev_priv->perf.oa.oa_buffer.addr, 0, SZ_16M);
 
-	/* Maybe make ->pollin per-stream state if we support multiple
-	 * concurrent streams in the future. */
-	atomic_set(&dev_priv->perf.oa.pollin, false);
+	atomic_set(&dev_priv->perf.pollin[RCS], false);
 }
 
 static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
@@ -1658,9 +1829,7 @@ static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
 	 */
 	memset(dev_priv->perf.oa.oa_buffer.addr, 0, SZ_16M);
 
-	/* Maybe make ->pollin per-stream state if we support multiple
-	 * concurrent streams in the future. */
-	atomic_set(&dev_priv->perf.oa.pollin, false);
+	atomic_set(&dev_priv->perf.pollin[RCS], false);
 }
 
 static int alloc_obj(struct drm_i915_private *dev_priv,
@@ -1746,30 +1915,33 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
 	return 0;
 }
 
-static int alloc_command_stream_buf(struct drm_i915_private *dev_priv)
+static int alloc_command_stream_buf(struct drm_i915_private *dev_priv,
+					enum intel_engine_id id)
 {
 	struct drm_i915_gem_object *bo;
 	struct i915_vma *vma;
 	u8 *obj_addr;
 	int ret;
 
-	BUG_ON(dev_priv->perf.command_stream_buf.obj);
+	BUG_ON(dev_priv->perf.command_stream_buf[id].obj);
 
 	ret = alloc_obj(dev_priv, &bo, &vma, &obj_addr);
 	if (ret)
 		return ret;
 
-	dev_priv->perf.command_stream_buf.obj = bo;
-	dev_priv->perf.command_stream_buf.vma = vma; 
-	dev_priv->perf.command_stream_buf.addr = obj_addr;
-	if (WARN_ON(!list_empty(&dev_priv->perf.node_list)))
-		INIT_LIST_HEAD(&dev_priv->perf.node_list);
+	dev_priv->perf.command_stream_buf[id].obj = bo;
+	dev_priv->perf.command_stream_buf[id].vma = vma; 
+	dev_priv->perf.command_stream_buf[id].addr = obj_addr;
+	if (WARN_ON(!list_empty(&dev_priv->perf.node_list[id])))
+		INIT_LIST_HEAD(&dev_priv->perf.node_list[id]);
+
+	atomic_set(&dev_priv->perf.pollin[id], false);
 
 	DRM_DEBUG_DRIVER(
 		"command stream buf initialized, gtt offset = 0x%x, vaddr = %p",
 		 (unsigned int)
-		 dev_priv->perf.command_stream_buf.vma->node.start,
-		 dev_priv->perf.command_stream_buf.addr);
+		 dev_priv->perf.command_stream_buf[id].vma->node.start,
+		 dev_priv->perf.command_stream_buf[id].addr);
 
 	return 0;
 }
@@ -2031,14 +2203,14 @@ static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv)
 {
 	assert_spin_locked(&dev_priv->perf.hook_lock);
 
-	if (dev_priv->perf.oa.exclusive_stream->state !=
+	if (dev_priv->perf.exclusive_stream->state !=
 					I915_PERF_STREAM_DISABLED) {
 		unsigned long ctx_id = 0;
 
-		if (dev_priv->perf.oa.exclusive_stream->ctx)
+		if (dev_priv->perf.exclusive_stream->ctx)
 			ctx_id = dev_priv->perf.oa.specific_ctx_id;
 
-		if (dev_priv->perf.oa.exclusive_stream->ctx == NULL || ctx_id) {
+		if (dev_priv->perf.exclusive_stream->ctx == NULL || ctx_id) {
 			bool periodic = dev_priv->perf.oa.periodic;
 			u32 period_exponent = dev_priv->perf.oa.period_exponent;
 			u32 report_format = dev_priv->perf.oa.oa_buffer.format;
@@ -2103,14 +2275,15 @@ static void gen8_oa_enable(struct drm_i915_private *dev_priv)
 				   GEN8_OA_COUNTER_ENABLE);
 }
 
-static void i915_oa_stream_enable(struct i915_perf_stream *stream)
+static void i915_ring_stream_enable(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
-	dev_priv->perf.oa.ops.oa_enable(dev_priv);
+	if (stream->sample_flags & SAMPLE_OA_REPORT)
+		dev_priv->perf.oa.ops.oa_enable(dev_priv);
 
-	if (dev_priv->perf.oa.periodic)
-		hrtimer_start(&dev_priv->perf.oa.poll_check_timer,
+	if (stream->cs_mode || dev_priv->perf.oa.periodic)
+		hrtimer_start(&dev_priv->perf.poll_check_timer,
 			      ns_to_ktime(POLL_PERIOD),
 			      HRTIMER_MODE_REL_PINNED);
 }
@@ -2125,19 +2298,20 @@ static void gen8_oa_disable(struct drm_i915_private *dev_priv)
 	I915_WRITE(GEN8_OACONTROL, 0);
 }
 
-static void i915_oa_stream_disable(struct i915_perf_stream *stream)
+static void i915_ring_stream_disable(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
-	if (dev_priv->perf.oa.periodic)
-		hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer);
+	if (stream->cs_mode || dev_priv->perf.oa.periodic)
+		hrtimer_cancel(&dev_priv->perf.poll_check_timer);
 
 	if (stream->cs_mode) {
-		i915_oa_rcs_wait_gpu(dev_priv);
-		i915_oa_rcs_free_requests(dev_priv);
+		i915_ring_stream_wait_gpu(dev_priv, stream->engine);
+		i915_ring_stream_free_requests(dev_priv, stream->engine);
 	}
 
-	dev_priv->perf.oa.ops.oa_disable(dev_priv);
+	if (stream->sample_flags & SAMPLE_OA_REPORT)
+		dev_priv->perf.oa.ops.oa_disable(dev_priv);
 }
 
 static u64 oa_exponent_to_ns(struct drm_i915_private *dev_priv, int exponent)
@@ -2147,16 +2321,16 @@ static u64 oa_exponent_to_ns(struct drm_i915_private *dev_priv, int exponent)
 }
 
 static const struct i915_perf_stream_ops i915_oa_stream_ops = {
-	.destroy = i915_oa_stream_destroy,
-	.enable = i915_oa_stream_enable,
-	.disable = i915_oa_stream_disable,
-	.wait_unlocked = i915_oa_wait_unlocked,
-	.poll_wait = i915_oa_poll_wait,
-	.read = i915_oa_read,
-	.command_stream_hook = i915_perf_command_stream_hook_oa,
+	.destroy = i915_ring_stream_destroy,
+	.enable = i915_ring_stream_enable,
+	.disable = i915_ring_stream_disable,
+	.wait_unlocked = i915_ring_stream_wait_unlocked,
+	.poll_wait = i915_ring_stream_poll_wait,
+	.read = i915_ring_stream_read,
+	.command_stream_hook = i915_ring_stream_cs_hook,
 };
 
-static int i915_oa_stream_init(struct i915_perf_stream *stream,
+static int i915_ring_stream_init(struct i915_perf_stream *stream,
 			       struct drm_i915_perf_open_param *param,
 			       struct perf_open_properties *props)
 {
@@ -2165,15 +2339,16 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 						      SAMPLE_OA_SOURCE_INFO);
 	bool require_cs_mode = props->sample_flags & (SAMPLE_PID |
 						      SAMPLE_TAG);
-	bool cs_sample_data = props->sample_flags & SAMPLE_OA_REPORT;
+	bool cs_sample_data = props->sample_flags & (SAMPLE_OA_REPORT |
+							SAMPLE_TS);
 	int ret;
 
 	/* To avoid the complexity of having to accurately filter
 	 * counter reports and marshal to the appropriate client
 	 * we currently only allow exclusive access
 	 */
-	if (dev_priv->perf.oa.exclusive_stream) {
-		DRM_ERROR("OA unit already in use\n");
+	if (dev_priv->perf.exclusive_stream) {
+		DRM_ERROR("Stream already in use\n");
 		return -EBUSY;
 	}
 
@@ -2224,6 +2399,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 			return -EINVAL;
 		}
 		stream->engine= RCS;
+		stream->using_oa = true;
 
 		format_size =
 			dev_priv->perf.oa.oa_formats[props->oa_format].size;
@@ -2316,8 +2492,22 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 			require_cs_mode = true;
 	}
 
+	if (props->sample_flags & SAMPLE_TS) {
+		stream->sample_flags |= SAMPLE_TS;
+		stream->sample_size += I915_PERF_TS_SAMPLE_SIZE;
+
+		/*
+		 * NB: it's meaningful to request SAMPLE_TS with just CS
+		 * mode or periodic OA mode sampling but we don't allow
+		 * SAMPLE_TS without either mode
+		 */
+		if (!require_oa_unit)
+			require_cs_mode = true;
+	}
+
 	if (require_cs_mode && !props->cs_mode) {
-		DRM_ERROR("PID or TAG sampling require a ring to be specified");
+		DRM_ERROR(
+			"PID, TAG or TS sampling require a ring to be specified");
 		ret = -EINVAL;
 		goto cs_error;
 	}
@@ -2332,11 +2522,11 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 
 		/*
 		 * The only time we should allow enabling CS mode if it's not
-		 * strictly required, is if SAMPLE_CTX_ID has been requested
-		 * as it's usable with periodic OA or CS sampling.
+		 * strictly required, is if SAMPLE_CTX_ID  or SAMPLE_TS has been
+		 * requested, as they're usable with periodic OA or CS sampling.
 		 */
 		if (!require_cs_mode &&
-		    !(props->sample_flags & SAMPLE_CTX_ID)) {
+		    !(props->sample_flags & (SAMPLE_CTX_ID|SAMPLE_TS))) {
 			DRM_ERROR(
 				"Ring given without requesting any CS specific property");
 			ret = -EINVAL;
@@ -2344,6 +2534,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 		}
 
 		stream->cs_mode = true;
+		stream->engine = props->engine;
 
 		if (props->sample_flags & SAMPLE_PID) {
 			stream->sample_flags |= SAMPLE_PID;
@@ -2355,14 +2546,14 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 			stream->sample_size += 4;
 		}
 
-		ret = alloc_command_stream_buf(dev_priv);
+		ret = alloc_command_stream_buf(dev_priv, stream->engine);
 		if (ret)
 			goto cs_error;
 	}
 
 	stream->ops = &i915_oa_stream_ops;
 
-	dev_priv->perf.oa.exclusive_stream = stream;
+	dev_priv->perf.exclusive_stream = stream;
 
 	return 0;
 
@@ -2398,8 +2589,8 @@ i915_oa_legacy_context_pin_notify_locked(struct drm_i915_private *dev_priv,
 	if (dev_priv->perf.oa.ops.update_hw_ctx_id_locked == NULL)
 		return;
 
-	if (dev_priv->perf.oa.exclusive_stream &&
-	    dev_priv->perf.oa.exclusive_stream->ctx == ctx) {
+	if (dev_priv->perf.exclusive_stream &&
+	    dev_priv->perf.exclusive_stream->ctx == ctx) {
 		struct i915_vma *vma = ctx->engine[RCS].state;
 		u32 ctx_id = i915_ggtt_offset(vma);
 
@@ -2468,8 +2659,8 @@ void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req)
 	if (dev_priv->perf.oa.ops.legacy_ctx_switch_unlocked == NULL)
 		return;
 
-	if (dev_priv->perf.oa.exclusive_stream &&
-	    dev_priv->perf.oa.exclusive_stream->state !=
+	if (dev_priv->perf.exclusive_stream &&
+	    dev_priv->perf.exclusive_stream->state !=
 				I915_PERF_STREAM_DISABLED) {
 
 		/* XXX: We don't take a lock here and this may run
@@ -2626,21 +2817,19 @@ static ssize_t i915_perf_read(struct file *file,
 	}
 
 	if (ret >= 0) {
-		/* Maybe make ->pollin per-stream state if we support multiple
-		 * concurrent streams in the future. */
-		atomic_set(&dev_priv->perf.oa.pollin, false);
+		atomic_set(&dev_priv->perf.pollin[stream->engine], false);
 	}
 
 	return ret;
 }
 
-static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
+static enum hrtimer_restart poll_check_timer_cb(struct hrtimer *hrtimer)
 {
 	struct i915_perf_stream *stream;
 
 	struct drm_i915_private *dev_priv =
 		container_of(hrtimer, typeof(*dev_priv),
-			     perf.oa.poll_check_timer);
+			     perf.poll_check_timer);
 
 	/* No need to protect the streams list here, since the hrtimer is
 	 * disabled before the stream is removed from list, and currently a
@@ -2649,8 +2838,9 @@ static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
 	 */
 	list_for_each_entry(stream, &dev_priv->perf.streams, link) {
 		if (stream_have_data__unlocked(stream)) {
-			atomic_set(&dev_priv->perf.oa.pollin, true);
-			wake_up(&dev_priv->perf.oa.poll_wq);
+			atomic_set(&dev_priv->perf.pollin[stream->engine],
+					true);
+			wake_up(&dev_priv->perf.poll_wq[stream->engine]);
 		}
 	}
 
@@ -2674,7 +2864,7 @@ static unsigned int i915_perf_poll_locked(struct drm_i915_private *dev_priv,
 	 * the hrtimer/oa_poll_check_timer_cb to notify us when there are
 	 * samples to read.
 	 */
-	if (atomic_read(&dev_priv->perf.oa.pollin))
+	if (atomic_read(&dev_priv->perf.pollin[stream->engine]))
 		events |= POLLIN;
 
 	return events;
@@ -2862,7 +3052,7 @@ int i915_perf_open_ioctl_locked(struct drm_device *dev,
 	stream->dev_priv = dev_priv;
 	stream->ctx = specific_ctx;
 
-	ret = i915_oa_stream_init(stream, param, props);
+	ret = i915_ring_stream_init(stream, param, props);
 	if (ret)
 		goto err_alloc;
 
@@ -3007,21 +3197,12 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
 		case DRM_I915_PERF_PROP_ENGINE: {
 				unsigned int user_ring_id =
 					value & I915_EXEC_RING_MASK;
-				enum intel_engine_id engine;
 
 				if (user_ring_id > I915_USER_RINGS)
 					return -EINVAL;
 
-				/* XXX: Currently only RCS is supported.
-				 * Remove this check when support for other
-				 * engines is added
-				 */
-				engine = user_ring_map[user_ring_id];
-				if (engine != RCS)
-					return -EINVAL;
-
 				props->cs_mode = true;
-				props->engine = engine;
+				props->engine = user_ring_map[user_ring_id];
 			}
 			break;
 		case DRM_I915_PERF_PROP_SAMPLE_CTX_ID:
@@ -3033,6 +3214,9 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
 		case DRM_I915_PERF_PROP_SAMPLE_TAG:
 			props->sample_flags |= SAMPLE_TAG;
 			break;
+		case DRM_I915_PERF_PROP_SAMPLE_TS:
+			props->sample_flags |= SAMPLE_TS;
+			break;
 		case DRM_I915_PERF_PROP_MAX:
 			BUG();
 		}
@@ -3180,22 +3364,27 @@ static struct ctl_table dev_root[] = {
 
 void i915_perf_init(struct drm_i915_private *dev_priv)
 {
+	int i;
+
 	if (!(IS_HASWELL(dev_priv) ||
 	      IS_BROADWELL(dev_priv) || IS_CHERRYVIEW(dev_priv) ||
 	      IS_SKYLAKE(dev_priv)))
 		return;
 
-	hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
+	hrtimer_init(&dev_priv->perf.poll_check_timer,
 		     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	dev_priv->perf.oa.poll_check_timer.function = oa_poll_check_timer_cb;
-	init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
+	dev_priv->perf.poll_check_timer.function = poll_check_timer_cb;
+
+	for (i = 0; i < I915_NUM_ENGINES; i++) {
+		INIT_LIST_HEAD(&dev_priv->perf.node_list[i]);
+		spin_lock_init(&dev_priv->perf.node_list_lock[i]);
+		init_waitqueue_head(&dev_priv->perf.poll_wq[i]);
+	}
 
 	INIT_LIST_HEAD(&dev_priv->perf.streams);
-	INIT_LIST_HEAD(&dev_priv->perf.node_list);
 	mutex_init(&dev_priv->perf.lock);
 	mutex_init(&dev_priv->perf.streams_lock);
 	spin_lock_init(&dev_priv->perf.hook_lock);
-	spin_lock_init(&dev_priv->perf.node_list_lock);
 
 	if (IS_HASWELL(dev_priv)) {
 		dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index ef21d17..89a7078 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -413,6 +413,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   MI_FLUSH_DW_STORE_INDEX	(1<<21)
 #define   MI_INVALIDATE_TLB		(1<<18)
 #define   MI_FLUSH_DW_OP_STOREDW	(1<<14)
+#define   MI_FLUSH_DW_OP_STAMP		(3<<14)
 #define   MI_FLUSH_DW_OP_MASK		(3<<14)
 #define   MI_FLUSH_DW_NOTIFY		(1<<8)
 #define   MI_INVALIDATE_BSD		(1<<7)
@@ -496,6 +497,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   PIPE_CONTROL_TLB_INVALIDATE			(1<<18)
 #define   PIPE_CONTROL_MEDIA_STATE_CLEAR		(1<<16)
 #define   PIPE_CONTROL_QW_WRITE				(1<<14)
+#define   PIPE_CONTROL_TIMESTAMP_WRITE			(3<<14)
 #define   PIPE_CONTROL_POST_SYNC_OP_MASK                (3<<14)
 #define   PIPE_CONTROL_DEPTH_STALL			(1<<13)
 #define   PIPE_CONTROL_WRITE_FLUSH			(1<<12)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 15921c7..0dcc325 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1324,6 +1324,12 @@ enum drm_i915_perf_property_id {
 	 */
 	DRM_I915_PERF_PROP_SAMPLE_TAG,
 
+	/**
+	 * The value of this property set to 1 requests inclusion of timestamp
+	 * in the perf sample data.
+	 */
+	DRM_I915_PERF_PROP_SAMPLE_TS,
+
 	DRM_I915_PERF_PROP_MAX /* non-ABI */
 };
 
@@ -1392,6 +1398,7 @@ enum drm_i915_perf_record_type {
 	 *     { u32 ctx_id; } && DRM_I915_PERF_PROP_SAMPLE_CTX_ID
 	 *     { u32 pid; } && DRM_I915_PERF_PROP_SAMPLE_PID
 	 *     { u32 tag; } && DRM_I915_PERF_PROP_SAMPLE_TAG
+	 *     { u64 timestamp; } && DRM_I915_PERF_PROP_SAMPLE_TS
 	 *     { u32 oa_report[]; } && DRM_I915_PERF_PROP_SAMPLE_OA
 	 * };
 	 */
-- 
1.9.1