[Intel-gfx] [PATCH 06/16] drm/i915: Framework for capturing command stream based OA reports

Fri Apr 22 11:33:55 UTC 2016

From: Sourab Gupta <sourab.gupta at intel.com>

This patch introduces a framework to enable OA counter reports associated
with Render command stream. We can then associate the reports captured
through this mechanism with their corresponding context id's. This can be
further extended to associate any other metadata information with the
corresponding samples (since the association with Render command stream
gives us the ability to capture these information while inserting the
corresponding capture commands into the command stream).

The OA reports generated in this way are associated with a corresponding
workload, and thus can be used the delimit the workload (i.e. sample the
counters at the workload boundaries), within an ongoing stream of periodic
counter snapshots.

There may be usecases wherein we need more than periodic OA capture mode
which is supported currently. This mode is primarily used for two usecases:
    - Ability to capture system wide metrics, alongwith the ability to map
      the reports back to individual contexts (particularly for HSW).
    - Ability to inject tags for work, into the reports. This provides
      visibility into the multiple stages of work within single context.

The userspace will be able to distinguish between the periodic and CS based
OA reports by the virtue of source_info sample field.

The command MI_REPORT_PERF_COUNT can be used to capture snapshots of OA
counters, and is inserted at BB boundaries.
The data thus captured will be stored in a separate buffer, which will
be different from the buffer used otherwise for periodic OA capture mode.
The metadata information pertaining to snapshot is maintained in a list,
which also has offsets into the gem buffer object per captured snapshot.
In order to track whether the gpu has completed processing the node,
a field pertaining to corresponding gem request is added, which is tracked
for completion of the command.

Both periodic and RCS based reports are associated with a single stream
(corresponding to render engine), and it is expected to have the samples
in the sequential order according to their timestamps. Now, since these
reports are collected in separate buffers, these are merge sorted at the
time of forwarding to userspace during the read call.

v2: Aligining with the non-perf interface (custom drm ioctl based). Also,
few related patches are squashed together for better readability

Signed-off-by: Sourab Gupta <sourab.gupta at intel.com>
Signed-off-by: Robert Bragg <robert at sixbynine.org>
---
 drivers/gpu/drm/i915/i915_drv.h            |  36 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   4 +
 drivers/gpu/drm/i915/i915_perf.c           | 829 +++++++++++++++++++++++++----
 drivers/gpu/drm/i915/intel_lrc.c           |   4 +
 include/uapi/drm/i915_drm.h                |  15 +
 5 files changed, 774 insertions(+), 114 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index cfc135d..050df37 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1735,12 +1735,16 @@ struct i915_perf_stream {
 
 	struct list_head link;
 
+	enum intel_engine_id engine;
 	u32 sample_flags;
 	int sample_size;
 
 	struct intel_context *ctx;
 	bool enabled;
 
+	/* Whether command stream based data collection is enabled */
+	bool cs_mode;
+
 	/* Enables the collection of HW samples, either in response to
 	 * I915_PERF_IOCTL_ENABLE or implicitly called when stream is
 	 * opened without I915_PERF_FLAG_DISABLED.
@@ -1796,6 +1800,12 @@ struct i915_perf_stream {
 	 * The stream will always be disabled before this is called.
 	 */
 	void (*destroy)(struct i915_perf_stream *stream);
+
+	/*
+	 * Routine to emit the commands in the command streamer associated
+	 * with the corresponding gpu engine.
+	 */
+	void (*command_stream_hook)(struct drm_i915_gem_request *req);
 };
 
 struct i915_oa_ops {
@@ -1809,10 +1819,21 @@ struct i915_oa_ops {
 					u32 ctx_id);
 	void (*legacy_ctx_switch_unlocked)(struct drm_i915_gem_request *req);
 	int (*read)(struct i915_perf_stream *stream,
-		    struct i915_perf_read_state *read_state);
+		    struct i915_perf_read_state *read_state, u32 ts);
 	bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv);
 };
 
+/*
+ * List element to hold info about the perf sample data associated
+ * with a particular GPU command stream.
+ */
+struct i915_perf_cs_data_node {
+	struct list_head link;
+	struct drm_i915_gem_request *request;
+	u32 offset;
+	u32 ctx_id;
+};
+
 struct drm_i915_private {
 	struct drm_device *dev;
 	struct kmem_cache *objects;
@@ -2093,6 +2114,8 @@ struct drm_i915_private {
 		struct ctl_table_header *sysctl_header;
 
 		struct mutex lock;
+
+		struct mutex streams_lock;
 		struct list_head streams;
 
 		spinlock_t hook_lock;
@@ -2137,6 +2160,16 @@ struct drm_i915_private {
 			const struct i915_oa_format *oa_formats;
 			int n_builtin_sets;
 		} oa;
+
+		/* Command stream based perf data buffer */
+		struct {
+			struct drm_i915_gem_object *obj;
+			struct i915_vma *vma;
+			u8 *addr;
+		} command_stream_buf;
+
+		struct list_head node_list;
+		spinlock_t node_list_lock;
 	} perf;
 
 	/* Abstract the submission mechanism (legacy ringbuffer or execlists) away */
@@ -3504,6 +3537,7 @@ void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req);
 void i915_oa_update_reg_state(struct intel_engine_cs *engine,
 			      struct intel_context *ctx,
 			      uint32_t *reg_state);
+void i915_perf_command_stream_hook(struct drm_i915_gem_request *req);
 
 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct drm_device *dev,
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 6f4f2a6..89b114b 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1313,12 +1313,16 @@ i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
 	if (exec_len == 0)
 		exec_len = params->batch_obj->base.size;
 
+	i915_perf_command_stream_hook(params->request);
+
 	ret = engine->dispatch_execbuffer(params->request,
 					exec_start, exec_len,
 					params->dispatch_flags);
 	if (ret)
 		return ret;
 
+	i915_perf_command_stream_hook(params->request);
+
 	trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
 
 	i915_gem_execbuffer_move_to_active(vmas, params->request);
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index f86cd15..4adbf26 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -80,6 +80,13 @@ static u32 i915_perf_stream_paranoid = true;
 #define GEN8_OAREPORT_REASON_GO_TRANSITION  (1<<23)
 #define GEN9_OAREPORT_REASON_CLK_RATIO      (1<<24)
 
+/* Data common to periodic and RCS based samples */
+struct oa_sample_data {
+	u32 source;
+	u32 ctx_id;
+	const u8 *report;
+};
+
 /* for sysctl proc_dointvec_minmax of i915_oa_min_timer_exponent */
 static int zero;
 static int oa_exponent_max = OA_EXPONENT_MAX;
@@ -121,6 +128,7 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
 
 #define SAMPLE_OA_REPORT	(1<<0)
 #define SAMPLE_OA_SOURCE_INFO	(1<<1)
+#define SAMPLE_CTX_ID		(1<<2)
 
 struct perf_open_properties {
 	u32 sample_flags;
@@ -133,8 +141,234 @@ struct perf_open_properties {
 	int oa_format;
 	bool oa_periodic;
 	int oa_period_exponent;
+
+	/* Command stream mode */
+	bool cs_mode;
+	enum intel_engine_id engine;
 };
 
+/*
+ * Emit the commands to capture metrics, into the command stream. This function
+ * can be called concurrently with the stream operations and doesn't require
+ * perf mutex lock.
+ */
+
+void i915_perf_command_stream_hook(struct drm_i915_gem_request *req)
+{
+	struct intel_engine_cs *engine = req->engine;
+	struct drm_i915_private *dev_priv = engine->dev->dev_private;
+	struct i915_perf_stream *stream;
+
+	if (!dev_priv->perf.initialized)
+		return;
+
+	mutex_lock(&dev_priv->perf.streams_lock);
+	list_for_each_entry(stream, &dev_priv->perf.streams, link) {
+		if (stream->enabled && stream->command_stream_hook)
+			stream->command_stream_hook(req);
+	}
+	mutex_unlock(&dev_priv->perf.streams_lock);
+}
+
+/*
+ * Release some perf entries to make space for a new entry data. We dereference
+ * the associated request before deleting the entry. Also, no need to check for
+ * gpu completion of commands, since, these entries are anyways going to be
+ * replaced by a new entry, and gpu will overwrite the buffer contents
+ * eventually, when the request associated with new entry completes.
+ */
+static void release_some_perf_entries(struct drm_i915_private *dev_priv,
+					u32 target_size)
+{
+	struct i915_perf_cs_data_node *entry, *next;
+	u32 entry_size = dev_priv->perf.oa.oa_buffer.format_size;
+	u32 size = 0;
+
+	list_for_each_entry_safe
+		(entry, next, &dev_priv->perf.node_list, link) {
+
+		size += entry_size;
+		i915_gem_request_unreference(entry->request);
+		list_del(&entry->link);
+		kfree(entry);
+
+		if (size >= target_size)
+			break;
+	}
+}
+
+/*
+ * Insert the perf entry to the end of the list. This function never fails,
+ * since it always manages to insert the entry. If the space is exhausted in
+ * the buffer, it will remove the oldest entries in order to make space.
+ */
+static void insert_perf_entry(struct drm_i915_private *dev_priv,
+				struct i915_perf_cs_data_node *entry)
+{
+	struct i915_perf_cs_data_node *first_entry, *last_entry;
+	int max_offset = dev_priv->perf.command_stream_buf.obj->base.size;
+	u32 entry_size = dev_priv->perf.oa.oa_buffer.format_size;
+
+	spin_lock(&dev_priv->perf.node_list_lock);
+	if (list_empty(&dev_priv->perf.node_list)) {
+		entry->offset = 0;
+		list_add_tail(&entry->link, &dev_priv->perf.node_list);
+		spin_unlock(&dev_priv->perf.node_list_lock);
+		return;
+	}
+
+	first_entry = list_first_entry(&dev_priv->perf.node_list,
+				       typeof(*first_entry), link);
+	last_entry = list_last_entry(&dev_priv->perf.node_list,
+				     typeof(*last_entry), link);
+
+	if (last_entry->offset >= first_entry->offset) {
+		/* Sufficient space available at the end of buffer? */
+		if (last_entry->offset + 2*entry_size < max_offset)
+			entry->offset = last_entry->offset + entry_size;
+		/*
+		 * Wraparound condition. Is sufficient space available at
+		 * beginning of buffer?
+		 */
+		else if (entry_size < first_entry->offset)
+			entry->offset = 0;
+		/* Insufficient space. Overwrite existing old entries */
+		else {
+			u32 target_size = entry_size - first_entry->offset;
+
+			release_some_perf_entries(dev_priv, target_size);
+			entry->offset = 0;
+		}
+	} else {
+		/* Sufficient space available? */
+		if (last_entry->offset + 2*entry_size < first_entry->offset)
+			entry->offset = last_entry->offset + entry_size;
+		/* Insufficient space. Overwrite existing old entries */
+		else {
+			u32 target_size = entry_size -
+				(first_entry->offset - last_entry->offset -
+				entry_size);
+
+			release_some_perf_entries(dev_priv, target_size);
+			entry->offset = last_entry->offset + entry_size;
+		}
+	}
+	list_add_tail(&entry->link, &dev_priv->perf.node_list);
+	spin_unlock(&dev_priv->perf.node_list_lock);
+}
+
+static void i915_perf_command_stream_hook_oa(struct drm_i915_gem_request *req)
+{
+	struct intel_engine_cs *engine = req->engine;
+	struct intel_ringbuffer *ringbuf = req->ringbuf;
+	struct intel_context *ctx = req->ctx;
+	struct drm_i915_private *dev_priv = engine->dev->dev_private;
+	struct i915_perf_cs_data_node *entry;
+	u32 addr = 0;
+	int ret;
+
+	/* OA counters are only supported on the render engine */
+	BUG_ON(engine->id != RCS);
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (entry == NULL) {
+		DRM_ERROR("alloc failed\n");
+		return;
+	}
+
+	if (i915.enable_execlists)
+		ret = intel_logical_ring_begin(req, 4);
+	else
+		ret = intel_ring_begin(req, 4);
+
+	if (ret) {
+		kfree(entry);
+		return;
+	}
+
+	entry->ctx_id = ctx->global_id;
+	i915_gem_request_assign(&entry->request, req);
+
+	insert_perf_entry(dev_priv, entry);
+
+	addr = dev_priv->perf.command_stream_buf.vma->node.start +
+		entry->offset;
+
+	/* addr should be 64 byte aligned */
+	BUG_ON(addr & 0x3f);
+
+	if (i915.enable_execlists) {
+		intel_logical_ring_emit(ringbuf, MI_REPORT_PERF_COUNT | (2<<0));
+		intel_logical_ring_emit(ringbuf,
+					addr | MI_REPORT_PERF_COUNT_GGTT);
+		intel_logical_ring_emit(ringbuf, 0);
+		intel_logical_ring_emit(ringbuf,
+			i915_gem_request_get_seqno(req));
+		intel_logical_ring_advance(ringbuf);
+	} else {
+		if (INTEL_INFO(engine->dev)->gen >= 8) {
+			intel_ring_emit(engine, MI_REPORT_PERF_COUNT | (2<<0));
+			intel_ring_emit(engine, addr | MI_REPORT_PERF_COUNT_GGTT);
+			intel_ring_emit(engine, 0);
+			intel_ring_emit(engine,
+				i915_gem_request_get_seqno(req));
+		} else {
+			intel_ring_emit(engine, MI_REPORT_PERF_COUNT | (1<<0));
+			intel_ring_emit(engine, addr | MI_REPORT_PERF_COUNT_GGTT);
+			intel_ring_emit(engine, i915_gem_request_get_seqno(req));
+			intel_ring_emit(engine, MI_NOOP);
+		}
+		intel_ring_advance(engine);
+	}
+	i915_vma_move_to_active(dev_priv->perf.command_stream_buf.vma, req);
+}
+
+static int i915_oa_rcs_wait_gpu(struct drm_i915_private *dev_priv)
+{
+	struct i915_perf_cs_data_node *last_entry = NULL;
+	struct drm_i915_gem_request *req = NULL;
+	int ret;
+
+	/*
+	 * Wait for the last scheduled request to complete. This would
+	 * implicitly wait for the prior submitted requests. The refcount
+	 * of the requests is not decremented here.
+	 */
+	spin_lock(&dev_priv->perf.node_list_lock);
+
+	if (!list_empty(&dev_priv->perf.node_list)) {
+		last_entry = list_last_entry(&dev_priv->perf.node_list,
+			struct i915_perf_cs_data_node, link);
+		req = last_entry->request;
+	}
+	spin_unlock(&dev_priv->perf.node_list_lock);
+
+	if (!req)
+		return 0;
+
+	ret = __i915_wait_request(req, true, NULL, NULL);
+	if (ret) {
+		DRM_ERROR("Failed to wait for request\n");
+		return ret;
+	}
+	return 0;
+}
+
+static void i915_oa_rcs_free_requests(struct drm_i915_private *dev_priv)
+{
+	struct i915_perf_cs_data_node *entry, *next;
+
+	list_for_each_entry_safe
+		(entry, next, &dev_priv->perf.node_list, link) {
+		i915_gem_request_unreference__unlocked(entry->request);
+
+		spin_lock(&dev_priv->perf.node_list_lock);
+		list_del(&entry->link);
+		spin_unlock(&dev_priv->perf.node_list_lock);
+		kfree(entry);
+	}
+}
+
 /* NB: This is either called via fops or the poll check hrtimer (atomic ctx)
  *
  * It's safe to read OA config state here unlocked, assuming that this is only
@@ -203,7 +437,7 @@ static int append_oa_status(struct i915_perf_stream *stream,
  */
 static int append_oa_sample(struct i915_perf_stream *stream,
 			    struct i915_perf_read_state *read_state,
-			    const u8 *report)
+			    struct oa_sample_data *data)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -223,6 +457,38 @@ static int append_oa_sample(struct i915_perf_stream *stream,
 	buf += sizeof(header);
 
 	if (sample_flags & SAMPLE_OA_SOURCE_INFO) {
+		if (copy_to_user(buf, &data->source, 4))
+			return -EFAULT;
+		buf += 4;
+	}
+
+	if (sample_flags & SAMPLE_CTX_ID) {
+		if (copy_to_user(buf, &data->ctx_id, 4))
+			return -EFAULT;
+		buf += 4;
+	}
+
+	if (sample_flags & SAMPLE_OA_REPORT) {
+		if (copy_to_user(buf, data->report, report_size))
+			return -EFAULT;
+		buf += report_size;
+	}
+
+	read_state->buf = buf;
+	read_state->read += header.size;
+
+	return 0;
+}
+
+static int append_oa_buffer_sample(struct i915_perf_stream *stream,
+				    struct i915_perf_read_state *read_state,
+				    const u8 *report)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	u32 sample_flags = stream->sample_flags;
+	struct oa_sample_data data = { 0 };
+
+	if (sample_flags & SAMPLE_OA_SOURCE_INFO) {
 		enum drm_i915_perf_oa_event_source source;
 
 		if (INTEL_INFO(dev_priv)->gen >= 8) {
@@ -238,20 +504,16 @@ static int append_oa_sample(struct i915_perf_stream *stream,
 		} else
 			source = I915_PERF_OA_EVENT_SOURCE_PERIODIC;
 
-		if (copy_to_user(buf, &source, 4))
-			return -EFAULT;
-		buf += 4;
+		data.source = source;
 	}
+#warning "FIXME: append_oa_buffer_sample: read ctx ID from report and map that to an intel_context::global_id"
+	if (sample_flags & SAMPLE_CTX_ID)
+		data.ctx_id = 0;
 
-	if (sample_flags & SAMPLE_OA_REPORT) {
-		if (copy_to_user(buf, report, report_size))
-			return -EFAULT;
-	}
+	if (sample_flags & SAMPLE_OA_REPORT)
+		data.report = report;
 
-	read_state->buf += header.size;
-	read_state->read += header.size;
-
-	return 0;
+	return append_oa_sample(stream, read_state, &data);
 }
 
 /**
@@ -268,7 +530,7 @@ static int append_oa_sample(struct i915_perf_stream *stream,
 static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 				  struct i915_perf_read_state *read_state,
 				  u32 *head_ptr,
-				  u32 tail)
+				  u32 tail, u32 ts)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -287,6 +549,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	 */
 
 	while ((taken = OA_TAKEN(tail, head))) {
+		u8 *report;
+		u32 report_ts;
 
 		/* The tail increases in 64 byte increments, not in
 		 * format_size steps. */
@@ -300,8 +564,14 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 		WARN_ONCE((OA_BUFFER_SIZE - (head & mask)) < report_size,
 			  "i915: Misaligned OA head pointer");
 
+		report = oa_buf_base + head;
+
+		/* Report timestamp should not exceed passed in ts */
+		report_ts = *(u32 *)(report + 4);
+		if (report_ts > ts)
+			break;
+
 		if (dev_priv->perf.oa.exclusive_stream->enabled) {
-			u8 *report = oa_buf_base + head;
 			u32 ctx_id = *(u32 *)(report + 8);
 
 			if (i915.enable_execlists) {
@@ -340,7 +610,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 				    dev_priv->perf.oa.specific_ctx_id != ctx_id)
 					*(u32 *)(report + 8) = 0x1fffff;
 
-				ret = append_oa_sample(stream, read_state, report);
+				ret = append_oa_buffer_sample(stream,
+							      read_state, report);
 				if (ret)
 					break;
 
@@ -358,7 +629,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 }
 
 static int gen8_oa_read(struct i915_perf_stream *stream,
-			struct i915_perf_read_state *read_state)
+			struct i915_perf_read_state *read_state, u32 ts)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -400,7 +671,7 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
 
 	/* If there is still buffer space */
 
-	ret = gen8_append_oa_reports(stream, read_state, &head, tail);
+	ret = gen8_append_oa_reports(stream, read_state, &head, tail, ts);
 
 	/* All the report sizes are a power of two and the
 	 * head should always be incremented by some multiple
@@ -431,7 +702,7 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
 static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 				  struct i915_perf_read_state *read_state,
 				  u32 *head_ptr,
-				  u32 tail)
+				  u32 tail, u32 ts)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -505,7 +776,11 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 			continue;
 		}
 
-		ret = append_oa_sample(stream, read_state, report);
+		/* Report timestamp should not exceed passed in ts */
+		if (report32[1] > ts)
+			break;
+
+		ret = append_oa_buffer_sample(stream, read_state, report);
 		if (ret)
 			break;
 
@@ -524,7 +799,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 }
 
 static int gen7_oa_read(struct i915_perf_stream *stream,
-			struct i915_perf_read_state *read_state)
+			struct i915_perf_read_state *read_state, u32 ts)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -596,7 +871,7 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
 			GEN7_OASTATUS1_REPORT_LOST;
 	}
 
-	ret = gen7_append_oa_reports(stream, read_state, &head, tail);
+	ret = gen7_append_oa_reports(stream, read_state, &head, tail, ts);
 
 	/* All the report sizes are a power of two and the
 	 * head should always be incremented by some multiple
@@ -620,14 +895,129 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
 	return ret;
 }
 
-static bool i915_oa_can_read(struct i915_perf_stream *stream)
+/**
+ * Copies a command stream OA report into userspace read() buffer, while also
+ * forwarding the periodic OA reports with timestamp lower than CS report.
+ *
+ * NB: some data may be successfully copied to the userspace buffer
+ * even if an error is returned, and this is reflected in the
+ * updated @read_state.
+ */
+static int append_oa_rcs_sample(struct i915_perf_stream *stream,
+				 struct i915_perf_read_state *read_state,
+				 struct i915_perf_cs_data_node *node)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct oa_sample_data data = { 0 };
+	const u8 *report = dev_priv->perf.command_stream_buf.addr +
+				node->offset;
+	u32 sample_flags = stream->sample_flags;
+	u32 report_ts;
+	int ret;
+
+	/* First, append the periodic OA samples having lower timestamps */
+	report_ts = *(u32 *)(report + 4);
+	ret = dev_priv->perf.oa.ops.read(stream, read_state, report_ts);
+	if (ret)
+		return ret;
+
+	if (sample_flags & SAMPLE_OA_SOURCE_INFO)
+		data.source = I915_PERF_OA_EVENT_SOURCE_RCS;
 
-	return !dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv);
+	if (sample_flags & SAMPLE_CTX_ID)
+		data.ctx_id = node->ctx_id;
+
+	if (sample_flags & SAMPLE_OA_REPORT)
+		data.report = report;
+
+	return append_oa_sample(stream, read_state, &data);
 }
 
-static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
+/**
+ * Copies all command stream based OA reports into userspace read() buffer.
+ *
+ * NB: some data may be successfully copied to the userspace buffer
+ * even if an error is returned, and this is reflected in the
+ * updated @read_state.
+ */
+static int oa_rcs_append_reports(struct i915_perf_stream *stream,
+				  struct i915_perf_read_state *read_state)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct i915_perf_cs_data_node *entry, *next;
+	LIST_HEAD(free_list);
+	int ret = 0;
+
+	spin_lock(&dev_priv->perf.node_list_lock);
+	if (list_empty(&dev_priv->perf.node_list)) {
+		spin_unlock(&dev_priv->perf.node_list_lock);
+		return 0;
+	}
+	list_for_each_entry_safe(entry, next,
+				 &dev_priv->perf.node_list, link) {
+		if (!i915_gem_request_completed(entry->request, true))
+			break;
+		list_move_tail(&entry->link, &free_list);
+	}
+	spin_unlock(&dev_priv->perf.node_list_lock);
+
+	if (list_empty(&free_list))
+		return 0;
+
+	list_for_each_entry_safe(entry, next, &free_list, link) {
+		ret = append_oa_rcs_sample(stream, read_state, entry);
+		if (ret)
+			break;
+
+		list_del(&entry->link);
+		i915_gem_request_unreference__unlocked(entry->request);
+		kfree(entry);
+	}
+
+	/* Don't discard remaining entries, keep them for next read */
+	spin_lock(&dev_priv->perf.node_list_lock);
+	list_splice(&free_list, &dev_priv->perf.node_list);
+	spin_unlock(&dev_priv->perf.node_list_lock);
+
+	return ret;
+}
+
+/*
+ * Checks whether the command stream buffer associated with the stream has
+ * data ready to be forwarded to userspace.
+ * Returns true if atleast one request associated with command stream is
+ * completed, else returns false.
+ */
+static bool command_stream_buf_is_empty(struct i915_perf_stream *stream)
+
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct i915_perf_cs_data_node *entry = NULL;
+	struct drm_i915_gem_request *request = NULL;
+
+	spin_lock(&dev_priv->perf.node_list_lock);
+	entry = list_first_entry_or_null(&dev_priv->perf.node_list,
+			struct i915_perf_cs_data_node, link);
+	if (entry)
+		request = entry->request;
+	spin_unlock(&dev_priv->perf.node_list_lock);
+
+	if (!entry)
+		return true;
+	else if (!i915_gem_request_completed(request, true))
+		return true;
+	else
+		return false;
+}
+
+/*
+ * Checks whether the stream has data ready to forward to userspace.
+ * For command stream based streams, check if the command stream buffer has
+ * atleast one sample ready, if not return false, irrespective of periodic
+ * oa buffer having the data or not.
+ */
+
+static bool stream_have_data__unlocked(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
@@ -637,8 +1027,31 @@ static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
 	 * can't be destroyed until completion (such as a read()) that ensures
 	 * the device + OA buffer can't disappear
 	 */
+	if (stream->cs_mode)
+		return !command_stream_buf_is_empty(stream);
+	else
+		return !dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv);
+}
+
+static bool i915_oa_can_read(struct i915_perf_stream *stream)
+{
+
+	return stream_have_data__unlocked(stream);
+}
+
+static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	int ret;
+
+	if (stream->cs_mode) {
+		ret = i915_oa_rcs_wait_gpu(dev_priv);
+		if (ret)
+			return ret;
+	}
+
 	return wait_event_interruptible(dev_priv->perf.oa.poll_wq,
-					!dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv));
+					stream_have_data__unlocked(stream));
 }
 
 static void i915_oa_poll_wait(struct i915_perf_stream *stream,
@@ -655,7 +1068,27 @@ static int i915_oa_read(struct i915_perf_stream *stream,
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
-	return dev_priv->perf.oa.ops.read(stream, read_state);
+	if (stream->cs_mode)
+		return oa_rcs_append_reports(stream, read_state);
+	else
+		return dev_priv->perf.oa.ops.read(stream, read_state, U32_MAX);
+}
+
+static void
+free_command_stream_buf(struct drm_i915_private *dev_priv)
+{
+	mutex_lock(&dev_priv->dev->struct_mutex);
+
+	vunmap(dev_priv->perf.command_stream_buf.addr);
+	i915_gem_object_ggtt_unpin(dev_priv->perf.command_stream_buf.obj);
+	drm_gem_object_unreference(
+			&dev_priv->perf.command_stream_buf.obj->base);
+
+	dev_priv->perf.command_stream_buf.obj = NULL;
+	dev_priv->perf.command_stream_buf.vma = NULL;
+	dev_priv->perf.command_stream_buf.addr = NULL;
+
+	mutex_unlock(&dev_priv->dev->struct_mutex);
 }
 
 static void
@@ -680,12 +1113,17 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 
 	BUG_ON(stream != dev_priv->perf.oa.exclusive_stream);
 
-	dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
+	if (stream->cs_mode)
+		free_command_stream_buf(dev_priv);
 
-	free_oa_buffer(dev_priv);
+	if (dev_priv->perf.oa.oa_buffer.obj) {
+		dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
 
-	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
-	intel_runtime_pm_put(dev_priv);
+		free_oa_buffer(dev_priv);
+
+		intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+		intel_runtime_pm_put(dev_priv);
+	}
 
 	dev_priv->perf.oa.exclusive_stream = NULL;
 }
@@ -768,16 +1206,17 @@ static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
 		    GEN8_OATAILPTR_MASK));
 }
 
-static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
+static int alloc_obj(struct drm_i915_private *dev_priv,
+		     struct drm_i915_gem_object **obj, u8 **addr)
 {
 	struct drm_i915_gem_object *bo;
 	int ret;
 
-	BUG_ON(dev_priv->perf.oa.oa_buffer.obj);
+	intel_runtime_pm_get(dev_priv);
 
 	ret = i915_mutex_lock_interruptible(dev_priv->dev);
 	if (ret)
-		return ret;
+		goto out;
 
 	bo = i915_gem_alloc_object(dev_priv->dev, OA_BUFFER_SIZE);
 	if (bo == NULL) {
@@ -785,8 +1224,6 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
 		ret = -ENOMEM;
 		goto unlock;
 	}
-	dev_priv->perf.oa.oa_buffer.obj = bo;
-
 	ret = i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
 	if (ret)
 		goto err_unref;
@@ -796,8 +1233,42 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
 	if (ret)
 		goto err_unref;
 
+	*addr = vmap_oa_buffer(bo);
+	if (*addr == NULL) {
+		ret = -ENOMEM;
+		goto err_unpin;
+	}
+
+	*obj = bo;
+	goto unlock;
+
+err_unpin:
+	i915_gem_object_ggtt_unpin(bo);
+err_unref:
+	drm_gem_object_unreference(&bo->base);
+unlock:
+	mutex_unlock(&dev_priv->dev->struct_mutex);
+out:
+	intel_runtime_pm_put(dev_priv);
+	return ret;
+}
+
+static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
+{
+	struct drm_i915_gem_object *bo;
+	u8 *obj_addr;
+	int ret;
+
+	BUG_ON(dev_priv->perf.oa.oa_buffer.obj);
+
+	ret = alloc_obj(dev_priv, &bo, &obj_addr);
+	if (ret)
+		return ret;
+
+	dev_priv->perf.oa.oa_buffer.obj = bo;
+	dev_priv->perf.oa.oa_buffer.addr = obj_addr;
+
 	dev_priv->perf.oa.oa_buffer.gtt_offset = i915_gem_obj_ggtt_offset(bo);
-	dev_priv->perf.oa.oa_buffer.addr = vmap_oa_buffer(bo);
 
 	dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
 
@@ -805,14 +1276,34 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
 			 dev_priv->perf.oa.oa_buffer.gtt_offset,
 			 dev_priv->perf.oa.oa_buffer.addr);
 
-	goto unlock;
+	return 0;
+}
 
-err_unref:
-	drm_gem_object_unreference(&bo->base);
+static int alloc_command_stream_buf(struct drm_i915_private *dev_priv)
+{
+	struct drm_i915_gem_object *bo;
+	u8 *obj_addr;
+	int ret;
 
-unlock:
-	mutex_unlock(&dev_priv->dev->struct_mutex);
-	return ret;
+	BUG_ON(dev_priv->perf.command_stream_buf.obj);
+
+	ret = alloc_obj(dev_priv, &bo, &obj_addr);
+	if (ret)
+		return ret;
+
+	dev_priv->perf.command_stream_buf.obj = bo;
+	dev_priv->perf.command_stream_buf.addr = obj_addr;
+	dev_priv->perf.command_stream_buf.vma = i915_gem_obj_to_ggtt(bo);
+	if (WARN_ON(!list_empty(&dev_priv->perf.node_list)))
+		INIT_LIST_HEAD(&dev_priv->perf.node_list);
+
+	DRM_DEBUG_DRIVER(
+		"command stream buf initialized, gtt offset = 0x%x, vaddr = %p",
+		 (unsigned int)
+		 dev_priv->perf.command_stream_buf.vma->node.start,
+		 dev_priv->perf.command_stream_buf.addr);
+
+	return 0;
 }
 
 static void config_oa_regs(struct drm_i915_private *dev_priv,
@@ -1119,6 +1610,9 @@ static void i915_oa_stream_enable(struct i915_perf_stream *stream)
 
 	dev_priv->perf.oa.ops.oa_enable(dev_priv);
 
+	if (stream->cs_mode)
+		stream->command_stream_hook = i915_perf_command_stream_hook_oa;
+
 	if (dev_priv->perf.oa.periodic)
 		hrtimer_start(&dev_priv->perf.oa.poll_check_timer,
 			      ns_to_ktime(POLL_PERIOD),
@@ -1139,10 +1633,16 @@ static void i915_oa_stream_disable(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
-	dev_priv->perf.oa.ops.oa_disable(dev_priv);
-
 	if (dev_priv->perf.oa.periodic)
 		hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer);
+
+	if (stream->cs_mode) {
+		stream->command_stream_hook = NULL;
+		i915_oa_rcs_wait_gpu(dev_priv);
+		i915_oa_rcs_free_requests(dev_priv);
+	}
+
+	dev_priv->perf.oa.ops.oa_disable(dev_priv);
 }
 
 static u64 oa_exponent_to_ns(struct drm_i915_private *dev_priv, int exponent)
@@ -1156,14 +1656,11 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 			       struct perf_open_properties *props)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
-	int format_size;
+	bool require_oa_unit = props->sample_flags & (SAMPLE_OA_REPORT |
+						      SAMPLE_OA_SOURCE_INFO);
+	bool cs_sample_data = props->sample_flags & SAMPLE_OA_REPORT;
 	int ret;
 
-	if (!dev_priv->perf.oa.ops.init_oa_buffer) {
-		DRM_ERROR("OA unit not supported\n");
-		return -ENODEV;
-	}
-
 	/* To avoid the complexity of having to accurately filter
 	 * counter reports and marshal to the appropriate client
 	 * we currently only allow exclusive access
@@ -1173,85 +1670,142 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 		return -EBUSY;
 	}
 
-	if (!props->metrics_set) {
-		DRM_ERROR("OA metric set not specified\n");
-		return -EINVAL;
-	}
-
-	if (!props->oa_format) {
-		DRM_ERROR("OA report format not specified\n");
-		return -EINVAL;
+	if ((props->sample_flags & SAMPLE_CTX_ID) && !props->cs_mode) {
+		if (IS_HASWELL(dev_priv->dev)) {
+			DRM_ERROR(
+				"On HSW, context ID sampling only supported via command stream");
+			return -EINVAL;
+		} else if (!i915.enable_execlists) {
+			DRM_ERROR(
+				"On Gen8+ without execlists, context ID sampling only supported via command stream");
+			return -EINVAL;
+		}
 	}
 
 	stream->sample_size = sizeof(struct drm_i915_perf_record_header);
 
-	format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size;
+	if (require_oa_unit) {
+		int format_size;
 
-	if (props->sample_flags & SAMPLE_OA_REPORT) {
-		stream->sample_flags |= SAMPLE_OA_REPORT;
-		stream->sample_size += format_size;
-	}
+		if (!dev_priv->perf.oa.ops.init_oa_buffer) {
+			DRM_ERROR("OA unit not supported\n");
+			return -ENODEV;
+		}
+
+		if (!props->metrics_set) {
+			DRM_ERROR("OA metric set not specified\n");
+			return -EINVAL;
+		}
+
+		if (!props->oa_format) {
+			DRM_ERROR("OA report format not specified\n");
+			return -EINVAL;
+		}
 
-	if (props->sample_flags & SAMPLE_OA_SOURCE_INFO) {
-		if (!(props->sample_flags & SAMPLE_OA_REPORT)) {
+		if (props->cs_mode  && (props->engine!= RCS)) {
 			DRM_ERROR(
-			"OA source type can't be sampled without OA report");
+				"Command stream OA metrics only available via Render CS\n");
 			return -EINVAL;
 		}
-		stream->sample_flags |= SAMPLE_OA_SOURCE_INFO;
-		stream->sample_size += 4;
-	}
+		stream->engine= RCS;
 
-	dev_priv->perf.oa.oa_buffer.format_size = format_size;
-	BUG_ON(dev_priv->perf.oa.oa_buffer.format_size == 0);
+		format_size =
+			dev_priv->perf.oa.oa_formats[props->oa_format].size;
 
-	dev_priv->perf.oa.oa_buffer.format =
-		dev_priv->perf.oa.oa_formats[props->oa_format].format;
+		if (props->sample_flags & SAMPLE_OA_REPORT) {
+			stream->sample_flags |= SAMPLE_OA_REPORT;
+			stream->sample_size += format_size;
+		}
 
-	dev_priv->perf.oa.metrics_set = props->metrics_set;
+		if (props->sample_flags & SAMPLE_OA_SOURCE_INFO) {
+			if (!(props->sample_flags & SAMPLE_OA_REPORT)) {
+				DRM_ERROR(
+					  "OA source type can't be sampled without OA report");
+				return -EINVAL;
+			}
+			stream->sample_flags |= SAMPLE_OA_SOURCE_INFO;
+			stream->sample_size += 4;
+		}
+
+		dev_priv->perf.oa.oa_buffer.format_size = format_size;
+		BUG_ON(dev_priv->perf.oa.oa_buffer.format_size == 0);
+		dev_priv->perf.oa.oa_buffer.format =
+			dev_priv->perf.oa.oa_formats[props->oa_format].format;
+
+		dev_priv->perf.oa.metrics_set = props->metrics_set;
 
-	dev_priv->perf.oa.periodic = props->oa_periodic;
-	if (dev_priv->perf.oa.periodic) {
-		u64 period_ns = oa_exponent_to_ns(dev_priv,
-						  props->oa_period_exponent);
+		dev_priv->perf.oa.periodic = props->oa_periodic;
+		if (dev_priv->perf.oa.periodic) {
+			u64 period_ns = oa_exponent_to_ns(dev_priv,
+						props->oa_period_exponent);
 
-		dev_priv->perf.oa.period_exponent = props->oa_period_exponent;
+			dev_priv->perf.oa.period_exponent =
+						props->oa_period_exponent;
+
+			/* See comment for OA_TAIL_MARGIN_NSEC for details
+			 * about this tail_margin... */
+			dev_priv->perf.oa.tail_margin =
+				((OA_TAIL_MARGIN_NSEC / period_ns) + 1) *
+				format_size;
+		}
+
+		if (i915.enable_execlists && stream->ctx)
+			dev_priv->perf.oa.specific_ctx_id =
+				intel_execlists_ctx_id(stream->ctx);
+
+		ret = alloc_oa_buffer(dev_priv);
+		if (ret)
+			return ret;
+
+		/* PRM - observability performance counters:
+		 *
+		 *   OACONTROL, performance counter enable, note:
+		 *
+		 *   "When this bit is set, in order to have coherent counts,
+		 *   RC6 power state and trunk clock gating must be disabled.
+		 *   This can be achieved by programming MMIO registers as
+		 *   0xA094=0 and 0xA090[31]=1"
+		 *
+		 *   In our case we are expecting that taking pm + FORCEWAKE
+		 *   references will effectively disable RC6.
+		 */
+		intel_runtime_pm_get(dev_priv);
+		intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+
+		ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv);
+		if (ret) {
+			intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+			intel_runtime_pm_put(dev_priv);
+			free_oa_buffer(dev_priv);
+			return ret;
+		}
+	}
 
-		/* See comment for OA_TAIL_MARGIN_NSEC for details
-		 * about this tail_margin... */
-		dev_priv->perf.oa.tail_margin =
-			((OA_TAIL_MARGIN_NSEC / period_ns) + 1) * format_size;
+	if (props->sample_flags & SAMPLE_CTX_ID) {
+		stream->sample_flags |= SAMPLE_CTX_ID;
+		stream->sample_size += 4;
 	}
 
-	if (i915.enable_execlists && stream->ctx)
-		dev_priv->perf.oa.specific_ctx_id =
-			intel_execlists_ctx_id(stream->ctx);
+	if (props->cs_mode) {
+		if (!cs_sample_data) {
+			DRM_ERROR(
+				"Ring given without requesting any CS data to sample");
+			ret = -EINVAL;
+			goto cs_error;
+		}
 
-	ret = alloc_oa_buffer(dev_priv);
-	if (ret)
-		return ret;
+		if (!(props->sample_flags & SAMPLE_CTX_ID)) {
+			DRM_ERROR(
+				"Ring given without requesting any CS specific property");
+			ret = -EINVAL;
+			goto cs_error;
+		}
 
-	/* PRM - observability performance counters:
-	 *
-	 *   OACONTROL, performance counter enable, note:
-	 *
-	 *   "When this bit is set, in order to have coherent counts,
-	 *   RC6 power state and trunk clock gating must be disabled.
-	 *   This can be achieved by programming MMIO registers as
-	 *   0xA094=0 and 0xA090[31]=1"
-	 *
-	 *   In our case we are expecting that taking pm + FORCEWAKE
-	 *   references will effectively disable RC6.
-	 */
-	intel_runtime_pm_get(dev_priv);
-	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+		stream->cs_mode = true;
 
-	ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv);
-	if (ret) {
-		intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
-		intel_runtime_pm_put(dev_priv);
-		free_oa_buffer(dev_priv);
-		return ret;
+		ret = alloc_command_stream_buf(dev_priv);
+		if (ret)
+			goto cs_error;
 	}
 
 	stream->destroy = i915_oa_stream_destroy;
@@ -1270,6 +1824,17 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	dev_priv->perf.oa.exclusive_stream = stream;
 
 	return 0;
+
+cs_error:
+	if (require_oa_unit) {
+		dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
+
+		intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+		intel_runtime_pm_put(dev_priv);
+
+		free_oa_buffer(dev_priv);
+	}
+	return ret;
 }
 
 static void gen7_update_hw_ctx_id_locked(struct drm_i915_private *dev_priv,
@@ -1507,12 +2072,21 @@ static ssize_t i915_perf_read(struct file *file,
 
 static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
 {
+	struct i915_perf_stream *stream;
+
 	struct drm_i915_private *dev_priv =
 		container_of(hrtimer, typeof(*dev_priv),
 			     perf.oa.poll_check_timer);
 
-	if (!dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv))
-		wake_up(&dev_priv->perf.oa.poll_wq);
+	/* No need to protect the streams list here, since the hrtimer is
+	 * disabled before the stream is removed from list, and currently a
+	 * single exclusive_stream is supported.
+	 * XXX: revisit this when multiple concurrent streams are supported.
+	 */
+	list_for_each_entry(stream, &dev_priv->perf.streams, link) {
+		if (stream_have_data__unlocked(stream))
+			wake_up(&dev_priv->perf.oa.poll_wq);
+	}
 
 	hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
 
@@ -1605,14 +2179,16 @@ static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
+	mutex_lock(&dev_priv->perf.streams_lock);
+	list_del(&stream->link);
+	mutex_unlock(&dev_priv->perf.streams_lock);
+
 	if (stream->enabled)
 		i915_perf_disable_locked(stream);
 
 	if (stream->destroy)
 		stream->destroy(stream);
 
-	list_del(&stream->link);
-
 	if (stream->ctx) {
 		mutex_lock(&dev_priv->dev->struct_mutex);
 		i915_gem_context_unreference(stream->ctx);
@@ -1728,7 +2304,9 @@ int i915_perf_open_ioctl_locked(struct drm_device *dev,
 	 */
 	BUG_ON(stream->sample_flags != props->sample_flags);
 
+	mutex_lock(&dev_priv->perf.streams_lock);
 	list_add(&stream->link, &dev_priv->perf.streams);
+	mutex_unlock(&dev_priv->perf.streams_lock);
 
 	if (param->flags & I915_PERF_FLAG_FD_CLOEXEC)
 		f_flags |= O_CLOEXEC;
@@ -1747,7 +2325,9 @@ int i915_perf_open_ioctl_locked(struct drm_device *dev,
 	return stream_fd;
 
 err_open:
+	mutex_lock(&dev_priv->perf.streams_lock);
 	list_del(&stream->link);
+	mutex_unlock(&dev_priv->perf.streams_lock);
 	if (stream->destroy)
 		stream->destroy(stream);
 err_alloc:
@@ -1856,6 +2436,26 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
 		case DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE:
 			props->sample_flags |= SAMPLE_OA_SOURCE_INFO;
 			break;
+		case DRM_I915_PERF_PROP_ENGINE: {
+				u8 engine =
+					(value & I915_EXEC_RING_MASK) - 1;
+				if (engine >= I915_NUM_ENGINES)
+					return -EINVAL;
+
+				/* XXX: Currently only RCS is supported.
+				 * Remove this check when support for other
+				 * engines is added
+				 */
+				if (engine != RCS)
+					return -EINVAL;
+
+				props->cs_mode = true;
+				props->engine = engine;
+			}
+			break;
+		case DRM_I915_PERF_PROP_SAMPLE_CTX_ID:
+			props->sample_flags |= SAMPLE_CTX_ID;
+			break;
 		case DRM_I915_PERF_PROP_MAX:
 			BUG();
 		}
@@ -1963,8 +2563,11 @@ void i915_perf_init(struct drm_device *dev)
 	init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
 
 	INIT_LIST_HEAD(&dev_priv->perf.streams);
+	INIT_LIST_HEAD(&dev_priv->perf.node_list);
 	mutex_init(&dev_priv->perf.lock);
+	mutex_init(&dev_priv->perf.streams_lock);
 	spin_lock_init(&dev_priv->perf.hook_lock);
+	spin_lock_init(&dev_priv->perf.node_list_lock);
 
 	dev_priv->perf.oa.timestamp_frequency =
 				GT_CS_TIMESTAMP_FREQUENCY(dev_priv);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 1425ede..20f2aac 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1002,10 +1002,14 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
 	exec_start = params->batch_obj_vm_offset +
 		     args->batch_start_offset;
 
+	i915_perf_command_stream_hook(params->request);
+
 	ret = engine->emit_bb_start(params->request, exec_start, params->dispatch_flags);
 	if (ret)
 		return ret;
 
+	i915_perf_command_stream_hook(params->request);
+
 	trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
 
 	i915_gem_execbuffer_move_to_active(vmas, params->request);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 2139f73..e7f9479 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1196,6 +1196,7 @@ enum drm_i915_perf_oa_event_source {
 	I915_PERF_OA_EVENT_SOURCE_UNDEFINED,
 	I915_PERF_OA_EVENT_SOURCE_PERIODIC,
 	I915_PERF_OA_EVENT_SOURCE_CONTEXT_SWITCH,
+	I915_PERF_OA_EVENT_SOURCE_RCS,
 
 	I915_PERF_OA_EVENT_SOURCE_MAX	/* non-ABI */
 };
@@ -1241,6 +1242,19 @@ enum drm_i915_perf_property_id {
 	 */
 	DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE,
 
+	/**
+	 * The value of this property specifies the GPU engine for which
+	 * the samples need to be collected. Specifying this property also
+	 * implies the command stream based sample collection.
+	 */
+	DRM_I915_PERF_PROP_ENGINE,
+
+	/**
+	 * The value of this property set to 1 requests inclusion of context ID
+	 * in the perf sample data.
+	 */
+	DRM_I915_PERF_PROP_SAMPLE_CTX_ID,
+
 	DRM_I915_PERF_PROP_MAX /* non-ABI */
 };
 
@@ -1306,6 +1320,7 @@ enum drm_i915_perf_record_type {
 	 *     struct drm_i915_perf_record_header header;
 	 *
 	 *     { u32 source_info; } && DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE
+	 *     { u32 ctx_id; } && DRM_I915_PERF_PROP_SAMPLE_CTX_ID
 	 *     { u32 oa_report[]; } && DRM_I915_PERF_PROP_SAMPLE_OA
 	 * };
 	 */
-- 
1.9.1