[PATCH 03/14] drm/i915: Framework for capturing command stream based OA reports and ctx id info.

Tue Jul 11 11:24:41 UTC 2017

From: Sourab Gupta <sourab.gupta at intel.com>

This patch introduces a framework to capture OA counter reports associated
with the Render command stream. We can then associate the reports captured
through this mechanism with their corresponding context IDs. This can be
further extended to associate any other metadata with the corresponding
samples (since the association with the Render command stream gives us the
ability to capture this information while inserting the corresponding
capture commands into the command stream).

The OA reports generated in this way are associated with a corresponding
workload, and thus can be used to delimit the workload (i.e. sample the
counters at the workload boundaries), within an ongoing stream of periodic
counter snapshots.

There may be use cases wherein we need more than the periodic OA capture
mode which is supported currently. The command stream based mode is
primarily used for two use cases:
    - Ability to capture system wide metrics, along with the ability to map
      the reports back to individual contexts (particularly for HSW).
    - Ability to inject tags for work into the reports. This provides
      visibility into the multiple stages of work within a single context.

Userspace will be able to distinguish between the periodic and CS based
OA reports by virtue of the source_info sample field.

The command MI_REPORT_PERF_COUNT can be used to capture snapshots of OA
counters, and is inserted at batch buffer (BB) boundaries.
The data thus captured will be stored in a separate buffer, which will
be different from the buffer otherwise used for the periodic OA capture mode.
The metadata pertaining to each snapshot is maintained in a list, which
also holds the offset into the gem buffer object for each captured snapshot.
In order to track whether the GPU has completed processing the node,
a field pertaining to the corresponding gem request is added, which is
tracked for completion of the command.

Both periodic and CS based reports are associated with a single stream
(corresponding to the render engine), and the samples are expected to be
in sequential order according to their timestamps. Now, since these
reports are collected in separate buffers, they are merge sorted at the
time of forwarding to userspace during the read call.

v2: Aligning with the non-perf interface (custom drm ioctl based). Also,
a few related patches are squashed together for better readability.

v3: Updated perf sample capture emit hook name. Reserving space upfront
in the ring for emitting sample capture commands and using
req->fence.seqno for tracking samples. Added SRCU protection for streams.
Changed the stream last_request tracking to resv object. (Chris)
Updated perf.sample_lock spin_lock usage to avoid softlockups. Moved
stream to global per-engine structure. (Sagar)
A stall/flush prior to sample capture is not added. Do we need to give the
user control over whether to stall/flush at each sample?

Signed-off-by: Sourab Gupta <sourab.gupta at intel.com>
Signed-off-by: Robert Bragg <robert at sixbynine.org>
Signed-off-by: Sagar Arun Kamble <sagar.a.kamble at intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h            |  106 ++-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |    8 +
 drivers/gpu/drm/i915/i915_perf.c           | 1178 ++++++++++++++++++++++------
 drivers/gpu/drm/i915/intel_engine_cs.c     |    4 +
 drivers/gpu/drm/i915/intel_ringbuffer.h    |    5 +
 include/uapi/drm/i915_drm.h                |   15 +
 6 files changed, 1073 insertions(+), 243 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 81cd21e..dbc3d70 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1961,6 +1961,24 @@ struct i915_perf_stream_ops {
 	 * The stream will always be disabled before this is called.
 	 */
 	void (*destroy)(struct i915_perf_stream *stream);
+
+	/*
+	 * @emit_sample_capture: Emit the commands in the command streamer
+	 * for a particular gpu engine.
+	 *
+	 * The commands are inserted to capture the perf sample data at
+	 * specific points during workload execution, such as before and after
+	 * the batch buffer.
+	 */
+	void (*emit_sample_capture)(struct i915_perf_stream *stream,
+				    struct drm_i915_gem_request *request,
+				    bool preallocate);
+};
+
+enum i915_perf_stream_state {
+	I915_PERF_STREAM_DISABLED,
+	I915_PERF_STREAM_ENABLE_IN_PROGRESS,
+	I915_PERF_STREAM_ENABLED,
 };
 
 /**
@@ -1973,9 +1991,9 @@ struct i915_perf_stream {
 	struct drm_i915_private *dev_priv;
 
 	/**
-	 * @link: Links the stream into ``&drm_i915_private->streams``
+	 * @engine: Engine to which this stream corresponds.
 	 */
-	struct list_head link;
+	struct intel_engine_cs *engine;
 
 	/**
 	 * @sample_flags: Flags representing the `DRM_I915_PERF_PROP_SAMPLE_*`
@@ -1998,17 +2016,47 @@ struct i915_perf_stream {
 	struct i915_gem_context *ctx;
 
 	/**
-	 * @enabled: Whether the stream is currently enabled, considering
-	 * whether the stream was opened in a disabled state and based
-	 * on `I915_PERF_IOCTL_ENABLE` and `I915_PERF_IOCTL_DISABLE` calls.
+	 * @state: Current stream state, which can be either disabled, enabled,
+	 * or enable_in_progress, while considering whether the stream was
+	 * opened in a disabled state and based on `I915_PERF_IOCTL_ENABLE` and
+	 * `I915_PERF_IOCTL_DISABLE` calls.
 	 */
-	bool enabled;
+	enum i915_perf_stream_state state;
+
+	/**
+	 * @cs_mode: Whether command stream based perf sample collection is
+	 * enabled for this stream
+	 */
+	bool cs_mode;
+
+	/**
+	 * @using_oa: Whether OA unit is in use for this particular stream
+	 */
+	bool using_oa;
+
+	/**
+	 * @sampled_req_resv: List of fences of requests for which perf. related
+	 * sample capture emit is done.
+	 */
+	struct reservation_object sampled_req_resv;
 
 	/**
 	 * @ops: The callbacks providing the implementation of this specific
 	 * type of configured stream.
 	 */
 	const struct i915_perf_stream_ops *ops;
+
+	/* Command stream based perf data buffer */
+	struct {
+		struct i915_vma *vma;
+		u8 *vaddr;
+	} cs_buffer;
+
+	struct list_head cs_samples;
+	spinlock_t cs_samples_lock;
+
+	wait_queue_head_t poll_wq;
+	bool pollin;
 };
 
 /**
@@ -2071,7 +2119,8 @@ struct i915_oa_ops {
 	int (*read)(struct i915_perf_stream *stream,
 		    char __user *buf,
 		    size_t count,
-		    size_t *offset);
+		    size_t *offset,
+		    u32 ts);
 
 	/**
 	 * @oa_hw_tail_read: read the OA tail pointer register
@@ -2083,6 +2132,36 @@ struct i915_oa_ops {
 	u32 (*oa_hw_tail_read)(struct drm_i915_private *dev_priv);
 };
 
+/**
+ * struct i915_perf_cs_sample - Sample element to hold info about a single
+ * perf sample associated with a particular GPU command stream.
+ */
+struct i915_perf_cs_sample {
+	/**
+	 * @link: Links the sample into ``&stream->cs_samples``
+	 */
+	struct list_head link;
+
+	/**
+	 * @request: GEM request associated with the sample. The commands to
+	 * capture the perf metrics are inserted into the command streamer in
+	 * context of this request.
+	 */
+	struct drm_i915_gem_request *request;
+
+	/**
+	 * @offset: Offset into ``&stream->cs_buffer``
+	 * where the perf metrics will be collected, when the commands inserted
+	 * into the command stream are executed by GPU.
+	 */
+	u32 offset;
+
+	/**
+	 * @ctx_id: Context ID associated with this perf sample
+	 */
+	u32 ctx_id;
+};
+
 struct intel_cdclk_state {
 	unsigned int cdclk, vco, ref;
 };
@@ -2411,17 +2490,10 @@ struct drm_i915_private {
 		struct ctl_table_header *sysctl_header;
 
 		struct mutex lock;
-		struct list_head streams;
-
-		struct {
-			struct i915_perf_stream *exclusive_stream;
 
-			u32 specific_ctx_id;
-
-			struct hrtimer poll_check_timer;
-			wait_queue_head_t poll_wq;
-			bool pollin;
+		struct hrtimer poll_check_timer;
 
+		struct {
 			/**
 			 * For rate limiting any notifications of spurious
 			 * invalid OA reports
@@ -3592,6 +3664,8 @@ int i915_perf_open_ioctl(struct drm_device *dev, void *data,
 void i915_oa_init_reg_state(struct intel_engine_cs *engine,
 			    struct i915_gem_context *ctx,
 			    uint32_t *reg_state);
+void i915_perf_emit_sample_capture(struct drm_i915_gem_request *req,
+				   bool preallocate);
 
 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct i915_address_space *vm,
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 929f275..28f7fd0 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1194,12 +1194,16 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
 	if (err)
 		goto err_request;
 
+	i915_perf_emit_sample_capture(rq, true);
+
 	err = eb->engine->emit_bb_start(rq,
 					batch->node.start, PAGE_SIZE,
 					cache->gen > 5 ? 0 : I915_DISPATCH_SECURE);
 	if (err)
 		goto err_request;
 
+	i915_perf_emit_sample_capture(rq, false);
+
 	GEM_BUG_ON(!reservation_object_test_signaled_rcu(batch->resv, true));
 	i915_vma_move_to_active(batch, rq, 0);
 	reservation_object_lock(batch->resv, NULL);
@@ -2027,6 +2031,8 @@ static int eb_submit(struct i915_execbuffer *eb)
 			return err;
 	}
 
+	i915_perf_emit_sample_capture(eb->request, true);
+
 	err = eb->engine->emit_bb_start(eb->request,
 					eb->batch->node.start +
 					eb->batch_start_offset,
@@ -2035,6 +2041,8 @@ static int eb_submit(struct i915_execbuffer *eb)
 	if (err)
 		return err;
 
+	i915_perf_emit_sample_capture(eb->request, false);
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 49e4c93..68c1981 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -193,6 +193,7 @@
 
 #include <linux/anon_inodes.h>
 #include <linux/sizes.h>
+#include <linux/srcu.h>
 
 #include "i915_drv.h"
 #include "i915_oa_hsw.h"
@@ -288,6 +289,12 @@
 #define OAREPORT_REASON_CTX_SWITCH     (1<<3)
 #define OAREPORT_REASON_CLK_RATIO      (1<<5)
 
+/* Data common to periodic and RCS based OA samples */
+struct i915_perf_sample_data {
+	u32 source;
+	u32 ctx_id;
+	const u8 *report;
+};
 
 /* For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate
  *
@@ -328,8 +335,19 @@
 	[I915_OA_FORMAT_C4_B8]		    = { 7, 64 },
 };
 
+/* Duplicated from similar static enum in i915_gem_execbuffer.c */
+#define I915_USER_RINGS (4)
+static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = {
+	[I915_EXEC_DEFAULT]     = RCS,
+	[I915_EXEC_RENDER]      = RCS,
+	[I915_EXEC_BLT]         = BCS,
+	[I915_EXEC_BSD]         = VCS,
+	[I915_EXEC_VEBOX]       = VECS
+};
+
 #define SAMPLE_OA_REPORT      (1<<0)
 #define SAMPLE_OA_SOURCE      (1<<1)
+#define SAMPLE_CTX_ID	      (1<<2)
 
 /**
  * struct perf_open_properties - for validated properties given to open a stream
@@ -340,6 +358,9 @@
  * @oa_format: An OA unit HW report format
  * @oa_periodic: Whether to enable periodic OA unit sampling
  * @oa_period_exponent: The OA unit sampling period is derived from this
+ * @cs_mode: Whether the stream is configured to enable collection of metrics
+ * associated with command stream of a particular GPU engine
+ * @engine: The GPU engine associated with the stream in case cs_mode is enabled
  *
  * As read_properties_unlocked() enumerates and validates the properties given
  * to open a stream of metrics the configuration is built up in the structure
@@ -356,6 +377,10 @@ struct perf_open_properties {
 	int oa_format;
 	bool oa_periodic;
 	int oa_period_exponent;
+
+	/* Command stream mode */
+	bool cs_mode;
+	enum intel_engine_id engine;
 };
 
 static u32 gen8_oa_hw_tail_read(struct drm_i915_private *dev_priv)
@@ -371,6 +396,266 @@ static u32 gen7_oa_hw_tail_read(struct drm_i915_private *dev_priv)
 }
 
 /**
+ * i915_perf_emit_sample_capture - Insert the commands to capture metrics into
+ * the command stream of a GPU engine.
+ * @request: request in whose context the metrics are being collected.
+ * @preallocate: allocate space in ring for related sample.
+ *
+ * Hook through which the commands to capture perf metrics are inserted into
+ * the command stream of a GPU engine. It does nothing unless perf is
+ * initialized and the engine's exclusive stream is enabled in cs_mode.
+ */
+void i915_perf_emit_sample_capture(struct drm_i915_gem_request *request,
+				   bool preallocate)
+{
+	struct intel_engine_cs *engine = request->engine;
+	struct i915_perf_stream *stream;
+	int idx;
+
+	if (!engine->i915->perf.initialized)
+		return;
+
+	idx = srcu_read_lock(&engine->perf_srcu);
+	stream = srcu_dereference(engine->exclusive_stream,
+				  &engine->perf_srcu);
+	if (stream && stream->state == I915_PERF_STREAM_ENABLED &&
+	    stream->cs_mode)
+		stream->ops->emit_sample_capture(stream, request, preallocate);
+	srcu_read_unlock(&engine->perf_srcu, idx);
+}
+
+/**
+ * release_perf_samples - Release old perf samples to make space for new
+ * sample data.
+ * @stream: Stream from which space is to be freed up.
+ * @target_size: Amount of buffer space that needs to be freed up.
+ *
+ * The reference held on a sample's request is dropped before the sample is
+ * deleted. There is no need to check whether the commands associated with the
+ * old samples have completed, because these entries are going to be replaced
+ * by a new sample anyway and the GPU will eventually overwrite the buffer
+ * contents when the request associated with the new sample completes.
+ */
+static void release_perf_samples(struct i915_perf_stream *stream,
+				 u32 target_size)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct i915_perf_cs_sample *sample, *next;
+	u32 sample_size = dev_priv->perf.oa.oa_buffer.format_size;
+	u32 size = 0;
+
+	list_for_each_entry_safe
+		(sample, next, &stream->cs_samples, link) {
+		size += sample_size;
+		i915_gem_request_put(sample->request);
+		list_del(&sample->link);
+		kfree(sample);
+
+		if (size >= target_size)
+			break;
+	}
+}
+
+/**
+ * insert_perf_sample - Insert a perf sample entry to the sample list.
+ * @stream: Stream into which sample is to be inserted.
+ * @sample: perf CS sample to be inserted into the list
+ *
+ * This function never fails, since it always manages to insert the sample.
+ * If the space is exhausted in the buffer, it will remove the older
+ * entries in order to make space.
+ */
+static void insert_perf_sample(struct i915_perf_stream *stream,
+				struct i915_perf_cs_sample *sample)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct i915_perf_cs_sample *first, *last;
+	int max_offset = stream->cs_buffer.vma->obj->base.size;
+	u32 sample_size = dev_priv->perf.oa.oa_buffer.format_size;
+	unsigned long flags;
+
+	spin_lock_irqsave(&stream->cs_samples_lock, flags);
+	if (list_empty(&stream->cs_samples)) {
+		sample->offset = 0;
+		list_add_tail(&sample->link, &stream->cs_samples);
+		spin_unlock_irqrestore(&stream->cs_samples_lock, flags);
+		return;
+	}
+
+	first = list_first_entry(&stream->cs_samples, typeof(*first),
+				link);
+	last = list_last_entry(&stream->cs_samples, typeof(*last),
+				link);
+
+	if (last->offset >= first->offset) {
+		/* Sufficient space available at the end of buffer? */
+		if (last->offset + 2*sample_size < max_offset)
+			sample->offset = last->offset + sample_size;
+		/*
+		 * Wraparound condition. Is sufficient space available at
+		 * beginning of buffer?
+		 */
+		else if (sample_size < first->offset)
+			sample->offset = 0;
+		/* Insufficient space. Overwrite existing old entries */
+		else {
+			u32 target_size = sample_size - first->offset;
+
+			release_perf_samples(stream, target_size);
+			sample->offset = 0;
+		}
+	} else {
+		/* Sufficient space available? */
+		if (last->offset + 2*sample_size < first->offset)
+			sample->offset = last->offset + sample_size;
+		/* Insufficient space. Overwrite existing old entries */
+		else {
+			u32 target_size = sample_size -
+				(first->offset - last->offset -
+				sample_size);
+
+			release_perf_samples(stream, target_size);
+			sample->offset = last->offset + sample_size;
+		}
+	}
+	list_add_tail(&sample->link, &stream->cs_samples);
+	spin_unlock_irqrestore(&stream->cs_samples_lock, flags);
+}
+
+/**
+ * i915_emit_oa_report_capture - Insert the commands to capture OA
+ * reports metrics into the render command stream
+ * @request: request in whose context the metrics are being collected.
+ * @preallocate: allocate space in ring for related sample.
+ * @offset: command stream buffer offset where the OA metrics are collected
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+static int i915_emit_oa_report_capture(struct drm_i915_gem_request *request,
+				       bool preallocate, u32 offset)
+{
+	struct drm_i915_private *dev_priv = request->i915;
+	struct intel_engine_cs *engine = request->engine;
+	struct i915_perf_stream *stream;
+	u32 addr, cmd, len = 4, *cs;
+	int idx;
+
+	idx = srcu_read_lock(&engine->perf_srcu);
+	stream = srcu_dereference(engine->exclusive_stream,
+				  &engine->perf_srcu);
+	addr = stream->cs_buffer.vma->node.start + offset;
+	srcu_read_unlock(&engine->perf_srcu, idx);
+
+	if (WARN_ON(addr & 0x3f)) {
+		DRM_ERROR("OA buffer address not aligned to 64 byte\n");
+		return -EINVAL;
+	}
+
+	/* reserved_space is accounted in bytes, whereas len is in dwords */
+	if (preallocate)
+		request->reserved_space += len * sizeof(u32);
+	else
+		request->reserved_space -= len * sizeof(u32);
+
+	cs = intel_ring_begin(request, len);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	/*
+	 * NOTE(review): gen8+ ORs in both 1 and 2 (low bits == 3) and puts the
+	 * seqno in dword 2 -- confirm against the gen8 command layout.
+	 */
+	cmd = MI_REPORT_PERF_COUNT | (1<<0);
+	if (INTEL_GEN(dev_priv) >= 8)
+		cmd |= (2<<0);
+
+	*cs++ = cmd;
+	*cs++ = addr | MI_REPORT_PERF_COUNT_GGTT;
+	*cs++ = request->fence.seqno;
+	*cs++ = INTEL_GEN(dev_priv) >= 8 ? 0 : MI_NOOP;
+
+	intel_ring_advance(request, cs);
+
+	return 0;
+}
+
+/**
+ * i915_perf_stream_emit_sample_capture - Insert the commands to capture perf
+ * metrics into the GPU command stream
+ * @stream: An i915-perf stream opened for GPU metrics
+ * @request: request in whose context the metrics are being collected.
+ * @preallocate: allocate space in ring for related sample.
+ */
+static void i915_perf_stream_emit_sample_capture(
+					struct i915_perf_stream *stream,
+					struct drm_i915_gem_request *request,
+					bool preallocate)
+{
+	struct reservation_object *resv = &stream->sampled_req_resv;
+	struct i915_perf_cs_sample *sample;
+	unsigned long flags;
+	int ret;
+
+	sample = kzalloc(sizeof(*sample), GFP_KERNEL);
+	if (sample == NULL) {
+		DRM_ERROR("Perf sample alloc failed\n");
+		return;
+	}
+
+	sample->request = i915_gem_request_get(request);
+	sample->ctx_id = request->ctx->hw_id;
+
+	insert_perf_sample(stream, sample);
+
+	if (stream->sample_flags & SAMPLE_OA_REPORT) {
+		ret = i915_emit_oa_report_capture(request,
+						  preallocate,
+						  sample->offset);
+		if (ret)
+			goto err_unref;
+	}
+
+	reservation_object_lock(resv, NULL);
+	if (reservation_object_reserve_shared(resv) == 0)
+		reservation_object_add_shared_fence(resv, &request->fence);
+	reservation_object_unlock(resv);
+
+	i915_vma_move_to_active(stream->cs_buffer.vma, request,
+					EXEC_OBJECT_WRITE);
+	return;
+
+err_unref:
+	i915_gem_request_put(sample->request);
+	spin_lock_irqsave(&stream->cs_samples_lock, flags);
+	list_del(&sample->link);
+	spin_unlock_irqrestore(&stream->cs_samples_lock, flags);
+	kfree(sample);
+}
+
+/**
+ * i915_perf_stream_release_samples - Release the perf command stream samples
+ * @stream: Stream from which samples are to be released.
+ *
+ * Drops the request reference held by every queued sample and frees it.
+ * Note: The associated requests should be completed before releasing the
+ * references here.
+ */
+static void i915_perf_stream_release_samples(struct i915_perf_stream *stream)
+{
+	struct i915_perf_cs_sample *entry, *next;
+	unsigned long flags;
+
+	/* Hold the lock across the whole walk, not just the unlink */
+	spin_lock_irqsave(&stream->cs_samples_lock, flags);
+	list_for_each_entry_safe(entry, next, &stream->cs_samples, link) {
+		i915_gem_request_put(entry->request);
+		list_del(&entry->link);
+		kfree(entry);
+	}
+	spin_unlock_irqrestore(&stream->cs_samples_lock, flags);
+}
+
+/**
  * oa_buffer_check_unlocked - check for data and update tail ptr state
  * @dev_priv: i915 device instance
  *
@@ -521,12 +806,13 @@ static int append_oa_status(struct i915_perf_stream *stream,
 }
 
 /**
- * append_oa_sample - Copies single OA report into userspace read() buffer.
- * @stream: An i915-perf stream opened for OA metrics
+ * append_perf_sample - Copies single perf sample into userspace read() buffer.
+ * @stream: An i915-perf stream opened for perf samples
  * @buf: destination buffer given by userspace
  * @count: the number of bytes userspace wants to read
  * @offset: (inout): the current position for writing into @buf
- * @report: A single OA report to (optionally) include as part of the sample
+ * @data: perf sample data which contains (optionally) metrics configured
+ * earlier when opening a stream
  *
  * The contents of a sample are configured through `DRM_I915_PERF_PROP_SAMPLE_*`
  * properties when opening a stream, tracked as `stream->sample_flags`. This
@@ -537,11 +823,11 @@ static int append_oa_status(struct i915_perf_stream *stream,
  *
  * Returns: 0 on success, negative error code on failure.
  */
-static int append_oa_sample(struct i915_perf_stream *stream,
+static int append_perf_sample(struct i915_perf_stream *stream,
 			    char __user *buf,
 			    size_t count,
 			    size_t *offset,
-			    const u8 *report)
+			    const struct i915_perf_sample_data *data)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -572,14 +858,21 @@ static int append_oa_sample(struct i915_perf_stream *stream,
 		enum drm_i915_perf_sample_oa_source source;
 
 		source = I915_PERF_SAMPLE_OA_SOURCE_OABUFFER;
-		if (copy_to_user(buf, &source, 4))
+		if (copy_to_user(buf, &data->source, 4))
+			return -EFAULT;
+		buf += 4;
+	}
+
+	if (sample_flags & SAMPLE_CTX_ID) {
+		if (copy_to_user(buf, &data->ctx_id, 4))
 			return -EFAULT;
 		buf += 4;
 	}
 
 	if (sample_flags & SAMPLE_OA_REPORT) {
-		if (copy_to_user(buf, report, report_size))
+		if (copy_to_user(buf, data->report, report_size))
 			return -EFAULT;
+		buf += report_size;
 	}
 
 	(*offset) += header.size;
@@ -588,11 +881,54 @@ static int append_oa_sample(struct i915_perf_stream *stream,
 }
 
 /**
+ * append_oa_buffer_sample - Copies single periodic OA report into userspace
+ * read() buffer.
+ * @stream: An i915-perf stream opened for OA metrics
+ * @buf: destination buffer given by userspace
+ * @count: the number of bytes userspace wants to read
+ * @offset: (inout): the current position for writing into @buf
+ * @report: A single OA report to (optionally) include as part of the sample
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+static int append_oa_buffer_sample(struct i915_perf_stream *stream,
+				   char __user *buf, size_t count,
+				   size_t *offset, const u8 *report)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	u32 sample_flags = stream->sample_flags;
+	struct i915_perf_sample_data data = { 0 };
+	const u32 *report32 = (const u32 *)report;
+
+	if (sample_flags & SAMPLE_OA_SOURCE)
+		data.source = I915_PERF_SAMPLE_OA_SOURCE_OABUFFER;
+
+	if (sample_flags & SAMPLE_CTX_ID) {
+		if (INTEL_INFO(dev_priv)->gen < 8) {
+			data.ctx_id = 0;
+		} else {
+			/*
+			 * XXX: Just keep the lower 21 bits for now since I'm
+			 * not entirely sure if the HW touches any of the higher
+			 * bits in this field
+			 */
+			data.ctx_id = report32[2] & 0x1fffff;
+		}
+	}
+
+	if (sample_flags & SAMPLE_OA_REPORT)
+		data.report = report;
+
+	return append_perf_sample(stream, buf, count, offset, &data);
+}
+
+/**
  * Copies all buffered OA reports into userspace read() buffer.
  * @stream: An i915-perf stream opened for OA metrics
  * @buf: destination buffer given by userspace
  * @count: the number of bytes userspace wants to read
  * @offset: (inout): the current position for writing into @buf
+ * @ts: copy OA reports till this timestamp
  *
  * Notably any error condition resulting in a short read (-%ENOSPC or
  * -%EFAULT) will be returned even though one or more records may
@@ -610,7 +946,8 @@ static int append_oa_sample(struct i915_perf_stream *stream,
 static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 				  char __user *buf,
 				  size_t count,
-				  size_t *offset)
+				  size_t *offset,
+				  u32 ts)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -624,7 +961,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	u32 taken;
 	int ret = 0;
 
-	if (WARN_ON(!stream->enabled))
+	if (WARN_ON(stream->state != I915_PERF_STREAM_ENABLED))
 		return -EIO;
 
 	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
@@ -670,6 +1007,11 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 		u32 *report32 = (void *)report;
 		u32 ctx_id;
 		u32 reason;
+		u32 report_ts = report32[1];
+
+		/* Report timestamp should not exceed the given ts */
+		if (report_ts > ts)
+			break;
 
 		/*
 		 * All the report sizes factor neatly into the buffer
@@ -751,23 +1093,23 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 		 * switches since it's not-uncommon for periodic samples to
 		 * identify a switch before any 'context switch' report.
 		 */
-		if (!dev_priv->perf.oa.exclusive_stream->ctx ||
-		    dev_priv->perf.oa.specific_ctx_id == ctx_id ||
+		if (!stream->ctx ||
+		    stream->engine->specific_ctx_id == ctx_id ||
 		    (dev_priv->perf.oa.oa_buffer.last_ctx_id ==
-		     dev_priv->perf.oa.specific_ctx_id) ||
+		     stream->engine->specific_ctx_id) ||
 		    reason & OAREPORT_REASON_CTX_SWITCH) {
 
 			/*
 			 * While filtering for a single context we avoid
 			 * leaking the IDs of other contexts.
 			 */
-			if (dev_priv->perf.oa.exclusive_stream->ctx &&
-			    dev_priv->perf.oa.specific_ctx_id != ctx_id) {
+			if (stream->ctx &&
+			    stream->engine->specific_ctx_id != ctx_id) {
 				report32[2] = INVALID_CTX_ID;
 			}
 
-			ret = append_oa_sample(stream, buf, count, offset,
-					       report);
+			ret = append_oa_buffer_sample(stream, buf, count,
+						      offset, report);
 			if (ret)
 				break;
 
@@ -808,6 +1150,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
  * @buf: destination buffer given by userspace
  * @count: the number of bytes userspace wants to read
  * @offset: (inout): the current position for writing into @buf
+ * @ts: copy OA reports till this timestamp
  *
  * Checks OA unit status registers and if necessary appends corresponding
  * status records for userspace (such as for a buffer full condition) and then
@@ -825,7 +1168,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 static int gen8_oa_read(struct i915_perf_stream *stream,
 			char __user *buf,
 			size_t count,
-			size_t *offset)
+			size_t *offset,
+			u32 ts)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	u32 oastatus;
@@ -878,7 +1222,7 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
 			   oastatus & ~GEN8_OASTATUS_REPORT_LOST);
 	}
 
-	return gen8_append_oa_reports(stream, buf, count, offset);
+	return gen8_append_oa_reports(stream, buf, count, offset, ts);
 }
 
 /**
@@ -887,6 +1231,7 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
  * @buf: destination buffer given by userspace
  * @count: the number of bytes userspace wants to read
  * @offset: (inout): the current position for writing into @buf
+ * @ts: copy OA reports till this timestamp
  *
  * Notably any error condition resulting in a short read (-%ENOSPC or
  * -%EFAULT) will be returned even though one or more records may
@@ -904,7 +1249,8 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
 static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 				  char __user *buf,
 				  size_t count,
-				  size_t *offset)
+				  size_t *offset,
+				  u32 ts)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -918,7 +1264,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 	u32 taken;
 	int ret = 0;
 
-	if (WARN_ON(!stream->enabled))
+	if (WARN_ON(stream->state != I915_PERF_STREAM_ENABLED))
 		return -EIO;
 
 	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
@@ -985,7 +1331,12 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 			continue;
 		}
 
-		ret = append_oa_sample(stream, buf, count, offset, report);
+		/* Report timestamp should not exceed the given ts */
+		if (report32[1] > ts)
+			break;
+
+		ret = append_oa_buffer_sample(stream, buf, count, offset,
+					      report);
 		if (ret)
 			break;
 
@@ -1023,6 +1374,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
  * @buf: destination buffer given by userspace
  * @count: the number of bytes userspace wants to read
  * @offset: (inout): the current position for writing into @buf
+ * @ts: copy OA reports till this timestamp
  *
  * Checks Gen 7 specific OA unit status registers and if necessary appends
  * corresponding status records for userspace (such as for a buffer full
@@ -1036,7 +1388,8 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 static int gen7_oa_read(struct i915_perf_stream *stream,
 			char __user *buf,
 			size_t count,
-			size_t *offset)
+			size_t *offset,
+			u32 ts)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	u32 oastatus1;
@@ -1098,16 +1451,172 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
 			GEN7_OASTATUS1_REPORT_LOST;
 	}
 
-	return gen7_append_oa_reports(stream, buf, count, offset);
+	return gen7_append_oa_reports(stream, buf, count, offset, ts);
+}
+
+/**
+ * append_cs_buffer_sample - Copies a single command stream perf sample
+ * into the userspace read() buffer.
+ * @stream: An i915-perf stream opened for perf CS metrics
+ * @buf: destination buffer given by userspace
+ * @count: the number of bytes userspace wants to read
+ * @offset: (inout): the current position for writing into @buf
+ * @node: CS sample node supplying the report offset and context ID
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+static int append_cs_buffer_sample(struct i915_perf_stream *stream,
+				char __user *buf,
+				size_t count,
+				size_t *offset,
+				struct i915_perf_cs_sample *node)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct i915_perf_sample_data data = { 0 };
+	u32 sample_flags = stream->sample_flags;
+	int ret = 0;
+
+	if (sample_flags & SAMPLE_OA_REPORT) {
+		const u8 *report = stream->cs_buffer.vaddr + node->offset;
+		u32 sample_ts = *(u32 *)(report + 4);
+
+		data.report = report;
+
+		/* First, append the periodic OA samples up to this sample's
+		 * timestamp (read from the second dword of the report)
+		 */
+		ret = dev_priv->perf.oa.ops.read(stream, buf, count, offset,
+						 sample_ts);
+		if (ret)
+			return ret;
+	}
+
+	if (sample_flags & SAMPLE_OA_SOURCE)
+		data.source = I915_PERF_SAMPLE_OA_SOURCE_CS;
+
+	if (sample_flags & SAMPLE_CTX_ID)
+		data.ctx_id = node->ctx_id;
+
+	return append_perf_sample(stream, buf, count, offset, &data);
+}
+
+/**
+ * append_cs_buffer_samples - Copies all command stream based perf samples
+ * into userspace read() buffer.
+ * @stream: An i915-perf stream opened for perf CS metrics
+ * @buf: destination buffer given by userspace
+ * @count: the number of bytes userspace wants to read
+ * @offset: (inout): the current position for writing into @buf
+ *
+ * Notably any error condition resulting in a short read (-%ENOSPC or
+ * -%EFAULT) will be returned even though one or more records may
+ * have been successfully copied. In this case it's up to the caller
+ * to decide if the error should be squashed before returning to
+ * userspace.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+static int append_cs_buffer_samples(struct i915_perf_stream *stream,
+				char __user *buf,
+				size_t count,
+				size_t *offset)
+{
+	struct i915_perf_cs_sample *entry, *next;
+	LIST_HEAD(free_list);
+	int ret = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&stream->cs_samples_lock, flags);
+	if (list_empty(&stream->cs_samples)) {
+		spin_unlock_irqrestore(&stream->cs_samples_lock, flags);
+		return 0;
+	}
+	list_for_each_entry_safe(entry, next,
+				 &stream->cs_samples, link) {
+		if (!i915_gem_request_completed(entry->request))
+			break;
+		list_move_tail(&entry->link, &free_list);
+	}
+	spin_unlock_irqrestore(&stream->cs_samples_lock, flags);
+
+	if (list_empty(&free_list))
+		return 0;
+
+	list_for_each_entry_safe(entry, next, &free_list, link) {
+		ret = append_cs_buffer_sample(stream, buf, count, offset,
+					      entry);
+		if (ret)
+			break;
+
+		list_del(&entry->link);
+		i915_gem_request_put(entry->request);
+		kfree(entry);
+	}
+
+	/* Put any uncopied entries back at the list head for the next read */
+	spin_lock_irqsave(&stream->cs_samples_lock, flags);
+	list_splice(&free_list, &stream->cs_samples);
+	spin_unlock_irqrestore(&stream->cs_samples_lock, flags);
+
+	return ret;
+}
+
+/**
+ * cs_buffer_is_empty - Checks whether the command stream buffer
+ * associated with the stream has no data available.
+ * @stream: An i915-perf stream opened for perf CS metrics
+ *
+ * Returns: false if the request of the first queued sample has completed,
+ * true if the sample list is empty or that request is still pending.
+ */
+static bool cs_buffer_is_empty(struct i915_perf_stream *stream)
+{
+	struct i915_perf_cs_sample *entry;
+	unsigned long flags;
+	bool empty = true;
+
+	/*
+	 * Test completion while still holding cs_samples_lock: the list
+	 * entry owns the request reference, and append_cs_buffer_samples()
+	 * may unlink the entry and put the request once the lock is
+	 * dropped. It already does the same check under this lock.
+	 */
+	spin_lock_irqsave(&stream->cs_samples_lock, flags);
+	entry = list_first_entry_or_null(&stream->cs_samples,
+					 struct i915_perf_cs_sample, link);
+	if (entry && i915_gem_request_completed(entry->request))
+		empty = false;
+	spin_unlock_irqrestore(&stream->cs_samples_lock, flags);
+
+	return empty;
 }
 
 /**
- * i915_oa_wait_unlocked - handles blocking IO until OA data available
+ * stream_have_data_unlocked - Checks whether the stream has data available
  * @stream: An i915-perf stream opened for OA metrics
  *
+ * For command stream based streams, check whether the command stream buffer
+ * has at least one sample available; if not, return false irrespective of
+ * whether the periodic OA buffer has data.
+ */
+
+static bool stream_have_data_unlocked(struct i915_perf_stream *stream)
+{
+	/*
+	 * CS-mode streams are gated purely on the command stream samples;
+	 * every other stream falls back to checking the periodic OA buffer.
+	 */
+	return stream->cs_mode ? !cs_buffer_is_empty(stream) :
+				 oa_buffer_check_unlocked(stream->dev_priv);
+}
+
+/**
+ * i915_perf_stream_wait_unlocked - handles blocking IO until data available
+ * @stream: An i915-perf stream opened for GPU metrics
+ *
  * Called when userspace tries to read() from a blocking stream FD opened
- * for OA metrics. It waits until the hrtimer callback finds a non-empty
- * OA buffer and wakes us.
+ * for perf metrics. It waits until the hrtimer callback finds a non-empty
+ * command stream buffer / OA buffer and wakes us.
  *
  * Note: it's acceptable to have this return with some false positives
  * since any subsequent read handling will return -EAGAIN if there isn't
@@ -1115,7 +1624,7 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
  *
  * Returns: zero on success or a negative error code
  */
-static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
+static int i915_perf_stream_wait_unlocked(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
@@ -1123,32 +1632,47 @@ static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
 	if (!dev_priv->perf.oa.periodic)
 		return -EIO;
 
-	return wait_event_interruptible(dev_priv->perf.oa.poll_wq,
-					oa_buffer_check_unlocked(dev_priv));
+	if (stream->cs_mode) {
+		long int ret;
+
+		/* Wait for all the sampled requests. */
+		ret = reservation_object_wait_timeout_rcu(
+						    &stream->sampled_req_resv,
+						    true,
+						    true,
+						    MAX_SCHEDULE_TIMEOUT);
+		if (unlikely(ret < 0)) {
+			DRM_DEBUG_DRIVER("Failed to wait for sampled requests: %li\n", ret);
+			return ret;
+		}
+	}
+
+	return wait_event_interruptible(stream->poll_wq,
+					stream_have_data_unlocked(stream));
 }
 
 /**
- * i915_oa_poll_wait - call poll_wait() for an OA stream poll()
- * @stream: An i915-perf stream opened for OA metrics
+ * i915_perf_stream_poll_wait - call poll_wait() for a stream poll()
+ * @stream: An i915-perf stream opened for GPU metrics
  * @file: An i915 perf stream file
  * @wait: poll() state table
  *
- * For handling userspace polling on an i915 perf stream opened for OA metrics,
+ * For handling userspace polling on an i915 perf stream opened for metrics,
  * this starts a poll_wait with the wait queue that our hrtimer callback wakes
- * when it sees data ready to read in the circular OA buffer.
+ * when it sees data ready to read either in command stream buffer or in the
+ * circular OA buffer.
  */
-static void i915_oa_poll_wait(struct i915_perf_stream *stream,
+static void i915_perf_stream_poll_wait(struct i915_perf_stream *stream,
 			      struct file *file,
 			      poll_table *wait)
 {
-	struct drm_i915_private *dev_priv = stream->dev_priv;
-
-	poll_wait(file, &dev_priv->perf.oa.poll_wq, wait);
+	poll_wait(file, &stream->poll_wq, wait);
 }
 
 /**
- * i915_oa_read - just calls through to &i915_oa_ops->read
- * @stream: An i915-perf stream opened for OA metrics
+ * i915_perf_stream_read - Reads perf metrics available into userspace read
+ * buffer
+ * @stream: An i915-perf stream opened for GPU metrics
  * @buf: destination buffer given by userspace
  * @count: the number of bytes userspace wants to read
  * @offset: (inout): the current position for writing into @buf
@@ -1158,14 +1682,21 @@ static void i915_oa_poll_wait(struct i915_perf_stream *stream,
  *
  * Returns: zero on success or a negative error code
  */
-static int i915_oa_read(struct i915_perf_stream *stream,
+static int i915_perf_stream_read(struct i915_perf_stream *stream,
 			char __user *buf,
 			size_t count,
 			size_t *offset)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
-	return dev_priv->perf.oa.ops.read(stream, buf, count, offset);
+	/* U32_MAX: a plain OA read is not limited by any CS sample timestamp */
+	if (stream->cs_mode)
+		return append_cs_buffer_samples(stream, buf, count, offset);
+	else if (stream->sample_flags & SAMPLE_OA_REPORT)
+		return dev_priv->perf.oa.ops.read(stream, buf, count, offset,
+						U32_MAX);
+	else
+		return -EINVAL;
 }
 
 /**
@@ -1183,7 +1714,7 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
 	if (i915.enable_execlists)
-		dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
+		stream->engine->specific_ctx_id = stream->ctx->hw_id;
 	else {
 		struct intel_engine_cs *engine = dev_priv->engine[RCS];
 		struct intel_ring *ring;
@@ -1210,7 +1741,7 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
 		 * i915_ggtt_offset() on the fly) considering the difference
 		 * with gen8+ and execlists
 		 */
-		dev_priv->perf.oa.specific_ctx_id =
+		stream->engine->specific_ctx_id =
 			i915_ggtt_offset(stream->ctx->engine[engine->id].state);
 	}
 
@@ -1229,13 +1760,13 @@ static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
 	if (i915.enable_execlists) {
-		dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
+		stream->engine->specific_ctx_id = INVALID_CTX_ID;
 	} else {
 		struct intel_engine_cs *engine = dev_priv->engine[RCS];
 
 		mutex_lock(&dev_priv->drm.struct_mutex);
 
-		dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
+		stream->engine->specific_ctx_id = INVALID_CTX_ID;
 		engine->context_unpin(engine, stream->ctx);
 
 		mutex_unlock(&dev_priv->drm.struct_mutex);
@@ -1243,6 +1774,23 @@ static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
 }
 
 static void
+free_cs_buffer(struct i915_perf_stream *stream)
+{
+	struct mutex *lock = &stream->dev_priv->drm.struct_mutex;
+	struct i915_vma *vma;
+
+	mutex_lock(lock);
+	vma = stream->cs_buffer.vma;
+	i915_gem_object_unpin_map(vma->obj);
+	i915_vma_unpin(vma);
+	i915_gem_object_put(vma->obj);
+	/* alloc_cs_buffer() WARNs on a stale vma, so clear both handles */
+	stream->cs_buffer.vma = NULL;
+	stream->cs_buffer.vaddr = NULL;
+	mutex_unlock(lock);
+}
+
+static void
 free_oa_buffer(struct drm_i915_private *i915)
 {
 	mutex_lock(&i915->drm.struct_mutex);
@@ -1257,27 +1805,42 @@ static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
 	mutex_unlock(&i915->drm.struct_mutex);
 }
 
-static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
+static void i915_perf_stream_destroy(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct intel_engine_cs *engine = stream->engine;
+	struct i915_perf_stream *engine_stream;
+	int idx;
 
-	BUG_ON(stream != dev_priv->perf.oa.exclusive_stream);
+	idx = srcu_read_lock(&engine->perf_srcu);
+	engine_stream = rcu_dereference(engine->exclusive_stream);
+	if (WARN_ON(stream != engine_stream))
+		return;
+	srcu_read_unlock(&engine->perf_srcu, idx);
 
 	/*
 	 * Unset exclusive_stream first, it might be checked while
 	 * disabling the metric set on gen8+.
 	 */
-	dev_priv->perf.oa.exclusive_stream = NULL;
+	rcu_assign_pointer(stream->engine->exclusive_stream, NULL);
+	synchronize_srcu(&stream->engine->perf_srcu);
 
-	dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
+	if (stream->using_oa) {
+		dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
 
-	free_oa_buffer(dev_priv);
+		free_oa_buffer(dev_priv);
 
-	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
-	intel_runtime_pm_put(dev_priv);
+		intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+		intel_runtime_pm_put(dev_priv);
 
-	if (stream->ctx)
-		oa_put_render_ctx_id(stream);
+		if (stream->ctx)
+			oa_put_render_ctx_id(stream);
+	}
+
+	if (stream->cs_mode) {
+		free_cs_buffer(stream);
+		reservation_object_fini(&stream->sampled_req_resv);
+	}
 
 	if (dev_priv->perf.oa.spurious_report_rs.missed) {
 		DRM_NOTE("%d spurious OA report notices suppressed due to ratelimiting\n",
@@ -1326,11 +1889,6 @@ static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
 	 * memory...
 	 */
 	memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
-
-	/* Maybe make ->pollin per-stream state if we support multiple
-	 * concurrent streams in the future.
-	 */
-	dev_priv->perf.oa.pollin = false;
 }
 
 static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
@@ -1384,33 +1942,26 @@ static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
 	 * memory...
 	 */
 	memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
-
-	/*
-	 * Maybe make ->pollin per-stream state if we support multiple
-	 * concurrent streams in the future.
-	 */
-	dev_priv->perf.oa.pollin = false;
 }
 
-static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
+static int alloc_obj(struct drm_i915_private *dev_priv,
+		     struct i915_vma **vma, u8 **vaddr)
 {
 	struct drm_i915_gem_object *bo;
-	struct i915_vma *vma;
 	int ret;
 
-	if (WARN_ON(dev_priv->perf.oa.oa_buffer.vma))
-		return -ENODEV;
+	intel_runtime_pm_get(dev_priv);
 
 	ret = i915_mutex_lock_interruptible(&dev_priv->drm);
 	if (ret)
-		return ret;
+		goto out;
 
 	BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
 	BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);
 
 	bo = i915_gem_object_create(dev_priv, OA_BUFFER_SIZE);
 	if (IS_ERR(bo)) {
-		DRM_ERROR("Failed to allocate OA buffer\n");
+		DRM_ERROR("Failed to allocate i915 perf obj\n");
 		ret = PTR_ERR(bo);
 		goto unlock;
 	}
@@ -1420,42 +1971,83 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
 		goto err_unref;
 
 	/* PreHSW required 512K alignment, HSW requires 16M */
-	vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, 0);
-	if (IS_ERR(vma)) {
-		ret = PTR_ERR(vma);
+	*vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, 0);
+	if (IS_ERR(*vma)) {
+		ret = PTR_ERR(*vma);
 		goto err_unref;
 	}
-	dev_priv->perf.oa.oa_buffer.vma = vma;
 
-	dev_priv->perf.oa.oa_buffer.vaddr =
-		i915_gem_object_pin_map(bo, I915_MAP_WB);
-	if (IS_ERR(dev_priv->perf.oa.oa_buffer.vaddr)) {
-		ret = PTR_ERR(dev_priv->perf.oa.oa_buffer.vaddr);
+	*vaddr = i915_gem_object_pin_map(bo, I915_MAP_WB);
+	if (IS_ERR(*vaddr)) {
+		ret = PTR_ERR(*vaddr);
 		goto err_unpin;
 	}
 
-	dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
-
-	DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p\n",
-			 i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma),
-			 dev_priv->perf.oa.oa_buffer.vaddr);
-
 	goto unlock;
 
 err_unpin:
-	__i915_vma_unpin(vma);
+	i915_vma_unpin(*vma);
 
 err_unref:
 	i915_gem_object_put(bo);
 
-	dev_priv->perf.oa.oa_buffer.vaddr = NULL;
-	dev_priv->perf.oa.oa_buffer.vma = NULL;
-
 unlock:
 	mutex_unlock(&dev_priv->drm.struct_mutex);
+out:
+	intel_runtime_pm_put(dev_priv);
 	return ret;
 }
 
+static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
+{
+	struct i915_vma *vma;
+	u8 *vaddr;
+	int ret;
+
+	if (WARN_ON(dev_priv->perf.oa.oa_buffer.vma))
+		return -ENODEV;
+
+	ret = alloc_obj(dev_priv, &vma, &vaddr);
+	if (ret)
+		return ret;
+
+	dev_priv->perf.oa.oa_buffer.vma = vma;
+	dev_priv->perf.oa.oa_buffer.vaddr = vaddr;
+	/* init_oa_buffer() clears oa_buffer.vaddr, so it must be set first */
+	dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
+
+	DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p\n",
+			 i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma),
+			 dev_priv->perf.oa.oa_buffer.vaddr);
+	return 0;
+}
+
+static int alloc_cs_buffer(struct i915_perf_stream *stream)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	struct i915_vma *vma;
+	u8 *vaddr;
+	int ret;
+
+	if (WARN_ON(stream->cs_buffer.vma))
+		return -ENODEV;
+
+	ret = alloc_obj(dev_priv, &vma, &vaddr);
+	if (ret)
+		return ret;
+
+	stream->cs_buffer.vma = vma;
+	stream->cs_buffer.vaddr = vaddr;
+	/* A fresh buffer must start with no queued samples; reset if not */
+	if (WARN_ON(!list_empty(&stream->cs_samples)))
+		INIT_LIST_HEAD(&stream->cs_samples);
+
+	DRM_DEBUG_DRIVER("Command stream buf initialized, gtt offset = 0x%x, vaddr = %p\n",
+			 i915_ggtt_offset(stream->cs_buffer.vma),
+			 stream->cs_buffer.vaddr);
+	return 0;
+}
+
 static void config_oa_regs(struct drm_i915_private *dev_priv,
 			   const struct i915_oa_reg *regs,
 			   int n_regs)
@@ -1860,6 +2452,10 @@ static void gen8_disable_metric_set(struct drm_i915_private *dev_priv)
 
 static void gen7_oa_enable(struct drm_i915_private *dev_priv)
 {
+	struct i915_perf_stream *stream;
+	struct intel_engine_cs *engine = dev_priv->engine[RCS];
+	int idx;
+
 	/*
 	 * Reset buf pointers so we don't forward reports from before now.
 	 *
@@ -1871,11 +2467,11 @@ static void gen7_oa_enable(struct drm_i915_private *dev_priv)
 	 */
 	gen7_init_oa_buffer(dev_priv);
 
-	if (dev_priv->perf.oa.exclusive_stream->enabled) {
-		struct i915_gem_context *ctx =
-			dev_priv->perf.oa.exclusive_stream->ctx;
-		u32 ctx_id = dev_priv->perf.oa.specific_ctx_id;
-
+	idx = srcu_read_lock(&engine->perf_srcu);
+	stream = rcu_dereference(engine->exclusive_stream);
+	if (stream->state != I915_PERF_STREAM_DISABLED) {
+		struct i915_gem_context *ctx = stream->ctx;
+		u32 ctx_id = engine->specific_ctx_id;
 		bool periodic = dev_priv->perf.oa.periodic;
 		u32 period_exponent = dev_priv->perf.oa.period_exponent;
 		u32 report_format = dev_priv->perf.oa.oa_buffer.format;
@@ -1890,6 +2486,7 @@ static void gen7_oa_enable(struct drm_i915_private *dev_priv)
 			   GEN7_OACONTROL_ENABLE);
 	} else
 		I915_WRITE(GEN7_OACONTROL, 0);
+	srcu_read_unlock(&engine->perf_srcu, idx);
 }
 
 static void gen8_oa_enable(struct drm_i915_private *dev_priv)
@@ -1918,22 +2515,23 @@ static void gen8_oa_enable(struct drm_i915_private *dev_priv)
 }
 
 /**
- * i915_oa_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for OA stream
- * @stream: An i915 perf stream opened for OA metrics
+ * i915_perf_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for perf stream
+ * @stream: An i915 perf stream opened for GPU metrics
  *
  * [Re]enables hardware periodic sampling according to the period configured
  * when opening the stream. This also starts a hrtimer that will periodically
  * check for data in the circular OA buffer for notifying userspace (e.g.
  * during a read() or poll()).
  */
-static void i915_oa_stream_enable(struct i915_perf_stream *stream)
+static void i915_perf_stream_enable(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 
-	dev_priv->perf.oa.ops.oa_enable(dev_priv);
+	if (stream->sample_flags & SAMPLE_OA_REPORT)
+		dev_priv->perf.oa.ops.oa_enable(dev_priv);
 
-	if (dev_priv->perf.oa.periodic)
-		hrtimer_start(&dev_priv->perf.oa.poll_check_timer,
+	if (stream->cs_mode || dev_priv->perf.oa.periodic)
+		hrtimer_start(&dev_priv->perf.poll_check_timer,
 			      ns_to_ktime(POLL_PERIOD),
 			      HRTIMER_MODE_REL_PINNED);
 }
@@ -1949,34 +2547,56 @@ static void gen8_oa_disable(struct drm_i915_private *dev_priv)
 }
 
 /**
- * i915_oa_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for OA stream
- * @stream: An i915 perf stream opened for OA metrics
+ * i915_perf_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for perf stream
+ * @stream: An i915 perf stream opened for GPU metrics
  *
  * Stops the OA unit from periodically writing counter reports into the
  * circular OA buffer. This also stops the hrtimer that periodically checks for
  * data in the circular OA buffer, for notifying userspace.
  */
-static void i915_oa_stream_disable(struct i915_perf_stream *stream)
+static void i915_perf_stream_disable(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	long ret;
+
+	if (stream->cs_mode || dev_priv->perf.oa.periodic)
+		hrtimer_cancel(&dev_priv->perf.poll_check_timer);
+
+	if (stream->cs_mode) {
+		/*
+		 * Wait for all the sampled requests before freeing the
+		 * samples; on failure still fall through to stop the OA unit.
+		 */
+		ret = reservation_object_wait_timeout_rcu(
+						    &stream->sampled_req_resv,
+						    true,
+						    true,
+						    MAX_SCHEDULE_TIMEOUT);
+		if (unlikely(ret < 0)) {
+			DRM_DEBUG_DRIVER("Failed to wait for sampled requests: %li\n", ret);
+			goto disable_oa;
+		}
 
-	dev_priv->perf.oa.ops.oa_disable(dev_priv);
+		i915_perf_stream_release_samples(stream);
+	}
 
-	if (dev_priv->perf.oa.periodic)
-		hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer);
+disable_oa:
+	if (stream->sample_flags & SAMPLE_OA_REPORT)
+		dev_priv->perf.oa.ops.oa_disable(dev_priv);
 }
 
-static const struct i915_perf_stream_ops i915_oa_stream_ops = {
-	.destroy = i915_oa_stream_destroy,
-	.enable = i915_oa_stream_enable,
-	.disable = i915_oa_stream_disable,
-	.wait_unlocked = i915_oa_wait_unlocked,
-	.poll_wait = i915_oa_poll_wait,
-	.read = i915_oa_read,
+static const struct i915_perf_stream_ops perf_stream_ops = {
+	.destroy = i915_perf_stream_destroy,
+	.enable = i915_perf_stream_enable,
+	.disable = i915_perf_stream_disable,
+	.wait_unlocked = i915_perf_stream_wait_unlocked,
+	.poll_wait = i915_perf_stream_poll_wait,
+	.read = i915_perf_stream_read,
+	.emit_sample_capture = i915_perf_stream_emit_sample_capture,
 };
 
 /**
- * i915_oa_stream_init - validate combined props for OA stream and init
+ * i915_perf_stream_init - validate combined props for stream and init
  * @stream: An i915 perf stream
  * @param: The open parameters passed to `DRM_I915_PERF_OPEN`
  * @props: The property state that configures stream (individually validated)
@@ -1985,58 +2605,35 @@ static void i915_oa_stream_disable(struct i915_perf_stream *stream)
  * doesn't ensure that the combination necessarily makes sense.
  *
  * At this point it has been determined that userspace wants a stream of
- * OA metrics, but still we need to further validate the combined
+ * perf metrics, but still we need to further validate the combined
  * properties are OK.
  *
  * If the configuration makes sense then we can allocate memory for
- * a circular OA buffer and apply the requested metric set configuration.
+ * a circular perf buffer and apply the requested metric set configuration.
  *
  * Returns: zero on success or a negative error code.
  */
-static int i915_oa_stream_init(struct i915_perf_stream *stream,
+static int i915_perf_stream_init(struct i915_perf_stream *stream,
 			       struct drm_i915_perf_open_param *param,
 			       struct perf_open_properties *props)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
-	int format_size;
+	bool require_oa_unit = props->sample_flags & (SAMPLE_OA_REPORT |
+						      SAMPLE_OA_SOURCE);
+	bool cs_sample_data = props->sample_flags & SAMPLE_OA_REPORT;
+	struct i915_perf_stream *curr_stream;
+	struct intel_engine_cs *engine = NULL;
+	int idx;
 	int ret;
 
-	/* If the sysfs metrics/ directory wasn't registered for some
-	 * reason then don't let userspace try their luck with config
-	 * IDs
-	 */
-	if (!dev_priv->perf.metrics_kobj) {
-		DRM_DEBUG("OA metrics weren't advertised via sysfs\n");
-		return -EINVAL;
-	}
-
-	if (!(props->sample_flags & SAMPLE_OA_REPORT)) {
-		DRM_DEBUG("Only OA report sampling supported\n");
-		return -EINVAL;
-	}
-
-	if (!dev_priv->perf.oa.ops.init_oa_buffer) {
-		DRM_DEBUG("OA unit not supported\n");
-		return -ENODEV;
-	}
-
-	/* To avoid the complexity of having to accurately filter
-	 * counter reports and marshal to the appropriate client
-	 * we currently only allow exclusive access
-	 */
-	if (dev_priv->perf.oa.exclusive_stream) {
-		DRM_DEBUG("OA unit already in use\n");
-		return -EBUSY;
-	}
-
-	if (!props->metrics_set) {
-		DRM_DEBUG("OA metric set not specified\n");
-		return -EINVAL;
-	}
-
-	if (!props->oa_format) {
-		DRM_DEBUG("OA report format not specified\n");
-		return -EINVAL;
+	if ((props->sample_flags & SAMPLE_CTX_ID) && !props->cs_mode) {
+		if (IS_HASWELL(dev_priv)) {
+			DRM_ERROR("On HSW, context ID sampling only supported via command stream\n");
+			return -EINVAL;
+		} else if (!i915.enable_execlists) {
+			DRM_ERROR("On Gen8+ without execlists, context ID sampling only supported via command stream\n");
+			return -EINVAL;
+		}
 	}
 
 	/* We set up some ratelimit state to potentially throttle any _NOTES
@@ -2061,70 +2658,166 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 
 	stream->sample_size = sizeof(struct drm_i915_perf_record_header);
 
-	format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size;
+	if (require_oa_unit) {
+		int format_size;
 
-	stream->sample_flags |= SAMPLE_OA_REPORT;
-	stream->sample_size += format_size;
+		/* If the sysfs metrics/ directory wasn't registered for some
+		 * reason then don't let userspace try their luck with config
+		 * IDs
+		 */
+		if (!dev_priv->perf.metrics_kobj) {
+			DRM_DEBUG("OA metrics weren't advertised via sysfs\n");
+			return -EINVAL;
+		}
 
-	if (props->sample_flags & SAMPLE_OA_SOURCE) {
-		stream->sample_flags |= SAMPLE_OA_SOURCE;
-		stream->sample_size += 4;
-	}
+		if (!dev_priv->perf.oa.ops.init_oa_buffer) {
+			DRM_DEBUG("OA unit not supported\n");
+			return -ENODEV;
+		}
 
-	dev_priv->perf.oa.oa_buffer.format_size = format_size;
-	if (WARN_ON(dev_priv->perf.oa.oa_buffer.format_size == 0))
-		return -EINVAL;
+		if (!props->metrics_set) {
+			DRM_DEBUG("OA metric set not specified\n");
+			return -EINVAL;
+		}
+
+		if (!props->oa_format) {
+			DRM_DEBUG("OA report format not specified\n");
+			return -EINVAL;
+		}
+
+		if (props->cs_mode && (props->engine != RCS)) {
+			DRM_ERROR("Command stream OA metrics only available via Render CS\n");
+			return -EINVAL;
+		}
+
+		engine = dev_priv->engine[RCS];
+		stream->using_oa = true;
+
+		idx = srcu_read_lock(&engine->perf_srcu);
+		curr_stream = rcu_dereference(engine->exclusive_stream);
+		if (curr_stream) {
+			DRM_ERROR("Stream already opened\n");
+			ret = -EINVAL;
+			goto err_enable;
+		}
+		srcu_read_unlock(&engine->perf_srcu, idx);
+
+		format_size =
+			dev_priv->perf.oa.oa_formats[props->oa_format].size;
+
+		if (props->sample_flags & SAMPLE_OA_REPORT) {
+			stream->sample_flags |= SAMPLE_OA_REPORT;
+			stream->sample_size += format_size;
+		}
+
+		if (props->sample_flags & SAMPLE_OA_SOURCE) {
+			if (!(props->sample_flags & SAMPLE_OA_REPORT)) {
+				DRM_ERROR("OA source type can't be sampled without OA report\n");
+				return -EINVAL;
+			}
+			stream->sample_flags |= SAMPLE_OA_SOURCE;
+			stream->sample_size += 4;
+		}
+
+		dev_priv->perf.oa.oa_buffer.format_size = format_size;
+		if (WARN_ON(dev_priv->perf.oa.oa_buffer.format_size == 0))
+			return -EINVAL;
+
+		dev_priv->perf.oa.oa_buffer.format =
+			dev_priv->perf.oa.oa_formats[props->oa_format].format;
+
+		dev_priv->perf.oa.metrics_set = props->metrics_set;
 
-	dev_priv->perf.oa.oa_buffer.format =
-		dev_priv->perf.oa.oa_formats[props->oa_format].format;
+		dev_priv->perf.oa.periodic = props->oa_periodic;
+		if (dev_priv->perf.oa.periodic)
+			dev_priv->perf.oa.period_exponent =
+				props->oa_period_exponent;
 
-	dev_priv->perf.oa.metrics_set = props->metrics_set;
+		if (stream->ctx) {
+			ret = oa_get_render_ctx_id(stream);
+			if (ret)
+				return ret;
+		}
+
+		/* PRM - observability performance counters:
+		 *
+		 *   OACONTROL, performance counter enable, note:
+		 *
+		 *   "When this bit is set, in order to have coherent counts,
+		 *   RC6 power state and trunk clock gating must be disabled.
+		 *   This can be achieved by programming MMIO registers as
+		 *   0xA094=0 and 0xA090[31]=1"
+		 *
+		 *   In our case we are expecting that taking pm + FORCEWAKE
+		 *   references will effectively disable RC6.
+		 */
+		intel_runtime_pm_get(dev_priv);
+		intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 
-	dev_priv->perf.oa.periodic = props->oa_periodic;
-	if (dev_priv->perf.oa.periodic)
-		dev_priv->perf.oa.period_exponent = props->oa_period_exponent;
+		ret = alloc_oa_buffer(dev_priv);
+		if (ret)
+			goto err_oa_buf_alloc;
 
-	if (stream->ctx) {
-		ret = oa_get_render_ctx_id(stream);
+		ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv);
 		if (ret)
-			return ret;
+			goto err_enable;
 	}
 
-	/* PRM - observability performance counters:
-	 *
-	 *   OACONTROL, performance counter enable, note:
-	 *
-	 *   "When this bit is set, in order to have coherent counts,
-	 *   RC6 power state and trunk clock gating must be disabled.
-	 *   This can be achieved by programming MMIO registers as
-	 *   0xA094=0 and 0xA090[31]=1"
-	 *
-	 *   In our case we are expecting that taking pm + FORCEWAKE
-	 *   references will effectively disable RC6.
-	 */
-	intel_runtime_pm_get(dev_priv);
-	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+	if (props->sample_flags & SAMPLE_CTX_ID) {
+		stream->sample_flags |= SAMPLE_CTX_ID;
+		stream->sample_size += 4;
+	}
 
-	ret = alloc_oa_buffer(dev_priv);
-	if (ret)
-		goto err_oa_buf_alloc;
+	if (props->cs_mode) {
+		if (!cs_sample_data) {
+			DRM_ERROR("Stream engine given without requesting any CS data to sample\n");
+			ret = -EINVAL;
+			goto err_enable;
+		}
 
-	ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv);
-	if (ret)
-		goto err_enable;
+		if (!(props->sample_flags & SAMPLE_CTX_ID)) {
+			DRM_ERROR("Stream engine given without requesting any CS specific property\n");
+			ret = -EINVAL;
+			goto err_enable;
+		}
+
+		engine = dev_priv->engine[props->engine];
+
+		idx = srcu_read_lock(&engine->perf_srcu);
+		curr_stream = rcu_dereference(engine->exclusive_stream);
+		if (curr_stream) {
+			DRM_ERROR("Stream already opened\n");
+			ret = -EINVAL;
+			goto err_enable;
+		}
+		srcu_read_unlock(&engine->perf_srcu, idx);
 
-	stream->ops = &i915_oa_stream_ops;
+		INIT_LIST_HEAD(&stream->cs_samples);
+		ret = alloc_cs_buffer(stream);
+		if (ret)
+			goto err_enable;
+
+		stream->cs_mode = true;
+		reservation_object_init(&stream->sampled_req_resv);
+	}
 
-	dev_priv->perf.oa.exclusive_stream = stream;
+	init_waitqueue_head(&stream->poll_wq);
+	stream->pollin = false;
+	stream->ops = &perf_stream_ops;
+	stream->engine = engine;
+	rcu_assign_pointer(engine->exclusive_stream, stream);
 
 	return 0;
 
 err_enable:
-	free_oa_buffer(dev_priv);
+	if (require_oa_unit)
+		free_oa_buffer(dev_priv);
 
 err_oa_buf_alloc:
-	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
-	intel_runtime_pm_put(dev_priv);
+	if (require_oa_unit) {
+		intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+		intel_runtime_pm_put(dev_priv);
+	}
 	if (stream->ctx)
 		oa_put_render_ctx_id(stream);
 
@@ -2220,7 +2913,7 @@ static ssize_t i915_perf_read(struct file *file,
 	 * disabled stream as an error. In particular it might otherwise lead
 	 * to a deadlock for blocking file descriptors...
 	 */
-	if (!stream->enabled)
+	if (stream->state == I915_PERF_STREAM_DISABLED)
 		return -EIO;
 
 	if (!(file->f_flags & O_NONBLOCK)) {
@@ -2255,25 +2948,31 @@ static ssize_t i915_perf_read(struct file *file,
 	 * effectively ensures we back off until the next hrtimer callback
 	 * before reporting another POLLIN event.
 	 */
-	if (ret >= 0 || ret == -EAGAIN) {
-		/* Maybe make ->pollin per-stream state if we support multiple
-		 * concurrent streams in the future.
-		 */
-		dev_priv->perf.oa.pollin = false;
-	}
+	if (ret >= 0 || ret == -EAGAIN)
+		stream->pollin = false;
 
 	return ret;
 }
 
-static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
+static enum hrtimer_restart poll_check_timer_cb(struct hrtimer *hrtimer)
 {
+	struct i915_perf_stream *stream;
 	struct drm_i915_private *dev_priv =
 		container_of(hrtimer, typeof(*dev_priv),
-			     perf.oa.poll_check_timer);
-
-	if (oa_buffer_check_unlocked(dev_priv)) {
-		dev_priv->perf.oa.pollin = true;
-		wake_up(&dev_priv->perf.oa.poll_wq);
+			     perf.poll_check_timer);
+	int idx;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	for_each_engine(engine, dev_priv, id) {
+		idx = srcu_read_lock(&engine->perf_srcu);
+		stream = rcu_dereference(engine->exclusive_stream);
+		if (stream && (stream->state == I915_PERF_STREAM_ENABLED) &&
+		    stream_have_data_unlocked(stream)) {
+			stream->pollin = true;
+			wake_up(&stream->poll_wq);
+		}
+		srcu_read_unlock(&engine->perf_srcu, idx);
 	}
 
 	hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
@@ -2312,7 +3011,7 @@ static unsigned int i915_perf_poll_locked(struct drm_i915_private *dev_priv,
 	 * the hrtimer/oa_poll_check_timer_cb to notify us when there are
 	 * samples to read.
 	 */
-	if (dev_priv->perf.oa.pollin)
+	if (stream->pollin)
 		events |= POLLIN;
 
 	return events;
@@ -2356,14 +3055,16 @@ static unsigned int i915_perf_poll(struct file *file, poll_table *wait)
  */
 static void i915_perf_enable_locked(struct i915_perf_stream *stream)
 {
-	if (stream->enabled)
+	if (stream->state != I915_PERF_STREAM_DISABLED)
 		return;
 
 	/* Allow stream->ops->enable() to refer to this */
-	stream->enabled = true;
+	stream->state = I915_PERF_STREAM_ENABLE_IN_PROGRESS;
 
 	if (stream->ops->enable)
 		stream->ops->enable(stream);
+
+	stream->state = I915_PERF_STREAM_ENABLED;
 }
 
 /**
@@ -2382,11 +3083,11 @@ static void i915_perf_enable_locked(struct i915_perf_stream *stream)
  */
 static void i915_perf_disable_locked(struct i915_perf_stream *stream)
 {
-	if (!stream->enabled)
+	if (stream->state != I915_PERF_STREAM_ENABLED)
 		return;
 
 	/* Allow stream->ops->disable() to refer to this */
-	stream->enabled = false;
+	stream->state = I915_PERF_STREAM_DISABLED;
 
 	if (stream->ops->disable)
 		stream->ops->disable(stream);
@@ -2458,14 +3159,12 @@ static long i915_perf_ioctl(struct file *file,
  */
 static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
 {
-	if (stream->enabled)
+	if (stream->state == I915_PERF_STREAM_ENABLED)
 		i915_perf_disable_locked(stream);
 
 	if (stream->ops->destroy)
 		stream->ops->destroy(stream);
 
-	list_del(&stream->link);
-
 	if (stream->ctx)
 		i915_gem_context_put(stream->ctx);
 
@@ -2616,7 +3315,7 @@ static int i915_perf_release(struct inode *inode, struct file *file)
 	stream->dev_priv = dev_priv;
 	stream->ctx = specific_ctx;
 
-	ret = i915_oa_stream_init(stream, param, props);
+	ret = i915_perf_stream_init(stream, param, props);
 	if (ret)
 		goto err_alloc;
 
@@ -2629,8 +3328,6 @@ static int i915_perf_release(struct inode *inode, struct file *file)
 		goto err_flags;
 	}
 
-	list_add(&stream->link, &dev_priv->perf.streams);
-
 	if (param->flags & I915_PERF_FLAG_FD_CLOEXEC)
 		f_flags |= O_CLOEXEC;
 	if (param->flags & I915_PERF_FLAG_FD_NONBLOCK)
@@ -2648,7 +3345,6 @@ static int i915_perf_release(struct inode *inode, struct file *file)
 	return stream_fd;
 
 err_open:
-	list_del(&stream->link);
 err_flags:
 	if (stream->ops->destroy)
 		stream->ops->destroy(stream);
@@ -2797,6 +3493,29 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
 		case DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE:
 			props->sample_flags |= SAMPLE_OA_SOURCE;
 			break;
+		case DRM_I915_PERF_PROP_ENGINE: {
+				unsigned int user_ring_id =
+					value & I915_EXEC_RING_MASK;
+				enum intel_engine_id engine;
+
+				if (user_ring_id > I915_USER_RINGS)
+					return -EINVAL;
+
+				/* XXX: Currently only RCS is supported.
+				 * Remove this check when support for other
+				 * engines is added.
+				 */
+				engine = user_ring_map[user_ring_id];
+				if (engine != RCS)
+					return -EINVAL;
+
+				props->cs_mode = true;
+				props->engine = engine;
+			}
+			break;
+		case DRM_I915_PERF_PROP_SAMPLE_CTX_ID:
+			props->sample_flags |= SAMPLE_CTX_ID;
+			break;
 		case DRM_I915_PERF_PROP_MAX:
 			MISSING_CASE(id);
 			return -EINVAL;
@@ -3148,12 +3867,10 @@ void i915_perf_init(struct drm_i915_private *dev_priv)
 	}
 
 	if (dev_priv->perf.oa.n_builtin_sets) {
-		hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
+		hrtimer_init(&dev_priv->perf.poll_check_timer,
 				CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-		dev_priv->perf.oa.poll_check_timer.function = oa_poll_check_timer_cb;
-		init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
+		dev_priv->perf.poll_check_timer.function = poll_check_timer_cb;
 
-		INIT_LIST_HEAD(&dev_priv->perf.streams);
 		mutex_init(&dev_priv->perf.lock);
 		spin_lock_init(&dev_priv->perf.oa.oa_buffer.ptr_lock);
 
@@ -3171,6 +3888,9 @@ void i915_perf_init(struct drm_i915_private *dev_priv)
  */
 void i915_perf_fini(struct drm_i915_private *dev_priv)
 {
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
 	if (!dev_priv->perf.initialized)
 		return;
 
@@ -3178,5 +3898,9 @@ void i915_perf_fini(struct drm_i915_private *dev_priv)
 
 	memset(&dev_priv->perf.oa.ops, 0, sizeof(dev_priv->perf.oa.ops));
 
+	for_each_engine(engine, dev_priv, id) {
+		cleanup_srcu_struct(&engine->perf_srcu);
+	}
+
 	dev_priv->perf.initialized = false;
 }
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 24db316..17d9a0a 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -317,6 +317,10 @@ int intel_engines_init(struct drm_i915_private *dev_priv)
 			goto cleanup;
 
 		GEM_BUG_ON(!engine->submit_request);
+
+		/* Perf stream related initialization for Engine */
+		rcu_assign_pointer(engine->exclusive_stream, NULL);
+		init_srcu_struct(&engine->perf_srcu);
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index d33c934..0ac8491 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -441,6 +441,11 @@ struct intel_engine_cs {
 	 * certain bits to encode the command length in the header).
 	 */
 	u32 (*get_cmd_length_mask)(u32 cmd_header);
+
+	/* Global per-engine stream */
+	struct srcu_struct perf_srcu;
+	struct i915_perf_stream __rcu *exclusive_stream;
+	u32 specific_ctx_id;
 };
 
 static inline unsigned int
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 59074df..c1bdc07 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1350,6 +1350,7 @@ enum drm_i915_oa_format {
 
 enum drm_i915_perf_sample_oa_source {
 	I915_PERF_SAMPLE_OA_SOURCE_OABUFFER,
+	I915_PERF_SAMPLE_OA_SOURCE_CS,
 	I915_PERF_SAMPLE_OA_SOURCE_MAX	/* non-ABI */
 };
 
@@ -1394,6 +1395,19 @@ enum drm_i915_perf_property_id {
 	 */
 	DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE,
 
+	/**
+	 * The value of this property specifies the GPU engine for which
+	 * samples are to be collected. Specifying this property also
+	 * implies command stream based sample collection.
+	 */
+	DRM_I915_PERF_PROP_ENGINE,
+
+	/**
+	 * Setting this property to 1 requests inclusion of the context ID
+	 * in the perf sample data.
+	 */
+	DRM_I915_PERF_PROP_SAMPLE_CTX_ID,
+
 	DRM_I915_PERF_PROP_MAX /* non-ABI */
 };
 
@@ -1460,6 +1474,7 @@ enum drm_i915_perf_record_type {
 	 *     struct drm_i915_perf_record_header header;
 	 *
 	 *     { u32 source; } && DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE
+	 *     { u32 ctx_id; } && DRM_I915_PERF_PROP_SAMPLE_CTX_ID
 	 *     { u32 oa_report[]; } && DRM_I915_PERF_PROP_SAMPLE_OA
 	 * };
 	 */
-- 
1.9.1