[Intel-gfx] [PATCH 5/6] drm/i915: Add OA unit support for Gen 8+

Wed Feb 22 16:36:33 UTC 2017

Enables access to OA unit metrics for BDW, CHV, SKL and BXT which all
share (more-or-less) the same OA unit design.

Of particular note in comparison to Haswell: some OA unit HW config
state has become per-context state and as a consequence it is somewhat
more complicated to manage synchronous state changes from the cpu while
there's no guarantee of what context (if any) is currently actively
running on the gpu.

The periodic sampling frequency which can be particularly useful for
system-wide analysis (as opposed to command stream synchronised
MI_REPORT_PERF_COUNT commands) is perhaps the most surprising state to
have become per-context save and restored (while the OABUFFER
destination is still a shared, system-wide resource).

This support for gen8+ takes care to consider a number of timing
challenges involved in synchronously updating per-context state
primarily by programming all config state from the cpu and updating all
current and saved contexts synchronously while the OA unit is still
disabled.

The driver intentionally avoids depending on command streamer
programming to update OA state considering the lack of synchronization
between the automatic loading of OACTXCONTROL state (that includes the
periodic sampling state and enable state) on context restore and the
parsing of any general purpose BB the driver can control. I.e. this
implementation is careful to avoid the possibility of a context restore
temporarily enabling any out-of-date periodic sampling state. In
addition to the risk of transiently-out-of-date state being loaded
automatically; there are also internal HW latencies involved in the
loading of MUX configurations which would be difficult to account for
from the command streamer (and we only want to enable the unit when once
the MUX configuration is complete).

Since the Gen8+ OA unit design no longer supports clock gating the unit
off for a single given context (which effectively stopped any progress
of counters while any other context was running) and instead supports
tagging OA reports with a context ID for filtering on the CPU, it means
we can no longer hide the system-wide progress of counters from a
non-privileged application only interested in metrics for its own
context. Although we could theoretically try and subtract the progress
of other contexts before forwarding reports via read() we aren't in a
position to filter reports captured via MI_REPORT_PERF_COUNT commands.
As a result, for Gen8+, we always require the
dev.i915.perf_stream_paranoid to be unset for any access to OA metrics
if not root.

Signed-off-by: Robert Bragg <robert at sixbynine.org>
---
 drivers/gpu/drm/i915/i915_drv.h         |   14 +
 drivers/gpu/drm/i915/i915_gem_context.h |    1 +
 drivers/gpu/drm/i915/i915_perf.c        | 1067 +++++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_reg.h         |    9 +
 drivers/gpu/drm/i915/intel_lrc.c        |    4 +
 include/uapi/drm/i915_drm.h             |   19 +-
 6 files changed, 1044 insertions(+), 70 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d5e084b46023..b6d617e308af 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2443,6 +2443,7 @@ struct drm_i915_private {
 			struct {
 				struct i915_vma *vma;
 				u8 *vaddr;
+				u32 last_ctx_id;
 				int format;
 				int format_size;
 
@@ -2512,6 +2513,14 @@ struct drm_i915_private {
 			} oa_buffer;
 
 			u32 gen7_latched_oastatus1;
+			u32 ctx_oactxctrl_off;
+			u32 ctx_flexeu0_off;
+
+			/* The RPT_ID/reason field for Gen8+ includes a bit
+			 * to determine if the CTX ID in the report is valid
+			 * but the specific bit differs between Gen 8 and 9
+			 */
+			u32 gen8_valid_ctx_bit;
 
 			struct i915_oa_ops ops;
 			const struct i915_oa_format *oa_formats;
@@ -2823,6 +2832,8 @@ intel_info(const struct drm_i915_private *dev_priv)
 #define IS_KBL_ULX(dev_priv)	(INTEL_DEVID(dev_priv) == 0x590E || \
 				 INTEL_DEVID(dev_priv) == 0x5915 || \
 				 INTEL_DEVID(dev_priv) == 0x591E)
+#define IS_SKL_GT2(dev_priv)	(IS_SKYLAKE(dev_priv) && \
+				 (INTEL_DEVID(dev_priv) & 0x00F0) == 0x0010)
 #define IS_SKL_GT3(dev_priv)	(IS_SKYLAKE(dev_priv) && \
 				 (INTEL_DEVID(dev_priv) & 0x00F0) == 0x0020)
 #define IS_SKL_GT4(dev_priv)	(IS_SKYLAKE(dev_priv) && \
@@ -3566,6 +3577,9 @@ i915_gem_context_lookup_timeline(struct i915_gem_context *ctx,
 
 int i915_perf_open_ioctl(struct drm_device *dev, void *data,
 			 struct drm_file *file);
+void i915_oa_update_reg_state(struct intel_engine_cs *engine,
+			      struct i915_gem_context *ctx,
+			      uint32_t *reg_state);
 
 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct i915_address_space *vm,
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 0ac750b90f3d..32c7921faeb0 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -153,6 +153,7 @@ struct i915_gem_context {
 		u64 lrc_desc;
 		int pin_count;
 		bool initialised;
+		atomic_t oa_state_dirty;
 	} engine[I915_NUM_ENGINES];
 
 	/** ring_size: size for allocating the per-engine ring buffer */
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index a901bcd80263..d1591e2ed735 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -196,6 +196,12 @@
 
 #include "i915_drv.h"
 #include "i915_oa_hsw.h"
+#include "i915_oa_bdw.h"
+#include "i915_oa_chv.h"
+#include "i915_oa_sklgt2.h"
+#include "i915_oa_sklgt3.h"
+#include "i915_oa_sklgt4.h"
+#include "i915_oa_bxt.h"
 
 /* HW requires this to be a power of two, between 128k and 16M, though driver
  * is currently generally designed assuming the largest 16M size is used such
@@ -272,6 +278,13 @@ static u32 i915_perf_stream_paranoid = true;
 
 #define INVALID_CTX_ID 0xffffffff
 
+/* On Gen8+ automatically triggered OA reports include a 'reason' field... */
+#define OAREPORT_REASON_MASK           0x3f
+#define OAREPORT_REASON_SHIFT          19
+#define OAREPORT_REASON_TIMER          (1<<0)
+#define OAREPORT_REASON_CTX_SWITCH     (1<<3)
+#define OAREPORT_REASON_CLK_RATIO      (1<<5)
+
 
 /* For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate
  *
@@ -303,6 +316,13 @@ static struct i915_oa_format hsw_oa_formats[I915_OA_FORMAT_MAX] = {
 	[I915_OA_FORMAT_C4_B8]	    = { 7, 64 },
 };
 
+static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
+	[I915_OA_FORMAT_A12]		    = { 0, 64 },
+	[I915_OA_FORMAT_A12_B8_C8]	    = { 2, 128 },
+	[I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
+	[I915_OA_FORMAT_C4_B8]		    = { 7, 64 },
+};
+
 #define SAMPLE_OA_REPORT      (1<<0)
 
 /**
@@ -333,6 +353,122 @@ struct perf_open_properties {
 };
 
 /**
+ * gen8_oa_buffer_check_unlocked - check for data and update tail ptr state
+ * @dev_priv: i915 device instance
+ *
+ * This is either called via fops (for blocking reads in user ctx) or the poll
+ * check hrtimer (atomic ctx) to check the OA buffer tail pointer and check
+ * if there is data available for userspace to read.
+ *
+ * This function is central to providing a workaround for the OA unit tail
+ * pointer having a race with respect to what data is visible to the CPU.
+ * It is responsible for reading tail pointers from the hardware and giving
+ * the pointers time to 'age' before they are made available for reading.
+ * (See description of OA_TAIL_MARGIN_NSEC above for further details.)
+ *
+ * Besides returning true when there is data available to read() this function
+ * also has the side effect of updating the oa_buffer.tails[], .aging_timestamp
+ * and .aged_tail_idx state used for reading.
+ *
+ * Note: It's safe to read OA config state here unlocked, assuming that this is
+ * only called while the stream is enabled, while the global OA configuration
+ * can't be modified.
+ *
+ * Returns: %true if the OA buffer contains data, else %false
+ */
+static bool gen8_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
+{
+	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
+	unsigned long flags;
+	unsigned int aged_idx;
+	u32 head, hw_tail, aged_tail, aging_tail;
+	u64 now;
+
+	/* We have to consider the (unlikely) possibility that read() errors
+	 * could result in an OA buffer reset which might reset the head,
+	 * tails[] and aged_tail state.
+	 */
+	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
+
+	/* NB: The head we observe here might effectively be a little out of
+	 * date (between head and tails[aged_idx].offset if there is currently
+	 * a read() in progress.
+	 */
+	head = dev_priv->perf.oa.oa_buffer.head;
+
+	aged_idx = dev_priv->perf.oa.oa_buffer.aged_tail_idx;
+	aged_tail = dev_priv->perf.oa.oa_buffer.tails[aged_idx].offset;
+	aging_tail = dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset;
+
+	hw_tail = I915_READ(GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK;
+
+	/* The tail pointer increases in 64 byte increments,
+	 * not in report_size steps...
+	 */
+	hw_tail &= ~(report_size - 1);
+
+	now = ktime_get_mono_fast_ns();
+
+	/* Update the aged tail
+	 *
+	 * Flip the tail pointer available for read()s once the aging tail is
+	 * old enough to trust that the corresponding data will be visible to
+	 * the CPU...
+	 *
+	 * Do this before updating the aging pointer in case we may be able to
+	 * immediately start aging a new pointer too (if new data has become
+	 * available) without needing to wait for a later hrtimer callback.
+	 */
+	if (aging_tail != INVALID_TAIL_PTR &&
+	    ((now - dev_priv->perf.oa.oa_buffer.aging_timestamp) >
+	     OA_TAIL_MARGIN_NSEC)) {
+
+		aged_idx ^= 1;
+		dev_priv->perf.oa.oa_buffer.aged_tail_idx = aged_idx;
+
+		aged_tail = aging_tail;
+
+		/* Mark that we need a new pointer to start aging... */
+		dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset = INVALID_TAIL_PTR;
+		aging_tail = INVALID_TAIL_PTR;
+	}
+
+	/* Update the aging tail
+	 *
+	 * We throttle aging tail updates until we have a new tail that
+	 * represents >= one report more data than is already available for
+	 * reading. This ensures there will be enough data for a successful
+	 * read once this new pointer has aged and ensures we will give the new
+	 * pointer time to age.
+	 */
+	if (aging_tail == INVALID_TAIL_PTR &&
+	    (aged_tail == INVALID_TAIL_PTR ||
+	     OA_TAKEN(hw_tail, aged_tail) >= report_size)) {
+		struct i915_vma *vma = dev_priv->perf.oa.oa_buffer.vma;
+		u32 gtt_offset = i915_ggtt_offset(vma);
+
+		/* Be paranoid and do a bounds check on the pointer read back
+		 * from hardware, just in case some spurious hardware condition
+		 * could put the tail out of bounds...
+		 */
+		if (hw_tail >= gtt_offset &&
+		    hw_tail < (gtt_offset + OA_BUFFER_SIZE)) {
+			dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset =
+				aging_tail = hw_tail;
+			dev_priv->perf.oa.oa_buffer.aging_timestamp = now;
+		} else {
+			DRM_ERROR("Ignoring spurious out of range OA buffer tail pointer = %u\n",
+				  hw_tail);
+		}
+	}
+
+	spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
+
+	return aged_tail == INVALID_TAIL_PTR ?
+		false : OA_TAKEN(aged_tail, head) >= report_size;
+}
+
+/**
  * gen7_oa_buffer_check_unlocked - check for data and update tail ptr state
  * @dev_priv: i915 device instance
  *
@@ -530,7 +666,284 @@ static int append_oa_sample(struct i915_perf_stream *stream,
 
 	(*offset) += header.size;
 
-	return 0;
+	return 0;
+}
+
+/**
+ * Copies all buffered OA reports into userspace read() buffer.
+ * @stream: An i915-perf stream opened for OA metrics
+ * @buf: destination buffer given by userspace
+ * @count: the number of bytes userspace wants to read
+ * @offset: (inout): the current position for writing into @buf
+ *
+ * Notably any error condition resulting in a short read (-%ENOSPC or
+ * -%EFAULT) will be returned even though one or more records may
+ * have been successfully copied. In this case it's up to the caller
+ * to decide if the error should be squashed before returning to
+ * userspace.
+ *
+ * Note: reports are consumed from the head, and appended to the
+ * tail, so the tail chases the head?... If you think that's mad
+ * and back-to-front you're not alone, but this follows the
+ * Gen PRM naming convention.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+static int gen8_append_oa_reports(struct i915_perf_stream *stream,
+				  char __user *buf,
+				  size_t count,
+				  size_t *offset)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
+	u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
+	u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
+	u32 mask = (OA_BUFFER_SIZE - 1);
+	size_t start_offset = *offset;
+	unsigned long flags;
+	unsigned int aged_tail_idx;
+	u32 head, tail;
+	u32 taken;
+	int ret = 0;
+
+	if (WARN_ON(!stream->enabled))
+		return -EIO;
+
+	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
+
+	head = dev_priv->perf.oa.oa_buffer.head;
+	aged_tail_idx = dev_priv->perf.oa.oa_buffer.aged_tail_idx;
+	tail = dev_priv->perf.oa.oa_buffer.tails[aged_tail_idx].offset;
+
+	spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
+
+	/* An invalid tail pointer here means we're still waiting for the poll
+	 * hrtimer callback to give us a pointer
+	 */
+	if (tail == INVALID_TAIL_PTR)
+		return -EAGAIN;
+
+	/* NB: oa_buffer.head/tail include the gtt_offset which we don't want
+	 * while indexing relative to oa_buf_base.
+	 */
+	head -= gtt_offset;
+	tail -= gtt_offset;
+
+	/* An out of bounds or misaligned head or tail pointer implies a driver
+	 * bug since we validate + align the tail pointers we read from the
+	 * hardware and we are in full control of the head pointer which should
+	 * only be incremented by multiples of the report size (notably also
+	 * all a power of two).
+	 */
+	if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
+		      tail > OA_BUFFER_SIZE || tail % report_size,
+		      "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
+		      head, tail))
+		return -EIO;
+
+
+	for (/* none */;
+	     (taken = OA_TAKEN(tail, head));
+	     head = (head + report_size) & mask) {
+		u8 *report = oa_buf_base + head;
+		u32 *report32 = (void *)report;
+		u32 ctx_id;
+		u32 reason;
+
+		/* All the report sizes factor neatly into the buffer
+		 * size so we never expect to see a report split
+		 * between the beginning and end of the buffer.
+		 *
+		 * Given the initial alignment check a misalignment
+		 * here would imply a driver bug that would result
+		 * in an overrun.
+		 */
+		if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
+			DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
+			break;
+		}
+
+		/* The reason field includes flags identifying what
+		 * triggered this specific report (mostly timer
+		 * triggered or e.g. due to a context switch).
+		 *
+		 * This field is never expected to be zero so we can
+		 * check that the report isn't invalid before copying
+		 * it to userspace...
+		 */
+		reason = ((report32[0] >> OAREPORT_REASON_SHIFT) &
+			  OAREPORT_REASON_MASK);
+		if (reason == 0) {
+			if (printk_ratelimit())
+				DRM_NOTE("Skipping spurious, invalid OA report\n");
+			continue;
+		}
+
+		/* XXX: Just keep the lower 21 bits for now since I'm not
+		 * entirely sure if the HW touches any of the higher bits in
+		 * this field
+		 */
+		ctx_id = report32[2] & 0x1fffff;
+
+		/* Squash whatever is in the CTX_ID field if it's
+		 * marked as invalid to be sure we avoid
+		 * false-positive, single-context filtering below...
+		 */
+		if (!(report32[0] & dev_priv->perf.oa.gen8_valid_ctx_bit))
+			ctx_id = report32[2] = INVALID_CTX_ID;
+
+		/* NB: For Gen 8 the OA unit no longer supports clock gating
+		 * off for a specific context and the kernel can't securely
+		 * stop the counters from updating as system-wide / global
+		 * values.
+		 *
+		 * Automatic reports now include a context ID so reports can be
+		 * filtered on the cpu but it's not worth trying to
+		 * automatically subtract/hide counter progress for other
+		 * contexts while filtering since we can't stop userspace
+		 * issuing MI_REPORT_PERF_COUNT commands which would still
+		 * provide a side-band view of the real values.
+		 *
+		 * To allow userspace (such as Mesa/GL_INTEL_performance_query)
+		 * to normalize counters for a single filtered context then it
+		 * needs be forwarded bookend context-switch reports so that it
+		 * can track switches in between MI_REPORT_PERF_COUNT commands
+		 * and can itself subtract/ignore the progress of counters
+		 * associated with other contexts. Note that the hardware
+		 * automatically triggers reports when switching to a new
+		 * context which are tagged with the ID of the newly active
+		 * context. To avoid the complexity (and likely fragility) of
+		 * reading ahead while parsing reports to try and minimize
+		 * forwarding redundant context switch reports (i.e. between
+		 * other, unrelated contexts) we simply elect to forward them
+		 * all.
+		 *
+		 * We don't rely solely on the reason field to identify context
+		 * switches since it's not-uncommon for periodic samples to
+		 * identify a switch before any 'context switch' report.
+		 */
+		if (!dev_priv->perf.oa.exclusive_stream->ctx ||
+		    dev_priv->perf.oa.specific_ctx_id == ctx_id ||
+		    (dev_priv->perf.oa.oa_buffer.last_ctx_id ==
+		     dev_priv->perf.oa.specific_ctx_id) ||
+		    reason & OAREPORT_REASON_CTX_SWITCH) {
+
+			/* While filtering for a single context we avoid
+			 * leaking the IDs of other contexts.
+			 */
+			if (dev_priv->perf.oa.exclusive_stream->ctx &&
+			    dev_priv->perf.oa.specific_ctx_id != ctx_id) {
+				report32[2] = INVALID_CTX_ID;
+			}
+
+			ret = append_oa_sample(stream, buf, count, offset,
+					       report);
+			if (ret)
+				break;
+
+			dev_priv->perf.oa.oa_buffer.last_ctx_id = ctx_id;
+		}
+
+		/* The above reason field sanity check is based on
+		 * the assumption that the OA buffer is initially
+		 * zeroed and we reset the field after copying so the
+		 * check is still meaningful once old reports start
+		 * being overwritten.
+		 */
+		report32[0] = 0;
+	}
+
+	if (start_offset != *offset) {
+		spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
+
+		/* We removed the gtt_offset for the copy loop above, indexing
+		 * relative to oa_buf_base so put back here...
+		 */
+		head += gtt_offset;
+
+		I915_WRITE(GEN8_OAHEADPTR, head & GEN8_OAHEADPTR_MASK);
+		dev_priv->perf.oa.oa_buffer.head = head;
+
+		spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
+	}
+
+	return ret;
+}
+
+/**
+ * gen8_oa_read - copy status records then buffered OA reports
+ * @stream: An i915-perf stream opened for OA metrics
+ * @buf: destination buffer given by userspace
+ * @count: the number of bytes userspace wants to read
+ * @offset: (inout): the current position for writing into @buf
+ *
+ * Checks OA unit status registers and if necessary appends corresponding
+ * status records for userspace (such as for a buffer full condition) and then
+ * initiate appending any buffered OA reports.
+ *
+ * Updates @offset according to the number of bytes successfully copied into
+ * the userspace buffer.
+ *
+ * NB: some data may be successfully copied to the userspace buffer
+ * even if an error is returned, and this is reflected in the
+ * updated @read_state.
+ *
+ * Returns: zero on success or a negative error code
+ */
+static int gen8_oa_read(struct i915_perf_stream *stream,
+			char __user *buf,
+			size_t count,
+			size_t *offset)
+{
+	struct drm_i915_private *dev_priv = stream->dev_priv;
+	u32 oastatus;
+	int ret;
+
+	if (WARN_ON(!dev_priv->perf.oa.oa_buffer.vaddr))
+		return -EIO;
+
+	oastatus = I915_READ(GEN8_OASTATUS);
+
+	/* We treat OABUFFER_OVERFLOW as a significant error:
+	 *
+	 * Although theoretically we could handle this more gracefully
+	 * sometimes, some Gens don't correctly suppress certain
+	 * automatically triggered reports in this condition and so we
+	 * have to assume that old reports are now being trampled
+	 * over.
+	 *
+	 * Considering how we don't currently give userspace control
+	 * over the OA buffer size and always configure a large 16MB
+	 * buffer, then a buffer overflow does anyway likely indicate
+	 * that something has gone quite badly wrong.
+	 */
+	if (oastatus & GEN8_OASTATUS_OABUFFER_OVERFLOW) {
+		ret = append_oa_status(stream, buf, count, offset,
+				       DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
+		if (ret)
+			return ret;
+
+		DRM_DEBUG("OA buffer overflow (exponent = %d): force restart\n",
+			  dev_priv->perf.oa.period_exponent);
+
+		dev_priv->perf.oa.ops.oa_disable(dev_priv);
+		dev_priv->perf.oa.ops.oa_enable(dev_priv);
+
+		/* Note: .oa_enable() is expected to re-init the oabuffer
+		 * and reset GEN8_OASTATUS for us */
+		oastatus = I915_READ(GEN8_OASTATUS);
+	}
+
+	if (oastatus & GEN8_OASTATUS_REPORT_LOST) {
+		ret = append_oa_status(stream, buf, count, offset,
+				       DRM_I915_PERF_RECORD_OA_REPORT_LOST);
+		if (ret == 0) {
+			I915_WRITE(GEN8_OASTATUS,
+				   oastatus & ~GEN8_OASTATUS_REPORT_LOST);
+		}
+	}
+
+	return gen8_append_oa_reports(stream, buf, count, offset);
 }
 
 /**
@@ -733,7 +1146,8 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
 		if (ret)
 			return ret;
 
-		DRM_DEBUG("OA buffer overflow: force restart\n");
+		DRM_DEBUG("OA buffer overflow (exponent = %d): force restart\n",
+			  dev_priv->perf.oa.period_exponent);
 
 		dev_priv->perf.oa.ops.oa_disable(dev_priv);
 		dev_priv->perf.oa.ops.oa_enable(dev_priv);
@@ -833,33 +1247,37 @@ static int i915_oa_read(struct i915_perf_stream *stream,
 static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
-	struct intel_engine_cs *engine = dev_priv->engine[RCS];
-	int ret;
 
-	ret = i915_mutex_lock_interruptible(&dev_priv->drm);
-	if (ret)
-		return ret;
+	if (i915.enable_execlists)
+		dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
+	else {
+		struct intel_engine_cs *engine = dev_priv->engine[RCS];
+		int ret;
 
-	/* As the ID is the gtt offset of the context's vma we pin
-	 * the vma to ensure the ID remains fixed.
-	 *
-	 * NB: implied RCS engine...
-	 */
-	ret = engine->context_pin(engine, stream->ctx);
-	if (ret)
-		goto unlock;
+		ret = i915_mutex_lock_interruptible(&dev_priv->drm);
+		if (ret)
+			return ret;
 
-	/* Explicitly track the ID (instead of calling i915_ggtt_offset()
-	 * on the fly) considering the difference with gen8+ and
-	 * execlists
-	 */
-	dev_priv->perf.oa.specific_ctx_id =
-		i915_ggtt_offset(stream->ctx->engine[engine->id].state);
+		/* As the ID is the gtt offset of the context's vma we pin
+		 * the vma to ensure the ID remains fixed.
+		 */
+		ret = engine->context_pin(engine, stream->ctx);
+		if (ret) {
+			mutex_unlock(&dev_priv->drm.struct_mutex);
+			return ret;
+		}
 
-unlock:
-	mutex_unlock(&dev_priv->drm.struct_mutex);
+		/* Explicitly track the ID (instead of calling
+		 * i915_ggtt_offset() on the fly) considering the difference
+		 * with gen8+ and execlists
+		 */
+		dev_priv->perf.oa.specific_ctx_id =
+			i915_ggtt_offset(stream->ctx->engine[engine->id].state);
 
-	return ret;
+		mutex_unlock(&dev_priv->drm.struct_mutex);
+	}
+
+	return 0;
 }
 
 /**
@@ -874,12 +1292,16 @@ static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	struct intel_engine_cs *engine = dev_priv->engine[RCS];
 
-	mutex_lock(&dev_priv->drm.struct_mutex);
+	if (i915.enable_execlists) {
+		dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
+	} else {
+		mutex_lock(&dev_priv->drm.struct_mutex);
 
-	dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
-	engine->context_unpin(engine, stream->ctx);
+		dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
+		engine->context_unpin(engine, stream->ctx);
 
-	mutex_unlock(&dev_priv->drm.struct_mutex);
+		mutex_unlock(&dev_priv->drm.struct_mutex);
+	}
 }
 
 static void
@@ -964,6 +1386,56 @@ static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
 	dev_priv->perf.oa.pollin = false;
 }
 
+static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
+{
+	u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
+
+	I915_WRITE(GEN8_OASTATUS, 0);
+	I915_WRITE(GEN8_OAHEADPTR, gtt_offset);
+	dev_priv->perf.oa.oa_buffer.head = gtt_offset;
+
+	I915_WRITE(GEN8_OABUFFER_UDW, 0);
+
+	/* PRM says:
+	 *
+	 *  "This MMIO must be set before the OATAILPTR
+	 *  register and after the OAHEADPTR register. This is
+	 *  to enable proper functionality of the overflow
+	 *  bit."
+	 */
+	I915_WRITE(GEN8_OABUFFER, gtt_offset |
+		   OABUFFER_SIZE_16M | OA_MEM_SELECT_GGTT);
+	I915_WRITE(GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);
+
+	/* Mark that we need updated tail pointers to read from... */
+	dev_priv->perf.oa.oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
+	dev_priv->perf.oa.oa_buffer.tails[1].offset = INVALID_TAIL_PTR;
+
+	spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
+
+
+	/* NB: although the OA buffer will initially be allocated
+	 * zeroed via shmfs (and so this memset is redundant when
+	 * first allocating), we may re-init the OA buffer, either
+	 * when re-enabling a stream or in error/reset paths.
+	 *
+	 * The reason we clear the buffer for each re-init is for the
+	 * sanity check in gen8_append_oa_reports() that looks at the
+	 * reason field to make sure it's non-zero which relies on
+	 * the assumption that new reports are being written to zeroed
+	 * memory...
+	 */
+	memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
+
+	/* Maybe make ->pollin per-stream state if we support multiple
+	 * concurrent streams in the future.
+	 */
+	dev_priv->perf.oa.pollin = false;
+}
+
 static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
 {
 	struct drm_i915_gem_object *bo;
@@ -1108,6 +1580,260 @@ static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
 				      ~GT_NOA_ENABLE));
 }
 
+/* Manages updating the per-context aspects of the OA stream
+ * configuration across all contexts.
+ *
+ * The awkward consideration here is that OACTXCONTROL controls the
+ * exponent for periodic sampling which is primarily used for system
+ * wide profiling where we'd like a consistent sampling period even in
+ * the face of context switches.
+ *
+ * Our approach of updating the register state context (as opposed to
+ * say using a workaround batch buffer) ensures that the hardware
+ * won't automatically reload an out-of-date timer exponent even
+ * transiently before a WA BB could be parsed.
+ */
+static int configure_all_contexts(struct drm_i915_private *dev_priv)
+{
+	struct i915_gem_context *ctx;
+	int ret;
+
+	ret = i915_mutex_lock_interruptible(&dev_priv->drm);
+	if (ret)
+		return ret;
+
+	list_for_each_entry(ctx, &dev_priv->context_list, link) {
+		int i;
+
+		for (i = 0; i < I915_NUM_ENGINES; i++) {
+			/* The actual update of the register state context
+			 * will happen the next time this logical ring
+			 * is submitted. (See i915_oa_update_reg_state()
+			 * which hooks into execlists_update_context())
+			 */
+			atomic_set(&ctx->engine[i].oa_state_dirty, true);
+		}
+	}
+
+	mutex_unlock(&dev_priv->drm.struct_mutex);
+
+	/* Now update the current context.
+	 *
+	 * Note: Using MMIO to update per-context registers requires
+	 * some extra care...
+	 */
+	ret = intel_uncore_begin_ctx_mmio(dev_priv);
+	if (ret) {
+		DRM_ERROR("Failed to bring RCS out of idle to update current ctx OA state");
+		return ret;
+	}
+
+	I915_WRITE(GEN8_OACTXCONTROL, ((dev_priv->perf.oa.period_exponent <<
+					GEN8_OA_TIMER_PERIOD_SHIFT) |
+				      (dev_priv->perf.oa.periodic ?
+				       GEN8_OA_TIMER_ENABLE : 0) |
+				      GEN8_OA_COUNTER_RESUME));
+
+	config_oa_regs(dev_priv, dev_priv->perf.oa.flex_regs,
+			dev_priv->perf.oa.flex_regs_len);
+
+	intel_uncore_end_ctx_mmio(dev_priv);
+
+	return 0;
+}
+
+static int bdw_enable_metric_set(struct drm_i915_private *dev_priv)
+{
+	int ret = i915_oa_select_metric_set_bdw(dev_priv);
+
+	if (ret)
+		return ret;
+
+	I915_WRITE(GDT_CHICKEN_BITS, 0xA0);
+	config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs,
+		       dev_priv->perf.oa.mux_regs_len);
+	I915_WRITE(GDT_CHICKEN_BITS, 0x80);
+
+	/* It takes a fairly long time for a new MUX configuration to
+	 * be be applied after these register writes. This delay
+	 * duration is take from Haswell (derived empirically based on
+	 * the render_basic config) but hopefully it covers the
+	 * maximum configuration latency on BDW too...
+	 */
+	usleep_range(15000, 20000);
+
+	config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs,
+		       dev_priv->perf.oa.b_counter_regs_len);
+
+	configure_all_contexts(dev_priv);
+
+	return 0;
+}
+
+static void bdw_disable_metric_set(struct drm_i915_private *dev_priv)
+{
+	/* NOP */
+}
+
+static int chv_enable_metric_set(struct drm_i915_private *dev_priv)
+{
+	int ret = i915_oa_select_metric_set_chv(dev_priv);
+
+	if (ret)
+		return ret;
+
+	I915_WRITE(GDT_CHICKEN_BITS, 0xA0);
+	config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs,
+		       dev_priv->perf.oa.mux_regs_len);
+	I915_WRITE(GDT_CHICKEN_BITS, 0x80);
+
+	/* It takes a fairly long time for a new MUX configuration to
+	 * be be applied after these register writes. This delay
+	 * duration is take from Haswell (derived empirically based on
+	 * the render_basic config) but hopefully it covers the
+	 * maximum configuration latency on CHV too...
+	 */
+	usleep_range(15000, 20000);
+
+	config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs,
+		       dev_priv->perf.oa.b_counter_regs_len);
+
+	configure_all_contexts(dev_priv);
+
+	return 0;
+}
+
+static void chv_disable_metric_set(struct drm_i915_private *dev_priv)
+{
+	/* NOP */
+}
+
+static int skl_enable_metric_set(struct drm_i915_private *dev_priv)
+{
+	if (IS_SKL_GT2(dev_priv)) {
+		int ret = i915_oa_select_metric_set_sklgt2(dev_priv);
+		if (ret)
+			return ret;
+	} else if (IS_SKL_GT3(dev_priv)) {
+		int ret = i915_oa_select_metric_set_sklgt3(dev_priv);
+		if (ret)
+			return ret;
+	} else if (IS_SKL_GT4(dev_priv)) {
+		int ret = i915_oa_select_metric_set_sklgt4(dev_priv);
+		if (ret)
+			return ret;
+	} else
+		return -ENOTSUPP;
+
+	/* We disable slice/unslice clock ratio change reports on SKL since
+	 * they are too noisy. The HW generates a lot redundant reports where
+	 * the ratio hasn't really changed causing a lot of redundant work to
+	 * processes and increasing the chances we'll hit buffer overruns.
+	 *
+	 * Although we don't currently use the 'disable overrun' OABUFFER
+	 * feature it's worth noting that clock ratio reports have to be
+	 * disabled before considering to use that feature since the HW doesn't
+	 * correctly block these reports.
+	 *
+	 * Currently none of the high-level metrics we have depend on knowing
+	 * this ratio to normalize.
+	 *
+	 * Note: This register is not power context saved and restored, but
+	 * that's OK considering that we disable RC6 while the OA unit is
+	 * enabled.
+	 *
+	 * The _INCLUDE_CLK_RATIO bit allows the slice/unslice frequency to
+	 * be read back from automatically triggered reports, as part of the
+	 * RPT_ID field.
+	 */
+	I915_WRITE(GEN8_OA_DEBUG,
+		   _MASKED_BIT_ENABLE(GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
+				      GEN9_OA_DEBUG_INCLUDE_CLK_RATIO));
+
+	I915_WRITE(GDT_CHICKEN_BITS, 0xA0);
+	config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs,
+		       dev_priv->perf.oa.mux_regs_len);
+	I915_WRITE(GDT_CHICKEN_BITS, 0x80);
+
+	/* It takes a fairly long time for a new MUX configuration to
+	 * be be applied after these register writes. This delay
+	 * duration is take from Haswell (derived empirically based on
+	 * the render_basic config) but hopefully it covers the
+	 * maximum configuration latency on CHV too...
+	 */
+	usleep_range(15000, 20000);
+
+	config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs,
+		       dev_priv->perf.oa.b_counter_regs_len);
+
+	configure_all_contexts(dev_priv);
+
+	return 0;
+}
+
+static void skl_disable_metric_set(struct drm_i915_private *dev_priv)
+{
+	/* NOP */
+}
+
+static int bxt_enable_metric_set(struct drm_i915_private *dev_priv)
+{
+	int ret = i915_oa_select_metric_set_bxt(dev_priv);
+
+	if (ret)
+		return ret;
+
+	/* We disable slice/unslice clock ratio change reports on SKL since
+	 * they are too noisy. The HW generates a lot redundant reports where
+	 * the ratio hasn't really changed causing a lot of redundant work to
+	 * processes and increasing the chances we'll hit buffer overruns.
+	 *
+	 * Although we don't currently use the 'disable overrun' OABUFFER
+	 * feature it's worth noting that clock ratio reports have to be
+	 * disabled before considering to use that feature since the HW doesn't
+	 * correctly block these reports.
+	 *
+	 * Currently none of the high-level metrics we have depend on knowing
+	 * this ratio to normalize.
+	 *
+	 * Note: This register is not power context saved and restored, but
+	 * that's OK considering that we disable RC6 while the OA unit is
+	 * enabled.
+	 *
+	 * The _INCLUDE_CLK_RATIO bit allows the slice/unslice frequency to
+	 * be read back from automatically triggered reports, as part of the
+	 * RPT_ID field.
+	 */
+	I915_WRITE(GEN8_OA_DEBUG,
+		   _MASKED_BIT_ENABLE(GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
+				      GEN9_OA_DEBUG_INCLUDE_CLK_RATIO));
+
+	I915_WRITE(GDT_CHICKEN_BITS, 0xA0);
+	config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs,
+		       dev_priv->perf.oa.mux_regs_len);
+	I915_WRITE(GDT_CHICKEN_BITS, 0x80);
+
+	/* It takes a fairly long time for a new MUX configuration to
+	 * be be applied after these register writes. This delay
+	 * duration is take from Haswell (derived empirically based on
+	 * the render_basic config) but hopefully it covers the
+	 * maximum configuration latency on CHV too...
+	 */
+	usleep_range(15000, 20000);
+
+	config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs,
+		       dev_priv->perf.oa.b_counter_regs_len);
+
+	configure_all_contexts(dev_priv);
+
+	return 0;
+}
+
+static void bxt_disable_metric_set(struct drm_i915_private *dev_priv)
+{
+	/* NOP */
+}
+
 static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv)
 {
 	assert_spin_locked(&dev_priv->perf.hook_lock);
@@ -1152,6 +1878,29 @@ static void gen7_oa_enable(struct drm_i915_private *dev_priv)
 	spin_unlock_irqrestore(&dev_priv->perf.hook_lock, flags);
 }
 
+static void gen8_oa_enable(struct drm_i915_private *dev_priv)
+{
+	u32 report_format = dev_priv->perf.oa.oa_buffer.format;
+
+	/* Reset buf pointers so we don't forward reports from before now.
+	 *
+	 * Think carefully if considering trying to avoid this, since it
+	 * also ensures status flags and the buffer itself are cleared
+	 * in error paths, and we have checks for invalid reports based
+	 * on the assumption that certain fields are written to zeroed
+	 * memory which this helps maintains.
+	 */
+	gen8_init_oa_buffer(dev_priv);
+
+	/* Note: we don't rely on the hardware to perform single context
+	 * filtering and instead filter on the cpu based on the context-id
+	 * field of reports
+	 */
+	I915_WRITE(GEN8_OACONTROL, (report_format <<
+				    GEN8_OA_REPORT_FORMAT_SHIFT) |
+				   GEN8_OA_COUNTER_ENABLE);
+}
+
 /**
  * i915_oa_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for OA stream
  * @stream: An i915 perf stream opened for OA metrics
@@ -1178,6 +1927,11 @@ static void gen7_oa_disable(struct drm_i915_private *dev_priv)
 	I915_WRITE(GEN7_OACONTROL, 0);
 }
 
+static void gen8_oa_disable(struct drm_i915_private *dev_priv)
+{
+	I915_WRITE(GEN8_OACONTROL, 0);
+}
+
 /**
  * i915_oa_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for OA stream
  * @stream: An i915 perf stream opened for OA metrics
@@ -1336,6 +2090,74 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	return ret;
 }
 
+static void gen8_update_reg_state_unlocked(struct intel_engine_cs *engine,
+					   struct i915_gem_context *ctx,
+					   uint32_t *reg_state)
+{
+	struct drm_i915_private *dev_priv = ctx->i915;
+	const struct i915_oa_reg *flex_regs = dev_priv->perf.oa.flex_regs;
+	int n_flex_regs = dev_priv->perf.oa.flex_regs_len;
+	int ctx_oactxctrl = dev_priv->perf.oa.ctx_oactxctrl_off;
+	int ctx_flexeu0 = dev_priv->perf.oa.ctx_flexeu0_off;
+	int i;
+
+	if (!atomic_read(&ctx->engine[engine->id].oa_state_dirty))
+		return;
+
+	reg_state[ctx_oactxctrl] = i915_mmio_reg_offset(GEN8_OACTXCONTROL);
+	reg_state[ctx_oactxctrl+1] = (dev_priv->perf.oa.period_exponent <<
+				      GEN8_OA_TIMER_PERIOD_SHIFT) |
+				     (dev_priv->perf.oa.periodic ?
+				      GEN8_OA_TIMER_ENABLE : 0) |
+				     GEN8_OA_COUNTER_RESUME;
+
+	for (i = 0; i < n_flex_regs; i++) {
+		uint32_t offset = i915_mmio_reg_offset(flex_regs[i].addr);
+
+		/* Map from mmio address to register state context
+		 * offset...
+		 */
+
+		offset -= i915_mmio_reg_offset(EU_PERF_CNTL0);
+
+		/* Flex EU mmio registers are separated by 256 bytes */
+		offset /= 256;
+
+		/* This offset is in dwords not bytes */
+		offset *= 2;
+
+		/* EU_PERF_CNTL0 offset in register state context... */
+		offset += ctx_flexeu0;
+
+		reg_state[offset] = i915_mmio_reg_offset(flex_regs[i].addr);
+		reg_state[offset+1] = flex_regs[i].value;
+	}
+
+	atomic_set(&ctx->engine[engine->id].oa_state_dirty, false);
+}
+
+void i915_oa_update_reg_state(struct intel_engine_cs *engine,
+			      struct i915_gem_context *ctx,
+			      uint32_t *reg_state)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+
+	if (!dev_priv->perf.initialized)
+		return;
+
+	/* XXX: We don't take a lock here and this may run async with
+	 * respect to stream methods. Notably we don't want to block
+	 * context switches by long i915 perf read() operations.
+	 *
+	 * It's expect to always be safe to read the dev_priv->perf
+	 * state needed here, and expected to be benign to redundantly
+	 * update the state if the OA unit has been disabled since
+	 * oa_state_dirty was last set.
+	 */
+
+	gen8_update_reg_state_unlocked(engine, ctx, reg_state);
+}
+
 /**
  * i915_perf_read_locked - &i915_perf_stream_ops->read with error normalisation
  * @stream: An i915 perf stream
@@ -1750,6 +2572,7 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
 	struct i915_gem_context *specific_ctx = NULL;
 	struct i915_perf_stream *stream = NULL;
 	unsigned long f_flags = 0;
+	bool privileged_op = true;
 	int stream_fd;
 	int ret;
 
@@ -1767,12 +2590,28 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
 		}
 	}
 
+	/* On Haswell the OA unit supports clock gating off for a specific
+	 * context and in this mode there's no visibility of metrics for the
+	 * rest of the system, which we consider acceptable for a
+	 * non-privileged client.
+	 *
+	 * For Gen8+ the OA unit no longer supports clock gating off for a
+	 * specific context and the kernel can't securely stop the counters
+	 * from updating as system-wide / global values. Even though we can
+	 * filter reports based on the included context ID we can't block
+	 * clients from seeing the raw / global counter values via
+	 * MI_REPORT_PERF_COUNT commands and so consider it a privileged op to
+	 * enable the OA unit by default.
+	 */
+	if (IS_HASWELL(dev_priv) && specific_ctx)
+		privileged_op = false;
+
 	/* Similar to perf's kernel.perf_paranoid_cpu sysctl option
 	 * we check a dev.i915.perf_stream_paranoid sysctl option
 	 * to determine if it's ok to access system wide OA counters
 	 * without CAP_SYS_ADMIN privileges.
 	 */
-	if (!specific_ctx &&
+	if (privileged_op &&
 	    i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
 		DRM_DEBUG("Insufficient privileges to open system-wide i915 perf stream\n");
 		ret = -EACCES;
@@ -2039,9 +2878,6 @@ int i915_perf_open_ioctl(struct drm_device *dev, void *data,
  */
 void i915_perf_register(struct drm_i915_private *dev_priv)
 {
-	if (!IS_HASWELL(dev_priv))
-		return;
-
 	if (!dev_priv->perf.initialized)
 		return;
 
@@ -2057,11 +2893,38 @@ void i915_perf_register(struct drm_i915_private *dev_priv)
 	if (!dev_priv->perf.metrics_kobj)
 		goto exit;
 
-	if (i915_perf_register_sysfs_hsw(dev_priv)) {
-		kobject_put(dev_priv->perf.metrics_kobj);
-		dev_priv->perf.metrics_kobj = NULL;
+	if (IS_HASWELL(dev_priv)) {
+		if (i915_perf_register_sysfs_hsw(dev_priv))
+			goto sysfs_error;
+	} else if (IS_BROADWELL(dev_priv)) {
+		if (i915_perf_register_sysfs_bdw(dev_priv))
+			goto sysfs_error;
+	} else if (IS_CHERRYVIEW(dev_priv)) {
+		if (i915_perf_register_sysfs_chv(dev_priv))
+			goto sysfs_error;
+	} else if (IS_SKYLAKE(dev_priv)) {
+		if (IS_SKL_GT2(dev_priv)) {
+			if (i915_perf_register_sysfs_sklgt2(dev_priv))
+				goto sysfs_error;
+		} else if (IS_SKL_GT3(dev_priv)) {
+			if (i915_perf_register_sysfs_sklgt3(dev_priv))
+				goto sysfs_error;
+		} else if (IS_SKL_GT4(dev_priv)) {
+			if (i915_perf_register_sysfs_sklgt4(dev_priv))
+				goto sysfs_error;
+		} else
+			goto sysfs_error;
+	} else if (IS_BROXTON(dev_priv)) {
+		if (i915_perf_register_sysfs_bxt(dev_priv))
+			goto sysfs_error;
 	}
 
+	goto exit;
+
+sysfs_error:
+	kobject_put(dev_priv->perf.metrics_kobj);
+	dev_priv->perf.metrics_kobj = NULL;
+
 exit:
 	mutex_unlock(&dev_priv->perf.lock);
 }
@@ -2077,13 +2940,24 @@ void i915_perf_register(struct drm_i915_private *dev_priv)
  */
 void i915_perf_unregister(struct drm_i915_private *dev_priv)
 {
-	if (!IS_HASWELL(dev_priv))
-		return;
-
 	if (!dev_priv->perf.metrics_kobj)
 		return;
 
-	i915_perf_unregister_sysfs_hsw(dev_priv);
+        if (IS_HASWELL(dev_priv))
+                i915_perf_unregister_sysfs_hsw(dev_priv);
+        else if (IS_BROADWELL(dev_priv))
+                i915_perf_unregister_sysfs_bdw(dev_priv);
+        else if (IS_CHERRYVIEW(dev_priv))
+                i915_perf_unregister_sysfs_chv(dev_priv);
+        else if (IS_SKYLAKE(dev_priv)) {
+		if (IS_SKL_GT2(dev_priv))
+			i915_perf_unregister_sysfs_sklgt2(dev_priv);
+		else if (IS_SKL_GT3(dev_priv))
+			i915_perf_unregister_sysfs_sklgt3(dev_priv);
+		else if (IS_SKL_GT4(dev_priv))
+			i915_perf_unregister_sysfs_sklgt4(dev_priv);
+	} else if (IS_BROXTON(dev_priv))
+                i915_perf_unregister_sysfs_bxt(dev_priv);
 
 	kobject_put(dev_priv->perf.metrics_kobj);
 	dev_priv->perf.metrics_kobj = NULL;
@@ -2142,36 +3016,102 @@ static struct ctl_table dev_root[] = {
  */
 void i915_perf_init(struct drm_i915_private *dev_priv)
 {
-	if (!IS_HASWELL(dev_priv))
-		return;
-
-	hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
-		     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	dev_priv->perf.oa.poll_check_timer.function = oa_poll_check_timer_cb;
-	init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
+	dev_priv->perf.oa.n_builtin_sets = 0;
+
+	if (IS_HASWELL(dev_priv)) {
+		dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
+		dev_priv->perf.oa.ops.enable_metric_set = hsw_enable_metric_set;
+		dev_priv->perf.oa.ops.disable_metric_set = hsw_disable_metric_set;
+		dev_priv->perf.oa.ops.oa_enable = gen7_oa_enable;
+		dev_priv->perf.oa.ops.oa_disable = gen7_oa_disable;
+		dev_priv->perf.oa.ops.read = gen7_oa_read;
+		dev_priv->perf.oa.ops.oa_buffer_check =
+			gen7_oa_buffer_check_unlocked;
+
+		dev_priv->perf.oa.oa_formats = hsw_oa_formats;
+
+		dev_priv->perf.oa.n_builtin_sets =
+			i915_oa_n_builtin_metric_sets_hsw;
+	} else if (i915.enable_execlists) {
+		if (IS_BROADWELL(dev_priv)) {
+			dev_priv->perf.oa.ops.enable_metric_set =
+				bdw_enable_metric_set;
+			dev_priv->perf.oa.ops.disable_metric_set =
+				bdw_disable_metric_set;
+			dev_priv->perf.oa.ctx_oactxctrl_off = 0x120;
+			dev_priv->perf.oa.ctx_flexeu0_off = 0x2ce;
+			dev_priv->perf.oa.gen8_valid_ctx_bit = (1<<25);
+			dev_priv->perf.oa.n_builtin_sets =
+				i915_oa_n_builtin_metric_sets_bdw;
+		} else if (IS_CHERRYVIEW(dev_priv)) {
+			dev_priv->perf.oa.ops.enable_metric_set =
+				chv_enable_metric_set;
+			dev_priv->perf.oa.ops.disable_metric_set =
+				chv_disable_metric_set;
+			dev_priv->perf.oa.ctx_oactxctrl_off = 0x120;
+			dev_priv->perf.oa.ctx_flexeu0_off = 0x2ce;
+			dev_priv->perf.oa.gen8_valid_ctx_bit = (1<<25);
+			dev_priv->perf.oa.n_builtin_sets =
+				i915_oa_n_builtin_metric_sets_chv;
+		} else if (IS_SKYLAKE(dev_priv)) {
+			dev_priv->perf.oa.ops.enable_metric_set =
+				skl_enable_metric_set;
+			dev_priv->perf.oa.ops.disable_metric_set =
+				skl_disable_metric_set;
+			dev_priv->perf.oa.ctx_oactxctrl_off = 0x128;
+			dev_priv->perf.oa.ctx_flexeu0_off = 0x3de;
+			dev_priv->perf.oa.gen8_valid_ctx_bit = (1<<16);
+
+
+			if (IS_SKL_GT2(dev_priv)) {
+				dev_priv->perf.oa.n_builtin_sets =
+					i915_oa_n_builtin_metric_sets_sklgt2;
+			} else if (IS_SKL_GT3(dev_priv)) {
+				dev_priv->perf.oa.n_builtin_sets =
+					i915_oa_n_builtin_metric_sets_sklgt3;
+			} else if (IS_SKL_GT4(dev_priv)) {
+				dev_priv->perf.oa.n_builtin_sets =
+					i915_oa_n_builtin_metric_sets_sklgt4;
+			}
+		} else if (IS_BROXTON(dev_priv)) {
+			dev_priv->perf.oa.ops.enable_metric_set =
+				bxt_enable_metric_set;
+			dev_priv->perf.oa.ops.disable_metric_set =
+				bxt_disable_metric_set;
+			dev_priv->perf.oa.ctx_oactxctrl_off = 0x128;
+			dev_priv->perf.oa.ctx_flexeu0_off = 0x3de;
+			dev_priv->perf.oa.gen8_valid_ctx_bit = (1<<16);
+			dev_priv->perf.oa.n_builtin_sets =
+				i915_oa_n_builtin_metric_sets_bxt;
+		}
 
-	INIT_LIST_HEAD(&dev_priv->perf.streams);
-	mutex_init(&dev_priv->perf.lock);
-	spin_lock_init(&dev_priv->perf.hook_lock);
-	spin_lock_init(&dev_priv->perf.oa.oa_buffer.ptr_lock);
+		if (dev_priv->perf.oa.n_builtin_sets) {
+			dev_priv->perf.oa.ops.init_oa_buffer = gen8_init_oa_buffer;
+			dev_priv->perf.oa.ops.oa_enable = gen8_oa_enable;
+			dev_priv->perf.oa.ops.oa_disable = gen8_oa_disable;
+			dev_priv->perf.oa.ops.read = gen8_oa_read;
+			dev_priv->perf.oa.ops.oa_buffer_check =
+				gen8_oa_buffer_check_unlocked;
 
-	dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
-	dev_priv->perf.oa.ops.enable_metric_set = hsw_enable_metric_set;
-	dev_priv->perf.oa.ops.disable_metric_set = hsw_disable_metric_set;
-	dev_priv->perf.oa.ops.oa_enable = gen7_oa_enable;
-	dev_priv->perf.oa.ops.oa_disable = gen7_oa_disable;
-	dev_priv->perf.oa.ops.read = gen7_oa_read;
-	dev_priv->perf.oa.ops.oa_buffer_check =
-		gen7_oa_buffer_check_unlocked;
+			dev_priv->perf.oa.oa_formats = gen8_plus_oa_formats;
+		}
+	}
 
-	dev_priv->perf.oa.oa_formats = hsw_oa_formats;
+	if (dev_priv->perf.oa.n_builtin_sets) {
+		hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
+				CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		dev_priv->perf.oa.poll_check_timer.function = oa_poll_check_timer_cb;
+		init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
 
-	dev_priv->perf.oa.n_builtin_sets =
-		i915_oa_n_builtin_metric_sets_hsw;
+		INIT_LIST_HEAD(&dev_priv->perf.streams);
+		mutex_init(&dev_priv->perf.lock);
+		spin_lock_init(&dev_priv->perf.hook_lock);
+		spin_lock_init(&dev_priv->perf.oa.oa_buffer.ptr_lock);
 
-	dev_priv->perf.sysctl_header = register_sysctl_table(dev_root);
+		dev_priv->perf.sysctl_header = register_sysctl_table(dev_root);
 
-	dev_priv->perf.initialized = true;
+		dev_priv->perf.initialized = true;
+	}
 }
 
 /**
@@ -2186,5 +3126,6 @@ void i915_perf_fini(struct drm_i915_private *dev_priv)
 	unregister_sysctl_table(dev_priv->perf.sysctl_header);
 
 	memset(&dev_priv->perf.oa.ops, 0, sizeof(dev_priv->perf.oa.ops));
+
 	dev_priv->perf.initialized = false;
 }
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 94d40e82edc1..f60554410711 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -639,6 +639,12 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 
 #define GEN8_OACTXID _MMIO(0x2364)
 
+#define GEN8_OA_DEBUG _MMIO(0x2B04)
+#define  GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS    (1<<5)
+#define  GEN9_OA_DEBUG_INCLUDE_CLK_RATIO	    (1<<6)
+#define  GEN9_OA_DEBUG_DISABLE_GO_1_0_REPORTS	    (1<<2)
+#define  GEN9_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS   (1<<1)
+
 #define GEN8_OACONTROL _MMIO(0x2B00)
 #define  GEN8_OA_REPORT_FORMAT_A12	    (0<<2)
 #define  GEN8_OA_REPORT_FORMAT_A12_B8_C8    (2<<2)
@@ -660,6 +666,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define  GEN7_OABUFFER_STOP_RESUME_ENABLE   (1<<1)
 #define  GEN7_OABUFFER_RESUME		    (1<<0)
 
+#define GEN8_OABUFFER_UDW _MMIO(0x23b4)
 #define GEN8_OABUFFER _MMIO(0x2b14)
 
 #define GEN7_OASTATUS1 _MMIO(0x2364)
@@ -678,7 +685,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define  GEN8_OASTATUS_REPORT_LOST	    (1<<0)
 
 #define GEN8_OAHEADPTR _MMIO(0x2B0C)
+#define GEN8_OAHEADPTR_MASK    0xffffffc0
 #define GEN8_OATAILPTR _MMIO(0x2B10)
+#define GEN8_OATAILPTR_MASK    0xffffffc0
 
 #define OABUFFER_SIZE_128K  (0<<3)
 #define OABUFFER_SIZE_256K  (1<<3)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 39329d40da46..197e22c9cd39 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -336,6 +336,8 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq)
 	if (ppgtt && !i915_vm_is_48bit(&ppgtt->base))
 		execlists_update_context_pdps(ppgtt, reg_state);
 
+	i915_oa_update_reg_state(rq->engine, rq->ctx, reg_state);
+
 	return ce->lrc_desc;
 }
 
@@ -1898,6 +1900,8 @@ static void execlists_init_reg_state(u32 *regs,
 		CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
 			make_rpcs(dev_priv));
 	}
+
+	i915_oa_update_reg_state(engine, ctx, regs);
 }
 
 static int
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index e0599e729e68..03b833849919 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1295,13 +1295,18 @@ struct drm_i915_gem_context_param {
 };
 
 enum drm_i915_oa_format {
-	I915_OA_FORMAT_A13 = 1,
-	I915_OA_FORMAT_A29,
-	I915_OA_FORMAT_A13_B8_C8,
-	I915_OA_FORMAT_B4_C8,
-	I915_OA_FORMAT_A45_B8_C8,
-	I915_OA_FORMAT_B4_C8_A16,
-	I915_OA_FORMAT_C4_B8,
+	I915_OA_FORMAT_A13 = 1,	    /* HSW only */
+	I915_OA_FORMAT_A29,	    /* HSW only */
+	I915_OA_FORMAT_A13_B8_C8,   /* HSW only */
+	I915_OA_FORMAT_B4_C8,	    /* HSW only */
+	I915_OA_FORMAT_A45_B8_C8,   /* HSW only */
+	I915_OA_FORMAT_B4_C8_A16,   /* HSW only */
+	I915_OA_FORMAT_C4_B8,	    /* HSW+ */
+
+	/* Gen8+ */
+	I915_OA_FORMAT_A12,
+	I915_OA_FORMAT_A12_B8_C8,
+	I915_OA_FORMAT_A32u40_A4u32_B8_C8,
 
 	I915_OA_FORMAT_MAX	    /* non-ABI */
 };
-- 
2.11.1