[Intel-gfx] [PATCH v6 12/15] drm/i915/perf: Add OA unit support for Gen 8+

Lionel Landwerlin lionel.g.landwerlin at intel.com
Tue Apr 25 17:10:07 UTC 2017


On 25/04/17 09:42, Matthew Auld wrote:
> On 24 April 2017 at 19:49, Lionel Landwerlin
> <lionel.g.landwerlin at intel.com> wrote:
>> From: Robert Bragg <robert at sixbynine.org>
>>
>> Enables access to OA unit metrics for BDW, CHV, SKL and BXT which all
>> share (more-or-less) the same OA unit design.
>>
>> Of particular note in comparison to Haswell: some OA unit HW config
>> state has become per-context state and as a consequence it is somewhat
>> more complicated to manage synchronous state changes from the cpu while
>> there's no guarantee of what context (if any) is currently actively
>> running on the gpu.
>>
>> The periodic sampling frequency which can be particularly useful for
>> system-wide analysis (as opposed to command stream synchronised
>> MI_REPORT_PERF_COUNT commands) is perhaps the most surprising state to
>> have become per-context saved and restored (while the OABUFFER
>> destination is still a shared, system-wide resource).
>>
>> This support for gen8+ takes care to consider a number of timing
>> challenges involved in synchronously updating per-context state
>> primarily by programming all config state from the cpu and updating all
>> current and saved contexts synchronously while the OA unit is still
>> disabled.
>>
>> The driver intentionally avoids depending on command streamer
>> programming to update OA state considering the lack of synchronization
>> between the automatic loading of OACTXCONTROL state (that includes the
>> periodic sampling state and enable state) on context restore and the
>> parsing of any general purpose BB the driver can control. I.e. this
>> implementation is careful to avoid the possibility of a context restore
>> temporarily enabling any out-of-date periodic sampling state. In
>> addition to the risk of transiently-out-of-date state being loaded
>> automatically; there are also internal HW latencies involved in the
>> loading of MUX configurations which would be difficult to account for
>> from the command streamer (and we only want to enable the unit once
>> the MUX configuration is complete).
>>
>> Since the Gen8+ OA unit design no longer supports clock gating the unit
>> off for a single given context (which effectively stopped any progress
>> of counters while any other context was running) and instead supports
>> tagging OA reports with a context ID for filtering on the CPU, it means
>> we can no longer hide the system-wide progress of counters from a
>> non-privileged application only interested in metrics for its own
>> context. Although we could theoretically try and subtract the progress
>> of other contexts before forwarding reports via read() we aren't in a
>> position to filter reports captured via MI_REPORT_PERF_COUNT commands.
>> As a result, for Gen8+, we always require dev.i915.perf_stream_paranoid
>> to be unset (e.g. via sysctl dev.i915.perf_stream_paranoid=0) for any
>> non-root access to OA metrics.
>>
>> v5: Drain submitted requests when enabling metric set to ensure no
>>      lite-restore erases the context image we just updated (Lionel)
>>
>> v6: In addition to drain, switch to kernel context & update all
>>      contexts in place (Chris)
>>
>> Signed-off-by: Robert Bragg <robert at sixbynine.org>
>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
>> Reviewed-by: Matthew Auld <matthew.auld at intel.com> \o/
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h  |  45 +-
>>   drivers/gpu/drm/i915/i915_perf.c | 957 ++++++++++++++++++++++++++++++++++++---
>>   drivers/gpu/drm/i915/i915_reg.h  |  22 +
>>   drivers/gpu/drm/i915/intel_lrc.c |   2 +
>>   include/uapi/drm/i915_drm.h      |  19 +-
>>   5 files changed, 952 insertions(+), 93 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index ffa1fc5eddfd..676b1227067c 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -2064,9 +2064,17 @@ struct i915_oa_ops {
>>          void (*init_oa_buffer)(struct drm_i915_private *dev_priv);
>>
>>          /**
>> -        * @enable_metric_set: Applies any MUX configuration to set up the
>> -        * Boolean and Custom (B/C) counters that are part of the counter
>> -        * reports being sampled. May apply system constraints such as
>> +        * @select_metric_set: The auto-generated code that checks whether a
>> +        * requested OA config is applicable to the system and if so sets up
>> +        * the mux, oa and flex eu register config pointers according to the
>> +        * current dev_priv->perf.oa.metrics_set.
>> +        */
>> +       int (*select_metric_set)(struct drm_i915_private *dev_priv);
>> +
>> +       /**
>> +        * @enable_metric_set: Selects and applies any MUX configuration to set
>> +        * up the Boolean and Custom (B/C) counters that are part of the
>> +        * counter reports being sampled. May apply system constraints such as
>>           * disabling EU clock gating as required.
>>           */
>>          int (*enable_metric_set)(struct drm_i915_private *dev_priv);
>> @@ -2097,20 +2105,13 @@ struct i915_oa_ops {
>>                      size_t *offset);
>>
>>          /**
>> -        * @oa_buffer_check: Check for OA buffer data + update tail
>> -        *
>> -        * This is either called via fops or the poll check hrtimer (atomic
>> -        * ctx) without any locks taken.
>> +        * @oa_hw_tail_read: read the OA tail pointer register
>>           *
>> -        * It's safe to read OA config state here unlocked, assuming that this
>> -        * is only called while the stream is enabled, while the global OA
>> -        * configuration can't be modified.
>> -        *
>> -        * Efficiency is more important than avoiding some false positives
>> -        * here, which will be handled gracefully - likely resulting in an
>> -        * %EAGAIN error for userspace.
>> +        * In particular this enables us to share all the fiddly code for
>> +        * handling the OA unit tail pointer race that affects multiple
>> +        * generations.
>>           */
>> -       bool (*oa_buffer_check)(struct drm_i915_private *dev_priv);
>> +       u32 (*oa_hw_tail_read)(struct drm_i915_private *dev_priv);
>>   };
>>
>>   struct intel_cdclk_state {
>> @@ -2472,6 +2473,7 @@ struct drm_i915_private {
>>                          struct {
>>                                  struct i915_vma *vma;
>>                                  u8 *vaddr;
>> +                               u32 last_ctx_id;
>>                                  int format;
>>                                  int format_size;
>>
>> @@ -2541,6 +2543,14 @@ struct drm_i915_private {
>>                          } oa_buffer;
>>
>>                          u32 gen7_latched_oastatus1;
>> +                       u32 ctx_oactxctrl_offset;
>> +                       u32 ctx_flexeu0_offset;
>> +
>> +                       /* The RPT_ID/reason field for Gen8+ includes a bit
>> +                        * to determine if the CTX ID in the report is valid
>> +                        * but the specific bit differs between Gen 8 and 9
>> +                        */
>> +                       u32 gen8_valid_ctx_bit;
>>
>>                          struct i915_oa_ops ops;
>>                          const struct i915_oa_format *oa_formats;
>> @@ -2851,6 +2861,8 @@ intel_info(const struct drm_i915_private *dev_priv)
>>   #define IS_KBL_ULX(dev_priv)   (INTEL_DEVID(dev_priv) == 0x590E || \
>>                                   INTEL_DEVID(dev_priv) == 0x5915 || \
>>                                   INTEL_DEVID(dev_priv) == 0x591E)
>> +#define IS_SKL_GT2(dev_priv)   (IS_SKYLAKE(dev_priv) && \
>> +                                (INTEL_DEVID(dev_priv) & 0x00F0) == 0x0010)
>>   #define IS_SKL_GT3(dev_priv)   (IS_SKYLAKE(dev_priv) && \
>>                                   (INTEL_DEVID(dev_priv) & 0x00F0) == 0x0020)
>>   #define IS_SKL_GT4(dev_priv)   (IS_SKYLAKE(dev_priv) && \
>> @@ -3615,6 +3627,9 @@ i915_gem_context_lookup_timeline(struct i915_gem_context *ctx,
>>
>>   int i915_perf_open_ioctl(struct drm_device *dev, void *data,
>>                           struct drm_file *file);
>> +void i915_oa_init_reg_state(struct intel_engine_cs *engine,
>> +                           struct i915_gem_context *ctx,
>> +                           uint32_t *reg_state);
>>
>>   /* i915_gem_evict.c */
>>   int __must_check i915_gem_evict_something(struct i915_address_space *vm,
>> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>> index 3277a52ce98e..f2688dee5a03 100644
>> --- a/drivers/gpu/drm/i915/i915_perf.c
>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> @@ -196,6 +196,12 @@
>>
>>   #include "i915_drv.h"
>>   #include "i915_oa_hsw.h"
>> +#include "i915_oa_bdw.h"
>> +#include "i915_oa_chv.h"
>> +#include "i915_oa_sklgt2.h"
>> +#include "i915_oa_sklgt3.h"
>> +#include "i915_oa_sklgt4.h"
>> +#include "i915_oa_bxt.h"
>>
>>   /* HW requires this to be a power of two, between 128k and 16M, though driver
>>    * is currently generally designed assuming the largest 16M size is used such
>> @@ -215,7 +221,7 @@
>>    *
>>    * Although this can be observed explicitly while copying reports to userspace
>>    * by checking for a zeroed report-id field in tail reports, we want to account
>> - * for this earlier, as part of the _oa_buffer_check to avoid lots of redundant
>> + * for this earlier, as part of the oa_buffer_check to avoid lots of redundant
>>    * read() attempts.
>>    *
>>    * In effect we define a tail pointer for reading that lags the real tail
>> @@ -237,7 +243,7 @@
>>    * indicates that an updated tail pointer is needed.
>>    *
>>    * Most of the implementation details for this workaround are in
>> - * gen7_oa_buffer_check_unlocked() and gen7_appand_oa_reports()
>> + * oa_buffer_check_unlocked() and _append_oa_reports()
>>    *
>>    * Note for posterity: previously the driver used to define an effective tail
>>    * pointer that lagged the real pointer by a 'tail margin' measured in bytes
>> @@ -272,6 +278,13 @@ static u32 i915_perf_stream_paranoid = true;
>>
>>   #define INVALID_CTX_ID 0xffffffff
>>
>> +/* On Gen8+ automatically triggered OA reports include a 'reason' field... */
>> +#define OAREPORT_REASON_MASK           0x3f
>> +#define OAREPORT_REASON_SHIFT          19
>> +#define OAREPORT_REASON_TIMER          (1<<0)
>> +#define OAREPORT_REASON_CTX_SWITCH     (1<<3)
>> +#define OAREPORT_REASON_CLK_RATIO      (1<<5)
>> +
>>
>>   /* For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate
>>    *
>> @@ -303,6 +316,13 @@ static struct i915_oa_format hsw_oa_formats[I915_OA_FORMAT_MAX] = {
>>          [I915_OA_FORMAT_C4_B8]      = { 7, 64 },
>>   };
>>
>> +static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
>> +       [I915_OA_FORMAT_A12]                = { 0, 64 },
>> +       [I915_OA_FORMAT_A12_B8_C8]          = { 2, 128 },
>> +       [I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
>> +       [I915_OA_FORMAT_C4_B8]              = { 7, 64 },
>> +};
>> +
>>   #define SAMPLE_OA_REPORT      (1<<0)
>>
>>   /**
>> @@ -332,8 +352,20 @@ struct perf_open_properties {
>>          int oa_period_exponent;
>>   };
>>
>> +static u32 gen8_oa_hw_tail_read(struct drm_i915_private *dev_priv)
>> +{
>> +       return I915_READ(GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK;
>> +}
>> +
>> +static u32 gen7_oa_hw_tail_read(struct drm_i915_private *dev_priv)
>> +{
>> +       u32 oastatus1 = I915_READ(GEN7_OASTATUS1);
>> +
>> +       return oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
>> +}
>> +
>>   /**
>> - * gen7_oa_buffer_check_unlocked - check for data and update tail ptr state
>> + * oa_buffer_check_unlocked - check for data and update tail ptr state
>>    * @dev_priv: i915 device instance
>>    *
>>    * This is either called via fops (for blocking reads in user ctx) or the poll
>> @@ -356,12 +388,11 @@ struct perf_open_properties {
>>    *
>>    * Returns: %true if the OA buffer contains data, else %false
>>    */
>> -static bool gen7_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
>> +static bool oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
>>   {
>>          int report_size = dev_priv->perf.oa.oa_buffer.format_size;
>>          unsigned long flags;
>>          unsigned int aged_idx;
>> -       u32 oastatus1;
>>          u32 head, hw_tail, aged_tail, aging_tail;
>>          u64 now;
>>
>> @@ -381,8 +412,7 @@ static bool gen7_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
>>          aged_tail = dev_priv->perf.oa.oa_buffer.tails[aged_idx].offset;
>>          aging_tail = dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset;
>>
>> -       oastatus1 = I915_READ(GEN7_OASTATUS1);
>> -       hw_tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
>> +       hw_tail = dev_priv->perf.oa.ops.oa_hw_tail_read(dev_priv);
>>
>>          /* The tail pointer increases in 64 byte increments,
>>           * not in report_size steps...
>> @@ -404,6 +434,7 @@ static bool gen7_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
>>          if (aging_tail != INVALID_TAIL_PTR &&
>>              ((now - dev_priv->perf.oa.oa_buffer.aging_timestamp) >
>>               OA_TAIL_MARGIN_NSEC)) {
>> +
>>                  aged_idx ^= 1;
>>                  dev_priv->perf.oa.oa_buffer.aged_tail_idx = aged_idx;
>>
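Aside for reviewers trying to follow the tail race handling: the
two-stage aging in oa_buffer_check_unlocked() boils down to roughly this
(a simplified sketch, not the exact code):

    /* A new HW tail value first "ages" and is only exposed to readers
     * once OA_TAIL_MARGIN_NSEC has passed, giving the HW time to land
     * the corresponding report data in memory.
     */
    if (aging_tail != INVALID_TAIL_PTR &&
        (now - aging_timestamp) > OA_TAIL_MARGIN_NSEC) {
            /* The aging tail is now old enough to trust: promote it. */
            aged_idx ^= 1;
            aged_tail = aging_tail;
            aging_tail = INVALID_TAIL_PTR;
    }
    if (hw_tail != aged_tail && hw_tail != aging_tail) {
            /* Start aging the latest HW tail value. */
            aging_tail = hw_tail;
            aging_timestamp = now;
    }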
>> @@ -553,6 +584,287 @@ static int append_oa_sample(struct i915_perf_stream *stream,
>>    *
>>    * Returns: 0 on success, negative error code on failure.
>>    */
>> +static int gen8_append_oa_reports(struct i915_perf_stream *stream,
>> +                                 char __user *buf,
>> +                                 size_t count,
>> +                                 size_t *offset)
>> +{
>> +       struct drm_i915_private *dev_priv = stream->dev_priv;
>> +       int report_size = dev_priv->perf.oa.oa_buffer.format_size;
>> +       u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
>> +       u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
>> +       u32 mask = (OA_BUFFER_SIZE - 1);
>> +       size_t start_offset = *offset;
>> +       unsigned long flags;
>> +       unsigned int aged_tail_idx;
>> +       u32 head, tail;
>> +       u32 taken;
>> +       int ret = 0;
>> +
>> +       if (WARN_ON(!stream->enabled))
>> +               return -EIO;
>> +
>> +       spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
>> +
>> +       head = dev_priv->perf.oa.oa_buffer.head;
>> +       aged_tail_idx = dev_priv->perf.oa.oa_buffer.aged_tail_idx;
>> +       tail = dev_priv->perf.oa.oa_buffer.tails[aged_tail_idx].offset;
>> +
>> +       spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
>> +
>> +       /* An invalid tail pointer here means we're still waiting for the poll
>> +        * hrtimer callback to give us a pointer
>> +        */
>> +       if (tail == INVALID_TAIL_PTR)
>> +               return -EAGAIN;
>> +
>> +       /* NB: oa_buffer.head/tail include the gtt_offset which we don't want
>> +        * while indexing relative to oa_buf_base.
>> +        */
>> +       head -= gtt_offset;
>> +       tail -= gtt_offset;
>> +
>> +       /* An out of bounds or misaligned head or tail pointer implies a driver
>> +        * bug since we validate + align the tail pointers we read from the
>> +        * hardware and we are in full control of the head pointer which should
>> +        * only be incremented by multiples of the report size (notably also
>> +        * all a power of two).
>> +        */
>> +       if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
>> +                     tail > OA_BUFFER_SIZE || tail % report_size,
>> +                     "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
>> +                     head, tail))
>> +               return -EIO;
>> +
>> +
>> +       for (/* none */;
>> +            (taken = OA_TAKEN(tail, head));
>> +            head = (head + report_size) & mask) {
>> +               u8 *report = oa_buf_base + head;
>> +               u32 *report32 = (void *)report;
>> +               u32 ctx_id;
>> +               u32 reason;
>> +
>> +               /* All the report sizes factor neatly into the buffer
>> +                * size so we never expect to see a report split
>> +                * between the beginning and end of the buffer.
>> +                *
>> +                * Given the initial alignment check a misalignment
>> +                * here would imply a driver bug that would result
>> +                * in an overrun.
>> +                */
>> +               if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
>> +                       DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
>> +                       break;
>> +               }
>> +
>> +               /* The reason field includes flags identifying what
>> +                * triggered this specific report (mostly timer
>> +                * triggered or e.g. due to a context switch).
>> +                *
>> +                * This field is never expected to be zero so we can
>> +                * check that the report isn't invalid before copying
>> +                * it to userspace...
>> +                */
>> +               reason = ((report32[0] >> OAREPORT_REASON_SHIFT) &
>> +                         OAREPORT_REASON_MASK);
>> +               if (reason == 0) {
>> +                       if (__ratelimit(&dev_priv->perf.oa.spurious_report_rs))
>> +                               DRM_NOTE("Skipping spurious, invalid OA report\n");
>> +                       continue;
>> +               }
>> +
>> +               /* XXX: Just keep the lower 21 bits for now since I'm not
>> +                * entirely sure if the HW touches any of the higher bits in
>> +                * this field
>> +                */
>> +               ctx_id = report32[2] & 0x1fffff;
>> +
>> +               /* Squash whatever is in the CTX_ID field if it's marked as
>> +                * invalid to be sure we avoid false-positive, single-context
>> +                * filtering below...
>> +                *
>> +                * Note: that we don't clear the valid_ctx_bit so userspace can
>> +                * understand that the ID has been squashed by the kernel.
>> +                */
>> +               if (!(report32[0] & dev_priv->perf.oa.gen8_valid_ctx_bit))
>> +                       ctx_id = report32[2] = INVALID_CTX_ID;
>> +
>> +               /* NB: For Gen 8 the OA unit no longer supports clock gating
>> +                * off for a specific context and the kernel can't securely
>> +                * stop the counters from updating as system-wide / global
>> +                * values.
>> +                *
>> +                * Automatic reports now include a context ID so reports can be
>> +                * filtered on the cpu but it's not worth trying to
>> +                * automatically subtract/hide counter progress for other
>> +                * contexts while filtering since we can't stop userspace
>> +                * issuing MI_REPORT_PERF_COUNT commands which would still
>> +                * provide a side-band view of the real values.
>> +                *
>> +                * To allow userspace (such as Mesa/GL_INTEL_performance_query)
>> +                * to normalize counters for a single filtered context, it needs
>> +                * to be forwarded bookend context-switch reports so that it
>> +                * can track switches in between MI_REPORT_PERF_COUNT commands
>> +                * and can itself subtract/ignore the progress of counters
>> +                * associated with other contexts. Note that the hardware
>> +                * automatically triggers reports when switching to a new
>> +                * context which are tagged with the ID of the newly active
>> +                * context. To avoid the complexity (and likely fragility) of
>> +                * reading ahead while parsing reports to try and minimize
>> +                * forwarding redundant context switch reports (i.e. between
>> +                * other, unrelated contexts) we simply elect to forward them
>> +                * all.
>> +                *
>> +                * We don't rely solely on the reason field to identify context
>> +                * switches since it's not uncommon for periodic samples to
>> +                * identify a switch before any 'context switch' report.
>> +                */
>> +               if (!dev_priv->perf.oa.exclusive_stream->ctx ||
>> +                   dev_priv->perf.oa.specific_ctx_id == ctx_id ||
>> +                   (dev_priv->perf.oa.oa_buffer.last_ctx_id ==
>> +                    dev_priv->perf.oa.specific_ctx_id) ||
>> +                   reason & OAREPORT_REASON_CTX_SWITCH) {
>> +
>> +                       /* While filtering for a single context we avoid
>> +                        * leaking the IDs of other contexts.
>> +                        */
>> +                       if (dev_priv->perf.oa.exclusive_stream->ctx &&
>> +                           dev_priv->perf.oa.specific_ctx_id != ctx_id) {
>> +                               report32[2] = INVALID_CTX_ID;
>> +                       }
>> +
>> +                       ret = append_oa_sample(stream, buf, count, offset,
>> +                                              report);
>> +                       if (ret)
>> +                               break;
>> +
>> +                       dev_priv->perf.oa.oa_buffer.last_ctx_id = ctx_id;
>> +               }
>> +
>> +               /* The above reason field sanity check is based on
>> +                * the assumption that the OA buffer is initially
>> +                * zeroed and we reset the field after copying so the
>> +                * check is still meaningful once old reports start
>> +                * being overwritten.
>> +                */
>> +               report32[0] = 0;
>> +       }
>> +
>> +       if (start_offset != *offset) {
>> +               spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
>> +
>> +               /* We removed the gtt_offset for the copy loop above, indexing
>> +                * relative to oa_buf_base so put back here...
>> +                */
>> +               head += gtt_offset;
>> +
>> +               I915_WRITE(GEN8_OAHEADPTR, head & GEN8_OAHEADPTR_MASK);
>> +               dev_priv->perf.oa.oa_buffer.head = head;
>> +
>> +               spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
>> +       }
>> +
>> +       return ret;
>> +}
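FWIW, the OA_TAKEN() arithmetic driving this copy loop is defined near
the top of i915_perf.c; from memory it's just modular ring math relying
on the power-of-two buffer size, essentially:

    /* Bytes of report data available between head and tail. */
    #define OA_TAKEN(tail, head) (((tail) - (head)) & (OA_BUFFER_SIZE - 1))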
>> +
>> +/**
>> + * gen8_oa_read - copy status records then buffered OA reports
>> + * @stream: An i915-perf stream opened for OA metrics
>> + * @buf: destination buffer given by userspace
>> + * @count: the number of bytes userspace wants to read
>> + * @offset: (inout): the current position for writing into @buf
>> + *
>> + * Checks OA unit status registers and if necessary appends corresponding
>> + * status records for userspace (such as for a buffer full condition) and then
>> + * initiate appending any buffered OA reports.
>> + *
>> + * Updates @offset according to the number of bytes successfully copied into
>> + * the userspace buffer.
>> + *
>> + * NB: some data may be successfully copied to the userspace buffer
>> + * even if an error is returned, and this is reflected in the
>> + * updated @offset.
>> + *
>> + * Returns: zero on success or a negative error code
>> + */
>> +static int gen8_oa_read(struct i915_perf_stream *stream,
>> +                       char __user *buf,
>> +                       size_t count,
>> +                       size_t *offset)
>> +{
>> +       struct drm_i915_private *dev_priv = stream->dev_priv;
>> +       u32 oastatus;
>> +       int ret;
>> +
>> +       if (WARN_ON(!dev_priv->perf.oa.oa_buffer.vaddr))
>> +               return -EIO;
>> +
>> +       oastatus = I915_READ(GEN8_OASTATUS);
>> +
>> +       /* We treat OABUFFER_OVERFLOW as a significant error:
>> +        *
>> +        * Although theoretically we could handle this more gracefully
>> +        * sometimes, some Gens don't correctly suppress certain
>> +        * automatically triggered reports in this condition and so we
>> +        * have to assume that old reports are now being trampled
>> +        * over.
>> +        *
>> +        * Given that we don't currently give userspace control over the
>> +        * OA buffer size and always configure a large 16MB buffer, a
>> +        * buffer overflow likely indicates that something has gone quite
>> +        * badly wrong anyway.
>> +        */
>> +       if (oastatus & GEN8_OASTATUS_OABUFFER_OVERFLOW) {
>> +               ret = append_oa_status(stream, buf, count, offset,
>> +                                      DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
>> +               if (ret)
>> +                       return ret;
>> +
>> +               DRM_DEBUG("OA buffer overflow (exponent = %d): force restart\n",
>> +                         dev_priv->perf.oa.period_exponent);
>> +
>> +               dev_priv->perf.oa.ops.oa_disable(dev_priv);
>> +               dev_priv->perf.oa.ops.oa_enable(dev_priv);
>> +
>> +               /* Note: .oa_enable() is expected to re-init the oabuffer
>> +                * and reset GEN8_OASTATUS for us
>> +                */
>> +               oastatus = I915_READ(GEN8_OASTATUS);
>> +       }
>> +
>> +       if (oastatus & GEN8_OASTATUS_REPORT_LOST) {
>> +               ret = append_oa_status(stream, buf, count, offset,
>> +                                      DRM_I915_PERF_RECORD_OA_REPORT_LOST);
>> +               if (ret)
>> +                       return ret;
>> +               I915_WRITE(GEN8_OASTATUS,
>> +                          oastatus & ~GEN8_OASTATUS_REPORT_LOST);
>> +       }
>> +
>> +       return gen8_append_oa_reports(stream, buf, count, offset);
>> +}
>> +
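For context on how these records reach userspace: a reader of the stream
fd sees drm_i915_perf_record_header framed records, so the status
records appended above arrive interleaved with the samples. A minimal
consumer sketch (error handling omitted, stream_fd coming from
DRM_IOCTL_I915_PERF_OPEN):

    uint8_t buf[4096];
    ssize_t len = read(stream_fd, buf, sizeof(buf));
    uint8_t *p = buf;

    while (len > 0 && p < buf + len) {
            const struct drm_i915_perf_record_header *header = (void *)p;

            switch (header->type) {
            case DRM_I915_PERF_RECORD_SAMPLE:         /* raw OA report */
            case DRM_I915_PERF_RECORD_OA_REPORT_LOST: /* REPORT_LOST above */
            case DRM_I915_PERF_RECORD_OA_BUFFER_LOST: /* BUFFER_LOST above */
                    break;
            }
            p += header->size;
    }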
>> +/**
>> + * gen7_append_oa_reports - Copies all buffered OA reports into userspace read() buffer.
>> + * @stream: An i915-perf stream opened for OA metrics
>> + * @buf: destination buffer given by userspace
>> + * @count: the number of bytes userspace wants to read
>> + * @offset: (inout): the current position for writing into @buf
>> + *
>> + * Notably any error condition resulting in a short read (-%ENOSPC or
>> + * -%EFAULT) will be returned even though one or more records may
>> + * have been successfully copied. In this case it's up to the caller
>> + * to decide if the error should be squashed before returning to
>> + * userspace.
>> + *
>> + * Note: reports are consumed from the head, and appended to the
>> + * tail, so the tail chases the head?... If you think that's mad
>> + * and back-to-front you're not alone, but this follows the
>> + * Gen PRM naming convention.
>> + *
>> + * Returns: 0 on success, negative error code on failure.
>> + */
>>   static int gen7_append_oa_reports(struct i915_perf_stream *stream,
>>                                    char __user *buf,
>>                                    size_t count,
>> @@ -733,7 +1045,8 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
>>                  if (ret)
>>                          return ret;
>>
>> -               DRM_DEBUG("OA buffer overflow: force restart\n");
>> +               DRM_DEBUG("OA buffer overflow (exponent = %d): force restart\n",
>> +                         dev_priv->perf.oa.period_exponent);
>>
>>                  dev_priv->perf.oa.ops.oa_disable(dev_priv);
>>                  dev_priv->perf.oa.ops.oa_enable(dev_priv);
>> @@ -776,7 +1089,7 @@ static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
>>                  return -EIO;
>>
>>          return wait_event_interruptible(dev_priv->perf.oa.poll_wq,
>> -                                       dev_priv->perf.oa.ops.oa_buffer_check(dev_priv));
>> +                                       oa_buffer_check_unlocked(dev_priv));
>>   }
>>
>>   /**
>> @@ -833,33 +1146,37 @@ static int i915_oa_read(struct i915_perf_stream *stream,
>>   static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
>>   {
>>          struct drm_i915_private *dev_priv = stream->dev_priv;
>> -       struct intel_engine_cs *engine = dev_priv->engine[RCS];
>> -       int ret;
>>
>> -       ret = i915_mutex_lock_interruptible(&dev_priv->drm);
>> -       if (ret)
>> -               return ret;
>> +       if (i915.enable_execlists)
>> +               dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
>> +       else {
>> +               struct intel_engine_cs *engine = dev_priv->engine[RCS];
>> +               int ret;
>>
>> -       /* As the ID is the gtt offset of the context's vma we pin
>> -        * the vma to ensure the ID remains fixed.
>> -        *
>> -        * NB: implied RCS engine...
>> -        */
>> -       ret = engine->context_pin(engine, stream->ctx);
>> -       if (ret)
>> -               goto unlock;
>> +               ret = i915_mutex_lock_interruptible(&dev_priv->drm);
>> +               if (ret)
>> +                       return ret;
>>
>> -       /* Explicitly track the ID (instead of calling i915_ggtt_offset()
>> -        * on the fly) considering the difference with gen8+ and
>> -        * execlists
>> -        */
>> -       dev_priv->perf.oa.specific_ctx_id =
>> -               i915_ggtt_offset(stream->ctx->engine[engine->id].state);
>> +               /* As the ID is the gtt offset of the context's vma we pin
>> +                * the vma to ensure the ID remains fixed.
>> +                */
>> +               ret = engine->context_pin(engine, stream->ctx);
>> +               if (ret) {
>> +                       mutex_unlock(&dev_priv->drm.struct_mutex);
>> +                       return ret;
>> +               }
>>
>> -unlock:
>> -       mutex_unlock(&dev_priv->drm.struct_mutex);
>> +               /* Explicitly track the ID (instead of calling
>> +                * i915_ggtt_offset() on the fly) considering the difference
>> +                * with gen8+ and execlists
>> +                */
>> +               dev_priv->perf.oa.specific_ctx_id =
>> +                       i915_ggtt_offset(stream->ctx->engine[engine->id].state);
>>
>> -       return ret;
>> +               mutex_unlock(&dev_priv->drm.struct_mutex);
>> +       }
>> +
>> +       return 0;
>>   }
>>
>>   /**
>> @@ -874,12 +1191,16 @@ static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
>>          struct drm_i915_private *dev_priv = stream->dev_priv;
>>          struct intel_engine_cs *engine = dev_priv->engine[RCS];
>>
>> -       mutex_lock(&dev_priv->drm.struct_mutex);
>> +       if (i915.enable_execlists) {
>> +               dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
>> +       } else {
>> +               mutex_lock(&dev_priv->drm.struct_mutex);
>>
>> -       dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
>> -       engine->context_unpin(engine, stream->ctx);
>> +               dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
>> +               engine->context_unpin(engine, stream->ctx);
>>
>> -       mutex_unlock(&dev_priv->drm.struct_mutex);
>> +               mutex_unlock(&dev_priv->drm.struct_mutex);
>> +       }
>>   }
>>
>>   static void
>> @@ -969,6 +1290,61 @@ static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
>>          dev_priv->perf.oa.pollin = false;
>>   }
>>
>> +static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
>> +{
>> +       u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
>> +       unsigned long flags;
>> +
>> +       spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
>> +
>> +       I915_WRITE(GEN8_OASTATUS, 0);
>> +       I915_WRITE(GEN8_OAHEADPTR, gtt_offset);
>> +       dev_priv->perf.oa.oa_buffer.head = gtt_offset;
>> +
>> +       I915_WRITE(GEN8_OABUFFER_UDW, 0);
>> +
>> +       /* PRM says:
>> +        *
>> +        *  "This MMIO must be set before the OATAILPTR
>> +        *  register and after the OAHEADPTR register. This is
>> +        *  to enable proper functionality of the overflow
>> +        *  bit."
>> +        */
>> +       I915_WRITE(GEN8_OABUFFER, gtt_offset |
>> +                  OABUFFER_SIZE_16M | OA_MEM_SELECT_GGTT);
>> +       I915_WRITE(GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);
>> +
>> +       /* Mark that we need updated tail pointers to read from... */
>> +       dev_priv->perf.oa.oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
>> +       dev_priv->perf.oa.oa_buffer.tails[1].offset = INVALID_TAIL_PTR;
>> +
>> +       /* Reset state used to recognise context switches, affecting which
>> +        * reports we will forward to userspace while filtering for a single
>> +        * context.
>> +        */
>> +       dev_priv->perf.oa.oa_buffer.last_ctx_id = INVALID_CTX_ID;
>> +
>> +       spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
>> +
>> +       /* NB: although the OA buffer will initially be allocated
>> +        * zeroed via shmfs (and so this memset is redundant when
>> +        * first allocating), we may re-init the OA buffer, either
>> +        * when re-enabling a stream or in error/reset paths.
>> +        *
>> +        * The reason we clear the buffer for each re-init is for the
>> +        * sanity check in gen8_append_oa_reports() that looks at the
>> +        * reason field to make sure it's non-zero which relies on
>> +        * the assumption that new reports are being written to zeroed
>> +        * memory...
>> +        */
>> +       memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
>> +
>> +       /* Maybe make ->pollin per-stream state if we support multiple
>> +        * concurrent streams in the future.
>> +        */
>> +       dev_priv->perf.oa.pollin = false;
>> +}
>> +
>>   static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
>>   {
>>          struct drm_i915_gem_object *bo;
>> @@ -1113,6 +1489,280 @@ static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
>>                                        ~GT_NOA_ENABLE));
>>   }
>>
>> +/*
>> + * From Broadwell PRM, 3D-Media-GPGPU -> Register State Context
>> + *
>> + * MMIO reads or writes to any of the registers listed in the
>> + * “Register State Context image” subsections through HOST/IA
>> + * MMIO interface for debug purposes must follow the steps below:
>> + *
>> + * - SW should set the Force Wakeup bit to prevent GT from entering C6.
>> + * - Write 0x2050[31:0] = 0x00010001 (disable sequence).
>> + * - Disable IDLE messaging in CS (Write 0x2050[31:0] = 0x00010001).
>> + * - BDW:  Poll/Wait for register bits of 0x22AC[6:0] turn to 0x30 value.
>> + * - SKL+: Poll/Wait for register bits of 0x22A4[6:0] turn to 0x30 value.
>> + * - Read/Write to desired MMIO registers.
>> + * - Enable IDLE messaging in CS (Write 0x2050[31:0] = 0x00010000).
>> + * - Force Wakeup bit should be reset to enable C6 entry.
>> + *
>> + * XXX: don't nest or overlap calls to this API, it has no ref
>> + * counting to track how many entities require the RCS to be
>> + * blocked from being idle.
>> + */
>> +static int gen8_begin_ctx_mmio(struct drm_i915_private *dev_priv)
>> +{
>> +       i915_reg_t fsm_reg = INTEL_GEN(dev_priv) > 8 ?
>> +               GEN9_RCS_FE_FSM2 : GEN6_RCS_PWR_FSM;
>> +       int ret = 0;
>> +
>> +       /* In execlist mode there is no active context while idle
>> +        * (and we shouldn't be using this in any other mode anyway)
>> +        */
>> +       if (WARN_ON(!i915.enable_execlists))
>> +               return -EINVAL;
>> +
>> +       intel_uncore_forcewake_get(dev_priv, FORCEWAKE_RENDER);
>> +
>> +       I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
>> +                  _MASKED_BIT_ENABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
>> +
>> +       /* Note: we don't currently have a good handle on the maximum
>> +        * latency for this wake up so while we only need to hold rcs
>> +        * busy from process context we at least keep the waiting
>> +        * interruptible...
>> +        */
>> +       ret = intel_wait_for_register(dev_priv, fsm_reg, 0x3f, 0x30, 50);
>> +       if (ret == -ETIMEDOUT)
>> +               DRM_ERROR("Timeout waiting to be able to begin per-ctx mmio\n");
>> +
>> +       if (ret)
>> +               intel_uncore_forcewake_put(dev_priv, FORCEWAKE_RENDER);
>> +
>> +       return ret;
>> +}
>> +
>> +static void gen8_end_ctx_mmio(struct drm_i915_private *dev_priv)
>> +{
>> +       if (WARN_ON(!i915.enable_execlists))
>> +               return;
>> +
>> +       I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
>> +                  _MASKED_BIT_DISABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
>> +       intel_uncore_forcewake_put(dev_priv, FORCEWAKE_RENDER);
>> +}
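For clarity, the intended usage pattern (as in
gen8_configure_all_contexts() further down) is strictly:

    ret = gen8_begin_ctx_mmio(dev_priv);
    if (ret)
            return ret;

    /* ... MMIO reads/writes of per-context registers ... */

    gen8_end_ctx_mmio(dev_priv);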
>> +
>> +/* NB: It must always remain pointer safe to run this even if the OA unit
>> + * has been disabled.
>> + *
>> + * It's fine to put out-of-date values into these per-context registers
>> + * in the case that the OA unit has been disabled.
>> + */
>> +static void gen8_update_reg_state_unlocked(struct i915_gem_context *ctx,
>> +                                          u32 *reg_state)
>> +{
>> +       struct drm_i915_private *dev_priv = ctx->i915;
>> +       const struct i915_oa_reg *flex_regs = dev_priv->perf.oa.flex_regs;
>> +       int n_flex_regs = dev_priv->perf.oa.flex_regs_len;
>> +       u32 ctx_oactxctrl = dev_priv->perf.oa.ctx_oactxctrl_offset;
>> +       u32 ctx_flexeu0 = dev_priv->perf.oa.ctx_flexeu0_offset;
>> +       /* The MMIO offsets for Flex EU registers aren't contiguous */
>> +       u32 flex_mmio[] = {
>> +               i915_mmio_reg_offset(EU_PERF_CNTL0),
>> +               i915_mmio_reg_offset(EU_PERF_CNTL1),
>> +               i915_mmio_reg_offset(EU_PERF_CNTL2),
>> +               i915_mmio_reg_offset(EU_PERF_CNTL3),
>> +               i915_mmio_reg_offset(EU_PERF_CNTL4),
>> +               i915_mmio_reg_offset(EU_PERF_CNTL5),
>> +               i915_mmio_reg_offset(EU_PERF_CNTL6),
>> +       };
>> +       int i;
>> +
>> +       reg_state[ctx_oactxctrl] = i915_mmio_reg_offset(GEN8_OACTXCONTROL);
>> +       reg_state[ctx_oactxctrl+1] = (dev_priv->perf.oa.period_exponent <<
>> +                                     GEN8_OA_TIMER_PERIOD_SHIFT) |
>> +                                    (dev_priv->perf.oa.periodic ?
>> +                                     GEN8_OA_TIMER_ENABLE : 0) |
>> +                                    GEN8_OA_COUNTER_RESUME;
>> +
>> +       for (i = 0; i < ARRAY_SIZE(flex_mmio); i++) {
>> +               u32 state_offset = ctx_flexeu0 + i * 2;
>> +               u32 mmio = flex_mmio[i];
>> +
>> +               /* This arbitrary default will select the 'EU FPU0 Pipeline
>> +                * Active' event. In the future it's anticipated that there
>> +                * will be an explicit 'No Event' we can select, but not yet...
>> +                */
>> +               u32 value = 0;
>> +               int j;
>> +
>> +               for (j = 0; j < n_flex_regs; j++) {
>> +                       if (i915_mmio_reg_offset(flex_regs[j].addr) == mmio) {
>> +                               value = flex_regs[j].value;
>> +                               break;
>> +                       }
>> +               }
>> +
>> +               reg_state[state_offset] = mmio;
>> +               reg_state[state_offset+1] = value;
>> +       }
>> +}
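Just to make the layout explicit: the register state context image is
effectively a flat u32 array of (mmio offset, value) pairs, so
ctx_oactxctrl_offset/ctx_flexeu0_offset index the offset slot and +1
holds the value the HW reloads on context restore. Conceptually
(illustrative only, the right-hand side names are placeholders):

    /* The real dword offsets are gen-specific, see i915_perf_init(). */
    reg_state[ctx_oactxctrl + 0] = i915_mmio_reg_offset(GEN8_OACTXCONTROL);
    reg_state[ctx_oactxctrl + 1] = period_exponent_and_enable_bits;
    reg_state[ctx_flexeu0 + 0]   = i915_mmio_reg_offset(EU_PERF_CNTL0);
    reg_state[ctx_flexeu0 + 1]   = flex_eu0_config_value;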
>> +
>> +/* Manages updating the per-context aspects of the OA stream
>> + * configuration across all contexts.
>> + *
>> + * The awkward consideration here is that OACTXCONTROL controls the
>> + * exponent for periodic sampling which is primarily used for system
>> + * wide profiling where we'd like a consistent sampling period even in
>> + * the face of context switches.
>> + *
>> + * Our approach of updating the register state context (as opposed to
>> + * say using a workaround batch buffer) ensures that the hardware
>> + * won't automatically reload an out-of-date timer exponent even
>> + * transiently before a WA BB could be parsed.
>> + *
>> + * This function needs to:
>> + * - Ensure the currently running context's per-context OA state is
>> + *   updated
>> + * - Ensure that all existing contexts will have the correct per-context
>> + *   OA state if they are scheduled for use.
>> + * - Ensure any new contexts will be initialized with the correct
>> + *   per-context OA state.
>> + *
>> + * Note: it's only the RCS/Render context that has any OA state.
>> + */
>> +static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv)
>> +{
>> +       struct i915_gem_context *ctx;
>> +       int ret;
>> +
>> +       ret = i915_mutex_lock_interruptible(&dev_priv->drm);
>> +       if (ret)
>> +               return ret;
>> +
>> +       /* Switch away from any user context.  */
>> +       ret = i915_gem_switch_to_kernel_context(dev_priv);
>> +       if (ret)
> mutex_unlock(&dev_priv->drm.struct_mutex);

Duh! Thanks!

>
>> +               return ret;
>> +
>> +       /* The OA register config is set up through the context image. This
>> +        * image might be written to by the GPU on context switch (in
>> +        * particular on lite-restore). This means we can't safely update a
>> +        * context's image if that context is scheduled/submitted to run on
>> +        * the GPU.
>> +        *
>> +        * We could emit the OA register config through the batch buffer but
>> +        * this might leave a small interval of time where the OA unit is
>> +        * configured at an invalid sampling period.
>> +        *
>> +        * So far the best way to work around this issue seems to be draining
>> +        * the GPU of any submitted work.
>> +       ret = i915_gem_wait_for_idle(dev_priv,
>> +                                    I915_WAIT_INTERRUPTIBLE |
>> +                                    I915_WAIT_LOCKED);
>> +       if (ret) {
>> +               mutex_unlock(&dev_priv->drm.struct_mutex);
>> +               return ret;
>> +       }
>> +
>> +       /* Update all contexts now that we've stalled the submission. */
>> +       list_for_each_entry(ctx, &dev_priv->context_list, link) {
>> +               if (!ctx->engine[RCS].initialised)
>> +                       continue;
>> +
>> +               gen8_update_reg_state_unlocked(ctx,
>> +                                              ctx->engine[RCS].lrc_reg_state);
>> +       }
>> +
>> +       mutex_unlock(&dev_priv->drm.struct_mutex);
>> +
>> +       /* Now update the current context.
>> +        *
>> +        * Note: Using MMIO to update per-context registers requires
>> +        * some extra care...
>> +        */
>> +       ret = gen8_begin_ctx_mmio(dev_priv);
>> +       if (ret) {
>> +               DRM_ERROR("Failed to bring RCS out of idle to update current ctx OA state\n");
>> +               return ret;
>> +       }
> So do we still need the mmio for the current context, given that we
> now idle everything and then update the reg state?

I would say yes, because we don't have any guarantee that another
context is going to be scheduled after that, yet we want the config to
be applied before we return.

>> +
>> +       I915_WRITE(GEN8_OACTXCONTROL, ((dev_priv->perf.oa.period_exponent <<
>> +                                       GEN8_OA_TIMER_PERIOD_SHIFT) |
>> +                                     (dev_priv->perf.oa.periodic ?
>> +                                      GEN8_OA_TIMER_ENABLE : 0) |
>> +                                     GEN8_OA_COUNTER_RESUME));
>> +
>> +       config_oa_regs(dev_priv, dev_priv->perf.oa.flex_regs,
>> +                       dev_priv->perf.oa.flex_regs_len);
>> +
>> +       gen8_end_ctx_mmio(dev_priv);
>> +
>> +       return 0;
>> +}
>> +
>> +static int gen8_enable_metric_set(struct drm_i915_private *dev_priv)
>> +{
>> +       int ret = dev_priv->perf.oa.ops.select_metric_set(dev_priv);
>> +
>> +       if (ret)
>> +               return ret;
>> +
>> +       /* We disable slice/unslice clock ratio change reports on SKL since
>> +        * they are too noisy. The HW generates a lot of redundant reports
>> +        * where the ratio hasn't really changed causing a lot of redundant
>> +        * work to process and increasing the chances we'll hit buffer
>> +        * overruns.
>> +        *
>> +        * Although we don't currently use the 'disable overrun' OABUFFER
>> +        * feature it's worth noting that clock ratio reports have to be
>> +        * disabled before considering use of that feature since the HW doesn't
>> +        * correctly block these reports.
>> +        *
>> +        * Currently none of the high-level metrics we have depend on knowing
>> +        * this ratio to normalize.
>> +        *
>> +        * Note: This register is not power context saved and restored, but
>> +        * that's OK considering that we disable RC6 while the OA unit is
>> +        * enabled.
>> +        *
>> +        * The _INCLUDE_CLK_RATIO bit allows the slice/unslice frequency to
>> +        * be read back from automatically triggered reports, as part of the
>> +        * RPT_ID field.
>> +        */
>> +       if (IS_SKYLAKE(dev_priv) || IS_BROXTON(dev_priv)) {
>> +               I915_WRITE(GEN8_OA_DEBUG,
>> +                          _MASKED_BIT_ENABLE(GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
>> +                                             GEN9_OA_DEBUG_INCLUDE_CLK_RATIO));
>> +       }
>> +
>> +       I915_WRITE(GDT_CHICKEN_BITS, 0xA0);
>> +       config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs,
>> +                      dev_priv->perf.oa.mux_regs_len);
>> +       I915_WRITE(GDT_CHICKEN_BITS, 0x80);
>> +
>> +       /* It takes a fairly long time for a new MUX configuration to
>> +        * be applied after these register writes. This delay
>> +        * duration is taken from Haswell (derived empirically based on
>> +        * the render_basic config) but hopefully it covers the
>> +        * maximum configuration latency for Gen8+ too...
>> +        */
>> +       usleep_range(15000, 20000);
>> +
>> +       config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs,
>> +                      dev_priv->perf.oa.b_counter_regs_len);
>> +
>> +       ret = gen8_configure_all_contexts(dev_priv);
>> +       if (ret)
>> +               return ret;
>> +
>> +       return 0;
>> +}
>> +
>> +static void gen8_disable_metric_set(struct drm_i915_private *dev_priv)
>> +{
>> +       /* NOP */
>> +}
>> +
>>   static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv)
>>   {
>>          lockdep_assert_held(&dev_priv->perf.hook_lock);
>> @@ -1157,6 +1807,29 @@ static void gen7_oa_enable(struct drm_i915_private *dev_priv)
>>          spin_unlock_irqrestore(&dev_priv->perf.hook_lock, flags);
>>   }
>>
>> +static void gen8_oa_enable(struct drm_i915_private *dev_priv)
>> +{
>> +       u32 report_format = dev_priv->perf.oa.oa_buffer.format;
>> +
>> +       /* Reset buf pointers so we don't forward reports from before now.
>> +        *
>> +        * Think carefully if considering trying to avoid this, since it
>> +        * also ensures status flags and the buffer itself are cleared
>> +        * in error paths, and we have checks for invalid reports based
>> +        * on the assumption that certain fields are written to zeroed
>> +        * memory, which this helps maintain.
>> +        */
>> +       gen8_init_oa_buffer(dev_priv);
>> +
>> +       /* Note: we don't rely on the hardware to perform single context
>> +        * filtering and instead filter on the cpu based on the context-id
>> +        * field of reports
>> +        */
>> +       I915_WRITE(GEN8_OACONTROL, (report_format <<
>> +                                   GEN8_OA_REPORT_FORMAT_SHIFT) |
>> +                                  GEN8_OA_COUNTER_ENABLE);
>> +}
>> +
>>   /**
>>    * i915_oa_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for OA stream
>>    * @stream: An i915 perf stream opened for OA metrics
>> @@ -1183,6 +1856,11 @@ static void gen7_oa_disable(struct drm_i915_private *dev_priv)
>>          I915_WRITE(GEN7_OACONTROL, 0);
>>   }
>>
>> +static void gen8_oa_disable(struct drm_i915_private *dev_priv)
>> +{
>> +       I915_WRITE(GEN8_OACONTROL, 0);
>> +}
>> +
>>   /**
>>    * i915_oa_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for OA stream
>>    * @stream: An i915 perf stream opened for OA metrics
>> @@ -1361,6 +2039,21 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>>          return ret;
>>   }
>>
>> +void i915_oa_init_reg_state(struct intel_engine_cs *engine,
>> +                           struct i915_gem_context *ctx,
>> +                           u32 *reg_state)
>> +{
>> +       struct drm_i915_private *dev_priv = engine->i915;
>> +
>> +       if (engine->id != RCS)
>> +               return;
>> +
>> +       if (!dev_priv->perf.initialized)
>> +               return;
>> +
>> +       gen8_update_reg_state_unlocked(ctx, reg_state);
>> +}
>> +
>>   /**
>>    * i915_perf_read_locked - &i915_perf_stream_ops->read with error normalisation
>>    * @stream: An i915 perf stream
>> @@ -1486,7 +2179,7 @@ static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
>>                  container_of(hrtimer, typeof(*dev_priv),
>>                               perf.oa.poll_check_timer);
>>
>> -       if (dev_priv->perf.oa.ops.oa_buffer_check(dev_priv)) {
>> +       if (oa_buffer_check_unlocked(dev_priv)) {
>>                  dev_priv->perf.oa.pollin = true;
>>                  wake_up(&dev_priv->perf.oa.poll_wq);
>>          }
>> @@ -1775,6 +2468,7 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
>>          struct i915_gem_context *specific_ctx = NULL;
>>          struct i915_perf_stream *stream = NULL;
>>          unsigned long f_flags = 0;
>> +       bool privileged_op = true;
>>          int stream_fd;
>>          int ret;
>>
>> @@ -1792,12 +2486,28 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
>>                  }
>>          }
>>
>> +       /* On Haswell the OA unit supports clock gating off for a specific
>> +        * context and in this mode there's no visibility of metrics for the
>> +        * rest of the system, which we consider acceptable for a
>> +        * non-privileged client.
>> +        *
>> +        * For Gen8+ the OA unit no longer supports clock gating off for a
>> +        * specific context and the kernel can't securely stop the counters
>> +        * from updating as system-wide / global values. Even though we can
>> +        * filter reports based on the included context ID we can't block
>> +        * clients from seeing the raw / global counter values via
>> +        * MI_REPORT_PERF_COUNT commands and so consider it a privileged op to
>> +        * enable the OA unit by default.
>> +        */
>> +       if (IS_HASWELL(dev_priv) && specific_ctx)
>> +               privileged_op = false;
>> +
>>          /* Similar to perf's kernel.perf_paranoid_cpu sysctl option
>>           * we check a dev.i915.perf_stream_paranoid sysctl option
>>           * to determine if it's ok to access system wide OA counters
>>           * without CAP_SYS_ADMIN privileges.
>>           */
>> -       if (!specific_ctx &&
>> +       if (privileged_op &&
>>              i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
>>                  DRM_DEBUG("Insufficient privileges to open system-wide i915 perf stream\n");
>>                  ret = -EACCES;
>> @@ -2069,9 +2779,6 @@ int i915_perf_open_ioctl(struct drm_device *dev, void *data,
>>    */
>>   void i915_perf_register(struct drm_i915_private *dev_priv)
>>   {
>> -       if (!IS_HASWELL(dev_priv))
>> -               return;
>> -
>>          if (!dev_priv->perf.initialized)
>>                  return;
>>
>> @@ -2087,11 +2794,38 @@ void i915_perf_register(struct drm_i915_private *dev_priv)
>>          if (!dev_priv->perf.metrics_kobj)
>>                  goto exit;
>>
>> -       if (i915_perf_register_sysfs_hsw(dev_priv)) {
>> -               kobject_put(dev_priv->perf.metrics_kobj);
>> -               dev_priv->perf.metrics_kobj = NULL;
>> +       if (IS_HASWELL(dev_priv)) {
>> +               if (i915_perf_register_sysfs_hsw(dev_priv))
>> +                       goto sysfs_error;
>> +       } else if (IS_BROADWELL(dev_priv)) {
>> +               if (i915_perf_register_sysfs_bdw(dev_priv))
>> +                       goto sysfs_error;
>> +       } else if (IS_CHERRYVIEW(dev_priv)) {
>> +               if (i915_perf_register_sysfs_chv(dev_priv))
>> +                       goto sysfs_error;
>> +       } else if (IS_SKYLAKE(dev_priv)) {
>> +               if (IS_SKL_GT2(dev_priv)) {
>> +                       if (i915_perf_register_sysfs_sklgt2(dev_priv))
>> +                               goto sysfs_error;
>> +               } else if (IS_SKL_GT3(dev_priv)) {
>> +                       if (i915_perf_register_sysfs_sklgt3(dev_priv))
>> +                               goto sysfs_error;
>> +               } else if (IS_SKL_GT4(dev_priv)) {
>> +                       if (i915_perf_register_sysfs_sklgt4(dev_priv))
>> +                               goto sysfs_error;
>> +               } else
>> +                       goto sysfs_error;
>> +       } else if (IS_BROXTON(dev_priv)) {
>> +               if (i915_perf_register_sysfs_bxt(dev_priv))
>> +                       goto sysfs_error;
>>          }
>>
>> +       goto exit;
>> +
>> +sysfs_error:
>> +       kobject_put(dev_priv->perf.metrics_kobj);
>> +       dev_priv->perf.metrics_kobj = NULL;
>> +
>>   exit:
>>          mutex_unlock(&dev_priv->perf.lock);
>>   }
>> @@ -2107,13 +2841,24 @@ void i915_perf_register(struct drm_i915_private *dev_priv)
>>    */
>>   void i915_perf_unregister(struct drm_i915_private *dev_priv)
>>   {
>> -       if (!IS_HASWELL(dev_priv))
>> -               return;
>> -
>>          if (!dev_priv->perf.metrics_kobj)
>>                  return;
>>
>> -       i915_perf_unregister_sysfs_hsw(dev_priv);
>> +       if (IS_HASWELL(dev_priv))
>> +               i915_perf_unregister_sysfs_hsw(dev_priv);
>> +       else if (IS_BROADWELL(dev_priv))
>> +               i915_perf_unregister_sysfs_bdw(dev_priv);
>> +       else if (IS_CHERRYVIEW(dev_priv))
>> +               i915_perf_unregister_sysfs_chv(dev_priv);
>> +       else if (IS_SKYLAKE(dev_priv)) {
>> +               if (IS_SKL_GT2(dev_priv))
>> +                       i915_perf_unregister_sysfs_sklgt2(dev_priv);
>> +               else if (IS_SKL_GT3(dev_priv))
>> +                       i915_perf_unregister_sysfs_sklgt3(dev_priv);
>> +               else if (IS_SKL_GT4(dev_priv))
>> +                       i915_perf_unregister_sysfs_sklgt4(dev_priv);
>> +       } else if (IS_BROXTON(dev_priv))
>> +               i915_perf_unregister_sysfs_bxt(dev_priv);
>>
>>          kobject_put(dev_priv->perf.metrics_kobj);
>>          dev_priv->perf.metrics_kobj = NULL;
>> @@ -2172,36 +2917,105 @@ static struct ctl_table dev_root[] = {
>>    */
>>   void i915_perf_init(struct drm_i915_private *dev_priv)
>>   {
>> -       if (!IS_HASWELL(dev_priv))
>> -               return;
>> -
>> -       hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
>> -                    CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>> -       dev_priv->perf.oa.poll_check_timer.function = oa_poll_check_timer_cb;
>> -       init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
>> +       dev_priv->perf.oa.n_builtin_sets = 0;
>> +
>> +       if (IS_HASWELL(dev_priv)) {
>> +               dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
>> +               dev_priv->perf.oa.ops.enable_metric_set = hsw_enable_metric_set;
>> +               dev_priv->perf.oa.ops.disable_metric_set = hsw_disable_metric_set;
>> +               dev_priv->perf.oa.ops.oa_enable = gen7_oa_enable;
>> +               dev_priv->perf.oa.ops.oa_disable = gen7_oa_disable;
>> +               dev_priv->perf.oa.ops.read = gen7_oa_read;
>> +               dev_priv->perf.oa.ops.oa_hw_tail_read =
>> +                       gen7_oa_hw_tail_read;
>> +
>> +               dev_priv->perf.oa.oa_formats = hsw_oa_formats;
>> +
>> +               dev_priv->perf.oa.n_builtin_sets =
>> +                       i915_oa_n_builtin_metric_sets_hsw;
>> +       } else if (i915.enable_execlists) {
>> +               /* Note that although we could theoretically also support the
>> +                * legacy ringbuffer mode on BDW (and earlier iterations of
>> +                * this driver, before upstreaming, did this) it didn't seem
>> +                * worth the complexity to maintain now that BDW+ enable
>> +                * execlist mode by default.
>> +                */
>>
>> -       INIT_LIST_HEAD(&dev_priv->perf.streams);
>> -       mutex_init(&dev_priv->perf.lock);
>> -       spin_lock_init(&dev_priv->perf.hook_lock);
>> -       spin_lock_init(&dev_priv->perf.oa.oa_buffer.ptr_lock);
>> +               if (IS_GEN8(dev_priv)) {
>> +                       dev_priv->perf.oa.ctx_oactxctrl_offset = 0x120;
>> +                       dev_priv->perf.oa.ctx_flexeu0_offset = 0x2ce;
>> +                       dev_priv->perf.oa.gen8_valid_ctx_bit = (1<<25);
>> +
>> +                       if (IS_BROADWELL(dev_priv)) {
>> +                               dev_priv->perf.oa.n_builtin_sets =
>> +                                       i915_oa_n_builtin_metric_sets_bdw;
>> +                               dev_priv->perf.oa.ops.select_metric_set =
>> +                                       i915_oa_select_metric_set_bdw;
>> +                       } else if (IS_CHERRYVIEW(dev_priv)) {
>> +                               dev_priv->perf.oa.n_builtin_sets =
>> +                                       i915_oa_n_builtin_metric_sets_chv;
>> +                               dev_priv->perf.oa.ops.select_metric_set =
>> +                                       i915_oa_select_metric_set_chv;
>> +                       }
>> +               } else if (IS_GEN9(dev_priv)) {
>> +                       dev_priv->perf.oa.ctx_oactxctrl_offset = 0x128;
>> +                       dev_priv->perf.oa.ctx_flexeu0_offset = 0x3de;
>> +                       dev_priv->perf.oa.gen8_valid_ctx_bit = (1<<16);
>> +
>> +                       if (IS_SKL_GT2(dev_priv)) {
>> +                               dev_priv->perf.oa.n_builtin_sets =
>> +                                       i915_oa_n_builtin_metric_sets_sklgt2;
>> +                               dev_priv->perf.oa.ops.select_metric_set =
>> +                                       i915_oa_select_metric_set_sklgt2;
>> +                       } else if (IS_SKL_GT3(dev_priv)) {
>> +                               dev_priv->perf.oa.n_builtin_sets =
>> +                                       i915_oa_n_builtin_metric_sets_sklgt3;
>> +                               dev_priv->perf.oa.ops.select_metric_set =
>> +                                       i915_oa_select_metric_set_sklgt3;
>> +                       } else if (IS_SKL_GT4(dev_priv)) {
>> +                               dev_priv->perf.oa.n_builtin_sets =
>> +                                       i915_oa_n_builtin_metric_sets_sklgt4;
>> +                               dev_priv->perf.oa.ops.select_metric_set =
>> +                                       i915_oa_select_metric_set_sklgt4;
>> +                       } else if (IS_BROXTON(dev_priv)) {
>> +                               dev_priv->perf.oa.n_builtin_sets =
>> +                                       i915_oa_n_builtin_metric_sets_bxt;
>> +                               dev_priv->perf.oa.ops.select_metric_set =
>> +                                       i915_oa_select_metric_set_bxt;
>> +                       }
>> +               }
>>
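
A side note on gen8_valid_ctx_bit, since it's easy to miss why it
matters: it's the flag that tells us whether the context ID tagged in
an OA report can be trusted when filtering reports on the CPU. A
minimal sketch of the kind of check involved (hypothetical helper
name, assuming the gen8+ report layout with the flag in dword 0 and
the context ID in dword 2):

    static bool oa_report_ctx_id_valid(struct drm_i915_private *dev_priv,
                                       const u32 *report)
    {
            /* dword 0 is the report header; only when this bit is set
             * does dword 2 carry a meaningful context ID.
             */
            return report[0] & dev_priv->perf.oa.gen8_valid_ctx_bit;
    }
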
>> -       dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
>> -       dev_priv->perf.oa.ops.enable_metric_set = hsw_enable_metric_set;
>> -       dev_priv->perf.oa.ops.disable_metric_set = hsw_disable_metric_set;
>> -       dev_priv->perf.oa.ops.oa_enable = gen7_oa_enable;
>> -       dev_priv->perf.oa.ops.oa_disable = gen7_oa_disable;
>> -       dev_priv->perf.oa.ops.read = gen7_oa_read;
>> -       dev_priv->perf.oa.ops.oa_buffer_check =
>> -               gen7_oa_buffer_check_unlocked;
>> +               if (dev_priv->perf.oa.n_builtin_sets) {
>> +                       dev_priv->perf.oa.ops.init_oa_buffer = gen8_init_oa_buffer;
>> +                       dev_priv->perf.oa.ops.enable_metric_set =
>> +                               gen8_enable_metric_set;
>> +                       dev_priv->perf.oa.ops.disable_metric_set =
>> +                               gen8_disable_metric_set;
>> +                       dev_priv->perf.oa.ops.oa_enable = gen8_oa_enable;
>> +                       dev_priv->perf.oa.ops.oa_disable = gen8_oa_disable;
>> +                       dev_priv->perf.oa.ops.read = gen8_oa_read;
>> +                       dev_priv->perf.oa.ops.oa_hw_tail_read =
>> +                               gen8_oa_hw_tail_read;
>> +
>> +                       dev_priv->perf.oa.oa_formats = gen8_plus_oa_formats;
>> +               }
>> +       }
>>
>> -       dev_priv->perf.oa.oa_formats = hsw_oa_formats;
>> +       if (dev_priv->perf.oa.n_builtin_sets) {
>> +               hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
>> +                               CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>> +               dev_priv->perf.oa.poll_check_timer.function = oa_poll_check_timer_cb;
>> +               init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
>>
>> -       dev_priv->perf.oa.n_builtin_sets =
>> -               i915_oa_n_builtin_metric_sets_hsw;
>> +               INIT_LIST_HEAD(&dev_priv->perf.streams);
>> +               mutex_init(&dev_priv->perf.lock);
>> +               spin_lock_init(&dev_priv->perf.hook_lock);
>> +               spin_lock_init(&dev_priv->perf.oa.oa_buffer.ptr_lock);
>>
>> -       dev_priv->perf.sysctl_header = register_sysctl_table(dev_root);
>> +               dev_priv->perf.sysctl_header = register_sysctl_table(dev_root);
>>
>> -       dev_priv->perf.initialized = true;
>> +               dev_priv->perf.initialized = true;
>> +       }
>>   }
>>
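
Worth spelling out how the vtable above gets used: stream code never
branches on the platform again, it just calls through the hooks that
i915_perf_init() installed. Roughly (illustrative, not a verbatim
excerpt from the patch):

    static void i915_oa_stream_enable(struct i915_perf_stream *stream)
    {
            struct drm_i915_private *dev_priv = stream->dev_priv;

            /* Dispatches to gen7_oa_enable or gen8_oa_enable depending
             * on which ops i915_perf_init() selected.
             */
            dev_priv->perf.oa.ops.oa_enable(dev_priv);
    }
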
>>   /**
>> @@ -2216,5 +3030,6 @@ void i915_perf_fini(struct drm_i915_private *dev_priv)
>>          unregister_sysctl_table(dev_priv->perf.sysctl_header);
>>
>>          memset(&dev_priv->perf.oa.ops, 0, sizeof(dev_priv->perf.oa.ops));
>> +
>>          dev_priv->perf.initialized = false;
>>   }
>> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
>> index 4c72adae368b..c0bbba6d5b78 100644
>> --- a/drivers/gpu/drm/i915/i915_reg.h
>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>> @@ -653,6 +653,12 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>>
>>   #define GEN8_OACTXID _MMIO(0x2364)
>>
>> +#define GEN8_OA_DEBUG _MMIO(0x2B04)
>> +#define  GEN9_OA_DEBUG_INCLUDE_CLK_RATIO            (1<<6)
>> +#define  GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS    (1<<5)
>> +#define  GEN9_OA_DEBUG_DISABLE_GO_1_0_REPORTS       (1<<2)
>> +#define  GEN9_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS   (1<<1)
>> +
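
For context, these debug bits land in a masked register, so enabling
them looks something like the following (a sketch, assuming
GEN8_OA_DEBUG follows the usual masked-register convention):

    I915_WRITE(GEN8_OA_DEBUG,
               _MASKED_BIT_ENABLE(GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
                                  GEN9_OA_DEBUG_INCLUDE_CLK_RATIO));
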
>>   #define GEN8_OACONTROL _MMIO(0x2B00)
>>   #define  GEN8_OA_REPORT_FORMAT_A12         (0<<2)
>>   #define  GEN8_OA_REPORT_FORMAT_A12_B8_C8    (2<<2)
>> @@ -674,6 +680,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>>   #define  GEN7_OABUFFER_STOP_RESUME_ENABLE   (1<<1)
>>   #define  GEN7_OABUFFER_RESUME              (1<<0)
>>
>> +#define GEN8_OABUFFER_UDW _MMIO(0x23b4)
>>   #define GEN8_OABUFFER _MMIO(0x2b14)
>>
>>   #define GEN7_OASTATUS1 _MMIO(0x2364)
>> @@ -692,7 +699,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>>   #define  GEN8_OASTATUS_REPORT_LOST         (1<<0)
>>
>>   #define GEN8_OAHEADPTR _MMIO(0x2B0C)
>> +#define GEN8_OAHEADPTR_MASK    0xffffffc0
>>   #define GEN8_OATAILPTR _MMIO(0x2B10)
>> +#define GEN8_OATAILPTR_MASK    0xffffffc0
>>
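
Both masks clear the low 6 bits, i.e. the head/tail pointers are
64 byte aligned GTT addresses; sampling them would look like
(illustrative):

    u32 head = I915_READ(GEN8_OAHEADPTR) & GEN8_OAHEADPTR_MASK;
    u32 tail = I915_READ(GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK;
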
>>   #define OABUFFER_SIZE_128K  (0<<3)
>>   #define OABUFFER_SIZE_256K  (1<<3)
>> @@ -705,7 +714,17 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>>
>>   #define OA_MEM_SELECT_GGTT  (1<<0)
>>
>> +/*
>> + * Flexible, Aggregate EU Counter Registers.
>> + * Note: these aren't contiguous
>> + */
>>   #define EU_PERF_CNTL0      _MMIO(0xe458)
>> +#define EU_PERF_CNTL1      _MMIO(0xe558)
>> +#define EU_PERF_CNTL2      _MMIO(0xe658)
>> +#define EU_PERF_CNTL3      _MMIO(0xe758)
>> +#define EU_PERF_CNTL4      _MMIO(0xe45c)
>> +#define EU_PERF_CNTL5      _MMIO(0xe55c)
>> +#define EU_PERF_CNTL6      _MMIO(0xe65c)
>>
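
Given they aren't contiguous, a small lookup table is the natural way
to address the flex EU registers by index (hypothetical table name):

    static const i915_reg_t flex_eu_regs[] = {
            EU_PERF_CNTL0, EU_PERF_CNTL1, EU_PERF_CNTL2, EU_PERF_CNTL3,
            EU_PERF_CNTL4, EU_PERF_CNTL5, EU_PERF_CNTL6,
    };
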
>>   #define GDT_CHICKEN_BITS    _MMIO(0x9840)
>>   #define GT_NOA_ENABLE      0x00000080
>> @@ -2325,6 +2344,9 @@ enum skl_disp_power_wells {
>>   #define   GEN8_RC_SEMA_IDLE_MSG_DISABLE        (1 << 12)
>>   #define   GEN8_FF_DOP_CLOCK_GATE_DISABLE       (1<<10)
>>
>> +#define GEN6_RCS_PWR_FSM _MMIO(0x22ac)
>> +#define GEN9_RCS_FE_FSM2 _MMIO(0x22a4)
>> +
>>   /* Fuse readout registers for GT */
>>   #define CHV_FUSE_GT                    _MMIO(VLV_DISPLAY_BASE + 0x2168)
>>   #define   CHV_FGT_DISABLE_SS0          (1 << 10)
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index 7df278fe492e..351e231a02f4 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -1879,6 +1879,8 @@ static void execlists_init_reg_state(u32 *regs,
>>                  regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
>>                  CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
>>                          make_rpcs(dev_priv));
>> +
>> +               i915_oa_init_reg_state(engine, ctx, regs);
>>          }
>>   }
>>
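
Since i915_oa_init_reg_state() sits in the same branch as the
make_rpcs() write, it only runs for the render engine, which is where
the OA unit lives. What it writes is essentially the per-context OA
state that ctx_oactxctrl_offset (set up in i915_perf_init() above)
points at; a simplified sketch, assuming the GEN8_OACTXCONTROL
definitions from this series:

    u32 ctrl = dev_priv->perf.oa.ctx_oactxctrl_offset;

    /* Register offset followed by the value loaded on context restore */
    regs[ctrl] = i915_mmio_reg_offset(GEN8_OACTXCONTROL);
    regs[ctrl + 1] = (dev_priv->perf.oa.period_exponent <<
                      GEN8_OA_TIMER_PERIOD_SHIFT) |
                     (dev_priv->perf.oa.periodic ?
                      GEN8_OA_TIMER_ENABLE : 0) |
                     GEN8_OA_COUNTER_RESUME;
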
>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>> index 464547d08173..15bc9f78ba4d 100644
>> --- a/include/uapi/drm/i915_drm.h
>> +++ b/include/uapi/drm/i915_drm.h
>> @@ -1316,13 +1316,18 @@ struct drm_i915_gem_context_param {
>>   };
>>
>>   enum drm_i915_oa_format {
>> -       I915_OA_FORMAT_A13 = 1,
>> -       I915_OA_FORMAT_A29,
>> -       I915_OA_FORMAT_A13_B8_C8,
>> -       I915_OA_FORMAT_B4_C8,
>> -       I915_OA_FORMAT_A45_B8_C8,
>> -       I915_OA_FORMAT_B4_C8_A16,
>> -       I915_OA_FORMAT_C4_B8,
>> +       I915_OA_FORMAT_A13 = 1,     /* HSW only */
>> +       I915_OA_FORMAT_A29,         /* HSW only */
>> +       I915_OA_FORMAT_A13_B8_C8,   /* HSW only */
>> +       I915_OA_FORMAT_B4_C8,       /* HSW only */
>> +       I915_OA_FORMAT_A45_B8_C8,   /* HSW only */
>> +       I915_OA_FORMAT_B4_C8_A16,   /* HSW only */
>> +       I915_OA_FORMAT_C4_B8,       /* HSW+ */
>> +
>> +       /* Gen8+ */
>> +       I915_OA_FORMAT_A12,
>> +       I915_OA_FORMAT_A12_B8_C8,
>> +       I915_OA_FORMAT_A32u40_A4u32_B8_C8,
>>
>>          I915_OA_FORMAT_MAX          /* non-ABI */
>>   };
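
For completeness, userspace selects one of the new formats through the
existing DRM_I915_PERF_PROP_OA_FORMAT property when opening a stream;
roughly, using libdrm's drmIoctl() (illustrative, error handling
omitted; metrics_set_id would come from the sysfs metrics directory):

    uint64_t properties[] = {
            DRM_I915_PERF_PROP_SAMPLE_OA, 1,
            DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
            DRM_I915_PERF_PROP_OA_FORMAT, I915_OA_FORMAT_A32u40_A4u32_B8_C8,
            DRM_I915_PERF_PROP_OA_EXPONENT, 16,
    };
    struct drm_i915_perf_open_param param = {
            .flags = I915_PERF_FLAG_FD_CLOEXEC,
            /* num_properties counts (id, value) pairs */
            .num_properties = sizeof(properties) / (2 * sizeof(uint64_t)),
            .properties_ptr = (uintptr_t)properties,
    };
    int stream_fd = drmIoctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
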
>> --
>> 2.11.0
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/intel-gfx



