[Intel-gfx] [PATCH 03/12] drm/i915: Framework for capturing command stream based OA reports and ctx id info.

sourab gupta sourabgupta at gmail.com
Mon Jul 31 11:38:04 UTC 2017


On Mon, Jul 31, 2017 at 3:13 PM, Lionel Landwerlin <
lionel.g.landwerlin at intel.com> wrote:

> On 31/07/17 08:59, Sagar Arun Kamble wrote:
>
>> From: Sourab Gupta <sourab.gupta at intel.com>
>>
>> This patch introduces a framework to capture OA counter reports associated
>> with the Render command stream. We can then associate the reports captured
>> through this mechanism with their corresponding context IDs. This can be
>> further extended to associate any other metadata information with the
>> corresponding samples (since the association with the Render command stream
>> gives us the ability to capture this information while inserting the
>> corresponding capture commands into the command stream).
>>
>> The OA reports generated in this way are associated with a corresponding
>> workload, and thus can be used to delimit the workload (i.e. sample the
>> counters at the workload boundaries), within an ongoing stream of periodic
>> counter snapshots.
>>
>> There may be use cases wherein we need more than the periodic OA capture
>> mode which is supported currently. The command stream based mode is
>> primarily used for two use cases:
>>      - Ability to capture system wide metrics, along with the ability to
>>        map the reports back to individual contexts (particularly for HSW).
>>      - Ability to inject tags for work, into the reports. This provides
>>        visibility into the multiple stages of work within a single
>>        context.
>>
>> The userspace will be able to distinguish between the periodic and CS
>> based OA reports by virtue of the source_info sample field.
>>
>> The command MI_REPORT_PERF_COUNT can be used to capture snapshots of OA
>> counters, and is inserted at BB boundaries.
>> The data thus captured will be stored in a separate buffer, which will
>> be different from the buffer used otherwise for periodic OA capture mode.
>> The metadata information pertaining to each snapshot is maintained in a
>> list, which also has offsets into the gem buffer object per captured
>> snapshot. In order to track whether the gpu has completed processing the
>> node, a field pertaining to the corresponding gem request is added, which
>> is tracked for completion of the command.
>>
>> Both periodic and CS based reports are associated with a single stream
>> (corresponding to render engine), and the samples are expected to be
>> in sequential order according to their timestamps. Now, since these
>> reports are collected in separate buffers, these are merge sorted at the
>> time of forwarding to userspace during the read call.
>>
>> v2: Aligning with the non-perf interface (custom drm ioctl based). Also,
>> a few related patches are squashed together for better readability.
>>
>> v3: Updated perf sample capture emit hook name. Reserving space upfront
>> in the ring for emitting sample capture commands and using
>> req->fence.seqno for tracking samples. Added SRCU protection for streams.
>> Changed the stream last_request tracking to resv object. (Chris)
>> Updated perf.sample_lock spin_lock usage to avoid softlockups. Moved
>> stream to global per-engine structure. (Sagar)
>> Update unpin and put in the free routines to i915_vma_unpin_and_release.
>> Making use of perf stream cs_buffer vma resv instead of separate resv obj.
>> Pruned perf stream vma resv during gem_idle. (Chris)
>> Changed payload field ctx_id to u64 to keep all sample data aligned at 8
>> bytes. (Lionel)
>> A stall/flush prior to sample capture is not added. Do we need to give the
>> user control over whether to stall/flush at each sample?
>>
>> Signed-off-by: Sourab Gupta <sourab.gupta at intel.com>
>> Signed-off-by: Robert Bragg <robert at sixbynine.org>
>> Signed-off-by: Sagar Arun Kamble <sagar.a.kamble at intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h            |  101 ++-
>>   drivers/gpu/drm/i915/i915_gem.c            |    1 +
>>   drivers/gpu/drm/i915/i915_gem_execbuffer.c |    8 +
>>   drivers/gpu/drm/i915/i915_perf.c           | 1185
>> ++++++++++++++++++++++------
>>   drivers/gpu/drm/i915/intel_engine_cs.c     |    4 +
>>   drivers/gpu/drm/i915/intel_ringbuffer.c    |    2 +
>>   drivers/gpu/drm/i915/intel_ringbuffer.h    |    5 +
>>   include/uapi/drm/i915_drm.h                |   15 +
>>   8 files changed, 1073 insertions(+), 248 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h
>> b/drivers/gpu/drm/i915/i915_drv.h
>> index 2c7456f..8b1cecf 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -1985,6 +1985,24 @@ struct i915_perf_stream_ops {
>>          * The stream will always be disabled before this is called.
>>          */
>>         void (*destroy)(struct i915_perf_stream *stream);
>> +
>> +       /*
>> +        * @emit_sample_capture: Emit the commands in the command streamer
>> +        * for a particular gpu engine.
>> +        *
>> +        * The commands are inserted to capture the perf sample data at
>> +        * specific points during workload execution, such as before and
>> after
>> +        * the batch buffer.
>> +        */
>> +       void (*emit_sample_capture)(struct i915_perf_stream *stream,
>> +                                   struct drm_i915_gem_request *request,
>> +                                   bool preallocate);
>> +};
>> +
>> +enum i915_perf_stream_state {
>> +       I915_PERF_STREAM_DISABLED,
>> +       I915_PERF_STREAM_ENABLE_IN_PROGRESS,
>> +       I915_PERF_STREAM_ENABLED,
>>   };
>>     /**
>> @@ -1997,9 +2015,9 @@ struct i915_perf_stream {
>>         struct drm_i915_private *dev_priv;
>>         /**
>> -        * @link: Links the stream into ``&drm_i915_private->streams``
>> +        * @engine: Engine to which this stream corresponds.
>>          */
>> -       struct list_head link;
>> +       struct intel_engine_cs *engine;
>>         /**
>>          * @sample_flags: Flags representing the
>> `DRM_I915_PERF_PROP_SAMPLE_*`
>> @@ -2022,17 +2040,41 @@ struct i915_perf_stream {
>>         struct i915_gem_context *ctx;
>>         /**
>> -        * @enabled: Whether the stream is currently enabled, considering
>> -        * whether the stream was opened in a disabled state and based
>> -        * on `I915_PERF_IOCTL_ENABLE` and `I915_PERF_IOCTL_DISABLE`
>> calls.
>> +        * @state: Current stream state, which can be either disabled,
>> enabled,
>> +        * or enable_in_progress, while considering whether the stream was
>> +        * opened in a disabled state and based on
>> `I915_PERF_IOCTL_ENABLE` and
>> +        * `I915_PERF_IOCTL_DISABLE` calls.
>>          */
>> -       bool enabled;
>> +       enum i915_perf_stream_state state;
>> +
>> +       /**
>> +        * @cs_mode: Whether command stream based perf sample collection
>> is
>> +        * enabled for this stream
>> +        */
>> +       bool cs_mode;
>> +
>> +       /**
>> +        * @using_oa: Whether OA unit is in use for this particular stream
>> +        */
>> +       bool using_oa;
>>         /**
>>          * @ops: The callbacks providing the implementation of this
>> specific
>>          * type of configured stream.
>>          */
>>         const struct i915_perf_stream_ops *ops;
>> +
>> +       /* Command stream based perf data buffer */
>> +       struct {
>> +               struct i915_vma *vma;
>> +               u8 *vaddr;
>> +       } cs_buffer;
>> +
>> +       struct list_head cs_samples;
>> +       spinlock_t cs_samples_lock;
>> +
>> +       wait_queue_head_t poll_wq;
>> +       bool pollin;
>>   };
>>     /**
>> @@ -2095,7 +2137,8 @@ struct i915_oa_ops {
>>         int (*read)(struct i915_perf_stream *stream,
>>                     char __user *buf,
>>                     size_t count,
>> -                   size_t *offset);
>> +                   size_t *offset,
>> +                   u32 ts);
>>         /**
>>          * @oa_hw_tail_read: read the OA tail pointer register
>> @@ -2107,6 +2150,36 @@ struct i915_oa_ops {
>>         u32 (*oa_hw_tail_read)(struct drm_i915_private *dev_priv);
>>   };
>>   +/*
>> + * i915_perf_cs_sample - Sample element to hold info about a single perf
>> + * sample data associated with a particular GPU command stream.
>> + */
>> +struct i915_perf_cs_sample {
>> +       /**
>> +        * @link: Links the sample into ``&stream->cs_samples``
>> +        */
>> +       struct list_head link;
>> +
>> +       /**
>> +        * @request: GEM request associated with the sample. The commands
>> to
>> +        * capture the perf metrics are inserted into the command
>> streamer in
>> +        * context of this request.
>> +        */
>> +       struct drm_i915_gem_request *request;
>> +
>> +       /**
>> +        * @offset: Offset into ``&stream->cs_buffer``
>> +        * where the perf metrics will be collected, when the commands
>> inserted
>> +        * into the command stream are executed by GPU.
>> +        */
>> +       u32 offset;
>> +
>> +       /**
>> +        * @ctx_id: Context ID associated with this perf sample
>> +        */
>> +       u32 ctx_id;
>> +};
>> +
>>   struct intel_cdclk_state {
>>         unsigned int cdclk, vco, ref;
>>   };
>> @@ -2431,17 +2504,10 @@ struct drm_i915_private {
>>                 struct ctl_table_header *sysctl_header;
>>                 struct mutex lock;
>> -               struct list_head streams;
>> -
>> -               struct {
>> -                       struct i915_perf_stream *exclusive_stream;
>>   -                     u32 specific_ctx_id;
>> -
>> -                       struct hrtimer poll_check_timer;
>> -                       wait_queue_head_t poll_wq;
>> -                       bool pollin;
>> +               struct hrtimer poll_check_timer;
>>   +             struct {
>>                         /**
>>                          * For rate limiting any notifications of spurious
>>                          * invalid OA reports
>> @@ -3636,6 +3702,8 @@ int i915_perf_open_ioctl(struct drm_device *dev,
>> void *data,
>>   void i915_oa_init_reg_state(struct intel_engine_cs *engine,
>>                             struct i915_gem_context *ctx,
>>                             uint32_t *reg_state);
>> +void i915_perf_emit_sample_capture(struct drm_i915_gem_request *req,
>> +                                  bool preallocate);
>>     /* i915_gem_evict.c */
>>   int __must_check i915_gem_evict_something(struct i915_address_space
>> *vm,
>> @@ -3795,6 +3863,7 @@ int intel_engine_cmd_parser(struct intel_engine_cs
>> *engine,
>>   /* i915_perf.c */
>>   extern void i915_perf_init(struct drm_i915_private *dev_priv);
>>   extern void i915_perf_fini(struct drm_i915_private *dev_priv);
>> +extern void i915_perf_streams_mark_idle(struct drm_i915_private
>> *dev_priv);
>>   extern void i915_perf_register(struct drm_i915_private *dev_priv);
>>   extern void i915_perf_unregister(struct drm_i915_private *dev_priv);
>>   diff --git a/drivers/gpu/drm/i915/i915_gem.c
>> b/drivers/gpu/drm/i915/i915_gem.c
>> index 000a764..7b01548 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -3220,6 +3220,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private
>> *i915)
>>         intel_engines_mark_idle(dev_priv);
>>         i915_gem_timelines_mark_idle(dev_priv);
>> +       i915_perf_streams_mark_idle(dev_priv);
>>         GEM_BUG_ON(!dev_priv->gt.awake);
>>         dev_priv->gt.awake = false;
>> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> index 5fa4476..bfe546b 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> @@ -1194,12 +1194,16 @@ static int __reloc_gpu_alloc(struct
>> i915_execbuffer *eb,
>>         if (err)
>>                 goto err_request;
>>   +     i915_perf_emit_sample_capture(rq, true);
>> +
>>         err = eb->engine->emit_bb_start(rq,
>>                                         batch->node.start, PAGE_SIZE,
>>                                         cache->gen > 5 ? 0 :
>> I915_DISPATCH_SECURE);
>>         if (err)
>>                 goto err_request;
>>   +     i915_perf_emit_sample_capture(rq, false);
>> +
>>         GEM_BUG_ON(!reservation_object_test_signaled_rcu(batch->resv,
>> true));
>>         i915_vma_move_to_active(batch, rq, 0);
>>         reservation_object_lock(batch->resv, NULL);
>> @@ -2029,6 +2033,8 @@ static int eb_submit(struct i915_execbuffer *eb)
>>                         return err;
>>         }
>>   +     i915_perf_emit_sample_capture(eb->request, true);
>> +
>>         err = eb->engine->emit_bb_start(eb->request,
>>                                         eb->batch->node.start +
>>                                         eb->batch_start_offset,
>> @@ -2037,6 +2043,8 @@ static int eb_submit(struct i915_execbuffer *eb)
>>         if (err)
>>                 return err;
>>   +     i915_perf_emit_sample_capture(eb->request, false);
>> +
>>         return 0;
>>   }
>>   diff --git a/drivers/gpu/drm/i915/i915_perf.c
>> b/drivers/gpu/drm/i915/i915_perf.c
>> index b272653..57e1936 100644
>> --- a/drivers/gpu/drm/i915/i915_perf.c
>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> @@ -193,6 +193,7 @@
>>     #include <linux/anon_inodes.h>
>>   #include <linux/sizes.h>
>> +#include <linux/srcu.h>
>>     #include "i915_drv.h"
>>   #include "i915_oa_hsw.h"
>> @@ -288,6 +289,12 @@
>>   #define OAREPORT_REASON_CTX_SWITCH     (1<<3)
>>   #define OAREPORT_REASON_CLK_RATIO      (1<<5)
>>   +/* Data common to periodic and RCS based OA samples */
>> +struct i915_perf_sample_data {
>> +       u64 source;
>> +       u64 ctx_id;
>> +       const u8 *report;
>> +};
>>     /* For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate
>>    *
>> @@ -328,8 +335,19 @@
>>         [I915_OA_FORMAT_C4_B8]              = { 7, 64 },
>>   };
>>   +/* Duplicated from similar static enum in i915_gem_execbuffer.c */
>> +#define I915_USER_RINGS (4)
>> +static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = {
>> +       [I915_EXEC_DEFAULT]     = RCS,
>> +       [I915_EXEC_RENDER]      = RCS,
>> +       [I915_EXEC_BLT]         = BCS,
>> +       [I915_EXEC_BSD]         = VCS,
>> +       [I915_EXEC_VEBOX]       = VECS
>> +};
>> +
>>   #define SAMPLE_OA_REPORT      (1<<0)
>>   #define SAMPLE_OA_SOURCE      (1<<1)
>> +#define SAMPLE_CTX_ID        (1<<2)
>>     /**
>>    * struct perf_open_properties - for validated properties given to open
>> a stream
>> @@ -340,6 +358,9 @@
>>    * @oa_format: An OA unit HW report format
>>    * @oa_periodic: Whether to enable periodic OA unit sampling
>>    * @oa_period_exponent: The OA unit sampling period is derived from this
>> + * @cs_mode: Whether the stream is configured to enable collection of
>> metrics
>> + * associated with command stream of a particular GPU engine
>> + * @engine: The GPU engine associated with the stream in case cs_mode is
>> enabled
>>    *
>>    * As read_properties_unlocked() enumerates and validates the
>> properties given
>>    * to open a stream of metrics the configuration is built up in the
>> structure
>> @@ -356,6 +377,10 @@ struct perf_open_properties {
>>         int oa_format;
>>         bool oa_periodic;
>>         int oa_period_exponent;
>> +
>> +       /* Command stream mode */
>> +       bool cs_mode;
>> +       enum intel_engine_id engine;
>>   };
>>     static u32 gen8_oa_hw_tail_read(struct drm_i915_private *dev_priv)
>> @@ -371,6 +396,266 @@ static u32 gen7_oa_hw_tail_read(struct
>> drm_i915_private *dev_priv)
>>   }
>>     /**
>> + * i915_perf_emit_sample_capture - Insert the commands to capture
>> metrics into
>> + * the command stream of a GPU engine.
>> + * @request: request in whose context the metrics are being collected.
>> + * @preallocate: allocate space in ring for related sample.
>> + *
>> + * The function provides a hook through which the commands to capture
>> perf
>> + * metrics, are inserted into the command stream of a GPU engine.
>> + */
>> +void i915_perf_emit_sample_capture(struct drm_i915_gem_request *request,
>> +                                  bool preallocate)
>> +{
>> +       struct intel_engine_cs *engine = request->engine;
>> +       struct drm_i915_private *dev_priv = engine->i915;
>> +       struct i915_perf_stream *stream;
>> +       int idx;
>> +
>> +       if (!dev_priv->perf.initialized)
>> +               return;
>> +
>> +       idx = srcu_read_lock(&engine->perf_srcu);
>> +       stream = srcu_dereference(engine->exclusive_stream,
>> &engine->perf_srcu);
>> +       if (stream && (stream->state == I915_PERF_STREAM_ENABLED) &&
>> +                               stream->cs_mode)
>> +               stream->ops->emit_sample_capture(stream, request,
>> +                                                preallocate);
>> +       srcu_read_unlock(&engine->perf_srcu, idx);
>> +}
>> +
>> +/**
>> + * release_perf_samples - Release old perf samples to make space for new
>> + * sample data.
>> + * @stream: Stream from which space is to be freed up.
>> + * @target_size: Space required to be freed up.
>> + *
>> + * We also drop our reference on the associated request before deleting the sample.
>> + * Also, no need to check whether the commands associated with old
>> samples
>> + * have been completed. This is because these sample entries are anyways
>> going
>> + * to be replaced by a new sample, and gpu will eventually overwrite the
>> buffer
>> + * contents, when the request associated with new sample completes.
>> + */
>> +static void release_perf_samples(struct i915_perf_stream *stream,
>> +                                u32 target_size)
>> +{
>> +       struct drm_i915_private *dev_priv = stream->dev_priv;
>> +       struct i915_perf_cs_sample *sample, *next;
>> +       u32 sample_size = dev_priv->perf.oa.oa_buffer.format_size;
>> +       u32 size = 0;
>> +
>> +       list_for_each_entry_safe
>> +               (sample, next, &stream->cs_samples, link) {
>> +               size += sample_size;
>> +               i915_gem_request_put(sample->request);
>> +               list_del(&sample->link);
>> +               kfree(sample);
>> +
>> +               if (size >= target_size)
>> +                       break;
>> +       }
>> +}
>> +
>> +/**
>> + * insert_perf_sample - Insert a perf sample entry to the sample list.
>> + * @stream: Stream into which sample is to be inserted.
>> + * @sample: perf CS sample to be inserted into the list
>> + *
>> + * This function never fails, since it always manages to insert the
>> sample.
>> + * If the space is exhausted in the buffer, it will remove the older
>> + * entries in order to make space.
>> + */
>> +static void insert_perf_sample(struct i915_perf_stream *stream,
>> +                               struct i915_perf_cs_sample *sample)
>> +{
>> +       struct drm_i915_private *dev_priv = stream->dev_priv;
>> +       struct i915_perf_cs_sample *first, *last;
>> +       int max_offset = stream->cs_buffer.vma->obj->base.size;
>> +       u32 sample_size = dev_priv->perf.oa.oa_buffer.format_size;
>> +       unsigned long flags;
>> +
>> +       spin_lock_irqsave(&stream->cs_samples_lock, flags);
>> +       if (list_empty(&stream->cs_samples)) {
>> +               sample->offset = 0;
>> +               list_add_tail(&sample->link, &stream->cs_samples);
>> +               spin_unlock_irqrestore(&stream->cs_samples_lock, flags);
>> +               return;
>> +       }
>> +
>> +       first = list_first_entry(&stream->cs_samples, typeof(*first),
>> +                               link);
>> +       last = list_last_entry(&stream->cs_samples, typeof(*last),
>> +                               link);
>> +
>> +       if (last->offset >= first->offset) {
>> +               /* Sufficient space available at the end of buffer? */
>> +               if (last->offset + 2*sample_size < max_offset)
>> +                       sample->offset = last->offset + sample_size;
>> +               /*
>> +                * Wraparound condition. Is sufficient space available at
>> +                * beginning of buffer?
>> +                */
>> +               else if (sample_size < first->offset)
>> +                       sample->offset = 0;
>> +               /* Insufficient space. Overwrite existing old entries */
>> +               else {
>> +                       u32 target_size = sample_size - first->offset;
>> +
>> +                       release_perf_samples(stream, target_size);
>> +                       sample->offset = 0;
>> +               }
>> +       } else {
>> +               /* Sufficient space available? */
>> +               if (last->offset + 2*sample_size < first->offset)
>> +                       sample->offset = last->offset + sample_size;
>> +               /* Insufficient space. Overwrite existing old entries */
>> +               else {
>> +                       u32 target_size = sample_size -
>> +                               (first->offset - last->offset -
>> +                               sample_size);
>> +
>> +                       release_perf_samples(stream, target_size);
>> +                       sample->offset = last->offset + sample_size;
>> +               }
>> +       }
>> +       list_add_tail(&sample->link, &stream->cs_samples);
>> +       spin_unlock_irqrestore(&stream->cs_samples_lock, flags);
>> +}
>> +
>> +/**
>> + * i915_emit_oa_report_capture - Insert the commands to capture OA
>> + * reports metrics into the render command stream
>> + * @request: request in whose context the metrics are being collected.
>> + * @preallocate: allocate space in ring for related sample.
>> + * @offset: command stream buffer offset where the OA metrics need to be
>> + * collected
>> + */
>> +static int i915_emit_oa_report_capture(
>> +                               struct drm_i915_gem_request *request,
>> +                               bool preallocate,
>> +                               u32 offset)
>> +{
>> +       struct drm_i915_private *dev_priv = request->i915;
>> +       struct intel_engine_cs *engine = request->engine;
>> +       struct i915_perf_stream *stream;
>> +       u32 addr = 0;
>> +       u32 cmd, len = 4, *cs;
>> +       int idx;
>> +
>> +       idx = srcu_read_lock(&engine->perf_srcu);
>> +       stream = srcu_dereference(engine->exclusive_stream,
>> &engine->perf_srcu);
>> +       addr = stream->cs_buffer.vma->node.start + offset;
>> +       srcu_read_unlock(&engine->perf_srcu, idx);
>> +
>> +       if (WARN_ON(addr & 0x3f)) {
>> +               DRM_ERROR("OA buffer address not aligned to 64 byte\n");
>> +               return -EINVAL;
>> +       }
>> +
>> +       if (preallocate)
>> +               request->reserved_space += len;
>> +       else
>> +               request->reserved_space -= len;
>> +
>> +       cs = intel_ring_begin(request, 4);
>> +       if (IS_ERR(cs))
>> +               return PTR_ERR(cs);
>> +
>> +       cmd = MI_REPORT_PERF_COUNT | (1<<0);
>> +       if (INTEL_GEN(dev_priv) >= 8)
>> +               cmd |= (2<<0);
>> +
>> +       *cs++ = cmd;
>> +       *cs++ = addr | MI_REPORT_PERF_COUNT_GGTT;
>> +       *cs++ = request->fence.seqno;
>> +
>> +       if (INTEL_GEN(dev_priv) >= 8)
>> +               *cs++ = 0;
>> +       else
>> +               *cs++ = MI_NOOP;
>> +
>> +       intel_ring_advance(request, cs);
>> +
>> +       return 0;
>> +}
>> +
>> +/**
>> + * i915_perf_stream_emit_sample_capture - Insert the commands to
>> capture perf
>> + * metrics into the GPU command stream
>> + * @stream: An i915-perf stream opened for GPU metrics
>> + * @request: request in whose context the metrics are being collected.
>> + * @preallocate: allocate space in ring for related sample.
>> + */
>> +static void i915_perf_stream_emit_sample_capture(
>> +                                       struct i915_perf_stream *stream,
>> +                                       struct drm_i915_gem_request
>> *request,
>> +                                       bool preallocate)
>> +{
>> +       struct reservation_object *resv = stream->cs_buffer.vma->resv;
>> +       struct i915_perf_cs_sample *sample;
>> +       unsigned long flags;
>> +       int ret;
>> +
>> +       sample = kzalloc(sizeof(*sample), GFP_KERNEL);
>> +       if (sample == NULL) {
>> +               DRM_ERROR("Perf sample alloc failed\n");
>> +               return;
>> +       }
>> +
>> +       sample->request = i915_gem_request_get(request);
>> +       sample->ctx_id = request->ctx->hw_id;
>> +
>> +       insert_perf_sample(stream, sample);
>> +
>> +       if (stream->sample_flags & SAMPLE_OA_REPORT) {
>> +               ret = i915_emit_oa_report_capture(request,
>> +                                                 preallocate,
>> +                                                 sample->offset);
>> +               if (ret)
>> +                       goto err_unref;
>> +       }
>> +
>> +       reservation_object_lock(resv, NULL);
>> +       if (reservation_object_reserve_shared(resv) == 0)
>> +               reservation_object_add_shared_fence(resv,
>> &request->fence);
>> +       reservation_object_unlock(resv);
>> +
>> +       i915_vma_move_to_active(stream->cs_buffer.vma, request,
>> +                                       EXEC_OBJECT_WRITE);
>> +       return;
>> +
>> +err_unref:
>> +       i915_gem_request_put(sample->request);
>> +       spin_lock_irqsave(&stream->cs_samples_lock, flags);
>> +       list_del(&sample->link);
>> +       spin_unlock_irqrestore(&stream->cs_samples_lock, flags);
>> +       kfree(sample);
>> +}
>> +
>> +/**
>> + * i915_perf_stream_release_samples - Release the perf command stream
>> samples
>> + * @stream: Stream from which samples are to be released.
>> + *
>> + * Note: The associated requests should be completed before releasing the
>> + * references here.
>> + */
>> +static void i915_perf_stream_release_samples(struct i915_perf_stream
>> *stream)
>> +{
>> +       struct i915_perf_cs_sample *entry, *next;
>> +       unsigned long flags;
>> +
>> +       list_for_each_entry_safe
>> +               (entry, next, &stream->cs_samples, link) {
>> +               i915_gem_request_put(entry->request);
>> +
>> +               spin_lock_irqsave(&stream->cs_samples_lock, flags);
>> +               list_del(&entry->link);
>> +               spin_unlock_irqrestore(&stream->cs_samples_lock, flags);
>> +               kfree(entry);
>> +       }
>> +}
>> +
>> +/**
>>    * oa_buffer_check_unlocked - check for data and update tail ptr state
>>    * @dev_priv: i915 device instance
>>    *
>> @@ -521,12 +806,13 @@ static int append_oa_status(struct i915_perf_stream
>> *stream,
>>   }
>>     /**
>> - * append_oa_sample - Copies single OA report into userspace read()
>> buffer.
>> - * @stream: An i915-perf stream opened for OA metrics
>> + * append_perf_sample - Copies single perf sample into userspace read()
>> buffer.
>> + * @stream: An i915-perf stream opened for perf samples
>>    * @buf: destination buffer given by userspace
>>    * @count: the number of bytes userspace wants to read
>>    * @offset: (inout): the current position for writing into @buf
>> - * @report: A single OA report to (optionally) include as part of the
>> sample
>> + * @data: perf sample data which contains (optionally) metrics configured
>> + * earlier when opening a stream
>>    *
>>    * The contents of a sample are configured through
>> `DRM_I915_PERF_PROP_SAMPLE_*`
>>    * properties when opening a stream, tracked as `stream->sample_flags`.
>> This
>> @@ -537,11 +823,11 @@ static int append_oa_status(struct i915_perf_stream
>> *stream,
>>    *
>>    * Returns: 0 on success, negative error code on failure.
>>    */
>> -static int append_oa_sample(struct i915_perf_stream *stream,
>> +static int append_perf_sample(struct i915_perf_stream *stream,
>>                             char __user *buf,
>>                             size_t count,
>>                             size_t *offset,
>> -                           const u8 *report)
>> +                           const struct i915_perf_sample_data *data)
>>   {
>>         struct drm_i915_private *dev_priv = stream->dev_priv;
>>         int report_size = dev_priv->perf.oa.oa_buffer.format_size;
>> @@ -569,16 +855,21 @@ static int append_oa_sample(struct i915_perf_stream
>> *stream,
>>          * transition. These are considered as source 'OABUFFER'.
>>          */
>>         if (sample_flags & SAMPLE_OA_SOURCE) {
>> -               u64 source = I915_PERF_SAMPLE_OA_SOURCE_OABUFFER;
>> +               if (copy_to_user(buf, &data->source, 8))
>> +                       return -EFAULT;
>> +               buf += 8;
>> +       }
>>   -             if (copy_to_user(buf, &source, 8))
>> +       if (sample_flags & SAMPLE_CTX_ID) {
>> +               if (copy_to_user(buf, &data->ctx_id, 8))
>>                         return -EFAULT;
>>                 buf += 8;
>>         }
>>         if (sample_flags & SAMPLE_OA_REPORT) {
>> -               if (copy_to_user(buf, report, report_size))
>> +               if (copy_to_user(buf, data->report, report_size))
>>                         return -EFAULT;
>> +               buf += report_size;
>>         }
>>         (*offset) += header.size;
>> @@ -587,11 +878,54 @@ static int append_oa_sample(struct i915_perf_stream
>> *stream,
>>   }
>>     /**
>> + * append_oa_buffer_sample - Copies single periodic OA report into
>> userspace
>> + * read() buffer.
>> + * @stream: An i915-perf stream opened for OA metrics
>> + * @buf: destination buffer given by userspace
>> + * @count: the number of bytes userspace wants to read
>> + * @offset: (inout): the current position for writing into @buf
>> + * @report: A single OA report to (optionally) include as part of the
>> sample
>> + *
>> + * Returns: 0 on success, negative error code on failure.
>> + */
>> +static int append_oa_buffer_sample(struct i915_perf_stream *stream,
>> +                               char __user *buf, size_t count,
>> +                               size_t *offset, const u8 *report)
>> +{
>> +       struct drm_i915_private *dev_priv = stream->dev_priv;
>> +       u32 sample_flags = stream->sample_flags;
>> +       struct i915_perf_sample_data data = { 0 };
>> +       u32 *report32 = (u32 *)report;
>> +
>> +       if (sample_flags & SAMPLE_OA_SOURCE)
>> +               data.source = I915_PERF_SAMPLE_OA_SOURCE_OABUFFER;
>> +
>> +       if (sample_flags & SAMPLE_CTX_ID) {
>> +               if (INTEL_INFO(dev_priv)->gen < 8)
>> +                       data.ctx_id = 0;
>> +               else {
>> +                       /*
>> +                        * XXX: Just keep the lower 21 bits for now since
>> I'm
>> +                        * not entirely sure if the HW touches any of the
>> higher
>> +                        * bits in this field
>> +                        */
>> +                       data.ctx_id = report32[2] & 0x1fffff;
>> +               }
>> +       }
>> +
>> +       if (sample_flags & SAMPLE_OA_REPORT)
>> +               data.report = report;
>> +
>> +       return append_perf_sample(stream, buf, count, offset, &data);
>> +}
>> +
>> +/**
>>    * Copies all buffered OA reports into userspace read() buffer.
>>    * @stream: An i915-perf stream opened for OA metrics
>>    * @buf: destination buffer given by userspace
>>    * @count: the number of bytes userspace wants to read
>>    * @offset: (inout): the current position for writing into @buf
>> + * @ts: copy OA reports till this timestamp
>>    *
>>    * Notably any error condition resulting in a short read (-%ENOSPC or
>>    * -%EFAULT) will be returned even though one or more records may
>> @@ -609,7 +943,8 @@ static int append_oa_sample(struct i915_perf_stream
>> *stream,
>>   static int gen8_append_oa_reports(struct i915_perf_stream *stream,
>>                                   char __user *buf,
>>                                   size_t count,
>> -                                 size_t *offset)
>> +                                 size_t *offset,
>> +                                 u32 ts)
>>   {
>>         struct drm_i915_private *dev_priv = stream->dev_priv;
>>         int report_size = dev_priv->perf.oa.oa_buffer.format_size;
>> @@ -623,7 +958,7 @@ static int gen8_append_oa_reports(struct
>> i915_perf_stream *stream,
>>         u32 taken;
>>         int ret = 0;
>>   -     if (WARN_ON(!stream->enabled))
>> +       if (WARN_ON(stream->state != I915_PERF_STREAM_ENABLED))
>>                 return -EIO;
>>         spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
>> @@ -669,6 +1004,11 @@ static int gen8_append_oa_reports(struct
>> i915_perf_stream *stream,
>>                 u32 *report32 = (void *)report;
>>                 u32 ctx_id;
>>                 u32 reason;
>> +               u32 report_ts = report32[1];
>> +
>> +               /* Report timestamp should not exceed the given ts */
>> +               if (report_ts > ts)
>> +                       break;
>>                 /*
>>                  * All the report sizes factor neatly into the buffer
>> @@ -750,23 +1090,23 @@ static int gen8_append_oa_reports(struct
>> i915_perf_stream *stream,
>>                  * switches since it's not-uncommon for periodic samples
>> to
>>                  * identify a switch before any 'context switch' report.
>>                  */
>> -               if (!dev_priv->perf.oa.exclusive_stream->ctx ||
>> -                   dev_priv->perf.oa.specific_ctx_id == ctx_id ||
>> +               if (!stream->ctx ||
>> +                   stream->engine->specific_ctx_id == ctx_id ||
>>                     (dev_priv->perf.oa.oa_buffer.last_ctx_id ==
>> -                    dev_priv->perf.oa.specific_ctx_id) ||
>> +                    stream->engine->specific_ctx_id) ||
>>                     reason & OAREPORT_REASON_CTX_SWITCH) {
>>                         /*
>>                          * While filtering for a single context we avoid
>>                          * leaking the IDs of other contexts.
>>                          */
>> -                       if (dev_priv->perf.oa.exclusive_stream->ctx &&
>> -                           dev_priv->perf.oa.specific_ctx_id != ctx_id)
>> {
>> +                       if (stream->ctx &&
>> +                           stream->engine->specific_ctx_id != ctx_id) {
>>                                 report32[2] = INVALID_CTX_ID;
>>                         }
>>   -                     ret = append_oa_sample(stream, buf, count, offset,
>> -                                              report);
>> +                       ret = append_oa_buffer_sample(stream, buf, count,
>> +                                                     offset, report);
>>                         if (ret)
>>                                 break;
>>   @@ -807,6 +1147,7 @@ static int gen8_append_oa_reports(struct
>> i915_perf_stream *stream,
>>    * @buf: destination buffer given by userspace
>>    * @count: the number of bytes userspace wants to read
>>    * @offset: (inout): the current position for writing into @buf
>> + * @ts: copy OA reports till this timestamp
>>    *
>>    * Checks OA unit status registers and if necessary appends
>> corresponding
>>    * status records for userspace (such as for a buffer full condition)
>> and then
>> @@ -824,7 +1165,8 @@ static int gen8_append_oa_reports(struct
>> i915_perf_stream *stream,
>>   static int gen8_oa_read(struct i915_perf_stream *stream,
>>                         char __user *buf,
>>                         size_t count,
>> -                       size_t *offset)
>> +                       size_t *offset,
>> +                       u32 ts)
>>   {
>>         struct drm_i915_private *dev_priv = stream->dev_priv;
>>         u32 oastatus;
>> @@ -877,7 +1219,7 @@ static int gen8_oa_read(struct i915_perf_stream
>> *stream,
>>                            oastatus & ~GEN8_OASTATUS_REPORT_LOST);
>>         }
>>   -     return gen8_append_oa_reports(stream, buf, count, offset);
>> +       return gen8_append_oa_reports(stream, buf, count, offset, ts);
>>   }
>>     /**
>> @@ -886,6 +1228,7 @@ static int gen8_oa_read(struct i915_perf_stream
>> *stream,
>>    * @buf: destination buffer given by userspace
>>    * @count: the number of bytes userspace wants to read
>>    * @offset: (inout): the current position for writing into @buf
>> + * @ts: copy OA reports till this timestamp
>>    *
>>    * Notably any error condition resulting in a short read (-%ENOSPC or
>>    * -%EFAULT) will be returned even though one or more records may
>> @@ -903,7 +1246,8 @@ static int gen8_oa_read(struct i915_perf_stream
>> *stream,
>>   static int gen7_append_oa_reports(struct i915_perf_stream *stream,
>>                                   char __user *buf,
>>                                   size_t count,
>> -                                 size_t *offset)
>> +                                 size_t *offset,
>> +                                 u32 ts)
>>   {
>>         struct drm_i915_private *dev_priv = stream->dev_priv;
>>         int report_size = dev_priv->perf.oa.oa_buffer.format_size;
>> @@ -917,7 +1261,7 @@ static int gen7_append_oa_reports(struct
>> i915_perf_stream *stream,
>>         u32 taken;
>>         int ret = 0;
>>   -     if (WARN_ON(!stream->enabled))
>> +       if (WARN_ON(stream->state != I915_PERF_STREAM_ENABLED))
>>                 return -EIO;
>>         spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
>> @@ -984,7 +1328,12 @@ static int gen7_append_oa_reports(struct
>> i915_perf_stream *stream,
>>                         continue;
>>                 }
>>   -             ret = append_oa_sample(stream, buf, count, offset,
>> report);
>> +               /* Report timestamp should not exceed the given ts */
>> +               if (report32[1] > ts)
>> +                       break;
>> +
>> +               ret = append_oa_buffer_sample(stream, buf, count, offset,
>> +                                             report);
>>                 if (ret)
>>                         break;
>>   @@ -1022,6 +1371,7 @@ static int gen7_append_oa_reports(struct
>> i915_perf_stream *stream,
>>    * @buf: destination buffer given by userspace
>>    * @count: the number of bytes userspace wants to read
>>    * @offset: (inout): the current position for writing into @buf
>> + * @ts: copy OA reports till this timestamp
>>    *
>>    * Checks Gen 7 specific OA unit status registers and if necessary
>> appends
>>    * corresponding status records for userspace (such as for a buffer full
>> @@ -1035,7 +1385,8 @@ static int gen7_append_oa_reports(struct
>> i915_perf_stream *stream,
>>   static int gen7_oa_read(struct i915_perf_stream *stream,
>>                         char __user *buf,
>>                         size_t count,
>> -                       size_t *offset)
>> +                       size_t *offset,
>> +                       u32 ts)
>>   {
>>         struct drm_i915_private *dev_priv = stream->dev_priv;
>>         u32 oastatus1;
>> @@ -1097,16 +1448,172 @@ static int gen7_oa_read(struct i915_perf_stream
>> *stream,
>>                         GEN7_OASTATUS1_REPORT_LOST;
>>         }
>>   -     return gen7_append_oa_reports(stream, buf, count, offset);
>> +       return gen7_append_oa_reports(stream, buf, count, offset, ts);
>> +}
>> +
>> +/**
>> + * append_cs_buffer_sample - Copies single perf sample data associated
>> with
>> + * GPU command stream, into userspace read() buffer.
>> + * @stream: An i915-perf stream opened for perf CS metrics
>> + * @buf: destination buffer given by userspace
>> + * @count: the number of bytes userspace wants to read
>> + * @offset: (inout): the current position for writing into @buf
>> + * @node: Sample data associated with perf metrics
>> + *
>> + * Returns: 0 on success, negative error code on failure.
>> + */
>> +static int append_cs_buffer_sample(struct i915_perf_stream *stream,
>> +                               char __user *buf,
>> +                               size_t count,
>> +                               size_t *offset,
>> +                               struct i915_perf_cs_sample *node)
>> +{
>> +       struct drm_i915_private *dev_priv = stream->dev_priv;
>> +       struct i915_perf_sample_data data = { 0 };
>> +       u32 sample_flags = stream->sample_flags;
>> +       int ret = 0;
>> +
>> +       if (sample_flags & SAMPLE_OA_REPORT) {
>> +               const u8 *report = stream->cs_buffer.vaddr + node->offset;
>> +               u32 sample_ts = *(u32 *)(report + 4);
>> +
>> +               data.report = report;
>> +
>> +               /* First, append the periodic OA samples having lower
>> +                * timestamp values
>> +                */
>> +               ret = dev_priv->perf.oa.ops.read(stream, buf, count,
>> offset,
>> +                                                sample_ts);
>> +               if (ret)
>> +                       return ret;
>> +       }
>> +
>> +       if (sample_flags & SAMPLE_OA_SOURCE)
>> +               data.source = I915_PERF_SAMPLE_OA_SOURCE_CS;
>> +
>> +       if (sample_flags & SAMPLE_CTX_ID)
>> +               data.ctx_id = node->ctx_id;
>> +
>> +       return append_perf_sample(stream, buf, count, offset, &data);
>>   }
>>     /**
>> - * i915_oa_wait_unlocked - handles blocking IO until OA data available
>> + * append_cs_buffer_samples: Copies all command stream based perf samples
>> + * into userspace read() buffer.
>> + * @stream: An i915-perf stream opened for perf CS metrics
>> + * @buf: destination buffer given by userspace
>> + * @count: the number of bytes userspace wants to read
>> + * @offset: (inout): the current position for writing into @buf
>> + *
>> + * Notably any error condition resulting in a short read (-%ENOSPC or
>> + * -%EFAULT) will be returned even though one or more records may
>> + * have been successfully copied. In this case it's up to the caller
>> + * to decide if the error should be squashed before returning to
>> + * userspace.
>> + *
>> + * Returns: 0 on success, negative error code on failure.
>> + */
>> +static int append_cs_buffer_samples(struct i915_perf_stream *stream,
>> +                               char __user *buf,
>> +                               size_t count,
>> +                               size_t *offset)
>> +{
>> +       struct i915_perf_cs_sample *entry, *next;
>> +       LIST_HEAD(free_list);
>> +       int ret = 0;
>> +       unsigned long flags;
>> +
>> +       spin_lock_irqsave(&stream->cs_samples_lock, flags);
>> +       if (list_empty(&stream->cs_samples)) {
>> +               spin_unlock_irqrestore(&stream->cs_samples_lock, flags);
>> +               return 0;
>> +       }
>> +       list_for_each_entry_safe(entry, next,
>> +                                &stream->cs_samples, link) {
>> +               if (!i915_gem_request_completed(entry->request))
>> +                       break;
>> +               list_move_tail(&entry->link, &free_list);
>> +       }
>> +       spin_unlock_irqrestore(&stream->cs_samples_lock, flags);
>> +
>> +       if (list_empty(&free_list))
>> +               return 0;
>> +
>> +       list_for_each_entry_safe(entry, next, &free_list, link) {
>> +               ret = append_cs_buffer_sample(stream, buf, count, offset,
>> +                                             entry);
>> +               if (ret)
>> +                       break;
>> +
>> +               list_del(&entry->link);
>> +               i915_gem_request_put(entry->request);
>> +               kfree(entry);
>> +       }
>> +
>> +       /* Don't discard remaining entries, keep them for next read */
>> +       spin_lock_irqsave(&stream->cs_samples_lock, flags);
>> +       list_splice(&free_list, &stream->cs_samples);
>> +       spin_unlock_irqrestore(&stream->cs_samples_lock, flags);
>> +
>> +       return ret;
>> +}
>> +
>> +/*
>> + * cs_buffer_is_empty - Checks whether the command stream buffer
>> + * associated with the stream has data available.
>>    * @stream: An i915-perf stream opened for OA metrics
>>    *
>> + * Returns: true if no sample is queued, or if the request of the first queued
>> + * sample has not completed yet; else returns false.
>> + */
>> +static bool cs_buffer_is_empty(struct i915_perf_stream *stream)
>> +
>> +{
>> +       struct i915_perf_cs_sample *entry = NULL;
>> +       struct drm_i915_gem_request *request = NULL;
>> +       unsigned long flags;
>> +
>> +       spin_lock_irqsave(&stream->cs_samples_lock, flags);
>> +       entry = list_first_entry_or_null(&stream->cs_samples,
>> +                       struct i915_perf_cs_sample, link);
>> +       if (entry)
>> +               request = entry->request;
>> +       spin_unlock_irqrestore(&stream->cs_samples_lock, flags);
>> +
>> +       if (!entry)
>> +               return true;
>> +       else if (!i915_gem_request_completed(request))
>> +               return true;
>> +       else
>> +               return false;
>> +}
>> +
>> +/**
>> + * stream_have_data_unlocked - Checks whether the stream has data
>> available
>> + * @stream: An i915-perf stream opened for OA metrics
>> + *
>> + * For command stream based streams, check if the command stream buffer
>> has
>> + * atleast one sample available, if not return false, irrespective of
>> periodic
>> + * oa buffer having the data or not.
>> + */
>> +
>> +static bool stream_have_data_unlocked(struct i915_perf_stream *stream)
>> +{
>> +       struct drm_i915_private *dev_priv = stream->dev_priv;
>> +
>> +       if (stream->cs_mode)
>> +               return !cs_buffer_is_empty(stream);
>> +       else
>> +               return oa_buffer_check_unlocked(dev_priv);
>> +}
>> +
>> +/**
>> + * i915_perf_stream_wait_unlocked - handles blocking IO until data
>> available
>> + * @stream: An i915-perf stream opened for GPU metrics
>> + *
>>    * Called when userspace tries to read() from a blocking stream FD
>> opened
>> - * for OA metrics. It waits until the hrtimer callback finds a non-empty
>> - * OA buffer and wakes us.
>> + * for perf metrics. It waits until the hrtimer callback finds a
>> non-empty
>> + * command stream buffer / OA buffer and wakes us.
>>    *
>>    * Note: it's acceptable to have this return with some false positives
>>    * since any subsequent read handling will return -EAGAIN if there isn't
>> @@ -1114,7 +1621,7 @@ static int gen7_oa_read(struct i915_perf_stream
>> *stream,
>>    *
>>    * Returns: zero on success or a negative error code
>>    */
>> -static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
>> +static int i915_perf_stream_wait_unlocked(struct i915_perf_stream
>> *stream)
>>   {
>>         struct drm_i915_private *dev_priv = stream->dev_priv;
>>   @@ -1122,32 +1629,47 @@ static int i915_oa_wait_unlocked(struct
>> i915_perf_stream *stream)
>>         if (!dev_priv->perf.oa.periodic)
>>                 return -EIO;
>>   -     return wait_event_interruptible(dev_priv->perf.oa.poll_wq,
>> -                                       oa_buffer_check_unlocked(dev_
>> priv));
>> +       if (stream->cs_mode) {
>> +               long int ret;
>> +
>> +               /* Wait for all the sampled requests. */
>> +               ret = reservation_object_wait_timeout_rcu(
>> +
>>  stream->cs_buffer.vma->resv,
>> +                                                   true,
>> +                                                   true,
>> +                                                   MAX_SCHEDULE_TIMEOUT);
>> +               if (unlikely(ret < 0)) {
>> +                       DRM_DEBUG_DRIVER("Failed to wait for sampled
>> requests: %li\n", ret);
>> +                       return ret;
>> +               }
>> +       }
>> +
>> +       return wait_event_interruptible(stream->poll_wq,
>> +                                       stream_have_data_unlocked(str
>> eam));
>>   }
>>     /**
>> - * i915_oa_poll_wait - call poll_wait() for an OA stream poll()
>> - * @stream: An i915-perf stream opened for OA metrics
>> + * i915_perf_stream_poll_wait - call poll_wait() for a stream poll()
>> + * @stream: An i915-perf stream opened for GPU metrics
>>    * @file: An i915 perf stream file
>>    * @wait: poll() state table
>>    *
>> - * For handling userspace polling on an i915 perf stream opened for OA
>> metrics,
>> + * For handling userspace polling on an i915 perf stream opened for
>> metrics,
>>    * this starts a poll_wait with the wait queue that our hrtimer
>> callback wakes
>> - * when it sees data ready to read in the circular OA buffer.
>> + * when it sees data ready to read either in command stream buffer or in
>> the
>> + * circular OA buffer.
>>    */
>> -static void i915_oa_poll_wait(struct i915_perf_stream *stream,
>> +static void i915_perf_stream_poll_wait(struct i915_perf_stream *stream,
>>                               struct file *file,
>>                               poll_table *wait)
>>   {
>> -       struct drm_i915_private *dev_priv = stream->dev_priv;
>> -
>> -       poll_wait(file, &dev_priv->perf.oa.poll_wq, wait);
>> +       poll_wait(file, &stream->poll_wq, wait);
>>   }
>>     /**
>> - * i915_oa_read - just calls through to &i915_oa_ops->read
>> - * @stream: An i915-perf stream opened for OA metrics
>> + * i915_perf_stream_read - Reads perf metrics available into userspace
>> read
>> + * buffer
>> + * @stream: An i915-perf stream opened for GPU metrics
>>    * @buf: destination buffer given by userspace
>>    * @count: the number of bytes userspace wants to read
>>    * @offset: (inout): the current position for writing into @buf
>> @@ -1157,14 +1679,21 @@ static void i915_oa_poll_wait(struct
>> i915_perf_stream *stream,
>>    *
>>    * Returns: zero on success or a negative error code
>>    */
>> -static int i915_oa_read(struct i915_perf_stream *stream,
>> +static int i915_perf_stream_read(struct i915_perf_stream *stream,
>>                         char __user *buf,
>>                         size_t count,
>>                         size_t *offset)
>>   {
>>         struct drm_i915_private *dev_priv = stream->dev_priv;
>>   -     return dev_priv->perf.oa.ops.read(stream, buf, count, offset);
>> +
>>
>
> Does the following code mean that a perf stream is either in cs_mode or OA
> mode?
> I couldn't see that condition in the function processing the opening
> parameters.
>
> The comments in the patch description also say:
>
> "Both periodic and CS based reports are associated with a single stream"
>
> The following code seems to contradict that. Can you explain how it works?
>
> Thanks
>

Hi Lionel,

If you look closely, the append_cs_buffer_sample() function merge-sorts the
OA reports from two independent buffers (the OA buffer, which holds the
periodic OA samples, and the command stream buffer, which holds the RCS-based
OA reports). This is done on the basis of the report timestamps: before
appending each command stream sample, it first appends the periodic OA
reports whose timestamps do not exceed that sample's timestamp.
Therefore, in the code below, if stream->cs_mode is enabled, the
append_cs_buffer_samples() function needs to be called; it takes care of
collating the samples from these two independent buffers and copying them to
the stream's buffer in merge-sorted order. If cs_mode is not enabled, we can
simply collect the samples from the periodic OA buffer and forward them to
userspace (done via the perf.oa.ops.read() function).
I hope this addresses your question.

Regards,
Sourab

>
> +       if (stream->cs_mode)
>> +               return append_cs_buffer_samples(stream, buf, count,
>> offset);
>> +       else if (stream->sample_flags & SAMPLE_OA_REPORT)
>> +               return dev_priv->perf.oa.ops.read(stream, buf, count,
>> offset,
>> +                                               U32_MAX);
>> +       else
>> +               return -EINVAL;
>>   }
>>     /**
>> @@ -1182,7 +1711,7 @@ static int oa_get_render_ctx_id(struct
>> i915_perf_stream *stream)
>>         struct drm_i915_private *dev_priv = stream->dev_priv;
>>         if (i915.enable_execlists)
>> -               dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
>> +               stream->engine->specific_ctx_id = stream->ctx->hw_id;
>>         else {
>>                 struct intel_engine_cs *engine = dev_priv->engine[RCS];
>>                 struct intel_ring *ring;
>> @@ -1209,7 +1738,7 @@ static int oa_get_render_ctx_id(struct
>> i915_perf_stream *stream)
>>                  * i915_ggtt_offset() on the fly) considering the
>> difference
>>                  * with gen8+ and execlists
>>                  */
>> -               dev_priv->perf.oa.specific_ctx_id =
>> +               stream->engine->specific_ctx_id =
>>                         i915_ggtt_offset(stream->ctx->
>> engine[engine->id].state);
>>         }
>>   @@ -1228,13 +1757,13 @@ static void oa_put_render_ctx_id(struct
>> i915_perf_stream *stream)
>>         struct drm_i915_private *dev_priv = stream->dev_priv;
>>         if (i915.enable_execlists) {
>> -               dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
>> +               stream->engine->specific_ctx_id = INVALID_CTX_ID;
>>         } else {
>>                 struct intel_engine_cs *engine = dev_priv->engine[RCS];
>>                 mutex_lock(&dev_priv->drm.struct_mutex);
>>   -             dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
>> +               stream->engine->specific_ctx_id = INVALID_CTX_ID;
>>                 engine->context_unpin(engine, stream->ctx);
>>                 mutex_unlock(&dev_priv->drm.struct_mutex);
>> @@ -1242,13 +1771,28 @@ static void oa_put_render_ctx_id(struct
>> i915_perf_stream *stream)
>>   }
>>     static void
>> +free_cs_buffer(struct i915_perf_stream *stream)
>> +{
>> +       struct drm_i915_private *dev_priv = stream->dev_priv;
>> +
>> +       mutex_lock(&dev_priv->drm.struct_mutex);
>> +
>> +       i915_gem_object_unpin_map(stream->cs_buffer.vma->obj);
>> +       i915_vma_unpin_and_release(&stream->cs_buffer.vma);
>> +
>> +       stream->cs_buffer.vma = NULL;
>> +       stream->cs_buffer.vaddr = NULL;
>> +
>> +       mutex_unlock(&dev_priv->drm.struct_mutex);
>> +}
>> +
>> +static void
>>   free_oa_buffer(struct drm_i915_private *i915)
>>   {
>>         mutex_lock(&i915->drm.struct_mutex);
>>         i915_gem_object_unpin_map(i915->perf.oa.oa_buffer.vma->obj);
>> -       i915_vma_unpin(i915->perf.oa.oa_buffer.vma);
>> -       i915_gem_object_put(i915->perf.oa.oa_buffer.vma->obj);
>> +       i915_vma_unpin_and_release(&i915->perf.oa.oa_buffer.vma);
>>         i915->perf.oa.oa_buffer.vma = NULL;
>>         i915->perf.oa.oa_buffer.vaddr = NULL;
>> @@ -1256,27 +1800,41 @@ static void oa_put_render_ctx_id(struct
>> i915_perf_stream *stream)
>>         mutex_unlock(&i915->drm.struct_mutex);
>>   }
>>   -static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
>> +static void i915_perf_stream_destroy(struct i915_perf_stream *stream)
>>   {
>>         struct drm_i915_private *dev_priv = stream->dev_priv;
>> -
>> -       BUG_ON(stream != dev_priv->perf.oa.exclusive_stream);
>> +       struct intel_engine_cs *engine = stream->engine;
>> +       struct i915_perf_stream *engine_stream;
>> +       int idx;
>> +
>> +       idx = srcu_read_lock(&engine->perf_srcu);
>> +       engine_stream = srcu_dereference(engine->exclusive_stream,
>> +                                        &engine->perf_srcu);
>> +       if (WARN_ON(stream != engine_stream))
>> +               return;
>> +       srcu_read_unlock(&engine->perf_srcu, idx);
>>         /*
>>          * Unset exclusive_stream first, it might be checked while
>>          * disabling the metric set on gen8+.
>>          */
>> -       dev_priv->perf.oa.exclusive_stream = NULL;
>> +       rcu_assign_pointer(stream->engine->exclusive_stream, NULL);
>> +       synchronize_srcu(&stream->engine->perf_srcu);
>>   -     dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
>> +       if (stream->using_oa) {
>> +               dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
>>   -     free_oa_buffer(dev_priv);
>> +               free_oa_buffer(dev_priv);
>>   -     intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>> -       intel_runtime_pm_put(dev_priv);
>> +               intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>> +               intel_runtime_pm_put(dev_priv);
>>   -     if (stream->ctx)
>> -               oa_put_render_ctx_id(stream);
>> +               if (stream->ctx)
>> +                       oa_put_render_ctx_id(stream);
>> +       }
>> +
>> +       if (stream->cs_mode)
>> +               free_cs_buffer(stream);
>>         if (dev_priv->perf.oa.spurious_report_rs.missed) {
>>                 DRM_NOTE("%d spurious OA report notices suppressed due to
>> ratelimiting\n",
>> @@ -1325,11 +1883,6 @@ static void gen7_init_oa_buffer(struct
>> drm_i915_private *dev_priv)
>>          * memory...
>>          */
>>         memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
>> -
>> -       /* Maybe make ->pollin per-stream state if we support multiple
>> -        * concurrent streams in the future.
>> -        */
>> -       dev_priv->perf.oa.pollin = false;
>>   }
>>     static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
>> @@ -1383,33 +1936,26 @@ static void gen8_init_oa_buffer(struct
>> drm_i915_private *dev_priv)
>>          * memory...
>>          */
>>         memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
>> -
>> -       /*
>> -        * Maybe make ->pollin per-stream state if we support multiple
>> -        * concurrent streams in the future.
>> -        */
>> -       dev_priv->perf.oa.pollin = false;
>>   }
>>   -static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
>> +static int alloc_obj(struct drm_i915_private *dev_priv,
>> +                    struct i915_vma **vma, u8 **vaddr)
>>   {
>>         struct drm_i915_gem_object *bo;
>> -       struct i915_vma *vma;
>>         int ret;
>>   -     if (WARN_ON(dev_priv->perf.oa.oa_buffer.vma))
>> -               return -ENODEV;
>> +       intel_runtime_pm_get(dev_priv);
>>         ret = i915_mutex_lock_interruptible(&dev_priv->drm);
>>         if (ret)
>> -               return ret;
>> +               goto out;
>>         BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
>>         BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);
>>         bo = i915_gem_object_create(dev_priv, OA_BUFFER_SIZE);
>>         if (IS_ERR(bo)) {
>> -               DRM_ERROR("Failed to allocate OA buffer\n");
>> +               DRM_ERROR("Failed to allocate i915 perf obj\n");
>>                 ret = PTR_ERR(bo);
>>                 goto unlock;
>>         }
>> @@ -1419,42 +1965,83 @@ static int alloc_oa_buffer(struct
>> drm_i915_private *dev_priv)
>>                 goto err_unref;
>>         /* PreHSW required 512K alignment, HSW requires 16M */
>> -       vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, 0);
>> -       if (IS_ERR(vma)) {
>> -               ret = PTR_ERR(vma);
>> +       *vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, 0);
>> +       if (IS_ERR(*vma)) {
>> +               ret = PTR_ERR(*vma);
>>                 goto err_unref;
>>         }
>> -       dev_priv->perf.oa.oa_buffer.vma = vma;
>>   -     dev_priv->perf.oa.oa_buffer.vaddr =
>> -               i915_gem_object_pin_map(bo, I915_MAP_WB);
>> -       if (IS_ERR(dev_priv->perf.oa.oa_buffer.vaddr)) {
>> -               ret = PTR_ERR(dev_priv->perf.oa.oa_buffer.vaddr);
>> +       *vaddr = i915_gem_object_pin_map(bo, I915_MAP_WB);
>> +       if (IS_ERR(*vaddr)) {
>> +               ret = PTR_ERR(*vaddr);
>>                 goto err_unpin;
>>         }
>>   -     dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
>> -
>> -       DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr
>> = %p\n",
>> -                        i915_ggtt_offset(dev_priv->per
>> f.oa.oa_buffer.vma),
>> -                        dev_priv->perf.oa.oa_buffer.vaddr);
>> -
>>         goto unlock;
>>     err_unpin:
>> -       __i915_vma_unpin(vma);
>> +       i915_vma_unpin(*vma);
>>     err_unref:
>>         i915_gem_object_put(bo);
>>   -     dev_priv->perf.oa.oa_buffer.vaddr = NULL;
>> -       dev_priv->perf.oa.oa_buffer.vma = NULL;
>> -
>>   unlock:
>>         mutex_unlock(&dev_priv->drm.struct_mutex);
>> +out:
>> +       intel_runtime_pm_put(dev_priv);
>>         return ret;
>>   }
>>   +static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
>> +{
>> +       struct i915_vma *vma;
>> +       u8 *vaddr;
>> +       int ret;
>> +
>> +       if (WARN_ON(dev_priv->perf.oa.oa_buffer.vma))
>> +               return -ENODEV;
>> +
>> +       ret = alloc_obj(dev_priv, &vma, &vaddr);
>> +       if (ret)
>> +               return ret;
>> +
>> +       dev_priv->perf.oa.oa_buffer.vma = vma;
>> +       dev_priv->perf.oa.oa_buffer.vaddr = vaddr;
>> +
>> +       dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
>> +
>> +       DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr
>> = %p",
>> +                        i915_ggtt_offset(dev_priv->per
>> f.oa.oa_buffer.vma),
>> +                        dev_priv->perf.oa.oa_buffer.vaddr);
>> +       return 0;
>> +}
>> +
>> +static int alloc_cs_buffer(struct i915_perf_stream *stream)
>> +{
>> +       struct drm_i915_private *dev_priv = stream->dev_priv;
>> +       struct i915_vma *vma;
>> +       u8 *vaddr;
>> +       int ret;
>> +
>> +       if (WARN_ON(stream->cs_buffer.vma))
>> +               return -ENODEV;
>> +
>> +       ret = alloc_obj(dev_priv, &vma, &vaddr);
>> +       if (ret)
>> +               return ret;
>> +
>> +       stream->cs_buffer.vma = vma;
>> +       stream->cs_buffer.vaddr = vaddr;
>> +       if (WARN_ON(!list_empty(&stream->cs_samples)))
>> +               INIT_LIST_HEAD(&stream->cs_samples);
>> +
>> +       DRM_DEBUG_DRIVER("Command stream buf initialized, gtt offset =
>> 0x%x, vaddr = %p",
>> +                        i915_ggtt_offset(stream->cs_buffer.vma),
>> +                        stream->cs_buffer.vaddr);
>> +
>> +       return 0;
>> +}
>> +
>>   static void config_oa_regs(struct drm_i915_private *dev_priv,
>>                            const struct i915_oa_reg *regs,
>>                            int n_regs)
>> @@ -1859,6 +2446,10 @@ static void gen8_disable_metric_set(struct
>> drm_i915_private *dev_priv)
>>     static void gen7_oa_enable(struct drm_i915_private *dev_priv)
>>   {
>> +       struct i915_perf_stream *stream;
>> +       struct intel_engine_cs *engine = dev_priv->engine[RCS];
>> +       int idx;
>> +
>>         /*
>>          * Reset buf pointers so we don't forward reports from before now.
>>          *
>> @@ -1870,11 +2461,11 @@ static void gen7_oa_enable(struct
>> drm_i915_private *dev_priv)
>>          */
>>         gen7_init_oa_buffer(dev_priv);
>>   -     if (dev_priv->perf.oa.exclusive_stream->enabled) {
>> -               struct i915_gem_context *ctx =
>> -                       dev_priv->perf.oa.exclusive_stream->ctx;
>> -               u32 ctx_id = dev_priv->perf.oa.specific_ctx_id;
>> -
>> +       idx = srcu_read_lock(&engine->perf_srcu);
>> +       stream = srcu_dereference(engine->exclusive_stream,
>> &engine->perf_srcu);
>> +       if (stream->state != I915_PERF_STREAM_DISABLED) {
>> +               struct i915_gem_context *ctx = stream->ctx;
>> +               u32 ctx_id = engine->specific_ctx_id;
>>                 bool periodic = dev_priv->perf.oa.periodic;
>>                 u32 period_exponent = dev_priv->perf.oa.period_exponent;
>>                 u32 report_format = dev_priv->perf.oa.oa_buffer.format;
>> @@ -1889,6 +2480,7 @@ static void gen7_oa_enable(struct drm_i915_private
>> *dev_priv)
>>                            GEN7_OACONTROL_ENABLE);
>>         } else
>>                 I915_WRITE(GEN7_OACONTROL, 0);
>> +       srcu_read_unlock(&engine->perf_srcu, idx);
>>   }
>>     static void gen8_oa_enable(struct drm_i915_private *dev_priv)
>> @@ -1917,22 +2509,23 @@ static void gen8_oa_enable(struct
>> drm_i915_private *dev_priv)
>>   }
>>     /**
>> - * i915_oa_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for OA stream
>> - * @stream: An i915 perf stream opened for OA metrics
>> + * i915_perf_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for perf
>> stream
>> + * @stream: An i915 perf stream opened for GPU metrics
>>    *
>>    * [Re]enables hardware periodic sampling according to the period configured
>>    * when opening the stream. This also starts a hrtimer that will periodically
>>    * check for data in the circular OA buffer for notifying userspace (e.g.
>>    * during a read() or poll()).
>>    */
>> -static void i915_oa_stream_enable(struct i915_perf_stream *stream)
>> +static void i915_perf_stream_enable(struct i915_perf_stream *stream)
>>   {
>>         struct drm_i915_private *dev_priv = stream->dev_priv;
>>   -     dev_priv->perf.oa.ops.oa_enable(dev_priv);
>> +       if (stream->sample_flags & SAMPLE_OA_REPORT)
>> +               dev_priv->perf.oa.ops.oa_enable(dev_priv);
>>   -     if (dev_priv->perf.oa.periodic)
>> -               hrtimer_start(&dev_priv->perf.oa.poll_check_timer,
>> +       if (stream->cs_mode || dev_priv->perf.oa.periodic)
>> +               hrtimer_start(&dev_priv->perf.poll_check_timer,
>>                               ns_to_ktime(POLL_PERIOD),
>>                               HRTIMER_MODE_REL_PINNED);
>>   }
>> @@ -1948,34 +2541,39 @@ static void gen8_oa_disable(struct
>> drm_i915_private *dev_priv)
>>   }
>>     /**
>> - * i915_oa_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for OA stream
>> - * @stream: An i915 perf stream opened for OA metrics
>> + * i915_perf_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for perf stream
>> + * @stream: An i915 perf stream opened for GPU metrics
>>    *
>>    * Stops the OA unit from periodically writing counter reports into the
>>    * circular OA buffer. This also stops the hrtimer that periodically checks for
>>    * data in the circular OA buffer, for notifying userspace.
>>    */
>> -static void i915_oa_stream_disable(struct i915_perf_stream *stream)
>> +static void i915_perf_stream_disable(struct i915_perf_stream *stream)
>>   {
>>         struct drm_i915_private *dev_priv = stream->dev_priv;
>>   -     dev_priv->perf.oa.ops.oa_disable(dev_priv);
>> +       if (stream->cs_mode || dev_priv->perf.oa.periodic)
>> +               hrtimer_cancel(&dev_priv->perf.poll_check_timer);
>> +
>> +       if (stream->cs_mode)
>> +               i915_perf_stream_release_samples(stream);
>>   -     if (dev_priv->perf.oa.periodic)
>> -               hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer);
>> +       if (stream->sample_flags & SAMPLE_OA_REPORT)
>> +               dev_priv->perf.oa.ops.oa_disable(dev_priv);
>>   }
>>   -static const struct i915_perf_stream_ops i915_oa_stream_ops = {
>> -       .destroy = i915_oa_stream_destroy,
>> -       .enable = i915_oa_stream_enable,
>> -       .disable = i915_oa_stream_disable,
>> -       .wait_unlocked = i915_oa_wait_unlocked,
>> -       .poll_wait = i915_oa_poll_wait,
>> -       .read = i915_oa_read,
>> +static const struct i915_perf_stream_ops perf_stream_ops = {
>> +       .destroy = i915_perf_stream_destroy,
>> +       .enable = i915_perf_stream_enable,
>> +       .disable = i915_perf_stream_disable,
>> +       .wait_unlocked = i915_perf_stream_wait_unlocked,
>> +       .poll_wait = i915_perf_stream_poll_wait,
>> +       .read = i915_perf_stream_read,
>> +       .emit_sample_capture = i915_perf_stream_emit_sample_capture,
>>   };
>>     /**
>> - * i915_oa_stream_init - validate combined props for OA stream and init
>> + * i915_perf_stream_init - validate combined props for stream and init
>>    * @stream: An i915 perf stream
>>    * @param: The open parameters passed to `DRM_I915_PERF_OPEN`
>>    * @props: The property state that configures stream (individually validated)
>> @@ -1984,58 +2582,35 @@ static void i915_oa_stream_disable(struct
>> i915_perf_stream *stream)
>>    * doesn't ensure that the combination necessarily makes sense.
>>    *
>>    * At this point it has been determined that userspace wants a stream of
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/intel-gfx/attachments/20170731/461346e5/attachment-0001.html>


More information about the Intel-gfx mailing list