[Intel-gfx] [PATCH v3 4/6] drm/i915: Request watchdog infrastructure

Matthew Auld matthew.william.auld at gmail.com
Tue Mar 23 10:54:13 UTC 2021


On Mon, 22 Mar 2021 at 13:29, Tvrtko Ursulin
<tvrtko.ursulin at linux.intel.com> wrote:
>
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>
> Prepares the plumbing for setting request/fence expiration time. All code
> is put in place but is never activeted due yet missing ability to actually

                 activated

> configure the timer.
>
> Outline of the basic operation:
>
> A timer is started when request is ready for execution. If the request
> completes (retires) before the timer fires, timer is cancelled and nothing
> further happens.
>
> If the timer fires request is added to a lockless list and worker queued.
> Purpose of this is twofold: a) It allows request cancellation from a more
> friendly context and b) coalesces multiple expirations into a single event
> of consuming the list.
>
> Worker locklessly consumes the list of expired requests and cancels them
> all using previous added i915_request_cancel().
>
> Associated timeout value is stored in rq->context.watchdog.timeout_us.
>
> v2:
>  * Log expiration.
>
> v3:
>  * Include more information about user timeline in the log message.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
> ---
>  drivers/gpu/drm/i915/gt/intel_context_types.h |  4 ++
>  .../drm/i915/gt/intel_execlists_submission.h  |  2 +
>  drivers/gpu/drm/i915/gt/intel_gt.c            |  3 +
>  drivers/gpu/drm/i915/gt/intel_gt.h            |  2 +
>  drivers/gpu/drm/i915/gt/intel_gt_requests.c   | 28 ++++++++++
>  drivers/gpu/drm/i915/gt/intel_gt_types.h      |  7 +++
>  drivers/gpu/drm/i915/i915_request.c           | 56 +++++++++++++++++++
>  drivers/gpu/drm/i915/i915_request.h           |  8 +++
>  8 files changed, 110 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
> index 0ea18c9e2aca..65a5730a4f5b 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> @@ -99,6 +99,10 @@ struct intel_context {
>  #define CONTEXT_FORCE_SINGLE_SUBMISSION        7
>  #define CONTEXT_NOPREEMPT              8
>
> +       struct {
> +               u64 timeout_us;
> +       } watchdog;
> +
>         u32 *lrc_reg_state;
>         union {
>                 struct {
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
> index f7bd3fccfee8..4ca9b475e252 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.h
> @@ -6,6 +6,7 @@
>  #ifndef __INTEL_EXECLISTS_SUBMISSION_H__
>  #define __INTEL_EXECLISTS_SUBMISSION_H__
>
> +#include <linux/llist.h>
>  #include <linux/types.h>
>
>  struct drm_printer;
> @@ -13,6 +14,7 @@ struct drm_printer;
>  struct i915_request;
>  struct intel_context;
>  struct intel_engine_cs;
> +struct intel_gt;
>
>  enum {
>         INTEL_CONTEXT_SCHEDULE_IN = 0,
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> index ca76f93bc03d..8d77dcbad059 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> @@ -31,6 +31,9 @@ void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915)
>         INIT_LIST_HEAD(&gt->closed_vma);
>         spin_lock_init(&gt->closed_lock);
>
> +       init_llist_head(&gt->watchdog.list);
> +       INIT_WORK(&gt->watchdog.work, intel_gt_watchdog_work);
> +
>         intel_gt_init_buffer_pool(gt);
>         intel_gt_init_reset(gt);
>         intel_gt_init_requests(gt);
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h
> index a17bd8b3195f..7ec395cace69 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.h
> @@ -78,4 +78,6 @@ static inline bool intel_gt_is_wedged(const struct intel_gt *gt)
>  void intel_gt_info_print(const struct intel_gt_info *info,
>                          struct drm_printer *p);
>
> +void intel_gt_watchdog_work(struct work_struct *work);
> +
>  #endif /* __INTEL_GT_H__ */
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> index 36ec97f79174..fbfd19b2e5f2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> @@ -8,6 +8,7 @@
>  #include "i915_drv.h" /* for_each_engine() */
>  #include "i915_request.h"
>  #include "intel_engine_heartbeat.h"
> +#include "intel_execlists_submission.h"
>  #include "intel_gt.h"
>  #include "intel_gt_pm.h"
>  #include "intel_gt_requests.h"
> @@ -242,4 +243,31 @@ void intel_gt_fini_requests(struct intel_gt *gt)
>  {
>         /* Wait until the work is marked as finished before unloading! */
>         cancel_delayed_work_sync(&gt->requests.retire_work);
> +
> +       flush_work(&gt->watchdog.work);
> +}
> +
> +void intel_gt_watchdog_work(struct work_struct *work)
> +{
> +       struct intel_gt *gt =
> +               container_of(work, typeof(*gt), watchdog.work);
> +       struct i915_request *rq, *rn;
> +       struct llist_node *first;
> +
> +       first = llist_del_all(&gt->watchdog.list);
> +       if (!first)
> +               return;
> +
> +       llist_for_each_entry_safe(rq, rn, first, watchdog.link) {
> +               if (!i915_request_completed(rq)) {
> +                       struct dma_fence *f = &rq->fence;
> +
> +                       pr_notice("Fence expiration time out i915-%s:%s:%llx!\n",
> +                                 f->ops->get_driver_name(f),
> +                                 f->ops->get_timeline_name(f),
> +                                 f->seqno);
> +                       i915_request_cancel(rq, -EINTR);
> +               }
> +               i915_request_put(rq);
> +       }
>  }
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> index 626af37c7790..d70ebcc6f19f 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> @@ -8,10 +8,12 @@
>
>  #include <linux/ktime.h>
>  #include <linux/list.h>
> +#include <linux/llist.h>
>  #include <linux/mutex.h>
>  #include <linux/notifier.h>
>  #include <linux/spinlock.h>
>  #include <linux/types.h>
> +#include <linux/workqueue.h>
>
>  #include "uc/intel_uc.h"
>
> @@ -62,6 +64,11 @@ struct intel_gt {
>                 struct delayed_work retire_work;
>         } requests;
>
> +       struct {
> +               struct llist_head list;
> +               struct work_struct work;
> +       } watchdog;
> +
>         struct intel_wakeref wakeref;
>         atomic_t user_wakeref;
>
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index b4511ac05e9a..9dd5e588b0a4 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -277,6 +277,57 @@ static void remove_from_engine(struct i915_request *rq)
>         __notify_execute_cb_imm(rq);
>  }
>
> +static void __rq_init_watchdog(struct i915_request *rq)
> +{
> +       rq->watchdog.timer.function = NULL;
> +}
> +
> +static enum hrtimer_restart __rq_watchdog_expired(struct hrtimer *hrtimer)
> +{
> +       struct i915_request *rq =
> +               container_of(hrtimer, struct i915_request, watchdog.timer);
> +       struct intel_gt *gt = rq->engine->gt;
> +
> +       if (!i915_request_completed(rq)) {
> +               if (llist_add(&rq->watchdog.link, &gt->watchdog.list))
> +                       schedule_work(&gt->watchdog.work);
> +       } else {
> +               i915_request_put(rq);
> +       }
> +
> +       return HRTIMER_NORESTART;
> +}
> +
> +static void __rq_arm_watchdog(struct i915_request *rq)
> +{
> +       struct i915_request_watchdog *wdg = &rq->watchdog;
> +       struct intel_context *ce = rq->context;
> +
> +       if (!ce->watchdog.timeout_us)
> +               return;
> +
> +       hrtimer_init(&wdg->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +       wdg->timer.function = __rq_watchdog_expired;
> +       hrtimer_start_range_ns(&wdg->timer,
> +                              ns_to_ktime(ce->watchdog.timeout_us *
> +                                          NSEC_PER_USEC),
> +                               /*
> +                                * FIXME check if it gives the "not sooner"
> +                                * guarantee or slack is both ways
> +                                */

It looks like the slack/fuzziness just delays the timer, in case it
can coalesce multiple timer events. So shouldn't be sooner I think?

> +                               NSEC_PER_MSEC,

Formatting.

Reviewed-by: Matthew Auld <matthew.auld at intel.com>


More information about the Intel-gfx mailing list