[Intel-gfx] [PATCH 8/8] drm/i915/gt: Use intel_gt as the primary object for handling resets
Daniele Ceraolo Spurio
daniele.ceraolospurio at intel.com
Mon Jul 8 22:48:51 UTC 2019
On 7/5/19 12:46 AM, Chris Wilson wrote:
> Having taken the first step in encapsulating the functionality by moving
> the related files under gt/, the next step is to start encapsulating by
> passing around the relevant structs rather than the global
> drm_i915_private. In this step, we pass intel_gt to intel_reset.c
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> ---
> drivers/gpu/drm/i915/display/intel_display.c | 21 +-
> drivers/gpu/drm/i915/gem/i915_gem_context.c | 2 +-
> .../gpu/drm/i915/gem/i915_gem_execbuffer.c | 2 +-
> drivers/gpu/drm/i915/gem/i915_gem_mman.c | 8 +-
> drivers/gpu/drm/i915/gem/i915_gem_pm.c | 25 +-
> drivers/gpu/drm/i915/gem/i915_gem_throttle.c | 2 +-
> .../gpu/drm/i915/gem/selftests/huge_pages.c | 20 +-
> .../i915/gem/selftests/i915_gem_client_blt.c | 4 +-
> .../i915/gem/selftests/i915_gem_coherency.c | 6 +-
> .../drm/i915/gem/selftests/i915_gem_context.c | 17 +-
> .../drm/i915/gem/selftests/i915_gem_mman.c | 2 +-
> .../i915/gem/selftests/i915_gem_object_blt.c | 4 +-
> drivers/gpu/drm/i915/gt/intel_engine.h | 8 +-
> drivers/gpu/drm/i915/gt/intel_engine_cs.c | 16 +-
> drivers/gpu/drm/i915/gt/intel_engine_pm.c | 3 +-
> drivers/gpu/drm/i915/gt/intel_gt.c | 7 +
> drivers/gpu/drm/i915/gt/intel_gt.h | 12 +
> drivers/gpu/drm/i915/gt/intel_gt_pm.c | 19 +-
> drivers/gpu/drm/i915/gt/intel_gt_types.h | 12 +
> drivers/gpu/drm/i915/gt/intel_hangcheck.c | 67 +--
> drivers/gpu/drm/i915/gt/intel_lrc.c | 2 +-
> drivers/gpu/drm/i915/gt/intel_reset.c | 435 +++++++++---------
> drivers/gpu/drm/i915/gt/intel_reset.h | 73 +--
> drivers/gpu/drm/i915/gt/intel_reset_types.h | 50 ++
> drivers/gpu/drm/i915/gt/intel_ringbuffer.c | 2 +-
> drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 107 +++--
> drivers/gpu/drm/i915/gt/selftest_lrc.c | 36 +-
> drivers/gpu/drm/i915/gt/selftest_reset.c | 50 +-
> drivers/gpu/drm/i915/gt/selftest_timeline.c | 3 +-
> .../gpu/drm/i915/gt/selftest_workarounds.c | 33 +-
> drivers/gpu/drm/i915/i915_debugfs.c | 63 +--
> drivers/gpu/drm/i915/i915_drv.c | 5 +-
> drivers/gpu/drm/i915/i915_drv.h | 35 +-
> drivers/gpu/drm/i915/i915_gem.c | 31 +-
> drivers/gpu/drm/i915/i915_gpu_error.h | 52 +--
> drivers/gpu/drm/i915/i915_request.c | 5 +-
> drivers/gpu/drm/i915/intel_guc_submission.c | 2 +-
> drivers/gpu/drm/i915/intel_uc.c | 2 +-
> drivers/gpu/drm/i915/selftests/i915_active.c | 3 +-
> drivers/gpu/drm/i915/selftests/i915_gem.c | 3 +-
> .../gpu/drm/i915/selftests/i915_gem_evict.c | 3 +-
> drivers/gpu/drm/i915/selftests/i915_request.c | 4 +-
> .../gpu/drm/i915/selftests/i915_selftest.c | 2 +-
> .../gpu/drm/i915/selftests/igt_flush_test.c | 5 +-
> drivers/gpu/drm/i915/selftests/igt_reset.c | 38 +-
> drivers/gpu/drm/i915/selftests/igt_reset.h | 10 +-
> drivers/gpu/drm/i915/selftests/igt_wedge_me.h | 58 ---
> .../gpu/drm/i915/selftests/mock_gem_device.c | 5 -
> 48 files changed, 665 insertions(+), 709 deletions(-)
> create mode 100644 drivers/gpu/drm/i915/gt/intel_reset_types.h
> delete mode 100644 drivers/gpu/drm/i915/selftests/igt_wedge_me.h
>
> diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c
> index 919f5ac844c8..4dd6856bf722 100644
> --- a/drivers/gpu/drm/i915/display/intel_display.c
> +++ b/drivers/gpu/drm/i915/display/intel_display.c
> @@ -4249,12 +4249,12 @@ void intel_prepare_reset(struct drm_i915_private *dev_priv)
> return;
>
> /* We have a modeset vs reset deadlock, defensively unbreak it. */
> - set_bit(I915_RESET_MODESET, &dev_priv->gpu_error.flags);
> - wake_up_all(&dev_priv->gpu_error.wait_queue);
> + set_bit(I915_RESET_MODESET, &dev_priv->gt.reset.flags);
> + wake_up_bit(&dev_priv->gt.reset.flags, I915_RESET_MODESET);
The doc for wake_up_bit says that we need a memory barrier before
calling it. Do we have it implicitly somewhere or is it missing?
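For reference, the pattern I'd expect from the wake_up_bit() documentation is
roughly this (sketch only, not the actual call site):

    /* waker: publish the flag the waiter is polling... */
    set_bit(I915_RESET_MODESET, &dev_priv->gt.reset.flags);
    /* ...and order it before the waitqueue_active() check inside wake_up_bit() */
    smp_mb__after_atomic();
    wake_up_bit(&dev_priv->gt.reset.flags, I915_RESET_MODESET);

so unless the barrier is implied by something earlier in this path, it looks
like an smp_mb__after_atomic() is missing between set_bit() and wake_up_bit().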
>
> if (atomic_read(&dev_priv->gpu_error.pending_fb_pin)) {
> DRM_DEBUG_KMS("Modeset potentially stuck, unbreaking through wedging\n");
> - i915_gem_set_wedged(dev_priv);
> + intel_gt_set_wedged(&dev_priv->gt);
> }
>
> /*
<snip>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pm.c b/drivers/gpu/drm/i915/gem/i915_gem_pm.c
> index bf085b0cb7c6..8e2eeaec06cb 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_pm.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_pm.c
> @@ -123,18 +124,18 @@ static bool switch_to_kernel_context_sync(struct drm_i915_private *i915)
> * Forcibly cancel outstanding work and leave
> * the gpu quiet.
> */
> - i915_gem_set_wedged(i915);
> + intel_gt_set_wedged(gt);
> result = false;
> }
> - } while (i915_retire_requests(i915) && result);
> + } while (i915_retire_requests(gt->i915) && result);
>
> - GEM_BUG_ON(i915->gt.awake);
> + GEM_BUG_ON(gt->awake);
> return result;
> }
>
> bool i915_gem_load_power_context(struct drm_i915_private *i915)
This should probably move to intel_gt as well since it is the gt power
context that we're loading.
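E.g. something like this (untested, the name is just a strawman):

    bool intel_gt_load_power_context(struct intel_gt *gt)
    {
        return switch_to_kernel_context_sync(gt);
    }

with the existing i915_gem_* entry point either dropped or left as a thin
wrapper passing &i915->gt.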
> {
> - return switch_to_kernel_context_sync(i915);
> + return switch_to_kernel_context_sync(&i915->gt);
> }
>
> void i915_gem_suspend(struct drm_i915_private *i915)
<snip>
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> index 36ba80e6a0b7..9047e28e75ae 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
> @@ -5,7 +5,9 @@
> */
>
> #include "i915_drv.h"
> +#include "i915_params.h"
> #include "intel_engine_pm.h"
> +#include "intel_gt.h"
> #include "intel_gt_pm.h"
> #include "intel_pm.h"
> #include "intel_wakeref.h"
> @@ -17,6 +19,7 @@ static void pm_notify(struct drm_i915_private *i915, int state)
>
> static int intel_gt_unpark(struct intel_wakeref *wf)
> {
> + struct intel_gt *gt = container_of(wf, typeof(*gt), wakeref);
> struct drm_i915_private *i915 =
> container_of(wf, typeof(*i915), gt.wakeref);
gt->i915 ?
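i.e. something like:

    struct intel_gt *gt = container_of(wf, typeof(*gt), wakeref);
    struct drm_i915_private *i915 = gt->i915;

rather than doing a second container_of() on the same wakeref.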
>
<snip>
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> index f43ea830b1e8..70d723e801bf 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> @@ -14,12 +14,21 @@
> #include <linux/types.h>
>
> #include "i915_vma.h"
> +#include "intel_reset_types.h"
> #include "intel_wakeref.h"
>
> struct drm_i915_private;
> struct i915_ggtt;
> struct intel_uncore;
>
> +struct intel_hangcheck {
> + /* For hangcheck timer */
> +#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
> +#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
> +
> + struct delayed_work work;
> +};
should intel_hangcheck be part of intel_reset_types?
> +
> struct intel_gt {
> struct drm_i915_private *i915;
> struct intel_uncore *uncore;
> @@ -39,6 +48,9 @@ struct intel_gt {
> struct list_head closed_vma;
> spinlock_t closed_lock; /* guards the list of closed_vma */
>
> + struct intel_hangcheck hangcheck;
> + struct intel_reset reset;
> +
> /**
> * Is the GPU currently considered idle, or busy executing
> * userspace requests? Whilst idle, we allow runtime power
<snip>
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index 72002c0f9698..67c06231b004 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -1267,113 +1252,119 @@ void i915_handle_error(struct drm_i915_private *i915,
> synchronize_rcu_expedited();
>
> /* Prevent any other reset-engine attempt. */
> - for_each_engine(engine, i915, tmp) {
> + for_each_engine(engine, gt->i915, tmp) {
> while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
> - &error->flags))
> - wait_on_bit(&error->flags,
> + &gt->reset.flags))
> + wait_on_bit(&gt->reset.flags,
> I915_RESET_ENGINE + engine->id,
> TASK_UNINTERRUPTIBLE);
> }
>
> - i915_reset_device(i915, engine_mask, msg);
> + intel_gt_reset_global(gt, engine_mask, msg);
>
> - for_each_engine(engine, i915, tmp) {
> - clear_bit(I915_RESET_ENGINE + engine->id,
> - &error->flags);
> - }
> -
> - clear_bit(I915_RESET_BACKOFF, &error->flags);
> - wake_up_all(&error->reset_queue);
> + for_each_engine(engine, gt->i915, tmp)
> + clear_bit_unlock(I915_RESET_ENGINE + engine->id,
> + &gt->reset.flags);
> + clear_bit_unlock(I915_RESET_BACKOFF, &gt->reset.flags);
> + smp_mb__after_atomic();
Why do we now need this barrier here?
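My guess is that it mirrors clear_and_wake_up_bit() in
include/linux/wait_bit.h, where clear_bit_unlock() (release semantics only)
is followed by an explicit barrier before the wake-up checks for waiters:

    clear_bit_unlock(bit, word);
    smp_mb__after_atomic();	/* order the clear before waking the waiters */
    wake_up_bit(word, bit);

If that's the intent, a short comment here (or a note in the commit message)
would help.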
> + wake_up_all(&gt->reset.queue);
>
> out:
> - intel_runtime_pm_put(&i915->runtime_pm, wakeref);
> + intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);
> }
>
<snip>
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h
> index 03fba0ab3868..62f6cb520f96 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.h
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.h
> @@ -11,56 +11,67 @@
> #include <linux/types.h>
> #include <linux/srcu.h>
>
> -#include "gt/intel_engine_types.h"
> +#include "intel_engine_types.h"
> +#include "intel_reset_types.h"
>
> struct drm_i915_private;
> struct i915_request;
> struct intel_engine_cs;
> +struct intel_gt;
> struct intel_guc;
>
> +void intel_gt_init_reset(struct intel_gt *gt);
> +void intel_gt_fini_reset(struct intel_gt *gt);
> +
> __printf(4, 5)
> -void i915_handle_error(struct drm_i915_private *i915,
> - intel_engine_mask_t engine_mask,
> - unsigned long flags,
> - const char *fmt, ...);
> +void intel_gt_handle_error(struct intel_gt *gt,
> + intel_engine_mask_t engine_mask,
> + unsigned long flags,
> + const char *fmt, ...);
> #define I915_ERROR_CAPTURE BIT(0)
>
> -void i915_reset(struct drm_i915_private *i915,
> - intel_engine_mask_t stalled_mask,
> - const char *reason);
> -int i915_reset_engine(struct intel_engine_cs *engine,
> - const char *reason);
> -
> -void i915_reset_request(struct i915_request *rq, bool guilty);
> +void intel_gt_reset(struct intel_gt *gt,
> + intel_engine_mask_t stalled_mask,
> + const char *reason);
> +int intel_engine_reset(struct intel_engine_cs *engine,
> + const char *reason);
>
> -int __must_check i915_reset_trylock(struct drm_i915_private *i915);
> -void i915_reset_unlock(struct drm_i915_private *i915, int tag);
> +void __i915_request_reset(struct i915_request *rq, bool guilty);
>
> -int i915_terminally_wedged(struct drm_i915_private *i915);
> +int __must_check intel_gt_reset_trylock(struct intel_gt *gt);
> +void intel_gt_reset_unlock(struct intel_gt *gt, int tag);
>
> -bool intel_has_gpu_reset(struct drm_i915_private *i915);
> -bool intel_has_reset_engine(struct drm_i915_private *i915);
> +void intel_gt_set_wedged(struct intel_gt *gt);
> +bool intel_gt_unset_wedged(struct intel_gt *gt);
> +int intel_gt_terminally_wedged(struct intel_gt *gt);
>
> -int intel_gpu_reset(struct drm_i915_private *i915,
> - intel_engine_mask_t engine_mask);
> +int intel_gpu_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask);
time to rename to intel_gt_reset?
>
> -int intel_reset_guc(struct drm_i915_private *i915);
> +int intel_reset_guc(struct intel_gt *gt);
>
> -struct i915_wedge_me {
> +struct intel_wedge_me {
> struct delayed_work work;
> - struct drm_i915_private *i915;
> + struct intel_gt *gt;
> const char *name;
> };
>
> -void __i915_init_wedge(struct i915_wedge_me *w,
> - struct drm_i915_private *i915,
> - long timeout,
> - const char *name);
> -void __i915_fini_wedge(struct i915_wedge_me *w);
> +void __intel_init_wedge(struct intel_wedge_me *w,
> + struct intel_gt *gt,
> + long timeout,
> + const char *name);
> +void __intel_fini_wedge(struct intel_wedge_me *w);
>
> -#define i915_wedge_on_timeout(W, DEV, TIMEOUT) \
> - for (__i915_init_wedge((W), (DEV), (TIMEOUT), __func__); \
> - (W)->i915; \
> - __i915_fini_wedge((W)))
> +#define intel_wedge_on_timeout(W, GT, TIMEOUT) \
> + for (__intel_init_wedge((W), (GT), (TIMEOUT), __func__); \
> + (W)->gt; \
> + __intel_fini_wedge((W)))
> +
> +static inline bool __intel_reset_failed(const struct intel_reset *reset)
> +{
> + return unlikely(test_bit(I915_WEDGED, &reset->flags));
> +}
> +
> +bool intel_has_gpu_reset(struct drm_i915_private *i915);
> +bool intel_has_reset_engine(struct drm_i915_private *i915);
>
> #endif /* I915_RESET_H */
<snip>
> diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> index 2d9cc3cd1f27..8caad19683a1 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> @@ -442,7 +441,7 @@ static int igt_reset_nop(void *arg)
>
> out:
> mock_file_free(i915, file);
> - if (i915_reset_failed(i915))
> + if (intel_gt_is_wedged(&i915->gt))
&i915->gt is used 5 times in igt_reset_nop(), so it might be worth having a
local variable.
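i.e. something like this at the top of igt_reset_nop():

    struct intel_gt *gt = &i915->gt;

and then use gt for the wedged checks, reset counts, etc. below.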
> err = -EIO;
> return err;
> }
<snip>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index ce1b6568515e..cf5155646927 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1061,23 +1062,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
> return 0;
> }
>
> -static int i915_reset_info(struct seq_file *m, void *unused)
The fact that this is going away should be mentioned in the commit message.
> -{
> - struct drm_i915_private *dev_priv = node_to_i915(m->private);
> - struct i915_gpu_error *error = &dev_priv->gpu_error;
> - struct intel_engine_cs *engine;
> - enum intel_engine_id id;
> -
> - seq_printf(m, "full gpu reset = %u\n", i915_reset_count(error));
> -
> - for_each_engine(engine, dev_priv, id) {
> - seq_printf(m, "%s = %u\n", engine->name,
> - i915_reset_engine_count(error, engine));
> - }
> -
> - return 0;
> -}
> -
> static int ironlake_drpc_info(struct seq_file *m)
> {
> struct drm_i915_private *i915 = node_to_i915(m->private);
<snip>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b6f3baa74da4..cec3cea7d86f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -894,13 +894,13 @@ void i915_gem_runtime_suspend(struct drm_i915_private *i915)
> }
> }
>
> -static int wait_for_engines(struct drm_i915_private *i915)
> +static int wait_for_engines(struct intel_gt *gt)
> {
> - if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
> - dev_err(i915->drm.dev,
> + if (wait_for(intel_engines_are_idle(gt), I915_IDLE_ENGINES_TIMEOUT)) {
> + dev_err(gt->i915->drm.dev,
> "Failed to idle engines, declaring wedged!\n");
> GEM_TRACE_DUMP();
> - i915_gem_set_wedged(i915);
> + intel_gt_set_wedged(gt);
> return -EIO;
> }
>
> @@ -971,7 +971,7 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915,
Is it time for this to become intel_gt_wait_for_idle? (as a follow-up)
>
> lockdep_assert_held(&i915->drm.struct_mutex);
>
> - err = wait_for_engines(i915);
> + err = wait_for_engines(&i915->gt);
> if (err)
> return err;
>
> @@ -1149,8 +1149,8 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
> * back to defaults, recovering from whatever wedged state we left it
> * in and so worth trying to use the device once more.
> */
> - if (i915_terminally_wedged(i915))
> - i915_gem_unset_wedged(i915);
> + if (intel_gt_is_wedged(&i915->gt))
> + intel_gt_unset_wedged(&i915->gt);
I was wondering if we should move this inside the intel_gt_sanitize() call
just below, but then I realized that intel_gt_unset_wedged() calls
intel_gt_sanitize() as well, so that's a no (for now). However, IMO it is
still worth at least flipping the i915_gem_sanitize() function over to
intel_gt, since all the calls inside it, rpm aside, can use the gt.
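Rough strawman of what I mean (untested, helper name made up; only the rpm
get/put still needs the full i915):

    static void __gt_sanitize(struct intel_gt *gt)
    {
        /* Recover from a previously failed reset, if possible */
        if (intel_gt_is_wedged(gt))
            intel_gt_unset_wedged(gt);

        /* ...the rest of the current i915_gem_sanitize() body, on gt */
    }

    void i915_gem_sanitize(struct drm_i915_private *i915)
    {
        intel_wakeref_t wakeref;

        wakeref = intel_runtime_pm_get(&i915->runtime_pm);
        __gt_sanitize(&i915->gt);
        intel_runtime_pm_put(&i915->runtime_pm, wakeref);
    }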
>
> /*
> * If we inherit context state from the BIOS or earlier occupants
<snip>
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index 2ecd0c6a1c94..7dfbfda48733 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -7,6 +7,7 @@
> #ifndef _I915_GPU_ERROR_H_
> #define _I915_GPU_ERROR_H_
>
> +#include <linux/atomic.h>
> #include <linux/kref.h>
> #include <linux/ktime.h>
> #include <linux/sched.h>
> @@ -180,12 +181,6 @@ struct i915_gpu_state {
> };
>
> struct i915_gpu_error {
> - /* For hangcheck timer */
> -#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
> -#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
> -
> - struct delayed_work hangcheck_work;
> -
> /* For reset and error_state handling. */
> spinlock_t lock;
> /* Protected by the above dev->gpu_error.lock. */
> @@ -193,52 +188,11 @@ struct i915_gpu_error {
>
> atomic_t pending_fb_pin;
>
> - /**
> - * flags: Control various stages of the GPU reset
> - *
> - * #I915_RESET_BACKOFF - When we start a global reset, we need to
> - * serialise with any other users attempting to do the same, and
> - * any global resources that may be clobber by the reset (such as
> - * FENCE registers).
> - *
> - * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
> - * acquire the struct_mutex to reset an engine, we need an explicit
> - * flag to prevent two concurrent reset attempts in the same engine.
> - * As the number of engines continues to grow, allocate the flags from
> - * the most significant bits.
> - *
> - * #I915_WEDGED - If reset fails and we can no longer use the GPU,
> - * we set the #I915_WEDGED bit. Prior to command submission, e.g.
> - * i915_request_alloc(), this bit is checked and the sequence
> - * aborted (with -EIO reported to userspace) if set.
> - */
> - unsigned long flags;
> -#define I915_RESET_BACKOFF 0
> -#define I915_RESET_MODESET 1
> -#define I915_RESET_ENGINE 2
> -#define I915_WEDGED (BITS_PER_LONG - 1)
> -
> /** Number of times the device has been reset (global) */
> - u32 reset_count;
> + atomic_t reset_count;
>
> /** Number of times an engine has been reset */
> - u32 reset_engine_count[I915_NUM_ENGINES];
> -
> - struct mutex wedge_mutex; /* serialises wedging/unwedging */
> -
> - /**
> - * Waitqueue to signal when a hang is detected. Used to for waiters
> - * to release the struct_mutex for the reset to procede.
> - */
> - wait_queue_head_t wait_queue;
> -
> - /**
> - * Waitqueue to signal when the reset has completed. Used by clients
> - * that wait for dev_priv->mm.wedged to settle.
> - */
> - wait_queue_head_t reset_queue;
> -
> - struct srcu_struct reset_backoff_srcu;
> + atomic_t reset_engine_count[I915_NUM_ENGINES];
I'd argue that both reset counts should go inside the intel_reset
structure under intel_gt since we're counting resets of the GT and its
components.
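Something along these lines in intel_reset_types.h (sketch, on top of
whatever the new struct already contains):

    struct intel_reset {
        /* ...existing reset state (flags, queue, ...) */

        /** Number of times the GT has been reset (global) */
        atomic_t reset_count;

        /** Number of times each engine has been reset */
        atomic_t reset_engine_count[I915_NUM_ENGINES];
    };

with i915_reset_count()/i915_reset_engine_count() (and the error capture
users) following the counters over.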
> };
>
> struct drm_i915_error_state_buf {
<snip>
> diff --git a/drivers/gpu/drm/i915/selftests/igt_wedge_me.h b/drivers/gpu/drm/i915/selftests/igt_wedge_me.h
> deleted file mode 100644
> index 08e5ff11bbd9..000000000000
> --- a/drivers/gpu/drm/i915/selftests/igt_wedge_me.h
> +++ /dev/null
> @@ -1,58 +0,0 @@
> -/*
> - * SPDX-License-Identifier: MIT
> - *
> - * Copyright © 2018 Intel Corporation
> - */
> -
> -#ifndef IGT_WEDGE_ME_H
> -#define IGT_WEDGE_ME_H
> -
> -#include <linux/workqueue.h>
> -
> -#include "../i915_gem.h"
> -
> -struct drm_i915_private;
> -
> -struct igt_wedge_me {
> - struct delayed_work work;
> - struct drm_i915_private *i915;
> - const char *name;
> -};
> -
> -static void __igt_wedge_me(struct work_struct *work)
> -{
> - struct igt_wedge_me *w = container_of(work, typeof(*w), work.work);
> -
> - pr_err("%s timed out, cancelling test.\n", w->name);
> -
> - GEM_TRACE("%s timed out.\n", w->name);
> - GEM_TRACE_DUMP();
These dumps are not in intel_wedge_me, which replaced this in selftests.
Should we add them there or are they not meaningful enough for us to care?
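If they are worth keeping, I'd expect it to be just a matter of dropping them
into the timeout worker in intel_reset.c, roughly (sketch, assuming the worker
still looks like the old __i915_wedge_me()):

    GEM_TRACE("%s timed out.\n", w->name);
    GEM_TRACE_DUMP();

    intel_gt_set_wedged(w->gt);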
Daniele
> -
> - i915_gem_set_wedged(w->i915);
> -}
> -
> -static void __igt_init_wedge(struct igt_wedge_me *w,
> - struct drm_i915_private *i915,
> - long timeout,
> - const char *name)
> -{
> - w->i915 = i915;
> - w->name = name;
> -
> - INIT_DELAYED_WORK_ONSTACK(&w->work, __igt_wedge_me);
> - schedule_delayed_work(&w->work, timeout);
> -}
> -
> -static void __igt_fini_wedge(struct igt_wedge_me *w)
> -{
> - cancel_delayed_work_sync(&w->work);
> - destroy_delayed_work_on_stack(&w->work);
> - w->i915 = NULL;
> -}
> -
> -#define igt_wedge_on_timeout(W, DEV, TIMEOUT) \
> - for (__igt_init_wedge((W), (DEV), (TIMEOUT), __func__); \
> - (W)->i915; \
> - __igt_fini_wedge((W)))
> -
> -#endif /* IGT_WEDGE_ME_H */
> diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> index 2741805b56c2..fd4cc4809eb8 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> @@ -183,11 +183,6 @@ struct drm_i915_private *mock_gem_device(void)
> intel_gt_init_early(&i915->gt, i915);
> atomic_inc(&i915->gt.wakeref.count); /* disable; no hw support */
>
> - init_waitqueue_head(&i915->gpu_error.wait_queue);
> - init_waitqueue_head(&i915->gpu_error.reset_queue);
> - init_srcu_struct(&i915->gpu_error.reset_backoff_srcu);
> - mutex_init(&i915->gpu_error.wedge_mutex);
> -
> i915->wq = alloc_ordered_workqueue("mock", 0);
> if (!i915->wq)
> goto err_drv;
>