[Intel-gfx] [PATCH] drm/i915: Skip the ERR_PTR error state

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Fri Dec 7 10:45:17 UTC 2018


On 07/12/2018 09:16, Chris Wilson wrote:
> Although commit fb6f0b64e455 ("drm/i915: Prevent machine hang from
> Broxton's vtd w/a and error capture") applied cleanly after a 24 month
> hiatus, the code had moved on with new methods for peeking and fetching
> the captured gpu info. Make sure we catch all uses of the stashed error
> state and avoid dereferencing the error pointer.
> 
> v2: Move error pointer determination into i915_gpu_capture_state
> 
> Fixes: fb6f0b64e455 ("drm/i915: Prevent machine hang from Broxton's vtd w/a and error capture")
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> ---
>   drivers/gpu/drm/i915/i915_debugfs.c   | 12 +++++++++---
>   drivers/gpu/drm/i915/i915_gpu_error.c | 26 ++++++++++++++------------
>   drivers/gpu/drm/i915/i915_sysfs.c     |  4 +++-
>   3 files changed, 26 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 38dcee1ca062..40a61ef9aac1 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -984,8 +984,8 @@ static int i915_gpu_info_open(struct inode *inode, struct file *file)
>   	intel_runtime_pm_get(i915);
>   	gpu = i915_capture_gpu_state(i915);
>   	intel_runtime_pm_put(i915);
> -	if (!gpu)
> -		return -ENOMEM;
> +	if (IS_ERR(gpu))
> +		return PTR_ERR(gpu);
>   
>   	file->private_data = gpu;
>   	return 0;
> @@ -1018,7 +1018,13 @@ i915_error_state_write(struct file *filp,
>   
>   static int i915_error_state_open(struct inode *inode, struct file *file)
>   {
> -	file->private_data = i915_first_error_state(inode->i_private);
> +	struct i915_gpu_state *error;
> +
> +	error = i915_first_error_state(inode->i_private);
> +	if (IS_ERR(error))
> +		return PTR_ERR(error);
> +
> +	file->private_data  = error;
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 07465123c166..9dc6600544c5 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1907,9 +1907,16 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
>   {
>   	struct i915_gpu_state *error;
>   
> +	/* Check if GPU capture has been disabled */
> +	error = READ_ONCE(i915->gpu_error.first_error);
> +	if (IS_ERR(error))
> +		return error;
> +
>   	error = kzalloc(sizeof(*error), GFP_ATOMIC);
> -	if (!error)
> -		return NULL;
> +	if (!error) {
> +		i915_disable_error_state(i915, -ENOMEM);
> +		return ERR_PTR(-ENOMEM);
> +	}
>   
>   	kref_init(&error->ref);
>   	error->i915 = i915;
> @@ -1941,15 +1948,9 @@ void i915_capture_error_state(struct drm_i915_private *i915,
>   	if (!i915_modparams.error_capture)
>   		return;
>   
> -	if (READ_ONCE(i915->gpu_error.first_error))
> -		return;
> -

Will it now overwrite the first error on subsequent hangs? 
i915_capture_gpu_state only does early exit on stored error.

Regards,

Tvrtko

>   	error = i915_capture_gpu_state(i915);
> -	if (!error) {
> -		DRM_DEBUG_DRIVER("out of memory, not capturing error state\n");
> -		i915_disable_error_state(i915, -ENOMEM);
> +	if (IS_ERR(error))
>   		return;
> -	}
>   
>   	i915_error_capture_msg(i915, error, engine_mask, error_msg);
>   	DRM_INFO("%s\n", error->error_msg);
> @@ -1987,7 +1988,7 @@ i915_first_error_state(struct drm_i915_private *i915)
>   
>   	spin_lock_irq(&i915->gpu_error.lock);
>   	error = i915->gpu_error.first_error;
> -	if (error)
> +	if (!IS_ERR_OR_NULL(error))
>   		i915_gpu_state_get(error);
>   	spin_unlock_irq(&i915->gpu_error.lock);
>   
> @@ -2000,10 +2001,11 @@ void i915_reset_error_state(struct drm_i915_private *i915)
>   
>   	spin_lock_irq(&i915->gpu_error.lock);
>   	error = i915->gpu_error.first_error;
> -	i915->gpu_error.first_error = NULL;
> +	if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */
> +		i915->gpu_error.first_error = NULL;
>   	spin_unlock_irq(&i915->gpu_error.lock);
>   
> -	if (!IS_ERR(error))
> +	if (!IS_ERR_OR_NULL(error))
>   		i915_gpu_state_put(error);
>   }
>   
> diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
> index 535caebd9813..c0cfe7ae2ba5 100644
> --- a/drivers/gpu/drm/i915/i915_sysfs.c
> +++ b/drivers/gpu/drm/i915/i915_sysfs.c
> @@ -521,7 +521,9 @@ static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
>   	ssize_t ret;
>   
>   	gpu = i915_first_error_state(i915);
> -	if (gpu) {
> +	if (IS_ERR(gpu)) {
> +		ret = PTR_ERR(gpu);
> +	} else if (gpu) {
>   		ret = i915_gpu_state_copy_to_buffer(gpu, buf, off, count);
>   		i915_gpu_state_put(gpu);
>   	} else {
> 


More information about the Intel-gfx mailing list