[PATCH 3/3] drm/amdgpu: fix gpu reset issue

Fri Apr 21 11:47:24 UTC 2017

See amdgpu_gpu_reset():

>         /* block scheduler */
>         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                 struct amdgpu_ring *ring = adev->rings[i];
>
>                 if (!ring)
>                         continue;
>                 kthread_park(ring->sched.thread);
>                 amd_sched_hw_job_reset(&ring->sched);
>         }

and

>                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                         struct amdgpu_ring *ring = adev->rings[i];
>                         if (!ring)
>                                 continue;
>
> amd_sched_job_recovery(&ring->sched);
>                         kthread_unpark(ring->sched.thread);
>                 }

Regards,
Christian.

Am 21.04.2017 um 11:54 schrieb He, Hongbo:
>> During GPU reset, IOCTL still  submit command to GPU.
> No it won't. We disable the GPU scheduler during GPU reset, so that isn't an issue.
>            Could you point out the related code? Thanks.
>
>
> Thanks
> Roger(Hongbo.He)
>
> -----Original Message-----
> From: Christian König [mailto:deathsimple at vodafone.de]
> Sent: Friday, April 21, 2017 5:16 PM
> To: He, Hongbo; amd-gfx at lists.freedesktop.org
> Cc: Zhou, David(ChunMing)
> Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
>
>> During GPU reset, IOCTL still  submit command to GPU.
> No it won't. We disable the GPU scheduler during GPU reset, so that isn't an issue.
>
> Do you have a concrete problem (e.g. dmesg?) you ran into?
>
> Regards,
> Christian.
>
> Am 21.04.2017 um 11:00 schrieb He, Hongbo:
>> Hi Christian:
>>
>> During GPU reset, IOCTL still  submit command to GPU.
>> That will result in lots of strange problems when we test this feature.
>>
>> I have no better idea to handle this case. At that time, I have tried with return error directly rather than waiting GPU reset completion.
>> But that doesn't work and it lead to other problems.
>>
>> Thanks
>> Roger(Hongbo.He)
>>
>> -----Original Message-----
>> From: Christian König [mailto:deathsimple at vodafone.de]
>> Sent: Friday, April 21, 2017 4:27 PM
>> To: He, Hongbo; amd-gfx at lists.freedesktop.org
>> Cc: Zhou, David(ChunMing)
>> Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
>>
>> NAK, that is exactly what we wanted to avoid.
>>
>> We used to have an exclusive lock for this and it cause a whole bunch of problems.
>>
>> Please elaborate why that should be necessary.
>>
>> Regards,
>> Christian.
>>
>> Am 21.04.2017 um 09:08 schrieb Roger.He:
>>> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
>>> Signed-off-by: Roger.He <Hongbo.He at amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>>>     4 files changed, 31 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 71364f5..ab0ffa8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>>>     	atomic64_t			num_bytes_moved;
>>>     	atomic64_t			num_evictions;
>>>     	atomic_t			gpu_reset_counter;
>>> +	atomic_t			gpu_reset_state;
>>>     
>>>     	/* data for buffer migration throttling */
>>>     	struct {
>>> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>>     #define amdgpu_psp_check_fw_loading_status(adev, i)
>>> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>>>     
>>>     /* Common functions */
>>> +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev)
>>> +{
>>> +	return atomic_read(&adev->gpu_reset_state);
>>> +}
>>> +
>>>     int amdgpu_gpu_reset(struct amdgpu_device *adev);
>>>     bool amdgpu_need_backup(struct amdgpu_device *adev);
>>>     void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff
>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index f882496..0fb4716 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>     	mutex_init(&adev->grbm_idx_mutex);
>>>     	mutex_init(&adev->mn_lock);
>>>     	hash_init(adev->mn_hash);
>>> +	atomic_set(&adev->gpu_reset_state, 0);
>>>     
>>>     	amdgpu_check_arguments(adev);
>>>     
>>> @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
>>>     }
>>>     
>>>     /**
>>> + * amdgpu_device_set_reset_state - set gpu reset state
>>> + *
>>> + * @adev: amdgpu device pointer
>>> + * @state: true when start to reset gpu; false: reset done  */
>>> +static inline void amdgpu_device_set_reset_state(struct amdgpu_device *adev,
>>> +							bool state)
>>> +{
>>> +	atomic_set(&adev->gpu_reset_state, state); }
>>> +
>>> +/**
>>>      * amdgpu_gpu_reset - reset the asic
>>>      *
>>>      * @adev: amdgpu device pointer
>>> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>>     	}
>>>     
>>>     	atomic_inc(&adev->gpu_reset_counter);
>>> -
>>> +	amdgpu_device_set_reset_state(adev, true);
>>>     	/* block TTM */
>>>     	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>>>     	/* store modesetting */
>>> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>>     		dev_info(adev->dev, "GPU reset failed\n");
>>>     	}
>>>     
>>> +	amdgpu_device_set_reset_state(adev, false);
>>>     	return r;
>>>     }
>>>     
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index ead00d7..8cc14af 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>>>     	struct drm_file *file_priv = filp->private_data;
>>>     	struct drm_device *dev;
>>>     	long ret;
>>> +
>>>     	dev = file_priv->minor->dev;
>>>     	ret = pm_runtime_get_sync(dev->dev);
>>>     	if (ret < 0)
>>>     		return ret;
>>>     
>>> +	while (amdgpu_device_is_reset(dev->dev_private))
>>> +		msleep(100);
>>> +
>>>     	ret = drm_ioctl(filp, cmd, arg);
>>>     
>>>     	pm_runtime_mark_last_busy(dev->dev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> index 2648291..22b8059 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
>>> @@ -36,10 +36,15 @@
>>>     long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>>>     {
>>>     	unsigned int nr = DRM_IOCTL_NR(cmd);
>>> +	struct drm_file *file_priv = filp->private_data;
>>> +	struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
>>>     	int ret;
>>>     
>>> -	if (nr < DRM_COMMAND_BASE)
>>> +	if (nr < DRM_COMMAND_BASE) {
>>> +		while (amdgpu_device_is_reset(adev))
>>> +			msleep(100);
>>>     		return drm_compat_ioctl(filp, cmd, arg);
>>> +	}
>>>     
>>>     	ret = amdgpu_drm_ioctl(filp, cmd, arg);
>>>     
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx