[PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests
Alex Deucher
alexdeucher at gmail.com
Mon Aug 18 09:10:03 PDT 2014
On Mon, Aug 18, 2014 at 12:07 PM, Christian König
<deathsimple at vodafone.de> wrote:
>> Yeah, looks like a bug. I think the attached patch should fix it.
>
> Sounds logical and the patch is Reviewed-by: Christian König
> <christian.koenig at amd.com>
>
> Going to apply Maarten's patch on top and test that one a bit to make sure it
> works as expected.
I've pushed my current -fixes queue to my drm-fixes-3.17-wip branch if that helps.
Alex
>
> Regards,
> Christian.
>
> On 18.08.2014 at 18:03, Alex Deucher wrote:
>
>> On Mon, Aug 18, 2014 at 11:02 AM, Christian König
>> <deathsimple at vodafone.de> wrote:
>>>
>>> On 18.08.2014 at 16:45, Maarten Lankhorst wrote:
>>>
>>>> This is needed for the next commit, because the lockup detection
>>>> will need the read lock to run.
>>>>
>>>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst at canonical.com>
>>>> ---
>>>> drivers/gpu/drm/radeon/radeon.h | 2 +-
>>>> drivers/gpu/drm/radeon/radeon_device.c | 61 ++++++++++++++++++++--------------
>>>> 2 files changed, 37 insertions(+), 26 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
>>>> index 9e1732eb402c..1d806983ec7b 100644
>>>> --- a/drivers/gpu/drm/radeon/radeon.h
>>>> +++ b/drivers/gpu/drm/radeon/radeon.h
>>>> @@ -2312,7 +2312,7 @@ struct radeon_device {
>>>> bool need_dma32;
>>>> bool accel_working;
>>>> bool fastfb_working; /* IGP feature*/
>>>> - bool needs_reset;
>>>> + bool needs_reset, in_reset;
>>>> struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES];
>>>> const struct firmware *me_fw; /* all family ME firmware */
>>>> const struct firmware *pfp_fw; /* r6/700 PFP firmware */
>>>> diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
>>>> index c8ea050c8fa4..82633fdd399d 100644
>>>> --- a/drivers/gpu/drm/radeon/radeon_device.c
>>>> +++ b/drivers/gpu/drm/radeon/radeon_device.c
>>>> @@ -1671,29 +1671,34 @@ int radeon_gpu_reset(struct radeon_device *rdev)
>>>> down_write(&rdev->exclusive_lock);
>>>> if (!rdev->needs_reset) {
>>>> + WARN_ON(rdev->in_reset);
>>>> up_write(&rdev->exclusive_lock);
>>>> return 0;
>>>> }
>>>> rdev->needs_reset = false;
>>>> -
>>>> - radeon_save_bios_scratch_regs(rdev);
>>>> - /* block TTM */
>>>> resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
>>>> - radeon_pm_suspend(rdev);
>>>> - radeon_suspend(rdev);
>>>> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>>>> - ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
>>>> - &ring_data[i]);
>>>> - if (ring_sizes[i]) {
>>>> - saved = true;
>>>> - dev_info(rdev->dev, "Saved %d dwords of commands "
>>>> - "on ring %d.\n", ring_sizes[i], i);
>>>> + if (!rdev->in_reset) {
>>>> + rdev->in_reset = true;
>>>> +
>>>> + radeon_save_bios_scratch_regs(rdev);
>>>> + /* block TTM */
>>>> + radeon_pm_suspend(rdev);
>>>> + radeon_suspend(rdev);
>>>
>>>
>>> That won't work correctly because you might end up calling pm_resume more
>>> often than suspend, and that can only lead to a crash. That said, we
>>> probably already have a bug in the reset code at this point anyway, but
>>> see below.
>>>
>>>
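For reference, a rough sketch of how the resume side could mirror the
suspend guard (illustration only, not necessarily what Christian has in
mind here):

	/* suspend side, as in the hunk above */
	if (!rdev->in_reset) {
		rdev->in_reset = true;
		radeon_pm_suspend(rdev);
		radeon_suspend(rdev);
		/* ... back up rings ... */
	}

	/* ... reset / restore / IB tests ... */

	/* resume side: only the pass that finishes the reset resumes PM,
	 * so each radeon_pm_suspend() gets exactly one radeon_pm_resume() */
	if (rdev->in_reset) {
		radeon_pm_resume(rdev);
		rdev->in_reset = false;
	}
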
>>>> +
>>>> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>>>> + ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
>>>> + &ring_data[i]);
>>>> + if (ring_sizes[i]) {
>>>> + saved = true;
>>>> + dev_info(rdev->dev, "Saved %d dwords of commands "
>>>> + "on ring %d.\n", ring_sizes[i], i);
>>>> + }
>>>> }
>>>> - }
>>>> + } else
>>>> + memset(ring_data, 0, sizeof(ring_data));
>>>> -retry:
>>>> r = radeon_asic_reset(rdev);
>>>> if (!r) {
>>>> dev_info(rdev->dev, "GPU reset succeeded, trying to
>>>> resume\n");
>>>> @@ -1702,40 +1707,46 @@ retry:
>>>> radeon_restore_bios_scratch_regs(rdev);
>>>
>>>
>>> We should resume PM here as well.
>>>
>>>
>>>> - if (!r) {
>>>> + if (!r && saved) {
>>>> for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>>>> radeon_ring_restore(rdev, &rdev->ring[i],
>>>> ring_sizes[i], ring_data[i]);
>>>> - ring_sizes[i] = 0;
>>>> ring_data[i] = NULL;
>>>> }
>>>> + } else {
>>>> + radeon_fence_driver_force_completion(rdev);
>>>> +
>>>> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>>>> + kfree(ring_data[i]);
>>>> + }
>>>> + }
>>>> + downgrade_write(&rdev->exclusive_lock);
>>>> + ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
>>>
>>>
>>> I would unlock the delayed_workqueue first and then downgrade the
>>> read lock.
>>>
>>>
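i.e. something like this (sketch of the suggested ordering only, untested):

	/* restart TTM's delayed workqueue while the lock is still held in
	 * write mode, then drop to read mode for the IB tests */
	ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
	downgrade_write(&rdev->exclusive_lock);
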
>>>> + if (!r) {
>>>> r = radeon_ib_ring_tests(rdev);
>>>> if (r) {
>>>> dev_err(rdev->dev, "ib ring test failed
>>>> (%d).\n",
>>>> r);
>>>> if (saved) {
>>>> - saved = false;
>>>> + /* if reset fails, try without saving data */
>>>> + rdev->needs_reset = true;
>>>> radeon_suspend(rdev);
>>>> - goto retry;
>>>> + up_read(&rdev->exclusive_lock);
>>>> + return -EAGAIN;
>>>> }
>>>> }
>>>> - } else {
>>>> - radeon_fence_driver_force_completion(rdev);
>>>> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>>>> - kfree(ring_data[i]);
>>>> - }
>>>> }
>>>>
>>>
>>>> radeon_pm_resume(rdev);
>>>
>>>
>>> Move this further up.
>>>
>>> Alex knows this better than I do, but it's probably a bug in the current
>>> reset code that this comes after the IB tests, because the IB tests need
>>> everything powered up, and with PM handling suspended it is possible that
>>> individual blocks are powered down.
>>
>> Yeah, looks like a bug. I think the attached patch should fix it.
>>
>> Alex
>>
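Not the attached patch, just a rough sketch of the ordering Christian is
describing, with PM resumed before any IB tests run:

	/* PM back up before the IB tests, since they need all blocks powered */
	radeon_pm_resume(rdev);

	if (!r) {
		r = radeon_ib_ring_tests(rdev);
		if (r)
			dev_err(rdev->dev, "ib ring test failed (%d).\n", r);
		/* ... */
	}

	drm_helper_resume_force_mode(rdev->ddev);
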
>>> Thanks,
>>> Christian.
>>>
>>>
>>>> drm_helper_resume_force_mode(rdev->ddev);
>>>> - ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
>>>> if (r) {
>>>> /* bad news, how to tell it to userspace ? */
>>>> dev_info(rdev->dev, "GPU reset failed\n");
>>>> }
>>>> - up_write(&rdev->exclusive_lock);
>>>> + rdev->in_reset = false;
>>>> + up_read(&rdev->exclusive_lock);
>>>> return r;
>>>> }
>>>>
>>>
>>> _______________________________________________
>>> dri-devel mailing list
>>> dri-devel at lists.freedesktop.org
>>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>
>