[PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests

Alex Deucher alexdeucher at gmail.com
Mon Aug 18 09:03:17 PDT 2014


On Mon, Aug 18, 2014 at 11:02 AM, Christian König
<deathsimple at vodafone.de> wrote:
> Am 18.08.2014 um 16:45 schrieb Maarten Lankhorst:
>
>> This is needed for the next commit, because the lockup detection
>> will need the read lock to run.
>>
>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst at canonical.com>
>> ---
>>   drivers/gpu/drm/radeon/radeon.h        |  2 +-
>>   drivers/gpu/drm/radeon/radeon_device.c | 61
>> ++++++++++++++++++++--------------
>>   2 files changed, 37 insertions(+), 26 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/radeon/radeon.h
>> b/drivers/gpu/drm/radeon/radeon.h
>> index 9e1732eb402c..1d806983ec7b 100644
>> --- a/drivers/gpu/drm/radeon/radeon.h
>> +++ b/drivers/gpu/drm/radeon/radeon.h
>> @@ -2312,7 +2312,7 @@ struct radeon_device {
>>         bool                            need_dma32;
>>         bool                            accel_working;
>>         bool                            fastfb_working; /* IGP feature*/
>> -       bool                            needs_reset;
>> +       bool                            needs_reset, in_reset;
>>         struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES];
>>         const struct firmware *me_fw;   /* all family ME firmware */
>>         const struct firmware *pfp_fw;  /* r6/700 PFP firmware */
>> diff --git a/drivers/gpu/drm/radeon/radeon_device.c
>> b/drivers/gpu/drm/radeon/radeon_device.c
>> index c8ea050c8fa4..82633fdd399d 100644
>> --- a/drivers/gpu/drm/radeon/radeon_device.c
>> +++ b/drivers/gpu/drm/radeon/radeon_device.c
>> @@ -1671,29 +1671,34 @@ int radeon_gpu_reset(struct radeon_device *rdev)
>>         down_write(&rdev->exclusive_lock);
>>         if (!rdev->needs_reset) {
>> +               WARN_ON(rdev->in_reset);
>>                 up_write(&rdev->exclusive_lock);
>>                 return 0;
>>         }
>>         rdev->needs_reset = false;
>> -
>> -       radeon_save_bios_scratch_regs(rdev);
>> -       /* block TTM */
>>         resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
>> -       radeon_pm_suspend(rdev);
>> -       radeon_suspend(rdev);
>>   -     for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>> -               ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
>> -                                                  &ring_data[i]);
>> -               if (ring_sizes[i]) {
>> -                       saved = true;
>> -                       dev_info(rdev->dev, "Saved %d dwords of commands "
>> -                                "on ring %d.\n", ring_sizes[i], i);
>> +       if (!rdev->in_reset) {
>> +               rdev->in_reset = true;
>> +
>> +               radeon_save_bios_scratch_regs(rdev);
>> +               /* block TTM */
>> +               radeon_pm_suspend(rdev);
>> +               radeon_suspend(rdev);
>
>
> That won't work correctly because you might end up with calling pm_resume
> more often than suspend and that can only lead to a crash. Saying this we
> probably already have a bug in the reset code at this point anyway, but see
> below.
>
>
>> +
>> +               for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>> +                       ring_sizes[i] = radeon_ring_backup(rdev,
>> &rdev->ring[i],
>> +                                                          &ring_data[i]);
>> +                       if (ring_sizes[i]) {
>> +                               saved = true;
>> +                               dev_info(rdev->dev, "Saved %d dwords of
>> commands "
>> +                                        "on ring %d.\n", ring_sizes[i],
>> i);
>> +                       }
>>                 }
>> -       }
>> +       } else
>> +               memset(ring_data, 0, sizeof(ring_data));
>>   -retry:
>>         r = radeon_asic_reset(rdev);
>>         if (!r) {
>>                 dev_info(rdev->dev, "GPU reset succeeded, trying to
>> resume\n");
>> @@ -1702,40 +1707,46 @@ retry:
>>         radeon_restore_bios_scratch_regs(rdev);
>
>
> We should resume PM here as well.
>
>
>>   -     if (!r) {
>> +       if (!r && saved) {
>>                 for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>>                         radeon_ring_restore(rdev, &rdev->ring[i],
>>                                             ring_sizes[i], ring_data[i]);
>> -                       ring_sizes[i] = 0;
>>                         ring_data[i] = NULL;
>>                 }
>> +       } else {
>> +               radeon_fence_driver_force_completion(rdev);
>> +
>> +               for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>> +                       kfree(ring_data[i]);
>> +               }
>> +       }
>> +       downgrade_write(&rdev->exclusive_lock);
>> +       ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
>
>
> I would unlock the delayed_workqueue first and then downgrade the readlock.
>
>
>>   +     if (!r) {
>>                 r = radeon_ib_ring_tests(rdev);
>>                 if (r) {
>>                         dev_err(rdev->dev, "ib ring test failed (%d).\n",
>> r);
>>                         if (saved) {
>> -                               saved = false;
>> +                               /* if reset fails, try without saving data
>> */
>> +                               rdev->needs_reset = true;
>>                                 radeon_suspend(rdev);
>> -                               goto retry;
>> +                               up_read(&rdev->exclusive_lock);
>> +                               return -EAGAIN;
>>                         }
>>                 }
>> -       } else {
>> -               radeon_fence_driver_force_completion(rdev);
>> -               for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>> -                       kfree(ring_data[i]);
>> -               }
>>         }
>>
>
>
>>         radeon_pm_resume(rdev);
>
>
> Move this more up.
>
> Alex is more into this, but it's probably a bug in the current reset code
> that this is after the IB tests, cause the IB tests needs everything powered
> up and with PM handling suspended it is possible that individual blocks are
> powered down.

Yeah, looks like a bug.  I think the attached patch should fix it.

Alex

>
> Thanks,
> Christian.
>
>
>>         drm_helper_resume_force_mode(rdev->ddev);
>>   -     ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
>>         if (r) {
>>                 /* bad news, how to tell it to userspace ? */
>>                 dev_info(rdev->dev, "GPU reset failed\n");
>>         }
>>   -     up_write(&rdev->exclusive_lock);
>> +       rdev->in_reset = false;
>> +       up_read(&rdev->exclusive_lock);
>>         return r;
>>   }
>>
>
>
> _______________________________________________
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-drm-radeon-fix-pm-handling-in-radeon_gpu_reset.patch
Type: text/x-diff
Size: 1883 bytes
Desc: not available
URL: <http://lists.freedesktop.org/archives/dri-devel/attachments/20140818/457a5748/attachment.patch>


More information about the dri-devel mailing list