[PATCH 1/3] drm/radeon: take exclusive_lock in read mode during ring tests
Alex Deucher
alexdeucher at gmail.com
Mon Aug 18 09:03:17 PDT 2014
On Mon, Aug 18, 2014 at 11:02 AM, Christian König
<deathsimple at vodafone.de> wrote:
> Am 18.08.2014 um 16:45 schrieb Maarten Lankhorst:
>
>> This is needed for the next commit, because the lockup detection
>> will need the read lock to run.
>>
>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst at canonical.com>
>> ---
>> drivers/gpu/drm/radeon/radeon.h | 2 +-
>> drivers/gpu/drm/radeon/radeon_device.c | 61
>> ++++++++++++++++++++--------------
>> 2 files changed, 37 insertions(+), 26 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/radeon/radeon.h
>> b/drivers/gpu/drm/radeon/radeon.h
>> index 9e1732eb402c..1d806983ec7b 100644
>> --- a/drivers/gpu/drm/radeon/radeon.h
>> +++ b/drivers/gpu/drm/radeon/radeon.h
>> @@ -2312,7 +2312,7 @@ struct radeon_device {
>> bool need_dma32;
>> bool accel_working;
>> bool fastfb_working; /* IGP feature*/
>> - bool needs_reset;
>> + bool needs_reset, in_reset;
>> struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES];
>> const struct firmware *me_fw; /* all family ME firmware */
>> const struct firmware *pfp_fw; /* r6/700 PFP firmware */
>> diff --git a/drivers/gpu/drm/radeon/radeon_device.c
>> b/drivers/gpu/drm/radeon/radeon_device.c
>> index c8ea050c8fa4..82633fdd399d 100644
>> --- a/drivers/gpu/drm/radeon/radeon_device.c
>> +++ b/drivers/gpu/drm/radeon/radeon_device.c
>> @@ -1671,29 +1671,34 @@ int radeon_gpu_reset(struct radeon_device *rdev)
>> down_write(&rdev->exclusive_lock);
>> if (!rdev->needs_reset) {
>> + WARN_ON(rdev->in_reset);
>> up_write(&rdev->exclusive_lock);
>> return 0;
>> }
>> rdev->needs_reset = false;
>> -
>> - radeon_save_bios_scratch_regs(rdev);
>> - /* block TTM */
>> resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
>> - radeon_pm_suspend(rdev);
>> - radeon_suspend(rdev);
>> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>> - ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
>> - &ring_data[i]);
>> - if (ring_sizes[i]) {
>> - saved = true;
>> - dev_info(rdev->dev, "Saved %d dwords of commands "
>> - "on ring %d.\n", ring_sizes[i], i);
>> + if (!rdev->in_reset) {
>> + rdev->in_reset = true;
>> +
>> + radeon_save_bios_scratch_regs(rdev);
>> + /* block TTM */
>> + radeon_pm_suspend(rdev);
>> + radeon_suspend(rdev);
>
>
> That won't work correctly because you might end up calling pm_resume
> more often than suspend, and that can only lead to a crash. That said, we
> probably already have a bug in the reset code at this point anyway, but see
> below.
>
>
>> +
>> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>> + ring_sizes[i] = radeon_ring_backup(rdev,
>> &rdev->ring[i],
>> + &ring_data[i]);
>> + if (ring_sizes[i]) {
>> + saved = true;
>> + dev_info(rdev->dev, "Saved %d dwords of
>> commands "
>> + "on ring %d.\n", ring_sizes[i],
>> i);
>> + }
>> }
>> - }
>> + } else
>> + memset(ring_data, 0, sizeof(ring_data));
>> -retry:
>> r = radeon_asic_reset(rdev);
>> if (!r) {
>> dev_info(rdev->dev, "GPU reset succeeded, trying to
>> resume\n");
>> @@ -1702,40 +1707,46 @@ retry:
>> radeon_restore_bios_scratch_regs(rdev);
>
>
> We should resume PM here as well.
>
>
>> - if (!r) {
>> + if (!r && saved) {
>> for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>> radeon_ring_restore(rdev, &rdev->ring[i],
>> ring_sizes[i], ring_data[i]);
>> - ring_sizes[i] = 0;
>> ring_data[i] = NULL;
>> }
>> + } else {
>> + radeon_fence_driver_force_completion(rdev);
>> +
>> + for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>> + kfree(ring_data[i]);
>> + }
>> + }
>> + downgrade_write(&rdev->exclusive_lock);
>> + ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
>
>
> I would unlock the delayed_workqueue first and then downgrade the readlock.
>
>
>> + if (!r) {
>> r = radeon_ib_ring_tests(rdev);
>> if (r) {
>> dev_err(rdev->dev, "ib ring test failed (%d).\n",
>> r);
>> if (saved) {
>> - saved = false;
>> + /* if reset fails, try without saving data
>> */
>> + rdev->needs_reset = true;
>> radeon_suspend(rdev);
>> - goto retry;
>> + up_read(&rdev->exclusive_lock);
>> + return -EAGAIN;
>> }
>> }
>> - } else {
>> - radeon_fence_driver_force_completion(rdev);
>> - for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>> - kfree(ring_data[i]);
>> - }
>> }
>>
>
>
>> radeon_pm_resume(rdev);
>
>
> Move this more up.
>
> Alex is more into this, but it's probably a bug in the current reset code
> that this comes after the IB tests, because the IB tests need everything
> powered up, and with PM handling suspended it is possible that individual
> blocks are powered down.
Yeah, looks like a bug. I think the attached patch should fix it.
Alex
>
> Thanks,
> Christian.
>
>
>> drm_helper_resume_force_mode(rdev->ddev);
>> - ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
>> if (r) {
>> /* bad news, how to tell it to userspace ? */
>> dev_info(rdev->dev, "GPU reset failed\n");
>> }
>> - up_write(&rdev->exclusive_lock);
>> + rdev->in_reset = false;
>> + up_read(&rdev->exclusive_lock);
>> return r;
>> }
>>
>
>
> _______________________________________________
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-drm-radeon-fix-pm-handling-in-radeon_gpu_reset.patch
Type: text/x-diff
Size: 1883 bytes
Desc: not available
URL: <http://lists.freedesktop.org/archives/dri-devel/attachments/20140818/457a5748/attachment.patch>
More information about the dri-devel
mailing list