[PATCH 1/4] drm/amdgpu: check if vram is lost v2

Wed May 17 04:37:07 UTC 2017


On 2017年05月16日 18:49, Christian König wrote:
> Am 16.05.2017 um 11:25 schrieb Chunming Zhou:
>> Back up the first 64 bytes of the GART table as the reset magic, and check
>> whether the magic is still the same after a GPU hw reset.
>> v2: use memcmp instead of a manual implementation.
>>
>> Change-Id: I9a73720da4084ea8677c3031dfb62e8157ee5704
>> Signed-off-by: Chunming Zhou <David1.Zhou at amd.com>
>
> Patch #1-#3 are Reviewed-by: Christian König <christian.koenig at amd.com>
Patches #1-#3 are pushed.
>
> Patch #4:
>
> You need to add the new enum on line 591 or otherwise you will get an 
> "unsupported operation" error.
>
> Line 604 should be changed as well or otherwise we need a BO for this 
> operation.
Are you sure you are talking about patch #4? I cannot find what you are referring to.

>
> A libdrm test case to just call this IOCTL would probably be a good idea.
>
> In addition to that, I would ping Marek (Mesa) and Michel (DDX) for
> their opinion on this. It could be that this is completely superfluous
> and the UMDs need something else.
Michel seems to have a different opinion/concern, so maybe we need more
discussion before we add new interfaces.

Thanks,
David Zhou
>
> Regards,
> Christian.
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++++++++-
>>   2 files changed, 21 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index de08ff0..f9da215 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1502,6 +1502,7 @@ struct amdgpu_ssg {
>>   #endif
>>   };
>>   +#define AMDGPU_RESET_MAGIC_NUM 64
>>   struct amdgpu_device {
>>       struct device            *dev;
>>       struct drm_device        *ddev;
>> @@ -1705,6 +1706,7 @@ struct amdgpu_device {
>>         /* record hw reset is performed */
>>       bool has_hw_reset;
>> +    u8                reset_magic[AMDGPU_RESET_MAGIC_NUM];
>>     };
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 0a31fb1..c56ae4a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -1685,6 +1685,17 @@ static int amdgpu_init(struct amdgpu_device 
>> *adev)
>>       return 0;
>>   }
>>   +static void amdgpu_fill_reset_magic(struct amdgpu_device *adev)
>> +{
>> +    memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
>> +}
>> +
>> +static bool amdgpu_check_vram_lost(struct amdgpu_device *adev)
>> +{
>> +    return !!memcmp(adev->gart.ptr, adev->reset_magic,
>> +            AMDGPU_RESET_MAGIC_NUM);
>> +}
>> +
>>   static int amdgpu_late_init(struct amdgpu_device *adev)
>>   {
>>       int i = 0, r;
>> @@ -1715,6 +1726,8 @@ static int amdgpu_late_init(struct 
>> amdgpu_device *adev)
>>           }
>>       }
>>   +    amdgpu_fill_reset_magic(adev);
>> +
>>       return 0;
>>   }
>>   @@ -2830,7 +2843,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>       struct drm_atomic_state *state = NULL;
>>       int i, r;
>>       int resched;
>> -    bool need_full_reset;
>> +    bool need_full_reset, vram_lost = false;
>>         if (amdgpu_sriov_vf(adev))
>>           return amdgpu_sriov_gpu_reset(adev, true);
>> @@ -2899,12 +2912,17 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>               r = amdgpu_resume_phase1(adev);
>>               if (r)
>>                   goto out;
>> +            vram_lost = amdgpu_check_vram_lost(adev);
>> +            if (vram_lost)
>> +                DRM_ERROR("VRAM is lost!\n");
>>               r = amdgpu_ttm_recover_gart(adev);
>>               if (r)
>>                   goto out;
>>               r = amdgpu_resume_phase2(adev);
>>               if (r)
>>                   goto out;
>> +            if (vram_lost)
>> +                amdgpu_fill_reset_magic(adev);
>>           }
>>       }
>>   out:
>
>