[PATCH 2/3] drm/amdgpu: return -ENODEV to user space when vram is lost

zhoucm1 david1.zhou at amd.com
Tue May 16 09:12:13 UTC 2017



On 2017年05月16日 16:13, Christian König wrote:
> Am 16.05.2017 um 08:44 schrieb Chunming Zhou:
>> below ioctl will return -ENODEV:
>> amdgpu_cs_ioctl
>> amdgpu_cs_wait_ioctl
>> amdgpu_cs_wait_fences_ioctl
>> amdgpu_gem_va_ioctl
>> amdgpu_info_ioctl
>>
>> Change-Id: I8970cde3301b7cfeb4263cc0f0e54aece215c98e
>> Signed-off-by: Chunming Zhou <David1.Zhou at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  9 +++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 +++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    |  2 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    | 10 ++++++++++
>>   5 files changed, 28 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index f9da215..dcd6203 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -855,6 +855,7 @@ struct amdgpu_fpriv {
>>       struct amdgpu_ctx_mgr    ctx_mgr;
>>       spinlock_t        sem_handles_lock;
>>       struct idr        sem_handles;
>> +    u32            vram_lost_counter;
>>   };
>>     /*
>> @@ -1607,6 +1608,7 @@ struct amdgpu_device {
>>       atomic64_t            num_bytes_moved;
>>       atomic64_t            num_evictions;
>>       atomic_t            gpu_reset_counter;
>> +    atomic_t            vram_lost_counter;
>>         /* data for buffer migration throttling */
>>       struct {
>> @@ -2005,6 +2007,8 @@ static inline void 
>> amdgpu_unregister_atpx_handler(void) {}
>>   extern const struct drm_ioctl_desc amdgpu_ioctls_kms[];
>>   extern const int amdgpu_max_kms_ioctl;
>>   +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
>> +              struct amdgpu_fpriv *fpriv);
>>   int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long 
>> flags);
>>   int amdgpu_driver_unload_kms(struct drm_device *dev);
>>   void amdgpu_driver_lastclose_kms(struct drm_device *dev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index b803412..911aa02 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -1097,6 +1097,7 @@ static int amdgpu_cs_submit(struct 
>> amdgpu_cs_parser *p,
>>   int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct 
>> drm_file *filp)
>>   {
>>       struct amdgpu_device *adev = dev->dev_private;
>> +    struct amdgpu_fpriv *fpriv = filp->driver_priv;
>>       union drm_amdgpu_cs *cs = data;
>>       struct amdgpu_cs_parser parser = {};
>>       bool reserved_buffers = false;
>> @@ -1104,6 +1105,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, 
>> void *data, struct drm_file *filp)
>>         if (!adev->accel_working)
>>           return -EBUSY;
>> +    if (amdgpu_kms_vram_lost(adev, fpriv))
>> +        return -ENODEV;
>>         parser.adev = adev;
>>       parser.filp = filp;
>> @@ -1165,12 +1168,15 @@ int amdgpu_cs_wait_ioctl(struct drm_device 
>> *dev, void *data,
>>   {
>>       union drm_amdgpu_wait_cs *wait = data;
>>       struct amdgpu_device *adev = dev->dev_private;
>> +    struct amdgpu_fpriv *fpriv = filp->driver_priv;
>>       unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout);
>>       struct amdgpu_ring *ring = NULL;
>>       struct amdgpu_ctx *ctx;
>>       struct fence *fence;
>>       long r;
>>   +    if (amdgpu_kms_vram_lost(adev, fpriv))
>> +        return -ENODEV;
>>       r = amdgpu_cs_get_ring(adev, wait->in.ip_type, 
>> wait->in.ip_instance,
>>                      wait->in.ring, &ring);
>>       if (r)
>> @@ -1344,12 +1350,15 @@ int amdgpu_cs_wait_fences_ioctl(struct 
>> drm_device *dev, void *data,
>>                   struct drm_file *filp)
>>   {
>>       struct amdgpu_device *adev = dev->dev_private;
>> +    struct amdgpu_fpriv *fpriv = filp->driver_priv;
>>       union drm_amdgpu_wait_fences *wait = data;
>>       uint32_t fence_count = wait->in.fence_count;
>>       struct drm_amdgpu_fence *fences_user;
>>       struct drm_amdgpu_fence *fences;
>>       int r;
>>   +    if (amdgpu_kms_vram_lost(adev, fpriv))
>> +        return -ENODEV;
>>       /* Get the fences from userspace */
>>       fences = kmalloc_array(fence_count, sizeof(struct 
>> drm_amdgpu_fence),
>>               GFP_KERNEL);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 58af9ea..417b8f8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2919,8 +2919,10 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>>               if (r)
>>                   goto out;
>>               vram_lost = amdgpu_check_vram_lost(adev);
>> -            if (vram_lost)
>> +            if (vram_lost) {
>>                   DRM_ERROR("VRAM is lost!\n");
>> +                atomic_inc(&adev->vram_lost_counter);
>> +            }
>>               r = amdgpu_ttm_recover_gart(adev);
>>               if (r)
>>                   goto out;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index d8275ef..0f0b736 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> @@ -776,6 +776,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, 
>> void *data,
>>         if (!adev->vm_manager.enabled)
>>           return -ENOTTY;
>> +    if (amdgpu_kms_vram_lost(adev, fpriv))
>> +        return -ENODEV;
>
> We should only block AMDGPU_VA_OP_MAP and AMDGPU_VA_OP_REPLACE here 
> and still allow AMDGPU_VA_OP_UNMAP and AMDGPU_VA_OP_CLEAR.
>
> BTW: How should the UMD recover from that situation? Completely close 
> the fd and recreate it?
>
> That might be tricky for processes like X or the Compositor. Should we 
> have an IOCTL to reset the vram_lost counter for an fd?
if no NAK, I can add it.

Regards,
David Zhou
>
> Christian.
>
>>         if (args->va_address < AMDGPU_VA_RESERVED_SIZE) {
>>           dev_err(&dev->pdev->dev,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> index 368829a..a231aa1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> @@ -235,6 +235,7 @@ static int amdgpu_firmware_info(struct 
>> drm_amdgpu_info_firmware *fw_info,
>>   static int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
>> struct drm_file *filp)
>>   {
>>       struct amdgpu_device *adev = dev->dev_private;
>> +    struct amdgpu_fpriv *fpriv = filp->driver_priv;
>>       struct drm_amdgpu_info *info = data;
>>       struct amdgpu_mode_info *minfo = &adev->mode_info;
>>       void __user *out = (void __user *)(uintptr_t)info->return_pointer;
>> @@ -247,6 +248,8 @@ static int amdgpu_info_ioctl(struct drm_device 
>> *dev, void *data, struct drm_file
>>         if (!info->return_size || !info->return_pointer)
>>           return -EINVAL;
>> +    if (amdgpu_kms_vram_lost(adev, fpriv))
>> +        return -ENODEV;
>>         switch (info->query) {
>>       case AMDGPU_INFO_VIRTUAL_RANGE: {
>> @@ -779,6 +782,12 @@ void amdgpu_driver_lastclose_kms(struct 
>> drm_device *dev)
>>       vga_switcheroo_process_delayed_switch();
>>   }
>>   +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
>> +              struct amdgpu_fpriv *fpriv)
>> +{
>> +    return fpriv->vram_lost_counter != 
>> atomic_read(&adev->vram_lost_counter);
>> +}
>> +
>>   /**
>>    * amdgpu_driver_open_kms - drm callback for open
>>    *
>> @@ -833,6 +842,7 @@ int amdgpu_driver_open_kms(struct drm_device 
>> *dev, struct drm_file *file_priv)
>>         amdgpu_ctx_mgr_init(&fpriv->ctx_mgr);
>>   +    fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>>       file_priv->driver_priv = fpriv;
>>     out_suspend:
>
>



More information about the amd-gfx mailing list