[PATCH v2 3/3] drm/amdgpu: recover gart table at resume

Christian König christian.koenig at amd.com
Wed Oct 20 10:51:13 UTC 2021



Am 20.10.21 um 12:21 schrieb Das, Nirmoy:
>
> On 10/20/2021 12:15 PM, Lazar, Lijo wrote:
>>
>>
>> On 10/20/2021 3:42 PM, Das, Nirmoy wrote:
>>>
>>> On 10/20/2021 12:03 PM, Lazar, Lijo wrote:
>>>>
>>>>
>>>> On 10/20/2021 3:23 PM, Das, Nirmoy wrote:
>>>>>
>>>>> On 10/20/2021 11:11 AM, Lazar, Lijo wrote:
>>>>>>
>>>>>>
>>>>>> On 10/19/2021 11:44 PM, Nirmoy Das wrote:
>>>>>>> Get rid off pin/unpin of gart BO at resume/suspend and
>>>>>>> instead pin only once and try to recover gart content
>>>>>>> at resume time. This is much more stable in case there
>>>>>>> is OOM situation at 2nd call to amdgpu_device_evict_resources()
>>>>>>> while evicting GART table.
>>>>>>>
>>>>>>> Signed-off-by: Nirmoy Das <nirmoy.das at amd.com>
>>>>>>> ---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 
>>>>>>> ++++++++++++----------
>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c     |  9 ++---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c      | 10 +++---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c      | 10 +++---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  9 ++---
>>>>>>>   6 files changed, 45 insertions(+), 39 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> index 5807df52031c..f69e613805db 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> @@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct 
>>>>>>> drm_device *dev, bool fbcon)
>>>>>>>       amdgpu_fence_driver_hw_fini(adev);
>>>>>>>
>>>>>>>       amdgpu_device_ip_suspend_phase2(adev);
>>>>>>> -    /* This second call to evict device resources is to evict
>>>>>>> -     * the gart page table using the CPU.
>>>>>>> -     */
>>>>>>> -    amdgpu_device_evict_resources(adev);
>>>>>>>
>>>>>>>       return 0;
>>>>>>>   }
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>>> index d3e4203f6217..97a9f61fa106 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>>> @@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>    *
>>>>>>>    * @adev: amdgpu_device pointer
>>>>>>>    *
>>>>>>> - * Allocate video memory for GART page table
>>>>>>> + * Allocate and pin video memory for GART page table
>>>>>>>    * (pcie r4xx, r5xx+).  These asics require the
>>>>>>>    * gart table to be in video memory.
>>>>>>>    * Returns 0 for success, error for failure.
>>>>>>>    */
>>>>>>>   int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
>>>>>>>   {
>>>>>>> +    struct amdgpu_bo_param bp;
>>>>>>>       int r;
>>>>>>>
>>>>>>> -    if (adev->gart.bo == NULL) {
>>>>>>> -        struct amdgpu_bo_param bp;
>>>>>>> -
>>>>>>> -        memset(&bp, 0, sizeof(bp));
>>>>>>> -        bp.size = adev->gart.table_size;
>>>>>>> -        bp.byte_align = PAGE_SIZE;
>>>>>>> -        bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>>>>>> -        bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>>>>> -            AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>>>>> -        bp.type = ttm_bo_type_kernel;
>>>>>>> -        bp.resv = NULL;
>>>>>>> -        bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>>>>>> -
>>>>>>> -        r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>>>>>> -        if (r) {
>>>>>>> -            return r;
>>>>>>> -        }
>>>>>>> -    }
>>>>>>> +    if (adev->gart.bo != NULL)
>>>>>>> +        return 0;
>>>>>>> +
>>>>>>> +    memset(&bp, 0, sizeof(bp));
>>>>>>> +    bp.size = adev->gart.table_size;
>>>>>>> +    bp.byte_align = PAGE_SIZE;
>>>>>>> +    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>>>>>> +    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>>>>> +        AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>>>>> +    bp.type = ttm_bo_type_kernel;
>>>>>>> +    bp.resv = NULL;
>>>>>>> +    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>>>>>> +
>>>>>>> +    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>>>>>> +    if (r)
>>>>>>> +        return r;
>>>>>>> +
>>>>>>> +    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>> +    if (r)
>>>>>>> +        return r;
>>>>>>> +
>>>>>>>       return 0;
>>>>>>>   }
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>>> index 3ec5ff5a6dbe..75d584e1b0e9 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>>> @@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>           return -EINVAL;
>>>>>>>       }
>>>>>>>
>>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>> -    if (r)
>>>>>>> -        return r;
>>>>>>> +    if (adev->in_suspend) {
>>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>>
>>>>>> When the existing usage of this function is checked, this is 
>>>>>> called during reset recovery after ip resume phase1. Can't the 
>>>>>> same thing be done in ip_resume() to place this after phase1 
>>>>>> resume instead of repeating in every IP version?
>>>>>
>>>>>
>>>>> Placing amdgpu_gtt_mgr_recover() after phase1 generally works but 
>>>>> gmc_v10_0_gart_enable() seems to be correct place  to do this
>>>>>
>>>>> gart specific work.
>>>>>
>>>>
>>>> I see. In that case probably the patch needs to change other call 
>>>> places also as this step is already taken care in gart enable.
>>>
>>>
>>> Do you mean amdgpu_do_asic_reset() ?
>>>
>>
>> Yes, and saw it called in one more place related to sriov reset 
>> (didn't track the sriov reset path though).
>
>
> True, hmm looks like this patch going to need multiple tested-by tags 
> for gfx6,7 and sriov. I only have gfx8,9,10.

You also need to test this on APUs as well, when it works won Raven/gfx9 
I'm pretty sure it will work on other generations as well (except for 
typos of course).

Christian.

>
>
> Regards,
>
> Nirmoy
>
>
>>
>> Thanks,
>> Lijo
>>
>>>
>>> Nirmoy
>>>
>>>
>>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>>>
>>>>> Regards,
>>>>>
>>>>> Nirmoy
>>>>>
>>>>>
>>>>>
>>>>>>
>>>>>> Thanks,
>>>>>> Lijo
>>>>>>
>>>>>>> +        if (r)
>>>>>>> +            return r;
>>>>>>> +    }
>>>>>>>
>>>>>>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>>>>>>       if (r)
>>>>>>> @@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>   {
>>>>>>>       adev->gfxhub.funcs->gart_disable(adev);
>>>>>>>       adev->mmhub.funcs->gart_disable(adev);
>>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>>   }
>>>>>>>
>>>>>>>   static int gmc_v10_0_hw_fini(void *handle)
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>>> index 0a50fdaced7e..02e90d9443c1 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>>> @@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>>>>>           return -EINVAL;
>>>>>>>       }
>>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>> -    if (r)
>>>>>>> -        return r;
>>>>>>> +
>>>>>>> +    if (adev->in_suspend) {
>>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>>> +        if (r)
>>>>>>> +            return r;
>>>>>>> +    }
>>>>>>>
>>>>>>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>>>>>
>>>>>>> @@ -758,7 +761,6 @@ static void gmc_v7_0_gart_disable(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>>>>>       WREG32(mmVM_L2_CNTL, tmp);
>>>>>>>       WREG32(mmVM_L2_CNTL2, 0);
>>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>>   }
>>>>>>>
>>>>>>>   /**
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>>> index 492ebed2915b..dc2577e37688 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>>> @@ -837,9 +837,12 @@ static int gmc_v8_0_gart_enable(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>>>>>           return -EINVAL;
>>>>>>>       }
>>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>> -    if (r)
>>>>>>> -        return r;
>>>>>>> +
>>>>>>> +    if (adev->in_suspend) {
>>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>>> +        if (r)
>>>>>>> +            return r;
>>>>>>> +    }
>>>>>>>
>>>>>>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>>>>>
>>>>>>> @@ -992,7 +995,6 @@ static void gmc_v8_0_gart_disable(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>>>>>       WREG32(mmVM_L2_CNTL, tmp);
>>>>>>>       WREG32(mmVM_L2_CNTL2, 0);
>>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>>   }
>>>>>>>
>>>>>>>   /**
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>>> index cb82404df534..732d91dbf449 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>>> @@ -1714,9 +1714,11 @@ static int gmc_v9_0_gart_enable(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>           return -EINVAL;
>>>>>>>       }
>>>>>>>
>>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>> -    if (r)
>>>>>>> -        return r;
>>>>>>> +    if (adev->in_suspend) {
>>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>>> +        if (r)
>>>>>>> +            return r;
>>>>>>> +    }
>>>>>>>
>>>>>>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>>>>>>       if (r)
>>>>>>> @@ -1793,7 +1795,6 @@ static void gmc_v9_0_gart_disable(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>   {
>>>>>>>       adev->gfxhub.funcs->gart_disable(adev);
>>>>>>>       adev->mmhub.funcs->gart_disable(adev);
>>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>>   }
>>>>>>>
>>>>>>>   static int gmc_v9_0_hw_fini(void *handle)
>>>>>>> -- 
>>>>>>> 2.32.0
>>>>>>>



More information about the amd-gfx mailing list