[PATCH v2 3/3] drm/amdgpu: recover gart table at resume
Das, Nirmoy
nirmoy.das at amd.com
Wed Oct 20 10:21:00 UTC 2021
On 10/20/2021 12:15 PM, Lazar, Lijo wrote:
>
>
> On 10/20/2021 3:42 PM, Das, Nirmoy wrote:
>>
>> On 10/20/2021 12:03 PM, Lazar, Lijo wrote:
>>>
>>>
>>> On 10/20/2021 3:23 PM, Das, Nirmoy wrote:
>>>>
>>>> On 10/20/2021 11:11 AM, Lazar, Lijo wrote:
>>>>>
>>>>>
>>>>> On 10/19/2021 11:44 PM, Nirmoy Das wrote:
>>>>>> Get rid off pin/unpin of gart BO at resume/suspend and
>>>>>> instead pin only once and try to recover gart content
>>>>>> at resume time. This is much more stable in case there
>>>>>> is OOM situation at 2nd call to amdgpu_device_evict_resources()
>>>>>> while evicting GART table.
>>>>>>
>>>>>> Signed-off-by: Nirmoy Das <nirmoy.das at amd.com>
>>>>>> ---
>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ---
>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c | 42
>>>>>> ++++++++++++----------
>>>>>> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 ++---
>>>>>> drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 10 +++---
>>>>>> drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 10 +++---
>>>>>> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 9 ++---
>>>>>> 6 files changed, 45 insertions(+), 39 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> index 5807df52031c..f69e613805db 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> @@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct
>>>>>> drm_device *dev, bool fbcon)
>>>>>> amdgpu_fence_driver_hw_fini(adev);
>>>>>>
>>>>>> amdgpu_device_ip_suspend_phase2(adev);
>>>>>> - /* This second call to evict device resources is to evict
>>>>>> - * the gart page table using the CPU.
>>>>>> - */
>>>>>> - amdgpu_device_evict_resources(adev);
>>>>>>
>>>>>> return 0;
>>>>>> }
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>> index d3e4203f6217..97a9f61fa106 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>> @@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct
>>>>>> amdgpu_device *adev)
>>>>>> *
>>>>>> * @adev: amdgpu_device pointer
>>>>>> *
>>>>>> - * Allocate video memory for GART page table
>>>>>> + * Allocate and pin video memory for GART page table
>>>>>> * (pcie r4xx, r5xx+). These asics require the
>>>>>> * gart table to be in video memory.
>>>>>> * Returns 0 for success, error for failure.
>>>>>> */
>>>>>> int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
>>>>>> {
>>>>>> + struct amdgpu_bo_param bp;
>>>>>> int r;
>>>>>>
>>>>>> - if (adev->gart.bo == NULL) {
>>>>>> - struct amdgpu_bo_param bp;
>>>>>> -
>>>>>> - memset(&bp, 0, sizeof(bp));
>>>>>> - bp.size = adev->gart.table_size;
>>>>>> - bp.byte_align = PAGE_SIZE;
>>>>>> - bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>>>>> - bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>>>> - AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>>>> - bp.type = ttm_bo_type_kernel;
>>>>>> - bp.resv = NULL;
>>>>>> - bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>>>>> -
>>>>>> - r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>>>>> - if (r) {
>>>>>> - return r;
>>>>>> - }
>>>>>> - }
>>>>>> + if (adev->gart.bo != NULL)
>>>>>> + return 0;
>>>>>> +
>>>>>> + memset(&bp, 0, sizeof(bp));
>>>>>> + bp.size = adev->gart.table_size;
>>>>>> + bp.byte_align = PAGE_SIZE;
>>>>>> + bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>>>>> + bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>>>> + AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>>>> + bp.type = ttm_bo_type_kernel;
>>>>>> + bp.resv = NULL;
>>>>>> + bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>>>>> +
>>>>>> + r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>>>>> + if (r)
>>>>>> + return r;
>>>>>> +
>>>>>> + r = amdgpu_gart_table_vram_pin(adev);
>>>>>> + if (r)
>>>>>> + return r;
>>>>>> +
>>>>>> return 0;
>>>>>> }
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> index 3ec5ff5a6dbe..75d584e1b0e9 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> @@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct
>>>>>> amdgpu_device *adev)
>>>>>> return -EINVAL;
>>>>>> }
>>>>>>
>>>>>> - r = amdgpu_gart_table_vram_pin(adev);
>>>>>> - if (r)
>>>>>> - return r;
>>>>>> + if (adev->in_suspend) {
>>>>>> + r = amdgpu_gtt_mgr_recover(adev);
>>>>>
>>>>> When the existing usage of this function is checked, this is
>>>>> called during reset recovery after ip resume phase1. Can't the
>>>>> same thing be done in ip_resume() to place this after phase1
>>>>> resume instead of repeating in every IP version?
>>>>
>>>>
>>>> Placing amdgpu_gtt_mgr_recover() after phase1 generally works but
>>>> gmc_v10_0_gart_enable() seems to be correct place to do this
>>>>
>>>> gart specific work.
>>>>
>>>
>>> I see. In that case probably the patch needs to change other call
>>> places also as this step is already taken care in gart enable.
>>
>>
>> Do you mean amdgpu_do_asic_reset() ?
>>
>
> Yes, and saw it called in one more place related to sriov reset
> (didn't track the sriov reset path though).
True, hmm looks like this patch going to need multiple tested-by tags
for gfx6,7 and sriov. I only have gfx8,9,10.
Regards,
Nirmoy
>
> Thanks,
> Lijo
>
>>
>> Nirmoy
>>
>>
>>>
>>> Thanks,
>>> Lijo
>>>
>>>>
>>>> Regards,
>>>>
>>>> Nirmoy
>>>>
>>>>
>>>>
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>>
>>>>>> + if (r)
>>>>>> + return r;
>>>>>> + }
>>>>>>
>>>>>> r = adev->gfxhub.funcs->gart_enable(adev);
>>>>>> if (r)
>>>>>> @@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct
>>>>>> amdgpu_device *adev)
>>>>>> {
>>>>>> adev->gfxhub.funcs->gart_disable(adev);
>>>>>> adev->mmhub.funcs->gart_disable(adev);
>>>>>> - amdgpu_gart_table_vram_unpin(adev);
>>>>>> }
>>>>>>
>>>>>> static int gmc_v10_0_hw_fini(void *handle)
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>> index 0a50fdaced7e..02e90d9443c1 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>> @@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct
>>>>>> amdgpu_device *adev)
>>>>>> dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>>>> return -EINVAL;
>>>>>> }
>>>>>> - r = amdgpu_gart_table_vram_pin(adev);
>>>>>> - if (r)
>>>>>> - return r;
>>>>>> +
>>>>>> + if (adev->in_suspend) {
>>>>>> + r = amdgpu_gtt_mgr_recover(adev);
>>>>>> + if (r)
>>>>>> + return r;
>>>>>> + }
>>>>>>
>>>>>> table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>>>>
>>>>>> @@ -758,7 +761,6 @@ static void gmc_v7_0_gart_disable(struct
>>>>>> amdgpu_device *adev)
>>>>>> tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>>>> WREG32(mmVM_L2_CNTL, tmp);
>>>>>> WREG32(mmVM_L2_CNTL2, 0);
>>>>>> - amdgpu_gart_table_vram_unpin(adev);
>>>>>> }
>>>>>>
>>>>>> /**
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>> index 492ebed2915b..dc2577e37688 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>> @@ -837,9 +837,12 @@ static int gmc_v8_0_gart_enable(struct
>>>>>> amdgpu_device *adev)
>>>>>> dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>>>> return -EINVAL;
>>>>>> }
>>>>>> - r = amdgpu_gart_table_vram_pin(adev);
>>>>>> - if (r)
>>>>>> - return r;
>>>>>> +
>>>>>> + if (adev->in_suspend) {
>>>>>> + r = amdgpu_gtt_mgr_recover(adev);
>>>>>> + if (r)
>>>>>> + return r;
>>>>>> + }
>>>>>>
>>>>>> table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>>>>
>>>>>> @@ -992,7 +995,6 @@ static void gmc_v8_0_gart_disable(struct
>>>>>> amdgpu_device *adev)
>>>>>> tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>>>> WREG32(mmVM_L2_CNTL, tmp);
>>>>>> WREG32(mmVM_L2_CNTL2, 0);
>>>>>> - amdgpu_gart_table_vram_unpin(adev);
>>>>>> }
>>>>>>
>>>>>> /**
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> index cb82404df534..732d91dbf449 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> @@ -1714,9 +1714,11 @@ static int gmc_v9_0_gart_enable(struct
>>>>>> amdgpu_device *adev)
>>>>>> return -EINVAL;
>>>>>> }
>>>>>>
>>>>>> - r = amdgpu_gart_table_vram_pin(adev);
>>>>>> - if (r)
>>>>>> - return r;
>>>>>> + if (adev->in_suspend) {
>>>>>> + r = amdgpu_gtt_mgr_recover(adev);
>>>>>> + if (r)
>>>>>> + return r;
>>>>>> + }
>>>>>>
>>>>>> r = adev->gfxhub.funcs->gart_enable(adev);
>>>>>> if (r)
>>>>>> @@ -1793,7 +1795,6 @@ static void gmc_v9_0_gart_disable(struct
>>>>>> amdgpu_device *adev)
>>>>>> {
>>>>>> adev->gfxhub.funcs->gart_disable(adev);
>>>>>> adev->mmhub.funcs->gart_disable(adev);
>>>>>> - amdgpu_gart_table_vram_unpin(adev);
>>>>>> }
>>>>>>
>>>>>> static int gmc_v9_0_hw_fini(void *handle)
>>>>>> --
>>>>>> 2.32.0
>>>>>>
More information about the amd-gfx
mailing list