[PATCH 2/4] drm/amdgpu: cleanups for vram lost handling

Wed Feb 28 13:41:30 UTC 2018

Patch look good to me then.

Acked-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>

Andrey

On 02/28/2018 08:36 AM, Liu, Monk wrote:
>>    		goto error;
>>    
>>    	amdgpu_irq_gpu_reset_resume_helper(adev);
>>    	r = amdgpu_ib_ring_tests(adev);
>> -	if (r)
>> -		dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
> This 2 lines deletion seems unintentional
>
> No, intentional, because if IB TEST failed the error will be reported by the fault IP
>
>
>> +	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
>> +		atomic_inc(&adev->vram_lost_counter);
>> +		r = amdgpu_handle_vram_lost(adev);
>>    	}
>>    
>> +error:
>> +
>>    	return r;
> Not doing incrementation of VRAM lost counter in case of error anymore is intentional ?
>
>
> [ML] yeah, no need to increase the counter because GPU reset is failed so everything is meaningless, driver cannot used anymore
>
>
>
> -----Original Message-----
> From: Grodzovsky, Andrey
> Sent: 2018年2月28日 21:29
> To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org
> Subject: Re: [PATCH 2/4] drm/amdgpu: cleanups for vram lost handling
>
>
>
> On 02/28/2018 02:21 AM, Monk Liu wrote:
>> 1)create a routine "handle_vram_lost" to do the vram recovery, and put
>> it into amdgpu_device_reset/reset_sriov, this way no need of the extra
>> paramter to hold the VRAM LOST information and the related macros can
>> be removed.
>>
>> 3)show vram_recover failure if time out, and set TMO equal to
>> lockup_timeout if vram_recover is under SRIOV runtime mode.
>>
>> 4)report error if any ip reset failed for SR-IOV
>>
>> Change-Id: I686e2b6133844c14948c206a2315c064a78c1d9c
>> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   4 -
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 136 +++++++++++++++--------------
>>    2 files changed, 72 insertions(+), 68 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 5bddfc1..abbc3f1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -181,10 +181,6 @@ extern int amdgpu_cik_support;
>>    #define CIK_CURSOR_WIDTH 128
>>    #define CIK_CURSOR_HEIGHT 128
>>    
>> -/* GPU RESET flags */
>> -#define AMDGPU_RESET_INFO_VRAM_LOST  (1 << 0) -#define
>> AMDGPU_RESET_INFO_FULLRESET  (1 << 1)
>> -
>>    struct amdgpu_device;
>>    struct amdgpu_ib;
>>    struct amdgpu_cs_parser;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index e9d81a8..39ece7f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -1591,6 +1591,8 @@ static int
>> amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
>>    
>>    			r = block->version->funcs->hw_init(adev);
>>    			DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name,
>> r?"failed":"successed");
>> +			if (r)
>> +				return r;
>>    		}
>>    	}
>>    
>> @@ -1624,6 +1626,8 @@ static int
>> amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
>>    
>>    			r = block->version->funcs->hw_init(adev);
>>    			DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name,
>> r?"failed":"successed");
>> +			if (r)
>> +				return r;
>>    		}
>>    	}
>>    
>> @@ -2471,17 +2475,71 @@ static int amdgpu_device_recover_vram_from_shadow(struct amdgpu_device *adev,
>>    	return r;
>>    }
>>    
>> +static int amdgpu_handle_vram_lost(struct amdgpu_device *adev) {
>> +	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
>> +	struct amdgpu_bo *bo, *tmp;
>> +	struct dma_fence *fence = NULL, *next = NULL;
>> +	long r = 1;
>> +	int i = 0;
>> +	long tmo;
>> +
>> +	if (amdgpu_sriov_runtime(adev))
>> +		tmo = msecs_to_jiffies(amdgpu_lockup_timeout);
>> +	else
>> +		tmo = msecs_to_jiffies(100);
>> +
>> +	DRM_INFO("recover vram bo from shadow start\n");
>> +	mutex_lock(&adev->shadow_list_lock);
>> +	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
>> +		next = NULL;
>> +		amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
>> +		if (fence) {
>> +			r = dma_fence_wait_timeout(fence, false, tmo);
>> +			if (r == 0)
>> +				pr_err("wait fence %p[%d] timeout\n", fence, i);
>> +			else if (r < 0)
>> +				pr_err("wait fence %p[%d] interrupted\n", fence, i);
>> +			if (r < 1) {
>> +				dma_fence_put(fence);
>> +				fence = next;
>> +				break;
>> +			}
>> +			i++;
>> +		}
>> +
>> +		dma_fence_put(fence);
>> +		fence = next;
>> +	}
>> +	mutex_unlock(&adev->shadow_list_lock);
>> +
>> +	if (fence) {
>> +		r = dma_fence_wait_timeout(fence, false, tmo);
>> +		if (r == 0)
>> +			pr_err("wait fence %p[%d] timeout\n", fence, i);
>> +		else if (r < 0)
>> +			pr_err("wait fence %p[%d] interrupted\n", fence, i);
>> +
>> +	}
>> +	dma_fence_put(fence);
>> +
>> +	if (r > 0)
>> +		DRM_INFO("recover vram bo from shadow done\n");
>> +	else
>> +		DRM_ERROR("recover vram bo from shadow failed\n");
>> +
>> +	return (r > 0?0:1);
>> +}
>> +
>>    /*
>>     * amdgpu_device_reset - reset ASIC/GPU for bare-metal or passthrough
>>     *
>>     * @adev: amdgpu device pointer
>> - * @reset_flags: output param tells caller the reset result
>>     *
>>     * attempt to do soft-reset or full-reset and reinitialize Asic
>>     * return 0 means successed otherwise failed
>>    */
>> -static int amdgpu_device_reset(struct amdgpu_device *adev,
>> -			       uint64_t* reset_flags)
>> +static int amdgpu_device_reset(struct amdgpu_device *adev)
>>    {
>>    	bool need_full_reset, vram_lost = 0;
>>    	int r;
>> @@ -2496,7 +2554,6 @@ static int amdgpu_device_reset(struct amdgpu_device *adev,
>>    			DRM_INFO("soft reset failed, will fallback to full reset!\n");
>>    			need_full_reset = true;
>>    		}
>> -
>>    	}
>>    
>>    	if (need_full_reset) {
>> @@ -2545,13 +2602,8 @@ static int amdgpu_device_reset(struct amdgpu_device *adev,
>>    		}
>>    	}
>>    
>> -	if (reset_flags) {
>> -		if (vram_lost)
>> -			(*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
>> -
>> -		if (need_full_reset)
>> -			(*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
>> -	}
>> +	if (!r && ((need_full_reset && !(adev->flags & AMD_IS_APU)) || vram_lost))
>> +		r = amdgpu_handle_vram_lost(adev);
>>    
>>    	return r;
>>    }
>> @@ -2560,14 +2612,11 @@ static int amdgpu_device_reset(struct amdgpu_device *adev,
>>     * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
>>     *
>>     * @adev: amdgpu device pointer
>> - * @reset_flags: output param tells caller the reset result
>>     *
>>     * do VF FLR and reinitialize Asic
>>     * return 0 means successed otherwise failed
>>    */
>> -static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
>> -				     uint64_t *reset_flags,
>> -				     bool from_hypervisor)
>> +static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, bool
>> +from_hypervisor)
>>    {
>>    	int r;
>>    
>> @@ -2588,28 +2637,20 @@ static int amdgpu_device_reset_sriov(struct
>> amdgpu_device *adev,
>>    
>>    	/* now we are okay to resume SMC/CP/SDMA */
>>    	r = amdgpu_device_ip_reinit_late_sriov(adev);
>> +	amdgpu_virt_release_full_gpu(adev, true);
>>    	if (r)
>>    		goto error;
>>    
>>    	amdgpu_irq_gpu_reset_resume_helper(adev);
>>    	r = amdgpu_ib_ring_tests(adev);
>> -	if (r)
>> -		dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
> This 2 lines deletion seems unintentional
>
>>    
>> -error:
>> -	/* release full control of GPU after ib test */
>> -	amdgpu_virt_release_full_gpu(adev, true);
>> -
>> -	if (reset_flags) {
>> -		if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
>> -			(*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
>> -			atomic_inc(&adev->vram_lost_counter);
>> -		}
>> -
>> -		/* VF FLR or hotlink reset is always full-reset */
>> -		(*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
>> +	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
>> +		atomic_inc(&adev->vram_lost_counter);
>> +		r = amdgpu_handle_vram_lost(adev);
>>    	}
>>    
>> +error:
>> +
>>    	return r;
> Not doing incrementation of VRAM lost counter in case of error anymore is intentional ?
>
> Andrey
>>    }
>>    
>> @@ -2673,42 +2714,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>    	}
>>    
>>    	if (amdgpu_sriov_vf(adev))
>> -		r = amdgpu_device_reset_sriov(adev, &reset_flags, job ? false : true);
>> +		r = amdgpu_device_reset_sriov(adev, job ? false : true);
>>    	else
>> -		r = amdgpu_device_reset(adev, &reset_flags);
>> -
>> -	if (!r) {
>> -		if (((reset_flags & AMDGPU_RESET_INFO_FULLRESET) && !(adev->flags & AMD_IS_APU)) ||
>> -			(reset_flags & AMDGPU_RESET_INFO_VRAM_LOST)) {
>> -			struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
>> -			struct amdgpu_bo *bo, *tmp;
>> -			struct dma_fence *fence = NULL, *next = NULL;
>> -
>> -			DRM_INFO("recover vram bo from shadow\n");
>> -			mutex_lock(&adev->shadow_list_lock);
>> -			list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
>> -				next = NULL;
>> -				amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
>> -				if (fence) {
>> -					r = dma_fence_wait(fence, false);
>> -					if (r) {
>> -						WARN(r, "recovery from shadow isn't completed\n");
>> -						break;
>> -					}
>> -				}
>> -
>> -				dma_fence_put(fence);
>> -				fence = next;
>> -			}
>> -			mutex_unlock(&adev->shadow_list_lock);
>> -			if (fence) {
>> -				r = dma_fence_wait(fence, false);
>> -				if (r)
>> -					WARN(r, "recovery from shadow isn't completed\n");
>> -			}
>> -			dma_fence_put(fence);
>> -		}
>> -	}
>> +		r = amdgpu_device_reset(adev);
>>    
>>    	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>    		struct amdgpu_ring *ring = adev->rings[i];