[PATCH] drm/amdgpu/gfx10: re-init clear state buffer after gpu reset

Wed Nov 20 08:57:16 UTC 2019

While the GPU is locked up? Not a good idea at all: we need the SDMA for
this, and that would be rather unreliable.

If the content of the BO is constant we should just re-initialize it 
after the reset.

Regards,
Christian.

Am 20.11.19 um 09:54 schrieb Zhang, Hawking:
> I think we should evict the bo and then move it back.
>
> Regards,
> Hawking
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken at gmail.com>
> Sent: 2019年11月20日 16:47
> To: Zhang, Hawking <Hawking.Zhang at amd.com>; Yuan, Xiaojie <Xiaojie.Yuan at amd.com>; amd-gfx at lists.freedesktop.org
> Cc: Long, Gang <Gang.Long at amd.com>; Xiao, Jack <Jack.Xiao at amd.com>
> Subject: Re: [PATCH] drm/amdgpu/gfx10: re-init clear state buffer after gpu reset
>
> A baco reset also resets the MC, doesn't it? In this case it would be expected that the content of VRAM is corrupted.
>
> Christian.
>
> Am 20.11.19 um 09:45 schrieb Zhang, Hawking:
>> Or in other words, we are still not clear when the corruption actually happens, right?
>>
>> Regards,
>> Hawking
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of
>> Zhang, Hawking
>> Sent: 2019年11月20日 16:44
>> To: Yuan, Xiaojie <Xiaojie.Yuan at amd.com>;
>> amd-gfx at lists.freedesktop.org
>> Cc: Long, Gang <Gang.Long at amd.com>; Xiao, Jack <Jack.Xiao at amd.com>
>> Subject: RE: [PATCH] drm/amdgpu/gfx10: re-init clear state buffer
>> after gpu reset
>>
>> Just to make sure I understand you correctly: until the fw team root-causes the csb corruption, we keep the workaround in the driver, correct?
>>
>> Regards,
>> Hawking
>> -----Original Message-----
>> From: Yuan, Xiaojie <Xiaojie.Yuan at amd.com>
>> Sent: 2019年11月20日 14:47
>> To: amd-gfx at lists.freedesktop.org
>> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Xiao, Jack
>> <Jack.Xiao at amd.com>; Long, Gang <Gang.Long at amd.com>; Yuan, Xiaojie
>> <Xiaojie.Yuan at amd.com>
>> Subject: [PATCH] drm/amdgpu/gfx10: re-init clear state buffer after
>> gpu reset
>>
>> This patch fixes 2nd baco reset failure with gfxoff enabled on navi1x.
>>
>> The clear state buffer (which resides in vram) is corrupted after the 1st baco reset; upon gfxoff exit, CPF gets a garbage header in CSIB and hangs.
>>
>> Signed-off-by: Xiaojie Yuan <xiaojie.yuan at amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 43 ++++++++++++++++++++++----
>>    1 file changed, 37 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 9274bd4b6c68..8e24ea08ca39 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -1789,27 +1789,52 @@ static void gfx_v10_0_enable_gui_idle_interrupt(struct amdgpu_device *adev,
>>    	WREG32_SOC15(GC, 0, mmCP_INT_CNTL_RING0, tmp);
>>    }
>>    
>> -static void gfx_v10_0_init_csb(struct amdgpu_device *adev)
>> +static int gfx_v10_0_init_csb(struct amdgpu_device *adev)
>>    {
>> +	int r;
>> +
>> +	if (adev->in_gpu_reset) {
>> +		r = amdgpu_bo_reserve(adev->gfx.rlc.clear_state_obj, false);
>> +		if (r)
>> +			return r;
>> +
>> +		r = amdgpu_bo_kmap(adev->gfx.rlc.clear_state_obj,
>> +				   (void **)&adev->gfx.rlc.cs_ptr);
>> +		if (!r) {
>> +			adev->gfx.rlc.funcs->get_csb_buffer(adev,
>> +					adev->gfx.rlc.cs_ptr);
>> +			amdgpu_bo_kunmap(adev->gfx.rlc.clear_state_obj);
>> +		}
>> +
>> +		amdgpu_bo_unreserve(adev->gfx.rlc.clear_state_obj);
>> +		if (r)
>> +			return r;
>> +	}
>> +
>>    	/* csib */
>>    	WREG32_SOC15(GC, 0, mmRLC_CSIB_ADDR_HI,
>>    		     adev->gfx.rlc.clear_state_gpu_addr >> 32);
>>    	WREG32_SOC15(GC, 0, mmRLC_CSIB_ADDR_LO,
>>    		     adev->gfx.rlc.clear_state_gpu_addr & 0xfffffffc);
>>    	WREG32_SOC15(GC, 0, mmRLC_CSIB_LENGTH, adev->gfx.rlc.clear_state_size);
>> +
>> +	return 0;
>>    }
>>    
>> -static void gfx_v10_0_init_pg(struct amdgpu_device *adev)
>> +static int gfx_v10_0_init_pg(struct amdgpu_device *adev)
>>    {
>>    	int i;
>> +	int r;
>>    
>> -	gfx_v10_0_init_csb(adev);
>> +	r = gfx_v10_0_init_csb(adev);
>> +	if (r)
>> +		return r;
>>    
>>    	for (i = 0; i < adev->num_vmhubs; i++)
>>    		amdgpu_gmc_flush_gpu_tlb(adev, 0, i, 0);
>>    
>>    	/* TODO: init power gating */
>> -	return;
>> +	return 0;
>>    }
>>    
>>    void gfx_v10_0_rlc_stop(struct amdgpu_device *adev)
>> @@ -1911,7 +1936,10 @@ static int gfx_v10_0_rlc_resume(struct amdgpu_device *adev)
>>    		r = gfx_v10_0_wait_for_rlc_autoload_complete(adev);
>>    		if (r)
>>    			return r;
>> -		gfx_v10_0_init_pg(adev);
>> +
>> +		r = gfx_v10_0_init_pg(adev);
>> +		if (r)
>> +			return r;
>>    
>>    		/* enable RLC SRM */
>>    		gfx_v10_0_rlc_enable_srm(adev);
>> @@ -1937,7 +1965,10 @@ static int gfx_v10_0_rlc_resume(struct amdgpu_device *adev)
>>    				return r;
>>    		}
>>    
>> -		gfx_v10_0_init_pg(adev);
>> +		r = gfx_v10_0_init_pg(adev);
>> +		if (r)
>> +			return r;
>> +
>>    		adev->gfx.rlc.funcs->start(adev);
>>    
>>    		if (adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO) {
>> --
>> 2.20.1
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx