[PATCH 4/5] drm/amdgpu: revert context to stop engine before mode2 reset

Tue Jul 26 16:59:46 UTC 2022

Got it

Acked-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>

Andrey

On 2022-07-26 06:01, Zhao, Victor wrote:
> [AMD Official Use Only - General]
>
> Hi Andrey,
>
> By slow tests I mean the slow hang tests run by the quark tool.
> An example here:
> hang_vm_gfx_dispatch_slow.lua - This script runs on a graphics engine using compute engine and has a hacked CS program which is massive and duplicates standard CS program move code hundreds of thousands of times. The effect is a very slowly executing CS program.
>
> It's not a bad job; it just needs a very long time to finish. I suppose we don’t have a way to stop the shader here, and the running apps will be affected when the reset is done.
>
>
> Thanks,
> Victor
>
>
>
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>
> Sent: Tuesday, July 26, 2022 5:20 AM
> To: Zhao, Victor <Victor.Zhao at amd.com>; amd-gfx at lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Deng, Emily <Emily.Deng at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>
> Subject: Re: [PATCH 4/5] drm/amdgpu: revert context to stop engine before mode2 reset
>
> On 2022-07-22 03:34, Victor Zhao wrote:
>
>> For some hangs caused by slow tests, the engine cannot be stopped, which
>> may cause resume failure after reset. In this case, force-halt the engine
>> by reverting the context addresses.
>
> Can you maybe explain a bit more what exactly you mean by a slow test and why the engine cannot be stopped in this case?
>
> Andrey
>
>
>> Signed-off-by: Victor Zhao <Victor.Zhao at amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  1 +
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h  |  1 +
>>    drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c    | 36 +++++++++++++++++++++
>>    drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c |  2 ++
>>    4 files changed, 40 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 5498fda8617f..833dc5e224d3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -5037,6 +5037,7 @@ static void amdgpu_device_recheck_guilty_jobs(
>>    
>>    			/* set guilty */
>>    			drm_sched_increase_karma(s_job);
>> +			amdgpu_reset_prepare_hwcontext(adev, reset_context);
>>    retry:
>>    			/* do hw reset */
>>    			if (amdgpu_sriov_vf(adev)) {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
>> index f8036f2b100e..c7b44aeb671b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
>> @@ -37,6 +37,7 @@ struct amdgpu_gfxhub_funcs {
>>    	void (*utcl2_harvest)(struct amdgpu_device *adev);
>>    	void (*mode2_save_regs)(struct amdgpu_device *adev);
>>    	void (*mode2_restore_regs)(struct amdgpu_device *adev);
>> +	void (*halt)(struct amdgpu_device *adev);
>>    };
>>    
>>    struct amdgpu_gfxhub {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
>> b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
>> index 51cf8acd2d79..8cf53e039c11 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
>> @@ -646,6 +646,41 @@ static void gfxhub_v2_1_restore_regs(struct amdgpu_device *adev)
>>    	WREG32_SOC15(GC, 0, mmGCMC_VM_MX_L1_TLB_CNTL, adev->gmc.MC_VM_MX_L1_TLB_CNTL);
>>    }
>>    
>> +static void gfxhub_v2_1_halt(struct amdgpu_device *adev) {
>> +	struct amdgpu_vmhub *hub = &adev->vmhub[AMDGPU_GFXHUB_0];
>> +	int i;
>> +	uint32_t tmp;
>> +	int time = 1000;
>> +
>> +	gfxhub_v2_1_set_fault_enable_default(adev, false);
>> +
>> +	for (i = 0; i <= 14; i++) {
>> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32,
>> +				    i * hub->ctx_addr_distance, ~0);
>> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32,
>> +				    i * hub->ctx_addr_distance, ~0);
>> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_END_ADDR_LO32,
>> +				    i * hub->ctx_addr_distance,
>> +				    0);
>> +		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_END_ADDR_HI32,
>> +				    i * hub->ctx_addr_distance,
>> +				    0);
>> +	}
>> +	tmp = RREG32_SOC15(GC, 0, mmGRBM_STATUS2);
>> +	while ((tmp & (GRBM_STATUS2__EA_BUSY_MASK |
>> +		      GRBM_STATUS2__EA_LINK_BUSY_MASK)) != 0 &&
>> +	       time) {
>> +		udelay(100);
>> +		time--;
>> +		tmp = RREG32_SOC15(GC, 0, mmGRBM_STATUS2);
>> +	}
>> +
>> +	if (!time) {
>> +		DRM_WARN("failed to wait for GRBM(EA) idle\n");
>> +	}
>> +}
>> +
>>    const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
>>    	.get_fb_location = gfxhub_v2_1_get_fb_location,
>>    	.get_mc_fb_offset = gfxhub_v2_1_get_mc_fb_offset,
>> @@ -658,4 +693,5 @@ const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
>>    	.utcl2_harvest = gfxhub_v2_1_utcl2_harvest,
>>    	.mode2_save_regs = gfxhub_v2_1_save_regs,
>>    	.mode2_restore_regs = gfxhub_v2_1_restore_regs,
>> +	.halt = gfxhub_v2_1_halt,
>>    };
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
>> b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
>> index 51a5b68f77d3..fead7251292f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
>> @@ -97,6 +97,8 @@ sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
>>    	if (!amdgpu_sriov_vf(adev)) {
>>    		if (adev->gfxhub.funcs->mode2_save_regs)
>>    			adev->gfxhub.funcs->mode2_save_regs(adev);
>> +		if (adev->gfxhub.funcs->halt)
>> +			adev->gfxhub.funcs->halt(adev);
>>    		r = sienna_cichlid_mode2_suspend_ip(adev);
>>    	}
>>