[PATCH v2 0/2] Fix a hang during memory pressure test

Mon Sep 6 11:04:56 UTC 2021

Am 06.09.21 um 12:16 schrieb Pan, Xinhui:
>> 2021年9月6日 17:04，Christian König <ckoenig.leichtzumerken at gmail.com> 写道：
>>
>>
>>
>> Am 06.09.21 um 03:12 schrieb xinhui pan:
>>> A long time ago, someone reported that the system hung during a memory
>>> test. In recent days, I have been trying to find and understand the
>>> potential deadlock in the ttm/amdgpu code.
>>>
>>> This patchset aims to fix the deadlock during ttm populate.
>>>
>>> TTM has a parameter called pages_limit; when the allocated GTT memory
>>> reaches this limit, swapout is triggered. As ttm_bo_swapout does
>>> not return the correct retval, populate might hang.
>>>
>>> The UVD ib test uses GTT, which might be insufficient. So a gpu recovery
>>> would hang if populate hangs.
>> Ah, now I understand what you are trying to do.
>>
>> Problem is that won't work either. Allocating VRAM can easily land you inside the same deadlock.
>>
>> We need to avoid the allocation altogether for this to work correctly.
> looks like we need to reserve some pages at sw init.

Yeah, something like that should do it.

But keep in mind that you then need a lock or similar when using the 
resource to prevent concurrent use.

>
>>> I have made one drm test which allocates two GTT BOs, submits gfx copy
>>> commands and frees these BOs without waiting for the fence. What's more,
>>> these gfx copy commands will cause a gfx ring hang. So gpu recovery would
>>> be triggered.
>> Mhm, that should never be possible. It is perfectly valid for an application to terminate without waiting for the GFX submission to be completed.
> gfx ring hangs because the command is illegal.
> the packet is COMMAND [30:21] | BYTE_COUNT [20:0]
> I use 0xFF << 20 to hang the ring on purpose.

Ok that makes more sense.

Thanks,
Christian.

>
>> Going to push patch #1 to drm-misc-fixes or drm-misc-next-fixes in a moment.
>>
>> Thanks,
>> Christian.
>>
>>> Now here is one possible deadlock case.
>>> gpu_recovery
>>>   -> stop drm scheduler
>>>   -> asic reset
>>>     -> ib test
>>>        -> tt populate (uvd ib test)
>>> 	->  ttm_bo_swapout (BO A) // this always fails as the fence of
>>> 	BO A would not be signaled by scheduler or HW. Hit deadlock.
>>>
>>> I paste the drm test patch below.
>>> #modprobe ttm pages_limit=65536
>>> #amdgpu_test -s 1 -t 4
>>> ---
>>>   tests/amdgpu/basic_tests.c | 32 ++++++++++++++------------------
>>>   1 file changed, 14 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/tests/amdgpu/basic_tests.c b/tests/amdgpu/basic_tests.c
>>> index dbf02fee..f85ed340 100644
>>> --- a/tests/amdgpu/basic_tests.c
>>> +++ b/tests/amdgpu/basic_tests.c
>>> @@ -65,13 +65,16 @@ static void amdgpu_direct_gma_test(void);
>>>   static void amdgpu_command_submission_write_linear_helper(unsigned ip_type);
>>>   static void amdgpu_command_submission_const_fill_helper(unsigned ip_type);
>>>   static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type);
>>> -static void amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>>> +static void _amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>>>   				       unsigned ip_type,
>>>   				       int instance, int pm4_dw, uint32_t *pm4_src,
>>>   				       int res_cnt, amdgpu_bo_handle *resources,
>>>   				       struct amdgpu_cs_ib_info *ib_info,
>>> -				       struct amdgpu_cs_request *ibs_request);
>>> +				       struct amdgpu_cs_request *ibs_request, int sync, int repeat);
>>>    +#define amdgpu_test_exec_cs_helper(...) \
>>> +	_amdgpu_test_exec_cs_helper(__VA_ARGS__, 1, 1)
>>> +
>>>   CU_TestInfo basic_tests[] = {
>>>   	{ "Query Info Test",  amdgpu_query_info_test },
>>>   	{ "Userptr Test",  amdgpu_userptr_test },
>>> @@ -1341,12 +1344,12 @@ static void amdgpu_command_submission_compute(void)
>>>    * pm4_src, resources, ib_info, and ibs_request
>>>    * submit command stream described in ibs_request and wait for this IB accomplished
>>>    */
>>> -static void amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>>> +static void _amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>>>   				       unsigned ip_type,
>>>   				       int instance, int pm4_dw, uint32_t *pm4_src,
>>>   				       int res_cnt, amdgpu_bo_handle *resources,
>>>   				       struct amdgpu_cs_ib_info *ib_info,
>>> -				       struct amdgpu_cs_request *ibs_request)
>>> +				       struct amdgpu_cs_request *ibs_request, int sync, int repeat)
>>>   {
>>>   	int r;
>>>   	uint32_t expired;
>>> @@ -1395,12 +1398,15 @@ static void amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
>>>   	CU_ASSERT_NOT_EQUAL(ibs_request, NULL);
>>>     	/* submit CS */
>>> -	r = amdgpu_cs_submit(context_handle, 0, ibs_request, 1);
>>> +	while (repeat--)
>>> +		r = amdgpu_cs_submit(context_handle, 0, ibs_request, 1);
>>>   	CU_ASSERT_EQUAL(r, 0);
>>>     	r = amdgpu_bo_list_destroy(ibs_request->resources);
>>>   	CU_ASSERT_EQUAL(r, 0);
>>>   +	if (!sync)
>>> +		return;
>>>   	fence_status.ip_type = ip_type;
>>>   	fence_status.ip_instance = 0;
>>>   	fence_status.ring = ibs_request->ring;
>>> @@ -1667,7 +1673,7 @@ static void amdgpu_command_submission_sdma_const_fill(void)
>>>     static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
>>>   {
>>> -	const int sdma_write_length = 1024;
>>> +	const int sdma_write_length = (255) << 20;
>>>   	const int pm4_dw = 256;
>>>   	amdgpu_context_handle context_handle;
>>>   	amdgpu_bo_handle bo1, bo2;
>>> @@ -1715,8 +1721,6 @@ static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
>>>   							    &bo1_va_handle);
>>>   				CU_ASSERT_EQUAL(r, 0);
>>>   -				/* set bo1 */
>>> -				memset((void*)bo1_cpu, 0xaa, sdma_write_length);
>>>     				/* allocate UC bo2 for sDMA use */
>>>   				r = amdgpu_bo_alloc_and_map(device_handle,
>>> @@ -1727,8 +1731,6 @@ static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
>>>   							    &bo2_va_handle);
>>>   				CU_ASSERT_EQUAL(r, 0);
>>>   -				/* clear bo2 */
>>> -				memset((void*)bo2_cpu, 0, sdma_write_length);
>>>     				resources[0] = bo1;
>>>   				resources[1] = bo2;
>>> @@ -1785,17 +1787,11 @@ static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
>>>   					}
>>>   				}
>>>   -				amdgpu_test_exec_cs_helper(context_handle,
>>> +				_amdgpu_test_exec_cs_helper(context_handle,
>>>   							   ip_type, ring_id,
>>>   							   i, pm4,
>>>   							   2, resources,
>>> -							   ib_info, ibs_request);
>>> -
>>> -				/* verify if SDMA test result meets with expected */
>>> -				i = 0;
>>> -				while(i < sdma_write_length) {
>>> -					CU_ASSERT_EQUAL(bo2_cpu[i++], 0xaa);
>>> -				}
>>> +							   ib_info, ibs_request, 0, 100);
>>>   				r = amdgpu_bo_unmap_and_free(bo1, bo1_va_handle, bo1_mc,
>>>   							     sdma_write_length);
>>>   				CU_ASSERT_EQUAL(r, 0);