[PATCH] drm/amdkfd: handle fault counters on invalid address

Wed Jul 7 18:50:51 UTC 2021

On 2021-07-07 2:34 p.m., Philip Yang wrote:
> prange is NULL if the vm fault retry is on an invalid address. In this
> case we cannot use prange to get the pdd, so use adev to get the gpuidx
> and then get the pdd from that instead, then increase the pdd vm fault counter.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 30 +++++++++++++++++-----------
>   1 file changed, 18 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 31f3f24cef6a..e7e99c5070b9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -2375,21 +2375,27 @@ static bool svm_range_skip_recover(struct svm_range *prange)
>   
>   static void
>   svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
> -		      struct svm_range *prange, int32_t gpuidx)
> +		      int32_t gpuidx)
>   {
>   	struct kfd_process_device *pdd;
>   
> -	if (gpuidx == MAX_GPU_INSTANCE)
> -		/* fault is on different page of same range
> -		 * or fault is skipped to recover later
> -		 */
> -		pdd = svm_range_get_pdd_by_adev(prange, adev);
> -	else
> -		/* fault recovered
> -		 * or fault cannot recover because GPU no access on the range
> -		 */
> -		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
> +	/* fault is on different page of same range
> +	 * or fault is skipped to recover later
> +	 * or fault is on invalid virtual address
> +	 */
> +	if (gpuidx == MAX_GPU_INSTANCE) {
> +		uint32_t gpuid;
> +		int r;
>   
> +		r = kfd_process_gpuid_from_kgd(p, adev, &gpuid, &gpuidx);
> +		if (r < 0)
> +			return;
> +	}
> +
> +	/* fault is recovered
> +	 * or fault cannot recover because GPU no access on the range
> +	 */
> +	pdd = kfd_process_device_from_gpuidx(p, gpuidx);
>   	if (pdd)
>   		WRITE_ONCE(pdd->faults, pdd->faults + 1);
>   }
> @@ -2525,7 +2531,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>   	mutex_unlock(&svms->lock);
>   	mmap_read_unlock(mm);
>   
> -	svm_range_count_fault(adev, p, prange, gpuidx);
> +	svm_range_count_fault(adev, p, gpuidx);
>   
>   	mmput(mm);
>   out: