[PATCH] drm/amdkfd: handle fault counters on invalid address
Felix Kuehling
felix.kuehling at amd.com
Wed Jul 7 18:50:51 UTC 2021
On 2021-07-07 2:34 p.m., Philip Yang wrote:
> prange is NULL when a VM fault is retried on an invalid address. In that
> case prange cannot be used to get the pdd; instead, use adev to get the
> gpuidx, look up the pdd from it, and then increment the pdd VM fault counter.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 30 +++++++++++++++++-----------
> 1 file changed, 18 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 31f3f24cef6a..e7e99c5070b9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -2375,21 +2375,27 @@ static bool svm_range_skip_recover(struct svm_range *prange)
>
> static void
> svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
> - struct svm_range *prange, int32_t gpuidx)
> + int32_t gpuidx)
> {
> struct kfd_process_device *pdd;
>
> - if (gpuidx == MAX_GPU_INSTANCE)
> - /* fault is on different page of same range
> - * or fault is skipped to recover later
> - */
> - pdd = svm_range_get_pdd_by_adev(prange, adev);
> - else
> - /* fault recovered
> - * or fault cannot recover because GPU no access on the range
> - */
> - pdd = kfd_process_device_from_gpuidx(p, gpuidx);
> + /* fault is on different page of same range
> + * or fault is skipped to recover later
> + * or fault is on invalid virtual address
> + */
> + if (gpuidx == MAX_GPU_INSTANCE) {
> + uint32_t gpuid;
> + int r;
>
> + r = kfd_process_gpuid_from_kgd(p, adev, &gpuid, &gpuidx);
> + if (r < 0)
> + return;
> + }
> +
> + /* fault is recovered
> + * or fault cannot recover because GPU no access on the range
> + */
> + pdd = kfd_process_device_from_gpuidx(p, gpuidx);
> if (pdd)
> WRITE_ONCE(pdd->faults, pdd->faults + 1);
> }
> @@ -2525,7 +2531,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> mutex_unlock(&svms->lock);
> mmap_read_unlock(mm);
>
> - svm_range_count_fault(adev, p, prange, gpuidx);
> + svm_range_count_fault(adev, p, gpuidx);
>
> mmput(mm);
> out:
More information about the amd-gfx mailing list