[PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand
Russell, Kent
Kent.Russell at amd.com
Fri Mar 8 14:17:26 UTC 2019
That dmesg was from Fiji, but it occurred on Vega10 as well.
Kent
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken at gmail.com>
> Sent: Friday, March 08, 2019 9:14 AM
> To: Russell, Kent <Kent.Russell at amd.com>; amd-gfx at lists.freedesktop.org
> Subject: Re: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand
>
> My best guess is that we forget somewhere to update the PDs. What
> hardware is that on?
>
> Felix already mentioned that this could be problematic for the KFD.
>
> Maybe he has an idea,
> Christian.
>
> Am 08.03.19 um 15:04 schrieb Russell, Kent:
> > Hi Christian,
> >
> > This patch ended up causing a VM Fault in KFDTest. Reverting just this
> patch addressed the issue:
> > [ 82.703503] amdgpu 0000:0c:00.0: GPU fault detected: 146 0x0000480c for
> process pid 0 thread pid 0
> > [ 82.703512] amdgpu 0000:0c:00.0:
> VM_CONTEXT1_PROTECTION_FAULT_ADDR 0x00001000
> > [ 82.703516] amdgpu 0000:0c:00.0:
> VM_CONTEXT1_PROTECTION_FAULT_STATUS 0x1004800C
> > [ 82.703522] amdgpu 0000:0c:00.0: VM fault (0x0c, vmid 8, pasid 32769) at
> page 4096, read from 'TC0' (0x54433000) (72)
> > [ 82.703585] Evicting PASID 32769 queues
> >
> > I am looking into it, but if you have any insight that would be great in
> helping to resolve it quickly.
> >
> > Kent
> >> -----Original Message-----
> >> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of
> >> Christian König
> >> Sent: Tuesday, February 26, 2019 7:47 AM
> >> To: amd-gfx at lists.freedesktop.org
> >> Subject: [PATCH 3/6] drm/amdgpu: allocate VM PDs/PTs on demand
> >>
> >> Let's start to allocate VM PDs/PTs on demand instead of
> >> pre-allocating them during mapping.
> >>
> >> Signed-off-by: Christian König <christian.koenig at amd.com>
> >> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
> >> ---
> >> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 10 +-
> >> drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c | 9 --
> >> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 10 --
> >> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 136 +++++------------
> -
> >> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 3 -
> >> 5 files changed, 39 insertions(+), 129 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> >> index 31e3953dcb6e..088e9b6b765b 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> >> @@ -410,15 +410,7 @@ static int add_bo_to_vm(struct amdgpu_device
> >> *adev, struct kgd_mem *mem,
> >> if (p_bo_va_entry)
> >> *p_bo_va_entry = bo_va_entry;
> >>
> >> - /* Allocate new page tables if needed and validate
> >> - * them.
> >> - */
> >> - ret = amdgpu_vm_alloc_pts(adev, vm, va, amdgpu_bo_size(bo));
> >> - if (ret) {
> >> - pr_err("Failed to allocate pts, err=%d\n", ret);
> >> - goto err_alloc_pts;
> >> - }
> >> -
> >> + /* Allocate validate page tables if needed */
> >> ret = vm_validate_pt_pd_bos(vm);
> >> if (ret) {
> >> pr_err("validate_pt_pd_bos() failed\n"); diff --git
> >> a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
> >> index 7e22be7ca68a..54dd02a898b9 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
> >> @@ -92,15 +92,6 @@ int amdgpu_map_static_csa(struct amdgpu_device
> >> *adev, struct amdgpu_vm *vm,
> >> return -ENOMEM;
> >> }
> >>
> >> - r = amdgpu_vm_alloc_pts(adev, (*bo_va)->base.vm, csa_addr,
> >> - size);
> >> - if (r) {
> >> - DRM_ERROR("failed to allocate pts for static CSA, err=%d\n",
> >> r);
> >> - amdgpu_vm_bo_rmv(adev, *bo_va);
> >> - ttm_eu_backoff_reservation(&ticket, &list);
> >> - return r;
> >> - }
> >> -
> >> r = amdgpu_vm_bo_map(adev, *bo_va, csa_addr, 0, size,
> >> AMDGPU_PTE_READABLE |
> >> AMDGPU_PTE_WRITEABLE |
> >> AMDGPU_PTE_EXECUTABLE);
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> >> index 555285e329ed..fcaaac30e84b 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> >> @@ -625,11 +625,6 @@ int amdgpu_gem_va_ioctl(struct drm_device
> *dev,
> >> void *data,
> >>
> >> switch (args->operation) {
> >> case AMDGPU_VA_OP_MAP:
> >> - r = amdgpu_vm_alloc_pts(adev, bo_va->base.vm, args-
> >>> va_address,
> >> - args->map_size);
> >> - if (r)
> >> - goto error_backoff;
> >> -
> >> va_flags = amdgpu_gmc_get_pte_flags(adev, args->flags);
> >> r = amdgpu_vm_bo_map(adev, bo_va, args->va_address,
> >> args->offset_in_bo, args->map_size, @@ -
> >> 645,11 +640,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev,
> void
> >> *data,
> >> args->map_size);
> >> break;
> >> case AMDGPU_VA_OP_REPLACE:
> >> - r = amdgpu_vm_alloc_pts(adev, bo_va->base.vm, args-
> >>> va_address,
> >> - args->map_size);
> >> - if (r)
> >> - goto error_backoff;
> >> -
> >> va_flags = amdgpu_gmc_get_pte_flags(adev, args->flags);
> >> r = amdgpu_vm_bo_replace_map(adev, bo_va, args-
> >>> va_address,
> >> args->offset_in_bo, args-
> >>> map_size, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> >> index 362436f4e856..dfad543fc000 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> >> @@ -504,47 +504,6 @@ static void amdgpu_vm_pt_next(struct
> >> amdgpu_device *adev,
> >> }
> >> }
> >>
> >> -/**
> >> - * amdgpu_vm_pt_first_leaf - get first leaf PD/PT
> >> - *
> >> - * @adev: amdgpu_device pointer
> >> - * @vm: amdgpu_vm structure
> >> - * @start: start addr of the walk
> >> - * @cursor: state to initialize
> >> - *
> >> - * Start a walk and go directly to the leaf node.
> >> - */
> >> -static void amdgpu_vm_pt_first_leaf(struct amdgpu_device *adev,
> >> - struct amdgpu_vm *vm, uint64_t start,
> >> - struct amdgpu_vm_pt_cursor *cursor)
> >> -{
> >> - amdgpu_vm_pt_start(adev, vm, start, cursor);
> >> - while (amdgpu_vm_pt_descendant(adev, cursor));
> >> -}
> >> -
> >> -/**
> >> - * amdgpu_vm_pt_next_leaf - get next leaf PD/PT
> >> - *
> >> - * @adev: amdgpu_device pointer
> >> - * @cursor: current state
> >> - *
> >> - * Walk the PD/PT tree to the next leaf node.
> >> - */
> >> -static void amdgpu_vm_pt_next_leaf(struct amdgpu_device *adev,
> >> - struct amdgpu_vm_pt_cursor *cursor)
> >> -{
> >> - amdgpu_vm_pt_next(adev, cursor);
> >> - if (cursor->pfn != ~0ll)
> >> - while (amdgpu_vm_pt_descendant(adev, cursor));
> >> -}
> >> -
> >> -/**
> >> - * for_each_amdgpu_vm_pt_leaf - walk over all leaf PDs/PTs in the
> >> hierarchy
> >> - */
> >> -#define for_each_amdgpu_vm_pt_leaf(adev, vm, start, end, cursor)
> >> \
> >> - for (amdgpu_vm_pt_first_leaf((adev), (vm), (start), &(cursor));
> >> \
> >> - (cursor).pfn <= end; amdgpu_vm_pt_next_leaf((adev),
> >> &(cursor)))
> >> -
> >> /**
> >> * amdgpu_vm_pt_first_dfs - start a deep first search
> >> *
> >> @@ -915,74 +874,51 @@ static void amdgpu_vm_bo_param(struct
> >> amdgpu_device *adev, struct amdgpu_vm *vm,
> >> * Returns:
> >> * 0 on success, errno otherwise.
> >> */
> >> -int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
> >> - struct amdgpu_vm *vm,
> >> - uint64_t saddr, uint64_t size)
> >> +static int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
> >> + struct amdgpu_vm *vm,
> >> + struct amdgpu_vm_pt_cursor *cursor)
> >> {
> >> - struct amdgpu_vm_pt_cursor cursor;
> >> + struct amdgpu_vm_pt *entry = cursor->entry;
> >> + struct amdgpu_bo_param bp;
> >> struct amdgpu_bo *pt;
> >> - uint64_t eaddr;
> >> int r;
> >>
> >> - /* validate the parameters */
> >> - if (saddr & AMDGPU_GPU_PAGE_MASK || size &
> >> AMDGPU_GPU_PAGE_MASK)
> >> - return -EINVAL;
> >> + if (cursor->level < AMDGPU_VM_PTB && !entry->entries) {
> >> + unsigned num_entries;
> >>
> >> - eaddr = saddr + size - 1;
> >> -
> >> - saddr /= AMDGPU_GPU_PAGE_SIZE;
> >> - eaddr /= AMDGPU_GPU_PAGE_SIZE;
> >> -
> >> - if (eaddr >= adev->vm_manager.max_pfn) {
> >> - dev_err(adev->dev, "va above limit (0x%08llX >=
> >> 0x%08llX)\n",
> >> - eaddr, adev->vm_manager.max_pfn);
> >> - return -EINVAL;
> >> + num_entries = amdgpu_vm_num_entries(adev, cursor-
> >>> level);
> >> + entry->entries = kvmalloc_array(num_entries,
> >> + sizeof(*entry->entries),
> >> + GFP_KERNEL | __GFP_ZERO);
> >> + if (!entry->entries)
> >> + return -ENOMEM;
> >> }
> >>
> >> - for_each_amdgpu_vm_pt_leaf(adev, vm, saddr, eaddr, cursor) {
> >> - struct amdgpu_vm_pt *entry = cursor.entry;
> >> - struct amdgpu_bo_param bp;
> >> -
> >> - if (cursor.level < AMDGPU_VM_PTB) {
> >> - unsigned num_entries;
> >> -
> >> - num_entries = amdgpu_vm_num_entries(adev,
> >> cursor.level);
> >> - entry->entries = kvmalloc_array(num_entries,
> >> - sizeof(*entry-
> >>> entries),
> >> - GFP_KERNEL |
> >> - __GFP_ZERO);
> >> - if (!entry->entries)
> >> - return -ENOMEM;
> >> - }
> >> -
> >> -
> >> - if (entry->base.bo)
> >> - continue;
> >> -
> >> - amdgpu_vm_bo_param(adev, vm, cursor.level, &bp);
> >> -
> >> - r = amdgpu_bo_create(adev, &bp, &pt);
> >> - if (r)
> >> - return r;
> >> -
> >> - if (vm->use_cpu_for_update) {
> >> - r = amdgpu_bo_kmap(pt, NULL);
> >> - if (r)
> >> - goto error_free_pt;
> >> - }
> >> + if (entry->base.bo)
> >> + return 0;
> >>
> >> - /* Keep a reference to the root directory to avoid
> >> - * freeing them up in the wrong order.
> >> - */
> >> - pt->parent = amdgpu_bo_ref(cursor.parent->base.bo);
> >> + amdgpu_vm_bo_param(adev, vm, cursor->level, &bp);
> >>
> >> - amdgpu_vm_bo_base_init(&entry->base, vm, pt);
> >> + r = amdgpu_bo_create(adev, &bp, &pt);
> >> + if (r)
> >> + return r;
> >>
> >> - r = amdgpu_vm_clear_bo(adev, vm, pt);
> >> + if (vm->use_cpu_for_update) {
> >> + r = amdgpu_bo_kmap(pt, NULL);
> >> if (r)
> >> goto error_free_pt;
> >> }
> >>
> >> + /* Keep a reference to the root directory to avoid
> >> + * freeing them up in the wrong order.
> >> + */
> >> + pt->parent = amdgpu_bo_ref(cursor->parent->base.bo);
> >> + amdgpu_vm_bo_base_init(&entry->base, vm, pt);
> >> +
> >> + r = amdgpu_vm_clear_bo(adev, vm, pt);
> >> + if (r)
> >> + goto error_free_pt;
> >> +
> >> return 0;
> >>
> >> error_free_pt:
> >> @@ -1627,6 +1563,7 @@ static int amdgpu_vm_update_ptes(struct
> >> amdgpu_pte_update_params *params,
> >> struct amdgpu_vm_pt_cursor cursor;
> >> uint64_t frag_start = start, frag_end;
> >> unsigned int frag;
> >> + int r;
> >>
> >> /* figure out the initial fragment */
> >> amdgpu_vm_fragment(params, frag_start, end, flags, &frag,
> >> &frag_end); @@ -1634,12 +1571,15 @@ static int
> >> amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
> >> /* walk over the address space and update the PTs */
> >> amdgpu_vm_pt_start(adev, params->vm, start, &cursor);
> >> while (cursor.pfn < end) {
> >> - struct amdgpu_bo *pt = cursor.entry->base.bo;
> >> unsigned shift, parent_shift, mask;
> >> uint64_t incr, entry_end, pe_start;
> >> + struct amdgpu_bo *pt;
> >>
> >> - if (!pt)
> >> - return -ENOENT;
> >> + r = amdgpu_vm_alloc_pts(params->adev, params->vm,
> >> &cursor);
> >> + if (r)
> >> + return r;
> >> +
> >> + pt = cursor.entry->base.bo;
> >>
> >> /* The root level can't be a huge page */
> >> if (cursor.level == adev->vm_manager.root_level) { diff --git
> >> a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> >> index 81ff8177f092..116605c038d2 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> >> @@ -303,9 +303,6 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm);
> int
> >> amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct
> >> amdgpu_vm *vm,
> >> int (*callback)(void *p, struct amdgpu_bo *bo),
> >> void *param);
> >> -int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
> >> - struct amdgpu_vm *vm,
> >> - uint64_t saddr, uint64_t size);
> >> int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job
> >> *job, bool need_pipe_sync); int amdgpu_vm_update_directories(struct
> >> amdgpu_device *adev,
> >> struct amdgpu_vm *vm);
> >> --
> >> 2.17.1
> >>
> >> _______________________________________________
> >> amd-gfx mailing list
> >> amd-gfx at lists.freedesktop.org
> >> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
More information about the amd-gfx
mailing list