[PATCH] drm/amdgpu: revert "rework synchronization of VM updates v2"
Tom St Denis
tom.stdenis at amd.com
Wed Jan 15 14:36:50 UTC 2020
Tested-by: Tom St Denis <tom.stdenis at amd.com>
On 2020-01-15 9:34 a.m., Christian König wrote:
> This reverts commit 75a0d3f5e4a8dc70c22842ec1fde2866f27f48b9.
>
> It's causing VM page faults, revert until further investigated.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 37 ++++++----------------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 3 ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 7 ++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 38 ++++++++++++-----------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 4 +--
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c | 22 +++++++++++++----
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 15 +++++++++---
> 7 files changed, 61 insertions(+), 65 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index c70bbdda078c..46c76e2e1281 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -1403,49 +1403,28 @@ void amdgpu_bo_fence(struct amdgpu_bo *bo, struct dma_fence *fence,
> }
>
> /**
> - * amdgpu_bo_sync_wait_resv - Wait for BO reservation fences
> + * amdgpu_sync_wait_resv - Wait for BO reservation fences
> *
> - * @adev: amdgpu device pointer
> - * @resv: reservation object to sync to
> - * @sync_mode: synchronization mode
> + * @bo: buffer object
> * @owner: fence owner
> * @intr: Whether the wait is interruptible
> *
> - * Extract the fences from the reservation object and waits for them to finish.
> - *
> * Returns:
> * 0 on success, errno otherwise.
> */
> -int amdgpu_bo_sync_wait_resv(struct amdgpu_device *adev, struct dma_resv *resv,
> - enum amdgpu_sync_mode sync_mode, void *owner,
> - bool intr)
> +int amdgpu_bo_sync_wait(struct amdgpu_bo *bo, void *owner, bool intr)
> {
> + struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> struct amdgpu_sync sync;
> int r;
>
> amdgpu_sync_create(&sync);
> - amdgpu_sync_resv(adev, &sync, resv, sync_mode, owner);
> - r = amdgpu_sync_wait(&sync, true);
> + amdgpu_sync_resv(adev, &sync, bo->tbo.base.resv,
> + AMDGPU_SYNC_NE_OWNER, owner);
> + r = amdgpu_sync_wait(&sync, intr);
> amdgpu_sync_free(&sync);
> - return r;
> -}
>
> -/**
> - * amdgpu_bo_sync_wait - Wrapper for amdgpu_bo_sync_wait_resv
> - * @bo: buffer object to wait for
> - * @owner: fence owner
> - * @intr: Whether the wait is interruptible
> - *
> - * Wrapper to wait for fences in a BO.
> - * Returns:
> - * 0 on success, errno otherwise.
> - */
> -int amdgpu_bo_sync_wait(struct amdgpu_bo *bo, void *owner, bool intr)
> -{
> - struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> -
> - return amdgpu_bo_sync_wait_resv(adev, bo->tbo.base.resv,
> - AMDGPU_SYNC_NE_OWNER, owner, intr);
> + return r;
> }
>
> /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index 96d805889e8d..2eeafc77c9c1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -283,9 +283,6 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo);
> int amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object *bo);
> void amdgpu_bo_fence(struct amdgpu_bo *bo, struct dma_fence *fence,
> bool shared);
> -int amdgpu_bo_sync_wait_resv(struct amdgpu_device *adev, struct dma_resv *resv,
> - enum amdgpu_sync_mode sync_mode, void *owner,
> - bool intr);
> int amdgpu_bo_sync_wait(struct amdgpu_bo *bo, void *owner, bool intr);
> u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo);
> int amdgpu_bo_validate(struct amdgpu_bo *bo);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
> index 5816df9f8531..c124f64e7aae 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
> @@ -240,6 +240,13 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync,
> owner != AMDGPU_FENCE_OWNER_UNDEFINED)
> continue;
>
> + /* VM updates only sync with moves but not with user
> + * command submissions or KFD evictions fences
> + */
> + if (fence_owner != AMDGPU_FENCE_OWNER_UNDEFINED &&
> + owner == AMDGPU_FENCE_OWNER_VM)
> + continue;
> +
> /* Ignore fences depending on the sync mode */
> switch (mode) {
> case AMDGPU_SYNC_ALWAYS:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index a74cafa0e09e..5cb182231f5d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -797,7 +797,7 @@ static int amdgpu_vm_clear_bo(struct amdgpu_device *adev,
> params.vm = vm;
> params.direct = direct;
>
> - r = vm->update_funcs->prepare(¶ms, NULL, AMDGPU_SYNC_EXPLICIT);
> + r = vm->update_funcs->prepare(¶ms, AMDGPU_FENCE_OWNER_KFD, NULL);
> if (r)
> return r;
>
> @@ -1293,7 +1293,7 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
> params.vm = vm;
> params.direct = direct;
>
> - r = vm->update_funcs->prepare(¶ms, NULL, AMDGPU_SYNC_EXPLICIT);
> + r = vm->update_funcs->prepare(¶ms, AMDGPU_FENCE_OWNER_VM, NULL);
> if (r)
> return r;
>
> @@ -1539,7 +1539,7 @@ static int amdgpu_vm_update_ptes(struct amdgpu_vm_update_params *params,
> * @adev: amdgpu_device pointer
> * @vm: requested vm
> * @direct: direct submission in a page fault
> - * @resv: fences we need to sync to
> + * @exclusive: fence we need to sync to
> * @start: start of mapped range
> * @last: last mapped entry
> * @flags: flags for the entries
> @@ -1554,14 +1554,14 @@ static int amdgpu_vm_update_ptes(struct amdgpu_vm_update_params *params,
> */
> static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
> struct amdgpu_vm *vm, bool direct,
> - struct dma_resv *resv,
> + struct dma_fence *exclusive,
> uint64_t start, uint64_t last,
> uint64_t flags, uint64_t addr,
> dma_addr_t *pages_addr,
> struct dma_fence **fence)
> {
> struct amdgpu_vm_update_params params;
> - enum amdgpu_sync_mode sync_mode;
> + void *owner = AMDGPU_FENCE_OWNER_VM;
> int r;
>
> memset(¶ms, 0, sizeof(params));
> @@ -1570,13 +1570,9 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
> params.direct = direct;
> params.pages_addr = pages_addr;
>
> - /* Implicitly sync to command submissions in the same VM before
> - * unmapping. Sync to moving fences before mapping.
> - */
> + /* sync to everything except eviction fences on unmapping */
> if (!(flags & AMDGPU_PTE_VALID))
> - sync_mode = AMDGPU_SYNC_EQ_OWNER;
> - else
> - sync_mode = AMDGPU_SYNC_EXPLICIT;
> + owner = AMDGPU_FENCE_OWNER_KFD;
>
> amdgpu_vm_eviction_lock(vm);
> if (vm->evicting) {
> @@ -1584,7 +1580,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
> goto error_unlock;
> }
>
> - r = vm->update_funcs->prepare(¶ms, resv, sync_mode);
> + r = vm->update_funcs->prepare(¶ms, owner, exclusive);
> if (r)
> goto error_unlock;
>
> @@ -1603,7 +1599,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
> * amdgpu_vm_bo_split_mapping - split a mapping into smaller chunks
> *
> * @adev: amdgpu_device pointer
> - * @resv: fences we need to sync to
> + * @exclusive: fence we need to sync to
> * @pages_addr: DMA addresses to use for mapping
> * @vm: requested vm
> * @mapping: mapped range and flags to use for the update
> @@ -1619,7 +1615,7 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
> * 0 for success, -EINVAL for failure.
> */
> static int amdgpu_vm_bo_split_mapping(struct amdgpu_device *adev,
> - struct dma_resv *resv,
> + struct dma_fence *exclusive,
> dma_addr_t *pages_addr,
> struct amdgpu_vm *vm,
> struct amdgpu_bo_va_mapping *mapping,
> @@ -1695,7 +1691,7 @@ static int amdgpu_vm_bo_split_mapping(struct amdgpu_device *adev,
> }
>
> last = min((uint64_t)mapping->last, start + max_entries - 1);
> - r = amdgpu_vm_bo_update_mapping(adev, vm, false, resv,
> + r = amdgpu_vm_bo_update_mapping(adev, vm, false, exclusive,
> start, last, flags, addr,
> dma_addr, fence);
> if (r)
> @@ -1734,8 +1730,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
> dma_addr_t *pages_addr = NULL;
> struct ttm_mem_reg *mem;
> struct drm_mm_node *nodes;
> - struct dma_fence **last_update;
> - struct dma_resv *resv;
> + struct dma_fence *exclusive, **last_update;
> uint64_t flags;
> struct amdgpu_device *bo_adev = adev;
> int r;
> @@ -1743,6 +1738,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
> if (clear || !bo) {
> mem = NULL;
> nodes = NULL;
> + exclusive = NULL;
> } else {
> struct ttm_dma_tt *ttm;
>
> @@ -1752,6 +1748,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
> ttm = container_of(bo->tbo.ttm, struct ttm_dma_tt, ttm);
> pages_addr = ttm->dma_address;
> }
> + exclusive = bo->tbo.moving;
> }
>
> if (bo) {
> @@ -1761,14 +1758,11 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
> flags |= AMDGPU_PTE_TMZ;
>
> bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);
> - resv = bo->tbo.base.resv;
> } else {
> flags = 0x0;
> - resv = NULL;
> }
>
> - if (clear || (bo && bo->tbo.base.resv ==
> - vm->root.base.bo->tbo.base.resv))
> + if (clear || (bo && bo->tbo.base.resv == vm->root.base.bo->tbo.base.resv))
> last_update = &vm->last_update;
> else
> last_update = &bo_va->last_pt_update;
> @@ -1782,7 +1776,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
> }
>
> list_for_each_entry(mapping, &bo_va->invalids, list) {
> - r = amdgpu_vm_bo_split_mapping(adev, resv, pages_addr, vm,
> + r = amdgpu_vm_bo_split_mapping(adev, exclusive, pages_addr, vm,
> mapping, flags, bo_adev, nodes,
> last_update);
> if (r)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index b5705fcfc935..c21a36bebc0c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -230,8 +230,8 @@ struct amdgpu_vm_update_params {
>
> struct amdgpu_vm_update_funcs {
> int (*map_table)(struct amdgpu_bo *bo);
> - int (*prepare)(struct amdgpu_vm_update_params *p, struct dma_resv *resv,
> - enum amdgpu_sync_mode sync_mode);
> + int (*prepare)(struct amdgpu_vm_update_params *p, void * owner,
> + struct dma_fence *exclusive);
> int (*update)(struct amdgpu_vm_update_params *p,
> struct amdgpu_bo *bo, uint64_t pe, uint64_t addr,
> unsigned count, uint32_t incr, uint64_t flags);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> index e38516304070..68b013be3837 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
> @@ -44,14 +44,26 @@ static int amdgpu_vm_cpu_map_table(struct amdgpu_bo *table)
> * Returns:
> * Negativ errno, 0 for success.
> */
> -static int amdgpu_vm_cpu_prepare(struct amdgpu_vm_update_params *p,
> - struct dma_resv *resv,
> - enum amdgpu_sync_mode sync_mode)
> +static int amdgpu_vm_cpu_prepare(struct amdgpu_vm_update_params *p, void *owner,
> + struct dma_fence *exclusive)
> {
> - if (!resv)
> + int r;
> +
> + /* Wait for any BO move to be completed */
> + if (exclusive) {
> + r = dma_fence_wait(exclusive, true);
> + if (unlikely(r))
> + return r;
> + }
> +
> + /* Don't wait for submissions during page fault */
> + if (p->direct)
> return 0;
>
> - return amdgpu_bo_sync_wait_resv(p->adev, resv, sync_mode, p->vm, true);
> + /* Wait for PT BOs to be idle. PTs share the same resv. object
> + * as the root PD BO
> + */
> + return amdgpu_bo_sync_wait(p->vm->root.base.bo, owner, true);
> }
>
> /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> index 4cc7881f438c..ab6481751763 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
> @@ -58,9 +58,9 @@ static int amdgpu_vm_sdma_map_table(struct amdgpu_bo *table)
> * Negativ errno, 0 for success.
> */
> static int amdgpu_vm_sdma_prepare(struct amdgpu_vm_update_params *p,
> - struct dma_resv *resv,
> - enum amdgpu_sync_mode sync_mode)
> + void *owner, struct dma_fence *exclusive)
> {
> + struct amdgpu_bo *root = p->vm->root.base.bo;
> unsigned int ndw = AMDGPU_VM_SDMA_MIN_NUM_DW;
> int r;
>
> @@ -70,10 +70,17 @@ static int amdgpu_vm_sdma_prepare(struct amdgpu_vm_update_params *p,
>
> p->num_dw_left = ndw;
>
> - if (!resv)
> + /* Wait for moves to be completed */
> + r = amdgpu_sync_fence(&p->job->sync, exclusive, false);
> + if (r)
> + return r;
> +
> + /* Don't wait for any submissions during page fault handling */
> + if (p->direct)
> return 0;
>
> - return amdgpu_sync_resv(p->adev, &p->job->sync, resv, sync_mode, p->vm);
> + return amdgpu_sync_resv(p->adev, &p->job->sync, root->tbo.base.resv,
> + AMDGPU_SYNC_NE_OWNER, owner);
> }
>
> /**
More information about the amd-gfx
mailing list