[PATCH 2/2] drm/amdgpu: Fix silent amdgpu_bo_move failures
Christian König
ckoenig.leichtzumerken at gmail.com
Tue Jul 16 09:29:19 UTC 2019
On 13.07.19 at 08:42, Kuehling, Felix wrote:
> Under memory pressure, buffer moves between RAM and VRAM can
> fail when there is no GTT space available. In those cases
> amdgpu_bo_move falls back to ttm_bo_move_memcpy, which seems to
> succeed, although it doesn't really support non-contiguous or
> CPU-invisible VRAM. This manifests as VM faults with corrupted page
> table entries in KFD eviction stress tests.
>
> Print some helpful messages when lack of GTT space is causing buffer
> moves to fail. Check that source and destination memory regions are
> supported by ttm_bo_move_memcpy before taking that fallback.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
Reviewed-by: Christian König <christian.koenig at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 40 +++++++++++++++++++++++--
> 1 file changed, 37 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 78440748c87f..37d9a3b09946 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -498,6 +498,7 @@ static int amdgpu_move_vram_ram(struct ttm_buffer_object *bo, bool evict,
> placements.flags = TTM_PL_MASK_CACHING | TTM_PL_FLAG_TT;
> r = ttm_bo_mem_space(bo, &placement, &tmp_mem, ctx);
> if (unlikely(r)) {
> + pr_err("Failed to find GTT space for blit from VRAM\n");
> return r;
> }
>
> @@ -556,6 +557,7 @@ static int amdgpu_move_ram_vram(struct ttm_buffer_object *bo, bool evict,
> placements.flags = TTM_PL_MASK_CACHING | TTM_PL_FLAG_TT;
> r = ttm_bo_mem_space(bo, &placement, &tmp_mem, ctx);
> if (unlikely(r)) {
> + pr_err("Failed to find GTT space for blit to VRAM\n");
> return r;
> }
>
> @@ -575,6 +577,30 @@ static int amdgpu_move_ram_vram(struct ttm_buffer_object *bo, bool evict,
> return r;
> }
>
> +/**
> + * amdgpu_mem_visible - Check that memory can be accessed by ttm_bo_move_memcpy
> + *
> + * Called by amdgpu_bo_move()
> + */
> +static bool amdgpu_mem_visible(struct amdgpu_device *adev,
> + struct ttm_mem_reg *mem)
> +{
> + struct drm_mm_node *nodes = mem->mm_node;
> +
> + if (mem->mem_type == TTM_PL_SYSTEM ||
> + mem->mem_type == TTM_PL_TT)
> + return true;
> + if (mem->mem_type != TTM_PL_VRAM)
> + return false;
> +
> + /* ttm_mem_reg_ioremap only supports contiguous memory */
> + if (nodes->size != mem->num_pages)
> + return false;
> +
> + return ((nodes->start + nodes->size) << PAGE_SHIFT)
> + <= adev->gmc.visible_vram_size;
> +}
> +
> /**
> * amdgpu_bo_move - Move a buffer object to a new memory location
> *
> @@ -619,8 +645,10 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
> return 0;
> }
>
> - if (!adev->mman.buffer_funcs_enabled)
> + if (!adev->mman.buffer_funcs_enabled) {
> + r = -ENODEV;
> goto memcpy;
> + }
>
> if (old_mem->mem_type == TTM_PL_VRAM &&
> new_mem->mem_type == TTM_PL_SYSTEM) {
> @@ -635,10 +663,16 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
>
> if (r) {
> memcpy:
> - r = ttm_bo_move_memcpy(bo, ctx, new_mem);
> - if (r) {
> + /* Check that all memory is CPU accessible */
> + if (!amdgpu_mem_visible(adev, old_mem) ||
> + !amdgpu_mem_visible(adev, new_mem)) {
> + pr_err("Move buffer fallback to memcpy unavailable\n");
> return r;
> }
> +
> + r = ttm_bo_move_memcpy(bo, ctx, new_mem);
> + if (r)
> + return r;
> }
>
> if (bo->type == ttm_bo_type_device &&
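
For anyone following along: the condition amdgpu_mem_visible() enforces for VRAM is simply "one contiguous allocation that ends below the CPU-visible VRAM limit", since ttm_mem_reg_ioremap can only map such memory for the memcpy fallback. A rough standalone sketch of just that check, using simplified stand-in types rather than the real drm_mm_node/ttm_mem_reg structures, could look like this:

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SHIFT 12

/* Simplified stand-ins for drm_mm_node and ttm_mem_reg, illustration only */
struct node {
	uint64_t start;		/* first page of the allocation */
	uint64_t size;		/* size of this node in pages */
};

struct region {
	struct node *first;	/* first (and, if contiguous, only) node */
	uint64_t num_pages;	/* total size of the BO in pages */
};

/*
 * memcpy via an ioremap of the region only works when the whole BO is a
 * single contiguous node that lies entirely below the CPU-visible limit.
 */
static bool vram_cpu_visible(const struct region *r, uint64_t visible_vram_size)
{
	if (r->first->size != r->num_pages)
		return false;	/* split across nodes -> not contiguous */

	return ((r->first->start + r->first->size) << PAGE_SHIFT) <=
		visible_vram_size;
}

The names above (struct node, struct region, vram_cpu_visible) are placeholders for illustration; the actual check in the patch operates on mem->mm_node, mem->num_pages and adev->gmc.visible_vram_size as shown in the hunk.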