Regression: Re: [PATCH 2/3] drm/amdgpu: fix amdgpu_ttm_bind

Felix Kuehling felix.kuehling at amd.com
Thu Aug 31 22:50:12 UTC 2017


I believe this patch breaks pinned GTT BOs. It trips KFD tests that
apply a lot of memory pressure on one of our test systems.

I understand the cause of the problem (see my analysis below), but I'm
not sure how to solve it, without reverting the patch. Christian, do you
have any ideas? Should we at least revert this as a short-term fix?

I added some WARNs in the code to understand what's happening:

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 72ad045..3ead155 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -930,6 +930,7 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
                return;
 
        abo = container_of(bo, struct amdgpu_bo, tbo);
+        WARN(abo->pin_count, "Moving pinned bo");
        amdgpu_vm_bo_invalidate(adev, abo);
 
        amdgpu_bo_kunmap(abo);
diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
index c934ad5..af9f40e 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_util.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
@@ -63,6 +63,7 @@ int ttm_bo_move_ttm(struct ttm_buffer_object *bo,
 
                ttm_tt_unbind(ttm);
                ttm_bo_free_old_node(bo);
+                WARN(old_mem->placement & TTM_PL_FLAG_NO_EVICT, "Stripping NO_EVICT flag from BO");
                ttm_flag_masked(&old_mem->placement, TTM_PL_FLAG_SYSTEM,
                                TTM_PL_MASK_MEM);
                old_mem->mem_type = TTM_PL_SYSTEM;

With that I see the following scenario unfold in my kernel log, where
pinned GTT BOs are swapped out and kunmapped under memory pressure.
Eventually this leads to memory access faults while accessing ring
buffers or IBs:

[    8.475453] Stripping NO_EVICT flag from BO
[    8.475461] ------------[ cut here ]------------
[    8.475465] WARNING: CPU: 5 PID: 2186 at /home/fkuehlin/compute/kernel/drivers/gpu/drm/ttm/ttm_bo_util.c:66 ttm_bo_move_ttm+0x14a/0x160 [ttm]
[    8.475466] Modules linked in: amdkfd(E) amd_iommu_v2(E) amdgpu(E+) radeon(E) ttm(E)
[    8.475472] CPU: 5 PID: 2186 Comm: systemd-udevd Tainted: G        W   E   4.12.0-kfd-fkuehlin #1
[    8.475473] Hardware name: ASUS All Series/X99-E WS/USB 3.1, BIOS 2006 04/07/2016
[    8.475474] task: ffff91bf12a82e00 task.stack: ffffb8bd43600000
[    8.475477] RIP: 0010:ttm_bo_move_ttm+0x14a/0x160 [ttm]
[    8.475478] RSP: 0018:ffffb8bd43603780 EFLAGS: 00010282
[    8.475480] RAX: 000000000000001f RBX: ffff91bf0b0ee058 RCX: 0000000000000006
[    8.475481] RDX: 0000000000000006 RSI: ffff91bf12a83678 RDI: 0000000000000202
[    8.475482] RBP: ffffb8bd436037a0 R08: 0000000000000001 R09: 0000000000000000
[    8.475483] R10: 0000000000000000 R11: 0000000000000000 R12: ffffb8bd436037e8
[    8.475484] R13: 0000000000000000 R14: ffff91bf12c1bf00 R15: ffff91bf0b0ee000
[    8.475485] FS:  00007f71b381b8c0(0000) GS:ffff91bf17340000(0000) knlGS:0000000000000000
[    8.475486] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    8.475487] CR2: 0000557e2ad63a28 CR3: 000000060d153000 CR4: 00000000001406e0
[    8.475488] Call Trace:
[    8.475514]  amdgpu_ttm_bind+0x129/0x1a0 [amdgpu]
[    8.475518]  ? ttm_bo_mem_compat+0x20/0x50 [ttm]
[    8.475522]  ? sparse_add_one_section+0x166/0x167
[    8.475547]  amdgpu_bo_pin_restricted+0x1ca/0x2f0 [amdgpu]
[    8.475575]  amdgpu_bo_create_reserved+0x95/0x220 [amdgpu]
[    8.475602]  amdgpu_bo_create_kernel+0x10/0x70 [amdgpu]
[    8.475632]  amdgpu_gfx_compute_mqd_sw_init+0xf2/0x170 [amdgpu]
[    8.475663]  gfx_v8_0_sw_init+0xca8/0x15d0 [amdgpu]
[    8.475690]  amdgpu_device_init+0xe6f/0x1620 [amdgpu]
[    8.475717]  amdgpu_driver_load_kms+0x63/0x200 [amdgpu]
[    8.475720]  drm_dev_register+0x143/0x1d0
[    8.475744]  amdgpu_pci_probe+0x110/0x140 [amdgpu]
[    8.475747]  local_pci_probe+0x40/0xa0
[    8.475749]  ? pci_match_device+0xdb/0x100
[    8.475752]  pci_device_probe+0x136/0x160
[    8.475755]  driver_probe_device+0x299/0x440
[    8.475757]  __driver_attach+0xde/0xe0
[    8.475759]  ? driver_probe_device+0x440/0x440
[    8.475761]  bus_for_each_dev+0x61/0xa0
[    8.475763]  driver_attach+0x19/0x20
[    8.475764]  bus_add_driver+0x1f6/0x270
[    8.475766]  ? 0xffffffffc0839000
[    8.475768]  driver_register+0x5b/0xd0
[    8.475769]  ? 0xffffffffc0839000
[    8.475771]  __pci_register_driver+0x5b/0x60
[    8.475799]  amdgpu_init+0x8a/0x1000 [amdgpu]
[    8.475801]  do_one_initcall+0x4e/0x190
[    8.475804]  ? kmem_cache_alloc+0xd8/0x170
[    8.475807]  do_init_module+0x55/0x1e9
[    8.475810]  load_module+0x2771/0x29c0
[    8.475817]  SYSC_finit_module+0xbc/0xf0
[    8.475819]  ? SYSC_finit_module+0xbc/0xf0
[    8.475823]  SyS_finit_module+0x9/0x10
[    8.475825]  entry_SYSCALL_64_fastpath+0x1f/0xbe
[...]
[  278.693790] Moving pinned bo (this kunmaps the BO)
[  278.693802] ------------[ cut here ]------------
[  278.693833] WARNING: CPU: 11 PID: 2145 at /home/fkuehlin/compute/kernel/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c:933 amdgpu_bo_move_notify+0x7c/0x90 [amdgpu]
[  278.693834] Modules linked in: fuse(E) x86_pkg_temp_thermal(E) amdkfd(E) amd_iommu_v2(E) amdgpu(E) radeon(E) ttm(E)
[  278.693843] CPU: 11 PID: 2145 Comm: kworker/11:2 Tainted: G        W   E   4.12.0-kfd-fkuehlin #1
[  278.693845] Hardware name: ASUS All Series/X99-E WS/USB 3.1, BIOS 2006 04/07/2016
[  278.693851] Workqueue: kfd_process_wq kfd_process_wq_release [amdkfd]
[  278.693853] task: ffff91bf0dab1700 task.stack: ffffb8bd4495c000
[  278.693880] RIP: 0010:amdgpu_bo_move_notify+0x7c/0x90 [amdgpu]
[  278.693882] RSP: 0018:ffffb8bd4495fc20 EFLAGS: 00010282
[  278.693884] RAX: 0000000000000010 RBX: ffff91bf0bec7058 RCX: 0000000000000006
[  278.693885] RDX: 0000000000000006 RSI: ffff91bf0dab1f50 RDI: 0000000000000202
[  278.693886] RBP: ffffb8bd4495fc40 R08: 0000000000000001 R09: 0000000000000000
[  278.693887] R10: ffffb8bd4495fb98 R11: 0000000000000000 R12: ffff91bf0b0c2a68
[  278.693889] R13: 0000000000000000 R14: ffff91bf0bec708c R15: ffff91bf104bb000
[  278.693890] FS:  0000000000000000(0000) GS:ffff91bf174c0000(0000) knlGS:0000000000000000
[  278.693891] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  278.693892] CR2: 0000000004199000 CR3: 00000001f3e0f000 CR4: 00000000001406e0
[  278.693893] Call Trace:
[  278.693898]  ttm_bo_cleanup_memtype_use+0x1f/0x70 [ttm]
[  278.693902]  ttm_bo_unref+0x388/0x470 [ttm]
[  278.693931]  amdgpu_bo_unref+0x25/0x40 [amdgpu]
[  278.693962]  amdgpu_amdkfd_gpuvm_free_memory_of_gpu+0x1b5/0x260 [amdgpu]
[  278.693974]  kfd_process_free_outstanding_kfd_bos+0xab/0xe0 [amdkfd]
[  278.693980]  kfd_process_wq_release+0x56/0x110 [amdkfd]
[  278.693983]  process_one_work+0x190/0x430
[  278.693984]  ? process_one_work+0x133/0x430
[  278.693988]  worker_thread+0x46/0x400
[  278.693991]  kthread+0x10a/0x140
[  278.693993]  ? process_one_work+0x430/0x430
[  278.693995]  ? kthread_create_on_node+0x40/0x40
[  278.693996]  ? umh_complete+0x40/0x40
[  278.693998]  ? call_usermodehelper_exec_async+0x137/0x140
[  278.694000]  ret_from_fork+0x2a/0x40
[...]
[  694.740232] BUG: unable to handle kernel paging request at ffffb8bd4354d540
[  694.740387] IP: sdma_v3_0_ring_set_wptr+0x64/0x80 [amdgpu]
[  694.740430] PGD 616c23067 
[  694.740432] P4D 616c23067 
[  694.740454] PUD 616c24067 
[  694.740476] PMD 613704067 
[  694.740498] PTE 0

[  694.740551] Oops: 0002 [#1] SMP
[  694.740576] Modules linked in: fuse(E) x86_pkg_temp_thermal(E) amdkfd(E) amd_iommu_v2(E) amdgpu(E) radeon(E) ttm(E)
[  694.740667] CPU: 10 PID: 2213 Comm: sdma0 Tainted: G        W   E   4.12.0-kfd-fkuehlin #1
[  694.740727] Hardware name: ASUS All Series/X99-E WS/USB 3.1, BIOS 2006 04/07/2016
[  694.740782] task: ffff91bf0d989700 task.stack: ffffb8bd43730000
[  694.740895] RIP: 0010:sdma_v3_0_ring_set_wptr+0x64/0x80 [amdgpu]
[  694.740941] RSP: 0018:ffffb8bd43733dc0 EFLAGS: 00010202
[  694.740982] RAX: ffff91bf0b0c7770 RBX: ffff91bf0b0c7770 RCX: ffffb8bd4354d540
[  694.741035] RDX: 0000000000001ac0 RSI: 00000000000006b0 RDI: ffff91bf0b0c0000
[  694.741087] RBP: ffffb8bd43733dc0 R08: ffffffffc072bd60 R09: 0000000000000000
[  694.741140] R10: ffff91bf0b0c0000 R11: ffffffffc064ef47 R12: ffff91bef6166000
[  694.741192] R13: ffff91bef6166368 R14: 0000000000000001 R15: ffff91bf0b0c7770
[  694.741246] FS:  0000000000000000(0000) GS:ffff91bf17480000(0000) knlGS:0000000000000000
[  694.741305] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  694.741349] CR2: ffffb8bd4354d540 CR3: 000000060e4a6000 CR4: 00000000001406e0
[  694.741401] Call Trace:
[  694.741480]  amdgpu_ring_commit+0x3a/0x60 [amdgpu]
[  694.741575]  amdgpu_ib_schedule+0x2c7/0x450 [amdgpu]
[  694.741687]  amdgpu_job_run+0x72/0x110 [amdgpu]
[  694.741793]  amd_sched_main+0x343/0x470 [amdgpu]
[  694.741838]  ? wake_atomic_t_function+0x60/0x60
[  694.741876]  kthread+0x10a/0x140
[  694.741969]  ? amd_sched_process_job+0x60/0x60 [amdgpu]
[  694.742010]  ? kthread_create_on_node+0x40/0x40
[  694.742048]  ret_from_fork+0x2a/0x40



On 2017-08-23 05:02 AM, Christian König wrote:
> From: Christian König <christian.koenig at amd.com>
>
> Use ttm_bo_mem_space instead of manually allocating GART space.
>
> This allows us to evict BOs when there isn't enought GART space any more.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 14 +++++--------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c     | 31 +++++++++++++++++++++++------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h     |  4 ----
>  3 files changed, 30 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
> index 9e05e25..0d15eb7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
> @@ -108,10 +108,10 @@ bool amdgpu_gtt_mgr_is_allocated(struct ttm_mem_reg *mem)
>   *
>   * Allocate the address space for a node.
>   */
> -int amdgpu_gtt_mgr_alloc(struct ttm_mem_type_manager *man,
> -			 struct ttm_buffer_object *tbo,
> -			 const struct ttm_place *place,
> -			 struct ttm_mem_reg *mem)
> +static int amdgpu_gtt_mgr_alloc(struct ttm_mem_type_manager *man,
> +				struct ttm_buffer_object *tbo,
> +				const struct ttm_place *place,
> +				struct ttm_mem_reg *mem)
>  {
>  	struct amdgpu_device *adev = amdgpu_ttm_adev(man->bdev);
>  	struct amdgpu_gtt_mgr *mgr = man->priv;
> @@ -143,12 +143,8 @@ int amdgpu_gtt_mgr_alloc(struct ttm_mem_type_manager *man,
>  					fpfn, lpfn, mode);
>  	spin_unlock(&mgr->lock);
>  
> -	if (!r) {
> +	if (!r)
>  		mem->start = node->start;
> -		if (&tbo->mem == mem)
> -			tbo->offset = (tbo->mem.start << PAGE_SHIFT) +
> -			    tbo->bdev->man[tbo->mem.mem_type].gpu_offset;
> -	}
>  
>  	return r;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 38d26a7..aa0b7f5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -809,20 +809,39 @@ bool amdgpu_ttm_is_bound(struct ttm_tt *ttm)
>  
>  int amdgpu_ttm_bind(struct ttm_buffer_object *bo, struct ttm_mem_reg *bo_mem)
>  {
> +	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
>  	struct ttm_tt *ttm = bo->ttm;
> +	struct ttm_mem_reg tmp;
> +
> +	struct ttm_placement placement;
> +	struct ttm_place placements;
>  	int r;
>  
>  	if (!ttm || amdgpu_ttm_is_bound(ttm))
>  		return 0;
>  
> -	r = amdgpu_gtt_mgr_alloc(&bo->bdev->man[TTM_PL_TT], bo,
> -				 NULL, bo_mem);
> -	if (r) {
> -		DRM_ERROR("Failed to allocate GTT address space (%d)\n", r);
> +	tmp = bo->mem;
> +	tmp.mm_node = NULL;
> +	placement.num_placement = 1;
> +	placement.placement = &placements;
> +	placement.num_busy_placement = 1;
> +	placement.busy_placement = &placements;
> +	placements.fpfn = 0;
> +	placements.lpfn = adev->mc.gart_size >> PAGE_SHIFT;
> +	placements.flags = TTM_PL_MASK_CACHING | TTM_PL_FLAG_TT;
> +
> +	r = ttm_bo_mem_space(bo, &placement, &tmp, true, false);
> +	if (unlikely(r))
>  		return r;
> -	}
>  
> -	return amdgpu_ttm_do_bind(ttm, bo_mem);
> +	r = ttm_bo_move_ttm(bo, true, false, &tmp);
> +	if (unlikely(r))
> +		ttm_bo_mem_put(bo, &tmp);
> +	else
> +		bo->offset = (bo->mem.start << PAGE_SHIFT) +
> +			bo->bdev->man[bo->mem.mem_type].gpu_offset;
> +
> +	return r;
>  }
>  
>  int amdgpu_ttm_recover_gart(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> index f22a475..43093bf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> @@ -62,10 +62,6 @@ extern const struct ttm_mem_type_manager_func amdgpu_gtt_mgr_func;
>  extern const struct ttm_mem_type_manager_func amdgpu_vram_mgr_func;
>  
>  bool amdgpu_gtt_mgr_is_allocated(struct ttm_mem_reg *mem);
> -int amdgpu_gtt_mgr_alloc(struct ttm_mem_type_manager *man,
> -			 struct ttm_buffer_object *tbo,
> -			 const struct ttm_place *place,
> -			 struct ttm_mem_reg *mem);
>  uint64_t amdgpu_gtt_mgr_usage(struct ttm_mem_type_manager *man);
>  
>  uint64_t amdgpu_vram_mgr_usage(struct ttm_mem_type_manager *man);



More information about the amd-gfx mailing list