<div dir="auto"><div><br><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Wed., Jul. 17, 2019, 03:15 Christian König, <<a href="mailto:ckoenig.leichtzumerken@gmail.com">ckoenig.leichtzumerken@gmail.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Am 17.07.19 um 02:06 schrieb Marek Olšák:<br>
> From: Marek Olšák <<a href="mailto:marek.olsak@amd.com" target="_blank" rel="noreferrer">marek.olsak@amd.com</a>><br>
><br>
> Hopefully we'll only use 1 gfx ring, because otherwise we'd have to have<br>
> separate GDS buffers for each gfx ring.<br>
><br>
> This is a workaround to ensure stability of transform feedback. Shaders hang<br>
> waiting for a GDS instruction (ds_sub, not ds_ordered_count).<br>
><br>
> The disadvantage is that compute IBs might get a different VMID,<br>
> because now gfx always has GDS and compute doesn't.<br>
<br>
So far compute is only using GWS, but I don't think that those <br>
reservations are a good idea in general.<br>
<br>
How severe is the ENOMEM problem you see with using an explicit GWS <br>
allocation?<br></blockquote></div></div><div dir="auto"><br></div><div dir="auto">I guess you mean GDS or OA.</div><div dir="auto"><br></div><div dir="auto">There is no ENOMEM, it just hangs. I don't know why. The shader is waiting for ds_sub and can't continue, but GDS is idle.</div><div dir="auto"><br></div><div dir="auto">Marek</div><div dir="auto"><br></div><div dir="auto"><div class="gmail_quote"><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br>
Regards,<br>
Christian.<br>
<br>
><br>
> Signed-off-by: Marek Olšák <<a href="mailto:marek.olsak@amd.com" target="_blank" rel="noreferrer">marek.olsak@amd.com</a>><br>
> ---<br>
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +<br>
> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 10 ++++++++++<br>
> drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h | 6 ++++++<br>
> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 20 ++++++++++++++++++++<br>
> 4 files changed, 37 insertions(+)<br>
><br>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
> index 4b514a44184c..cbd55d061b72 100644<br>
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
> @@ -456,20 +456,21 @@ struct amdgpu_cs_parser {<br>
> struct drm_file *filp;<br>
> struct amdgpu_ctx *ctx;<br>
> <br>
> /* chunks */<br>
> unsigned nchunks;<br>
> struct amdgpu_cs_chunk *chunks;<br>
> <br>
> /* scheduler job object */<br>
> struct amdgpu_job *job;<br>
> struct drm_sched_entity *entity;<br>
> + unsigned hw_ip;<br>
> <br>
> /* buffer objects */<br>
> struct ww_acquire_ctx ticket;<br>
> struct amdgpu_bo_list *bo_list;<br>
> struct amdgpu_mn *mn;<br>
> struct amdgpu_bo_list_entry vm_pd;<br>
> struct list_head validated;<br>
> struct dma_fence *fence;<br>
> uint64_t bytes_moved_threshold;<br>
> uint64_t bytes_moved_vis_threshold;<br>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c<br>
> index c691df6f7a57..9125cd69a124 100644<br>
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c<br>
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c<br>
> @@ -678,20 +678,28 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,<br>
> if (r)<br>
> goto error_validate;<br>
> <br>
> amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,<br>
> p->bytes_moved_vis);<br>
> <br>
> gds = p->bo_list->gds_obj;<br>
> gws = p->bo_list->gws_obj;<br>
> oa = p->bo_list->oa_obj;<br>
> <br>
> + if (p->hw_ip == AMDGPU_HW_IP_GFX) {<br>
> + /* Only gfx10 allocates these. */<br>
> + if (!gds)<br>
> + gds = p->adev->gds.gds_gfx_bo;<br>
> + if (!oa)<br>
> + oa = p->adev->gds.oa_gfx_bo;<br>
> + }<br>
> +<br>
> amdgpu_bo_list_for_each_entry(e, p->bo_list) {<br>
> struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e-><a href="http://tv.bo" rel="noreferrer noreferrer" target="_blank">tv.bo</a>);<br>
> <br>
> /* Make sure we use the exclusive slot for shared BOs */<br>
> if (bo->prime_shared_count)<br>
> e->tv.num_shared = 0;<br>
> e->bo_va = amdgpu_vm_bo_find(vm, bo);<br>
> }<br>
> <br>
> if (gds) {<br>
> @@ -954,20 +962,22 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,<br>
> struct drm_amdgpu_cs_chunk_ib *chunk_ib;<br>
> struct drm_sched_entity *entity;<br>
> <br>
> chunk = &parser->chunks[i];<br>
> ib = &parser->job->ibs[j];<br>
> chunk_ib = (struct drm_amdgpu_cs_chunk_ib *)chunk->kdata;<br>
> <br>
> if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB)<br>
> continue;<br>
> <br>
> + parser->hw_ip = chunk_ib->ip_type;<br>
> +<br>
> if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX &&<br>
> (amdgpu_mcbp || amdgpu_sriov_vf(adev))) {<br>
> if (chunk_ib->flags & AMDGPU_IB_FLAG_PREEMPT) {<br>
> if (chunk_ib->flags & AMDGPU_IB_FLAG_CE)<br>
> ce_preempt++;<br>
> else<br>
> de_preempt++;<br>
> }<br>
> <br>
> /* each GFX command submit allows 0 or 1 IB preemptible for CE & DE */<br>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h<br>
> index df8a23554831..0943b8e1d97e 100644<br>
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h<br>
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h<br>
> @@ -26,20 +26,26 @@<br>
> <br>
> struct amdgpu_ring;<br>
> struct amdgpu_bo;<br>
> <br>
> struct amdgpu_gds {<br>
> uint32_t gds_size;<br>
> uint32_t gws_size;<br>
> uint32_t oa_size;<br>
> uint32_t gds_compute_max_wave_id;<br>
> uint32_t vgt_gs_max_wave_id;<br>
> +<br>
> + /* Reserved OA for the gfx ring. (gfx10) */<br>
> + uint32_t gds_gfx_reservation_size;<br>
> + uint32_t oa_gfx_reservation_size;<br>
> + struct amdgpu_bo *gds_gfx_bo;<br>
> + struct amdgpu_bo *oa_gfx_bo;<br>
> };<br>
> <br>
> struct amdgpu_gds_reg_offset {<br>
> uint32_t mem_base;<br>
> uint32_t mem_size;<br>
> uint32_t gws;<br>
> uint32_t oa;<br>
> };<br>
> <br>
> #endif /* __AMDGPU_GDS_H__ */<br>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c<br>
> index 618291df659b..3952754c04ff 100644<br>
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c<br>
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c<br>
> @@ -1314,20 +1314,33 @@ static int gfx_v10_0_sw_init(void *handle)<br>
> <br>
> kiq = &adev->gfx.kiq;<br>
> r = amdgpu_gfx_kiq_init_ring(adev, &kiq->ring, &kiq->irq);<br>
> if (r)<br>
> return r;<br>
> <br>
> r = amdgpu_gfx_mqd_sw_init(adev, sizeof(struct v10_compute_mqd));<br>
> if (r)<br>
> return r;<br>
> <br>
> + /* allocate reserved GDS resources for transform feedback */<br>
> + r = amdgpu_bo_create_kernel(adev, adev->gds.gds_gfx_reservation_size,<br>
> + 4, AMDGPU_GEM_DOMAIN_GDS,<br>
> + &adev->gds.gds_gfx_bo, NULL, NULL);<br>
> + if (r)<br>
> + return r;<br>
> +<br>
> + r = amdgpu_bo_create_kernel(adev, adev->gds.oa_gfx_reservation_size,<br>
> + 1, AMDGPU_GEM_DOMAIN_OA,<br>
> + &adev->gds.oa_gfx_bo, NULL, NULL);<br>
> + if (r)<br>
> + return r;<br>
> +<br>
> /* allocate visible FB for rlc auto-loading fw */<br>
> if (adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO) {<br>
> r = gfx_v10_0_rlc_backdoor_autoload_buffer_init(adev);<br>
> if (r)<br>
> return r;<br>
> }<br>
> <br>
> adev->gfx.ce_ram_size = F32_CE_PROGRAM_RAM_SIZE;<br>
> <br>
> gfx_v10_0_gpu_early_init(adev);<br>
> @@ -1354,20 +1367,23 @@ static void gfx_v10_0_me_fini(struct amdgpu_device *adev)<br>
> amdgpu_bo_free_kernel(&adev->gfx.me.me_fw_obj,<br>
> &adev->gfx.me.me_fw_gpu_addr,<br>
> (void **)&adev->gfx.me.me_fw_ptr);<br>
> }<br>
> <br>
> static int gfx_v10_0_sw_fini(void *handle)<br>
> {<br>
> int i;<br>
> struct amdgpu_device *adev = (struct amdgpu_device *)handle;<br>
> <br>
> + amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL);<br>
> + amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL);<br>
> +<br>
> for (i = 0; i < adev->gfx.num_gfx_rings; i++)<br>
> amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);<br>
> for (i = 0; i < adev->gfx.num_compute_rings; i++)<br>
> amdgpu_ring_fini(&adev->gfx.compute_ring[i]);<br>
> <br>
> amdgpu_gfx_mqd_sw_fini(adev);<br>
> amdgpu_gfx_kiq_free_ring(&adev->gfx.kiq.ring, &adev->gfx.kiq.irq);<br>
> amdgpu_gfx_kiq_fini(adev);<br>
> <br>
> gfx_v10_0_pfp_fini(adev);<br>
> @@ -5181,20 +5197,24 @@ static void gfx_v10_0_set_gds_init(struct amdgpu_device *adev)<br>
> case CHIP_NAVI10:<br>
> default:<br>
> adev->gds.gds_size = 0x10000;<br>
> adev->gds.gds_compute_max_wave_id = 0x4ff;<br>
> adev->gds.vgt_gs_max_wave_id = 0x3ff;<br>
> break;<br>
> }<br>
> <br>
> adev->gds.gws_size = 64;<br>
> adev->gds.oa_size = 16;<br>
> +<br>
> + /* Reserved for transform feedback. */<br>
> + adev->gds.gds_gfx_reservation_size = 256;<br>
> + adev->gds.oa_gfx_reservation_size = 4;<br>
> }<br>
> <br>
> static void gfx_v10_0_set_user_wgp_inactive_bitmap_per_sh(struct amdgpu_device *adev,<br>
> u32 bitmap)<br>
> {<br>
> u32 data;<br>
> <br>
> if (!bitmap)<br>
> return;<br>
> <br>
<br>
</blockquote></div></div></div>