<div dir="auto"><div> <div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Wed., Jul. 17, 2019, 03:15 Christian König, <<a href="mailto:ckoenig.leichtzumerken@gmail.com">ckoenig.leichtzumerken@gmail.com</a>> wrote: </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Am 17.07.19 um 02:06 schrieb Marek Olšák: > From: Marek Olšák <<a href="mailto:marek.olsak@amd.com" target="_blank" rel="noreferrer">marek.olsak@amd.com</a>> > > Hopefully we'll only use 1 gfx ring, because otherwise we'd have to have > separate GDS buffers for each gfx ring. > > This is a workaround to ensure stability of transform feedback. Shaders hang > waiting for a GDS instruction (ds_sub, not ds_ordered_count). > > The disadvantage is that compute IBs might get a different VMID, > because now gfx always has GDS and compute doesn't. So far compute is only using GWS, but I don't think that those reservations are a good idea in general. How severe is the ENOMEM problem you see with using an explicit GWS allocation? </blockquote></div></div><div dir="auto"> </div><div dir="auto">I guess you mean GDS or OA.</div><div dir="auto"> </div><div dir="auto">There is no ENOMEM, it just hangs. I don't know why. The shader is waiting for ds_sub and can't continue, but GDS is idle.</div><div dir="auto"> </div><div dir="auto">Marek</div><div dir="auto"> </div><div dir="auto"><div class="gmail_quote"><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"> Regards, Christian. > > Signed-off-by: Marek Olšák <<a href="mailto:marek.olsak@amd.com" target="_blank" rel="noreferrer">marek.olsak@amd.com</a>> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 10 ++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h | 6 ++++++ > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 20 ++++++++++++++++++++ > 4 files changed, 37 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 4b514a44184c..cbd55d061b72 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -456,20 +456,21 @@ struct amdgpu_cs_parser { > struct drm_file *filp; > struct amdgpu_ctx *ctx; > > /* chunks */ > unsigned nchunks; > struct amdgpu_cs_chunk *chunks; > > /* scheduler job object */ > struct amdgpu_job *job; > struct drm_sched_entity *entity; > + unsigned hw_ip; > > /* buffer objects */ > struct ww_acquire_ctx ticket; > struct amdgpu_bo_list *bo_list; > struct amdgpu_mn *mn; > struct amdgpu_bo_list_entry vm_pd; > struct list_head validated; > struct dma_fence *fence; > uint64_t bytes_moved_threshold; > uint64_t bytes_moved_vis_threshold; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > index c691df6f7a57..9125cd69a124 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > @@ -678,20 +678,28 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, > if (r) > goto error_validate; > > amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved, > p->bytes_moved_vis); > > gds = p->bo_list->gds_obj; > gws = p->bo_list->gws_obj; > oa = p->bo_list->oa_obj; > > + if (p->hw_ip == AMDGPU_HW_IP_GFX) { > + /* Only gfx10 allocates these. */ > + if (!gds) > + gds = p->adev->gds.gds_gfx_bo; > + if (!oa) > + oa = p->adev->gds.oa_gfx_bo; > + } > + > amdgpu_bo_list_for_each_entry(e, p->bo_list) { > struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e-><a href="http://tv.bo" rel="noreferrer noreferrer" target="_blank">tv.bo</a>); > > /* Make sure we use the exclusive slot for shared BOs */ > if (bo->prime_shared_count) > e->tv.num_shared = 0; > e->bo_va = amdgpu_vm_bo_find(vm, bo); > } > > if (gds) { > @@ -954,20 +962,22 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev, > struct drm_amdgpu_cs_chunk_ib *chunk_ib; > struct drm_sched_entity *entity; > > chunk = &parser->chunks[i]; > ib = &parser->job->ibs[j]; > chunk_ib = (struct drm_amdgpu_cs_chunk_ib *)chunk->kdata; > > if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB) > continue; > > + parser->hw_ip = chunk_ib->ip_type; > + > if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX && > (amdgpu_mcbp || amdgpu_sriov_vf(adev))) { > if (chunk_ib->flags & AMDGPU_IB_FLAG_PREEMPT) { > if (chunk_ib->flags & AMDGPU_IB_FLAG_CE) > ce_preempt++; > else > de_preempt++; > } > > /* each GFX command submit allows 0 or 1 IB preemptible for CE & DE */ > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h > index df8a23554831..0943b8e1d97e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h > @@ -26,20 +26,26 @@ > > struct amdgpu_ring; > struct amdgpu_bo; > > struct amdgpu_gds { > uint32_t gds_size; > uint32_t gws_size; > uint32_t oa_size; > uint32_t gds_compute_max_wave_id; > uint32_t vgt_gs_max_wave_id; > + > + /* Reserved OA for the gfx ring. (gfx10) */ > + uint32_t gds_gfx_reservation_size; > + uint32_t oa_gfx_reservation_size; > + struct amdgpu_bo *gds_gfx_bo; > + struct amdgpu_bo *oa_gfx_bo; > }; > > struct amdgpu_gds_reg_offset { > uint32_t mem_base; > uint32_t mem_size; > uint32_t gws; > uint32_t oa; > }; > > #endif /* __AMDGPU_GDS_H__ */ > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > index 618291df659b..3952754c04ff 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > @@ -1314,20 +1314,33 @@ static int gfx_v10_0_sw_init(void *handle) > > kiq = &adev->gfx.kiq; > r = amdgpu_gfx_kiq_init_ring(adev, &kiq->ring, &kiq->irq); > if (r) > return r; > > r = amdgpu_gfx_mqd_sw_init(adev, sizeof(struct v10_compute_mqd)); > if (r) > return r; > > + /* allocate reserved GDS resources for transform feedback */ > + r = amdgpu_bo_create_kernel(adev, adev->gds.gds_gfx_reservation_size, > + 4, AMDGPU_GEM_DOMAIN_GDS, > + &adev->gds.gds_gfx_bo, NULL, NULL); > + if (r) > + return r; > + > + r = amdgpu_bo_create_kernel(adev, adev->gds.oa_gfx_reservation_size, > + 1, AMDGPU_GEM_DOMAIN_OA, > + &adev->gds.oa_gfx_bo, NULL, NULL); > + if (r) > + return r; > + > /* allocate visible FB for rlc auto-loading fw */ > if (adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO) { > r = gfx_v10_0_rlc_backdoor_autoload_buffer_init(adev); > if (r) > return r; > } > > adev->gfx.ce_ram_size = F32_CE_PROGRAM_RAM_SIZE; > > gfx_v10_0_gpu_early_init(adev); > @@ -1354,20 +1367,23 @@ static void gfx_v10_0_me_fini(struct amdgpu_device *adev) > amdgpu_bo_free_kernel(&adev->gfx.me.me_fw_obj, > &adev->gfx.me.me_fw_gpu_addr, > (void **)&adev->gfx.me.me_fw_ptr); > } > > static int gfx_v10_0_sw_fini(void *handle) > { > int i; > struct amdgpu_device *adev = (struct amdgpu_device *)handle; > > + amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL); > + amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL); > + > for (i = 0; i < adev->gfx.num_gfx_rings; i++) > amdgpu_ring_fini(&adev->gfx.gfx_ring[i]); > for (i = 0; i < adev->gfx.num_compute_rings; i++) > amdgpu_ring_fini(&adev->gfx.compute_ring[i]); > > amdgpu_gfx_mqd_sw_fini(adev); > amdgpu_gfx_kiq_free_ring(&adev->gfx.kiq.ring, &adev->gfx.kiq.irq); > amdgpu_gfx_kiq_fini(adev); > > gfx_v10_0_pfp_fini(adev); > @@ -5181,20 +5197,24 @@ static void gfx_v10_0_set_gds_init(struct amdgpu_device *adev) > case CHIP_NAVI10: > default: > adev->gds.gds_size = 0x10000; > adev->gds.gds_compute_max_wave_id = 0x4ff; > adev->gds.vgt_gs_max_wave_id = 0x3ff; > break; > } > > adev->gds.gws_size = 64; > adev->gds.oa_size = 16; > + > + /* Reserved for transform feedback. */ > + adev->gds.gds_gfx_reservation_size = 256; > + adev->gds.oa_gfx_reservation_size = 4; > } > > static void gfx_v10_0_set_user_wgp_inactive_bitmap_per_sh(struct amdgpu_device *adev, > u32 bitmap) > { > u32 data; > > if (!bitmap) > return; > </blockquote></div></div></div>