[PATCH] drm/amdgpu: optionally do a writeback but don't invalidate TC for IB fences
Christian König
ckoenig.leichtzumerken at gmail.com
Wed Apr 4 07:28:36 UTC 2018
Am 03.04.2018 um 22:25 schrieb Marek Olšák:
> From: Marek Olšák <marek.olsak at amd.com>
>
> There is a new IB flag that enables this new behavior.
> Full invalidation is unnecessary for RELEASE_MEM and doesn't make sense
> when draw calls from two adjacent gfx IBs run in parallel. This will be
> the new default for Mesa.
>
> v2: bump the version
>
> Signed-off-by: Marek Olšák <marek.olsak at amd.com>
Looks good to me, but I would split it into two patches: one that
implements all the changes in the common code, and a second that implements
the handling in gfx_v9_0.c and bumps the version number.
But that's only a nice-to-have; the patch is Reviewed-by: Christian König
<christian.koenig at amd.com> anyway.
Regards,
Christian.
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 5 +++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 8 ++++++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 4 +++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 11 +++++++----
> drivers/gpu/drm/amd/amdgpu/soc15d.h | 1 +
> include/uapi/drm/amdgpu_drm.h | 4 ++++
> 8 files changed, 27 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 43df7d2aebb4..0a45f5cceba7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -68,23 +68,24 @@
> * - 3.16.0 - Add reserved vmid support
> * - 3.17.0 - Add AMDGPU_NUM_VRAM_CPU_PAGE_FAULTS.
> * - 3.18.0 - Export gpu always on cu bitmap
> * - 3.19.0 - Add support for UVD MJPEG decode
> * - 3.20.0 - Add support for local BOs
> * - 3.21.0 - Add DRM_AMDGPU_FENCE_TO_HANDLE ioctl
> * - 3.22.0 - Add DRM_AMDGPU_SCHED ioctl
> * - 3.23.0 - Add query for VRAM lost counter
> * - 3.24.0 - Add high priority compute support for gfx9
> * - 3.25.0 - Add support for sensor query info (stable pstate sclk/mclk).
> + * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE.
> */
> #define KMS_DRIVER_MAJOR 3
> -#define KMS_DRIVER_MINOR 25
> +#define KMS_DRIVER_MINOR 26
> #define KMS_DRIVER_PATCHLEVEL 0
>
> int amdgpu_vram_limit = 0;
> int amdgpu_vis_vram_limit = 0;
> int amdgpu_gart_size = -1; /* auto */
> int amdgpu_gtt_size = -1; /* auto */
> int amdgpu_moverate = -1; /* auto */
> int amdgpu_benchmarking = 0;
> int amdgpu_testing = 0;
> int amdgpu_audio = -1;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 97449e06a242..d09fcab2398f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -124,39 +124,40 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
>
> /**
> * amdgpu_fence_emit - emit a fence on the requested ring
> *
> * @ring: ring the fence is associated with
> * @f: resulting fence object
> *
> * Emits a fence command on the requested ring (all asics).
> * Returns 0 on success, -ENOMEM on failure.
> */
> -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f)
> +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
> + unsigned flags)
> {
> struct amdgpu_device *adev = ring->adev;
> struct amdgpu_fence *fence;
> struct dma_fence *old, **ptr;
> uint32_t seq;
>
> fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
> if (fence == NULL)
> return -ENOMEM;
>
> seq = ++ring->fence_drv.sync_seq;
> fence->ring = ring;
> dma_fence_init(&fence->base, &amdgpu_fence_ops,
> &ring->fence_drv.lock,
> adev->fence_context + ring->idx,
> seq);
> amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> - seq, AMDGPU_FENCE_FLAG_INT);
> + seq, flags | AMDGPU_FENCE_FLAG_INT);
>
> ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask];
> /* This function can't be called concurrently anyway, otherwise
> * emitting the fence would mess up the hardware ring buffer.
> */
> old = rcu_dereference_protected(*ptr, 1);
> if (old && !dma_fence_is_signaled(old)) {
> DRM_INFO("rcu slot is busy\n");
> dma_fence_wait(old, false);
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 311589e02d17..f70eeed9ed76 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -120,20 +120,21 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
> struct dma_fence **f)
> {
> struct amdgpu_device *adev = ring->adev;
> struct amdgpu_ib *ib = &ibs[0];
> struct dma_fence *tmp = NULL;
> bool skip_preamble, need_ctx_switch;
> unsigned patch_offset = ~0;
> struct amdgpu_vm *vm;
> uint64_t fence_ctx;
> uint32_t status = 0, alloc_size;
> + unsigned fence_flags = 0;
>
> unsigned i;
> int r = 0;
> bool need_pipe_sync = false;
>
> if (num_ibs == 0)
> return -EINVAL;
>
> /* ring tests don't use a job */
> if (job) {
> @@ -220,36 +221,39 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
> }
>
> if (ring->funcs->emit_tmz)
> amdgpu_ring_emit_tmz(ring, false);
>
> #ifdef CONFIG_X86_64
> if (!(adev->flags & AMD_IS_APU))
> #endif
> amdgpu_asic_invalidate_hdp(adev, ring);
>
> - r = amdgpu_fence_emit(ring, f);
> + if (ib->flags & AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE)
> + fence_flags |= AMDGPU_FENCE_FLAG_TC_WB_ONLY;
> +
> + r = amdgpu_fence_emit(ring, f, fence_flags);
> if (r) {
> dev_err(adev->dev, "failed to emit fence (%d)\n", r);
> if (job && job->vmid)
> amdgpu_vmid_reset(adev, ring->funcs->vmhub, job->vmid);
> amdgpu_ring_undo(ring);
> return r;
> }
>
> if (ring->funcs->insert_end)
> ring->funcs->insert_end(ring);
>
> /* wrap the last IB with fence */
> if (job && job->uf_addr) {
> amdgpu_ring_emit_fence(ring, job->uf_addr, job->uf_sequence,
> - AMDGPU_FENCE_FLAG_64BIT);
> + fence_flags | AMDGPU_FENCE_FLAG_64BIT);
> }
>
> if (patch_offset != ~0 && ring->funcs->patch_cond_exec)
> amdgpu_ring_patch_cond_exec(ring, patch_offset);
>
> ring->current_ctx = fence_ctx;
> if (vm && ring->funcs->emit_switch_buffer)
> amdgpu_ring_emit_switch_buffer(ring);
> amdgpu_ring_commit(ring);
> return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 1d0d250cbfdf..222052daedd1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -33,20 +33,21 @@
> #define AMDGPU_MAX_COMPUTE_RINGS 8
> #define AMDGPU_MAX_VCE_RINGS 3
> #define AMDGPU_MAX_UVD_ENC_RINGS 2
>
> /* some special values for the owner field */
> #define AMDGPU_FENCE_OWNER_UNDEFINED ((void*)0ul)
> #define AMDGPU_FENCE_OWNER_VM ((void*)1ul)
>
> #define AMDGPU_FENCE_FLAG_64BIT (1 << 0)
> #define AMDGPU_FENCE_FLAG_INT (1 << 1)
> +#define AMDGPU_FENCE_FLAG_TC_WB_ONLY (1 << 2)
>
> enum amdgpu_ring_type {
> AMDGPU_RING_TYPE_GFX,
> AMDGPU_RING_TYPE_COMPUTE,
> AMDGPU_RING_TYPE_SDMA,
> AMDGPU_RING_TYPE_UVD,
> AMDGPU_RING_TYPE_VCE,
> AMDGPU_RING_TYPE_KIQ,
> AMDGPU_RING_TYPE_UVD_ENC,
> AMDGPU_RING_TYPE_VCN_DEC,
> @@ -81,21 +82,22 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev);
> void amdgpu_fence_driver_fini(struct amdgpu_device *adev);
> void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
>
> int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
> unsigned num_hw_submission);
> int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
> struct amdgpu_irq_src *irq_src,
> unsigned irq_type);
> void amdgpu_fence_driver_suspend(struct amdgpu_device *adev);
> void amdgpu_fence_driver_resume(struct amdgpu_device *adev);
> -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence);
> +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
> + unsigned flags);
> int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s);
> void amdgpu_fence_process(struct amdgpu_ring *ring);
> int amdgpu_fence_wait_empty(struct amdgpu_ring *ring);
> signed long amdgpu_fence_wait_polling(struct amdgpu_ring *ring,
> uint32_t wait_seq,
> signed long timeout);
> unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring);
>
> /*
> * Rings.
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 24474294c92a..fe05351ea4d2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -620,21 +620,21 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_
>
> if (vm_flush_needed) {
> trace_amdgpu_vm_flush(ring, job->vmid, job->vm_pd_addr);
> amdgpu_ring_emit_vm_flush(ring, job->vmid, job->vm_pd_addr);
> }
>
> if (pasid_mapping_needed)
> amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
>
> if (vm_flush_needed || pasid_mapping_needed) {
> - r = amdgpu_fence_emit(ring, &fence);
> + r = amdgpu_fence_emit(ring, &fence, 0);
> if (r)
> return r;
> }
>
> if (vm_flush_needed) {
> mutex_lock(&id_mgr->lock);
> dma_fence_put(id->last_flush);
> id->last_flush = dma_fence_get(fence);
> id->current_gpu_reset_count =
> atomic_read(&adev->gpu_reset_counter);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9d39fd5b1822..5dea0d4c0af4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -3767,27 +3767,30 @@ static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
> lower_32_bits(ib->gpu_addr));
> amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
> amdgpu_ring_write(ring, control);
> }
>
> static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
> u64 seq, unsigned flags)
> {
> bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
> bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
> + bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
>
> /* RELEASE_MEM - flush caches, send int */
> amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
> - amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> - EOP_TC_ACTION_EN |
> - EOP_TC_WB_ACTION_EN |
> - EOP_TC_MD_ACTION_EN |
> + amdgpu_ring_write(ring, ((writeback ? (EOP_TC_WB_ACTION_EN |
> + EOP_TC_NC_ACTION_EN) :
> + (EOP_TCL1_ACTION_EN |
> + EOP_TC_ACTION_EN |
> + EOP_TC_WB_ACTION_EN |
> + EOP_TC_MD_ACTION_EN)) |
> EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
> EVENT_INDEX(5)));
> amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
>
> /*
> * the address should be Qword aligned if 64bit write, Dword
> * aligned if only send 32bit data low (discard data high)
> */
> if (write64bit)
> BUG_ON(addr & 0x7);
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h
> index 7f408f85fdb6..839a144c1645 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15d.h
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h
> @@ -152,20 +152,21 @@
> * 4 - *S_PARTIAL_FLUSH
> */
> #define PACKET3_RELEASE_MEM 0x49
> #define EVENT_TYPE(x) ((x) << 0)
> #define EVENT_INDEX(x) ((x) << 8)
> #define EOP_TCL1_VOL_ACTION_EN (1 << 12)
> #define EOP_TC_VOL_ACTION_EN (1 << 13) /* L2 */
> #define EOP_TC_WB_ACTION_EN (1 << 15) /* L2 */
> #define EOP_TCL1_ACTION_EN (1 << 16)
> #define EOP_TC_ACTION_EN (1 << 17) /* L2 */
> +#define EOP_TC_NC_ACTION_EN (1 << 19)
> #define EOP_TC_MD_ACTION_EN (1 << 21) /* L2 metadata */
>
> #define DATA_SEL(x) ((x) << 29)
> /* 0 - discard
> * 1 - send low 32bit data
> * 2 - send 64bit data
> * 3 - send 64bit GPU counter value
> * 4 - send 64bit sys counter value
> */
> #define INT_SEL(x) ((x) << 24)
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index 0087799962cf..f5901bd9c7d8 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -516,20 +516,24 @@ union drm_amdgpu_cs {
>
> /* This IB should be submitted to CE */
> #define AMDGPU_IB_FLAG_CE (1<<0)
>
> /* Preamble flag, which means the IB could be dropped if no context switch */
> #define AMDGPU_IB_FLAG_PREAMBLE (1<<1)
>
> /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */
> #define AMDGPU_IB_FLAG_PREEMPT (1<<2)
>
> +/* The IB fence should do the L2 writeback but not invalidate any shader
> + * caches (L2/vL1/sL1/I$). */
> +#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
> +
> struct drm_amdgpu_cs_chunk_ib {
> __u32 _pad;
> /** AMDGPU_IB_FLAG_* */
> __u32 flags;
> /** Virtual address to begin IB execution */
> __u64 va_start;
> /** Size of submission */
> __u32 ib_bytes;
> /** HW IP to submit to */
> __u32 ip_type;
More information about the amd-gfx mailing list