[PATCH 4/5] drm/amdgpu: Implement OS triggered MCBP (v5)

Zhu, Jiadong Jiadong.Zhu at amd.com
Fri Sep 23 09:24:42 UTC 2022


[AMD Official Use Only - General]

Inlined.

Thanks,
Jiadong

-----Original Message-----
From: Koenig, Christian <Christian.Koenig at amd.com>
Sent: Wednesday, September 21, 2022 9:01 PM
To: Zhu, Jiadong <Jiadong.Zhu at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Tuikov, Luben <Luben.Tuikov at amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>
Subject: Re: [PATCH 4/5] drm/amdgpu: Implement OS triggered MCBP (v5)

Am 21.09.22 um 11:41 schrieb jiadong.zhu at amd.com:
> From: "Jiadong.Zhu" <Jiadong.Zhu at amd.com>
>
> Trigger Mid-Command Buffer Preemption according to the priority of the
> software rings and the hw fence signalling condition.
>
> The muxer saves the locations of the indirect buffer frames from the
> software ring together with the fence sequence number in its fifo
> queue, and pops out those records when the fences are signalled. The
> locations are used to resubmit packages in preemption scenarios by copying the chunks from the software ring.

Maybe change the subject a bit. The MCBP is not really triggered by the core Linux kernel.

Maybe write instead "MCBP based on DRM scheduler".

>
> v2: Update comment style.
> v3: Fix conflict caused by previous modifications.
> v4: Remove unnecessary prints.
> v5: Fix corner cases for resubmission cases.
>
> Cc: Christian Koenig <Christian.Koenig at amd.com>
> Cc: Luben Tuikov <Luben.Tuikov at amd.com>
> Cc: Andrey Grodzovsky <Andrey.Grodzovsky at amd.com>
> Acked-by: Luben Tuikov <luben.tuikov at amd.com>
> Signed-off-by: Jiadong.Zhu <Jiadong.Zhu at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/Makefile          |   2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c       |   2 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mcbp.c     |  91 +++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mcbp.h     |  29 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c     |  12 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h     |   3 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 186 ++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h |  24 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c  |  27 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c       |   2 +
>   10 files changed, 372 insertions(+), 6 deletions(-)
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_mcbp.c
>   create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_mcbp.h
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile
> b/drivers/gpu/drm/amd/amdgpu/Makefile
> index 85224bc81ce5..24c5aa19bbf2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -59,7 +59,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
>       amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
>       amdgpu_fw_attestation.o amdgpu_securedisplay.o \
>       amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
> -     amdgpu_sw_ring.o amdgpu_ring_mux.o
> +     amdgpu_sw_ring.o amdgpu_ring_mux.o amdgpu_mcbp.o

This functionality is spread over too many files. Probably better to move this into the amdgpu_ring_mux.c as well.

>
>   amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 258cffe3c06a..af86d87e2f3b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -211,6 +211,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>               }
>       }
>
> +     amdgpu_ring_ib_begin(ring);
>       if (job && ring->funcs->init_cond_exec)
>               patch_offset = amdgpu_ring_init_cond_exec(ring);
>
> @@ -285,6 +286,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>           ring->hw_prio == AMDGPU_GFX_PIPE_PRIO_HIGH)
>               ring->funcs->emit_wave_limit(ring, false);
>
> +     amdgpu_ring_ib_end(ring);
>       amdgpu_ring_commit(ring);
>       return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mcbp.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mcbp.c
> new file mode 100644
> index 000000000000..121b1a4e0f04
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mcbp.c
> @@ -0,0 +1,91 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person
> +obtaining a
> + * copy of this software and associated documentation files (the
> +"Software"),
> + * to deal in the Software without restriction, including without
> +limitation
> + * the rights to use, copy, modify, merge, publish, distribute,
> +sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom
> +the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be
> +included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> +EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> +MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
> +SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> +DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> +OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
> +OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#include <linux/delay.h>
> +#include <linux/kernel.h>
> +#include <linux/firmware.h>
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <drm/gpu_scheduler.h>
> +
> +#include "amdgpu.h"
> +#include "amdgpu_mcbp.h"
> +#include "amdgpu_ring.h"
> +
> +/* Trigger Mid-Command Buffer Preemption (MCBP) and find if we need
> +to resubmit. */ int amdgpu_mcbp_trigger_preempt(struct
> +amdgpu_ring_mux *mux) {
> +     struct amdgpu_mux_entry *e;
> +     struct amdgpu_ring *ring = NULL;
> +     int i;
> +
> +     spin_lock(&mux->lock);
> +
> +     amdgpu_ring_preempt_ib(mux->real_ring);
> +
> +     for (i = 0; i < mux->num_ring_entries; i++) {
> +             e = &mux->ring_entry[i];
> +             if (e->ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
> +                     ring = e->ring;
> +                     break;
> +             }
> +     }
> +
> +     if (!ring) {
> +             DRM_ERROR("cannot find low priority ring\n");
> +             spin_unlock(&mux->lock);
> +             return -ENOENT;
> +     }
> +
> +     amdgpu_fence_process(ring);

> That's usually an extremely bad idea since fence processing should only be kicked off from the interrupt handler.

> Otherwise you have the interrupt handler and this thread here racing to signal fences.

We have to block low priority ibs copied to the real ring and check the seq no signaled here.
I will refactor to use the trailing fence irq to handle this.

> +
> +     if (atomic_read(&ring->fence_drv.last_seq) !=
> +         ring->fence_drv.sync_seq) {
> +             mux->s_resubmit = true;
> +             mux->seq_no_resubmit = ring->fence_drv.sync_seq;

> Don't touch any fence handling internals here. If you need to know which fences are signaled and which aren't look into amdgpu_fence.c
I would use amdgpu_fence_count_emitted in the irq handler to meet this.


> +             amdgpu_ring_mux_schedule_resubmit(mux);
> +     }
> +
> +     spin_unlock(&mux->lock);
> +     return 0;
> +}
> +
> +/*scan on low prio rings to have unsignaled fence and high ring has
> +no fence.*/

What exactly should that comment mean?

> +int amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux) {
> +     struct amdgpu_ring *ring;
> +     uint32_t seq, last_seq;
> +     int i, need_preempt;
> +
> +     need_preempt = 0;
> +     for (i = 0; i < mux->num_ring_entries; i++) {
> +             ring = mux->ring_entry[i].ring;
> +             last_seq = atomic_read(&ring->fence_drv.last_seq);
> +             seq = READ_ONCE(ring->fence_drv.sync_seq);
> +             if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT && last_seq < seq)
> +                     return 0;
> +             if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && last_seq < seq)
> +                     need_preempt = 1;
> +     }
> +     return need_preempt && !mux->s_resubmit;

>Well what exactly are you trying to do here? Finding if a lower priority ring has unsignaled fences?

Yes, we are peeking the fence_drv data at the time high priority ibs are going to emit. The result is not necessarily accurate because we would check the fence after preemption complete.

Regards,
Christian.

> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mcbp.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mcbp.h
> new file mode 100644
> index 000000000000..0033bcba8d03
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mcbp.h
> @@ -0,0 +1,29 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person
> +obtaining a
> + * copy of this software and associated documentation files (the
> +"Software"),
> + * to deal in the Software without restriction, including without
> +limitation
> + * the rights to use, copy, modify, merge, publish, distribute,
> +sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom
> +the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be
> +included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> +EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> +MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
> +SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> +DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> +OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
> +OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __AMDGPU_MCBP_H__
> +#define __AMDGPU_MCBP_H__
> +
> +int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux); int
> +amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux); #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 4eaf3bd332f7..94362c39b73e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -574,3 +574,15 @@ int amdgpu_ring_init_mqd(struct amdgpu_ring
> *ring)
>
>       return mqd_mgr->init_mqd(adev, ring->mqd_ptr, &prop);
>   }
> +
> +void amdgpu_ring_ib_begin(struct amdgpu_ring *ring) {
> +     if (ring->is_sw_ring)
> +             amdgpu_sw_ring_ib_begin(ring);
> +}
> +
> +void amdgpu_ring_ib_end(struct amdgpu_ring *ring) {
> +     if (ring->is_sw_ring)
> +             amdgpu_sw_ring_ib_end(ring);
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index aeb48cc3666c..36726c28a806 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -312,6 +312,9 @@ struct amdgpu_ring {
>   #define amdgpu_ring_preempt_ib(r) (r)->funcs->preempt_ib(r)
>
>   int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
> +void amdgpu_ring_ib_begin(struct amdgpu_ring *ring); void
> +amdgpu_ring_ib_end(struct amdgpu_ring *ring);
> +
>   void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
>   void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib);
>   void amdgpu_ring_commit(struct amdgpu_ring *ring); diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> index d6b30db27104..70dd725432d4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
> @@ -24,30 +24,59 @@
>   #include <drm/drm_print.h>
>
>   #include "amdgpu_ring_mux.h"
> +#include "amdgpu_mcbp.h"
>   #include "amdgpu_ring.h"
>
>   #define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
>
> +static struct kmem_cache *amdgpu_mux_chunk_slab;
> +
>   static void copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
>                                 u64 s_start, u64 s_end);
> +static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux,
> +bool is_fallback); static void amdgpu_mux_resubmit_fallback(struct
> +timer_list *t);
>
>   int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
>                        unsigned int entry_size)
>   {
>       mux->real_ring = ring;
>       mux->num_ring_entries = 0;
> +
>       mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
>       if (!mux->ring_entry)
>               return -ENOMEM;
>
>       mux->ring_entry_size = entry_size;
> +     mux->s_resubmit = false;
> +
> +     amdgpu_mux_chunk_slab = kmem_cache_create("amdgpu_mux_chunk",
> +                                               sizeof(struct amdgpu_mux_chunk), 0,
> +                                               SLAB_HWCACHE_ALIGN, NULL);
> +     if (!amdgpu_mux_chunk_slab) {
> +             DRM_ERROR("create amdgpu_mux_chunk cache failed\n");
> +             return -ENOMEM;
> +     }
> +
>       spin_lock_init(&mux->lock);
> +     timer_setup(&mux->resubmit_timer, amdgpu_mux_resubmit_fallback, 0);
>
>       return 0;
>   }
>
>   void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
>   {
> +     struct amdgpu_mux_entry *e;
> +     struct amdgpu_mux_chunk *chunk, *chunk2;
> +     int i;
> +
> +     for (i = 0; i < mux->num_ring_entries; i++) {
> +             e = &mux->ring_entry[i];
> +             list_for_each_entry_safe(chunk, chunk2, &e->list, entry) {
> +                     list_del(&chunk->entry);
> +                     kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
> +             }
> +     }
> +     kmem_cache_destroy(amdgpu_mux_chunk_slab);
>       kfree(mux->ring_entry);
>       mux->ring_entry = NULL;
>       mux->num_ring_entries = 0;
> @@ -67,6 +96,7 @@ int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring
>       ring->entry_index = mux->num_ring_entries;
>       e->ring = ring;
>
> +     INIT_LIST_HEAD(&e->list);
>       mux->num_ring_entries += 1;
>       return 0;
>   }
> @@ -82,6 +112,9 @@ void amdgpu_ring_set_wptr_to_mux(struct amdgpu_ring_mux *mux, struct amdgpu_ring
>   {
>       struct amdgpu_mux_entry *e;
>
> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT)
> +             amdgpu_mux_resubmit_chunks(mux, false);
> +
>       e = amdgpu_get_sw_entry(mux, ring);
>       if (!e) {
>               DRM_ERROR("cannot find entry for sw ring\n"); @@ -90,13 +123,19 @@
> void amdgpu_ring_set_wptr_to_mux(struct amdgpu_ring_mux *mux, struct
> amdgpu_ring
>
>       spin_lock(&mux->lock);
>       e->sw_cptr = e->sw_wptr;
> +     if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && e->sw_cptr < mux->wptr_resubmit)
> +             e->sw_cptr = mux->wptr_resubmit;
>       e->sw_wptr = wptr;
>       e->start_ptr_in_hw_ring = mux->real_ring->wptr;
>
> -     copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
> -     e->end_ptr_in_hw_ring = mux->real_ring->wptr;
> -     amdgpu_ring_commit(mux->real_ring);
> -
> +     /* donnot copy the ibs which have been resubmitted*/
> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT || mux->wptr_resubmit < wptr) {
> +             copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
> +             e->end_ptr_in_hw_ring = mux->real_ring->wptr;
> +             amdgpu_ring_commit(mux->real_ring);
> +     } else {
> +             e->end_ptr_in_hw_ring = mux->real_ring->wptr;
> +     }
>       spin_unlock(&mux->lock);
>   }
>
> @@ -159,7 +198,7 @@ u64 amdgpu_ring_get_rptr_from_mux(struct amdgpu_ring_mux *mux, struct amdgpu_rin
>       return e->sw_rptr;
>   }
>
> -/* copy packages on sw ring range[begin, end) */
> +/* copy packages on sw ring range[start, end) */
>   static void copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
>                                 u64 s_start, u64 s_end)
>   {
> @@ -183,3 +222,140 @@ static void copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_rin
>               amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
>       }
>   }
> +
> +void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux) {
> +     mod_timer(&mux->resubmit_timer, jiffies +
> +AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT);
> +}
> +
> +void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct
> +amdgpu_ring *ring) {
> +     struct amdgpu_mux_entry *e;
> +     struct amdgpu_mux_chunk *chunk;
> +
> +     amdgpu_mux_resubmit_chunks(mux, false);
> +
> +     e = amdgpu_get_sw_entry(mux, ring);
> +     if (!e) {
> +             DRM_ERROR("cannot find entry!\n");
> +             return;
> +     }
> +
> +     chunk = kmem_cache_alloc(amdgpu_mux_chunk_slab, GFP_KERNEL);
> +     if (!chunk) {
> +             DRM_ERROR("alloc amdgpu_mux_chunk_slab failed\n");
> +             return;
> +     }
> +
> +     chunk->start = ring->wptr;
> +     list_add_tail(&chunk->entry, &e->list); }
> +
> +static void scan_and_remove_signaled_chunk(struct amdgpu_ring_mux
> +*mux, struct amdgpu_ring *ring) {
> +     uint32_t last_seq, size = 0;
> +     struct amdgpu_mux_entry *e;
> +     struct amdgpu_mux_chunk *chunk, *tmp;
> +
> +     e = amdgpu_get_sw_entry(mux, ring);
> +     if (!e) {
> +             DRM_ERROR("cannot find entry!\n");
> +             return;
> +     }
> +
> +     last_seq = atomic_read(&ring->fence_drv.last_seq);
> +
> +     list_for_each_entry_safe(chunk, tmp, &e->list, entry) {
> +             if (chunk->sync_seq <= last_seq) {
> +                     list_del(&chunk->entry);
> +                     kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
> +             } else {
> +                     size++;
> +             }
> +     }
> +}
> +
> +void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct
> +amdgpu_ring *ring) {
> +     struct amdgpu_mux_entry *e;
> +     struct amdgpu_mux_chunk *chunk;
> +
> +     e = amdgpu_get_sw_entry(mux, ring);
> +     if (!e) {
> +             DRM_ERROR("cannot find entry!\n");
> +             return;
> +     }
> +
> +     chunk = list_last_entry(&e->list, struct amdgpu_mux_chunk, entry);
> +     if (!chunk) {
> +             DRM_ERROR("cannot find chunk!\n");
> +             return;
> +     }
> +
> +     chunk->end = ring->wptr;
> +     chunk->sync_seq = READ_ONCE(ring->fence_drv.sync_seq);
> +
> +     scan_and_remove_signaled_chunk(mux, ring); }
> +
> +static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux,
> +bool is_fallback) {
> +     struct amdgpu_mux_entry *e = NULL;
> +     struct amdgpu_mux_chunk *chunk;
> +     uint32_t seq, last_seq;
> +     int i;
> +
> +     if (is_fallback) {
> +             if (!spin_trylock(&mux->lock)) {
> +                     amdgpu_ring_mux_schedule_resubmit(mux);
> +                     DRM_ERROR("reschedule resubmit\n");
> +                     return;
> +             }
> +     } else {
> +             spin_lock(&mux->lock);
> +     }
> +
> +     /*find low priority entries:*/
> +     if (!mux->s_resubmit) {
> +             spin_unlock(&mux->lock);
> +             return;
> +     }
> +
> +     for (i = 0; i < mux->num_ring_entries; i++) {
> +             if (mux->ring_entry[i].ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
> +                     e = &mux->ring_entry[i];
> +                     break;
> +             }
> +     }
> +
> +     if (!e) {
> +             DRM_ERROR("%s no low priority ring found\n", __func__);
> +             spin_unlock(&mux->lock);
> +             return;
> +     }
> +
> +     last_seq = atomic_read(&e->ring->fence_drv.last_seq);
> +     seq = mux->seq_no_resubmit;
> +     if (last_seq < seq) {
> +             /*resubmit all the fences between (last_seq, seq]*/
> +             list_for_each_entry(chunk, &e->list, entry) {
> +                     if (chunk->sync_seq > last_seq && chunk->sync_seq <= seq) {
> +                             copy_pkt_from_sw_ring(mux, e->ring, chunk->start, chunk->end);
> +                             mux->wptr_resubmit = chunk->end;
> +                             amdgpu_ring_commit(mux->real_ring);
> +                     }
> +             }
> +     }
> +
> +     del_timer(&mux->resubmit_timer);
> +     mux->s_resubmit = false;
> +     spin_unlock(&mux->lock);
> +}
> +
> +static void amdgpu_mux_resubmit_fallback(struct timer_list *t) {
> +     struct amdgpu_ring_mux *mux = from_timer(mux, t, resubmit_timer);
> +
> +     DRM_INFO("calling %s\n", __func__);
> +     amdgpu_mux_resubmit_chunks(mux, true); }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> index e8ee34e6b9a5..f6fc0afa3cc7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.h
> @@ -35,6 +35,7 @@ struct amdgpu_ring;
>    * sw_cptr -- the position of the copy pointer in the sw ring
>    * sw_rptr -- the read pointer in software ring
>    * sw_wptr -- the write pointer in software ring
> + * list -- list head for amdgpu_mux_chunk
>    */
>   struct amdgpu_mux_entry {
>       struct                  amdgpu_ring *ring;
> @@ -43,6 +44,7 @@ struct amdgpu_mux_entry {
>       u64                     sw_cptr;
>       u64                     sw_rptr;
>       u64                     sw_wptr;
> +     struct list_head        list;
>   };
>
>   struct amdgpu_ring_mux {
> @@ -53,6 +55,24 @@ struct amdgpu_ring_mux {
>       unsigned int            ring_entry_size;
>       /*the lock for copy data from different software rings*/
>       spinlock_t              lock;
> +     bool                    s_resubmit;
> +     uint32_t                seq_no_resubmit;
> +     u64                     wptr_resubmit;
> +     struct timer_list       resubmit_timer;
> +};
> +
> +/*
> + * amdgpu_munx_chunk -- save the location of indirect buffer's
> +package on softare rings
> + * entry -- the list entry.
> + * sync_seq -- the fence seqno related with the saved IB.
> + * start -- start location on the software ring.
> + * end -- end location on the software ring.
> + */
> +struct amdgpu_mux_chunk {
> +     struct list_head        entry;
> +     uint32_t                sync_seq;
> +     u64                     start;
> +     u64                     end;
>   };
>
>   int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct
> amdgpu_ring *ring, @@ -63,4 +83,8 @@ void amdgpu_ring_set_wptr_to_mux(struct amdgpu_ring_mux *mux, struct amdgpu_ring
>   u64 amdgpu_ring_get_wptr_from_mux(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring);
>   u64 amdgpu_ring_get_rptr_from_mux(struct amdgpu_ring_mux *mux,
> struct amdgpu_ring *ring);
>
> +void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct
> +amdgpu_ring *ring); void amdgpu_ring_mux_end_ib(struct
> +amdgpu_ring_mux *mux, struct amdgpu_ring *ring); void
> +amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux);
> +
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
> index ec50793aa54d..4809ecf76180 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sw_ring.c
> @@ -26,6 +26,7 @@
>
>   #include "amdgpu_sw_ring.h"
>   #include "amdgpu_ring_mux.h"
> +#include "amdgpu_mcbp.h"
>
>   u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring)
>   {
> @@ -58,3 +59,29 @@ void amdgpu_sw_ring_commit(struct amdgpu_ring *ring)
>       WARN_ON(!ring->is_sw_ring);
>       amdgpu_ring_set_wptr_to_mux(mux, ring, ring->wptr);
>   }
> +
> +void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring) {
> +     struct amdgpu_device *adev = ring->adev;
> +     struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +     WARN_ON(!ring->is_sw_ring);
> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT) {
> +             if (amdgpu_mcbp_scan(mux) > 0)
> +                     amdgpu_mcbp_trigger_preempt(mux);
> +             return;
> +     }
> +
> +     amdgpu_ring_mux_start_ib(mux, ring); }
> +
> +void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring) {
> +     struct amdgpu_device *adev = ring->adev;
> +     struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
> +
> +     WARN_ON(!ring->is_sw_ring);
> +     if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
> +             return;
> +     amdgpu_ring_mux_end_ib(mux, ring);
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 9596c22fded6..b7e94553f4fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -601,6 +601,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>       if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>               return 0;
>
> +     amdgpu_ring_ib_begin(ring);
>       if (ring->funcs->init_cond_exec)
>               patch_offset = amdgpu_ring_init_cond_exec(ring);
>
> @@ -661,6 +662,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>               amdgpu_ring_emit_switch_buffer(ring);
>               amdgpu_ring_emit_switch_buffer(ring);
>       }
> +     amdgpu_ring_ib_end(ring);
>       return 0;
>   }
>



More information about the amd-gfx mailing list