[PATCH 1/2] drm/amdgpu: modify mcbp implement for gfx9(v2)

Thu Aug 11 03:18:05 UTC 2022

[AMD Official Use Only - General]

Hi Christian,

Thank you for the reply, I will update the patch to fix style issue.

The patch has several changes
1. change the unmap package for mcbp which is not correct in gfx_v9_0_kiq_unmap_queues.
2. change the emitted ce/de meta data used for preempted ibs
3. add the function gfx_v9_0_ring_preempt_ib used for debugfs case.

Though the part 3 may be removed in the future.  Those functions of 1 and 2 could be still used by some projects such as virtualization etc.

Thanks,
Jiadong

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken at gmail.com>
Sent: Thursday, August 11, 2022 12:06 AM
To: Zhu, Jiadong <Jiadong.Zhu at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Huang, Ray <Ray.Huang at amd.com>; Liu, Aaron <Aaron.Liu at amd.com>
Subject: Re: [PATCH 1/2] drm/amdgpu: modify mcbp implement for gfx9(v2)

[CAUTION: External Email]

Hi, Jiadong,

first of all your patches have major style issues. Please use the checkpatch.pl script before sending those out.

Apart from that as discussed on our call on Monday MCBP is not something we will implement on Linux. So we will probably remove the existing debugfs test sooner or later.

Regards,
Christian.

Am 09.08.22 um 11:21 schrieb Zhu, Jiadong:
> [AMD Official Use Only - General]
>
> Hi,
>
> This patch is to correct the mcbp package for gfx9, which is the basic function used for debugfs.
> There are no logic about when to trigger mcbp.
> Shall we get this reviewed?
>
> Thanks,
> Jiadong
>
> -----Original Message-----
> From: Zhu, Jiadong <Jiadong.Zhu at amd.com>
> Sent: Tuesday, August 9, 2022 5:15 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Liu, Aaron <Aaron.Liu at amd.com>; Huang, Ray <Ray.Huang at amd.com>;
> Zhu, Jiadong <Jiadong.Zhu at amd.com>
> Subject: [PATCH 1/2] drm/amdgpu: modify mcbp implement for gfx9(v2)
>
> From: "Jiadong.Zhu" <Jiadong.Zhu at amd.com>
>
> 1. Use unmap_queue package to trigger preemption on gfx9
>     Add trailing fence to track the preemption done.
> 2. Modify emit_ce_meta emit_de_meta functions
>     for the resumed ibs.
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |   1 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 159 ++++++++++++++++++++---
>   drivers/gpu/drm/amd/amdgpu/soc15d.h      |   2 +
>   3 files changed, 141 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 82c178a9033a..ca626f0ad7b1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -59,6 +59,7 @@ enum amdgpu_ring_priority_level {
>   #define AMDGPU_FENCE_FLAG_64BIT         (1 << 0)
>   #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
>   #define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
> +#define AMDGPU_FENCE_FLAG_EXEC          (1 << 3)
>
>   #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring,
> sched)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 5332899642dc..0b7cb4cf13c4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -751,7 +751,7 @@ static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device *adev);  static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
>                                  struct amdgpu_cu_info *cu_info);
>   static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device
> *adev); -static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring
> *ring);
> +static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool
> +resume);
>   static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);  static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
>                                            void *ras_error_status); @@
> -824,9 +824,10 @@ static void gfx_v9_0_kiq_unmap_queues(struct amdgpu_ring *kiq_ring,
>
> PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
>
>          if (action == PREEMPT_QUEUES_NO_UNMAP) {
> -               amdgpu_ring_write(kiq_ring, lower_32_bits(gpu_addr));
> -               amdgpu_ring_write(kiq_ring, upper_32_bits(gpu_addr));
> -               amdgpu_ring_write(kiq_ring, seq);
> +               amdgpu_ring_write(kiq_ring, lower_32_bits(ring->wptr & ring->buf_mask));
> +               amdgpu_ring_write(kiq_ring, 0);
> +               amdgpu_ring_write(kiq_ring, 0);
> +
>          } else {
>                  amdgpu_ring_write(kiq_ring, 0);
>                  amdgpu_ring_write(kiq_ring, 0); @@ -5446,11 +5447,15
> @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>
>          control |= ib->length_dw | (vmid << 24);
>
> -       if (amdgpu_sriov_vf(ring->adev) && (ib->flags & AMDGPU_IB_FLAG_PREEMPT)) {
> +       if ((amdgpu_sriov_vf(ring->adev) || amdgpu_mcbp) && (ib->flags
> +&
> +AMDGPU_IB_FLAG_PREEMPT)) {
>                  control |= INDIRECT_BUFFER_PRE_ENB(1);
>
> +               if (flags & AMDGPU_IB_PREEMPTED)
> +                       control |= INDIRECT_BUFFER_PRE_RESUME(1);
> +
>                  if (!(ib->flags & AMDGPU_IB_FLAG_CE) && vmid)
> -                       gfx_v9_0_ring_emit_de_meta(ring);
> +                       gfx_v9_0_ring_emit_de_meta(ring,
> +                                (!amdgpu_sriov_vf(ring->adev) && flags & AMDGPU_IB_PREEMPTED) ?
> +true : false);
>          }
>
>          amdgpu_ring_write(ring, header); @@ -5505,6 +5510,7 @@ static
> void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
>          bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
>          bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
>          bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
> +       bool exec = flags & AMDGPU_FENCE_FLAG_EXEC;
>
>          /* RELEASE_MEM - flush caches, send int */
>          amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6)); @@ -5515,6 +5521,7 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
>                                                 EOP_TC_WB_ACTION_EN |
>                                                 EOP_TC_MD_ACTION_EN)) |
>
> EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
> +                                (exec ? EOP_EXEC : 0x0) |
>                                   EVENT_INDEX(5)));
>          amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) |
> INT_SEL(int_sel ? 2 : 0));
>
> @@ -5620,33 +5627,135 @@ static void gfx_v9_ring_emit_sb(struct amdgpu_ring *ring)
>          amdgpu_ring_write(ring, 0);
>   }
>
> -static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring)
> +static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring, bool
> +resume)
>   {
> +       struct amdgpu_device *adev = ring->adev;
>          struct v9_ce_ib_state ce_payload = {0};
> -       uint64_t csa_addr;
> +       uint64_t offset, ce_payload_gpu_addr;
> +       void *ce_payload_cpu_addr;
>          int cnt;
>
>          cnt = (sizeof(ce_payload) >> 2) + 4 - 2;
> -       csa_addr = amdgpu_csa_vaddr(ring->adev);
> +
> +       if (ring->is_mes_queue) {
> +               offset = offsetof(struct amdgpu_mes_ctx_meta_data,
> +                                 gfx[0].gfx_meta_data) +
> +                       offsetof(struct v9_gfx_meta_data, ce_payload);
> +               ce_payload_gpu_addr =
> +                       amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
> +               ce_payload_cpu_addr =
> +                       amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
> +       } else {
> +               offset = offsetof(struct v9_gfx_meta_data, ce_payload);
> +               ce_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
> +               ce_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
> +       }
>
>          amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, cnt));
>          amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(2) |
>                                   WRITE_DATA_DST_SEL(8) |
>                                   WR_CONFIRM) |
>                                   WRITE_DATA_CACHE_POLICY(0));
> -       amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload)));
> -       amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload)));
> -       amdgpu_ring_write_multiple(ring, (void *)&ce_payload, sizeof(ce_payload) >> 2);
> +       amdgpu_ring_write(ring, lower_32_bits(ce_payload_gpu_addr));
> +       amdgpu_ring_write(ring, upper_32_bits(ce_payload_gpu_addr));
> +
> +       if (resume)
> +               amdgpu_ring_write_multiple(ring, ce_payload_cpu_addr,
> +                                          sizeof(ce_payload) >> 2);
> +       else
> +               amdgpu_ring_write_multiple(ring, (void *)&ce_payload,
> +                                          sizeof(ce_payload) >> 2); }
> +
> +static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring) {
> +       int i, r = 0;
> +       struct amdgpu_device *adev = ring->adev;
> +       struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> +       struct amdgpu_ring *kiq_ring = &kiq->ring;
> +       unsigned long flags;
> +
> +       if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
> +               return -EINVAL;
> +
> +       spin_lock_irqsave(&kiq->ring_lock, flags);
> +
> +       if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
> +               spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +               return -ENOMEM;
> +       }
> +
> +       /* assert preemption condition */
> +       amdgpu_ring_set_preempt_cond_exec(ring, false);
> +
> +       ring->trail_seq += 1;
> +       amdgpu_ring_alloc(ring, 13);
> +       gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
> +                                 ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
> +       /*reset the CP_VMID_PREEMPT after trailing fence*/
> +       amdgpu_ring_emit_wreg(ring,
> +                                 SOC15_REG_OFFSET(GC, 0, mmCP_VMID_PREEMPT),
> +                                 0x0);
> +
> +       /* assert IB preemption, emit the trailing fence */
> +       kiq->pmf->kiq_unmap_queues(kiq_ring, ring, PREEMPT_QUEUES_NO_UNMAP,
> +                                  ring->trail_fence_gpu_addr,
> +                                  ring->trail_seq);
> +
> +       amdgpu_ring_commit(kiq_ring);
> +       spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +
> +       /* poll the trailing fence */
> +       for (i = 0; i < adev->usec_timeout; i++) {
> +               if (ring->trail_seq ==
> +                   le32_to_cpu(*(ring->trail_fence_cpu_addr)))
> +                       break;
> +               udelay(1);
> +       }
> +
> +       if (i >= adev->usec_timeout) {
> +               r = -EINVAL;
> +               DRM_ERROR("ring %d failed to preempt ib\n", ring->idx);
> +       }
> +
> +       amdgpu_ring_commit(ring);
> +
> +       /* deassert preemption condition */
> +       amdgpu_ring_set_preempt_cond_exec(ring, true);
> +       return r;
>   }
>
> -static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
> +static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool
> +resume)
>   {
> +       struct amdgpu_device *adev = ring->adev;
>          struct v9_de_ib_state de_payload = {0};
> -       uint64_t csa_addr, gds_addr;
> +       uint64_t offset, gds_addr, de_payload_gpu_addr;
> +       void *de_payload_cpu_addr;
>          int cnt;
>
> -       csa_addr = amdgpu_csa_vaddr(ring->adev);
> -       gds_addr = csa_addr + 4096;
> +       if (ring->is_mes_queue) {
> +               offset = offsetof(struct amdgpu_mes_ctx_meta_data,
> +                                 gfx[0].gfx_meta_data) +
> +                       offsetof(struct v9_gfx_meta_data, de_payload);
> +               de_payload_gpu_addr =
> +                       amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
> +               de_payload_cpu_addr =
> +                       amdgpu_mes_ctx_get_offs_cpu_addr(ring,
> + offset);
> +
> +               offset = offsetof(struct amdgpu_mes_ctx_meta_data,
> +                                 gfx[0].gds_backup) +
> +                       offsetof(struct v9_gfx_meta_data, de_payload);
> +               gds_addr = amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
> +       } else {
> +               offset = offsetof(struct v9_gfx_meta_data, de_payload);
> +               de_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
> +               de_payload_cpu_addr = adev->virt.csa_cpu_addr +
> + offset;
> +
> +               gds_addr = ALIGN(amdgpu_csa_vaddr(ring->adev) +
> +                                AMDGPU_CSA_SIZE - adev->gds.gds_size,
> +                                PAGE_SIZE);
> +       }
> +
>          de_payload.gds_backup_addrlo = lower_32_bits(gds_addr);
>          de_payload.gds_backup_addrhi = upper_32_bits(gds_addr);
>
> @@ -5656,9 +5765,15 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
>                                   WRITE_DATA_DST_SEL(8) |
>                                   WR_CONFIRM) |
>                                   WRITE_DATA_CACHE_POLICY(0));
> -       amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload)));
> -       amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload)));
> -       amdgpu_ring_write_multiple(ring, (void *)&de_payload, sizeof(de_payload) >> 2);
> +       amdgpu_ring_write(ring, lower_32_bits(de_payload_gpu_addr));
> +       amdgpu_ring_write(ring, upper_32_bits(de_payload_gpu_addr));
> +
> +       if (resume)
> +               amdgpu_ring_write_multiple(ring, de_payload_cpu_addr,
> +                                          sizeof(de_payload) >> 2);
> +       else
> +               amdgpu_ring_write_multiple(ring, (void *)&de_payload,
> +                                          sizeof(de_payload) >> 2);
>   }
>
>   static void gfx_v9_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start, @@ -5674,8 +5789,9 @@ static void gfx_v9_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)  {
>          uint32_t dw2 = 0;
>
> -       if (amdgpu_sriov_vf(ring->adev))
> -               gfx_v9_0_ring_emit_ce_meta(ring);
> +       if (amdgpu_sriov_vf(ring->adev) || amdgpu_mcbp)
> +               gfx_v9_0_ring_emit_ce_meta(ring,
> +                                   (!amdgpu_sriov_vf(ring->adev) && flags & AMDGPU_IB_PREEMPTED) ?
> +true : false);
>
>          dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>          if (flags & AMDGPU_HAVE_CTX_SWITCH) { @@ -7024,6 +7140,7 @@
> static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>          .emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
>          .init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>          .patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
> +       .preempt_ib = gfx_v9_0_ring_preempt_ib,
>          .emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
>          .emit_wreg = gfx_v9_0_ring_emit_wreg,
>          .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait, diff --git
> a/drivers/gpu/drm/amd/amdgpu/soc15d.h
> b/drivers/gpu/drm/amd/amdgpu/soc15d.h
> index 799925d22fc8..614e9f8467fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15d.h
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h
> @@ -162,6 +162,7 @@
>                   * 2 - Bypass
>                   */
>   #define     INDIRECT_BUFFER_PRE_ENB(x)          ((x) << 21)
> +#define     INDIRECT_BUFFER_PRE_RESUME(x)           ((x) << 30)
>   #define        PACKET3_COPY_DATA                               0x40
>   #define        PACKET3_PFP_SYNC_ME                             0x42
>   #define        PACKET3_COND_WRITE                              0x45
> @@ -184,6 +185,7 @@
>   #define                EOP_TC_ACTION_EN                        (1 << 17) /* L2 */
>   #define                EOP_TC_NC_ACTION_EN                     (1 << 19)
>   #define                EOP_TC_MD_ACTION_EN                     (1 << 21) /* L2 metadata */
> +#define                EOP_EXEC                                        (1 << 28) /* For Trailing Fence */
>
>   #define                DATA_SEL(x)                             ((x) << 29)
>                  /* 0 - discard
> --
> 2.25.1
>