<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>
</head>
<body dir="ltr">
<p style="font-family:Arial;font-size:11pt;color:#0078D7;margin:5pt;" align="Left">
[AMD Official Use Only - Internal Distribution Only]<br>
</p>
<br>
<div>
<div style="font-family: Calibri, Arial, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
Reviewed-by: Sonny Jiang &lt;sonny.jiang@amd.com&gt;<br>
</div>
<div id="appendonsend"></div>
<hr style="display:inline-block;width:98%" tabindex="-1">
<div id="divRplyFwdMsg" dir="ltr"><font face="Calibri, sans-serif" style="font-size:11pt" color="#000000"><b>From:</b> Christian König &lt;ckoenig.leichtzumerken@gmail.com&gt;<br>
<b>Sent:</b> Friday, March 5, 2021 7:51 AM<br>
<b>To:</b> dri-devel@lists.freedesktop.org &lt;dri-devel@lists.freedesktop.org&gt;; amd-gfx@lists.freedesktop.org &lt;amd-gfx@lists.freedesktop.org&gt;<br>
<b>Cc:</b> Liu, Leo &lt;Leo.Liu@amd.com&gt;; Jiang, Sonny &lt;Sonny.Jiang@amd.com&gt;<br>
<b>Subject:</b> [PATCH 2/2] drm/amdgpu: load balance VCN3 decode as well v8</font>
<div> </div>
</div>
<div class="BodyFragment"><font size="2"><span style="font-size:11pt;">
<div class="PlainText">Add VCN3 IB parsing to figure out to which instance we can send the<br>
stream for decode.<br>
<br>
v2: remove VCN instance limit as well, fix amdgpu_cs_find_mapping,<br>
    check supported formats instead of unsupported.<br>
v3: fix typo and error handling<br>
v4: make sure the message BO is CPU accessible<br>
v5: fix addr calculation once more<br>
v6: only check message buffers<br>
v7: fix constant and use defines<br>
v8: fix create msg calculation<br>
<br>
Signed-off-by: Christian König &lt;christian.koenig@amd.com&gt;<br>
---<br>
 drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c | 132 +++++++++++++++++++++++++-<br>
 1 file changed, 130 insertions(+), 2 deletions(-)<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c<br>
index b33f513fd2ac..77932003b4c1 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c<br>
@@ -50,6 +50,9 @@<br>
 #define VCN_INSTANCES_SIENNA_CICHLID                            2<br>
 #define DEC_SW_RING_ENABLED                                     FALSE<br>
 <br>
+#define RDECODE_MSG_CREATE                                     0x00000000<br>
+#define RDECODE_MESSAGE_CREATE                                 0x00000001<br>
+<br>
 static int amdgpu_ih_clientid_vcns[] = {<br>
         SOC15_IH_CLIENTID_VCN,<br>
         SOC15_IH_CLIENTID_VCN1<br>
@@ -208,8 +211,6 @@ static int vcn_v3_0_sw_init(void *handle)<br>
                 } else {<br>
                         ring->doorbell_index = (adev->doorbell_index.vcn.vcn_ring0_1 << 1) + 8 * i;<br>
                 }<br>
-               if (adev->asic_type == CHIP_SIENNA_CICHLID && i != 0)<br>
-                       ring->no_scheduler = true;<br>
                 sprintf(ring->name, "vcn_dec_%d", i);<br>
                 r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst[i].irq, 0,<br>
                                      AMDGPU_RING_PRIO_DEFAULT,<br>
@@ -1825,6 +1826,132 @@ static const struct amdgpu_ring_funcs vcn_v3_0_dec_sw_ring_vm_funcs = {<br>
         .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper,<br>
 };<br>
 <br>
+static int vcn_v3_0_limit_sched(struct amdgpu_cs_parser *p)<br>
+{<br>
+       struct drm_gpu_scheduler **scheds;<br>
+<br>
+       /* The create msg must be in the first IB submitted */<br>
+       if (atomic_read(&p->entity->fence_seq))<br>
+               return -EINVAL;<br>
+<br>
+       scheds = p->adev->gpu_sched[AMDGPU_HW_IP_VCN_DEC]<br>
+               [AMDGPU_RING_PRIO_DEFAULT].sched;<br>
+       drm_sched_entity_modify_sched(p->entity, scheds, 1);<br>
+       return 0;<br>
+}<br>
+<br>
+static int vcn_v3_0_dec_msg(struct amdgpu_cs_parser *p, uint64_t addr)<br>
+{<br>
+       struct ttm_operation_ctx ctx = { false, false };<br>
+       struct amdgpu_bo_va_mapping *map;<br>
+       uint32_t *msg, num_buffers;<br>
+       struct amdgpu_bo *bo;<br>
+       uint64_t start, end;<br>
+       unsigned int i;<br>
+       void * ptr;<br>
+       int r;<br>
+<br>
+       addr &= AMDGPU_GMC_HOLE_MASK;<br>
+       r = amdgpu_cs_find_mapping(p, addr, &bo, &map);<br>
+       if (r) {<br>
+               DRM_ERROR("Can't find BO for addr 0x%08Lx\n", addr);<br>
+               return r;<br>
+       }<br>
+<br>
+       start = map->start * AMDGPU_GPU_PAGE_SIZE;<br>
+       end = (map->last + 1) * AMDGPU_GPU_PAGE_SIZE;<br>
+       if (addr & 0x7) {<br>
+               DRM_ERROR("VCN messages must be 8 byte aligned!\n");<br>
+               return -EINVAL;<br>
+       }<br>
+<br>
+       bo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;<br>
+       amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);<br>
+       r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);<br>
+       if (r) {<br>
+               DRM_ERROR("Failed validating the VCN message BO (%d)!\n", r);<br>
+               return r;<br>
+       }<br>
+<br>
+       r = amdgpu_bo_kmap(bo, &ptr);<br>
+       if (r) {<br>
+               DRM_ERROR("Failed mapping the VCN message (%d)!\n", r);<br>
+               return r;<br>
+       }<br>
+<br>
+       msg = ptr + addr - start;<br>
+<br>
+       /* Check length */<br>
+       if (msg[1] > end - addr) {<br>
+               r = -EINVAL;<br>
+               goto out;<br>
+       }<br>
+<br>
+       if (msg[3] != RDECODE_MSG_CREATE)<br>
+               goto out;<br>
+<br>
+       num_buffers = msg[2];<br>
+       for (i = 0, msg = &msg[6]; i < num_buffers; ++i, msg += 4) {<br>
+               uint32_t offset, size, *create;<br>
+<br>
+               if (msg[0] != RDECODE_MESSAGE_CREATE)<br>
+                       continue;<br>
+<br>
+               offset = msg[1];<br>
+               size = msg[2];<br>
+<br>
+               if (offset + size > end) {<br>
+                       r = -EINVAL;<br>
+                       goto out;<br>
+               }<br>
+<br>
+               create = ptr + addr + offset - start;<br>
+<br>
+               /* H264, HEVC and VP9 can run on any instance */<br>
+               if (create[0] == 0x7 || create[0] == 0x10 || create[0] == 0x11)<br>
+                       continue;<br>
+<br>
+               r = vcn_v3_0_limit_sched(p);<br>
+               if (r)<br>
+                       goto out;<br>
+       }<br>
+<br>
+out:<br>
+       amdgpu_bo_kunmap(bo);<br>
+       return r;<br>
+}<br>
+<br>
+static int vcn_v3_0_ring_patch_cs_in_place(struct amdgpu_cs_parser *p,<br>
+                                          uint32_t ib_idx)<br>
+{<br>
+       struct amdgpu_ring *ring = to_amdgpu_ring(p->entity->rq->sched);<br>
+       struct amdgpu_ib *ib = &p->job->ibs[ib_idx];<br>
+       uint32_t msg_lo = 0, msg_hi = 0;<br>
+       unsigned i;<br>
+       int r;<br>
+<br>
+       /* The first instance can decode anything */<br>
+       if (!ring->me)<br>
+               return 0;<br>
+<br>
+       for (i = 0; i < ib->length_dw; i += 2) {<br>
+               uint32_t reg = amdgpu_get_ib_value(p, ib_idx, i);<br>
+               uint32_t val = amdgpu_get_ib_value(p, ib_idx, i + 1);<br>
+<br>
+               if (reg == PACKET0(p->adev->vcn.internal.data0, 0)) {<br>
+                       msg_lo = val;<br>
+               } else if (reg == PACKET0(p->adev->vcn.internal.data1, 0)) {<br>
+                       msg_hi = val;<br>
+               } else if (reg == PACKET0(p->adev->vcn.internal.cmd, 0) &&<br>
+                          val == 0) {<br>
+                       r = vcn_v3_0_dec_msg(p, ((u64)msg_hi) << 32 | msg_lo);<br>
+                       if (r)<br>
+                               return r;<br>
+               }<br>
+       }<br>
+       return 0;<br>
+}<br>
+<br>
 static const struct amdgpu_ring_funcs vcn_v3_0_dec_ring_vm_funcs = {<br>
         .type = AMDGPU_RING_TYPE_VCN_DEC,<br>
         .align_mask = 0xf,<br>
@@ -1832,6 +1959,7 @@ static const struct amdgpu_ring_funcs vcn_v3_0_dec_ring_vm_funcs = {<br>
         .get_rptr = vcn_v3_0_dec_ring_get_rptr,<br>
         .get_wptr = vcn_v3_0_dec_ring_get_wptr,<br>
         .set_wptr = vcn_v3_0_dec_ring_set_wptr,<br>
+       .patch_cs_in_place = vcn_v3_0_ring_patch_cs_in_place,<br>
         .emit_frame_size =<br>
                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 6 +<br>
                 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 8 +<br>
-- <br>
2.25.1<br>
<br>
</div>
</span></font></div>
</div>
</body>
</html>