<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii">
</head>
<body>
<p style="font-family:Arial;font-size:10pt;color:#0078D7;margin:15pt;" align="Left">
[AMD Official Use Only - Internal Distribution Only]<br>
</p>
<br>
<div>
<div data-ogsc="" style="">
<meta content="text/html; charset=us-ascii" data-ogsc="" style="">
</div>
<div dir="auto" style="color: rgb(33, 33, 33); background-color: rgb(255, 255, 255); text-align: left;">
Thanks Christian. I will try to send an updated patch soon. </div>
<div id="ms-outlook-mobile-signature" data-ogsc="" style="text-align: left;" dir="auto">
<div><br>
</div>
Get <a href="https://aka.ms/ghei36">Outlook for Android</a></div>
<div id="id-3a39905a-0ef3-4e35-9b95-a5c89c0aa02a" class="ms-outlook-mobile-reference-message" data-ogsc="" style="">
<div style="font-family: sans-serif; font-size: 12pt; color: rgb(0, 0, 0);"><br>
</div>
<hr tabindex="-1" style="display:inline-block; width:98%">
<div id="divRplyFwdMsg"><strong>From:</strong> Koenig, Christian &lt;Christian.Koenig@amd.com&gt;<br>
<strong>Sent:</strong> Monday, February 24, 2020, 18:06<br>
<strong>To:</strong> Nirmoy Das<br>
<strong>Cc:</strong> amd-gfx@lists.freedesktop.org; Deucher, Alexander; Liu, Monk; Li, Dennis; Das, Nirmoy<br>
<strong>Subject:</strong> Re: [RFC PATCH 1/1] drm/amdgpu: wait for sched to become ready on job submit<br>
</div>
<br>
<meta content="text/html; charset=utf-8">
<div dir="auto">
<div>Hi Nirmoy,<br>
<div class="gmail_extra"><br>
<div class="gmail_quote">Am 24.02.2020 17:48 schrieb Nirmoy Das &lt;nirmoy.aiemd@gmail.com&gt;:<br type="attribution">
<blockquote class="quote" style="margin:0 0 0 .8ex; border-left:1px #ccc solid; padding-left:1ex">
<div><font size="2"><span style="font-size:11pt">
<div>On reset, amdgpu can set a drm sched's ready status to false temporarily. drm job<br>
init will fail if all of the drm scheds are not ready for a HW IP. This patch tries to make<br>
kernel's internal drm job submit handle, amdgpu_job_submit() a bit more fault tolerant.<br>
</div>
</span></font></div>
</blockquote>
</div>
</div>
</div>
<div dir="auto"><br>
</div>
<div dir="auto">I don't think that this approach makes sense. Since it is a front end property we should rather stop setting the scheduler ready status to false during reset.</div>
<div dir="auto"><br>
</div>
<div dir="auto">Instead we should only set it to false when the ring/IB test fails and we can't bring the ring back to life again.</div>
<div dir="auto"><br>
</div>
<div dir="auto">Christian.</div>
<div dir="auto"><br>
</div>
<div dir="auto">
<div class="gmail_extra">
<div class="gmail_quote">
<blockquote class="quote" style="margin:0 0 0 .8ex; border-left:1px #ccc solid; padding-left:1ex">
<div><font size="2"><span style="font-size:11pt">
<div><br>
Signed-off-by: Nirmoy Das &lt;nirmoy.das@amd.com&gt;<br>
---<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c     | 35 +++++++++++++++++++--<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h     |  5 +--<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c     |  6 ++--<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c     |  2 +-<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c     |  2 +-<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c |  2 +-<br>
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c      |  2 +-<br>
 7 files changed, 43 insertions(+), 11 deletions(-)<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c<br>
index d42be880a236..0745df80112f 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c<br>
@@ -139,7 +139,38 @@ void amdgpu_job_free(struct amdgpu_job *job)<br>
         kfree(job);<br>
 }<br>
 <br>
-int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,<br>
+static int amdgpu_job_try_init(struct amdgpu_device *adev,<br>
+                              struct drm_sched_job *base,<br>
+                              struct drm_sched_entity *entity,<br>
+                              void *owner)<br>
+{<br>
+       int r, i;<br>
+<br>
+       r = drm_sched_job_init(base, entity, owner);<br>
+       if (r == -ENOENT) {<br>
+               /* retry till we come out of reset phase */<br>
+               while (!mutex_trylock(&adev->lock_reset))<br>
+                       msleep(10);<br>
+               /* retry for a second for the sched to get ready*/<br>
+               for (i = 0; i < 100; i++) {<br>
+                       msleep(10);<br>
+                       r = drm_sched_job_init(base, entity, owner);<br>
+                       if (r == -ENOENT)<br>
+                               continue;<br>
+               }<br>
+<br>
+               mutex_unlock(&adev->lock_reset);<br>
+               /* If after all these we failed to initialize a job<br>
+                * it means the IP is unrecoverable */<br>
+               if (r == -ENOENT)<br>
+                       return -ENODEV;<br>
+       }<br>
+<br>
+       return r;<br>
+}<br>
+<br>
+int amdgpu_job_submit(struct amdgpu_device *adev,struct amdgpu_job *job,<br>
+                     struct drm_sched_entity *entity,<br>
                       void *owner, struct dma_fence **f)<br>
 {<br>
         enum drm_sched_priority priority;<br>
@@ -149,7 +180,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,<br>
         if (!f)<br>
                 return -EINVAL;<br>
 <br>
-       r = drm_sched_job_init(&job->base, entity, owner);<br>
+       r = amdgpu_job_try_init(adev, &job->base, entity, owner);<br>
         if (r)<br>
                 return r;<br>
 <br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h<br>
index 2e2110dddb76..fed87e96cacc 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h<br>
@@ -70,8 +70,9 @@ int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,<br>
 <br>
 void amdgpu_job_free_resources(struct amdgpu_job *job);<br>
 void amdgpu_job_free(struct amdgpu_job *job);<br>
-int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,<br>
-                     void *owner, struct dma_fence **f);<br>
+int amdgpu_job_submit(struct amdgpu_device *adev, struct amdgpu_job *job,<br>
+                     struct drm_sched_entity *entity, void *owner,<br>
+                     struct dma_fence **f);<br>
 int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,<br>
                              struct dma_fence **fence);<br>
 <br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c<br>
index 660867cf2597..adfde07eb75f 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c<br>
@@ -2066,7 +2066,7 @@ static int amdgpu_map_buffer(struct ttm_buffer_object *bo,<br>
         if (r)<br>
                 goto error_free;<br>
 <br>
-       r = amdgpu_job_submit(job, &adev->mman.entity,<br>
+       r = amdgpu_job_submit(adev, job, &adev->mman.entity,<br>
                               AMDGPU_FENCE_OWNER_UNDEFINED, &fence);<br>
         if (r)<br>
                 goto error_free;<br>
@@ -2137,7 +2137,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,<br>
         if (direct_submit)<br>
                 r = amdgpu_job_submit_direct(job, ring, fence);<br>
         else<br>
-               r = amdgpu_job_submit(job, &adev->mman.entity,<br>
+               r = amdgpu_job_submit(adev, job, &adev->mman.entity,<br>
                                       AMDGPU_FENCE_OWNER_UNDEFINED, fence);<br>
         if (r)<br>
                 goto error_free;<br>
@@ -2231,7 +2231,7 @@ int amdgpu_fill_buffer(struct amdgpu_bo *bo,<br>
 <br>
         amdgpu_ring_pad_ib(ring, &job->ibs[0]);<br>
         WARN_ON(job->ibs[0].length_dw > num_dw);<br>
-       r = amdgpu_job_submit(job, &adev->mman.entity,<br>
+       r = amdgpu_job_submit(adev, job, &adev->mman.entity,<br>
                               AMDGPU_FENCE_OWNER_UNDEFINED, fence);<br>
         if (r)<br>
                 goto error_free;<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c<br>
index 5fd32ad1c575..8ff97b24914e 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c<br>
@@ -1104,7 +1104,7 @@ static int amdgpu_uvd_send_msg(struct amdgpu_ring *ring, struct amdgpu_bo *bo,<br>
                 if (r)<br>
                         goto err_free;<br>
 <br>
-               r = amdgpu_job_submit(job, &adev->uvd.entity,<br>
+               r = amdgpu_job_submit(adev, job, &adev->uvd.entity,<br>
                                       AMDGPU_FENCE_OWNER_UNDEFINED, &f);<br>
                 if (r)<br>
                         goto err_free;<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c<br>
index 59ddba137946..e721d3367783 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c<br>
@@ -554,7 +554,7 @@ static int amdgpu_vce_get_destroy_msg(struct amdgpu_ring *ring, uint32_t handle,<br>
         if (direct)<br>
                 r = amdgpu_job_submit_direct(job, ring, &f);<br>
         else<br>
-               r = amdgpu_job_submit(job, &ring->adev->vce.entity,<br>
+               r = amdgpu_job_submit(ring->adev, job, &ring->adev->vce.entity,<br>
                                       AMDGPU_FENCE_OWNER_UNDEFINED, &f);<br>
         if (r)<br>
                 goto err;<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c<br>
index 4cc7881f438c..b536962c22d9 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c<br>
@@ -100,7 +100,7 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,<br>
         WARN_ON(ib->length_dw == 0);<br>
         amdgpu_ring_pad_ib(ring, ib);<br>
         WARN_ON(ib->length_dw > p->num_dw_left);<br>
-       r = amdgpu_job_submit(p->job, entity, AMDGPU_FENCE_OWNER_VM, &f);<br>
+       r = amdgpu_job_submit(p->adev, p->job, entity, AMDGPU_FENCE_OWNER_VM, &f);<br>
         if (r)<br>
                 goto error;<br>
 <br>
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c<br>
index 9775eca6fe43..a4aaa2a1f878 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c<br>
@@ -377,7 +377,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,<br>
         job->vm_needs_flush = true;<br>
         job->ibs->ptr[job->ibs->length_dw++] = ring->funcs->nop;<br>
         amdgpu_ring_pad_ib(ring, &job->ibs[0]);<br>
-       r = amdgpu_job_submit(job, &adev->mman.entity,<br>
+       r = amdgpu_job_submit(adev, job, &adev->mman.entity,<br>
                               AMDGPU_FENCE_OWNER_UNDEFINED, &fence);<br>
         if (r)<br>
                 goto error_submit;<br>
-- <br>
2.25.0<br>
<br>
</div>
</span></font></div>
</blockquote>
</div>
<br>
</div>
</div>
</div>
<br>
</div>
</div>
</body>
</html>