<!DOCTYPE html><html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    Hi Shikang,<br>
    <br>
    please completely drop the AMDGPU_PENDING_JOB_TIMEOUT workaround.<br>
    <br>
    This is unnecessary when you use amdgpu_fence_count_emitted()
    instead of looking at the jobs.<br>
    <br>
    That's one of the reasons why looking at the jobs is such a really,
    really bad idea in the first place.<br>
    <br>
    Regards,<br>
    Christian.<br>
    <br>
    <div class="moz-cite-prefix">Am 19.11.24 um 09:47 schrieb Fan,
      Shikang:<br>
    </div>
    <blockquote type="cite" cite="mid:SA1PR12MB734309748AB9340AFC555028EB202@SA1PR12MB7343.namprd12.prod.outlook.com">
      
      <style type="text/css" style="display:none;">P {margin-top:0;margin-bottom:0;}</style>
      <p style="font-family:Calibri;font-size:10pt;color:#0000FF;margin:5pt;font-style:normal;font-weight:normal;text-decoration:none;" align="Left">
        [AMD Official Use Only - AMD Internal Distribution Only]<br>
      </p>
      <br>
      <div>
        <div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 11pt; color: rgb(0, 0, 0);">
          +<a href="mailto:Christian.Koenig@amd.com" id="OWAAM957748" class="tWKOu mention ms-bgc-nlr ms-fcl-b" moz-do-not-send="true">@Koenig, Christian</a><br>
          <br>
          Hi Christian,<br>
          <br>
          Could you please help take a look at this patch? Compared to
          the previous patch, we now use amdgpu_fence_count_emitted to
          check for unfinished jobs. This function is currently only
          used by mailbox_flr_work in the SRIOV case, so I believe the
          modification to this function will not have any impact on the
          rest of the driver. Thanks for your advice on the v1 patch.<br>
          <br>
          Regards,</div>
        <div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 11pt; color: rgb(0, 0, 0);">
          Shikang</div>
        <div style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 11pt; color: rgb(0, 0, 0);">
          <br>
        </div>
        <hr style="display: inline-block; width: 98%;">
        <div dir="ltr" id="divRplyFwdMsg"><span style="font-family: Calibri, sans-serif; font-size: 11pt; color: rgb(0, 0, 0);"><b>From:</b> Shikang
            Fan <a class="moz-txt-link-rfc2396E" href="mailto:shikang.fan@amd.com">&lt;shikang.fan@amd.com&gt;</a><br>
            <b>Sent:</b> Monday, November 18, 2024 6:10 PM<br>
            <b>To:</b> <a class="moz-txt-link-abbreviated" href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a>
            <a class="moz-txt-link-rfc2396E" href="mailto:amd-gfx@lists.freedesktop.org">&lt;amd-gfx@lists.freedesktop.org&gt;</a><br>
            <b>Cc:</b> Fan, Shikang <a class="moz-txt-link-rfc2396E" href="mailto:Shikang.Fan@amd.com">&lt;Shikang.Fan@amd.com&gt;</a>; Deng,
            Emily <a class="moz-txt-link-rfc2396E" href="mailto:Emily.Deng@amd.com">&lt;Emily.Deng@amd.com&gt;</a><br>
            <b>Subject:</b> [PATCH v2] drm/amdgpu: Check fence emitted
            count to identify bad jobs</span>
          <div> </div>
        </div>
        <div style="font-size: 11pt;">In SRIOV, when the host driver
          performs a MODE 1 reset and notifies FLR to<br>
          the guest driver, there is a small chance that there is no job
          running on hw<br>
          but the driver has not updated the pending list yet, causing
          the driver<br>
          not to respond to the FLR request. Modify the has_job_running
          function to<br>
          check whether there is still a running job.<br>
          <br>
          v2: Use amdgpu_fence_count_emitted to determine job running
          status.<br>
          <br>
          Signed-off-by: Emily Deng <a class="moz-txt-link-rfc2396E" href="mailto:Emily.Deng@amd.com">&lt;Emily.Deng@amd.com&gt;</a><br>
          Signed-off-by: Shikang Fan <a class="moz-txt-link-rfc2396E" href="mailto:shikang.fan@amd.com">&lt;shikang.fan@amd.com&gt;</a><br>
          ---<br>
           drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 22
          ++++++++++++++--------<br>
           1 file changed, 14 insertions(+), 8 deletions(-)<br>
          <br>
          diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
          b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
          index b3ca911e55d6..ea756eacebdc 100644<br>
          --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
          +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
          @@ -100,6 +100,7 @@
          MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");<br>
           #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)<br>
           #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)<br>
           #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)<br>
          +#define AMDGPU_PENDING_JOB_TIMEOUT     (1000000)<br>
           <br>
           static const struct drm_driver amdgpu_kms_driver;<br>
           <br>
          @@ -5222,15 +5223,19 @@ static int
          amdgpu_device_reset_sriov(struct amdgpu_device *adev,<br>
           }<br>
           <br>
           /**<br>
          - * amdgpu_device_has_job_running - check if there is any job
          in mirror list<br>
          + * amdgpu_device_has_job_running - check if there is any
          unfinished job<br>
            *<br>
            * @adev: amdgpu_device pointer<br>
            *<br>
          - * check if there is any job in mirror list<br>
          + * check if there is any job running on the device when guest
          driver receives<br>
          + * FLR notification from host driver. If there are still jobs
          running and not<br>
          + * signaled after 1s, the hardware is most likely hung
          already, then the guest<br>
          + * driver will not respond the FLR reset. Instead, let the
          job hit the timeout<br>
          + * and guest driver then issue the reset request.<br>
            */<br>
           bool amdgpu_device_has_job_running(struct amdgpu_device
          *adev)<br>
           {<br>
          -       int i;<br>
          +       int i, j;<br>
                   struct drm_sched_job *job;<br>
           <br>
                   for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {<br>
          @@ -5239,11 +5244,12 @@ bool
          amdgpu_device_has_job_running(struct amdgpu_device *adev)<br>
                           if (!amdgpu_ring_sched_ready(ring))<br>
                                   continue;<br>
           <br>
          -               spin_lock(&ring->sched.job_list_lock);<br>
          -               job =
          list_first_entry_or_null(&ring->sched.pending_list,<br>
          -                                              struct
          drm_sched_job, list);<br>
          -              
          spin_unlock(&ring->sched.job_list_lock);<br>
          -               if (job)<br>
          +               for (j = 0; j < AMDGPU_PENDING_JOB_TIMEOUT;
          j++) {<br>
          +                       if (!amdgpu_fence_count_emitted(ring))<br>
          +                               break;<br>
          +                       udelay(1);<br>
          +               }<br>
          +               if (j == AMDGPU_PENDING_JOB_TIMEOUT)<br>
                                   return true;<br>
                   }<br>
                   return false;<br>
          --<br>
          2.34.1<br>
          <br>
        </div>
      </div>
    </blockquote>
    <br>
  </body>
</html>