<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=Windows-1252">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 2021-02-26 6:54 a.m., Liu, Monk
      wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:DM5PR12MB17086F07E3EAC0C9CA03B94D849D9@DM5PR12MB1708.namprd12.prod.outlook.com">
      
      <meta name="Generator" content="Microsoft Word 15 (filtered
        medium)">
      <style><!--
/* Font Definitions */
@font-face
        {font-family:"Cambria Math";
        panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
        {font-family:DengXian;
        panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
        {font-family:Calibri;
        panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
        {font-family:"\@DengXian";
        panose-1:2 1 6 0 3 1 1 1 1 1;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
        {margin:0in;
        font-size:11.0pt;
        font-family:"Calibri",sans-serif;}
p.MsoListParagraph, li.MsoListParagraph, div.MsoListParagraph
        {mso-style-priority:34;
        margin-top:0in;
        margin-right:0in;
        margin-bottom:0in;
        margin-left:.5in;
        font-size:11.0pt;
        font-family:"Calibri",sans-serif;}
p.msipheader251902e5, li.msipheader251902e5, div.msipheader251902e5
        {mso-style-name:msipheader251902e5;
        mso-margin-top-alt:auto;
        margin-right:0in;
        mso-margin-bottom-alt:auto;
        margin-left:0in;
        font-size:11.0pt;
        font-family:"Calibri",sans-serif;}
span.EmailStyle20
        {mso-style-type:personal-reply;
        font-family:"Calibri",sans-serif;
        color:windowtext;}
p.msipheadera92f4c5c, li.msipheadera92f4c5c, div.msipheadera92f4c5c
        {mso-style-name:msipheadera92f4c5c;
        mso-margin-top-alt:auto;
        margin-right:0in;
        mso-margin-bottom-alt:auto;
        margin-left:0in;
        font-size:11.0pt;
        font-family:"Calibri",sans-serif;}
.MsoChpDefault
        {mso-style-type:export-only;
        font-size:10.0pt;}
@page WordSection1
        {size:8.5in 11.0in;
        margin:1.0in 1.0in 1.0in 1.0in;}
div.WordSection1
        {page:WordSection1;}
/* List Definitions */
@list l0
        {mso-list-id:503861270;
        mso-list-type:hybrid;
        mso-list-template-ids:1492292582 67698703 67698713 67698715 67698703 67698713 67698715 67698703 67698713 67698715;}
@list l0:level1
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l0:level2
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l0:level3
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l0:level4
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l0:level5
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l0:level6
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l0:level7
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l0:level8
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l0:level9
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l1
        {mso-list-id:1279491622;
        mso-list-type:hybrid;
        mso-list-template-ids:-1736673670 67698703 67698689 67698703 67698703 67698713 67698715 67698703 67698713 67698715;}
@list l1:level1
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l1:level2
        {mso-level-number-format:bullet;
        mso-level-text:\F0B7;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;
        font-family:Symbol;}
@list l1:level3
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-9.0pt;}
@list l1:level4
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l1:level5
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l1:level6
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l1:level7
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l1:level8
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l1:level9
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l2
        {mso-list-id:1655448059;
        mso-list-type:hybrid;
        mso-list-template-ids:-1584207202 67698703 67698713 67698715 67698703 67698713 67698715 67698703 67698713 67698715;}
@list l2:level1
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l2:level2
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l2:level3
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l2:level4
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l2:level5
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l2:level6
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l2:level7
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l2:level8
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;}
@list l2:level9
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
ol
        {margin-bottom:0in;}
ul
        {margin-bottom:0in;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
      <div class="WordSection1">
        <p class="msipheadera92f4c5c" style="margin:0in"><span style="font-family:'Arial',sans-serif;color:#0078D7">[AMD
            Official Use Only - Internal Distribution Only]</span><o:p></o:p></p>
        <p class="MsoNormal"><o:p> </o:p></p>
        <p class="MsoNormal">See in line<o:p></o:p></p>
        <p class="MsoNormal"><o:p> </o:p></p>
        <div>
          <p class="MsoNormal">Thanks <o:p></o:p></p>
          <p class="MsoNormal"><o:p> </o:p></p>
          <p class="MsoNormal">------------------------------------------<o:p></o:p></p>
          <p class="MsoNormal">Monk Liu | Cloud-GPU Core team<o:p></o:p></p>
          <p class="MsoNormal">------------------------------------------<o:p></o:p></p>
        </div>
        <p class="MsoNormal"><o:p> </o:p></p>
        <div>
          <div style="border:none;border-top:solid #E1E1E1
            1.0pt;padding:3.0pt 0in 0in 0in">
            <p class="MsoNormal"><b>From:</b> Koenig, Christian
              <a class="moz-txt-link-rfc2396E" href="mailto:Christian.Koenig@amd.com">&lt;Christian.Koenig@amd.com&gt;</a> <br>
              <b>Sent:</b> Friday, February 26, 2021 3:58 PM<br>
              <b>To:</b> Liu, Monk <a class="moz-txt-link-rfc2396E" href="mailto:Monk.Liu@amd.com">&lt;Monk.Liu@amd.com&gt;</a>;
              <a class="moz-txt-link-abbreviated" href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a><br>
              <b>Cc:</b> Zhang, Andy <a class="moz-txt-link-rfc2396E" href="mailto:Andy.Zhang@amd.com">&lt;Andy.Zhang@amd.com&gt;</a>; Chen,
              Horace <a class="moz-txt-link-rfc2396E" href="mailto:Horace.Chen@amd.com">&lt;Horace.Chen@amd.com&gt;</a>; Zhang, Jack (Jian)
              <a class="moz-txt-link-rfc2396E" href="mailto:Jack.Zhang1@amd.com">&lt;Jack.Zhang1@amd.com&gt;</a><br>
              <b>Subject:</b> Re: [RFC] a new approach to detect which
              ring is the real black sheep upon TDR reported<o:p></o:p></p>
          </div>
        </div>
        <p class="MsoNormal"><o:p> </o:p></p>
        <p class="MsoNormal" style="margin-bottom:12.0pt">Hi Monk,<br>
          <br>
          in general an interesting idea, but I see two major problems
          with that:<br>
          <br>
          1. It would make the reset take much longer.<br>
          <br>
          2. Things get often stuck because of timing issues, so a
          guilty job might pass perfectly when run a second time.<o:p></o:p></p>
        <p class="MsoNormal" style="margin-bottom:12.0pt">[ML] but the
          innocent ring already reported a TDR, and the drm sched logic
          already deleted this “sched_job” in its mirror list, thus you
          don’t have chance to re-submit it again after reset, that’s
          the major problem here.<br>
        </p>
      </div>
    </blockquote>
    <p><br>
    </p>
    <p>Just to confirm I understand correctly: Monk reports a scenario
      where the second TDR, which was reported by the innocent job, is
      bailing out BEFORE having a chance to run drm_sched_stop for that
      scheduler, which would have reinserted the job back into the
      mirror list (because the first TDR run is still in progress and
      hence amdgpu_device_lock_adev fails for the second TDR), and so
      the innocent job, which was extracted from the mirror list in
      drm_sched_job_timedout, is now lost.<br>
      If so, and as a possible quick fix until we overhaul the entire
      design as suggested in this thread, maybe we can modify the
      drm_sched_backend_ops.timedout_job callback to report back
      premature termination BEFORE drm_sched_stop had a chance to run,
      and then reinsert the job back into the mirror list from within
      drm_sched_job_timedout? There is no problem of racing against a
      concurrent drm_sched_get_cleanup_job once we reinsert there, as we
      don't reference the job pointer anymore after this point, and so
      if it's already signaled and freed right away, it's OK. <br>
    </p>
    <p>Andrey</p>
    <p><br>
    </p>
    <blockquote type="cite" cite="mid:DM5PR12MB17086F07E3EAC0C9CA03B94D849D9@DM5PR12MB1708.namprd12.prod.outlook.com">
      <div class="WordSection1">
        <p class="MsoNormal" style="margin-bottom:12.0pt">
          <br>
          Apart from that the whole ring mirror list turned out to be a
          really bad idea. E.g. we still struggle with object life time
          because the concept doesn't fit into the object model of the
          GPU scheduler under Linux.<br>
          <br>
          We should probably work on this separately and straighten up
          the job destruction once more and keep the recovery
          information in the fence instead.<o:p></o:p></p>
        <p class="MsoNormal" style="margin-bottom:12.0pt">[ML] we claim
          to our customer that no innocent process will be dropped or
          cancelled, and our current logic works for the most time, but
          only when there are different process running on gfx/computes
          rings then we would run into the tricky situation I stated
          here, and the proposal is the only way I can figure out so
          far, do you have a better solution or idea we review it as
          another candidate RFC ? Be note that we raised this proposal
          is because we do hit our trouble and we do need to resolve it
          …. So even a not perfect solution is still better than just
          cancel the innocent job (and their context/process)<o:p></o:p></p>
        <p class="MsoNormal" style="margin-bottom:12.0pt">Thanks ! <o:p></o:p></p>
        <p class="MsoNormal" style="margin-bottom:12.0pt"><br>
          Regards,<br>
          Christian.<o:p></o:p></p>
        <div>
          <p class="MsoNormal">Am 26.02.21 um 06:58 schrieb Liu, Monk:<o:p></o:p></p>
        </div>
        <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
          <p class="msipheader251902e5" style="margin:0in"><span style="font-size:10.0pt;font-family:'Arial',sans-serif;color:#317100">[AMD
              Public Use]</span><o:p></o:p></p>
          <p class="MsoNormal"><o:p> </o:p></p>
          <p class="MsoNormal">Hi all<o:p></o:p></p>
          <p class="MsoNormal"> <o:p></o:p></p>
          <p class="MsoNormal">The NAVI2X project has hit a really
            hard-to-solve issue, and it has turned out to be a general
            headache of our TDR mechanism; check the scenario below:<o:p></o:p></p>
          <p class="MsoNormal"> <o:p></o:p></p>
          <ol style="margin-top:0in" type="1" start="1">
            <li class="MsoListParagraph" style="margin-left:0in;mso-list:l0 level1 lfo1">There is a
              job1 running on compute1 ring at timestamp
              <o:p></o:p></li>
            <li class="MsoListParagraph" style="margin-left:0in;mso-list:l0 level1 lfo1">There is a
              job2 running on gfx ring at timestamp<o:p></o:p></li>
            <li class="MsoListParagraph" style="margin-left:0in;mso-list:l0 level1 lfo1">Job1 is
              the guilty one, and job1/job2 were scheduled to their
              rings at almost the same timestamp
              <o:p></o:p></li>
            <li class="MsoListParagraph" style="margin-left:0in;mso-list:l0 level1 lfo1">After 2
              seconds we receive two TDR reporting from both GFX ring
              and compute ring<o:p></o:p></li>
            <li class="MsoListParagraph" style="margin-left:0in;mso-list:l0 level1 lfo1"><b>Current
                scheme is that in drm scheduler all the head jobs of
                those two rings are considered “bad job” and taken away
                from the mirror list
              </b><o:p></o:p></li>
            <li class="MsoListParagraph" style="margin-left:0in;mso-list:l0 level1 lfo1">The result
              is both the real guilty job (job1) and the innocent job
              (job2) were all deleted from mirror list, and their
              corresponding contexts were also treated as guilty<b> (so
                the innocent process remains running is not secured)</b><o:p></o:p></li>
          </ol>
          <p class="MsoListParagraph"><b> </b><o:p></o:p></p>
          <p class="MsoNormal">But by our wish the ideal case is TDR
            mechanism can detect which ring is the guilty ring and the
            innocent ring can resubmits all its pending jobs:<o:p></o:p></p>
          <ol style="margin-top:0in" type="1" start="1">
            <li class="MsoListParagraph" style="margin-left:0in;mso-list:l2 level1 lfo2">Job1 to be
              deleted from compute1 ring’s mirror list<o:p></o:p></li>
            <li class="MsoListParagraph" style="margin-left:0in;mso-list:l2 level1 lfo2">Job2 is
              kept and resubmitted later and its belonging
              process/context are even not aware of this TDR at all
              <o:p></o:p></li>
          </ol>
          <p class="MsoNormal"> <o:p></o:p></p>
          <p class="MsoNormal"> <o:p></o:p></p>
          <p class="MsoNormal">Here I have a proposal intended to achieve
            the above goal, and its rough procedure is:<o:p></o:p></p>
          <ol style="margin-top:0in" type="1" start="1">
            <li class="MsoListParagraph" style="margin-left:0in;mso-list:l1 level1 lfo3">Once any
              ring reports a TDR, the head job is *<b>not</b>* treated
              as “bad job”, and it is *<b>not</b>* deleted from the
              mirror list in drm sched functions<o:p></o:p></li>
            <li class="MsoListParagraph" style="margin-left:0in;mso-list:l1 level1 lfo3">In
              vendor’s function (our amdgpu driver here):<o:p></o:p></li>
          </ol>
          <ol style="margin-top:0in" type="1" start="2">
            <ul style="margin-top:0in" type="disc">
              <li class="MsoListParagraph" style="margin-left:0in;mso-list:l1 level2 lfo3">reset
                GPU<o:p></o:p></li>
              <li class="MsoListParagraph" style="margin-left:0in;mso-list:l1 level2 lfo3">repeat
                below actions on each RINGS * one by one *:<o:p></o:p></li>
            </ul>
          </ol>
          <p class="MsoListParagraph" style="margin-left:1.5in;text-indent:-9.0pt;mso-list:l1
            level3 lfo3">
            <!--[if !supportLists]--><span style="mso-list:Ignore">1.<span style="font:7.0pt 'Times New Roman'">
              </span></span><!--[endif]-->take the head job and submit
            it on this ring<o:p></o:p></p>
          <p class="MsoListParagraph" style="margin-left:1.5in;text-indent:-9.0pt;mso-list:l1
            level3 lfo3">
            <!--[if !supportLists]--><span style="mso-list:Ignore">2.<span style="font:7.0pt 'Times New Roman'">
              </span></span><!--[endif]-->see if it completes, if not
            then this job is the real “bad job”<o:p></o:p></p>
          <p class="MsoListParagraph" style="margin-left:1.5in;text-indent:-9.0pt;mso-list:l1
            level3 lfo3">
            <!--[if !supportLists]--><span style="mso-list:Ignore">3.<span style="font:7.0pt 'Times New Roman'">
              </span></span><!--[endif]--> take it away from mirror list
            if this head job is “bad job”<o:p></o:p></p>
          <ol style="margin-top:0in" type="1" start="2">
            <ul style="margin-top:0in" type="disc">
              <li class="MsoListParagraph" style="margin-left:0in;mso-list:l1 level2 lfo3">After
                above iteration on all RINGS, we already clears all the
                bad job(s)<o:p></o:p></li>
            </ul>
          </ol>
          <ol style="margin-top:0in" type="1" start="3">
            <li class="MsoListParagraph" style="margin-left:0in;mso-list:l1 level1 lfo3">Resubmit
              all jobs from each mirror list to their corresponding
              rings (this is the existed logic)<o:p></o:p></li>
          </ol>
          <p class="MsoNormal"> <o:p></o:p></p>
          <p class="MsoNormal">The idea of this is to use “serial” way
            to re-run and re-check each head job of each RING, in order
            to take out the real black sheep and its guilty context.<o:p></o:p></p>
          <p class="MsoNormal"> <o:p></o:p></p>
          <p class="MsoNormal">P.S.: we can use this approach only when
            a GFX/KCQ ring reports a TDR, since those rings mutually
            affect each other. For an SDMA ring timeout, it definitely
            proves that the head job on the SDMA ring is really
            guilty.<o:p></o:p></p>
          <p class="MsoNormal"> <o:p></o:p></p>
          <p class="MsoNormal">Thanks <o:p></o:p></p>
          <p class="MsoNormal"> <o:p></o:p></p>
          <p class="MsoNormal">------------------------------------------<o:p></o:p></p>
          <p class="MsoNormal">Monk Liu | Cloud-GPU Core team<o:p></o:p></p>
          <p class="MsoNormal">------------------------------------------<o:p></o:p></p>
          <p class="MsoNormal"> <o:p></o:p></p>
        </blockquote>
        <p class="MsoNormal"><o:p> </o:p></p>
      </div>
      <br>
      <fieldset class="mimeAttachmentHeader"></fieldset>
      <pre class="moz-quote-pre" wrap="">_______________________________________________
amd-gfx mailing list
<a class="moz-txt-link-abbreviated" href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a>
<a class="moz-txt-link-freetext" href="https://lists.freedesktop.org/mailman/listinfo/amd-gfx">https://lists.freedesktop.org/mailman/listinfo/amd-gfx</a>
</pre>
    </blockquote>
  </body>
</html>