<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 2021-02-27 11:10 p.m., Liu, Monk
      wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:DM5PR12MB170892C661DF1943780049EA849B9@DM5PR12MB1708.namprd12.prod.outlook.com">
      
      <meta name="Generator" content="Microsoft Word 15 (filtered
        medium)">
      <style><!--
/* Font Definitions */
@font-face
        {font-family:宋体;
        panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
        {font-family:"Cambria Math";
        panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
        {font-family:等线;
        panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
        {font-family:Calibri;
        panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
        {font-family:"\@宋体";
        panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
        {font-family:"\@等线";
        panose-1:2 1 6 0 3 1 1 1 1 1;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
        {margin:0cm;
        font-size:11.0pt;
        font-family:"Calibri",sans-serif;}
a:link, span.MsoHyperlink
        {mso-style-priority:99;
        color:blue;
        text-decoration:underline;}
pre
        {mso-style-priority:99;
        mso-style-link:"HTML 预设格式 字符";
        margin:0cm;
        font-size:10.0pt;
        font-family:"Courier New";}
p.MsoListParagraph, li.MsoListParagraph, div.MsoListParagraph
        {mso-style-priority:34;
        margin-top:0cm;
        margin-right:0cm;
        margin-bottom:0cm;
        margin-left:36.0pt;
        font-size:11.0pt;
        font-family:"Calibri",sans-serif;}
span.HTML
        {mso-style-name:"HTML 预设格式 字符";
        mso-style-priority:99;
        mso-style-link:"HTML 预设格式";
        font-family:"Courier New";}
p.msipheader251902e5, li.msipheader251902e5, div.msipheader251902e5
        {mso-style-name:msipheader251902e5;
        mso-margin-top-alt:auto;
        margin-right:0cm;
        mso-margin-bottom-alt:auto;
        margin-left:0cm;
        font-size:11.0pt;
        font-family:"Calibri",sans-serif;}
p.msipheadera92f4c5c, li.msipheadera92f4c5c, div.msipheadera92f4c5c
        {mso-style-name:msipheadera92f4c5c;
        mso-margin-top-alt:auto;
        margin-right:0cm;
        mso-margin-bottom-alt:auto;
        margin-left:0cm;
        font-size:11.0pt;
        font-family:"Calibri",sans-serif;}
span.EmailStyle25
        {mso-style-type:personal-reply;
        font-family:等线;
        color:windowtext;}
.MsoChpDefault
        {mso-style-type:export-only;
        font-size:10.0pt;}
@page WordSection1
        {size:612.0pt 792.0pt;
        margin:72.0pt 72.0pt 72.0pt 72.0pt;}
div.WordSection1
        {page:WordSection1;}
/* List Definitions */
@list l0
        {mso-list-id:503861270;
        mso-list-type:hybrid;
        mso-list-template-ids:1492292582 67698703 67698713 67698715 67698703 67698713 67698715 67698703 67698713 67698715;}
@list l0:level1
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l0:level2
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l0:level3
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l0:level4
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l0:level5
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l0:level6
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l0:level7
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l0:level8
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l0:level9
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l1
        {mso-list-id:1279491622;
        mso-list-type:hybrid;
        mso-list-template-ids:-1736673670 67698703 67698689 67698703 67698703 67698713 67698715 67698703 67698713 67698715;}
@list l1:level1
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l1:level2
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Symbol;}
@list l1:level3
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-9.0pt;}
@list l1:level4
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l1:level5
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l1:level6
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l1:level7
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l1:level8
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l1:level9
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l2
        {mso-list-id:1655448059;
        mso-list-type:hybrid;
        mso-list-template-ids:-1584207202 67698703 67698713 67698715 67698703 67698713 67698715 67698703 67698713 67698715;}
@list l2:level1
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l2:level2
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l2:level3
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l2:level4
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l2:level5
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l2:level6
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l2:level7
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l2:level8
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l2:level9
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
ol
        {margin-bottom:0cm;}
ul
        {margin-bottom:0cm;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
      <p style="font-family:Arial;font-size:11pt;color:#0078D7;margin:5pt;" align="Left">
        [AMD Official Use Only - Internal Distribution Only]<br>
      </p>
      <br>
      <div>
        <div class="WordSection1">
          <p><span style="font-size:10.5pt;font-family:等线" lang="EN-US">>></span><span lang="EN-US"> So gfx job hangs because it has a dependency
              on buggy compute job which already is hanging ?<o:p></o:p></span></p>
          <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US">No,
              there is no dependency between this gfx job and that
              compute job from a software perspective , but the CU is
              shared thus gfx is affected by the bug from that compute
              job
              <o:p></o:p></span></p>
          <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US"><o:p> </o:p></span></p>
          <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US">>></span><span lang="EN-US"> I am still missing something - we don't ever
              delete bad jobs or any jobs until they are signaled, we
              reinsert the bad  job back into mirror list in
              drm_sched_stop<o:p></o:p></span></p>
          <p class="MsoNormal"><span lang="EN-US">Oh yeah, it was still
              kept in the mirror list, I thought it was removed once for
              good in scheduler…<o:p></o:p></span></p>
          <p class="MsoNormal"><span lang="EN-US">then my question is
              why we need to remove it in scheduler part if we always
              need to reinsert it back? </span></p>
        </div>
      </div>
    </blockquote>
    <p><br>
    </p>
    <p>See the explanation in the original fix, commit
      'drm/scheduler: Avoid accessing freed bad job.' The problem with
      that fix was that, while it solved the original race, it
      created another issue: if the driver terminated the reset
      process prematurely, either because the guilty job had already
      signaled (an optimization, as in non-amdgpu drivers) or because
      of reset lock contention between multiple TDR threads (as in
      amdgpu), then we would indeed remove the bad job but would never
      reinsert it, because we would skip drm_sched_stop. To address
      that, Luben proposed a state machine approach to handling the
      entire job life cycle (I can't find the patch set now), but during
      its review it was decided that the optimal approach would be to
      stop relying on the job and start relying on the
      entity-&gt;finish_fence to keep all the info (what Christian
      mentions at the beginning of this thread).</p>
    <p><br>
    </p>
    <blockquote type="cite" cite="mid:DM5PR12MB170892C661DF1943780049EA849B9@DM5PR12MB1708.namprd12.prod.outlook.com">
      <div>
        <div class="WordSection1">
          <p class="MsoNormal"><span lang="EN-US">And even for other
              vendors the better way is still<o:p></o:p></span></p>
          <p class="MsoNormal"><span lang="EN-US">let vendor driver
              decide the heading job.<o:p></o:p></span></p>
          <p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
          <p class="MsoNormal"><span lang="EN-US">The real issue we hit
              is : sometimes if we run a quark test (it hangs kcq ring
              with a bad shader inside), X server will occasionally
              crash with a GFX ring TDR report<o:p></o:p></span></p>
          <p class="MsoNormal"><span lang="EN-US">Root cause is still
              what I described before:  both this innocent gfx job and
              the guilty compute job are all marked as “guilty” by our
              driver, so even they are re-inserted back to mirror list<o:p></o:p></span></p>
          <p class="MsoNormal"><span lang="EN-US">But they are all
              abandoned in drm_sched_resubmit_jobs() due to they are all
              processed by drm_sched_increase_karma()</span></p>
        </div>
      </div>
    </blockquote>
    <p><br>
    </p>
    <p>I see now. In this case the main issue is indeed that we cannot
      rely on the head job in the mirror list being the actual bad,
      guilty job, and this then requires some redesign (e.g. along the
      lines of what you suggested).</p>
    <p>Andrey</p>
    <p><br>
    </p>
    <blockquote type="cite" cite="mid:DM5PR12MB170892C661DF1943780049EA849B9@DM5PR12MB1708.namprd12.prod.outlook.com">
      <div>
        <div class="WordSection1">
          <p class="MsoNormal"><span lang="EN-US"><o:p></o:p></span></p>
          <p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
          <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US"><o:p> </o:p></span></p>
          <div>
            <div style="border:none;border-top:solid #E1E1E1
              1.0pt;padding:3.0pt 0cm 0cm 0cm">
              <p class="MsoNormal"><b><span style="font-family:等线">发件人<span lang="EN-US">:</span></span></b><span style="font-family:等线" lang="EN-US"> Grodzovsky,
                  Andrey <a class="moz-txt-link-rfc2396E" href="mailto:Andrey.Grodzovsky@amd.com">&lt;Andrey.Grodzovsky@amd.com&gt;</a>
                  <br>
                </span><b><span style="font-family:等线">发送时间<span lang="EN-US">:</span></span></b><span style="font-family:等线" lang="EN-US"> 2021</span><span style="font-family:等线">年<span lang="EN-US">2</span>月<span lang="EN-US">28</span>日<span lang="EN-US"> 8:55<br>
                  </span><b>收件人<span lang="EN-US">:</span></b><span lang="EN-US"> Liu, Monk <a class="moz-txt-link-rfc2396E" href="mailto:Monk.Liu@amd.com">&lt;Monk.Liu@amd.com&gt;</a>;
                    Koenig, Christian <a class="moz-txt-link-rfc2396E" href="mailto:Christian.Koenig@amd.com">&lt;Christian.Koenig@amd.com&gt;</a>;
                    <a class="moz-txt-link-abbreviated" href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a><br>
                  </span><b>抄送<span lang="EN-US">:</span></b><span lang="EN-US"> Zhang, Andy
                    <a class="moz-txt-link-rfc2396E" href="mailto:Andy.Zhang@amd.com">&lt;Andy.Zhang@amd.com&gt;</a>; Chen, Horace
                    <a class="moz-txt-link-rfc2396E" href="mailto:Horace.Chen@amd.com">&lt;Horace.Chen@amd.com&gt;</a>; Zhang, Jack (Jian)
                    <a class="moz-txt-link-rfc2396E" href="mailto:Jack.Zhang1@amd.com">&lt;Jack.Zhang1@amd.com&gt;</a><br>
                  </span><b>主题<span lang="EN-US">:</span></b><span lang="EN-US"> Re: </span>回复<span lang="EN-US">:
                    [RFC] a new approach to detect which ring is the
                    real black sheep upon TDR reported<o:p></o:p></span></span></p>
            </div>
          </div>
          <p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
          <p><span lang="EN-US"><o:p> </o:p></span></p>
          <div>
            <p class="MsoNormal"><span lang="EN-US">On 2021-02-26 10:56
                p.m., Liu, Monk wrote:<o:p></o:p></span></p>
          </div>
          <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
            <p style="margin:5.0pt"><span style="font-size:11.0pt;font-family:'Arial',sans-serif;color:#0078D7" lang="EN-US">[AMD Official Use Only - Internal
                Distribution Only]<o:p></o:p></span></p>
            <p class="MsoNormal"><span style="font-size:12.0pt;font-family:宋体" lang="EN-US"><o:p> </o:p></span></p>
            <div>
              <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US">Hi
                  Andrey</span><span lang="EN-US"><o:p></o:p></span></p>
              <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US"> </span><span lang="EN-US"><o:p></o:p></span></p>
              <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US">The
                  scenario I hit here is not the one you mentioned, let
                  me explain it with more details by another much easier
                  understood example:</span><span lang="EN-US"><o:p></o:p></span></p>
              <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US"> </span><span lang="EN-US"><o:p></o:p></span></p>
              <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US">Consider
                  ring you have a job1 on KCQ, but the timeout of KCQ is
                  60 seconds (just for example)</span><span lang="EN-US"><o:p></o:p></span></p>
              <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US">You
                  also have a job2 on GFX ring, and the timeout of GFX
                  is 2 seconds</span><span lang="EN-US"><o:p></o:p></span></p>
              <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US"> </span><span lang="EN-US"><o:p></o:p></span></p>
              <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US">We
                  submit job1 first, and assume job1 have bug and it
                  will cause shader hang very very soon
                </span><span lang="EN-US"><o:p></o:p></span></p>
              <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US">After
                  10 seconds we submit job2, since KCQ have 60 seconds
                  to report TDR thus SW know nothing about the engine
                  already hang</span><span lang="EN-US"><o:p></o:p></span></p>
            </div>
          </blockquote>
          <p><span lang="EN-US"><o:p> </o:p></span></p>
          <p><span lang="EN-US">So gfx job hangs because it has a
              dependency on buggy compute job which already is hanging ?<o:p></o:p></span></p>
          <p><span lang="EN-US"><o:p> </o:p></span></p>
          <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
            <div>
              <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US">After
                  2 seconds we got TDR report from job2 on GFX ring,
                  sched_job_timeout() think the leading job of GFX ring
                  is the black sheep so it is deleted from the mirror
                  list</span><span lang="EN-US"><o:p></o:p></span></p>
              <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US">But
                  in fact this job2 is innocent, and we should insert it
                  back after recovery , and due to it was already
                  deleted this innocent job</span><span style="font-size:10.5pt;font-family:等线">’<span lang="EN-US">s context/process is really harmed</span></span><span lang="EN-US"><o:p></o:p></span></p>
            </div>
          </blockquote>
          <p><span lang="EN-US"><o:p> </o:p></span></p>
          <p><span lang="EN-US">I am still missing something - we don't
              ever delete bad jobs or any jobs until they are signaled,
              we reinsert the bad  job back into mirror list in
              drm_sched_stop
              <br>
              (here - <a href="https://elixir.bootlin.com/linux/v5.11.1/source/drivers/gpu/drm/scheduler/sched_main.c#L385" moz-do-not-send="true">
https://elixir.bootlin.com/linux/v5.11.1/source/drivers/gpu/drm/scheduler/sched_main.c#L385</a>)
              after sched thread is stopped and continue with the reset
              procedure.<o:p></o:p></span></p>
          <p><span lang="EN-US">Andrey<o:p></o:p></span></p>
          <p><span lang="EN-US"><o:p> </o:p></span></p>
          <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
            <div>
              <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US"> </span><span lang="EN-US"><o:p></o:p></span></p>
              <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US">Hope
                  above example helps</span><span lang="EN-US"><o:p></o:p></span></p>
              <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US"> </span><span lang="EN-US"><o:p></o:p></span></p>
              <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US">Thanks
                </span><span lang="EN-US"><o:p></o:p></span></p>
              <p class="MsoNormal"><span style="font-size:10.5pt;font-family:等线" lang="EN-US"> </span><span lang="EN-US"><o:p></o:p></span></p>
              <div>
                <div style="border:none;border-top:solid #E1E1E1
                  1.0pt;padding:3.0pt 0cm 0cm 0cm">
                  <p class="MsoNormal"><b><span style="font-family:等线">发件人<span lang="EN-US">:</span></span></b><span style="font-family:等线" lang="EN-US"> Grodzovsky,
                      Andrey
                      <a href="mailto:Andrey.Grodzovsky@amd.com" moz-do-not-send="true">&lt;Andrey.Grodzovsky@amd.com&gt;</a>
                      <br>
                    </span><b><span style="font-family:等线">发送时间<span lang="EN-US">:</span></span></b><span style="font-family:等线" lang="EN-US"> 2021</span><span style="font-family:等线">年<span lang="EN-US">2</span>月<span lang="EN-US">27</span>日<span lang="EN-US"> 0:50<br>
                      </span><b>收件人<span lang="EN-US">:</span></b><span lang="EN-US"> Liu, Monk <a href="mailto:Monk.Liu@amd.com" moz-do-not-send="true">
                          &lt;Monk.Liu@amd.com&gt;</a>; Koenig,
                        Christian <a href="mailto:Christian.Koenig@amd.com" moz-do-not-send="true">
                          &lt;Christian.Koenig@amd.com&gt;</a>; <a href="mailto:amd-gfx@lists.freedesktop.org" moz-do-not-send="true">amd-gfx@lists.freedesktop.org</a><br>
                      </span><b>抄送<span lang="EN-US">:</span></b><span lang="EN-US"> Zhang, Andy <a href="mailto:Andy.Zhang@amd.com" moz-do-not-send="true">
                          &lt;Andy.Zhang@amd.com&gt;</a>; Chen, Horace <a href="mailto:Horace.Chen@amd.com" moz-do-not-send="true">&lt;Horace.Chen@amd.com&gt;</a>;
                        Zhang, Jack (Jian)
                        <a href="mailto:Jack.Zhang1@amd.com" moz-do-not-send="true">&lt;Jack.Zhang1@amd.com&gt;</a><br>
                      </span><b>主题<span lang="EN-US">:</span></b><span lang="EN-US"> Re: [RFC] a new approach to detect
                        which ring is the real black sheep upon TDR
                        reported</span></span><span lang="EN-US"><o:p></o:p></span></p>
                </div>
              </div>
              <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
              <p><span lang="EN-US"> <o:p></o:p></span></p>
              <div>
                <p class="MsoNormal"><span lang="EN-US">On 2021-02-26
                    6:54 a.m., Liu, Monk wrote:<o:p></o:p></span></p>
              </div>
              <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
                <p class="msipheadera92f4c5c" style="margin:0cm"><span style="font-family:'Arial',sans-serif;color:#0078D7" lang="EN-US">[AMD Official Use Only - Internal
                    Distribution Only]</span><span lang="EN-US"><o:p></o:p></span></p>
                <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
                <p class="MsoNormal"><span lang="EN-US">See in line<o:p></o:p></span></p>
                <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
                <div>
                  <p class="MsoNormal"><span lang="EN-US">Thanks <o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US">------------------------------------------<o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US">Monk Liu |
                      Cloud-GPU Core team<o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US">------------------------------------------<o:p></o:p></span></p>
                </div>
                <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
                <div>
                  <div style="border:none;border-top:solid #E1E1E1
                    1.0pt;padding:3.0pt 0cm 0cm 0cm">
                    <p class="MsoNormal"><b><span lang="EN-US">From:</span></b><span lang="EN-US"> Koenig, Christian
                        <a href="mailto:Christian.Koenig@amd.com" moz-do-not-send="true">&lt;Christian.Koenig@amd.com&gt;</a>
                        <br>
                        <b>Sent:</b> Friday, February 26, 2021 3:58 PM<br>
                        <b>To:</b> Liu, Monk <a href="mailto:Monk.Liu@amd.com" moz-do-not-send="true">&lt;Monk.Liu@amd.com&gt;</a>;
                        <a href="mailto:amd-gfx@lists.freedesktop.org" moz-do-not-send="true">
                          amd-gfx@lists.freedesktop.org</a><br>
                        <b>Cc:</b> Zhang, Andy <a href="mailto:Andy.Zhang@amd.com" moz-do-not-send="true">&lt;Andy.Zhang@amd.com&gt;</a>;
                        Chen, Horace
                        <a href="mailto:Horace.Chen@amd.com" moz-do-not-send="true">&lt;Horace.Chen@amd.com&gt;</a>;
                        Zhang, Jack (Jian)
                        <a href="mailto:Jack.Zhang1@amd.com" moz-do-not-send="true">&lt;Jack.Zhang1@amd.com&gt;</a><br>
                        <b>Subject:</b> Re: [RFC] a new approach to
                        detect which ring is the real black sheep upon
                        TDR reported<o:p></o:p></span></p>
                  </div>
                </div>
                <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
                <p class="MsoNormal" style="margin-bottom:12.0pt"><span lang="EN-US">Hi Monk,<br>
                    <br>
                    in general an interesting idea, but I see two major
                    problems with that:<br>
                    <br>
                    1. It would make the reset take much longer.<br>
                    <br>
                    2. Things get often stuck because of timing issues,
                    so a guilty job might pass perfectly when run a
                    second time.<o:p></o:p></span></p>
                <p class="MsoNormal" style="margin-bottom:12.0pt"><span lang="EN-US">[ML] but the innocent ring already
                    reported a TDR, and the drm sched logic already
                    deleted this “sched_job” in its mirror list, thus
                    you don’t have chance to re-submit it again after
                    reset, that’s the major problem here.<o:p></o:p></span></p>
              </blockquote>
              <p><span lang="EN-US"> <o:p></o:p></span></p>
              <p><span lang="EN-US">Just to confirm I understand
                  correctly, Monk reports a scenario where the second
                  TDR that was reported by the innocent job is bailing
                  out BEFORE having a chance to run  drm_sched_stop for
                  that scheduler which should have reinserted the job
                  back into mirror list (because the first TDR run is
                  still in progress and hence amdgpu_device_lock_adev
                  fails for the second TDR) and so the innocent job
                  which was extracted from mirror list in
                  drm_sched_job_timedout is now lost.<br>
                  If so and as a possible quick fix until we overhaul
                  the entire design as suggested in this thread - maybe
                  we can modify drm_sched_backend_ops.timedout_job
                  callback to report back premature termination BEFORE
                  drm_sched_stop had a chance to run and then reinsert
                  back the job into mirror list from within 
                  drm_sched_job_timedout? There is no problem of racing
                  against concurrent drm_sched_get_cleanup_job once we
                  reinsert there as we don't reference the job pointer
                  anymore after this point and so if it's already
                  signaled and freed right away - it's ok. <o:p></o:p></span></p>
              <p><span lang="EN-US">Andrey<o:p></o:p></span></p>
              <p><span lang="EN-US"> <o:p></o:p></span></p>
              <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
                <p class="MsoNormal" style="margin-bottom:12.0pt"><span lang="EN-US"><br>
                    Apart from that the whole ring mirror list turned
                    out to be a really bad idea. E.g. we still struggle
                    with object life time because the concept doesn't
                    fit into the object model of the GPU scheduler under
                    Linux.<br>
                    <br>
                    We should probably work on this separately and
                    straighten up the job destruction once more and keep
                    the recovery information in the fence instead.<o:p></o:p></span></p>
                <p class="MsoNormal" style="margin-bottom:12.0pt"><span lang="EN-US">[ML] we claim to our customer that no
                    innocent process will be dropped or cancelled, and
                    our current logic works for the most time, but only
                    when there are different process running on
                    gfx/computes rings then we would run into the tricky
                    situation I stated here, and the proposal is the
                    only way I can figure out so far, do you have a
                    better solution or idea we review it as another
                    candidate RFC ? Be note that we raised this proposal
                    is because we do hit our trouble and we do need to
                    resolve it …. So even a not perfect solution is
                    still better than just cancel the innocent job (and
                    their context/process)<o:p></o:p></span></p>
                <p class="MsoNormal" style="margin-bottom:12.0pt"><span lang="EN-US">Thanks ! <o:p>
                    </o:p></span></p>
                <p class="MsoNormal" style="margin-bottom:12.0pt"><span lang="EN-US"><br>
                    Regards,<br>
                    Christian.<o:p></o:p></span></p>
                <div>
                  <p class="MsoNormal"><span lang="EN-US">Am 26.02.21 um
                      06:58 schrieb Liu, Monk:<o:p></o:p></span></p>
                </div>
                <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
                  <p class="msipheader251902e5" style="margin:0cm"><span style="font-size:10.0pt;font-family:'Arial',sans-serif;color:#317100" lang="EN-US">[AMD Public Use]</span><span lang="EN-US"><o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US">Hi all<o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US">NAVI2X
                       project hit a really hard to solve issue now, and
                      it is turned out to be a general headache of our
                      TDR mechanism , check below scenario:<o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
                  <ol style="margin-top:0cm" type="1" start="1">
                    <li class="MsoListParagraph" style="margin-left:0cm;mso-list:l0 level1 lfo1"><span lang="EN-US">There is a job1 running on compute1
                        ring at timestamp
                        <o:p></o:p></span></li>
                    <li class="MsoListParagraph" style="margin-left:0cm;mso-list:l0 level1 lfo1"><span lang="EN-US">There is a job2 running on gfx ring
                        at timestamp<o:p></o:p></span></li>
                    <li class="MsoListParagraph" style="margin-left:0cm;mso-list:l0 level1 lfo1"><span lang="EN-US">Job1 is the guilty one, and
                        job1/job2 were scheduled to their rings at
                        almost the same timestamp
                        <o:p></o:p></span></li>
                    <li class="MsoListParagraph" style="margin-left:0cm;mso-list:l0 level1 lfo1"><span lang="EN-US">After 2 seconds we receive two TDR
                        reporting from both GFX ring and compute ring<o:p></o:p></span></li>
                    <li class="MsoListParagraph" style="margin-left:0cm;mso-list:l0 level1 lfo1"><b><span lang="EN-US">Current scheme is that in drm
                          scheduler all the head jobs of those two rings
                          are considered “bad job” and taken away from
                          the mirror list
                        </span></b><span lang="EN-US"><o:p></o:p></span></li>
                    <li class="MsoListParagraph" style="margin-left:0cm;mso-list:l0 level1 lfo1"><span lang="EN-US">The result is both the real guilty
                        job (job1) and the innocent job (job2) were all
                        deleted from mirror list, and their
                        corresponding contexts were also treated as
                        guilty<b> (so there is no guarantee that the
                          innocent process keeps running)</b><o:p></o:p></span></li>
                  </ol>
                  <p class="MsoListParagraph"><b><span lang="EN-US"> </span></b><span lang="EN-US"><o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US">Ideally,
                      however, the TDR mechanism would detect
                      which ring is the guilty ring, and the innocent
                      ring could resubmit all of its pending jobs:<o:p></o:p></span></p>
                  <ol style="margin-top:0cm" type="1" start="1">
                    <li class="MsoListParagraph" style="margin-left:0cm;mso-list:l2 level1 lfo2"><span lang="EN-US">Job1 to be deleted from compute1
                        ring’s mirror list<o:p></o:p></span></li>
                    <li class="MsoListParagraph" style="margin-left:0cm;mso-list:l2 level1 lfo2"><span lang="EN-US">Job2 is kept and resubmitted later,
                        and the process/context it belongs to is not
                        even aware of this TDR
                        <o:p></o:p></span></li>
                  </ol>
                  <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US">Here I have a
                      proposal that aims to achieve the above goal; its
                      rough procedure is:<o:p></o:p></span></p>
                  <ol style="margin-top:0cm" type="1" start="1">
                    <li class="MsoListParagraph" style="margin-left:0cm;mso-list:l1 level1 lfo3"><span lang="EN-US">Once any ring reports a TDR, the
                        head job is *<b>not</b>* treated as “bad job”,
                        and it is *<b>not</b>* deleted from the mirror
                        list in drm sched functions<o:p></o:p></span></li>
                    <li class="MsoListParagraph" style="margin-left:0cm;mso-list:l1 level1 lfo3"><span lang="EN-US">In vendor’s function (our amdgpu
                        driver here):<o:p></o:p></span></li>
                  </ol>
                  <ol style="margin-top:0cm" type="1" start="2">
                    <ul style="margin-top:0cm" type="disc">
                      <li class="MsoListParagraph" style="margin-left:0cm;mso-list:l1 level2 lfo3"><span lang="EN-US">reset GPU<o:p></o:p></span></li>
                      <li class="MsoListParagraph" style="margin-left:0cm;mso-list:l1 level2 lfo3"><span lang="EN-US">repeat the actions below on each
                          ring *one by one*:<o:p></o:p></span></li>
                    </ul>
                  </ol>
                  <p class="MsoListParagraph" style="margin-left:108.0pt;text-indent:-9.0pt;mso-list:l1
                    level3 lfo3">
                    <!--[if !supportLists]--><span lang="EN-US"><span style="mso-list:Ignore">1.<span style="font:7.0pt 'Times New Roman'">
                        </span></span></span><!--[endif]--><span lang="EN-US">take the head job and submit it on
                      this ring<o:p></o:p></span></p>
                  <p class="MsoListParagraph" style="margin-left:108.0pt;text-indent:-9.0pt;mso-list:l1
                    level3 lfo3">
                    <!--[if !supportLists]--><span lang="EN-US"><span style="mso-list:Ignore">2.<span style="font:7.0pt 'Times New Roman'">
                        </span></span></span><!--[endif]--><span lang="EN-US">see if it completes, if not then this
                      job is the real “bad job”<o:p></o:p></span></p>
                  <p class="MsoListParagraph" style="margin-left:108.0pt;text-indent:-9.0pt;mso-list:l1
                    level3 lfo3">
                    <!--[if !supportLists]--><span lang="EN-US"><span style="mso-list:Ignore">3.<span style="font:7.0pt 'Times New Roman'">
                        </span></span></span><!--[endif]--><span lang="EN-US"> take it away from mirror list if
                      this head job is “bad job”<o:p></o:p></span></p>
                  <ol style="margin-top:0cm" type="1" start="2">
                    <ul style="margin-top:0cm" type="disc">
                      <li class="MsoListParagraph" style="margin-left:0cm;mso-list:l1 level2 lfo3"><span lang="EN-US">After the above iteration over all
                          rings, we have already cleared all the bad job(s)<o:p></o:p></span></li>
                    </ul>
                  </ol>
                  <ol style="margin-top:0cm" type="1" start="3">
                    <li class="MsoListParagraph" style="margin-left:0cm;mso-list:l1 level1 lfo3"><span lang="EN-US">Resubmit all jobs from each mirror
                        list to their corresponding rings (this is the
                        existing logic)<o:p></o:p></span></li>
                  </ol>
                  <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US">The idea of
                      this is to use a “serial” way to re-run and re-check
                      each head job of each RING, in order to take out
                      the real black sheep and its guilty context.<o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US">P.S.: we can
                      use this approach only when a GFX/KCQ ring reports
                      a TDR, since those rings mutually affect
                      each other. An SDMA ring timeout definitely
                      proves that the head job on the SDMA ring is really guilty.<o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US">Thanks <o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US">------------------------------------------<o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US">Monk Liu |
                      Cloud-GPU Core team<o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US">------------------------------------------<o:p></o:p></span></p>
                  <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
                </blockquote>
                <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
                <p class="MsoNormal"><span lang="EN-US"><br>
                    <br>
                    <br>
                    <o:p></o:p></span></p>
                <pre><span lang="EN-US">_______________________________________________<o:p></o:p></span></pre>
                <pre><span lang="EN-US">amd-gfx mailing list<o:p></o:p></span></pre>
                <pre><span lang="EN-US"><a href="mailto:amd-gfx@lists.freedesktop.org" moz-do-not-send="true">amd-gfx@lists.freedesktop.org</a><o:p></o:p></span></pre>
                <pre><span lang="EN-US"><a href="https://lists.freedesktop.org/mailman/listinfo/amd-gfx" moz-do-not-send="true">https://lists.freedesktop.org/mailman/listinfo/amd-gfx</a><o:p></o:p></span></pre>
              </blockquote>
            </div>
          </blockquote>
        </div>
      </div>
    </blockquote>
  </body>
</html>