<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  </head>
  <body>
    Am 27.02.21 um 04:50 schrieb Liu, Monk:<br>
    <blockquote type="cite"
cite="mid:DM5PR12MB1708E521038AD046F45943AE849C9@DM5PR12MB1708.namprd12.prod.outlook.com">
      <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
      <meta name="Generator" content="Microsoft Word 15 (filtered
        medium)">
      <style><!--
/* Font Definitions */
@font-face
        {font-family:Wingdings;
        panose-1:5 0 0 0 0 0 0 0 0 0;}
@font-face
        {font-family:宋体;
        panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
        {font-family:"Cambria Math";
        panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
        {font-family:等线;
        panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
        {font-family:Calibri;
        panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
        {font-family:"\@等线";
        panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
        {font-family:"\@宋体";
        panose-1:2 1 6 0 3 1 1 1 1 1;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
        {margin:0cm;
        font-size:11.0pt;
        font-family:"Calibri",sans-serif;}
a:link, span.MsoHyperlink
        {mso-style-priority:99;
        color:#0563C1;
        text-decoration:underline;}
p.MsoListParagraph, li.MsoListParagraph, div.MsoListParagraph
        {mso-style-priority:34;
        margin-top:0cm;
        margin-right:0cm;
        margin-bottom:0cm;
        margin-left:36.0pt;
        font-size:11.0pt;
        font-family:"Calibri",sans-serif;}
p.msipheader251902e5, li.msipheader251902e5, div.msipheader251902e5
        {mso-style-name:msipheader251902e5;
        mso-margin-top-alt:auto;
        margin-right:0cm;
        mso-margin-bottom-alt:auto;
        margin-left:0cm;
        font-size:11.0pt;
        font-family:"Calibri",sans-serif;}
p.msipheadera92f4c5c, li.msipheadera92f4c5c, div.msipheadera92f4c5c
        {mso-style-name:msipheadera92f4c5c;
        mso-margin-top-alt:auto;
        margin-right:0cm;
        mso-margin-bottom-alt:auto;
        margin-left:0cm;
        font-size:11.0pt;
        font-family:"Calibri",sans-serif;}
span.EmailStyle22
        {mso-style-type:personal-reply;
        font-family:等线;
        color:windowtext;}
.MsoChpDefault
        {mso-style-type:export-only;
        font-size:10.0pt;}
@page WordSection1
        {size:612.0pt 792.0pt;
        margin:72.0pt 72.0pt 72.0pt 72.0pt;}
div.WordSection1
        {page:WordSection1;}
/* List Definitions */
@list l0
        {mso-list-id:503861270;
        mso-list-type:hybrid;
        mso-list-template-ids:1492292582 67698703 67698713 67698715 67698703 67698713 67698715 67698703 67698713 67698715;}
@list l0:level1
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l0:level2
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l0:level3
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l0:level4
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l0:level5
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l0:level6
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l0:level7
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l0:level8
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l0:level9
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l1
        {mso-list-id:1279491622;
        mso-list-type:hybrid;
        mso-list-template-ids:-1736673670 67698703 67698689 67698703 67698703 67698713 67698715 67698703 67698713 67698715;}
@list l1:level1
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l1:level2
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Symbol;}
@list l1:level3
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-9.0pt;}
@list l1:level4
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l1:level5
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l1:level6
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l1:level7
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l1:level8
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l1:level9
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l2
        {mso-list-id:1593588455;
        mso-list-type:hybrid;
        mso-list-template-ids:-696075286 67698689 67698691 67698693 67698689 67698691 67698693 67698689 67698691 67698693;}
@list l2:level1
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:21.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l2:level2
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:42.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l2:level3
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:63.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l2:level4
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:84.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l2:level5
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:105.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l2:level6
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:126.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l2:level7
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:147.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l2:level8
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:168.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l2:level9
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:189.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l3
        {mso-list-id:1655448059;
        mso-list-type:hybrid;
        mso-list-template-ids:-1584207202 67698703 67698713 67698715 67698703 67698713 67698715 67698703 67698713 67698715;}
@list l3:level1
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l3:level2
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l3:level3
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l3:level4
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l3:level5
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l3:level6
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
@list l3:level7
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l3:level8
        {mso-level-number-format:alpha-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;}
@list l3:level9
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        text-indent:-9.0pt;}
ol
        {margin-bottom:0cm;}
ul
        {margin-bottom:0cm;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
      <p
        style="font-family:Arial;font-size:11pt;color:#0078D7;margin:5pt;"
        align="Left">
        [AMD Official Use Only - Internal Distribution Only]<br>
      </p>
      <br>
      <div>
        <div class="WordSection1">
          <p class="MsoNormal"><span
              style="font-size:10.5pt;font-family:等线" lang="EN-US">The
              code I pasted illustrates why the innocent job has
              already been taken out of the mirror list, so my
              proposal won’t work unless we stop deleting the job in the
              sched_job_timeout() routine. As I understand it, the
              problem you stated is also related to my suggested
              solution – removing the job from the list should be
              handled by the driver instead of the scheduler.</span></p>
        </div>
      </div>
    </blockquote>
    <br>
    Yes, exactly that's my thinking as well.<br>
    <br>
    <blockquote type="cite"
cite="mid:DM5PR12MB1708E521038AD046F45943AE849C9@DM5PR12MB1708.namprd12.prod.outlook.com">
      <div>
        <div class="WordSection1">
          <p class="MsoNormal"><span
              style="font-size:10.5pt;font-family:等线" lang="EN-US"><o:p></o:p></span></p>
          <p class="MsoNormal"><span
              style="font-size:10.5pt;font-family:等线" lang="EN-US"><o:p> </o:p></span><span
              style="font-size:10.5pt;font-family:等线" lang="EN-US">Let’s
              make the scheduler’s duty clear and simple:
              sched_job_timeout() only gets a notification when a
              sched_job has timed out, but it doesn’t judge whether the
              leading job in the mirror list should be blamed; all of
              that checking should be left to the driver to act on.</span></p>
        </div>
      </div>
    </blockquote>
    <br>
    Need to get a detailed look, but it sounds correct as well.<br>
    <br>
    <blockquote type="cite"
cite="mid:DM5PR12MB1708E521038AD046F45943AE849C9@DM5PR12MB1708.namprd12.prod.outlook.com">
      <div>
        <div class="WordSection1">
          <p class="MsoNormal"><span
              style="font-size:10.5pt;font-family:等线" lang="EN-US"><o:p></o:p></span></p>
          <p class="MsoNormal"><span
              style="font-size:10.5pt;font-family:等线" lang="EN-US"><o:p> </o:p></span></p>
          <p class="MsoNormal"><span
              style="font-size:10.5pt;font-family:等线" lang="EN-US">>></span><span
              lang="EN-US"> If we do this we should probably make it
              configurable as a module parameter.<o:p></o:p></span></p>
          <p class="MsoNormal"><span
              style="font-size:10.5pt;font-family:等线" lang="EN-US">That’s
              OK; maybe we can reuse the existing parameter “gpu_recovery”
              and extend it with:<o:p></o:p></span></p>
          <p class="MsoListParagraph"
            style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l2
            level1 lfo6">
            <!--[if !supportLists]--><span
              style="font-size:10.5pt;font-family:Wingdings"
              lang="EN-US"><span style="mso-list:Ignore">l<span
                  style="font:7.0pt 'Times New Roman'"> 
                </span></span></span><!--[endif]--><span
              style="font-size:10.5pt;font-family:等线" lang="EN-US">0 –
              no recovery initiated after job timeout<o:p></o:p></span></p>
          <p class="MsoListParagraph"
            style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l2
            level1 lfo6">
            <!--[if !supportLists]--><span
              style="font-size:10.5pt;font-family:Wingdings"
              lang="EN-US"><span style="mso-list:Ignore">l<span
                  style="font:7.0pt 'Times New Roman'"> 
                </span></span></span><!--[endif]--><span
              style="font-size:10.5pt;font-family:等线" lang="EN-US">1 –
              legacy TDR behavior
              <o:p></o:p></span></p>
          <p class="MsoListParagraph"
            style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l2
            level1 lfo6">
            <!--[if !supportLists]--><span
              style="font-size:10.5pt;font-family:Wingdings"
              lang="EN-US"><span style="mso-list:Ignore">l<span
                  style="font:7.0pt 'Times New Roman'"> 
                </span></span></span><!--[endif]--><span
              style="font-size:10.5pt;font-family:等线" lang="EN-US">2 –
              enhanced TDR behavior (the one suggested here)</span></p>
        </div>
      </div>
    </blockquote>
    <br>
    Yes, something like that should work. Key point is we had a couple
    of people who already suggested to optimize the reset routine so
    that it doesn't take so long.<br>
    <br>
    So far I pushed back on this because the reset routine isn't
    something I would optimize for speed. But when it starts to take
    something like 10 seconds instead of half a second because you had
    an extra long running compute job we will certainly see complaints.<br>
    <br>
    Regards,<br>
    Christian.<br>
    <br>
    <blockquote type="cite"
cite="mid:DM5PR12MB1708E521038AD046F45943AE849C9@DM5PR12MB1708.namprd12.prod.outlook.com">
      <div>
        <div class="WordSection1">
          <p class="MsoListParagraph"
            style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l2
            level1 lfo6"><span style="font-size:10.5pt;font-family:等线"
              lang="EN-US"><o:p></o:p></span></p>
          <p class="MsoNormal"><span
              style="font-size:10.5pt;font-family:等线" lang="EN-US"><o:p> </o:p></span></p>
          <div>
            <div style="border:none;border-top:solid #E1E1E1
              1.0pt;padding:3.0pt 0cm 0cm 0cm">
              <p class="MsoNormal"><b><span style="font-family:等线">发件人<span
                      lang="EN-US">:</span></span></b><span
                  style="font-family:等线" lang="EN-US"> Koenig, Christian
                  <a class="moz-txt-link-rfc2396E" href="mailto:Christian.Koenig@amd.com">&lt;Christian.Koenig@amd.com&gt;</a>
                  <br>
                </span><b><span style="font-family:等线">发送时间<span
                      lang="EN-US">:</span></span></b><span
                  style="font-family:等线" lang="EN-US"> 2021</span><span
                  style="font-family:等线">年<span lang="EN-US">2</span>月<span
                    lang="EN-US">26</span>日<span lang="EN-US"> 20:05<br>
                  </span><b>收件人<span lang="EN-US">:</span></b><span
                    lang="EN-US"> Liu, Monk <a class="moz-txt-link-rfc2396E" href="mailto:Monk.Liu@amd.com">&lt;Monk.Liu@amd.com&gt;</a>;
                    <a class="moz-txt-link-abbreviated" href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a><br>
                  </span><b>抄送<span lang="EN-US">:</span></b><span
                    lang="EN-US"> Zhang, Andy
                    <a class="moz-txt-link-rfc2396E" href="mailto:Andy.Zhang@amd.com">&lt;Andy.Zhang@amd.com&gt;</a>; Chen, Horace
                    <a class="moz-txt-link-rfc2396E" href="mailto:Horace.Chen@amd.com">&lt;Horace.Chen@amd.com&gt;</a>; Zhang, Jack (Jian)
                    <a class="moz-txt-link-rfc2396E" href="mailto:Jack.Zhang1@amd.com">&lt;Jack.Zhang1@amd.com&gt;</a><br>
                  </span><b>主题<span lang="EN-US">:</span></b><span
                    lang="EN-US"> Re: [RFC] a new approach to detect
                    which ring is the real black sheep upon TDR reported<o:p></o:p></span></span></p>
            </div>
          </div>
          <p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
          <p class="MsoNormal"><span lang="EN-US">Yeah that is exactly
              the stuff which doesn't work at all. We got feedback from
              multiple people that this whole approach of tying the job
              to the tdr was not a good idea at all.<br>
              <br>
              What we should do instead is to have a pointer in the
              scheduler fence to which job it belongs. Freeing up the
              job when the scheduler fence is signaled is then job of
              the driver and not the scheduler any more.<br>
              <br>
              The scheduler then gives the scheduler fence to the driver
              when a timeout is detected and the driver can do the rest
              of the handling all by itself.<br>
              <br>
              But this problem is orthogonal to the suggested solution
              here.<br>
              <br>
              <br>
            </span><span style="font-size:12.0pt" lang="EN-US"><o:p></o:p></span></p>
          <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
            <p class="MsoNormal"><span lang="EN-US">do you have a better
                solution or idea we review it as another candidate RFC ?<o:p></o:p></span></p>
          </blockquote>
          <p class="MsoNormal" style="margin-bottom:12.0pt"><span
              lang="EN-US"><br>
              I don't see much other option either. We could do
              something like only allowing one application at a time to
              use the gfx/compute block, but that would be even worse.<br>
              <br>
              If we do this we should probably make it configurable as a
              module parameter.<br>
              <br>
              Regards,<br>
              Christian.<o:p></o:p></span></p>
          <div>
            <p class="MsoNormal"><span lang="EN-US">Am 26.02.21 um 12:57
                schrieb Liu, Monk:<o:p></o:p></span></p>
          </div>
          <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
            <p class="msipheadera92f4c5c" style="margin:0cm"><span
                style="font-family:'Arial',sans-serif;color:#0078D7"
                lang="EN-US">[AMD Official Use Only - Internal
                Distribution Only]</span><span lang="EN-US"><o:p></o:p></span></p>
            <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
            <p class="MsoNormal"><span lang="EN-US">static void
                drm_sched_job_timedout(struct work_struct *work)<o:p></o:p></span></p>
            <p class="MsoNormal"><span lang="EN-US">279 {<o:p></o:p></span></p>
            <p class="MsoNormal"><span lang="EN-US">280     struct
                drm_gpu_scheduler *sched;<o:p></o:p></span></p>
            <p class="MsoNormal"><span lang="EN-US">281     struct
                drm_sched_job *job;<o:p></o:p></span></p>
            <p class="MsoNormal"><span lang="EN-US">282<o:p></o:p></span></p>
            <p class="MsoNormal"><span lang="EN-US">283     sched =
                container_of(work, struct drm_gpu_scheduler,
                work_tdr.work);<o:p></o:p></span></p>
            <p class="MsoNormal"><span lang="EN-US">284<o:p></o:p></span></p>
            <p class="MsoNormal"><span lang="EN-US">285     /* Protects
                against concurrent deletion in drm_sched_get_cleanup_job
                */<o:p></o:p></span></p>
            <p class="MsoNormal"><b><span lang="EN-US">286    
                  spin_lock(&sched->job_list_lock);</span></b><span
                lang="EN-US"><o:p></o:p></span></p>
            <p class="MsoNormal"><b><span lang="EN-US">287     job =
                  list_first_entry_or_null(&sched->ring_mirror_list,</span></b><span
                lang="EN-US"><o:p></o:p></span></p>
            <p class="MsoNormal"><b><span lang="EN-US">288                       
                  struct drm_sched_job, node);</span></b><span
                lang="EN-US"><o:p></o:p></span></p>
            <p class="MsoNormal"><b><span lang="EN-US">289</span></b><span
                lang="EN-US"><o:p></o:p></span></p>
            <p class="MsoNormal"><b><span lang="EN-US">290     if (job)
                  {</span></b><span lang="EN-US"><o:p></o:p></span></p>
            <p class="MsoNormal"><b><span lang="EN-US">291         /*</span></b><span
                lang="EN-US"><o:p></o:p></span></p>
            <p class="MsoNormal"><b><span lang="EN-US">292          *
                  Remove the bad job so it cannot be freed by concurrent</span></b><span
                lang="EN-US"><o:p></o:p></span></p>
            <p class="MsoNormal"><b><span lang="EN-US">293          *
                  drm_sched_cleanup_jobs. It will be reinserted back
                  after sched->thread</span></b><span lang="EN-US"><o:p></o:p></span></p>
            <p class="MsoNormal"><b><span lang="EN-US">294          * is
                  parked at which point it's safe.</span></b><span
                lang="EN-US"><o:p></o:p></span></p>
            <p class="MsoNormal"><b><span lang="EN-US">295          */</span></b><span
                lang="EN-US"><o:p></o:p></span></p>
            <p class="MsoNormal"><b><span lang="EN-US">296        
                  list_del_init(&job->node);</span></b><span
                lang="EN-US"><o:p></o:p></span></p>
            <p class="MsoNormal"><b><span lang="EN-US">297        
                  spin_unlock(&sched->job_list_lock);</span></b><span
                lang="EN-US"><o:p></o:p></span></p>
            <p class="MsoNormal"><b><span lang="EN-US">298</span></b><span
                lang="EN-US"><o:p></o:p></span></p>
            <p class="MsoNormal"><b><span lang="EN-US">299        
                  job->sched->ops->timedout_job(job);</span></b><span
                lang="EN-US"><o:p></o:p></span></p>
            <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
            <div>
              <p class="MsoNormal"><span lang="EN-US">Thanks <o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US">------------------------------------------<o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US">Monk Liu |
                  Cloud-GPU Core team<o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US">------------------------------------------<o:p></o:p></span></p>
            </div>
            <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
            <div>
              <div style="border:none;border-top:solid #E1E1E1
                1.0pt;padding:3.0pt 0cm 0cm 0cm">
                <p class="MsoNormal"><b><span lang="EN-US">From:</span></b><span
                    lang="EN-US"> Liu, Monk
                    <br>
                    <b>Sent:</b> Friday, February 26, 2021 7:54 PM<br>
                    <b>To:</b> Koenig, Christian <a
                      href="mailto:Christian.Koenig@amd.com"
                      moz-do-not-send="true">&lt;Christian.Koenig@amd.com&gt;</a>;
                    <a href="mailto:amd-gfx@lists.freedesktop.org"
                      moz-do-not-send="true">amd-gfx@lists.freedesktop.org</a><br>
                    <b>Cc:</b> Zhang, Andy <a
                      href="mailto:Andy.Zhang@amd.com"
                      moz-do-not-send="true">&lt;Andy.Zhang@amd.com&gt;</a>;
                    Chen, Horace
                    <a href="mailto:Horace.Chen@amd.com"
                      moz-do-not-send="true">&lt;Horace.Chen@amd.com&gt;</a>;
                    Zhang, Jack (Jian)
                    <a href="mailto:Jack.Zhang1@amd.com"
                      moz-do-not-send="true">&lt;Jack.Zhang1@amd.com&gt;</a><br>
                    <b>Subject:</b> RE: [RFC] a new approach to detect
                    which ring is the real black sheep upon TDR reported<o:p></o:p></span></p>
              </div>
            </div>
            <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
            <p class="msipheadera92f4c5c" style="margin:0cm"><span
                style="font-family:'Arial',sans-serif;color:#0078D7"
                lang="EN-US">[AMD Official Use Only - Internal
                Distribution Only]</span><span lang="EN-US"><o:p></o:p></span></p>
            <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
            <p class="MsoNormal"><span lang="EN-US">See in line<o:p></o:p></span></p>
            <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
            <div>
              <p class="MsoNormal"><span lang="EN-US">Thanks <o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US">------------------------------------------<o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US">Monk Liu |
                  Cloud-GPU Core team<o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US">------------------------------------------<o:p></o:p></span></p>
            </div>
            <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
            <div>
              <div style="border:none;border-top:solid #E1E1E1
                1.0pt;padding:3.0pt 0cm 0cm 0cm">
                <p class="MsoNormal"><b><span lang="EN-US">From:</span></b><span
                    lang="EN-US"> Koenig, Christian <<a
                      href="mailto:Christian.Koenig@amd.com"
                      moz-do-not-send="true">Christian.Koenig@amd.com</a>>
                    <br>
                    <b>Sent:</b> Friday, February 26, 2021 3:58 PM<br>
                    <b>To:</b> Liu, Monk <<a
                      href="mailto:Monk.Liu@amd.com"
                      moz-do-not-send="true">Monk.Liu@amd.com</a>>; <a
                      href="mailto:amd-gfx@lists.freedesktop.org"
                      moz-do-not-send="true">
                      amd-gfx@lists.freedesktop.org</a><br>
                    <b>Cc:</b> Zhang, Andy <<a
                      href="mailto:Andy.Zhang@amd.com"
                      moz-do-not-send="true">Andy.Zhang@amd.com</a>>;
                    Chen, Horace <<a
                      href="mailto:Horace.Chen@amd.com"
                      moz-do-not-send="true">Horace.Chen@amd.com</a>>;
                    Zhang, Jack (Jian) <<a
                      href="mailto:Jack.Zhang1@amd.com"
                      moz-do-not-send="true">Jack.Zhang1@amd.com</a>><br>
                    <b>Subject:</b> Re: [RFC] a new approach to detect
                    which ring is the real black sheep upon TDR reported<o:p></o:p></span></p>
              </div>
            </div>
            <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
            <p class="MsoNormal" style="margin-bottom:12.0pt"><span
                lang="EN-US">Hi Monk,<br>
                <br>
                in general an interesting idea, but I see two major
                problems with that:<br>
                <br>
                1. It would make the reset take much longer.<br>
                <br>
                2. Things get often stuck because of timing issues, so a
                guilty job might pass perfectly when run a second time.<o:p></o:p></span></p>
            <p class="MsoNormal" style="margin-bottom:12.0pt"><span
                lang="EN-US">[ML] But the innocent ring has already
                reported a TDR, and the drm sched logic has already
                deleted this “sched_job” from its mirror list, so you
                don’t have a chance to re-submit it after the reset;
                that’s the major problem here.<br>
                <br>
                Apart from that the whole ring mirror list turned out to
                be a really bad idea. E.g. we still struggle with object
                life time because the concept doesn't fit into the
                object model of the GPU scheduler under Linux.<br>
                <br>
                We should probably work on this separately and
                straighten up the job destruction once more and keep the
                recovery information in the fence instead.<o:p></o:p></span></p>
            <p class="MsoNormal" style="margin-bottom:12.0pt"><span
                lang="EN-US">[ML] We claim to our customers that no
                innocent process will be dropped or cancelled, and our
                current logic works most of the time; only when
                different processes are running on the gfx/compute
                rings do we run into the tricky situation I described
                here. The proposal is the only way I can figure out so
                far; do you have a better solution or idea that we can
                review as another candidate RFC? Please note that we
                raised this proposal because we do hit this trouble
                and we do need to resolve it… So even an imperfect
                solution is still better than just cancelling the
                innocent job (and its context/process).<o:p></o:p></span></p>
            <p class="MsoNormal" style="margin-bottom:12.0pt"><span
                lang="EN-US">Thanks ! <o:p>
                </o:p></span></p>
            <p class="MsoNormal" style="margin-bottom:12.0pt"><span
                lang="EN-US"><br>
                Regards,<br>
                Christian.<o:p></o:p></span></p>
            <div>
              <p class="MsoNormal"><span lang="EN-US">Am 26.02.21 um
                  06:58 schrieb Liu, Monk:<o:p></o:p></span></p>
            </div>
            <blockquote style="margin-top:5.0pt;margin-bottom:5.0pt">
              <p class="msipheader251902e5" style="margin:0cm"><span
style="font-size:10.0pt;font-family:'Arial',sans-serif;color:#317100"
                  lang="EN-US">[AMD Public Use]</span><span lang="EN-US"><o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US">Hi all<o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US">NAVI2X  project
                  hit a really hard-to-solve issue now, and it has turned
                  out to be a general headache of our TDR mechanism;
                  check the scenario below:<o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
              <ol style="margin-top:0cm" type="1" start="1">
                <li class="MsoListParagraph"
                  style="margin-left:0cm;mso-list:l0 level1 lfo1"><span
                    lang="EN-US">There is a job1 running on compute1
                    ring at timestamp
                    <o:p></o:p></span></li>
                <li class="MsoListParagraph"
                  style="margin-left:0cm;mso-list:l0 level1 lfo1"><span
                    lang="EN-US">There is a job2 running on gfx ring at
                    timestamp<o:p></o:p></span></li>
                <li class="MsoListParagraph"
                  style="margin-left:0cm;mso-list:l0 level1 lfo1"><span
                    lang="EN-US">Job1 is the guilty one, and job1/job2
                    were scheduled to their rings at almost the same
                    timestamp
                    <o:p></o:p></span></li>
                <li class="MsoListParagraph"
                  style="margin-left:0cm;mso-list:l0 level1 lfo1"><span
                    lang="EN-US">After 2 seconds we receive two TDR
                    reporting from both GFX ring and compute ring<o:p></o:p></span></li>
                <li class="MsoListParagraph"
                  style="margin-left:0cm;mso-list:l0 level1 lfo1"><b><span
                      lang="EN-US">Current scheme is that in drm
                      scheduler all the head jobs of those two rings are
                      considered “bad job” and taken away from the
                      mirror list
                    </span></b><span lang="EN-US"><o:p></o:p></span></li>
                <li class="MsoListParagraph"
                  style="margin-left:0cm;mso-list:l0 level1 lfo1"><span
                    lang="EN-US">The result is that both the real guilty job
                    (job1) and the innocent job (job2) are deleted
                    from the mirror list, and their corresponding contexts
                    are also treated as guilty<b> (so there is no guarantee
                      that the innocent process keeps running)</b><o:p></o:p></span></li>
              </ol>
              <p class="MsoListParagraph"><b><span lang="EN-US"> </span></b><span
                  lang="EN-US"><o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US">But by our wish
                  the ideal case is that the TDR mechanism can detect which ring
                  is the guilty ring and the innocent ring can resubmit
                  all its pending jobs:<o:p></o:p></span></p>
              <ol style="margin-top:0cm" type="1" start="1">
                <li class="MsoListParagraph"
                  style="margin-left:0cm;mso-list:l3 level1 lfo2"><span
                    lang="EN-US">Job1 to be deleted from compute1 ring’s
                    mirror list<o:p></o:p></span></li>
                <li class="MsoListParagraph"
                  style="margin-left:0cm;mso-list:l3 level1 lfo2"><span
                    lang="EN-US">Job2 is kept and resubmitted later and
                    the process/context it belongs to is not even aware of
                    this TDR at all
                    <o:p></o:p></span></li>
              </ol>
              <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US">Here I have a
                  proposal that aims to achieve the above goal, and its rough
                  procedure is:<o:p></o:p></span></p>
              <ol style="margin-top:0cm" type="1" start="1">
                <li class="MsoListParagraph"
                  style="margin-left:0cm;mso-list:l1 level1 lfo3"><span
                    lang="EN-US">Once any ring reports a TDR, the head
                    job is *<b>not</b>* treated as “bad job”, and it is
                    *<b>not</b>* deleted from the mirror list in drm
                    sched functions<o:p></o:p></span></li>
                <li class="MsoListParagraph"
                  style="margin-left:0cm;mso-list:l1 level1 lfo3"><span
                    lang="EN-US">In vendor’s function (our amdgpu driver
                    here):<o:p></o:p></span></li>
              </ol>
              <ol style="margin-top:0cm" type="1" start="2">
                <ul style="margin-top:0cm" type="disc">
                  <li class="MsoListParagraph"
                    style="margin-left:0cm;mso-list:l1 level2 lfo3"><span
                      lang="EN-US">reset GPU<o:p></o:p></span></li>
                  <li class="MsoListParagraph"
                    style="margin-left:0cm;mso-list:l1 level2 lfo3"><span
                      lang="EN-US">repeat below actions on each RINGS *
                      one by one *:<o:p></o:p></span></li>
                </ul>
              </ol>
              <p class="MsoListParagraph"
                style="margin-left:108.0pt;text-indent:-9.0pt;mso-list:l1
                level3 lfo3">
                <!--[if !supportLists]--><span lang="EN-US"><span
                    style="mso-list:Ignore">1.<span style="font:7.0pt
                      'Times New Roman'">
                    </span></span></span><!--[endif]--><span
                  lang="EN-US">take the head job and submit it on this
                  ring<o:p></o:p></span></p>
              <p class="MsoListParagraph"
                style="margin-left:108.0pt;text-indent:-9.0pt;mso-list:l1
                level3 lfo3">
                <!--[if !supportLists]--><span lang="EN-US"><span
                    style="mso-list:Ignore">2.<span style="font:7.0pt
                      'Times New Roman'">
                    </span></span></span><!--[endif]--><span
                  lang="EN-US">see if it completes, if not then this job
                  is the real “bad job”<o:p></o:p></span></p>
              <p class="MsoListParagraph"
                style="margin-left:108.0pt;text-indent:-9.0pt;mso-list:l1
                level3 lfo3">
                <!--[if !supportLists]--><span lang="EN-US"><span
                    style="mso-list:Ignore">3.<span style="font:7.0pt
                      'Times New Roman'">
                    </span></span></span><!--[endif]--><span
                  lang="EN-US"> take it away from mirror list if this
                  head job is “bad job”<o:p></o:p></span></p>
              <ol style="margin-top:0cm" type="1" start="2">
                <ul style="margin-top:0cm" type="disc">
                  <li class="MsoListParagraph"
                    style="margin-left:0cm;mso-list:l1 level2 lfo3"><span
                      lang="EN-US">After the above iteration on all RINGS,
                      we have already cleared all the bad job(s)<o:p></o:p></span></li>
                </ul>
              </ol>
              <ol style="margin-top:0cm" type="1" start="3">
                <li class="MsoListParagraph"
                  style="margin-left:0cm;mso-list:l1 level1 lfo3"><span
                    lang="EN-US">Resubmit all jobs from each mirror list
                    to their corresponding rings (this is the existing
                    logic)<o:p></o:p></span></li>
              </ol>
              <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US">The idea of this
                  is to use a “serial” way to re-run and re-check each
                  head job of each RING, in order to take out the real
                  black sheep and its guilty context.<o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US">P.S.: we can use
                  this approach only when a GFX/KCQ ring reports a TDR,
                  since those rings mutually affect each
                  other. For an SDMA ring timeout, it definitely proves the
                  head job on the SDMA ring is really guilty.<o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US">Thanks <o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US">------------------------------------------<o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US">Monk Liu |
                  Cloud-GPU Core team<o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US">------------------------------------------<o:p></o:p></span></p>
              <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
            </blockquote>
            <p class="MsoNormal"><span lang="EN-US"> <o:p></o:p></span></p>
          </blockquote>
          <p class="MsoNormal"><span
              style="font-size:12.0pt;font-family:宋体" lang="EN-US"><o:p> </o:p></span></p>
        </div>
      </div>
      <br>
      <fieldset class="mimeAttachmentHeader"></fieldset>
      <pre class="moz-quote-pre" wrap="">_______________________________________________
amd-gfx mailing list
<a class="moz-txt-link-abbreviated" href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a>
<a class="moz-txt-link-freetext" href="https://lists.freedesktop.org/mailman/listinfo/amd-gfx">https://lists.freedesktop.org/mailman/listinfo/amd-gfx</a>
</pre>
    </blockquote>
    <br>
  </body>
</html>