<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="Generator" content="Microsoft Word 15 (filtered medium)">
<style><!--
/* Font Definitions */
@font-face
        {font-family:Wingdings;
        panose-1:5 0 0 0 0 0 0 0 0 0;}
@font-face
        {font-family:宋体;
        panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
        {font-family:"Cambria Math";
        panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
        {font-family:等线;
        panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
        {font-family:Calibri;
        panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
        {font-family:"\@等线";
        panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
        {font-family:"\@宋体";
        panose-1:2 1 6 0 3 1 1 1 1 1;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
        {margin:0cm;
        margin-bottom:.0001pt;
        text-align:justify;
        text-justify:inter-ideograph;
        font-size:10.5pt;
        font-family:等线;}
a:link, span.MsoHyperlink
        {mso-style-priority:99;
        color:#0563C1;
        text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
        {mso-style-priority:99;
        color:#954F72;
        text-decoration:underline;}
p.MsoListParagraph, li.MsoListParagraph, div.MsoListParagraph
        {mso-style-priority:34;
        margin:0cm;
        margin-bottom:.0001pt;
        text-align:justify;
        text-justify:inter-ideograph;
        text-indent:21.0pt;
        font-size:10.5pt;
        font-family:等线;}
p.msonormal0, li.msonormal0, div.msonormal0
        {mso-style-name:msonormal;
        mso-margin-top-alt:auto;
        margin-right:0cm;
        mso-margin-bottom-alt:auto;
        margin-left:0cm;
        font-size:12.0pt;
        font-family:宋体;}
span.EmailStyle19
        {mso-style-type:personal;
        font-family:等线;
        color:windowtext;}
span.EmailStyle20
        {mso-style-type:personal-reply;
        font-family:等线;
        color:windowtext;}
.MsoChpDefault
        {mso-style-type:export-only;
        font-size:10.0pt;}
@page WordSection1
        {size:612.0pt 792.0pt;
        margin:72.0pt 90.0pt 72.0pt 90.0pt;}
div.WordSection1
        {page:WordSection1;}
/* List Definitions */
@list l0
        {mso-list-id:65107788;
        mso-list-type:hybrid;
        mso-list-template-ids:1024615094 67698703 67698713 67698715 67698703 67698713 67698715 67698703 67698713 67698715;}
@list l0:level1
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:21.0pt;
        text-indent:-21.0pt;}
@list l0:level2
        {mso-level-number-format:alpha-lower;
        mso-level-text:"%2\)";
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:42.0pt;
        text-indent:-21.0pt;}
@list l0:level3
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        margin-left:63.0pt;
        text-indent:-21.0pt;}
@list l0:level4
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:84.0pt;
        text-indent:-21.0pt;}
@list l0:level5
        {mso-level-number-format:alpha-lower;
        mso-level-text:"%5\)";
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:105.0pt;
        text-indent:-21.0pt;}
@list l0:level6
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        margin-left:126.0pt;
        text-indent:-21.0pt;}
@list l0:level7
        {mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:147.0pt;
        text-indent:-21.0pt;}
@list l0:level8
        {mso-level-number-format:alpha-lower;
        mso-level-text:"%8\)";
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:168.0pt;
        text-indent:-21.0pt;}
@list l0:level9
        {mso-level-number-format:roman-lower;
        mso-level-tab-stop:none;
        mso-level-number-position:right;
        margin-left:189.0pt;
        text-indent:-21.0pt;}
@list l1
        {mso-list-id:697632068;
        mso-list-type:hybrid;
        mso-list-template-ids:448443560 67698689 67698691 67698693 67698689 67698691 67698693 67698689 67698691 67698693;}
@list l1:level1
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:21.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l1:level2
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:42.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l1:level3
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:63.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l1:level4
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:84.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l1:level5
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:105.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l1:level6
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:126.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l1:level7
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:147.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l1:level8
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:168.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
@list l1:level9
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        margin-left:189.0pt;
        text-indent:-21.0pt;
        font-family:Wingdings;}
ol
        {margin-bottom:0cm;}
ul
        {margin-bottom:0cm;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
</head>
<body lang="ZH-CN" link="#0563C1" vlink="#954F72" style="text-justify-trim:punctuation">
<div class="WordSection1">
<p class="MsoNormal"><span lang="EN-US">+ david<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<div>
<div style="border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm">
<p class="MsoNormal" align="left" style="text-align:left"><b><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif">From:</span></b><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif"> Liu, Monk
<br>
<b>Sent:</b> Wednesday, October 11, 2017 1:34 PM<br>
<b>To:</b> Koenig, Christian &lt;Christian.Koenig@amd.com&gt;; Haehnle, Nicolai &lt;Nicolai.Haehnle@amd.com&gt;; Olsak, Marek &lt;Marek.Olsak@amd.com&gt;; Deucher, Alexander &lt;Alexander.Deucher@amd.com&gt;<br>
<b>Cc:</b> amd-gfx@lists.freedesktop.org; Ding, Pixel &lt;Pixel.Ding@amd.com&gt;; Jiang, Jerry (SW) &lt;Jerry.Jiang@amd.com&gt;; Li, Bingley (Bingley.Li@amd.com) &lt;Bingley.Li@amd.com&gt;; Ramirez, Alejandro &lt;Alejandro.Ramirez@amd.com&gt;; Filipas, Mario &lt;Mario.Filipas@amd.com&gt;<br>
<b>Subject:</b> TDR and VRAM lost handling in KMD:<o:p></o:p></span></p>
</div>
</div>
<p class="MsoNormal" align="left" style="text-align:left"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US">Hi Christian &amp; Nicolai,<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US">We need to reach agreement on what MESA/UMD should do and what KMD should do,
<b>please give your comments with </b></span><b>“<span lang="EN-US">okay</span>”<span lang="EN-US"> or
</span>“<span lang="EN-US">No</span>”<span lang="EN-US"> and your ideas on the items below,</span></b><span lang="EN-US"><o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l1 level1 lfo2">
<![if !supportLists]><span lang="EN-US" style="font-family:Wingdings"><span style="mso-list:Ignore">l<span style="font:7.0pt 'Times New Roman'">&nbsp;
</span></span></span><![endif]><span lang="EN-US">When a job times out (the timeout is set by the lockup_timeout kernel parameter), what KMD should do in the TDR routine:<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l0 level1 lfo4">
<![if !supportLists]><span lang="EN-US"><span style="mso-list:Ignore">1.<span style="font:7.0pt 'Times New Roman'">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
</span></span></span><![endif]><span lang="EN-US">Update adev-&gt;<b>gpu_reset_counter</b> and stop the scheduler first (<b>gpu_reset_counter</b> is used to force a VM flush after GPU reset; it is out of this thread</span>’<span lang="EN-US">s scope, so no more discussion
 on it)<o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l0 level1 lfo4">
<![if !supportLists]><span lang="EN-US"><span style="mso-list:Ignore">2.<span style="font:7.0pt 'Times New Roman'">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
</span></span></span><![endif]><span lang="EN-US">Set its fence error status to </span>
“<b><span lang="EN-US">ETIME</span></b>”<span lang="EN-US">,<o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l0 level1 lfo4">
<![if !supportLists]><span lang="EN-US"><span style="mso-list:Ignore">3.<span style="font:7.0pt 'Times New Roman'">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
</span></span></span><![endif]><span lang="EN-US">Find the entity/ctx behind this job, and set this ctx as
</span>“<b><span lang="EN-US">guilty</span></b>”<span lang="EN-US"><o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l0 level1 lfo4">
<![if !supportLists]><span lang="EN-US"><span style="mso-list:Ignore">4.<span style="font:7.0pt 'Times New Roman'">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
</span></span></span><![endif]><span lang="EN-US">Kick out this job from scheduler</span>’<span lang="EN-US">s mirror list, so this job won</span>’<span lang="EN-US">t get re-scheduled to ring anymore.<o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l0 level1 lfo4">
<![if !supportLists]><span lang="EN-US"><span style="mso-list:Ignore">5.<span style="font:7.0pt 'Times New Roman'">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
</span></span></span><![endif]><span lang="EN-US">Kick out all jobs in this </span>
“<span lang="EN-US">guilty</span>”<span lang="EN-US"> ctx</span>’<span lang="EN-US">s KFIFO queue, and set all their fence status to
</span>“<b><span lang="EN-US">ECANCELED</span></b>”<span lang="EN-US"><o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l0 level1 lfo4">
<![if !supportLists]><b><span lang="EN-US"><span style="mso-list:Ignore">6.<span style="font:7.0pt 'Times New Roman'">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
</span></span></span></b><![endif]><span lang="EN-US">Force signal all fences that get kicked out by above two steps,<b> otherwise UMD will block forever if waiting on those fences<o:p></o:p></b></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l0 level1 lfo4">
<![if !supportLists]><span lang="EN-US"><span style="mso-list:Ignore">7.<span style="font:7.0pt 'Times New Roman'">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
</span></span></span><![endif]><span lang="EN-US">Do gpu reset, which can be a set of callbacks so that bare-metal and SR-IOV can each implement it in their preferred style
<o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l0 level1 lfo4">
<![if !supportLists]><span lang="EN-US"><span style="mso-list:Ignore">8.<span style="font:7.0pt 'Times New Roman'">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
</span></span></span><![endif]><span lang="EN-US">After reset, KMD needs to be aware of whether VRAM lost happened or not; bare-metal can implement some function to judge, while for SR-IOV I prefer to read it from the GIM side (for the initial version we consider it</span>’<span lang="EN-US">s
 always VRAM lost, until the GIM side change is aligned)<o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l0 level1 lfo4">
<![if !supportLists]><span lang="EN-US"><span style="mso-list:Ignore">9.<span style="font:7.0pt 'Times New Roman'">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
</span></span></span><![endif]><span lang="EN-US">If VRAM lost is not hit, continue; otherwise:<o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-left:42.0pt;text-indent:-21.0pt;mso-list:l0 level2 lfo4">
<![if !supportLists]><span lang="EN-US"><span style="mso-list:Ignore">a)<span style="font:7.0pt 'Times New Roman'">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
</span></span></span><![endif]><span lang="EN-US">Update adev-&gt;<b>vram_lost_counter</b>,<o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-left:42.0pt;text-indent:-21.0pt;mso-list:l0 level2 lfo4">
<![if !supportLists]><span lang="EN-US"><span style="mso-list:Ignore">b)<span style="font:7.0pt 'Times New Roman'">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
</span></span></span><![endif]><span lang="EN-US">Iterate over all living ctx, and set all ctx as
</span>“<b><span lang="EN-US">guilty</span></b>”<span lang="EN-US"> since VRAM lost actually ruins all VRAM contents<o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-left:42.0pt;text-indent:-21.0pt;mso-list:l0 level2 lfo4">
<![if !supportLists]><span lang="EN-US"><span style="mso-list:Ignore">c)<span style="font:7.0pt 'Times New Roman'">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
</span></span></span><![endif]><span lang="EN-US">Kick out all jobs in all ctx</span>’<span lang="EN-US">s KFIFO queue, and set all their fence status to
</span>“<b><span lang="EN-US">ECANCELED</span></b>”<span lang="EN-US"><o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l0 level1 lfo4">
<![if !supportLists]><span lang="EN-US"><span style="mso-list:Ignore">10.<span style="font:7.0pt 'Times New Roman'">&nbsp;&nbsp;&nbsp;&nbsp;
</span></span></span><![endif]><span lang="EN-US">Do GTT recovery and VRAM page tables/entries recovery (optional, do we need it ???)<o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l0 level1 lfo4">
<![if !supportLists]><span lang="EN-US"><span style="mso-list:Ignore">11.<span style="font:7.0pt 'Times New Roman'">&nbsp;&nbsp;&nbsp;&nbsp;
</span></span></span><![endif]><span lang="EN-US">Re-schedule all JOBs remaining in the mirror list to the ring again and restart the scheduler (for the VRAM lost case, no JOB will be re-scheduled)<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l1 level1 lfo2">
<![if !supportLists]><span lang="EN-US" style="font-family:Wingdings"><span style="mso-list:Ignore">l<span style="font:7.0pt 'Times New Roman'">&nbsp;
</span></span></span><![endif]><span lang="EN-US">For cs_wait() IOCTL:<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US">After it finds the fence signaled, it should check with
</span><b>“<span lang="EN-US">dma_fence_get_status</span>” </b><span lang="EN-US">to see if there is an error there,<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US">And return the error status of fence<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l1 level1 lfo2">
<![if !supportLists]><span lang="EN-US" style="font-family:Wingdings"><span style="mso-list:Ignore">l<span style="font:7.0pt 'Times New Roman'">&nbsp;
</span></span></span><![endif]><span lang="EN-US">For cs_wait_fences() IOCTL:<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US">Similar to the above approach<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l1 level1 lfo2">
<![if !supportLists]><span lang="EN-US" style="font-family:Wingdings"><span style="mso-list:Ignore">l<span style="font:7.0pt 'Times New Roman'">&nbsp;
</span></span></span><![endif]><span lang="EN-US">For cs_submit() IOCTL:<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US">It needs to check whether the current ctx has been marked as
</span>“<b><span lang="EN-US">guilty</span></b>”<span lang="EN-US"> and return </span>
“<b><span lang="EN-US">ECANCELED</span></b>”<span lang="EN-US"> if so<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l1 level1 lfo2">
<![if !supportLists]><span lang="EN-US" style="font-family:Wingdings"><span style="mso-list:Ignore">l<span style="font:7.0pt 'Times New Roman'">&nbsp;
</span></span></span><![endif]><span lang="EN-US">Introduce a new IOCTL to let UMD query
<b>vram_lost_counter</b>:<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US">This way, UMD can also block the app from submitting. As @Nicolai mentioned, we can cache one copy of
<b>vram_lost_counter</b> when enumerating the physical device, and deny all <o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US">gl-contexts from submitting if the queried counter is bigger than the one cached in the physical device. (looks a little overkill to me, but easy to implement)
<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US">UMD can also return an error to the APP when creating a gl-context if it finds that the currently queried<b> vram_lost_counter
</b>is bigger than the one cached in the physical device.<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US">BTW: I realized that gl-context is a little different from the kernel</span>’<span lang="EN-US">s context, because for the kernel a BO is not related to a context but only to an FD, while in UMD a BO has a backend<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US">gl-context, so blocking submission in the UMD layer is also needed, although KMD will do its job as the bottom line
<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoListParagraph" style="margin-left:21.0pt;text-indent:-21.0pt;mso-list:l1 level1 lfo2">
<![if !supportLists]><span lang="EN-US" style="font-family:Wingdings"><span style="mso-list:Ignore">l<span style="font:7.0pt 'Times New Roman'">&nbsp;
</span></span></span><![endif]><span lang="EN-US">Basically </span>“<span lang="EN-US">vram_lost_counter</span>”<span lang="EN-US"> is exposed by the kernel to let UMD take control of the robust extension feature; it will be UMD</span>’<span lang="EN-US">s call
 to move, KMD only denies </span>“<span lang="EN-US">guilty</span>”<span lang="EN-US"> contexts from submitting<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US">Need your feedback, thx<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US">We</span>’<span lang="EN-US">d better get the TDR feature landed ASAP<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US">BR Monk<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US"><o:p> </o:p></span></p>
</div>
</body>
</html>