<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 5/26/2022 3:56 PM, Wang, Yang(Kevin)
      wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:CO6PR12MB5473648915874B1A5D6C2A9C82D99@CO6PR12MB5473.namprd12.prod.outlook.com">
      
      <style type="text/css" style="display:none;">P {margin-top:0;margin-bottom:0;}</style>
      <p style="font-family:Arial;font-size:10pt;color:#0000FF;margin:5pt;" align="Left">
        [AMD Official Use Only - General]<br>
      </p>
      <br>
      <div>
        <div style="font-family: Calibri, Arial, Helvetica, sans-serif;
          font-size: 12pt; color: rgb(0, 0, 0);">
          <br>
        </div>
        <div style="font-family:Calibri,Arial,Helvetica,sans-serif;
          font-size:12pt; color:rgb(0,0,0)">
          <br>
        </div>
        <hr tabindex="-1" style="display:inline-block; width:98%">
        <div id="divRplyFwdMsg" dir="ltr"><font style="font-size:11pt" face="Calibri, sans-serif" color="#000000"><b>From:</b>
            amd-gfx <a class="moz-txt-link-rfc2396E" href="mailto:amd-gfx-bounces@lists.freedesktop.org">&lt;amd-gfx-bounces@lists.freedesktop.org&gt;</a> on
            behalf of Somalapuram Amaranath
            <a class="moz-txt-link-rfc2396E" href="mailto:Amaranath.Somalapuram@amd.com">&lt;Amaranath.Somalapuram@amd.com&gt;</a><br>
            <b>Sent:</b> Thursday, May 26, 2022 5:48 PM<br>
            <b>To:</b> <a class="moz-txt-link-abbreviated" href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a>
            <a class="moz-txt-link-rfc2396E" href="mailto:amd-gfx@lists.freedesktop.org">&lt;amd-gfx@lists.freedesktop.org&gt;</a><br>
            <b>Cc:</b> Deucher, Alexander
            <a class="moz-txt-link-rfc2396E" href="mailto:Alexander.Deucher@amd.com">&lt;Alexander.Deucher@amd.com&gt;</a>; Somalapuram, Amaranath
            <a class="moz-txt-link-rfc2396E" href="mailto:Amaranath.Somalapuram@amd.com">&lt;Amaranath.Somalapuram@amd.com&gt;</a>; Koenig, Christian
            <a class="moz-txt-link-rfc2396E" href="mailto:Christian.Koenig@amd.com">&lt;Christian.Koenig@amd.com&gt;</a>; Sharma, Shashank
            <a class="moz-txt-link-rfc2396E" href="mailto:Shashank.Sharma@amd.com">&lt;Shashank.Sharma@amd.com&gt;</a><br>
            <b>Subject:</b> [PATCH v2 2/2] drm/amdgpu: adding device
            coredump support</font>
          <div> </div>
        </div>
        <div class="BodyFragment"><font size="2"><span style="font-size:11pt">
              <div class="PlainText elementToProof">Added device
                coredump information:<br>
                - Kernel version<br>
                - Module<br>
                - Time<br>
                - VRAM status<br>
                - Guilty process name and PID<br>
                - GPU register dumps<br>
                v1 -> v2: Variable name change<br>
                v1 -> v2: NULL check<br>
                v1 -> v2: Code alignment<br>
                v1 -> v2: Adding dummy amdgpu_devcoredump_free<br>
                v1 -> v2: memset reset_task_info to zero<br>
                <br>
                Signed-off-by: Somalapuram Amaranath
                <a class="moz-txt-link-rfc2396E" href="mailto:Amaranath.Somalapuram@amd.com">&lt;Amaranath.Somalapuram@amd.com&gt;</a><br>
                ---<br>
                 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 +<br>
                 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 67
                ++++++++++++++++++++++<br>
                 2 files changed, 70 insertions(+)<br>
                <br>
                diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
                b/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
                index c79d9992b113..25a7b2c74928 100644<br>
                --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
                +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
                @@ -1044,6 +1044,9 @@ struct amdgpu_device {<br>
                         uint32_t                       
                *reset_dump_reg_list;<br>
                         uint32_t                       
                *reset_dump_reg_value;<br>
                         int                             num_regs;<br>
                +       struct amdgpu_task_info         reset_task_info;<br>
                +       bool                            reset_vram_lost;<br>
                +       struct timespec64               reset_time;</div>
              <div class="PlainText elementToProof"><br>
              </div>
              <div class="PlainText elementToProof">[kevin]:</div>
              <div class="PlainText elementToProof">the <span style="background-color:rgb(255, 255,
                  255);display:inline !important">
                  CONFIG_DEV_COREDUMP check is needed for the above variables
                  to avoid a compiler warning when the coredump feature is not
                  enabled.</span><br>
              </div>
              <div class="PlainText elementToProof"> <br>
              </div>
            </span></font></div>
      </div>
    </blockquote>
    <font size="2">Agreed.</font><br>
    <blockquote type="cite" cite="mid:CO6PR12MB5473648915874B1A5D6C2A9C82D99@CO6PR12MB5473.namprd12.prod.outlook.com">
      <div>
        <div class="BodyFragment"><font size="2"><span style="font-size:11pt">
              <div class="PlainText elementToProof">
                         struct amdgpu_reset_domain      *reset_domain;<br>
                 <br>
                diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
                b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
                index 866b4980a6fa..ca97afe5be63 100644<br>
                --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
                +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
                @@ -32,6 +32,8 @@<br>
                 #include &lt;linux/slab.h&gt;<br>
                 #include &lt;linux/iommu.h&gt;<br>
                 #include &lt;linux/pci.h&gt;<br>
                +#include &lt;linux/devcoredump.h&gt;<br>
                +#include &lt;generated/utsrelease.h&gt;<br>
                 <br>
                 #include &lt;drm/drm_atomic_helper.h&gt;<br>
                 #include &lt;drm/drm_probe_helper.h&gt;<br>
                @@ -4734,6 +4736,62 @@ static int
                amdgpu_reset_reg_dumps(struct amdgpu_device *adev)<br>
                         return 0;<br>
                 }<br>
                 <br>
                +#ifdef CONFIG_DEV_COREDUMP<br>
                +static ssize_t amdgpu_devcoredump_read(char *buffer,
                loff_t offset,<br>
                +               size_t count, void *data, size_t
                datalen)<br>
                +{<br>
                +       struct drm_printer p;<br>
                +       struct amdgpu_device *adev = data;<br>
                +       struct drm_print_iterator iter;<br>
                +       int i;<br>
                +<br>
                +       if (adev == NULL)<br>
                +               return 0;</div>
              <div class="PlainText elementToProof">[kevin]:</div>
              <div class="PlainText elementToProof"> this check is not
                needed, because this private data is passed by our
                driver as below:</div>
              <div class="PlainText elementToProof"><br>
              </div>
            </span></font></div>
      </div>
    </blockquote>
    <p>In my testing, if the reset is unsuccessful, <font size="2"><span style="font-size:11pt">amdgpu_devcoredump_read will not be
          called.</span></font></p>
    <p><font size="2"><span style="font-size:11pt">Shashank: any input
          on this?</span></font></p>
    <p><font size="2"><span style="font-size:11pt"><br>
        </span></font></p>
    <p><font size="2"><span style="font-size:11pt">Regards,</span></font></p>
    <p><font size="2"><span style="font-size:11pt">S.Amarnath<br>
        </span></font></p>
    <p><font size="2"><span style="font-size:11pt"></span></font></p>
    <blockquote type="cite" cite="mid:CO6PR12MB5473648915874B1A5D6C2A9C82D99@CO6PR12MB5473.namprd12.prod.outlook.com">
      <div>
        <div class="BodyFragment"><font size="2"><span style="font-size:11pt">
              <div class="PlainText elementToProof">
              </div>
              <div class="PlainText elementToProof"><span style="background-color:rgb(255, 255,
                  255);display:inline !important">     
                   dev_coredumpm(dev->dev, THIS_MODULE, adev, 0,
                  GFP_KERNEL,</span><br style="background-color:rgb(255,
                  255, 255)">
              </div>
              <div class="PlainText elementToProof"><span style="background-color:rgb(255, 255,
                  255);display:inline !important">                     
                   amdgpu_devcoredump_read, amdgpu_devcoredump_free);</span><br>
              </div>
              <div class="PlainText elementToProof">+<br>
                +       iter.data = buffer;<br>
                +       iter.offset = 0;<br>
                +       iter.start = offset;<br>
                +       iter.remain = count;<br>
                +<br>
                +       p = drm_coredump_printer(&iter);<br>
                +<br>
                +       drm_printf(&p, "**** AMDGPU Device Coredump
                ****\n");<br>
                +       drm_printf(&p, "kernel: " UTS_RELEASE "\n");<br>
                +       drm_printf(&p, "module: " KBUILD_MODNAME
                "\n");<br>
                +       drm_printf(&p, "time: %lld.%09ld\n",
                adev->reset_time.tv_sec,
                adev->reset_time.tv_nsec);<br>
                +       if (adev->reset_task_info.pid)<br>
                +               drm_printf(&p, "process_name: %s
                PID: %d\n",<br>
                +                         
                adev->reset_task_info.process_name,<br>
                +                         
                adev->reset_task_info.pid);<br>
                +<br>
                +       if (adev->reset_vram_lost)<br>
                +               drm_printf(&p, "VRAM is lost due to
                GPU reset!\n");<br>
                +       if (adev->num_regs) {<br>
                +               drm_printf(&p, "AMDGPU register
                dumps:\nOffset:     Value:\n");<br>
                +<br>
                +               for (i = 0; i < adev->num_regs;
                i++)<br>
                +                       drm_printf(&p, "0x%08x:
                0x%08x\n",<br>
                +                                 
                adev->reset_dump_reg_list[i],<br>
                +                                 
                adev->reset_dump_reg_value[i]);<br>
                +       }<br>
                +<br>
                +       return count - iter.remain;<br>
                +}<br>
                +<br>
                +static void amdgpu_devcoredump_free(void *data)<br>
                +{<br>
                +}<br>
                +<br>
                +static void amdgpu_reset_capture_coredumpm(struct
                amdgpu_device *adev)<br>
                +{<br>
                +       struct drm_device *dev = adev_to_drm(adev);<br>
                +<br>
                +       ktime_get_ts64(&adev->reset_time);<br>
                +       dev_coredumpm(dev->dev, THIS_MODULE, adev, 0,
                GFP_KERNEL,<br>
                +                       amdgpu_devcoredump_read,
                amdgpu_devcoredump_free);<br>
                +}<br>
                +#endif<br>
                +<br>
                 int amdgpu_do_asic_reset(struct list_head
                *device_list_handle,<br>
                                          struct amdgpu_reset_context
                *reset_context)<br>
                 {<br>
                @@ -4818,6 +4876,15 @@ int amdgpu_do_asic_reset(struct
                list_head *device_list_handle,<br>
                                                         goto out;<br>
                 <br>
                                                 vram_lost =
                amdgpu_device_check_vram_lost(tmp_adev);<br>
                +#ifdef CONFIG_DEV_COREDUMP<br>
                +                              
                tmp_adev->reset_vram_lost = vram_lost;<br>
                +                              
                memset(&tmp_adev->reset_task_info, 0,<br>
                +                                              
                sizeof(tmp_adev->reset_task_info));<br>
                +                               if
                (reset_context->job &&
                reset_context->job->vm)<br>
                +                                      
                tmp_adev->reset_task_info =<br>
                +                                              
                reset_context->job->vm->task_info;<br>
                +                              
                amdgpu_reset_capture_coredumpm(tmp_adev);<br>
                +#endif<br>
                                                 if (vram_lost) {<br>
                                                         DRM_INFO("VRAM
                is lost due to GPU reset!\n");<br>
                                                        
                amdgpu_inc_vram_lost(tmp_adev);<br>
                -- <br>
                2.32.0<br>
                <br>
              </div>
            </span></font></div>
      </div>
    </blockquote>
  </body>
</html>