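<!doctype html>
<html lang="en"><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Re: [PATCH 2/2] drm/amdgpu: add AMDGPURESET uevent on AMD GPU reset</title>
  </head>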
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 1/17/2022 3:49 PM, Christian König
      wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:62372e77-31c8-211f-0d9c-01a0f880badf@amd.com">Am
      17.01.22 um 11:09 schrieb Somalapuram, Amaranath:
      <br>
      <blockquote type="cite">[AMD Official Use Only]
        <br>
        <br>
        <br>
        <br>
        -----Original Message-----
        <br>
        From: Koenig, Christian <a class="moz-txt-link-rfc2396E" href="mailto:Christian.Koenig@amd.com">&lt;Christian.Koenig@amd.com&gt;</a>
        <br>
        Sent: Monday, January 17, 2022 3:33 PM
        <br>
        To: Somalapuram, Amaranath
        <a class="moz-txt-link-rfc2396E" href="mailto:Amaranath.Somalapuram@amd.com">&lt;Amaranath.Somalapuram@amd.com&gt;</a>;
        <a class="moz-txt-link-abbreviated" href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a>
        <br>
        Cc: Deucher, Alexander <a class="moz-txt-link-rfc2396E" href="mailto:Alexander.Deucher@amd.com">&lt;Alexander.Deucher@amd.com&gt;</a>;
        Sharma, Shashank <a class="moz-txt-link-rfc2396E" href="mailto:Shashank.Sharma@amd.com">&lt;Shashank.Sharma@amd.com&gt;</a>
        <br>
        Subject: Re: [PATCH 2/2] drm/amdgpu: add AMDGPURESET uevent on
        AMD GPU reset
        <br>
        <br>
        Am 17.01.22 um 11:01 schrieb Somalapuram, Amaranath:
        <br>
        <blockquote type="cite">[AMD Official Use Only]
          <br>
          <br>
          <br>
          <br>
          -----Original Message-----
          <br>
          From: Koenig, Christian <a class="moz-txt-link-rfc2396E" href="mailto:Christian.Koenig@amd.com">&lt;Christian.Koenig@amd.com&gt;</a>
          <br>
          Sent: Monday, January 17, 2022 12:57 PM
          <br>
          To: Somalapuram, Amaranath
          <a class="moz-txt-link-rfc2396E" href="mailto:Amaranath.Somalapuram@amd.com">&lt;Amaranath.Somalapuram@amd.com&gt;</a>;
          <br>
          <a class="moz-txt-link-abbreviated" href="mailto:amd-gfx@lists.freedesktop.org">amd-gfx@lists.freedesktop.org</a>
          <br>
          Cc: Deucher, Alexander <a class="moz-txt-link-rfc2396E" href="mailto:Alexander.Deucher@amd.com">&lt;Alexander.Deucher@amd.com&gt;</a>;
          Sharma, Shashank
          <br>
          <a class="moz-txt-link-rfc2396E" href="mailto:Shashank.Sharma@amd.com">&lt;Shashank.Sharma@amd.com&gt;</a>
          <br>
          Subject: Re: [PATCH 2/2] drm/amdgpu: add AMDGPURESET uevent on
          AMD GPU
          <br>
          reset
          <br>
          <br>
          Am 17.01.22 um 07:33 schrieb Somalapuram Amaranath:
          <br>
          <blockquote type="cite">AMDGPURESET uevent added to notify
            userspace, collect dump_stack and
            <br>
            trace
            <br>
            <br>
            Signed-off-by: Somalapuram Amaranath
            <a class="moz-txt-link-rfc2396E" href="mailto:Amaranath.Somalapuram@amd.com">&lt;Amaranath.Somalapuram@amd.com&gt;</a>
            <br>
            ---
            <br>
                drivers/gpu/drm/amd/amdgpu/nv.c | 45
            +++++++++++++++++++++++++++++++++
            <br>
                1 file changed, 45 insertions(+)
            <br>
            <br>
            diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c
            <br>
            b/drivers/gpu/drm/amd/amdgpu/nv.c index
            2ec1ffb36b1f..b73147ae41fb
            <br>
            100644
            <br>
            --- a/drivers/gpu/drm/amd/amdgpu/nv.c
            <br>
            +++ b/drivers/gpu/drm/amd/amdgpu/nv.c
            <br>
            @@ -529,10 +529,55 @@ nv_asic_reset_method(struct
            amdgpu_device *adev)
            <br>
                    }
            <br>
                }
            <br>
                +/**
            <br>
            + * drm_sysfs_reset_event - generate a DRM uevent
            <br>
            + * @dev: DRM device
            <br>
            + *
            <br>
            + * Send a uevent for the DRM device specified by @dev. 
            Currently we
            <br>
            +only
            <br>
            + * set AMDGPURESET=1 in the uevent environment, but this
            could be
            <br>
            +expanded to
            <br>
            + * deal with other types of events.
            <br>
            + *
            <br>
            + * Any new uapi should be using the
            <br>
            +drm_sysfs_connector_status_event()
            <br>
            + * for uevents on connector status change.
            <br>
            + */
            <br>
            +void drm_sysfs_reset_event(struct drm_device *dev)
            <br>
          </blockquote>
          This should probably go directly into the DRM subsystem.
          <br>
          <br>
          <blockquote type="cite">+{
            <br>
            +    char *event_string = "AMDGPURESET=1";
            <br>
            +    char *envp[2] = { event_string, NULL };
            <br>
            +
            <br>
            +   
            kobject_uevent_env(&amp;dev-&gt;primary-&gt;kdev-&gt;kobj,
            KOBJ_CHANGE, envp);
            <br>
          </blockquote>
          As I said this approach is a clear NAK. We can't allocate any
          memory when we do a reset.
          <br>
          <br>
          Regards,
          <br>
          Christian.
          <br>
          <br>
          Can I do something like this:
          <br>
          <br>
          void drm_sysfs_reset_event(struct drm_device *dev)
          <br>
             {
          <br>
          -       char *event_string = "AMDGPURESET=1";
          <br>
          -       char *envp[2] = { event_string, NULL };
          <br>
          +       char **envp;
          <br>
          +
          <br>
          +       envp = kcalloc(2,sizeof(char *), GFP_ATOMIC);
          <br>
          +       envp[0] = kcalloc(30, sizeof(char), GFP_ATOMIC);
          <br>
          +       envp[1] = kcalloc(30, sizeof(char), GFP_ATOMIC);
          <br>
        </blockquote>
        No, not really. kobject_uevent_env() will still allocate memory
        without GFP_ATOMIC.
        <br>
        <br>
        I think the whole approach of using udev won't work for this.
        <br>
        <br>
        Regards,
        <br>
        Christian.
        <br>
        <br>
        I have tested it with sample applications:
        <br>
        Gpu reset:
        <br>
        sudo cat /sys/kernel/debug/dri/0/amdgpu_gpu_recover
        <br>
        <br>
        And I never missed the AMDGPURESET=1 event in user space,
        <br>
      </blockquote>
      <br>
      That's not the problem. Allocating memory when we need to do a
      reset can cause a *HARD* kernel deadlock.
      <br>
      <br>
      This is absolutely not something we can do and Daniel even tried
      to add a few lockdep annotations for this.
      <br>
      <br>
      So automated testing scripts will complain that this won't work.
      <br>
      <br>
      Regards,
      <br>
      Christian.
      <br>
    </blockquote>
    Any suggestions on how we can notify user space in this situation?
    <p class="MsoNormal">Regards,</p>
    <p class="MsoNormal">S.Amarnath</p>
    <blockquote type="cite" cite="mid:62372e77-31c8-211f-0d9c-01a0f880badf@amd.com">
      <br>
      <blockquote type="cite">even with continuous resets with sudo cat
        /sys/kernel/debug/dri/0/amdgpu_gpu_recover .
        <br>
      </blockquote>
      <br>
      <br>
      <blockquote type="cite">
        <br>
        Regards,
        <br>
        S.Amarnath
        <br>
        <blockquote type="cite">+
          <br>
          +       strcpy(envp[0], "AMDGPURESET=1");
          <br>
          +       strcpy(envp[1], "");
          <br>
          +
          <br>
          <br>
                   
          kobject_uevent_env(&amp;dev-&gt;primary-&gt;kdev-&gt;kobj,
          KOBJ_CHANGE,
          <br>
          envp);
          <br>
          +
          <br>
          +       kfree(envp[0]);
          <br>
          +       kfree(envp[1]);
          <br>
          +       kfree(envp);
          <br>
             }
          <br>
          <br>
          Regards,
          <br>
          S.Amarnath
          <br>
          <br>
          <blockquote type="cite">+}
            <br>
            +
            <br>
            +void amdgpu_reset_dumps(struct amdgpu_device *adev) {
            <br>
            +    struct drm_device *ddev = adev_to_drm(adev);
            <br>
            +    int r = 0, i;
            <br>
            +
            <br>
            +    /* original raven doesn't have full asic reset */
            <br>
            +    if ((adev-&gt;apu_flags &amp; AMD_APU_IS_RAVEN)
            &amp;&amp;
            <br>
            +        !(adev-&gt;apu_flags &amp; AMD_APU_IS_RAVEN2))
            <br>
            +        return;
            <br>
            +    for (i = 0; i &lt; adev-&gt;num_ip_blocks; i++) {
            <br>
            +        if (!adev-&gt;ip_blocks[i].status.valid)
            <br>
            +            continue;
            <br>
            +        if
            (!adev-&gt;ip_blocks[i].version-&gt;funcs-&gt;reset_reg_dumps)
            <br>
            +            continue;
            <br>
            +        r =
            adev-&gt;ip_blocks[i].version-&gt;funcs-&gt;reset_reg_dumps(adev);
            <br>
            +
            <br>
            +        if (r)
            <br>
            +            DRM_ERROR("reset_reg_dumps of IP block
            &lt;%s&gt; failed %d\n",
            <br>
            +                   
            adev-&gt;ip_blocks[i].version-&gt;funcs-&gt;name, r);
            <br>
            +    }
            <br>
            +
            <br>
            +    drm_sysfs_reset_event(ddev);
            <br>
            +    dump_stack();
            <br>
            +}
            <br>
            +
            <br>
                static int nv_asic_reset(struct amdgpu_device *adev)
            <br>
                {
            <br>
                    int ret = 0;
            <br>
                +    amdgpu_reset_dumps(adev);
            <br>
                    switch (nv_asic_reset_method(adev)) {
            <br>
                    case AMD_RESET_METHOD_PCI:
            <br>
                        dev_info(adev-&gt;dev, "PCI reset\n");
            <br>
          </blockquote>
        </blockquote>
      </blockquote>
      <br>
    </blockquote>
  </body>
</html>