<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 2024-08-29 17:15, Felix Kuehling
      wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:1376d3e2-f978-418d-9aee-c8282565d236@amd.com">On
      2024-08-23 15:49, Philip Yang wrote:
      <br>
      <blockquote type="cite">If GPU reset kick in while KFD
        restore_process_worker running, this may
        <br>
        causes different issues, for example below rcu stall warning,
        because
        <br>
        restore work may move BOs and evict queues under VRAM pressure.
        <br>
        <br>
        Fix this race by taking adev reset_domain read semaphore to
        prevent GPU
        <br>
        reset in restore_process_worker, the reset read semaphore can be
        taken
        <br>
        recursively if adev have multiple partitions.
        <br>
        <br>
        Then there is live locking issue if CP hangs while
        <br>
        restore_process_worker runs, then GPU reset wait for semaphore
        to start
        <br>
        and restore_process_worker cannot finish to release semaphore.
        We need
        <br>
        signal eviction fence to solve the live locking if evict queue
        return
        <br>
        -ETIMEOUT (for MES path) or -ETIME (for HWS path) because CP
        hangs,
        <br>
        <br>
          amdgpu 0000:af:00.0: amdgpu: GPU reset(21) succeeded!
        <br>
          rcu: INFO: rcu_sched self-detected stall on CPU
        <br>
        <br>
          Workqueue: kfd_restore_wq restore_process_worker [amdgpu]
        <br>
          Call Trace:
        <br>
           update_process_times+0x94/0xd0
        <br>
          RIP: 0010:amdgpu_vm_handle_moved+0x9a/0x210 [amdgpu]
        <br>
           amdgpu_amdkfd_gpuvm_restore_process_bos+0x3d6/0x7d0 [amdgpu]
        <br>
           restore_process_helper+0x27/0x80 [amdgpu]
        <br>
        <br>
        Signed-off-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a>
        <br>
      </blockquote>
      <br>
      See comments inline. I'd also like Christian to take a look at
      this patch since he's the expert on the reset locking stuff.
      <br>
      <br>
      <br>
      <blockquote type="cite">---
        <br>
          drivers/gpu/drm/amd/amdkfd/kfd_process.c | 56
        +++++++++++++++++++++++-
        <br>
          1 file changed, 55 insertions(+), 1 deletion(-)
        <br>
        <br>
        diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
        b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
        <br>
        index a902950cc060..53a814347522 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
        <br>
        @@ -35,6 +35,7 @@
        <br>
          #include <linux/pm_runtime.h>
        <br>
          #include "amdgpu_amdkfd.h"
        <br>
          #include "amdgpu.h"
        <br>
        +#include "amdgpu_reset.h"
        <br>
            struct mm_struct;
        <br>
          @@ -1972,8 +1973,14 @@ static void evict_process_worker(struct
        work_struct *work)
        <br>
                      kfd_process_restore_queues(p);
        <br>
                    pr_debug("Finished evicting pasid 0x%x\n",
        p->pasid);
        <br>
        -    } else
        <br>
        +    } else if (ret == -ETIMEDOUT || ret == -ETIME) {
        <br>
        +        /* If CP hangs, signal the eviction fence, then
        restore_bo_worker
        <br>
        +         * can finish to up_read GPU reset semaphore to start
        GPU reset.
        <br>
        +         */
        <br>
        +        signal_eviction_fence(p);
        <br>
        +    } else {
        <br>
                  pr_err("Failed to evict queues of pasid 0x%x\n",
        p->pasid);
        <br>
        +    }
        <br>
          }
        <br>
            static int restore_process_helper(struct kfd_process *p)
        <br>
        @@ -1997,6 +2004,45 @@ static int restore_process_helper(struct
        kfd_process *p)
        <br>
              return ret;
        <br>
          }
        <br>
          +/*
        <br>
        + * kfd_hold_devices_reset_semaphore
        <br>
        + *
        <br>
        + * return:
        <br>
        + *   true : hold reset domain semaphore to prevent device reset
        <br>
        + *   false: one of the device is resetting or already reset
        <br>
        + *
        <br>
        + */
        <br>
        +static bool kfd_hold_devices_reset_semaphore(struct kfd_process
        *p)
        <br>
      </blockquote>
      <br>
      I find the function naming of these functions (hold/unhold) a bit
      weird. I'd suggest
      kfd_process_trylock_reset_sems/kfd_process_unlock_reset_sems.
      <br>
    </blockquote>
    ok<br>
    <blockquote type="cite" cite="mid:1376d3e2-f978-418d-9aee-c8282565d236@amd.com">
      <br>
      <br>
      <blockquote type="cite">+{
        <br>
        +    struct amdgpu_device *adev;
        <br>
        +    int i;
        <br>
        +
        <br>
        +    for (i = 0; i < p->n_pdds; i++) {
        <br>
        +        adev = p->pdds[i]->dev->adev;
        <br>
        +        if
        (!down_read_trylock(&adev->reset_domain->sem))
        <br>
        +            goto out_upread;
        <br>
        +    }
        <br>
        +    return true;
        <br>
        +
        <br>
        +out_upread:
        <br>
        +    while (i--) {
        <br>
        +        adev = p->pdds[i]->dev->adev;
        <br>
        +        up_read(&adev->reset_domain->sem);
        <br>
        +    }
        <br>
        +    return false;
        <br>
        +}
        <br>
        +
        <br>
        +static void kfd_unhold_devices_reset_semaphore(struct
        kfd_process *p)
        <br>
        +{
        <br>
        +    struct amdgpu_device *adev;
        <br>
        +    int i;
        <br>
        +
        <br>
        +    for (i = 0; i < p->n_pdds; i++) {
        <br>
        +        adev = p->pdds[i]->dev->adev;
        <br>
        +        up_read(&adev->reset_domain->sem);
        <br>
        +    }
        <br>
        +}
        <br>
        +
        <br>
          static void restore_process_worker(struct work_struct *work)
        <br>
          {
        <br>
              struct delayed_work *dwork;
        <br>
        @@ -2009,6 +2055,12 @@ static void restore_process_worker(struct
        work_struct *work)
        <br>
               * lifetime of this thread, kfd_process p will be valid
        <br>
               */
        <br>
              p = container_of(dwork, struct kfd_process, restore_work);
        <br>
        +
        <br>
        +    if (!kfd_hold_devices_reset_semaphore(p)) {
        <br>
        +        pr_debug("GPU resetting, restore bo and queue
        skipped\n");
        <br>
      </blockquote>
      <br>
      Should we reschedule the restore worker to make sure it runs again
      after the reset is done?
      <br>
    </blockquote>
    <p>After GPU mode1 reset, user processes already aborted, it is
      meaningless to reschedule restore worker to update GPU mapping.</p>
    <p>Regards,</p>
    <p>Philip<br>
    </p>
    <blockquote type="cite" cite="mid:1376d3e2-f978-418d-9aee-c8282565d236@amd.com">
      <br>
      Thanks,
      <br>
        Felix
      <br>
      <br>
      <br>
      <blockquote type="cite">+        return;
        <br>
        +    }
        <br>
        +
        <br>
              pr_debug("Started restoring pasid 0x%x\n", p->pasid);
        <br>
                /* Setting last_restore_timestamp before successful
        restoration.
        <br>
        @@ -2031,6 +2083,8 @@ static void restore_process_worker(struct
        work_struct *work)
        <br>
                              
        msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
        <br>
                      kfd_process_restore_queues(p);
        <br>
              }
        <br>
        +
        <br>
        +    kfd_unhold_devices_reset_semaphore(p);
        <br>
          }
        <br>
            void kfd_suspend_all_processes(void)
        <br>
      </blockquote>
    </blockquote>
  </body>
</html>