<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 2022-06-30 10:19, Felix Kuehling
      wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:7c8b25a1-2937-18b6-c329-e901673ae691@amd.com">
      <br>
      Am 2022-06-28 um 10:50 schrieb Philip Yang:
      <br>
      <blockquote type="cite">Use ktime_get_boottime_ns() as timestamp
        to correlate with other
        <br>
        APIs. Output timestamp when GPU recoverable fault starts and
        ends to
        <br>
        recover the fault, if migration happened or only GPU page table
        is
        <br>
        updated to recover, fault address, if read or write fault.
        <br>
        <br>
        Signed-off-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com">&lt;Philip.Yang@amd.com&gt;</a>
        <br>
        ---
        <br>
          drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 17
        +++++++++++++++++
        <br>
          drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |  6 +++++-
        <br>
          drivers/gpu/drm/amd/amdkfd/kfd_svm.c        | 17
        +++++++++++++----
        <br>
          drivers/gpu/drm/amd/amdkfd/kfd_svm.h        |  2 +-
        <br>
          4 files changed, 36 insertions(+), 6 deletions(-)
        <br>
        <br>
        diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
        b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
        <br>
        index 55ed026435e2..b7e68283925f 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
        <br>
        @@ -244,6 +244,23 @@ void kfd_smi_event_update_vmfault(struct
        kfd_dev *dev, uint16_t pasid)
        <br>
                        task_info.pid, task_info.task_name);
        <br>
          }
        <br>
          +void kfd_smi_event_page_fault_start(struct kfd_dev *dev,
        pid_t pid,
        <br>
        +                    unsigned long address, bool write_fault,
        <br>
        +                    ktime_t ts)
        <br>
        +{
        <br>
        +    kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_START,
        <br>
        +              "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
        <br>
        +              address, dev->id, write_fault ? 'W' : 'R');
        <br>
        +}
        <br>
        +
        <br>
        +void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t
        pid,
        <br>
        +                  unsigned long address, bool migration)
        <br>
        +{
        <br>
        +    kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_END,
        <br>
        +              "%lld -%d @%lx(%x) %c\n",
        ktime_get_boottime_ns(),
        <br>
        +              pid, address, dev->id, migration ? 'M' : 'U');
        <br>
        +}
        <br>
        +
        <br>
          int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
        <br>
          {
        <br>
              struct kfd_smi_client *client;
        <br>
        diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
        b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
        <br>
        index dfe101c21166..7903718cd9eb 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
        <br>
        +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
        <br>
        @@ -29,5 +29,9 @@ void kfd_smi_event_update_vmfault(struct
        kfd_dev *dev, uint16_t pasid);
        <br>
          void kfd_smi_event_update_thermal_throttling(struct kfd_dev
        *dev,
        <br>
                                   uint64_t throttle_bitmask);
        <br>
          void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool
        post_reset);
        <br>
        -
        <br>
        +void kfd_smi_event_page_fault_start(struct kfd_dev *dev, pid_t
        pid,
        <br>
        +                    unsigned long address, bool write_fault,
        <br>
        +                    ktime_t ts);
        <br>
        +void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t
        pid,
        <br>
        +                  unsigned long address, bool migration);
        <br>
          #endif
        <br>
        diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
        b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
        <br>
        index d6fc00d51c8c..2ad08a1f38dd 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
        <br>
        @@ -32,6 +32,7 @@
        <br>
          #include "kfd_priv.h"
        <br>
          #include "kfd_svm.h"
        <br>
          #include "kfd_migrate.h"
        <br>
        +#include "kfd_smi_events.h"
        <br>
            #ifdef dev_fmt
        <br>
          #undef dev_fmt
        <br>
        @@ -1617,7 +1618,7 @@ static int
        svm_range_validate_and_map(struct mm_struct *mm,
        <br>
              svm_range_unreserve_bos(&ctx);
        <br>
                if (!r)
        <br>
        -        prange->validate_timestamp =
        ktime_to_us(ktime_get());
        <br>
        +        prange->validate_timestamp = ktime_get_boottime();
        <br>
                return r;
        <br>
          }
        <br>
        @@ -2694,11 +2695,12 @@ svm_range_restore_pages(struct
        amdgpu_device *adev, unsigned int pasid,
        <br>
              struct svm_range_list *svms;
        <br>
              struct svm_range *prange;
        <br>
              struct kfd_process *p;
        <br>
        -    uint64_t timestamp;
        <br>
        +    ktime_t timestamp = ktime_get_boottime();
        <br>
              int32_t best_loc;
        <br>
              int32_t gpuidx = MAX_GPU_INSTANCE;
        <br>
              bool write_locked = false;
        <br>
              struct vm_area_struct *vma;
        <br>
        +    bool migration = false;
        <br>
              int r = 0;
        <br>
                if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
        <br>
        @@ -2775,9 +2777,9 @@ svm_range_restore_pages(struct
        amdgpu_device *adev, unsigned int pasid,
        <br>
                  goto out_unlock_range;
        <br>
              }
        <br>
          -    timestamp = ktime_to_us(ktime_get()) -
        prange->validate_timestamp;
        <br>
              /* skip duplicate vm fault on different pages of same
        range */
        <br>
        -    if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
        <br>
        +    if (ktime_before(timestamp,
        ktime_add_ns(prange->validate_timestamp,
        <br>
        +                AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) {
        <br>
      </blockquote>
      <br>
      You changed the timestamp units from us to ns. I think you'll need
      to update AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING (multiply it by
      1000) to account for that.
      <br>
    </blockquote>
    <p>Thanks for catching this. The change was in v4; I removed it by
      mistake when updating comments.</p>
    <p>Philip<br>
    </p>
    <blockquote type="cite" cite="mid:7c8b25a1-2937-18b6-c329-e901673ae691@amd.com">
      <br>
      Other than that, this patch is
      <br>
      <br>
      Reviewed-by: Felix Kuehling <a class="moz-txt-link-rfc2396E" href="mailto:Felix.Kuehling@amd.com">&lt;Felix.Kuehling@amd.com&gt;</a>
      <br>
      <br>
      <br>
      <blockquote type="cite">          pr_debug("svms 0x%p [0x%lx %lx]
        already restored\n",
        <br>
                       svms, prange->start, prange->last);
        <br>
                  r = 0;
        <br>
        @@ -2813,7 +2815,11 @@ svm_range_restore_pages(struct
        amdgpu_device *adev, unsigned int pasid,
        <br>
                   svms, prange->start, prange->last, best_loc,
        <br>
                   prange->actual_loc);
        <br>
          +    kfd_smi_event_page_fault_start(adev->kfd.dev,
        p->lead_thread->pid, addr,
        <br>
        +                       write_fault, timestamp);
        <br>
        +
        <br>
              if (prange->actual_loc != best_loc) {
        <br>
        +        migration = true;
        <br>
                  if (best_loc) {
        <br>
                      r = svm_migrate_to_vram(prange, best_loc, mm);
        <br>
                      if (r) {
        <br>
        @@ -2842,6 +2848,9 @@ svm_range_restore_pages(struct
        amdgpu_device *adev, unsigned int pasid,
        <br>
                  pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to
        gpus\n",
        <br>
                       r, svms, prange->start, prange->last);
        <br>
          +    kfd_smi_event_page_fault_end(adev->kfd.dev,
        p->lead_thread->pid, addr,
        <br>
        +                     migration);
        <br>
        +
        <br>
          out_unlock_range:
        <br>
              mutex_unlock(&prange->migrate_mutex);
        <br>
          out_unlock_svms:
        <br>
        diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
        b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
        <br>
        index 2d54147b4dda..eab7f6d3b13c 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
        <br>
        +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
        <br>
        @@ -125,7 +125,7 @@ struct svm_range {
        <br>
              uint32_t            actual_loc;
        <br>
              uint8_t                granularity;
        <br>
              atomic_t            invalid;
        <br>
        -    uint64_t            validate_timestamp;
        <br>
        +    ktime_t                validate_timestamp;
        <br>
              struct mmu_interval_notifier    notifier;
        <br>
              struct svm_work_list_item    work_item;
        <br>
              struct list_head        deferred_list;
        <br>
      </blockquote>
    </blockquote>
  </body>
</html>