<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 2021-04-27 3:25 a.m., Christian
      König wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:7b3b212a-bdca-1778-2732-4260253cd45d@gmail.com">Am
      26.04.21 um 23:26 schrieb Philip Yang:
      <br>
      <blockquote type="cite">Add interface to remove address from fault
        filter ring by resetting
        <br>
        fault ring entry key, then future vm fault on the address will
        be
        <br>
        processed to recover.
        <br>
        <br>
        Define fault key as atomic64_t type to use atomic
        read/set/cmpxchg key
        <br>
        to protect fault ring access by interrupt handler and interrupt
        deferred
        <br>
        work for vg20. Change fault->timestamp to 56-bit to share
        same uint64_t
        <br>
        with 8-bit fault->next, it is big enough for 48bit IH
        timestamp.
        <br>
        <br>
        Signed-off-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a>
        <br>
        ---
        <br>
          drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 54
        +++++++++++++++++++++++--
        <br>
          drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  6 ++-
        <br>
          2 files changed, 55 insertions(+), 5 deletions(-)
        <br>
        <br>
        diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
        b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
        <br>
        index c39ed9eb0987..888b749bd75e 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
        <br>
        @@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct
        amdgpu_device *adev, struct amdgpu_gmc *mc)
        <br>
                      mc->agp_size >> 20, mc->agp_start,
        mc->agp_end);
        <br>
          }
        <br>
          +/**
        <br>
        + * amdgpu_gmc_fault_key - get hask key from vm fault address
        and pasid
        <br>
        + *
        <br>
        + * @addr: 48bit physical address
        <br>
        + * @pasid: 4 bit
        <br>
        + */
        <br>
        +static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr,
        uint16_t pasid)
        <br>
        +{
        <br>
        +    return addr << 4 | pasid;
        <br>
        +}
        <br>
        +
        <br>
          /**
        <br>
           * amdgpu_gmc_filter_faults - filter VM faults
        <br>
           *
        <br>
        @@ -349,13 +360,14 @@ bool amdgpu_gmc_filter_faults(struct
        amdgpu_device *adev, uint64_t addr,
        <br>
          {
        <br>
              struct amdgpu_gmc *gmc = &adev->gmc;
        <br>
          -    uint64_t stamp, key = addr << 4 | pasid;
        <br>
        +    uint64_t stamp, key = amdgpu_gmc_fault_key(addr, pasid);
        <br>
              struct amdgpu_gmc_fault *fault;
        <br>
              uint32_t hash;
        <br>
                /* If we don't have space left in the ring buffer return
        immediately */
        <br>
              stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
        <br>
                  AMDGPU_GMC_FAULT_TIMEOUT;
        <br>
        +
        <br>
              if (gmc->fault_ring[gmc->last_fault].timestamp >=
        stamp)
        <br>
                  return true;
        <br>
          @@ -365,7 +377,7 @@ bool amdgpu_gmc_filter_faults(struct
        amdgpu_device *adev, uint64_t addr,
        <br>
              while (fault->timestamp >= stamp) {
        <br>
                  uint64_t tmp;
        <br>
          -        if (fault->key == key)
        <br>
        +        if (atomic64_read(&fault->key) == key)
        <br>
                      return true;
        <br>
                    tmp = fault->timestamp;
        <br>
        @@ -378,7 +390,7 @@ bool amdgpu_gmc_filter_faults(struct
        amdgpu_device *adev, uint64_t addr,
        <br>
                /* Add the fault to the ring */
        <br>
              fault = &gmc->fault_ring[gmc->last_fault];
        <br>
        -    fault->key = key;
        <br>
        +    atomic64_set(&fault->key, key);
        <br>
              fault->timestamp = timestamp;
        <br>
                /* And update the hash */
        <br>
        @@ -387,6 +399,42 @@ bool amdgpu_gmc_filter_faults(struct
        amdgpu_device *adev, uint64_t addr,
        <br>
              return false;
        <br>
          }
        <br>
          +/**
        <br>
        + * amdgpu_gmc_filter_faults_remove - remove address from VM
        faults filter
        <br>
        + *
        <br>
        + * @adev: amdgpu device structure
        <br>
        + * @addr: address of the VM fault
        <br>
        + * @pasid: PASID of the process causing the fault
        <br>
        + *
        <br>
        + * Remove the address from fault filter, then future vm fault
        on this address
        <br>
        + * will pass to retry fault handler to recover.
        <br>
        + */
        <br>
        +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device
        *adev, uint64_t addr,
        <br>
        +                     uint16_t pasid)
        <br>
        +{
        <br>
        +    struct amdgpu_gmc *gmc = &adev->gmc;
        <br>
        +
        <br>
        +    uint64_t key = amdgpu_gmc_fault_key(addr, pasid);
        <br>
        +    struct amdgpu_gmc_fault *fault;
        <br>
        +    uint32_t hash;
        <br>
        +
        <br>
        +    hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
        <br>
        +    fault =
        &gmc->fault_ring[gmc->fault_hash[hash].idx];
        <br>
        +    while (true) {
        <br>
        +        uint64_t tmp;
        <br>
        +
        <br>
        +        if (atomic64_cmpxchg(&fault->key, key, 0) ==
        key)
        <br>
        +            break;
        <br>
        +
        <br>
        +        tmp = fault->timestamp;
        <br>
        +        fault = &gmc->fault_ring[fault->next];
        <br>
        +
        <br>
        +        /* Check if the entry was reused */
        <br>
        +        if (fault->timestamp >= tmp)
        <br>
        +            break;
        <br>
        +    }
        <br>
      </blockquote>
      <br>
      Maybe rewrite this as "do { ... } while (fault->timestamp <
      tmp)".
      <br>
      <br>
      <blockquote type="cite">+}
        <br>
        +
        <br>
          int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
        <br>
          {
        <br>
              int r;
        <br>
        diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
        b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
        <br>
        index 9d11c02a3938..95e18ef83aec 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
        <br>
        +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
        <br>
        @@ -66,9 +66,9 @@ struct firmware;
        <br>
           * GMC page fault information
        <br>
           */
        <br>
          struct amdgpu_gmc_fault {
        <br>
        -    uint64_t    timestamp;
        <br>
        +    uint64_t    timestamp:56;
        <br>
      </blockquote>
      <br>
      I think 48 bits should be enough for the timestamp for current
      hardware.
      <br>
      <br>
    </blockquote>
    <p>Thanks for the advise, I will fix both and send out new patch.</p>
    <p>Philip<br>
    </p>
    <blockquote type="cite" cite="mid:7b3b212a-bdca-1778-2732-4260253cd45d@gmail.com">Apart
      from that looks good to me now,
      <br>
      Christian.
      <br>
      <br>
      <blockquote type="cite">      uint64_t   
        next:AMDGPU_GMC_FAULT_RING_ORDER;
        <br>
        -    uint64_t    key:52;
        <br>
        +    atomic64_t    key;
        <br>
          };
        <br>
            /*
        <br>
        @@ -318,6 +318,8 @@ void amdgpu_gmc_agp_location(struct
        amdgpu_device *adev,
        <br>
                           struct amdgpu_gmc *mc);
        <br>
          bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
        uint64_t addr,
        <br>
                            uint16_t pasid, uint64_t timestamp);
        <br>
        +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device
        *adev, uint64_t addr,
        <br>
        +                     uint16_t pasid);
        <br>
          int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
        <br>
          void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
        <br>
          int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device
        *adev);
        <br>
      </blockquote>
      <br>
    </blockquote>
  </body>
</html>