<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body>
<p><br>
</p>
<div class="moz-cite-prefix">On 2021-04-27 3:25 a.m., Christian
König wrote:<br>
</div>
<blockquote type="cite" cite="mid:7b3b212a-bdca-1778-2732-4260253cd45d@gmail.com">Am
26.04.21 um 23:26 schrieb Philip Yang:
<br>
<blockquote type="cite">Add interface to remove address from fault
filter ring by resetting
<br>
fault ring entry key, then future vm fault on the address will
be
<br>
processed to recover.
<br>
<br>
Define fault key as atomic64_t type to use atomic
read/set/cmpxchg key
<br>
to protect fault ring access by interrupt handler and interrupt
deferred
<br>
work for vg20. Change fault->timestamp to 56-bit to share
same uint64_t
<br>
with 8-bit fault->next, it is big enough for 48bit IH
timestamp.
<br>
<br>
Signed-off-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a>
<br>
---
<br>
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 54
+++++++++++++++++++++++--
<br>
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 6 ++-
<br>
2 files changed, 55 insertions(+), 5 deletions(-)
<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
<br>
index c39ed9eb0987..888b749bd75e 100644
<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
<br>
@@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct
amdgpu_device *adev, struct amdgpu_gmc *mc)
<br>
mc->agp_size >> 20, mc->agp_start,
mc->agp_end);
<br>
}
<br>
+/**
<br>
+ * amdgpu_gmc_fault_key - get hask key from vm fault address
and pasid
<br>
+ *
<br>
+ * @addr: 48bit physical address
<br>
+ * @pasid: 4 bit
<br>
+ */
<br>
+static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr,
uint16_t pasid)
<br>
+{
<br>
+ return addr << 4 | pasid;
<br>
+}
<br>
+
<br>
/**
<br>
* amdgpu_gmc_filter_faults - filter VM faults
<br>
*
<br>
@@ -349,13 +360,14 @@ bool amdgpu_gmc_filter_faults(struct
amdgpu_device *adev, uint64_t addr,
<br>
{
<br>
struct amdgpu_gmc *gmc = &adev->gmc;
<br>
- uint64_t stamp, key = addr << 4 | pasid;
<br>
+ uint64_t stamp, key = amdgpu_gmc_fault_key(addr, pasid);
<br>
struct amdgpu_gmc_fault *fault;
<br>
uint32_t hash;
<br>
/* If we don't have space left in the ring buffer return
immediately */
<br>
stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
<br>
AMDGPU_GMC_FAULT_TIMEOUT;
<br>
+
<br>
if (gmc->fault_ring[gmc->last_fault].timestamp >=
stamp)
<br>
return true;
<br>
@@ -365,7 +377,7 @@ bool amdgpu_gmc_filter_faults(struct
amdgpu_device *adev, uint64_t addr,
<br>
while (fault->timestamp >= stamp) {
<br>
uint64_t tmp;
<br>
- if (fault->key == key)
<br>
+ if (atomic64_read(&fault->key) == key)
<br>
return true;
<br>
tmp = fault->timestamp;
<br>
@@ -378,7 +390,7 @@ bool amdgpu_gmc_filter_faults(struct
amdgpu_device *adev, uint64_t addr,
<br>
/* Add the fault to the ring */
<br>
fault = &gmc->fault_ring[gmc->last_fault];
<br>
- fault->key = key;
<br>
+ atomic64_set(&fault->key, key);
<br>
fault->timestamp = timestamp;
<br>
/* And update the hash */
<br>
@@ -387,6 +399,42 @@ bool amdgpu_gmc_filter_faults(struct
amdgpu_device *adev, uint64_t addr,
<br>
return false;
<br>
}
<br>
+/**
<br>
+ * amdgpu_gmc_filter_faults_remove - remove address from VM
faults filter
<br>
+ *
<br>
+ * @adev: amdgpu device structure
<br>
+ * @addr: address of the VM fault
<br>
+ * @pasid: PASID of the process causing the fault
<br>
+ *
<br>
+ * Remove the address from fault filter, then future vm fault
on this address
<br>
+ * will pass to retry fault handler to recover.
<br>
+ */
<br>
+void amdgpu_gmc_filter_faults_remove(struct amdgpu_device
*adev, uint64_t addr,
<br>
+ uint16_t pasid)
<br>
+{
<br>
+ struct amdgpu_gmc *gmc = &adev->gmc;
<br>
+
<br>
+ uint64_t key = amdgpu_gmc_fault_key(addr, pasid);
<br>
+ struct amdgpu_gmc_fault *fault;
<br>
+ uint32_t hash;
<br>
+
<br>
+ hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
<br>
+ fault =
&gmc->fault_ring[gmc->fault_hash[hash].idx];
<br>
+ while (true) {
<br>
+ uint64_t tmp;
<br>
+
<br>
+ if (atomic64_cmpxchg(&fault->key, key, 0) ==
key)
<br>
+ break;
<br>
+
<br>
+ tmp = fault->timestamp;
<br>
+ fault = &gmc->fault_ring[fault->next];
<br>
+
<br>
+ /* Check if the entry was reused */
<br>
+ if (fault->timestamp >= tmp)
<br>
+ break;
<br>
+ }
<br>
</blockquote>
<br>
Maybe rewrite this as "do { ... } while (fault->timestamp <
tmp)".
<br>
<br>
<blockquote type="cite">+}
<br>
+
<br>
int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
<br>
{
<br>
int r;
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
<br>
index 9d11c02a3938..95e18ef83aec 100644
<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
<br>
@@ -66,9 +66,9 @@ struct firmware;
<br>
* GMC page fault information
<br>
*/
<br>
struct amdgpu_gmc_fault {
<br>
- uint64_t timestamp;
<br>
+ uint64_t timestamp:56;
<br>
</blockquote>
<br>
I think 48 bits should be enough for the timestamp for current
hardware.
<br>
<br>
</blockquote>
<p>Thanks for the advise, I will fix both and send out new patch.</p>
<p>Philip<br>
</p>
<blockquote type="cite" cite="mid:7b3b212a-bdca-1778-2732-4260253cd45d@gmail.com">Apart
from that looks good to me now,
<br>
Christian.
<br>
<br>
<blockquote type="cite"> uint64_t
next:AMDGPU_GMC_FAULT_RING_ORDER;
<br>
- uint64_t key:52;
<br>
+ atomic64_t key;
<br>
};
<br>
/*
<br>
@@ -318,6 +318,8 @@ void amdgpu_gmc_agp_location(struct
amdgpu_device *adev,
<br>
struct amdgpu_gmc *mc);
<br>
bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
uint64_t addr,
<br>
uint16_t pasid, uint64_t timestamp);
<br>
+void amdgpu_gmc_filter_faults_remove(struct amdgpu_device
*adev, uint64_t addr,
<br>
+ uint16_t pasid);
<br>
int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
<br>
void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
<br>
int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device
*adev);
<br>
</blockquote>
<br>
</blockquote>
</body>
</html>