[PATCH] drm/amdgpu: Show retry fault message if process xnack on
Philip Yang
Philip.Yang@amd.com
Tue May 7 19:45:05 UTC 2024
If vm_context_cntl has XNACK enabled, GPU VM faults have the retry_fault
bit set, but the driver takes the XNACK-on or XNACK-off path based on the
per-process XNACK setting, which is also used to set the qpd mem_config
XNACK bit on or off when KFD_SUPPORT_XNACK_PER_PROCESS is supported.

If the process has XNACK on, show the retry page fault message for a GPU
page fault; otherwise show the no-retry page fault message. This avoids
misleading output when debugging application page fault issues.

The process lookup from the pasid is done inside the retry fault handler
svm_range_restore_pages. Add an xnack_on parameter to pass the process
XNACK setting back to amdgpu_vm_handle_fault and then to the GMC
interrupt handler, which prints the VM fault message.
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
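Note for reviewers: below is a minimal standalone sketch of the intended
flow, reduced to just the message selection. fake_process,
restore_pages_sketch and report_fault_sketch are placeholder names for
illustration only, not driver symbols.

/*
 * Sketch: the restore-pages side reports the per-process XNACK setting
 * through an optional output pointer, and the interrupt-handler side
 * combines it with the hardware retry_fault bit to pick the message.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_process {
        bool xnack_enabled;     /* per-process XNACK setting */
};

/* stands in for svm_range_restore_pages(): 0 on success, <0 on error */
static int restore_pages_sketch(struct fake_process *p, bool *xnack_on)
{
        if (xnack_on)
                *xnack_on = p->xnack_enabled;   /* report setting back */

        if (!p->xnack_enabled)
                return -1;      /* -EFAULT in the real handler */

        /* ... recoverable fault handling would happen here ... */
        return 0;
}

/* stands in for the gmc interrupt handler's reporting path */
static void report_fault_sketch(struct fake_process *p, bool retry_fault)
{
        bool xnack_on = false;

        if (restore_pages_sketch(p, &xnack_on))
                printf("%s page fault\n",
                       (retry_fault && xnack_on) ? "retry" : "no-retry");
}

int main(void)
{
        struct fake_process xnack_off_proc = { .xnack_enabled = false };

        /* retry_fault is set by HW, but the process has XNACK off,
         * so the message reads "no-retry" instead of "retry" */
        report_fault_sketch(&xnack_off_proc, true);
        return 0;
}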
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 5 +++--
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +-
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +-
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 7 ++++---
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 4 +++-
drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +-
6 files changed, 13 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 64ddc87f7fb6..58f7ab193027 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2757,13 +2757,14 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
* GFX 9.4.3.
* @addr: Address of the fault
* @write_fault: true is write fault, false is read fault
+ * @xnack_on: optional output, set to true if the process has XNACK enabled
*
* Try to gracefully handle a VM fault. Return true if the fault was handled and
* shouldn't be reported any more.
*/
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
u32 vmid, u32 node_id, uint64_t addr,
- bool write_fault)
+ bool write_fault, bool *xnack_on)
{
bool is_compute_context = false;
struct amdgpu_bo *root;
@@ -2788,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
addr /= AMDGPU_GPU_PAGE_SIZE;
if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
- node_id, addr, write_fault)) {
+ node_id, addr, write_fault, xnack_on)) {
amdgpu_bo_unref(&root);
return true;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index bc71b44387b2..7f364f0b9a60 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -549,7 +549,7 @@ void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);
bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
u32 vmid, u32 node_id, uint64_t addr,
- bool write_fault);
+ bool write_fault, bool *xnack_on);
void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index d933e19e0cf5..2f0752376236 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -132,7 +132,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
/* Try to handle the recoverable page faults by filling page
* tables
*/
- if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault))
+ if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault, NULL))
return 1;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 671a6766df5b..3db0f2304b6a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -558,6 +558,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
uint32_t cam_index = 0;
int ret, xcc_id = 0;
uint32_t node_id;
+ bool xnack_on = false;
node_id = entry->node_id;
@@ -595,7 +596,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
cam_index = entry->src_data[2] & 0x3ff;
ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
- addr, write_fault);
+ addr, write_fault, &xnack_on);
WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
if (ret)
return 1;
@@ -618,7 +619,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
* tables
*/
if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
- addr, write_fault))
+ addr, write_fault, &xnack_on))
return 1;
}
}
@@ -628,7 +629,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
dev_err(adev->dev,
"[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n", hub_name,
- retry_fault ? "retry" : "no-retry",
+ (retry_fault && xnack_on) ? "retry" : "no-retry",
entry->src_id, entry->ring_id, entry->vmid, entry->pasid);
task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 234ea0fbfa0c..9d44a52bc4b2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2887,7 +2887,7 @@ svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
int
svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
uint32_t vmid, uint32_t node_id,
- uint64_t addr, bool write_fault)
+ uint64_t addr, bool write_fault, bool *xnack_on)
{
unsigned long start, last, size;
struct mm_struct *mm = NULL;
@@ -2923,6 +2923,8 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
goto out;
}
+ if (xnack_on)
+ *xnack_on = p->xnack_enabled;
if (!p->xnack_enabled) {
pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
r = -EFAULT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 22f22b06a2f4..402f6fbb6452 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -182,7 +182,7 @@ int svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
void svm_range_vram_node_free(struct svm_range *prange);
int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
uint32_t vmid, uint32_t node_id, uint64_t addr,
- bool write_fault);
+ bool write_fault, bool *xnack_on);
int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence);
void svm_range_add_list_work(struct svm_range_list *svms,
struct svm_range *prange, struct mm_struct *mm,
--
2.43.2