[PATCH 144/159] drm/amdkfd: Add kernel parameter to stop queue eviction on vm fault

Alex Deucher alexander.deucher at amd.com
Wed Feb 24 22:18:44 UTC 2021


From: Oak Zeng <Oak.Zeng at amd.com>

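Add a module parameter, no_queue_eviction_on_vm_fault, that stops
process queues from being evicted on a GPU VM fault. This keeps the
wavefront context intact for debugging purposes. The default (0)
retains the existing behavior of evicting queues on a fault.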

Signed-off-by: Oak Zeng <Oak.Zeng at amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c          | 7 +++++++
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 5 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  | 5 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h            | 5 +++++
 4 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 1d7839dbbc33..c4d822b46ea4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -748,6 +748,13 @@ bool no_system_mem_limit;
 module_param(no_system_mem_limit, bool, 0644);
 MODULE_PARM_DESC(no_system_mem_limit, "disable system memory limit (false = default)");
 
+/**
+ * DOC: no_queue_eviction_on_vm_fault (int)
+ * If set, process queues will not be evicted on gpuvm fault. This is to keep the wavefront context for debugging (0 = queue eviction, 1 = no queue eviction). The default is 0 (queue eviction).
+ */
+int amdgpu_no_queue_eviction_on_vm_fault = 0;
+MODULE_PARM_DESC(no_queue_eviction_on_vm_fault, "No queue eviction on VM fault (0 = queue eviction, 1 = no queue eviction)");
+module_param_named(no_queue_eviction_on_vm_fault, amdgpu_no_queue_eviction_on_vm_fault, int, 0444);
 #endif
 
 /**
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 8e64c01565ac..60f752d75833 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -80,8 +80,9 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
 		ihre->source_id == CIK_INTSRC_SDMA_TRAP ||
 		ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
 		ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE ||
-		ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
-		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT;
+		((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) &&
+		!amdgpu_no_queue_eviction_on_vm_fault);
 }
 
 static void cik_event_interrupt_wq(struct kfd_dev *dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 74a460be077b..1c20458f3962 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -98,9 +98,10 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 		source_id == SOC15_INTSRC_SDMA_TRAP ||
 		source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
 		source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
-		client_id == SOC15_IH_CLIENTID_VMC ||
+		((client_id == SOC15_IH_CLIENTID_VMC ||
 		client_id == SOC15_IH_CLIENTID_VMC1 ||
-		client_id == SOC15_IH_CLIENTID_UTCL2;
+		client_id == SOC15_IH_CLIENTID_UTCL2) &&
+		!amdgpu_no_queue_eviction_on_vm_fault);
 }
 
 static void event_interrupt_wq_v9(struct kfd_dev *dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index e2ebd5a1d4de..b9839c650f21 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -169,6 +169,11 @@ extern bool hws_gws_support;
 /* Queue preemption timeout in ms */
 extern int queue_preemption_timeout_ms;
 
+/*
+ * Don't evict process queues on vm fault
+ */
+extern int amdgpu_no_queue_eviction_on_vm_fault;
+
 /* Enable eviction debug messages */
 extern bool debug_evictions;
 
-- 
2.29.2



More information about the amd-gfx mailing list