[PATCH 12/14] drm/amdkfd: Update interrupt handling for GFX9.4.3

Alex Deucher alexander.deucher at amd.com
Wed Mar 29 20:14:24 UTC 2023


From: Mukul Joshi <mukul.joshi at amd.com>

Update interrupt handling in CPX mode for GFX9.4.3 by using the
VMID space instead of SDMA client id to determine if an interrupt
should be processed by a KFD node. This is especially needed for
handling retry faults from MMHUB.

Signed-off-by: Mukul Joshi <mukul.joshi at amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c |  7 +++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h  | 16 ++++++----------
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c   |  8 ++++----
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h   |  2 +-
 6 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index a5064d95e7f5..e9b4b3c68b1f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2431,6 +2431,9 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
  * amdgpu_vm_handle_fault - graceful handling of VM faults.
  * @adev: amdgpu device pointer
  * @pasid: PASID of the VM
+ * @vmid: VMID, only used for GFX 9.4.3.
+ * @node_id: Node_id received in IH cookie. Only applicable for
+ *           GFX 9.4.3.
  * @addr: Address of the fault
  * @write_fault: true is write fault, false is read fault
  *
@@ -2438,7 +2441,7 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
  * shouldn't be reported any more.
  */
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-			    u32 client_id, u32 node_id, uint64_t addr,
+			    u32 vmid, u32 node_id, uint64_t addr,
 			    bool write_fault)
 {
 	bool is_compute_context = false;
@@ -2463,7 +2466,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
 
 	addr /= AMDGPU_GPU_PAGE_SIZE;
 
-	if (is_compute_context && !svm_range_restore_pages(adev, pasid, client_id,
+	if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
 	    node_id, addr, write_fault)) {
 		amdgpu_bo_unref(&root);
 		return true;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 721bc55bfafa..14864a8541ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -455,7 +455,7 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
 void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
 			     struct amdgpu_task_info *task_info);
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-			    u32 client_id, u32 node_id, uint64_t addr,
+			    u32 vmid, u32 node_id, uint64_t addr,
 			    bool write_fault);
 
 void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 5c209f2d38c6..d819b544b043 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -593,8 +593,8 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 		/* Try to handle the recoverable page faults by filling page
 		 * tables
 		 */
-		if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->client_id, node_id,
-					   addr, write_fault))
+		if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid,
+					   node_id, addr, write_fault))
 			return 1;
 	}
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index df372de6b056..fb3cf2c51da8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1073,18 +1073,14 @@ struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id);
 struct kfd_node *kfd_device_by_id(uint32_t gpu_id);
 struct kfd_node *kfd_device_by_pci_dev(const struct pci_dev *pdev);
 struct kfd_node *kfd_device_by_adev(const struct amdgpu_device *adev);
-static inline bool kfd_irq_is_from_node(struct kfd_node *node, uint32_t client_id,
-				     uint32_t node_id)
+static inline bool kfd_irq_is_from_node(struct kfd_node *node, uint32_t node_id,
+					uint32_t vmid)
 {
-	if ((node->interrupt_bitmap & (0x1U << node_id)) ||
-	    ((node_id % 4) == 0 &&
-	    (node->interrupt_bitmap >> 16) & (0x1U << client_id)))
-		return true;
-
-	return false;
+	return (node->interrupt_bitmap & (1 << node_id)) != 0 &&
+	       (node->compute_vmid_bitmap & (1 << vmid)) != 0;
 }
 static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev,
-					uint32_t client_id, uint32_t node_id) {
+					uint32_t node_id, uint32_t vmid) {
 	struct kfd_dev *dev = adev->kfd.dev;
 	uint32_t i;
 
@@ -1092,7 +1088,7 @@ static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev,
 		return dev->nodes[0];
 
 	for (i = 0; i < dev->num_nodes; i++)
-		if (kfd_irq_is_from_node(dev->nodes[i], client_id, node_id))
+		if (kfd_irq_is_from_node(dev->nodes[i], node_id, vmid))
 			return dev->nodes[i];
 
 	return NULL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 0e2b21ec468c..8bd9d88655b8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2788,7 +2788,7 @@ svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
 
 int
 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
-			uint32_t client_id, uint32_t node_id,
+			uint32_t vmid, uint32_t node_id,
 			uint64_t addr, bool write_fault)
 {
 	struct mm_struct *mm = NULL;
@@ -2840,10 +2840,10 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 		goto out;
 	}
 
-	node = kfd_node_by_irq_ids(adev, node_id, client_id);
+	node = kfd_node_by_irq_ids(adev, node_id, vmid);
 	if (!node) {
-		pr_debug("kfd node does not exist node_id: %d, client_id: %d\n", node_id,
-			 client_id);
+		pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id,
+			 vmid);
 		r = -EFAULT;
 		goto out;
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index a165c73b40b2..5116786718b6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -173,7 +173,7 @@ int svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
 			       unsigned long addr, struct svm_range *parent,
 			       struct svm_range *prange);
 int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
-			    uint32_t client_id, uint32_t node_id, uint64_t addr,
+			    uint32_t vmid, uint32_t node_id, uint64_t addr,
 			    bool write_fault);
 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence);
 void svm_range_add_list_work(struct svm_range_list *svms,
-- 
2.39.2



More information about the amd-gfx mailing list