[PATCH 2/3] drm/amdgpu: support utcl2 RAS poison query for mmhub

Tao Zhou tao.zhou1 at amd.com
Wed Mar 13 09:11:54 UTC 2024


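Support the UTCL2 RAS poison status query for both gfxhub and mmhub.
Replace the xcc_id parameter with hub_inst and add a hub_type argument
that selects which hub is queried (zero for gfxhub, non-zero for mmhub).
The GMC v9 fault handler no longer restricts its early return for FED
errors to gfxhub, so mmhub FED errors are left to KFD as well.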

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c      | 17 ++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h      |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c           |  3 +--
 .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c    | 17 +++++++++++------
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 17 +++++++++++------
 5 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index fa958cbc603a..9687650b0fe3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -782,12 +782,19 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
 }
 
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
-			int xcc_id)
+			int hub_inst, int hub_type)
 {
-	if (adev->gfxhub.funcs->query_utcl2_poison_status)
-		return adev->gfxhub.funcs->query_utcl2_poison_status(adev, xcc_id);
-	else
-		return false;
+	if (!hub_type) {
+		if (adev->gfxhub.funcs->query_utcl2_poison_status)
+			return adev->gfxhub.funcs->query_utcl2_poison_status(adev, hub_inst);
+		else
+			return false;
+	} else {
+		if (adev->mmhub.funcs->query_utcl2_poison_status)
+			return adev->mmhub.funcs->query_utcl2_poison_status(adev, hub_inst);
+		else
+			return false;
+	}
 }
 
 int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 73b7fa7c5116..03bf20e0e3da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -406,7 +406,7 @@ bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
-			int xcc_id);
+			int hub_inst, int hub_type);
 int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
 		uint64_t size, u32 alloc_flag, int8_t xcp_id);
 void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index fb19b88e5522..d615d0fc2c6a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -672,8 +672,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 
 	/* for gfx fed error, kfd will handle it, return directly */
 	if (fed && amdgpu_ras_is_poison_mode_supported(adev) &&
-	    (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2)) &&
-	    (vmhub < AMDGPU_MMHUB0_START))
+	    (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2)))
 			return 0;
 
 	WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index a8e76287dde0..650da18b0d87 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -369,18 +369,23 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
 		uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
 		uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
 		uint32_t vmid_type = SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry);
-		int xcc_id = 0;
+		int hub_inst = 0;
 		struct kfd_hsa_memory_exception_data exception_data;
 
+		/* gfxhub */
 		if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) {
-			xcc_id = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
+			hub_inst = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
 				node_id);
-			if (xcc_id < 0)
-				xcc_id = 0;
+			if (hub_inst < 0)
+				hub_inst = 0;
 		}
 
-		if (client_id == SOC15_IH_CLIENTID_UTCL2 && !vmid_type &&
-		    amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, xcc_id)) {
+		/* mmhub */
+		if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
+			hub_inst = node_id / 4;
+
+		if (amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev,
+					hub_inst, vmid_type)) {
 			event_interrupt_poison_consumption(dev, pasid, client_id);
 			return;
 		}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index ff7392336795..11641f4645e6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -415,18 +415,23 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
 		uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
 		uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
 		uint32_t vmid_type = SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry);
-		int xcc_id = 0;
+		int hub_inst = 0;
 		struct kfd_hsa_memory_exception_data exception_data;
 
+		/* gfxhub */
 		if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) {
-			xcc_id = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
+			hub_inst = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
 				node_id);
-			if (xcc_id < 0)
-				xcc_id = 0;
+			if (hub_inst < 0)
+				hub_inst = 0;
 		}
 
-		if (client_id == SOC15_IH_CLIENTID_UTCL2 && !vmid_type &&
-		    amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, xcc_id)) {
+		/* mmhub */
+		if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
+			hub_inst = node_id / 4;
+
+		if (amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev,
+					hub_inst, vmid_type)) {
 			event_interrupt_poison_consumption_v9(dev, pasid, client_id);
 			return;
 		}
-- 
2.34.1



More information about the amd-gfx mailing list