[PATCH 2/3] drm/amdgpu: support utcl2 RAS poison query for mmhub
Tao Zhou
tao.zhou1 at amd.com
Mon Mar 18 07:26:22 UTC 2024
Support the query for both gfxhub and mmhub, also replace
xcc_id with hub_inst.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 17 ++++++++++++-----
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +-
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 5 ++---
.../gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 17 +++++++++++------
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 17 +++++++++++------
5 files changed, 37 insertions(+), 21 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 0c794301d18d..0b4910108f61 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -782,12 +782,19 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
}
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
- int xcc_id)
+ int hub_inst, int hub_type)
{
- if (adev->gfxhub.funcs->query_utcl2_poison_status)
- return adev->gfxhub.funcs->query_utcl2_poison_status(adev, xcc_id);
- else
- return false;
+ if (!hub_type) {
+ if (adev->gfxhub.funcs->query_utcl2_poison_status)
+ return adev->gfxhub.funcs->query_utcl2_poison_status(adev, hub_inst);
+ else
+ return false;
+ } else {
+ if (adev->mmhub.funcs->query_utcl2_poison_status)
+ return adev->mmhub.funcs->query_utcl2_poison_status(adev, hub_inst);
+ else
+ return false;
+ }
}
int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 73b7fa7c5116..03bf20e0e3da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -406,7 +406,7 @@ bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *
void amdgpu_amdkfd_block_mmu_notifications(void *p);
int amdgpu_amdkfd_criu_resume(void *p);
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
- int xcc_id);
+ int hub_inst, int hub_type);
int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag, int8_t xcp_id);
void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index b3894fe868b2..4ba26d7e52bd 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -670,10 +670,9 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW);
fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
- /* for gfx fed error, kfd will handle it, return directly */
+ /* for fed error, kfd will handle it, return directly */
if (fed && amdgpu_ras_is_poison_mode_supported(adev) &&
- (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2)) &&
- (vmhub < AMDGPU_MMHUB0_START))
+ (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2)))
return 0;
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index a8e76287dde0..650da18b0d87 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -369,18 +369,23 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
uint32_t vmid_type = SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry);
- int xcc_id = 0;
+ int hub_inst = 0;
struct kfd_hsa_memory_exception_data exception_data;
+ /* gfxhub */
if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) {
- xcc_id = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
+ hub_inst = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
node_id);
- if (xcc_id < 0)
- xcc_id = 0;
+ if (hub_inst < 0)
+ hub_inst = 0;
}
- if (client_id == SOC15_IH_CLIENTID_UTCL2 && !vmid_type &&
- amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, xcc_id)) {
+ /* mmhub */
+ if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
+ hub_inst = node_id / 4;
+
+ if (amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev,
+ hub_inst, vmid_type)) {
event_interrupt_poison_consumption(dev, pasid, client_id);
return;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index ff7392336795..11641f4645e6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -415,18 +415,23 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
uint32_t vmid_type = SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry);
- int xcc_id = 0;
+ int hub_inst = 0;
struct kfd_hsa_memory_exception_data exception_data;
+ /* gfxhub */
if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) {
- xcc_id = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
+ hub_inst = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
node_id);
- if (xcc_id < 0)
- xcc_id = 0;
+ if (hub_inst < 0)
+ hub_inst = 0;
}
- if (client_id == SOC15_IH_CLIENTID_UTCL2 && !vmid_type &&
- amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, xcc_id)) {
+ /* mmhub */
+ if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
+ hub_inst = node_id / 4;
+
+ if (amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev,
+ hub_inst, vmid_type)) {
event_interrupt_poison_consumption_v9(dev, pasid, client_id);
return;
}
--
2.34.1
More information about the amd-gfx
mailing list