[PATCH 2/3] drm/amdgpu: introduce new API for GPU core dump
Trigger.Huang at amd.com
Thu Aug 15 11:38:35 UTC 2024
From: Trigger Huang <Trigger.Huang at amd.com>
Introduce a new API, amdgpu_device_gpu_coredump(), that puts the IP state dump and the dev_coredumpm registration together.
Signed-off-by: Trigger Huang <Trigger.Huang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 73 ++++++++++++++++++++++
2 files changed, 75 insertions(+)
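
Note for reviewers (not part of the commit message): below is a minimal,
purely illustrative sketch of how a caller such as a job timeout handler
might use the new API. The name amdgpu_job_timedout_example and the call
placement are hypothetical; the actual hook-up is left to the rest of this
series. A second sketch, of a per-IP dump_ip_state hook, follows the diff.

#include "amdgpu.h"
#include "amdgpu_job.h"

static enum drm_gpu_sched_stat
amdgpu_job_timedout_example(struct drm_sched_job *s_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
	struct amdgpu_job *job = to_amdgpu_job(s_job);
	struct amdgpu_device *adev = ring->adev;

	/*
	 * Capture registers and ring state while the hang is still
	 * present, before any reset is attempted.
	 */
	amdgpu_device_gpu_coredump(adev, job);

	/* ... normal timeout/recovery handling would continue here ... */
	return DRM_GPU_SCHED_STAT_NOMINAL;
}

Dumping before the reset path runs is the point of the new API: the state
registered with dev_coredumpm still reflects the hang itself.
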
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4dd465ad14af..b64284324961 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1401,6 +1401,8 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev);
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
struct amdgpu_job *job,
struct amdgpu_reset_context *reset_context);
+void amdgpu_device_gpu_coredump(struct amdgpu_device *adev,
+ struct amdgpu_job *job);
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev);
int amdgpu_device_pci_reset(struct amdgpu_device *adev);
bool amdgpu_device_need_post(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a6b8d0ba4758..69d6a5b7ca34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -145,6 +145,8 @@ const char *amdgpu_asic_name[] = {
};
static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
+static void amdgpu_device_gpu_coredump_single(struct amdgpu_device *adev,
+ struct amdgpu_job *job);
/**
* DOC: pcie_replay_count
@@ -5965,6 +5967,77 @@ static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
}
}
+static void amdgpu_device_gpu_coredump_single(struct amdgpu_device *adev,
+ struct amdgpu_job *job)
+{
+ struct amdgpu_reset_context reset_context;
+ bool vram_lost;
+ int i = 0;
+
+ dev_info(adev->dev, "Dumping IP State\n");
+	for (i = 0; i < adev->num_ip_blocks; i++) {
+		if (adev->ip_blocks[i].version->funcs->dump_ip_state)
+			adev->ip_blocks[i].version->funcs
+				->dump_ip_state((void *)adev);
+	}
+	dev_info(adev->dev, "Dumping IP State Completed\n");
+
+ vram_lost = amdgpu_device_check_vram_lost(adev);
+
+ memset(&reset_context, 0, sizeof(reset_context));
+ reset_context.job = job;
+
+ amdgpu_coredump(adev, vram_lost, &reset_context);
+}
+
+/**
+ * amdgpu_device_gpu_coredump - Dump GPU status
+ *
+ * @adev: amdgpu_device pointer
+ * @job: the job to dump the state for
+ *
+ * Attempt to dump the current GPU state, such as registers and ring buffers.
+ * This function is typically invoked when something goes wrong on the GPU,
+ * for example on a job timeout.
+ */
+void amdgpu_device_gpu_coredump(struct amdgpu_device *adev,
+ struct amdgpu_job *job)
+{
+ struct list_head device_list, *device_list_handle = NULL;
+ struct amdgpu_device *tmp_adev = NULL;
+ struct amdgpu_hive_info *hive = NULL;
+
+ if (!amdgpu_sriov_vf(adev))
+ hive = amdgpu_get_xgmi_hive(adev);
+ if (hive)
+ mutex_lock(&hive->hive_lock);
+
+ /*
+	 * Reuse the logic in amdgpu_device_gpu_recover() to build the list of
+	 * devices for core dump
+ */
+ INIT_LIST_HEAD(&device_list);
+ if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+ list_add_tail(&tmp_adev->reset_list, &device_list);
+ if (!list_is_first(&adev->reset_list, &device_list))
+ list_rotate_to_front(&adev->reset_list, &device_list);
+ device_list_handle = &device_list;
+ } else {
+ list_add_tail(&adev->reset_list, &device_list);
+ device_list_handle = &device_list;
+ }
+
+ /* Do the coredump for each device */
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list)
+ amdgpu_device_gpu_coredump_single(tmp_adev, job);
+
+ if (hive) {
+ mutex_unlock(&hive->hive_lock);
+ amdgpu_put_xgmi_hive(hive);
+ }
+}
+
/**
* amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
*
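
For completeness, a second purely illustrative sketch: the shape of the
per-IP dump_ip_state hook that the loop in amdgpu_device_gpu_coredump_single()
calls into. The example_* names, the register offset and the buffer size are
made up and do not correspond to any real IP block.

#include "amdgpu.h"

#define EXAMPLE_IP_REG_COUNT	16	/* hypothetical register count */

static u32 example_ip_dump[EXAMPLE_IP_REG_COUNT];	/* hypothetical snapshot buffer */

static void example_ip_dump_ip_state(void *handle)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
	u32 i;

	/* Real blocks read their own register lists via RREG32/SOC15 helpers */
	for (i = 0; i < EXAMPLE_IP_REG_COUNT; i++)
		example_ip_dump[i] = RREG32(0x1000 + i);	/* offset is made up */
}

static const struct amd_ip_funcs example_ip_funcs = {
	.name = "example_ip",
	/* ... other mandatory hooks (early_init, hw_init, ...) elided ... */
	.dump_ip_state = example_ip_dump_ip_state,
};

The data captured this way is what the dev_coredumpm blob registered by
amdgpu_coredump() later exposes to userspace.
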
--
2.34.1