<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>
</head>
<body dir="ltr">
<p style="font-family:Arial;font-size:10pt;color:#0000FF;margin:5pt;" align="Left">
[AMD Official Use Only - General]<br>
</p>
<br>
<div>
<div style="font-family: Calibri, Arial, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<br>
</div>
<div id="appendonsend"></div>
<div style="font-family:Calibri,Arial,Helvetica,sans-serif; font-size:12pt; color:rgb(0,0,0)">
<br>
</div>
<hr tabindex="-1" style="display:inline-block; width:98%">
<div id="divRplyFwdMsg" dir="ltr"><font face="Calibri, sans-serif" color="#000000" style="font-size:11pt"><b>From:</b> amd-gfx &lt;amd-gfx-bounces@lists.freedesktop.org&gt; on behalf of Somalapuram Amaranath &lt;Amaranath.Somalapuram@amd.com&gt;<br>
<b>Sent:</b> Thursday, May 26, 2022 5:48 PM<br>
<b>To:</b> amd-gfx@lists.freedesktop.org &lt;amd-gfx@lists.freedesktop.org&gt;<br>
<b>Cc:</b> Deucher, Alexander &lt;Alexander.Deucher@amd.com&gt;; Somalapuram, Amaranath &lt;Amaranath.Somalapuram@amd.com&gt;; Koenig, Christian &lt;Christian.Koenig@amd.com&gt;; Sharma, Shashank &lt;Shashank.Sharma@amd.com&gt;<br>
<b>Subject:</b> [PATCH v2 2/2] drm/amdgpu: adding device coredump support</font>
<div> </div>
</div>
<div class="BodyFragment"><font size="2"><span style="font-size:11pt">
<div class="PlainText elementToProof">Added device coredump information:<br>
- Kernel version<br>
- Module<br>
- Time<br>
- VRAM status<br>
- Guilty process name and PID<br>
- GPU register dumps<br>
v1 -> v2: Variable name change<br>
v1 -> v2: NULL check<br>
v1 -> v2: Code alignment<br>
v1 -> v2: Adding dummy amdgpu_devcoredump_free<br>
v1 -> v2: memset reset_task_info to zero<br>
<br>
Signed-off-by: Somalapuram Amaranath &lt;Amaranath.Somalapuram@amd.com&gt;<br>
---<br>
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +<br>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 67 ++++++++++++++++++++++<br>
2 files changed, 70 insertions(+)<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
index c79d9992b113..25a7b2c74928 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
@@ -1044,6 +1044,9 @@ struct amdgpu_device {<br>
uint32_t *reset_dump_reg_list;<br>
uint32_t *reset_dump_reg_value;<br>
int num_regs;<br>
+ struct amdgpu_task_info reset_task_info;<br>
+ bool reset_vram_lost;<br>
+ struct timespec64 reset_time;</div>
<div class="PlainText elementToProof"><br>
</div>
<div class="PlainText elementToProof">[kevin]:</div>
<div class="PlainText elementToProof">the <span style="background-color:rgb(255, 255, 255);display:inline !important">
CONFIG_DEV_COREDUMP check is needed for the above variables to avoid a compiler warning when the coredump feature is not enabled.</span><br>
</div>
<div class="PlainText elementToProof"> <br>
struct amdgpu_reset_domain *reset_domain;<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
index 866b4980a6fa..ca97afe5be63 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
@@ -32,6 +32,8 @@<br>
#include &lt;linux/slab.h&gt;<br>
#include &lt;linux/iommu.h&gt;<br>
#include &lt;linux/pci.h&gt;<br>
+#include &lt;linux/devcoredump.h&gt;<br>
+#include &lt;generated/utsrelease.h&gt;<br>
<br>
#include &lt;drm/drm_atomic_helper.h&gt;<br>
#include &lt;drm/drm_probe_helper.h&gt;<br>
@@ -4734,6 +4736,62 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)<br>
return 0;<br>
}<br>
<br>
+#ifdef CONFIG_DEV_COREDUMP<br>
+static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,<br>
+ size_t count, void *data, size_t datalen)<br>
+{<br>
+ struct drm_printer p;<br>
+ struct amdgpu_device *adev = data;<br>
+ struct drm_print_iterator iter;<br>
+ int i;<br>
+<br>
+ if (adev == NULL)<br>
+ return 0;</div>
<div class="PlainText elementToProof">[kevin]:</div>
<div class="PlainText elementToProof"> This check is not needed, because the private data is passed in by our own driver, as shown below:</div>
<div class="PlainText elementToProof"><br>
</div>
<div class="PlainText elementToProof"><span style="background-color:rgb(255, 255, 255);display:inline !important"> dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,</span><br style="background-color:rgb(255, 255, 255)">
</div>
<div class="PlainText elementToProof"><span style="background-color:rgb(255, 255, 255);display:inline !important"> amdgpu_devcoredump_read, amdgpu_devcoredump_free);</span><br>
</div>
<div class="PlainText elementToProof">+<br>
+ iter.data = buffer;<br>
+ iter.offset = 0;<br>
+ iter.start = offset;<br>
+ iter.remain = count;<br>
+<br>
+ p = drm_coredump_printer(&iter);<br>
+<br>
+ drm_printf(&p, "**** AMDGPU Device Coredump ****\n");<br>
+ drm_printf(&p, "kernel: " UTS_RELEASE "\n");<br>
+ drm_printf(&p, "module: " KBUILD_MODNAME "\n");<br>
+ drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);<br>
+ if (adev->reset_task_info.pid)<br>
+ drm_printf(&p, "process_name: %s PID: %d\n",<br>
+ adev->reset_task_info.process_name,<br>
+ adev->reset_task_info.pid);<br>
+<br>
+ if (adev->reset_vram_lost)<br>
+ drm_printf(&p, "VRAM is lost due to GPU reset!\n");<br>
+ if (adev->num_regs) {<br>
+ drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");<br>
+<br>
+ for (i = 0; i < adev->num_regs; i++)<br>
+ drm_printf(&p, "0x%08x: 0x%08x\n",<br>
+ adev->reset_dump_reg_list[i],<br>
+ adev->reset_dump_reg_value[i]);<br>
+ }<br>
+<br>
+ return count - iter.remain;<br>
+}<br>
+<br>
+static void amdgpu_devcoredump_free(void *data)<br>
+{<br>
+}<br>
+<br>
+static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)<br>
+{<br>
+ struct drm_device *dev = adev_to_drm(adev);<br>
+<br>
+ ktime_get_ts64(&adev->reset_time);<br>
+ dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,<br>
+ amdgpu_devcoredump_read, amdgpu_devcoredump_free);<br>
+}<br>
+#endif<br>
+<br>
int amdgpu_do_asic_reset(struct list_head *device_list_handle,<br>
struct amdgpu_reset_context *reset_context)<br>
{<br>
@@ -4818,6 +4876,15 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,<br>
goto out;<br>
<br>
vram_lost = amdgpu_device_check_vram_lost(tmp_adev);<br>
+#ifdef CONFIG_DEV_COREDUMP<br>
+ tmp_adev->reset_vram_lost = vram_lost;<br>
+ memset(&tmp_adev->reset_task_info, 0,<br>
+ sizeof(tmp_adev->reset_task_info));<br>
+ if (reset_context->job && reset_context->job->vm)<br>
+ tmp_adev->reset_task_info =<br>
+ reset_context->job->vm->task_info;<br>
+ amdgpu_reset_capture_coredumpm(tmp_adev);<br>
+#endif<br>
if (vram_lost) {<br>
DRM_INFO("VRAM is lost due to GPU reset!\n");<br>
amdgpu_inc_vram_lost(tmp_adev);<br>
-- <br>
2.32.0<br>
<br>
</div>
</span></font></div>
</div>
</body>
</html>