[PATCH] drm/amdgpu: revert "Add autodump debugfs node for gpu reset v8"
Das, Nirmoy
nirmoy.das at amd.com
Thu Sep 30 09:48:20 UTC 2021
Acked-by: Nirmoy Das <nirmoy.das at amd.com>
On 9/30/2021 11:26 AM, Christian König wrote:
> This reverts commit 728e7e0cd61899208e924472b9e641dbeb0775c4.
>
> Further discussion reveals that this feature is severely broken
> and needs to be reverted ASAP.
>
> GPU reset can never be delayed by userspace, even for debugging;
> otherwise we can run into in-kernel deadlocks.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 -
> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 80 ---------------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 5 --
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 --
> 4 files changed, 91 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index dc3c6b3a00e5..6a1928a720a6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1078,8 +1078,6 @@ struct amdgpu_device {
> char product_name[32];
> char serial[20];
>
> - struct amdgpu_autodump autodump;
> -
> atomic_t throttling_logging_enabled;
> struct ratelimit_state throttling_logging_rs;
> uint32_t ras_hw_enabled;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 277128846dd1..0b89ba142a59 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -27,7 +27,6 @@
> #include <linux/pci.h>
> #include <linux/uaccess.h>
> #include <linux/pm_runtime.h>
> -#include <linux/poll.h>
>
> #include "amdgpu.h"
> #include "amdgpu_pm.h"
> @@ -37,85 +36,7 @@
> #include "amdgpu_securedisplay.h"
> #include "amdgpu_fw_attestation.h"
>
> -int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
> -{
> #if defined(CONFIG_DEBUG_FS)
> - unsigned long timeout = 600 * HZ;
> - int ret;
> -
> - wake_up_interruptible(&adev->autodump.gpu_hang);
> -
> - ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
> - if (ret == 0) {
> - pr_err("autodump: timeout, move on to gpu recovery\n");
> - return -ETIMEDOUT;
> - }
> -#endif
> - return 0;
> -}
> -
> -#if defined(CONFIG_DEBUG_FS)
> -
> -static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
> -{
> - struct amdgpu_device *adev = inode->i_private;
> - int ret;
> -
> - file->private_data = adev;
> -
> - ret = down_read_killable(&adev->reset_sem);
> - if (ret)
> - return ret;
> -
> - if (adev->autodump.dumping.done) {
> - reinit_completion(&adev->autodump.dumping);
> - ret = 0;
> - } else {
> - ret = -EBUSY;
> - }
> -
> - up_read(&adev->reset_sem);
> -
> - return ret;
> -}
> -
> -static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
> -{
> - struct amdgpu_device *adev = file->private_data;
> -
> - complete_all(&adev->autodump.dumping);
> - return 0;
> -}
> -
> -static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
> -{
> - struct amdgpu_device *adev = file->private_data;
> -
> - poll_wait(file, &adev->autodump.gpu_hang, poll_table);
> -
> - if (amdgpu_in_reset(adev))
> - return POLLIN | POLLRDNORM | POLLWRNORM;
> -
> - return 0;
> -}
> -
> -static const struct file_operations autodump_debug_fops = {
> - .owner = THIS_MODULE,
> - .open = amdgpu_debugfs_autodump_open,
> - .poll = amdgpu_debugfs_autodump_poll,
> - .release = amdgpu_debugfs_autodump_release,
> -};
> -
> -static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
> -{
> - init_completion(&adev->autodump.dumping);
> - complete_all(&adev->autodump.dumping);
> - init_waitqueue_head(&adev->autodump.gpu_hang);
> -
> - debugfs_create_file("amdgpu_autodump", 0600,
> - adev_to_drm(adev)->primary->debugfs_root,
> - adev, &autodump_debug_fops);
> -}
>
> /**
> * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
> @@ -1590,7 +1511,6 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
> }
>
> amdgpu_ras_debugfs_create_all(adev);
> - amdgpu_debugfs_autodump_init(adev);
> amdgpu_rap_debugfs_init(adev);
> amdgpu_securedisplay_debugfs_init(adev);
> amdgpu_fw_attestation_debugfs_init(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> index 141a8474e24f..8b641f40fdf6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> @@ -26,10 +26,6 @@
> /*
> * Debugfs
> */
> -struct amdgpu_autodump {
> - struct completion dumping;
> - struct wait_queue_head gpu_hang;
> -};
>
> int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
> int amdgpu_debugfs_init(struct amdgpu_device *adev);
> @@ -37,4 +33,3 @@ void amdgpu_debugfs_fini(struct amdgpu_device *adev);
> void amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
> void amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
> void amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
> -int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 41c6b3aacd37..4d34b2da8582 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4458,10 +4458,6 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> if (reset_context->reset_req_dev == adev)
> job = reset_context->job;
>
> - /* no need to dump if device is not in good state during probe period */
> - if (!adev->gmc.xgmi.pending_reset)
> - amdgpu_debugfs_wait_dump(adev);
> -
> if (amdgpu_sriov_vf(adev)) {
> /* stop the data exchange thread */
> amdgpu_virt_fini_data_exchange(adev);
More information about the amd-gfx
mailing list