[PATCH] drm/amdgpu: revert "Add autodump debugfs node for gpu reset v8"

Das, Nirmoy nirmoy.das at amd.com
Thu Sep 30 09:48:20 UTC 2021


Acked-by: Nirmoy Das <nirmoy.das at amd.com>

On 9/30/2021 11:26 AM, Christian König wrote:
> This reverts commit 728e7e0cd61899208e924472b9e641dbeb0775c4.
>
> Further discussion reveals that this feature is severely broken
> and needs to be reverted ASAP.
>
> GPU reset can never be delayed by userspace, even for debugging;
> otherwise we can run into in-kernel deadlocks.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 -
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 80 ---------------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  5 --
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 --
>   4 files changed, 91 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index dc3c6b3a00e5..6a1928a720a6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1078,8 +1078,6 @@ struct amdgpu_device {
>   	char				product_name[32];
>   	char				serial[20];
>   
> -	struct amdgpu_autodump		autodump;
> -
>   	atomic_t			throttling_logging_enabled;
>   	struct ratelimit_state		throttling_logging_rs;
>   	uint32_t                        ras_hw_enabled;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 277128846dd1..0b89ba142a59 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -27,7 +27,6 @@
>   #include <linux/pci.h>
>   #include <linux/uaccess.h>
>   #include <linux/pm_runtime.h>
> -#include <linux/poll.h>
>   
>   #include "amdgpu.h"
>   #include "amdgpu_pm.h"
> @@ -37,85 +36,7 @@
>   #include "amdgpu_securedisplay.h"
>   #include "amdgpu_fw_attestation.h"
>   
> -int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
> -{
>   #if defined(CONFIG_DEBUG_FS)
> -	unsigned long timeout = 600 * HZ;
> -	int ret;
> -
> -	wake_up_interruptible(&adev->autodump.gpu_hang);
> -
> -	ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
> -	if (ret == 0) {
> -		pr_err("autodump: timeout, move on to gpu recovery\n");
> -		return -ETIMEDOUT;
> -	}
> -#endif
> -	return 0;
> -}
> -
> -#if defined(CONFIG_DEBUG_FS)
> -
> -static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
> -{
> -	struct amdgpu_device *adev = inode->i_private;
> -	int ret;
> -
> -	file->private_data = adev;
> -
> -	ret = down_read_killable(&adev->reset_sem);
> -	if (ret)
> -		return ret;
> -
> -	if (adev->autodump.dumping.done) {
> -		reinit_completion(&adev->autodump.dumping);
> -		ret = 0;
> -	} else {
> -		ret = -EBUSY;
> -	}
> -
> -	up_read(&adev->reset_sem);
> -
> -	return ret;
> -}
> -
> -static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
> -{
> -	struct amdgpu_device *adev = file->private_data;
> -
> -	complete_all(&adev->autodump.dumping);
> -	return 0;
> -}
> -
> -static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
> -{
> -	struct amdgpu_device *adev = file->private_data;
> -
> -	poll_wait(file, &adev->autodump.gpu_hang, poll_table);
> -
> -	if (amdgpu_in_reset(adev))
> -		return POLLIN | POLLRDNORM | POLLWRNORM;
> -
> -	return 0;
> -}
> -
> -static const struct file_operations autodump_debug_fops = {
> -	.owner = THIS_MODULE,
> -	.open = amdgpu_debugfs_autodump_open,
> -	.poll = amdgpu_debugfs_autodump_poll,
> -	.release = amdgpu_debugfs_autodump_release,
> -};
> -
> -static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
> -{
> -	init_completion(&adev->autodump.dumping);
> -	complete_all(&adev->autodump.dumping);
> -	init_waitqueue_head(&adev->autodump.gpu_hang);
> -
> -	debugfs_create_file("amdgpu_autodump", 0600,
> -		adev_to_drm(adev)->primary->debugfs_root,
> -		adev, &autodump_debug_fops);
> -}
>   
>   /**
>    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
> @@ -1590,7 +1511,6 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   	}
>   
>   	amdgpu_ras_debugfs_create_all(adev);
> -	amdgpu_debugfs_autodump_init(adev);
>   	amdgpu_rap_debugfs_init(adev);
>   	amdgpu_securedisplay_debugfs_init(adev);
>   	amdgpu_fw_attestation_debugfs_init(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> index 141a8474e24f..8b641f40fdf6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> @@ -26,10 +26,6 @@
>   /*
>    * Debugfs
>    */
> -struct amdgpu_autodump {
> -	struct completion		dumping;
> -	struct wait_queue_head		gpu_hang;
> -};
>   
>   int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
>   int amdgpu_debugfs_init(struct amdgpu_device *adev);
> @@ -37,4 +33,3 @@ void amdgpu_debugfs_fini(struct amdgpu_device *adev);
>   void amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>   void amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
>   void amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
> -int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 41c6b3aacd37..4d34b2da8582 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4458,10 +4458,6 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   	if (reset_context->reset_req_dev == adev)
>   		job = reset_context->job;
>   
> -	/* no need to dump if device is not in good state during probe period */
> -	if (!adev->gmc.xgmi.pending_reset)
> -		amdgpu_debugfs_wait_dump(adev);
> -
>   	if (amdgpu_sriov_vf(adev)) {
>   		/* stop the data exchange thread */
>   		amdgpu_virt_fini_data_exchange(adev);


More information about the amd-gfx mailing list