[PATCH] drm/amdgpu: revert "Add autodump debugfs node for gpu reset v8"

Alex Deucher alexdeucher at gmail.com
Mon Oct 4 13:32:03 UTC 2021


Acked-by: Alex Deucher <alexander.deucher at amd.com>

On Mon, Oct 4, 2021 at 4:31 AM Christian König
<ckoenig.leichtzumerken at gmail.com> wrote:
>
> Ping? Alex, any objections to this?
>
> Otherwise I'm going to push it with Nirmoy's acked-by.
>
> Christian.
>
> Am 30.09.21 um 11:26 schrieb Christian König:
> > This reverts commit 728e7e0cd61899208e924472b9e641dbeb0775c4.
> >
> > Further discussion reveals that this feature is severely broken
> > and needs to be reverted ASAP.
> >
> > GPU reset can never be delayed by userspace, even for debugging;
> > otherwise we can run into in-kernel deadlocks.
> >
> > Signed-off-by: Christian König <christian.koenig at amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 -
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 80 ---------------------
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  5 --
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 --
> >   4 files changed, 91 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index dc3c6b3a00e5..6a1928a720a6 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -1078,8 +1078,6 @@ struct amdgpu_device {
> >       char                            product_name[32];
> >       char                            serial[20];
> >
> > -     struct amdgpu_autodump          autodump;
> > -
> >       atomic_t                        throttling_logging_enabled;
> >       struct ratelimit_state          throttling_logging_rs;
> >       uint32_t                        ras_hw_enabled;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > index 277128846dd1..0b89ba142a59 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> > @@ -27,7 +27,6 @@
> >   #include <linux/pci.h>
> >   #include <linux/uaccess.h>
> >   #include <linux/pm_runtime.h>
> > -#include <linux/poll.h>
> >
> >   #include "amdgpu.h"
> >   #include "amdgpu_pm.h"
> > @@ -37,85 +36,7 @@
> >   #include "amdgpu_securedisplay.h"
> >   #include "amdgpu_fw_attestation.h"
> >
> > -int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
> > -{
> >   #if defined(CONFIG_DEBUG_FS)
> > -     unsigned long timeout = 600 * HZ;
> > -     int ret;
> > -
> > -     wake_up_interruptible(&adev->autodump.gpu_hang);
> > -
> > -     ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
> > -     if (ret == 0) {
> > -             pr_err("autodump: timeout, move on to gpu recovery\n");
> > -             return -ETIMEDOUT;
> > -     }
> > -#endif
> > -     return 0;
> > -}
> > -
> > -#if defined(CONFIG_DEBUG_FS)
> > -
> > -static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
> > -{
> > -     struct amdgpu_device *adev = inode->i_private;
> > -     int ret;
> > -
> > -     file->private_data = adev;
> > -
> > -     ret = down_read_killable(&adev->reset_sem);
> > -     if (ret)
> > -             return ret;
> > -
> > -     if (adev->autodump.dumping.done) {
> > -             reinit_completion(&adev->autodump.dumping);
> > -             ret = 0;
> > -     } else {
> > -             ret = -EBUSY;
> > -     }
> > -
> > -     up_read(&adev->reset_sem);
> > -
> > -     return ret;
> > -}
> > -
> > -static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
> > -{
> > -     struct amdgpu_device *adev = file->private_data;
> > -
> > -     complete_all(&adev->autodump.dumping);
> > -     return 0;
> > -}
> > -
> > -static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
> > -{
> > -     struct amdgpu_device *adev = file->private_data;
> > -
> > -     poll_wait(file, &adev->autodump.gpu_hang, poll_table);
> > -
> > -     if (amdgpu_in_reset(adev))
> > -             return POLLIN | POLLRDNORM | POLLWRNORM;
> > -
> > -     return 0;
> > -}
> > -
> > -static const struct file_operations autodump_debug_fops = {
> > -     .owner = THIS_MODULE,
> > -     .open = amdgpu_debugfs_autodump_open,
> > -     .poll = amdgpu_debugfs_autodump_poll,
> > -     .release = amdgpu_debugfs_autodump_release,
> > -};
> > -
> > -static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
> > -{
> > -     init_completion(&adev->autodump.dumping);
> > -     complete_all(&adev->autodump.dumping);
> > -     init_waitqueue_head(&adev->autodump.gpu_hang);
> > -
> > -     debugfs_create_file("amdgpu_autodump", 0600,
> > -             adev_to_drm(adev)->primary->debugfs_root,
> > -             adev, &autodump_debug_fops);
> > -}
> >
> >   /**
> >    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
> > @@ -1590,7 +1511,6 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
> >       }
> >
> >       amdgpu_ras_debugfs_create_all(adev);
> > -     amdgpu_debugfs_autodump_init(adev);
> >       amdgpu_rap_debugfs_init(adev);
> >       amdgpu_securedisplay_debugfs_init(adev);
> >       amdgpu_fw_attestation_debugfs_init(adev);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> > index 141a8474e24f..8b641f40fdf6 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
> > @@ -26,10 +26,6 @@
> >   /*
> >    * Debugfs
> >    */
> > -struct amdgpu_autodump {
> > -     struct completion               dumping;
> > -     struct wait_queue_head          gpu_hang;
> > -};
> >
> >   int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
> >   int amdgpu_debugfs_init(struct amdgpu_device *adev);
> > @@ -37,4 +33,3 @@ void amdgpu_debugfs_fini(struct amdgpu_device *adev);
> >   void amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
> >   void amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
> >   void amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
> > -int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 41c6b3aacd37..4d34b2da8582 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -4458,10 +4458,6 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> >       if (reset_context->reset_req_dev == adev)
> >               job = reset_context->job;
> >
> > -     /* no need to dump if device is not in good state during probe period */
> > -     if (!adev->gmc.xgmi.pending_reset)
> > -             amdgpu_debugfs_wait_dump(adev);
> > -
> >       if (amdgpu_sriov_vf(adev)) {
> >               /* stop the data exchange thread */
> >               amdgpu_virt_fini_data_exchange(adev);
>


More information about the amd-gfx mailing list