[PATCH 2/2] drm/amdgpu: Add system auto reboot to RAS.

Thu Aug 29 07:33:25 UTC 2019

Am 28.08.19 um 22:00 schrieb Andrey Grodzovsky:
> In case of a RAS error, allow the user to configure automatic system
> reboot through ras_ctrl.
> This is also part of the temporary workaround for the RAS
> hang problem.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 ++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 10 +++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  1 +
>   3 files changed, 28 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 3ecee10..f1cff47 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3805,6 +3805,24 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	int i, r = 0;
>   	bool in_ras_intr = amdgpu_ras_intr_triggered();
>   
> +	/*
> +	 * Flush RAM to disk so that after reboot
> +	 * the user can read log and see why the system rebooted.
> +	 *
> +	 * Using a user mode app call instead of kernel APIs such as
> +	 * ksys_sync_helper for backward compatibility with earlier
> +	 * kernels, for which this change is also intended.
> +	 */
> +	if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) {
> +		char *envp[] = { "HOME=/", NULL };
> +		char *argv[] = { "/bin/sync", NULL };
> +
> +		DRM_WARN("Emergency reboot.");
> +
> +		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
> +		emergency_restart();
> +	}
> +
>   	need_full_reset = job_signaled = false;
>   	INIT_LIST_HEAD(&device_list);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 086e6df..423a1ba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -30,6 +30,7 @@
>   #include "amdgpu_ras.h"
>   #include "amdgpu_atomfirmware.h"
>   #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
> +#include <linux/kmod.h>
>   
>   const char *ras_error_string[] = {
>   	"none",
> @@ -154,6 +155,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
>   		op = 1;
>   	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
>   		op = 2;
> +	else if (sscanf(str, "reboot %32s", block_name) == 1)
> +		op = 3;
>   	else if (str[0] && str[1] && str[2] && str[3])
>   		/* ascii string, but commands are not matched. */
>   		return -EINVAL;

This is actually becoming quite a mess. We should consider removing the 
parsing in the long term and using separate debugfs files for each action.

Christian.

> @@ -287,6 +290,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
>   		/* data.inject.address is offset instead of absolute gpu address */
>   		ret = amdgpu_ras_error_inject(adev, &data.inject);
>   		break;
> +	case 3:
> +		amdgpu_ras_get_context(adev)->reboot = true;
> +		break;
>   	default:
>   		ret = -EINVAL;
>   		break;
> @@ -1733,6 +1739,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
>   void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
>   {
>   	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
> -		DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");
> +		DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n");
> +
> +		amdgpu_ras_reset_gpu(adev, false);
>   	}
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index c0e22af..e3f0764 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -333,6 +333,7 @@ struct amdgpu_ras {
>   	struct mutex recovery_lock;
>   
>   	uint32_t flags;
> +	bool reboot;
>   };
>   
>   struct ras_fs_data {