[PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of ECC/EDC.

Deucher, Alexander Alexander.Deucher at amd.com
Tue Jun 6 21:22:28 UTC 2017


> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces at lists.freedesktop.org] On Behalf
> Of David Panariti
> Sent: Tuesday, June 06, 2017 4:33 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Panariti, David
> Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of
> ECC/EDC.
> 
> Allow various kinds of memory integrity methods (e.g. ECC/EDC) to be
> enabled
> or disabled.  By default, all features are disabled.
> 
> EDC is Error Detection and Correction.  It can detect ECC errors and do 0 or
> more of: count SEC (single error corrected) and DED (double error detected,
> i.e. uncorrected ECC error), halt the affected block, interrupt the CPU.
> Currently, only counting errors is supported.
> 
> Fixed a whitespace error.
> 
> Signed-off-by: David Panariti <David.Panariti at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 28
> ++++++++++++++++++++++++++--
>  drivers/gpu/drm/amd/include/amd_shared.h | 14 ++++++++++++++
>  4 files changed, 45 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index a6f51eb..3e930ee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -116,6 +116,7 @@ extern int amdgpu_cntl_sb_buf_per_se;
>  extern int amdgpu_param_buf_per_se;
>  extern int amdgpu_job_hang_limit;
>  extern int amdgpu_lbpw;
> +extern unsigned amdgpu_ecc_flags;
> 
>  #define AMDGPU_DEFAULT_GTT_SIZE_MB		3072ULL /* 3GB by
> default */
>  #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS	        3000
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index c4825ff..972660d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -115,6 +115,7 @@ int amdgpu_cntl_sb_buf_per_se = 0;
>  int amdgpu_param_buf_per_se = 0;
>  int amdgpu_job_hang_limit = 0;
>  int amdgpu_lbpw = -1;
> +unsigned amdgpu_ecc_flags = 0;

I'd suggest setting amdgpu_ecc_flags to AMD_ECC_SUPPORT_BEST by default, and letting that flag select our ASIC-specific default setting.  In the case of CZ, that would mean EDC stays disabled until we decide to enable it by default.

> 
>  MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in
> megabytes");
>  module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
> @@ -795,6 +796,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
>  	.driver.pm = &amdgpu_pm_ops,
>  };
> 
> +MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable
> ECC/EDC (default))");
> +module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
> +
> 
> 
>  static int __init amdgpu_init(void)
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 46e766e..3b5685c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -1824,7 +1824,17 @@ static int
> gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>  	if (dis_bit) {
>  		/* On Carrizo, EDC may be permanently disabled by a fuse. */
>  		DRM_INFO("CZ EDC hardware is disabled, GC_EDC_CONFIG:
> 0x%08x.\n",
> -			tmp);
> +			 tmp);
> +		return 0;
> +	}
> +
> +	/*
> +	 * Check if EDC has been requested by a kernel parameter.
> +	 * For Carrizo, EDC is the best/safest mode WRT error handling.
> +	 */
> +	if (!(amdgpu_ecc_flags
> +	      & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) {
> +		DRM_INFO("EDC support has not been requested.\n");

This debugging statement can be dropped or changed to DRM_DEBUG.

>  		return 0;
>  	}
> 
> @@ -1962,6 +1972,20 @@ static int
> gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>  		goto fail;
>  	}
> 
> +	/* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do not halt
> */
> +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
> +	/* Do not propagate the errors to the next block. */
> +	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
> +	WREG32(mmGB_EDC_MODE, tmp);
> +
> +	tmp = RREG32(mmCC_GC_EDC_CONFIG);
> +
> +	/*
> +	 * Clear EDC_DISABLE bit so the counters are available.
> +	 */
> +	tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0);
> +	WREG32(mmCC_GC_EDC_CONFIG, tmp);
> +
>  	gfx_v8_0_edc_clear_counters(adev);
> 
>  fail:
> @@ -5548,7 +5572,7 @@ static int gfx_v8_0_pre_soft_reset(void *handle)
>  		gfx_v8_0_cp_compute_enable(adev, false);
>  	}
> 
> -       return 0;
> +	return 0;
>  }
> 
>  static int gfx_v8_0_soft_reset(void *handle)
> diff --git a/drivers/gpu/drm/amd/include/amd_shared.h
> b/drivers/gpu/drm/amd/include/amd_shared.h
> index 0f58e95..1cf30aa 100644
> --- a/drivers/gpu/drm/amd/include/amd_shared.h
> +++ b/drivers/gpu/drm/amd/include/amd_shared.h
> @@ -187,6 +187,20 @@ enum amd_fan_ctrl_mode {
>  #define AMD_PG_SUPPORT_GFX_QUICK_MG		(1 << 11)
>  #define AMD_PG_SUPPORT_GFX_PIPELINE		(1 << 12)
> 
> +/*
> + * ECC flags
> + * Allows the user to choose what kind of error detection/correction is
> used.
> + * Currently, EDC is supported on Carrizo.
> + *
> + * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have the
> driver
> + * set what it thinks is best/safest mode.  This may not be the same as the
> + * default, depending on the GPU and the application.
> + * Using a single bit makes it easy to request the best support without
> + * needing to know all currently supported modes.
> + */
> +#define AMD_ECC_SUPPORT_BEST			(1 << 0)
> +#define AMD_ECC_SUPPORT_EDC			(1 << 1)
> +
>  enum amd_pm_state_type {
>  	/* not used for dpm */
>  	POWER_STATE_TYPE_DEFAULT,
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


More information about the amd-gfx mailing list