[PATCH 1/9] drm/amdgpu: add bad page count threshold in module parameter
Guchun Chen
guchun.chen at amd.com
Thu Jul 23 08:33:38 UTC 2020
bad_page_threshold could be specified to detect and retire
bad GPU if faulty bad pages exceed it.
When it's -1, ras will use typical bad page failure value.
When it's 0, disable bad gpu check.
v2: correct documentation of this parameter.
Signed-off-by: Guchun Chen <guchun.chen at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 +++++++++++
2 files changed, 12 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 06bfb8658dec..bb83ffb5e26a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -181,6 +181,7 @@ extern uint amdgpu_dm_abm_level;
extern struct amdgpu_mgpu_info mgpu_info;
extern int amdgpu_ras_enable;
extern uint amdgpu_ras_mask;
+extern int amdgpu_bad_page_threshold;
extern int amdgpu_async_gfx_ring;
extern int amdgpu_mcbp;
extern int amdgpu_discovery;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index d28b95f721c4..5a517f2fce35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -161,6 +161,7 @@ struct amdgpu_mgpu_info mgpu_info = {
};
int amdgpu_ras_enable = -1;
uint amdgpu_ras_mask = 0xffffffff;
+int amdgpu_bad_page_threshold = -1;
/**
* DOC: vramlimit (int)
@@ -801,6 +802,16 @@ module_param_named(tmz, amdgpu_tmz, int, 0444);
MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)");
module_param_named(reset_method, amdgpu_reset_method, int, 0444);
+/**
+ * DOC: bad_page_threshold (int)
+ * Bad page threshold configuration is applied by GPU retirement policy.
+ * The detail is to specify the threshold value of faulty pages detected by
+ * ECC, that may result in GPU's retirement if total faulty pages by ECC
+ * exceed threshold value.
+ */
+MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default typical value), 0 = disable)");
+module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
+
static const struct pci_device_id pciidlist[] = {
#ifdef CONFIG_DRM_AMDGPU_SI
{0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI},
--
2.17.1
More information about the amd-gfx
mailing list