[PATCH 154/159] drm/amdgpu: enable watchdog feature for SQ of aldebaran
Alex Deucher
alexander.deucher at amd.com
Wed Feb 24 22:18:54 UTC 2021
From: Dennis Li <Dennis.Li at amd.com>
SQ's watchdog timer monitors forward progress, a mask of which waves
caused the watchdog timeout is recorded into ras status registers and
then trigger a system fatal error event.
v2:
1. change *query_timeout_status to *query_sq_timeout_status.
2. move query_sq_timeout_status into amdgpu_ras_do_recovery.
3. add module parameters to enable/disable fatal error event and modify
the watchdog timer.
v3:
1. remove unused parameters of *enable_watchdog_timer
Signed-off-by: Dennis Li <Dennis.Li at amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 18 ++++
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 2 +
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 5 +
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 106 ++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h | 3 +
drivers/gpu/drm/amd/amdgpu/soc15_common.h | 30 ++++++
8 files changed, 174 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 361e84369b6c..f82ad1b8eed5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -127,6 +127,12 @@ struct amdgpu_mgpu_info
uint32_t num_apu;
};
+struct amdgpu_watchdog_timer
+{
+ bool timeout_fatal_disable;
+ uint32_t period; /* maxCycles = (1 << period), the number of cycles before a timeout */
+};
+
#define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH 256
/*
@@ -186,6 +192,7 @@ extern struct amdgpu_mgpu_info mgpu_info;
extern int amdgpu_ras_enable;
extern uint amdgpu_ras_mask;
extern int amdgpu_bad_page_threshold;
+extern struct amdgpu_watchdog_timer amdgpu_watchdog_timer;
extern int amdgpu_async_gfx_ring;
extern int amdgpu_mcbp;
extern int amdgpu_discovery;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index c4d822b46ea4..42654deb9c55 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -172,6 +172,10 @@ struct amdgpu_mgpu_info mgpu_info = {
int amdgpu_ras_enable = -1;
uint amdgpu_ras_mask = 0xffffffff;
int amdgpu_bad_page_threshold = 100;
+struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
+ .timeout_fatal_disable = false,
+ .period = 0x3f, /* about 8s */
+};
/**
* DOC: vramlimit (int)
@@ -527,6 +531,20 @@ module_param_named(ras_enable, amdgpu_ras_enable, int, 0444);
MODULE_PARM_DESC(ras_mask, "Mask of RAS features to enable (default 0xffffffff), only valid when ras_enable == 1");
module_param_named(ras_mask, amdgpu_ras_mask, uint, 0444);
+/**
+ * DOC: timeout_fatal_disable (bool)
+ * Disable Watchdog timeout fatal error event
+ */
+MODULE_PARM_DESC(timeout_fatal_disable, "disable watchdog timeout fatal error (false = default)");
+module_param_named(timeout_fatal_disable, amdgpu_watchdog_timer.timeout_fatal_disable, bool, 0644);
+
+/**
+ * DOC: timeout_period (uint)
+ * Modify the watchdog timeout max_cycles as (1 << period)
+ */
+MODULE_PARM_DESC(timeout_period, "watchdog timeout period (0x1F = default), timeout maxCycles = (1 << period)");
+module_param_named(timeout_period, amdgpu_watchdog_timer.period, uint, 0644);
+
/**
* DOC: si_support (int)
* Set SI support driver. This parameter works after set config CONFIG_DRM_AMDGPU_SI. For SI asic, when radeon driver is enabled,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 1ab9632282d4..d92f0f14cbeb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -226,6 +226,8 @@ struct amdgpu_gfx_funcs {
void (*init_spm_golden)(struct amdgpu_device *adev);
void (*query_ras_error_status) (struct amdgpu_device *adev);
void (*update_perfmon_mgcg)(struct amdgpu_device *adev, bool enable);
+ void (*enable_watchdog_timer)(struct amdgpu_device *adev);
+ void (*query_sq_timeout_status)(struct amdgpu_device *adev);
};
struct sq_work {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 09546dec40ff..5805c78c356b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1467,6 +1467,9 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
case AMDGPU_RAS_BLOCK__GFX:
if (adev->gfx.funcs->query_ras_error_status)
adev->gfx.funcs->query_ras_error_status(adev);
+
+ if (adev->gfx.funcs->query_sq_timeout_status)
+ adev->gfx.funcs->query_sq_timeout_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
if (adev->mmhub.funcs->query_ras_error_status)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 977eeec70e1b..c37139d2c487 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2121,6 +2121,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_2_gfx_funcs = {
.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
.query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
+ .enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
+ .query_sq_timeout_status = &gfx_v9_4_2_query_sq_timeout_status,
};
static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
@@ -3965,6 +3967,9 @@ static int gfx_v9_0_hw_init(void *handle)
if (adev->asic_type == CHIP_ALDEBARAN)
gfx_v9_4_2_set_power_brake_sequence(adev);
+ if (adev->gfx.funcs->enable_watchdog_timer)
+ adev->gfx.funcs->enable_watchdog_timer(adev);
+
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index b2e2026c3ec7..1faeae14ead9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -1129,3 +1129,109 @@ void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev)
gfx_v9_4_2_query_ea_err_status(adev);
gfx_v9_4_2_query_utc_err_status(adev);
}
+
+void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev)
+{
+ uint32_t i;
+ uint32_t data;
+
+ data = REG_SET_FIELD(0, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE,
+ amdgpu_watchdog_timer.timeout_fatal_disable ? 1 :
+ 0);
+ data = REG_SET_FIELD(data, SQ_TIMEOUT_CONFIG, PERIOD_SEL,
+ amdgpu_watchdog_timer.period);
+
+ mutex_lock(&adev->grbm_idx_mutex);
+ for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
+ gfx_v9_4_2_select_se_sh(adev, i, 0xffffffff, 0xffffffff);
+ WREG32_SOC15(GC, 0, regSQ_TIMEOUT_CONFIG, data);
+ }
+ gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+ mutex_unlock(&adev->grbm_idx_mutex);
+}
+
+static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address)
+{
+ WREG32_SOC15_RLC_EX(reg, GC, 0, regSQ_IND_INDEX,
+ (wave << SQ_IND_INDEX__WAVE_ID__SHIFT) |
+ (simd << SQ_IND_INDEX__SIMD_ID__SHIFT) |
+ (address << SQ_IND_INDEX__INDEX__SHIFT) |
+ (SQ_IND_INDEX__FORCE_READ_MASK));
+ return RREG32_SOC15(GC, 0, regSQ_IND_DATA);
+}
+
+static void gfx_v9_4_2_log_cu_timeout_status(struct amdgpu_device *adev,
+ uint32_t status)
+{
+ struct amdgpu_cu_info *cu_info = &adev->gfx.cu_info;
+ uint32_t i, simd, wave;
+ uint32_t wave_status;
+ uint32_t wave_pc_lo, wave_pc_hi;
+ uint32_t wave_exec_lo, wave_exec_hi;
+ uint32_t wave_inst_dw0, wave_inst_dw1;
+ uint32_t wave_ib_sts;
+
+ for (i = 0; i < 32; i++) {
+ if (!((i << 1) & status))
+ continue;
+
+ simd = i / cu_info->max_waves_per_simd;
+ wave = i % cu_info->max_waves_per_simd;
+
+ wave_status = wave_read_ind(adev, simd, wave, ixSQ_WAVE_STATUS);
+ wave_pc_lo = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_LO);
+ wave_pc_hi = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_HI);
+ wave_exec_lo =
+ wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_LO);
+ wave_exec_hi =
+ wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_HI);
+ wave_inst_dw0 =
+ wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW0);
+ wave_inst_dw1 =
+ wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW1);
+ wave_ib_sts = wave_read_ind(adev, simd, wave, ixSQ_WAVE_IB_STS);
+
+ dev_info(
+ adev->dev,
+ "\t SIMD %d, Wave %d: status 0x%x, pc 0x%llx, exec 0x%llx, inst 0x%llx, ib_sts 0x%x\n",
+ simd, wave, wave_status,
+ ((uint64_t)wave_pc_hi << 32 | wave_pc_lo),
+ ((uint64_t)wave_exec_hi << 32 | wave_exec_lo),
+ ((uint64_t)wave_inst_dw1 << 32 | wave_inst_dw0),
+ wave_ib_sts);
+ }
+}
+
+void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev)
+{
+ uint32_t se_idx, sh_idx, cu_idx;
+ uint32_t status;
+
+ mutex_lock(&adev->grbm_idx_mutex);
+ for (se_idx = 0; se_idx < adev->gfx.config.max_shader_engines;
+ se_idx++) {
+ for (sh_idx = 0; sh_idx < adev->gfx.config.max_sh_per_se;
+ sh_idx++) {
+ for (cu_idx = 0;
+ cu_idx < adev->gfx.config.max_cu_per_sh;
+ cu_idx++) {
+ gfx_v9_4_2_select_se_sh(adev, se_idx, sh_idx,
+ cu_idx);
+ status = RREG32_SOC15(GC, 0,
+ regSQ_TIMEOUT_STATUS);
+ if (status != 0) {
+ dev_info(
+ adev->dev,
+ "GFX Watchdog Timeout: SE %d, SH %d, CU %d\n",
+ se_idx, sh_idx, cu_idx);
+ gfx_v9_4_2_log_cu_timeout_status(
+ adev, status);
+ }
+ /* clear old status */
+ WREG32_SOC15(GC, 0, regSQ_TIMEOUT_STATUS, 0);
+ }
+ }
+ }
+ gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+ mutex_unlock(&adev->grbm_idx_mutex);
+}
\ No newline at end of file
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
index d7e3041947e8..e01fa6afa8e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
@@ -35,4 +35,7 @@ int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_if);
void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev);
int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status);
+
+void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev);
+void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev);
#endif /* __GFX_V9_4_2_H__ */
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15_common.h b/drivers/gpu/drm/amd/amdgpu/soc15_common.h
index 52ffbea63a4f..8cdf5d1685cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15_common.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15_common.h
@@ -100,6 +100,30 @@
} \
} while (0)
+#define WREG32_RLC_EX(prefix, reg, value) \
+ do { \
+ if (amdgpu_sriov_fullaccess(adev)) { \
+ uint32_t i = 0; \
+ uint32_t retries = 50000; \
+ uint32_t r0 = adev->reg_offset[GC_HWIP][0][prefix##SCRATCH_REG0_BASE_IDX] + prefix##SCRATCH_REG0; \
+ uint32_t r1 = adev->reg_offset[GC_HWIP][0][prefix##SCRATCH_REG1_BASE_IDX] + prefix##SCRATCH_REG1; \
+ uint32_t spare_int = adev->reg_offset[GC_HWIP][0][prefix##RLC_SPARE_INT_BASE_IDX] + prefix##RLC_SPARE_INT; \
+ WREG32(r0, value); \
+ WREG32(r1, (reg | 0x80000000)); \
+ WREG32(spare_int, 0x1); \
+ for (i = 0; i < retries; i++) { \
+ u32 tmp = RREG32(r1); \
+ if (!(tmp & 0x80000000)) \
+ break; \
+ udelay(10); \
+ } \
+ if (i >= retries) \
+ pr_err("timeout: rlcg program reg:0x%05x failed !\n", reg); \
+ } else { \
+ WREG32(reg, value); \
+ } \
+ } while (0)
+
#define WREG32_SOC15_RLC_SHADOW(ip, inst, reg, value) \
do { \
uint32_t target_reg = adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg;\
@@ -142,6 +166,12 @@
WREG32_RLC(target_reg, value); \
} while (0)
+#define WREG32_SOC15_RLC_EX(prefix, ip, inst, reg, value) \
+ do { \
+ uint32_t target_reg = adev->reg_offset[GC_HWIP][0][reg##_BASE_IDX] + reg;\
+ WREG32_RLC_EX(prefix, target_reg, value); \
+ } while (0)
+
#define WREG32_FIELD15_RLC(ip, idx, reg, field, val) \
WREG32_RLC((adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg), \
(RREG32(adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg) \
--
2.29.2
More information about the amd-gfx
mailing list