[PATCH 4/4] drm/amdgpu: read sdma edc counter to clear the counters
Hawking Zhang
Hawking.Zhang at amd.com
Wed Jan 8 16:17:21 UTC 2020
SDMA edc counter registers were added in gfx edc counters
array. When querying gfx error counter in that array, there
is no way to differentiate sdma instance number for different
asic and then results to NULL pointer access when trying to
read sdma register base address for instances greater
than 2 on Vega20.
In addition, this also results to wrong gfx error counters
since it actually added sdma edc counters.
Therefore, sdma edc counter registers should be separated
from gfx edc counter regsiter array and only get initialized
when driver tries to enable sdma ras.
Change-Id: I206917f9d7b81670a8fed84dc749085ce5a6f678
Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 11 +----------
drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 7 +++++++
2 files changed, 8 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 33d1c57aaaf1..c9ade16bbcc3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4038,14 +4038,6 @@ static const struct soc15_reg_entry sec_ded_counter_registers[] = {
{ SOC15_REG_ENTRY(GC, 0, mmTCA_EDC_CNT), 0, 1, 2},
{ SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT3), 0, 4, 6},
{ SOC15_REG_ENTRY(HDP, 0, mmHDP_EDC_CNT), 0, 1, 1},
- { SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER), 0, 1, 1},
- { SOC15_REG_ENTRY(SDMA1, 0, mmSDMA1_EDC_COUNTER), 0, 1, 1},
- { SOC15_REG_ENTRY(SDMA2, 0, mmSDMA2_EDC_COUNTER), 0, 1, 1},
- { SOC15_REG_ENTRY(SDMA3, 0, mmSDMA3_EDC_COUNTER), 0, 1, 1},
- { SOC15_REG_ENTRY(SDMA4, 0, mmSDMA4_EDC_COUNTER), 0, 1, 1},
- { SOC15_REG_ENTRY(SDMA5, 0, mmSDMA5_EDC_COUNTER), 0, 1, 1},
- { SOC15_REG_ENTRY(SDMA6, 0, mmSDMA6_EDC_COUNTER), 0, 1, 1},
- { SOC15_REG_ENTRY(SDMA7, 0, mmSDMA7_EDC_COUNTER), 0, 1, 1},
};
static int gfx_v9_0_do_edc_gds_workarounds(struct amdgpu_device *adev)
@@ -4109,7 +4101,6 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
adev->gfx.config.max_sh_per_se;
int sgpr_work_group_size = 5;
int gpr_reg_size = compute_dim_x / 16 + 6;
- int sec_ded_counter_reg_size = adev->sdma.num_instances + 34;
/* only support when RAS is enabled */
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
@@ -4249,7 +4240,7 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
/* read back registers to clear the counters */
mutex_lock(&adev->grbm_idx_mutex);
- for (i = 0; i < sec_ded_counter_reg_size; i++) {
+ for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++) {
for (j = 0; j < sec_ded_counter_registers[i].se_num; j++) {
for (k = 0; k < sec_ded_counter_registers[i].instance; k++) {
gfx_v9_0_select_se_sh(adev, j, 0x0, k);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index fd20594b6d6e..f4107f9b75f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1802,6 +1802,13 @@ static int sdma_v4_0_late_init(void *handle)
struct ras_ih_if ih_info = {
.cb = sdma_v4_0_process_ras_data_cb,
};
+ int i;
+
+ /* read back edc counter registers to clear the counters */
+ if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
+ for (i = 0; i < adev->sdma.num_instances; i++)
+ RREG32_SDMA(i, mmSDMA0_EDC_COUNTER);
+ }
return adev->sdma.funcs->ras_late_init(adev, &ih_info);
}
--
2.17.1
More information about the amd-gfx
mailing list