[PATCH 09/21] drm/amdgpu: do RAS MCA2PA conversion in device init phase
Tao Zhou
tao.zhou1 at amd.com
Tue Nov 19 06:35:52 UTC 2024
NPS mode is introduced, the value of memory physical address (PA)
related to a MCA address varies per nps mode. We need to rely on
MCA address and convert it into PA accroding to nps mode.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 97 ++++++++++++++++++++++---
1 file changed, 85 insertions(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a529aec19527..f2018c3e96a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2664,40 +2664,113 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
return 0;
}
+static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps,
+ struct ras_err_data *err_data)
+{
+ struct ta_ras_query_address_input addr_in;
+ uint32_t socket = 0;
+ int ret = 0;
+
+ if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
+ socket = adev->smuio.funcs->get_socket_id(adev);
+
+ /* reinit err_data */
+ err_data->err_addr_cnt = 0;
+ err_data->err_addr_len = adev->umc.retire_unit;
+
+ memset(&addr_in, 0, sizeof(addr_in));
+ addr_in.ma.err_addr = bps->address;
+ addr_in.ma.socket_id = socket;
+ addr_in.ma.ch_inst = bps->mem_channel;
+ /* tell RAS TA the node instance is not used */
+ addr_in.ma.node_inst = TA_RAS_INV_NODE;
+
+ if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
+ ret = adev->umc.ras->convert_ras_err_addr(adev, err_data,
+ &addr_in, NULL, false);
+
+ return ret;
+}
+
/* it deal with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data;
+ struct ras_err_data err_data;
+ struct eeprom_table_record *err_rec;
+ struct amdgpu_ras_eeprom_control *control =
+ &adev->psp.ras_context.ras->eeprom_control;
int ret = 0;
- uint32_t i;
+ uint32_t i, j, loop_cnt = 1;
+ bool is_mca_add = true;
if (!con || !con->eh_data || !bps || pages <= 0)
return 0;
+ if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) {
+ is_mca_add = false;
+ } else {
+ if ((pages > 1) &&
+ (bps[0].address == bps[1].address) &&
+ (bps[0].mem_channel == bps[1].mem_channel))
+ is_mca_add = false;
+ }
+
mutex_lock(&con->recovery_lock);
data = con->eh_data;
if (!data)
goto out;
- for (i = 0; i < pages; i++) {
- if (amdgpu_ras_check_bad_page_unlock(con,
- bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
- continue;
-
- if (!data->space_left &&
- amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
+ if (is_mca_add) {
+ err_data.err_addr =
+ kcalloc(adev->umc.retire_unit,
+ sizeof(struct eeprom_table_record), GFP_KERNEL);
+ if(!err_data.err_addr) {
+ dev_warn(adev->dev, "Failed to alloc UMC error address record in "
+ "mca2pa conversion!\n");
ret = -ENOMEM;
goto out;
}
- amdgpu_ras_reserve_page(adev, bps[i].retired_page);
+ loop_cnt = adev->umc.retire_unit;
+ }
+
+ for (i = 0; i < pages; i++) {
+ if (is_mca_add) {
+ if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data))
+ goto free;
+
+ err_rec = err_data.err_addr;
+ } else {
+ err_rec = &bps[i];
+ }
+
+ for (j = 0; j < loop_cnt; j++) {
+ if (amdgpu_ras_check_bad_page_unlock(con,
+ err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ continue;
+
+ if (!data->space_left &&
+ amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
- memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
- data->count++;
- data->space_left--;
+ memcpy(&data->bps[data->count], &(err_rec[j]),
+ sizeof(struct eeprom_table_record));
+ data->count++;
+ data->space_left--;
+ }
}
+
+free:
+ if (is_mca_add)
+ kfree(err_data.err_addr);
out:
mutex_unlock(&con->recovery_lock);
--
2.34.1
More information about the amd-gfx
mailing list