[PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure
Zhou1, Tao
Tao.Zhou1 at amd.com
Mon Sep 2 03:14:12 UTC 2019
> -----Original Message-----
> From: Chen, Guchun <Guchun.Chen at amd.com>
> Sent: 2019年9月2日 10:13
> To: Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org;
> Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>; Li, Dennis
> <Dennis.Li at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: RE: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table
> record structure
>
>
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao
> Zhou
> Sent: Friday, August 30, 2019 8:25 PM
> To: amd-gfx at lists.freedesktop.org; Grodzovsky, Andrey
> <Andrey.Grodzovsky at amd.com>; Chen, Guchun <Guchun.Chen at amd.com>;
> Li, Dennis <Dennis.Li at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table
> record structure
>
> change bps type from retired page to eeprom table record, prepare for
> saving error records to eeprom
>
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 59 ++++++++++++++++-------
> -- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 11 +++--
> 2 files changed, 43 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2ca3997d4b3a..24663ec41248 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1187,14 +1187,14 @@ static int amdgpu_ras_badpages_read(struct
> amdgpu_device *adev,
>
> for (; i < data->count; i++) {
> (*bps)[i] = (struct ras_badpage){
> - .bp = data->bps[i].bp,
> + .bp = data->bps[i].retired_page,
> .size = AMDGPU_GPU_PAGE_SIZE,
> .flags = 0,
> };
>
> if (data->last_reserved <= i)
> (*bps)[i].flags = 1;
> - else if (data->bps[i].bo == NULL)
> + else if (data->bps_bo[i] == NULL)
> (*bps)[i].flags = 2;
> }
>
> @@ -1288,30 +1288,40 @@ static int
> amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, {
> unsigned int old_space = data->count + data->space_left;
> unsigned int new_space = old_space + pages;
> - unsigned int align_space = ALIGN(new_space, 1024);
> - void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
> -
> - if (!tmp)
> + unsigned int align_space = ALIGN(new_space, 512);
> [Guchun]Any special reason to change alignment from 512 to 1024?
[Tao] The old "data->bps" is 16 byte and new " struct eeprom_table_record bps" is 31 bytes on 64bit system, I'd like to lower the pressure on memory system. The value can be adjusted according to feedback in the future.
>
> + void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
> + struct amdgpu_bo **bps_bo =
> + kmalloc(align_space * sizeof(*data->bps_bo),
> GFP_KERNEL);
> +
> + if (!bps || !bps_bo) {
> + kfree(bps);
> + kfree(bps_bo);
> return -ENOMEM;
> + }
>
> if (data->bps) {
> - memcpy(tmp, data->bps,
> + memcpy(bps, data->bps,
> data->count * sizeof(*data->bps));
> kfree(data->bps);
> }
> + if (data->bps_bo) {
> + memcpy(bps_bo, data->bps_bo,
> + data->count * sizeof(*data->bps_bo));
> + kfree(data->bps_bo);
> + }
>
> - data->bps = tmp;
> + data->bps = bps;
> + data->bps_bo = bps_bo;
> data->space_left += align_space - old_space;
> return 0;
> }
>
> /* it deal with vram only. */
> int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
> - unsigned long *bps, int pages)
> + struct eeprom_table_record *bps, int pages)
> {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> struct ras_err_handler_data *data;
> - int i = pages;
> int ret = 0;
>
> if (!con || !con->eh_data || !bps || pages <= 0) @@ -1328,10
> +1338,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
> goto out;
> }
>
> - while (i--)
> - data->bps[data->count++].bp = bps[i];
> -
> + memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
> + data->count += pages;
> data->space_left -= pages;
> +
> out:
> mutex_unlock(&con->recovery_lock);
>
> @@ -1356,13 +1366,13 @@ int amdgpu_ras_reserve_bad_pages(struct
> amdgpu_device *adev)
> goto out;
> /* reserve vram at driver post stage. */
> for (i = data->last_reserved; i < data->count; i++) {
> - bp = data->bps[i].bp;
> + bp = data->bps[i].retired_page;
>
> if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
> PAGE_SIZE, &bo))
> DRM_ERROR("RAS ERROR: reserve vram %llx fail\n",
> bp);
>
> - data->bps[i].bo = bo;
> + data->bps_bo[i] = bo;
> data->last_reserved = i + 1;
> }
> out:
> @@ -1387,11 +1397,11 @@ static int amdgpu_ras_release_bad_pages(struct
> amdgpu_device *adev)
> goto out;
>
> for (i = data->last_reserved - 1; i >= 0; i--) {
> - bo = data->bps[i].bo;
> + bo = data->bps_bo[i];
>
> amdgpu_ras_release_vram(adev, &bo);
>
> - data->bps[i].bo = bo;
> + data->bps_bo[i] = bo;
> data->last_reserved = i;
> }
> out:
> @@ -1407,12 +1417,19 @@ static int amdgpu_ras_save_bad_pages(struct
> amdgpu_device *adev)
> return 0;
> }
>
> +/*
> + * read error record array in eeprom and reserve enough space for
> + * storing new bad pages
> + */
> static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) {
> - /* TODO
> - * read the array to eeprom when SMU disabled.
> - */
> - return 0;
> + struct eeprom_table_record *bps = NULL;
> + int ret;
> +
> + ret = amdgpu_ras_add_bad_pages(adev, bps,
> + adev->umc.max_ras_err_cnt_per_query);
> +
> + return ret;
> }
>
> static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 66b71525446e..b6bac873c588 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -351,11 +351,10 @@ struct ras_err_data { };
>
> struct ras_err_handler_data {
> - /* point to bad pages array */
> - struct {
> - unsigned long bp;
> - struct amdgpu_bo *bo;
> - } *bps;
> + /* point to bad page records array */
> + struct eeprom_table_record *bps;
> + /* point to reserved bo array */
> + struct amdgpu_bo **bps_bo;
> /* the count of entries */
> int count;
> /* the space can place new entries */
> @@ -492,7 +491,7 @@ unsigned long
> amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>
> /* error handling functions */
> int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
> - unsigned long *bps, int pages);
> + struct eeprom_table_record *bps, int pages);
>
> int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev);
>
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
More information about the amd-gfx
mailing list