[PATCH] drm/amdgpu: add command to check address validity

Chai, Thomas YiPeng.Chai at amd.com
Thu Jul 17 07:40:07 UTC 2025


[AMD Official Use Only - AMD Internal Distribution Only]

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com>
Sent: Thursday, July 17, 2025 2:40 PM
To: Chai, Thomas <YiPeng.Chai at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
Subject: RE: [PATCH] drm/amdgpu: add command to check address validity

[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai at amd.com>
> Sent: Wednesday, July 16, 2025 2:48 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Chai, Thomas
> <YiPeng.Chai at amd.com>
> Subject: [PATCH] drm/amdgpu: add command to check address validity
>
> Add command to check address validity and remove unused command codes.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 58 +++++++++++--------------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 ++
>  2 files changed, 29 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 15bde4904996..68feec0956f0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -163,47 +163,38 @@ static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
>       return false;
>  }
>
> -static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
> +static int amdgpu_check_address_validity(struct amdgpu_device *adev,
> +uint64_t address)
>  {
> -     struct ras_err_data err_data;
> -     struct eeprom_table_record err_rec;
> -     int ret;
> +     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +     struct amdgpu_vram_block_info blk_info;
> +     uint64_t page_pfns[32] = {0};
> +     int i, ret, count;
> +
> +     if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0))
> +             return 0;
>
>       if ((address >= adev->gmc.mc_vram_size) ||
>           (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
>               dev_warn(adev->dev,
> -                      "RAS WARN: input address 0x%llx is invalid.\n",
> -                      address);
> +                 "RAS WARN: input address 0x%llx is invalid.\n",
> +                 address);
>               return -EINVAL;
>       }
>
> -     if (amdgpu_ras_check_bad_page(adev, address)) {
> -             dev_warn(adev->dev,
> -                      "RAS WARN: 0x%llx has already been marked as bad page!\n",
> -                      address);
> -             return 0;
> -     }
> -
> -     ret = amdgpu_ras_error_data_init(&err_data);
> -     if (ret)
> -             return ret;
> +     count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
> +                             address, page_pfns, ARRAY_SIZE(page_pfns));
> +     if (count <= 0)
> +             return -EPERM;
>
> -     memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
> -     err_data.err_addr = &err_rec;
> -     amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0);
> -
> -     if (amdgpu_bad_page_threshold != 0) {
> -             amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
> -                                      err_data.err_addr_cnt, false);
> -             amdgpu_ras_save_bad_pages(adev, NULL);
> +     for (i = 0; i < count; i++) {
> +             memset(&blk_info, 0, sizeof(blk_info));
> +             ret = amdgpu_vram_mgr_query_address_block_info(&adev->mman.vram_mgr,
> +                                     page_pfns[i] << AMDGPU_GPU_PAGE_SHIFT, &blk_info);
> +             if (!ret && (blk_info.task.pid == con->init_task_pid) &&
> +                     !strncmp(blk_info.task.comm, con->init_task_comm, TASK_COMM_LEN))
> +                     return -EACCES;
>       }
>
> -     amdgpu_ras_error_data_fini(&err_data);
> -
> -     dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
> -     dev_warn(adev->dev, "Clear EEPROM:\n");
> -     dev_warn(adev->dev, "    echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
> -
>       return 0;
>  }
>
> @@ -295,7 +286,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
>               op = 1;
>       else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
>               op = 2;
> -     else if (strstr(str, "retire_page") != NULL)
> +     else if (strstr(str, "check_address") != NULL)

> [Tao] The added check is fine with me, but I would prefer to keep the retire_page command; we can assign check_address to op 4.

[Thomas] ok.

>               op = 3;
>       else if (str[0] && str[1] && str[2] && str[3])
>               /* ascii string, but commands are not matched. */
> @@ -495,7 +486,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
>               return ret;
>
>       if (data.op == 3) {
> -             ret = amdgpu_reserve_page_direct(adev, data.inject.address);
> +             ret = amdgpu_check_address_validity(adev, data.inject.address);
>               if (!ret)
>                       return size;
>               else
> @@ -4103,6 +4094,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>                       goto release_con;
>       }
>
> +     con->init_task_pid = task_pid_nr(current);
> +     get_task_comm(con->init_task_comm, current);
> +
>       dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
>                "hardware ability[%x] ras_mask[%x]\n",
>                adev->ras_hw_enabled, adev->ras_enabled);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 927d6bff734a..7f10a7402160 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -570,6 +570,9 @@ struct amdgpu_ras {
>       struct ras_event_manager *event_mgr;
>
>       uint64_t reserved_pages_in_bytes;
> +
> +     pid_t init_task_pid;
> +     char init_task_comm[TASK_COMM_LEN];
>  };
>
>  struct ras_fs_data {
> --
> 2.34.1




More information about the amd-gfx mailing list