[PATCH V2] drm/amdgpu: add command to check address validity

Zhou1, Tao Tao.Zhou1 at amd.com
Fri Jul 18 07:38:57 UTC 2025


[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai at amd.com>
> Sent: Friday, July 18, 2025 11:26 AM
> To: amd-gfx at lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Chai, Thomas
> <YiPeng.Chai at amd.com>
> Subject: [PATCH V2] drm/amdgpu: add command to check address validity
>
> Add command to check address validity and remove unused command codes.
>
> v2:
>  The command interface adds new parameters to support multiple address-check
> strategies.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 63 +++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 ++
>  2 files changed, 66 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 15bde4904996..185b9e538f98 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -128,6 +128,9 @@ const char *get_ras_block_str(struct ras_common_if
> *ras_block)
>
>  #define MAX_FLUSH_RETIRE_DWORK_TIMES  100
>
> +#define BYPASS_ALLOCATED_ADDRESS        0x0
> +#define BYPASS_INITIALIZATION_ADDRESS   0x1
> +
>  enum amdgpu_ras_retire_page_reservation {
>       AMDGPU_RAS_RETIRE_PAGE_RESERVED,
>       AMDGPU_RAS_RETIRE_PAGE_PENDING,
> @@ -207,6 +210,49 @@ static int amdgpu_reserve_page_direct(struct
> amdgpu_device *adev, uint64_t addre
>       return 0;
>  }
>
> +static int amdgpu_check_address_validity(struct amdgpu_device *adev,
> +                     uint64_t address, uint64_t flags)
> +{
> +     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +     struct amdgpu_vram_block_info blk_info;
> +     uint64_t page_pfns[32] = {0};
> +     int i, ret, count;
> +
> +     if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0))
> +             return 0;
> +
> +     if ((address >= adev->gmc.mc_vram_size) ||
> +         (address >= RAS_UMC_INJECT_ADDR_LIMIT))
> +             return -EFAULT;
> +
> +     count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
> +                             address, page_pfns, ARRAY_SIZE(page_pfns));
> +     if (count <= 0)
> +             return -EPERM;
> +
> +     for (i = 0; i < count; i++) {
> +             memset(&blk_info, 0, sizeof(blk_info));
> +             ret = amdgpu_vram_mgr_query_address_block_info(&adev-
> >mman.vram_mgr,
> +                                     page_pfns[i] <<
> AMDGPU_GPU_PAGE_SHIFT, &blk_info);
> +             if (!ret) {
> +                     /* The input address that needs to be checked is allocated by
> +                      * current calling process, so it is necessary to exclude
> +                      * the calling process.
> +                      */
> +                     if ((flags == BYPASS_ALLOCATED_ADDRESS) &&
> +                         ((blk_info.task.pid != task_pid_nr(current)) ||
> +                             strncmp(blk_info.task.comm, current->comm,
> TASK_COMM_LEN)))
> +                             return -EACCES;
> +                     else if ((flags == BYPASS_INITIALIZATION_ADDRESS) &&
> +                             (blk_info.task.pid == con->init_task_pid) &&
> +                             !strncmp(blk_info.task.comm, con->init_task_comm,
> TASK_COMM_LEN))
> +                             return -EACCES;
> +             }
> +     }
> +
> +     return 0;
> +}
> +
>  static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
>                                       size_t size, loff_t *pos)
>  {
> @@ -297,6 +343,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
>               op = 2;
>       else if (strstr(str, "retire_page") != NULL)
>               op = 3;
> +     else if (strstr(str, "check_address") != NULL)
> +             op = 4;
>       else if (str[0] && str[1] && str[2] && str[3])
>               /* ascii string, but commands are not matched. */
>               return -EINVAL;
> @@ -310,6 +358,15 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file
> *f,
>                       data->op = op;
>                       data->inject.address = address;
>
> +                     return 0;
> +             } else if (op == 4) {
> +                     if (sscanf(str, "%*s 0x%llx 0x%llx", &address, &value) != 2 &&
> +                         sscanf(str, "%*s %llu %llu", &address, &value) != 2)
> +                             return -EINVAL;
> +
> +                     data->op = op;
> +                     data->inject.address = address;
> +                     data->inject.value = value;
>                       return 0;
>               }
>
> @@ -500,6 +557,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
>                       return size;
>               else
>                       return ret;
> +     } else if (data.op == 4) {
> +             ret = amdgpu_check_address_validity(adev, data.inject.address,
> data.inject.value);
> +             return ret ? ret : size;
>       }
>
>       if (!amdgpu_ras_is_supported(adev, data.head.block))
> @@ -4103,6 +4163,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>                       goto release_con;
>       }
>
> +     con->init_task_pid = task_pid_nr(current);
> +     get_task_comm(con->init_task_comm, current);
> +
>       dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
>                "hardware ability[%x] ras_mask[%x]\n",
>                adev->ras_hw_enabled, adev->ras_enabled);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 927d6bff734a..7f10a7402160 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -570,6 +570,9 @@ struct amdgpu_ras {
>       struct ras_event_manager *event_mgr;
>
>       uint64_t reserved_pages_in_bytes;
> +
> +     pid_t init_task_pid;
> +     char init_task_comm[TASK_COMM_LEN];
>  };
>
>  struct ras_fs_data {
> --
> 2.34.1



More information about the amd-gfx mailing list