[PATCH 1/2] drm/amdgpu: add range check for RAS bad page address

Zhou1, Tao Tao.Zhou1 at amd.com
Tue Jul 15 10:02:59 UTC 2025


[AMD Official Use Only - AMD Internal Distribution Only]

Ping for the series...

> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Sent: Friday, July 11, 2025 5:06 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH 1/2] drm/amdgpu: add range check for RAS bad page address
>
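> Exclude invalid bad pages: move the address range check into
> amdgpu_ras_check_bad_page_unlock(), which now returns -EINVAL for an
> out-of-range address, 1 for a page already marked bad and 0 otherwise.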
>
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 58 ++++++++++++-------------
>  1 file changed, 28 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index a6f512293b5c..1d6d4625abb3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -136,9 +136,9 @@ enum amdgpu_ras_retire_page_reservation {
>
>  atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
>
> -static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
> +static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
>                               uint64_t addr);
> -static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
> +static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
>                               uint64_t addr);
>  #ifdef CONFIG_X86_MCE_AMD
>  static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
> @@ -169,18 +169,16 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
>       struct eeprom_table_record err_rec;
>       int ret;
>
> -     if ((address >= adev->gmc.mc_vram_size) ||
> -         (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
> +     ret = amdgpu_ras_check_bad_page(adev, address);
> +     if (ret == -EINVAL) {
>               dev_warn(adev->dev,
> -                      "RAS WARN: input address 0x%llx is invalid.\n",
> -                      address);
> +                     "RAS WARN: input address 0x%llx is invalid.\n",
> +                     address);
>               return -EINVAL;
> -     }
> -
> -     if (amdgpu_ras_check_bad_page(adev, address)) {
> +     } else if (ret == 1) {
>               dev_warn(adev->dev,
> -                      "RAS WARN: 0x%llx has already been marked as bad page!\n",
> -                      address);
> +                     "RAS WARN: 0x%llx has already been marked as bad page!\n",
> +                     address);
>               return 0;
>       }
>
> @@ -513,22 +511,16 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
>               ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
>               break;
>       case 2:
> -             if ((data.inject.address >= adev->gmc.mc_vram_size &&
> -                 adev->gmc.mc_vram_size) ||
> -                 (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
> -                     dev_warn(adev->dev, "RAS WARN: input address "
> -                                     "0x%llx is invalid.",
> +             /* umc ce/ue error injection for a bad page is not allowed */
> +             if (data.head.block == AMDGPU_RAS_BLOCK__UMC)
> +                     ret = amdgpu_ras_check_bad_page(adev, data.inject.address);
> +             if (ret == -EINVAL) {
> +                     dev_warn(adev->dev, "RAS WARN: input address 0x%llx is invalid.",
>                                       data.inject.address);
> -                     ret = -EINVAL;
>                       break;
> -             }
> -
> -             /* umc ce/ue error injection for a bad page is not allowed */
> -             if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
> -                 amdgpu_ras_check_bad_page(adev, data.inject.address)) {
> -                     dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
> -                              "already been marked as bad!\n",
> -                              data.inject.address);
> +             } else if (ret == 1) {
> +                     dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has already been marked as bad!\n",
> +                                     data.inject.address);
>                       break;
>               }
>
> @@ -3122,18 +3114,24 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
>       return ret;
>  }
>
> -static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
> +static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
>                               uint64_t addr)
>  {
>       struct ras_err_handler_data *data = con->eh_data;
> +     struct amdgpu_device *adev = con->adev;
>       int i;
>
> +     if ((addr >= adev->gmc.mc_vram_size &&
> +         adev->gmc.mc_vram_size) ||
> +         (addr >= RAS_UMC_INJECT_ADDR_LIMIT))
> +             return -EINVAL;
> +
>       addr >>= AMDGPU_GPU_PAGE_SHIFT;
>       for (i = 0; i < data->count; i++)
>               if (addr == data->bps[i].retired_page)
> -                     return true;
> +                     return 1;
>
> -     return false;
> +     return 0;
>  }
>
>  /*
> @@ -3141,11 +3139,11 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
>   *
>   * Note: this check is only for umc block
>   */
> -static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
> +static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
>                               uint64_t addr)
>  {
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> -     bool ret = false;
> +     int ret = 0;
>
>       if (!con || !con->eh_data)
>               return ret;
> --
> 2.34.1



More information about the amd-gfx mailing list