[PATCH 1/2] drm/amdgpu: add range check for RAS bad page address

Zhang, Hawking Hawking.Zhang at amd.com
Tue Jul 15 13:53:33 UTC 2025


[AMD Official Use Only - AMD Internal Distribution Only]

Series is

Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>

Regards,
Hawking
-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao Zhou
Sent: Friday, July 11, 2025 17:06
To: amd-gfx at lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH 1/2] drm/amdgpu: add range check for RAS bad page address

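Exclude invalid bad pages. Move the address range check into
amdgpu_ras_check_bad_page_unlock() and change its return type from bool
to int: it now returns -EINVAL for an out-of-range address, 1 if the page
has already been marked as bad, and 0 otherwise. Callers are updated to
act on these return values.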

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 58 ++++++++++++-------------
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a6f512293b5c..1d6d4625abb3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -136,9 +136,9 @@ enum amdgpu_ras_retire_page_reservation {

 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);

-static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
                                uint64_t addr);
-static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
                                uint64_t addr);
 #ifdef CONFIG_X86_MCE_AMD
 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
@@ -169,18 +169,16 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
        struct eeprom_table_record err_rec;
        int ret;

-       if ((address >= adev->gmc.mc_vram_size) ||
-           (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
+       ret = amdgpu_ras_check_bad_page(adev, address);
+       if (ret == -EINVAL) {
                dev_warn(adev->dev,
-                        "RAS WARN: input address 0x%llx is invalid.\n",
-                        address);
+                       "RAS WARN: input address 0x%llx is invalid.\n",
+                       address);
                return -EINVAL;
-       }
-
-       if (amdgpu_ras_check_bad_page(adev, address)) {
+       } else if (ret == 1) {
                dev_warn(adev->dev,
-                        "RAS WARN: 0x%llx has already been marked as bad page!\n",
-                        address);
+                       "RAS WARN: 0x%llx has already been marked as bad page!\n",
+                       address);
                return 0;
        }

@@ -513,22 +511,16 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
                ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
                break;
        case 2:
-               if ((data.inject.address >= adev->gmc.mc_vram_size &&
-                   adev->gmc.mc_vram_size) ||
-                   (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
-                       dev_warn(adev->dev, "RAS WARN: input address "
-                                       "0x%llx is invalid.",
+               /* umc ce/ue error injection for a bad page is not allowed */
+               if (data.head.block == AMDGPU_RAS_BLOCK__UMC)
+                       ret = amdgpu_ras_check_bad_page(adev, data.inject.address);
+               if (ret == -EINVAL) {
+                       dev_warn(adev->dev, "RAS WARN: input address 0x%llx is invalid.",
                                        data.inject.address);
-                       ret = -EINVAL;
                        break;
-               }
-
-               /* umc ce/ue error injection for a bad page is not allowed */
-               if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
-                   amdgpu_ras_check_bad_page(adev, data.inject.address)) {
-                       dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
-                                "already been marked as bad!\n",
-                                data.inject.address);
+               } else if (ret == 1) {
+                       dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has already been marked as bad!\n",
+                                       data.inject.address);
                        break;
                }

@@ -3122,18 +3114,24 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
        return ret;
 }

-static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
                                uint64_t addr)
 {
        struct ras_err_handler_data *data = con->eh_data;
+       struct amdgpu_device *adev = con->adev;
        int i;

+       if ((addr >= adev->gmc.mc_vram_size &&
+           adev->gmc.mc_vram_size) ||
+           (addr >= RAS_UMC_INJECT_ADDR_LIMIT))
+               return -EINVAL;
+
        addr >>= AMDGPU_GPU_PAGE_SHIFT;
        for (i = 0; i < data->count; i++)
                if (addr == data->bps[i].retired_page)
-                       return true;
+                       return 1;

-       return false;
+       return 0;
 }

 /*
@@ -3141,11 +3139,11 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
  *
  * Note: this check is only for umc block
  */
-static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
                                uint64_t addr)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-       bool ret = false;
+       int ret = 0;

        if (!con || !con->eh_data)
                return ret;
--
2.34.1



More information about the amd-gfx mailing list