[PATCH] drm/amdgpu: add command to check address validity

YiPeng Chai YiPeng.Chai at amd.com
Wed Jul 16 06:48:23 UTC 2025


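Add a "check_address" command to the RAS debugfs control interface
to check the validity of an input address, and remove the unused
"retire_page" command code together with its handler,
amdgpu_reserve_page_direct().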

Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 58 +++++++++++--------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 ++
 2 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 15bde4904996..68feec0956f0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -163,47 +163,38 @@ static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
 	return false;
 }
 
-static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
+static int amdgpu_check_address_validity(struct amdgpu_device *adev, uint64_t address)
 {
-	struct ras_err_data err_data;
-	struct eeprom_table_record err_rec;
-	int ret;
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct amdgpu_vram_block_info blk_info;
+	uint64_t page_pfns[32] = {0};
+	int i, ret, count;
+
+	if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0))
+		return 0;
 
 	if ((address >= adev->gmc.mc_vram_size) ||
 	    (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
 		dev_warn(adev->dev,
-		         "RAS WARN: input address 0x%llx is invalid.\n",
-		         address);
+		    "RAS WARN: input address 0x%llx is invalid.\n",
+		    address);
 		return -EINVAL;
 	}
 
-	if (amdgpu_ras_check_bad_page(adev, address)) {
-		dev_warn(adev->dev,
-			 "RAS WARN: 0x%llx has already been marked as bad page!\n",
-			 address);
-		return 0;
-	}
-
-	ret = amdgpu_ras_error_data_init(&err_data);
-	if (ret)
-		return ret;
+	count = amdgpu_umc_lookup_bad_pages_in_a_row(adev,
+				address, page_pfns, ARRAY_SIZE(page_pfns));
+	if (count <= 0)
+		return -EPERM;
 
-	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
-	err_data.err_addr = &err_rec;
-	amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0);
-
-	if (amdgpu_bad_page_threshold != 0) {
-		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
-					 err_data.err_addr_cnt, false);
-		amdgpu_ras_save_bad_pages(adev, NULL);
+	for (i = 0; i < count; i++) {
+		memset(&blk_info, 0, sizeof(blk_info));
+		ret = amdgpu_vram_mgr_query_address_block_info(&adev->mman.vram_mgr,
+					page_pfns[i] << AMDGPU_GPU_PAGE_SHIFT, &blk_info);
+		if (!ret && (blk_info.task.pid == con->init_task_pid) &&
+			!strncmp(blk_info.task.comm, con->init_task_comm, TASK_COMM_LEN))
+			return -EACCES;
 	}
 
-	amdgpu_ras_error_data_fini(&err_data);
-
-	dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
-	dev_warn(adev->dev, "Clear EEPROM:\n");
-	dev_warn(adev->dev, "    echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
-
 	return 0;
 }
 
@@ -295,7 +286,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 		op = 1;
 	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
 		op = 2;
-	else if (strstr(str, "retire_page") != NULL)
+	else if (strstr(str, "check_address") != NULL)
 		op = 3;
 	else if (str[0] && str[1] && str[2] && str[3])
 		/* ascii string, but commands are not matched. */
@@ -495,7 +486,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
 		return ret;
 
 	if (data.op == 3) {
-		ret = amdgpu_reserve_page_direct(adev, data.inject.address);
+		ret = amdgpu_check_address_validity(adev, data.inject.address);
 		if (!ret)
 			return size;
 		else
@@ -4103,6 +4094,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			goto release_con;
 	}
 
+	con->init_task_pid = task_pid_nr(current);
+	get_task_comm(con->init_task_comm, current);
+
 	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
 		 "hardware ability[%x] ras_mask[%x]\n",
 		 adev->ras_hw_enabled, adev->ras_enabled);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 927d6bff734a..7f10a7402160 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -570,6 +570,9 @@ struct amdgpu_ras {
 	struct ras_event_manager *event_mgr;
 
 	uint64_t reserved_pages_in_bytes;
+
+	pid_t init_task_pid;
+	char init_task_comm[TASK_COMM_LEN];
 };
 
 struct ras_fs_data {
-- 
2.34.1



More information about the amd-gfx mailing list