[PATCH 1/6] drm/amdgpu: update XGMI physical node id and GMC configs on resume

Samuel Zhang guoqing.zhang at amd.com
Mon Apr 14 10:46:50 UTC 2025


For a virtual machine with vGPUs in SRIOV single device mode and XGMI
enabled, XGMI physical node ids may change when waking up from
hibernation with different vGPU devices. So update the XGMI physical
node ids on resume.

Update the GPU memory controller configuration on resume if the XGMI
physical node id has changed.

Signed-off-by: Jiang Liu <gerry at linux.alibaba.com>
Signed-off-by: Samuel Zhang <guoqing.zhang at amd.com>
Change-Id: I0bcac2d46fdeed66c9cf7e6a378134769c95df61
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 25 ++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c    |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  8 +++++++
 4 files changed, 37 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f212ce3f5d34..12f115602ab2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5098,6 +5098,28 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
 	return 0;
 }
 
+/* Re-read XGMI info to detect a changed node id; returns 0 or get_xgmi_info()'s error. */
+static int amdgpu_device_update_xgmi_nodes(struct amdgpu_device *adev)
+{
+	int r;
+
+	/* Refresh only for an SR-IOV VF on a dGPU whose XGMI is not CPU-connected. */
+	if (!amdgpu_sriov_vf(adev) || (adev->flags & AMD_IS_APU) ||
+	    !adev->gmc.xgmi.supported || adev->gmc.xgmi.connected_to_cpu)
+		return 0;
+
+	adev->gmc.xgmi.prev_physical_node_id = adev->gmc.xgmi.physical_node_id;
+	r = adev->gfxhub.funcs->get_xgmi_info(adev);
+	if (r)
+		return r;
+
+	adev->gmc.xgmi.physical_node_id_changed =
+		adev->gmc.xgmi.physical_node_id != adev->gmc.xgmi.prev_physical_node_id;
+	dev_info(adev->dev, "xgmi node, old id %u, new id %u\n",
+		 adev->gmc.xgmi.prev_physical_node_id, adev->gmc.xgmi.physical_node_id);
+	return 0;
+}
+
 /**
  * amdgpu_device_resume - initiate device resume
  *
@@ -5117,6 +5139,9 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
 		r = amdgpu_virt_request_full_gpu(adev, true);
 		if (r)
 			return r;
+		r = amdgpu_device_update_xgmi_nodes(adev);
+		if (r)
+			return r;
 	}
 
 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index ecb74ccf1d90..5b60d714e089 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -1288,6 +1288,8 @@ int amdgpu_gmc_get_nps_memranges(struct amdgpu_device *adev,
 
 	refresh = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) &&
 		  (adev->gmc.reset_flags & AMDGPU_GMC_INIT_RESET_NPS);
+	if (adev->gmc.xgmi.physical_node_id_changed)
+		refresh = true;
 	ret = amdgpu_discovery_get_nps_info(adev, &nps_type, &ranges,
 					    &range_cnt, refresh);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 32dabba4062f..3d5f01a1b657 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -89,6 +89,8 @@ struct amdgpu_xgmi {
 	u64 node_segment_size;
 	/* physical node (0-3) */
 	unsigned physical_node_id;
+	unsigned prev_physical_node_id;
+	bool physical_node_id_changed;
 	/* number of nodes (0-4) */
 	unsigned num_physical_nodes;
 	/* gpu list in the same hive */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 8d3560314e5b..7c7a9fe6be6d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -2515,6 +2515,14 @@ static int gmc_v9_0_resume(struct amdgpu_ip_block *ip_block)
 	struct amdgpu_device *adev = ip_block->adev;
 	int r;
 
+	/* Update MC configuration if XGMI physical node id has changed for dGPU. */
+	if (adev->gmc.xgmi.physical_node_id_changed) {
+		r = gmc_v9_0_mc_init(adev);
+		if (r)
+			return r;
+		gmc_v9_0_init_sw_mem_ranges(adev, adev->gmc.mem_partitions);
+	}
+
 	/* If a reset is done for NPS mode switch, read the memory range
 	 * information again.
 	 */
-- 
2.43.5



More information about the amd-gfx mailing list