[PATCH] drm/amdgpu: skip reset other device in the same hive if it's sriov vf
Zhigang Luo
zhigang.luo at amd.com
Fri Dec 3 22:05:43 UTC 2021
For sriov vf hang, vf flr will be triggered. Hive reset is not needed.
Signed-off-by: Zhigang Luo <zhigang.luo at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3c5afa45173c..474f8ea58aa5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4746,7 +4746,7 @@ static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgp
{
struct amdgpu_device *tmp_adev = NULL;
- if (adev->gmc.xgmi.num_physical_nodes > 1) {
+ if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
if (!hive) {
dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
return -ENODEV;
@@ -4958,7 +4958,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
* We always reset all schedulers for device and all devices for XGMI
* hive so that should take care of them too.
*/
- hive = amdgpu_get_xgmi_hive(adev);
+ if (!amdgpu_sriov_vf(adev))
+ hive = amdgpu_get_xgmi_hive(adev);
if (hive) {
if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
@@ -4999,7 +5000,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
* to put adev in the 1st position.
*/
INIT_LIST_HEAD(&device_list);
- if (adev->gmc.xgmi.num_physical_nodes > 1) {
+ if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
list_add_tail(&tmp_adev->reset_list, &device_list);
if (!list_is_first(&adev->reset_list, &device_list))
--
2.17.1
More information about the amd-gfx
mailing list