[PATCH] drm/amdgpu: fix race between pstate and remote buffer map
Jonathan Kim
jonathan.kim at amd.com
Thu Apr 16 11:59:53 UTC 2020
Vega20 arbitrates pstate at hive level and not device level. Last peer to
remote buffer unmap could drop P-State while another process is still
remote buffer mapped.
With this fix, P-States still needs to be disabled for now as SMU bug
was discovered on synchronous P2P transfers. This should be fixed in the
next FW update.
Signed-off-by: Jonathan Kim <Jonathan.Kim at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 -
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 16 ++----
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 4 --
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 66 ++++++++++++------------
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 6 +++
5 files changed, 43 insertions(+), 51 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4e1d4cfe7a9f..94dff899248d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -982,8 +982,6 @@ struct amdgpu_device {
uint64_t unique_id;
uint64_t df_perfmon_config_assign_mask[AMDGPU_MAX_DF_PERFMONS];
- /* device pstate */
- int pstate;
/* enable runtime pm on the device */
bool runpm;
bool in_runpm;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index accbb34ea670..95560eea61c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2135,11 +2135,8 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev,
if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev)) &&
(bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM)) {
bo_va->is_xgmi = true;
- mutex_lock(&adev->vm_manager.lock_pstate);
/* Power up XGMI if it can be potentially used */
- if (++adev->vm_manager.xgmi_map_counter == 1)
- amdgpu_xgmi_set_pstate(adev, 1);
- mutex_unlock(&adev->vm_manager.lock_pstate);
+ amdgpu_xgmi_set_pstate(adev, AMDGPU_XGMI_PSTATE_MAX_VEGA20);
}
return bo_va;
@@ -2562,12 +2559,8 @@ void amdgpu_vm_bo_rmv(struct amdgpu_device *adev,
dma_fence_put(bo_va->last_pt_update);
- if (bo && bo_va->is_xgmi) {
- mutex_lock(&adev->vm_manager.lock_pstate);
- if (--adev->vm_manager.xgmi_map_counter == 0)
- amdgpu_xgmi_set_pstate(adev, 0);
- mutex_unlock(&adev->vm_manager.lock_pstate);
- }
+ if (bo && bo_va->is_xgmi)
+ amdgpu_xgmi_set_pstate(adev, AMDGPU_XGMI_PSTATE_MIN);
kfree(bo_va);
}
@@ -3177,9 +3170,6 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev)
idr_init(&adev->vm_manager.pasid_idr);
spin_lock_init(&adev->vm_manager.pasid_lock);
-
- adev->vm_manager.xgmi_map_counter = 0;
- mutex_init(&adev->vm_manager.lock_pstate);
}
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index ea771d84bf2b..c8e68d7890bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -357,10 +357,6 @@ struct amdgpu_vm_manager {
*/
struct idr pasid_idr;
spinlock_t pasid_lock;
-
- /* counter of mapped memory through xgmi */
- uint32_t xgmi_map_counter;
- struct mutex lock_pstate;
};
#define amdgpu_vm_copy_pte(adev, ib, pe, src, count) ((adev)->vm_manager.vm_pte_funcs->copy_pte((ib), (pe), (src), (count)))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 8c3215505e78..52f45b9fe271 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -373,7 +373,13 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
if (lock)
mutex_lock(&tmp->hive_lock);
- tmp->pstate = -1;
+ tmp->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
+ tmp->high_gpu = NULL;
+ /*
+ * hive pstate on boot is high in vega20 so we have to go to low
+ * pstate on after boot.
+ */
+ tmp->map_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
mutex_unlock(&xgmi_mutex);
return tmp;
@@ -383,50 +389,49 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{
int ret = 0;
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
- struct amdgpu_device *tmp_adev;
- bool update_hive_pstate = true;
- bool is_high_pstate = pstate && adev->asic_type == CHIP_VEGA20;
+ struct amdgpu_device *tar_adev = hive->high_gpu ?
+ hive->high_gpu : adev;
+ bool map = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
+ bool init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
- if (!hive)
+ if (!hive || adev->asic_type == CHIP_VEGA20)
return 0;
mutex_lock(&hive->hive_lock);
- if (hive->pstate == pstate) {
- adev->pstate = is_high_pstate ? pstate : adev->pstate;
+ if (map)
+ hive->map_count++;
+ else
+ hive->map_count--;
+
+ /*
+ * Vega20 only needs single peer to request pstate high for the hive to
+ * go high but all peers must request pstate low for the hive to go low
+ */
+ if (hive->pstate == pstate || (!map && hive->map_count && !init_low))
goto out;
- }
- dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);
+ dev_dbg(tar_adev->dev, "Set xgmi pstate %d.\n", pstate);
- ret = amdgpu_dpm_set_xgmi_pstate(adev, pstate);
+ ret = amdgpu_dpm_set_xgmi_pstate(tar_adev, pstate);
if (ret) {
- dev_err(adev->dev,
+ dev_err(tar_adev->dev,
"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
- adev->gmc.xgmi.node_id,
- adev->gmc.xgmi.hive_id, ret);
+ tar_adev->gmc.xgmi.node_id,
+ tar_adev->gmc.xgmi.hive_id, ret);
goto out;
}
- /* Update device pstate */
- adev->pstate = pstate;
-
- /*
- * Update the hive pstate only all devices of the hive
- * are in the same pstate
- */
- list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
- if (tmp_adev->pstate != adev->pstate) {
- update_hive_pstate = false;
- break;
- }
- }
- if (update_hive_pstate || is_high_pstate)
+ if (init_low)
+ hive->pstate = hive->map_count ?
+ hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
+ else {
hive->pstate = pstate;
-
+ hive->high_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
+ adev : NULL;
+ }
out:
mutex_unlock(&hive->hive_lock);
-
return ret;
}
@@ -507,9 +512,6 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
goto exit;
}
- /* Set default device pstate */
- adev->pstate = -1;
-
top_info = &adev->psp.xgmi_context.top_info;
list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index d5a63904ec33..b5c4acf2316d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -25,6 +25,10 @@
#include <drm/task_barrier.h>
#include "amdgpu_psp.h"
+#define AMDGPU_XGMI_PSTATE_UNKNOWN -1
+#define AMDGPU_XGMI_PSTATE_MIN 0
+#define AMDGPU_XGMI_PSTATE_MAX_VEGA20 1
+
struct amdgpu_hive_info {
uint64_t hive_id;
struct list_head device_list;
@@ -34,6 +38,8 @@ struct amdgpu_hive_info {
struct device_attribute dev_attr;
struct amdgpu_device *adev;
int pstate; /*0 -- low , 1 -- high , -1 unknown*/
+ int map_count;
+ struct amdgpu_device *high_gpu;
struct task_barrier tb;
};
--
2.17.1
More information about the amd-gfx
mailing list