[PATCH Review 1/1] drm/amdgpu: support ras on SRIOV
Stanley.Yang
Stanley.Yang at amd.com
Wed May 18 08:31:59 UTC 2022
support umc/gfx/sdma ras on guest side
Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++++
drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 4 ++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 23 ++++++++++++++++++----
drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 4 ++++
drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 9 ++++++---
5 files changed, 37 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 85412e1a04be..e832c5bceb63 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5372,6 +5372,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
r = amdgpu_device_reset_sriov(adev, job ? false : true);
if (r)
adev->asic_reset_res = r;
+
+ /* Aldebaran support ras in SRIOV, so need resume ras during reset */
+ if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
+ amdgpu_ras_resume(adev);
} else {
r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
if (r && r == -EAGAIN)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
index 31e07dfc874b..12a1f2389714 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
@@ -202,6 +202,10 @@ irqreturn_t amdgpu_irq_handler(int irq, void *arg)
if (ret == IRQ_HANDLED)
pm_runtime_mark_last_busy(dev->dev);
+ /* Fatal error events are handled on host side */
+ if (amdgpu_sriov_vf(adev))
+ return ret;
+
/* For the hardware that cannot enable bif ring for both ras_controller_irq
* and ras_err_evnet_athub_irq ih cookies, the driver has to poll status
* register to check whether the interrupt is triggered or not, and properly
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2b80a3037481..930fa3837ef9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -721,7 +721,9 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
/* Do not enable if it is not allowed. */
WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
- if (!amdgpu_ras_intr_triggered()) {
+ /* Enable ras feature operator handle on host side */
+ if (!amdgpu_sriov_vf(adev) &&
+ !amdgpu_ras_intr_triggered()) {
ret = psp_ras_enable_features(&adev->psp, info, enable);
if (ret) {
dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
@@ -2181,10 +2183,14 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
{
adev->ras_hw_enabled = adev->ras_enabled = 0;
- if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
+ if (!adev->is_atom_fw ||
!amdgpu_ras_asic_supported(adev))
return;
+ if (!(amdgpu_sriov_vf(adev) &&
+ (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2))))
+ return;
+
if (!adev->gmc.xgmi.connected_to_cpu) {
if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
dev_info(adev->dev, "MEM ECC is active.\n");
@@ -2196,8 +2202,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
dev_info(adev->dev, "SRAM ECC is active.\n");
- adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
- 1 << AMDGPU_RAS_BLOCK__DF);
+ if (!amdgpu_sriov_vf(adev))
+ adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+ else
+ adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
+ 1 << AMDGPU_RAS_BLOCK__SDMA |
+ 1 << AMDGPU_RAS_BLOCK__GFX);
} else {
dev_info(adev->dev, "SRAM ECC is not presented.\n");
}
@@ -2532,6 +2543,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
struct amdgpu_ras_block_object *obj;
int r;
+ /* Guest side doesn't need init ras feature */
+ if (amdgpu_sriov_vf(adev))
+ return 0;
+
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
if (!node->ras_obj) {
dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 3b5c43575aa3..72bfac9bf9d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -123,6 +123,10 @@ int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
{
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+
+ if (amdgpu_sriov_vf(adev))
+ return AMDGPU_RAS_SUCCESS;
+
amdgpu_ras_reset_gpu(adev);
return AMDGPU_RAS_SUCCESS;
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 2c6070b90dcf..165cdc2d7f0b 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -75,9 +75,12 @@ static int psp_v13_0_init_microcode(struct psp_context *psp)
err = psp_init_sos_microcode(psp, chip_name);
if (err)
return err;
- err = psp_init_ta_microcode(&adev->psp, chip_name);
- if (err)
- return err;
+ /* It's not necessary to load ras ta on Guest side */
+ if (!amdgpu_sriov_vf(adev)) {
+ err = psp_init_ta_microcode(&adev->psp, chip_name);
+ if (err)
+ return err;
+ }
break;
case IP_VERSION(13, 0, 1):
case IP_VERSION(13, 0, 3):
--
2.17.1
More information about the amd-gfx
mailing list