[PATCH] drm/amdgpu: Fix discovery initialization failure during pci rescan
Ma Jun
Jun.Ma2 at amd.com
Mon Apr 1 10:18:47 UTC 2024
Wait for the system to become ready in order to fix the discovery
initialization failure. This failure usually occurs when a dGPU is
removed and then rescanned via the command line.
It is caused by the following two errors:
[1] vram size is 0
[2] wrong binary signature
Signed-off-by: Ma Jun <Jun.Ma2 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 07c5fca06178..ac6b2ae6414c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -276,7 +276,12 @@ static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev,
msleep(1);
}
}
- vram_size = (uint64_t)RREG32(mmRCC_CONFIG_MEMSIZE) << 20;
+ for (i = 0; i < 100; i++) {
+ vram_size = (uint64_t)RREG32(mmRCC_CONFIG_MEMSIZE) << 20;
+ if (vram_size)
+ break;
+ usleep_range(1000, 1100);
+ }
if (vram_size) {
uint64_t pos = vram_size - DISCOVERY_TMR_OFFSET;
@@ -371,6 +376,7 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
{
struct table_info *info;
struct binary_header *bhdr;
+ int error_count = 0;
uint16_t offset;
uint16_t size;
uint16_t checksum;
@@ -380,7 +386,7 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
adev->mman.discovery_bin = kzalloc(adev->mman.discovery_tmr_size, GFP_KERNEL);
if (!adev->mman.discovery_bin)
return -ENOMEM;
-
+retry:
/* Read from file if it is the preferred option */
if (amdgpu_discovery == 2) {
dev_info(adev->dev, "use ip discovery information from file");
@@ -401,6 +407,10 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
/* check the ip discovery binary signature */
if (!amdgpu_discovery_verify_binary_signature(adev->mman.discovery_bin)) {
+ if (error_count++ < 1) {
+ msleep(100);
+ goto retry;
+ }
dev_err(adev->dev,
"get invalid ip discovery binary signature\n");
r = -EINVAL;
@@ -515,7 +525,6 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
if (0 && offset) {
struct mall_info_header *mhdr =
(struct mall_info_header *)(adev->mman.discovery_bin + offset);
-
if (le32_to_cpu(mhdr->table_id) != MALL_INFO_TABLE_ID) {
dev_err(adev->dev, "invalid ip discovery mall table id\n");
r = -EINVAL;
--
2.34.1
More information about the amd-gfx
mailing list