[PATCH] drm/amdgpu: Fix discovery initialization failure during pci rescan
Christian König
ckoenig.leichtzumerken at gmail.com
Mon Apr 1 12:24:53 UTC 2024
Am 01.04.24 um 12:18 schrieb Ma Jun:
> Wait for the system to be ready in order to fix the discovery
> initialization failure. This failure usually occurs when a dGPU is
> removed and then rescanned via the command line.
> It's caused by the following two errors:
> [1] vram size is 0
> [2] wrong binary signature
>
> Signed-off-by: Ma Jun <Jun.Ma2 at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 15 ++++++++++++---
> 1 file changed, 12 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
> index 07c5fca06178..ac6b2ae6414c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
> @@ -276,7 +276,12 @@ static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev,
> msleep(1);
> }
> }
> - vram_size = (uint64_t)RREG32(mmRCC_CONFIG_MEMSIZE) << 20;
> + for (i = 0; i < 100; i++) {
> + vram_size = (uint64_t)RREG32(mmRCC_CONFIG_MEMSIZE) << 20;
> + if (vram_size)
> + break;
> + usleep_range(1000, 1100);
> + }
It's nice that we are getting closer to a solution for this, but this
approach is really hacky.
Do we have any idea why mmRCC_CONFIG_MEMSIZE is zero when we re-scan? Is
some initialization not yet completed, or something like that?
Regards,
Christian.
>
> if (vram_size) {
> uint64_t pos = vram_size - DISCOVERY_TMR_OFFSET;
> @@ -371,6 +376,7 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
> {
> struct table_info *info;
> struct binary_header *bhdr;
> + int error_count = 0;
> uint16_t offset;
> uint16_t size;
> uint16_t checksum;
> @@ -380,7 +386,7 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
> adev->mman.discovery_bin = kzalloc(adev->mman.discovery_tmr_size, GFP_KERNEL);
> if (!adev->mman.discovery_bin)
> return -ENOMEM;
> -
> +retry:
> /* Read from file if it is the preferred option */
> if (amdgpu_discovery == 2) {
> dev_info(adev->dev, "use ip discovery information from file");
> @@ -401,6 +407,10 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
>
> /* check the ip discovery binary signature */
> if (!amdgpu_discovery_verify_binary_signature(adev->mman.discovery_bin)) {
> + if (error_count++ < 1) {
> + msleep(100);
> + goto retry;
> + }
> dev_err(adev->dev,
> "get invalid ip discovery binary signature\n");
> r = -EINVAL;
> @@ -515,7 +525,6 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
> if (0 && offset) {
> struct mall_info_header *mhdr =
> (struct mall_info_header *)(adev->mman.discovery_bin + offset);
> -
> if (le32_to_cpu(mhdr->table_id) != MALL_INFO_TABLE_ID) {
> dev_err(adev->dev, "invalid ip discovery mall table id\n");
> r = -EINVAL;
More information about the amd-gfx
mailing list