[v3 3/6] drm/amdgpu: fix use after free bug related to amdgpu_driver_release_kms()

Lazar, Lijo lijo.lazar at amd.com
Wed Jan 8 09:54:01 UTC 2025



On 1/8/2025 2:26 PM, Jiang Liu wrote:
> If some GPU device failed to probe, `rmmod amdgpu` will trigger a use
> after free bug related to amdgpu_driver_release_kms() as:
> 2024-12-26 16:17:45 [16002.085540] BUG: kernel NULL pointer dereference, address: 0000000000000000
> 2024-12-26 16:17:45 [16002.093792] #PF: supervisor read access in kernel mode
> 2024-12-26 16:17:45 [16002.099993] #PF: error_code(0x0000) - not-present page
> 2024-12-26 16:17:45 [16002.106188] PGD 0 P4D 0
> 2024-12-26 16:17:45 [16002.109464] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
> 2024-12-26 16:17:45 [16002.115372] CPU: 2 PID: 14375 Comm: rmmod Kdump: loaded Tainted: G        W   E      6.10.0+ #2
> 2024-12-26 16:17:45 [16002.125577] Hardware name: Alibaba Alibaba Cloud ECS/Alibaba Cloud ECS, BIOS 3.0.ES.AL.P.087.05 04/07/2024
> 2024-12-26 16:17:45 [16002.136858] RIP: 0010:drm_sched_fini+0x3f/0xe0 [gpu_sched]
> 2024-12-26 16:17:45 [16002.143463] Code: 60 c6 87 be 00 00 00 01 e8 ce e0 90 d8 48 8d bb 80 00 00 00 e8 c2 e0 90 d8 8b 43 20 85 c0 74 51 45 31 e4 48 8b
> 43 28 4d 63 ec <4a> 8b 2c e8 48 89 ef e8 f5 0e 59 d9 48 8b 45 10 48 8d 55 10 48 39
> 2024-12-26 16:17:45 [16002.164992] RSP: 0018:ffffb570dbbb7da8 EFLAGS: 00010246
> 2024-12-26 16:17:45 [16002.171316] RAX: 0000000000000000 RBX: ffff96b0fdadc878 RCX: 0000000000000000
> 2024-12-26 16:17:46 [16002.179784] RDX: 000fffffffe00000 RSI: 0000000000000000 RDI: ffff96b0fdadc8f8
> 2024-12-26 16:17:46 [16002.188252] RBP: ffff96b0fdadc800 R08: ffff97abbd035040 R09: ffffffff9ac52540
> 2024-12-26 16:17:46 [16002.196722] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
> 2024-12-26 16:17:46 [16002.205179] R13: 0000000000000000 R14: ffff96b0fdadfc00 R15: 0000000000000000
> 2024-12-26 16:17:46 [16002.213648] FS:  00007f2737000740(0000) GS:ffff97abbd100000(0000) knlGS:0000000000000000
> 2024-12-26 16:17:46 [16002.223189] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> 2024-12-26 16:17:46 [16002.230103] CR2: 0000000000000000 CR3: 000000011be3a005 CR4: 0000000000f70ef0
> 2024-12-26 16:17:46 [16002.238581] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> 2024-12-26 16:17:46 [16002.247053] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
> e024se+0x3c/0x90 [amdxcp]
> 2024-12-26 16:17:46 [16002.337645]  __do_sys_delete_module.constprop.0+0x176/0x310
> 2024-12-26 16:17:46 [16002.344324]  do_syscall_64+0x5d/0x170
> 2024-12-26 16:17:46 [16002.348858]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> 2024-12-26 16:17:46 [16002.354956] RIP: 0033:0x7f2736a620cb-12-26
> 
> Fix it by removing xcp drm devices when failed to probe GPU devices.
> 
> Signed-off-by: Jiang Liu <gerry at linux.alibaba.com>
> Tested-by: Shuo Liu <shuox.liu at linux.alibaba.com>
> Reviewed-by: Lijo Lazar <lijo.lazar at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c    | 47 +++++++++++++++++++---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h    |  4 +-
>  5 files changed, 47 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5ff53a3b9851..510074a9074e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -6682,7 +6682,7 @@ void amdgpu_device_halt(struct amdgpu_device *adev)
>  	struct pci_dev *pdev = adev->pdev;
>  	struct drm_device *ddev = adev_to_drm(adev);
>  
> -	amdgpu_xcp_dev_unplug(adev);
> +	amdgpu_xcp_dev_deregister(adev);
>  	drm_dev_unplug(ddev);
>  
>  	amdgpu_irq_disable_all(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 62de668e9ff8..41d1b06be600 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2435,7 +2435,7 @@ amdgpu_pci_remove(struct pci_dev *pdev)
>  	struct drm_device *dev = pci_get_drvdata(pdev);
>  	struct amdgpu_device *adev = drm_to_adev(dev);
>  
> -	amdgpu_xcp_dev_unplug(adev);
> +	amdgpu_xcp_dev_deregister(adev);
>  	amdgpu_gmc_prepare_nps_mode_change(adev);
>  	drm_dev_unplug(dev);
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index d2a046736edd..be9147eb8308 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -1508,6 +1508,7 @@ void amdgpu_driver_release_kms(struct drm_device *dev)
>  	struct amdgpu_device *adev = drm_to_adev(dev);
>  
>  	amdgpu_device_fini_sw(adev);
> +	amdgpu_xcp_mgr_fini(adev);

Suggest to move this inside fini_sw()

>  	pci_set_drvdata(adev->pdev, NULL);
>  }
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
> index e209b5e101df..62dd5287808b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
> @@ -283,6 +283,33 @@ static int amdgpu_xcp_dev_alloc(struct amdgpu_device *adev)
>  	return 0;
>  }
>  
> +static void amdgpu_xcp_dev_free(struct amdgpu_device *adev)
> +{
> +	struct drm_device *p_ddev;
> +	int i;
> +
> +	if (!adev->xcp_mgr)
> +		return;
> +
> +	for (i = 1; i < MAX_XCP; i++) {
> +		if (!adev->xcp_mgr->xcp[i].ddev)
> +			break;
> +
> +		// Restore and free the original drm_device.
> +		p_ddev = adev->xcp_mgr->xcp[i].ddev;
> +		p_ddev->render->dev = adev->xcp_mgr->xcp[i].rdev;
> +		p_ddev->primary->dev = adev->xcp_mgr->xcp[i].pdev;
> +		p_ddev->driver =  adev->xcp_mgr->xcp[i].driver;
> +		p_ddev->vma_offset_manager = adev->xcp_mgr->xcp[i].vma_offset_manager;

Now that there are more calls, this doesn't make sense here. What about
moving the redirection along with register() (I guess it matters from
that point onwards) and undoing it (restore back saved values) along
with deregister()? With that, there won't be a need to have registered
flag. You may only need to check if xcp rdev/pdev is not NULL.

> +		amdgpu_xcp_drm_dev_free(p_ddev);
> +
> +		adev->xcp_mgr->xcp[i].ddev = NULL;
> +	}
> +
> +	adev->xcp_mgr->xcp->ddev = NULL;
> +}
> +
> +
>  int amdgpu_xcp_mgr_init(struct amdgpu_device *adev, int init_mode,
>  			int init_num_xcps,
>  			struct amdgpu_xcp_mgr_funcs *xcp_funcs)
> @@ -310,6 +337,13 @@ int amdgpu_xcp_mgr_init(struct amdgpu_device *adev, int init_mode,
>  	return amdgpu_xcp_dev_alloc(adev);
>  }
>  
> +void amdgpu_xcp_mgr_fini(struct amdgpu_device *adev)
> +{
> +	amdgpu_xcp_dev_free(adev);
> +	kfree(adev->xcp_mgr);
> +	adev->xcp_mgr = NULL;

Thanks for adding this.

Thanks,
Lijo

> +}
> +
>  int amdgpu_xcp_get_partition(struct amdgpu_xcp_mgr *xcp_mgr,
>  			     enum AMDGPU_XCP_IP_BLOCK ip, int instance)
>  {
> @@ -359,12 +393,14 @@ int amdgpu_xcp_dev_register(struct amdgpu_device *adev,
>  		ret = drm_dev_register(adev->xcp_mgr->xcp[i].ddev, ent->driver_data);
>  		if (ret)
>  			return ret;
> +
> +		adev->xcp_mgr->xcp[i].registered = true;
>  	}
>  
>  	return 0;
>  }
>  
> -void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev)
> +void amdgpu_xcp_dev_deregister(struct amdgpu_device *adev)
>  {
>  	struct drm_device *p_ddev;
>  	int i;
> @@ -377,11 +413,10 @@ void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev)
>  			break;
>  
>  		p_ddev = adev->xcp_mgr->xcp[i].ddev;
> -		drm_dev_unplug(p_ddev);
> -		p_ddev->render->dev = adev->xcp_mgr->xcp[i].rdev;
> -		p_ddev->primary->dev = adev->xcp_mgr->xcp[i].pdev;
> -		p_ddev->driver =  adev->xcp_mgr->xcp[i].driver;
> -		p_ddev->vma_offset_manager = adev->xcp_mgr->xcp[i].vma_offset_manager;
> +		if (adev->xcp_mgr->xcp[i].registered) {
> +			drm_dev_unplug(p_ddev);
> +			adev->xcp_mgr->xcp[i].registered = false;
> +		}
>  	}
>  }
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
> index b63f53242c57..be22d4398463 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
> @@ -101,6 +101,7 @@ struct amdgpu_xcp {
>  	uint8_t id;
>  	uint8_t mem_id;
>  	bool valid;
> +	bool registered;
>  	atomic_t	ref_cnt;
>  	struct drm_device *ddev;
>  	struct drm_device *rdev;
> @@ -155,6 +156,7 @@ int amdgpu_xcp_resume(struct amdgpu_xcp_mgr *xcp_mgr, int xcp_id);
>  
>  int amdgpu_xcp_mgr_init(struct amdgpu_device *adev, int init_mode,
>  			int init_xcps, struct amdgpu_xcp_mgr_funcs *xcp_funcs);
> +void amdgpu_xcp_mgr_fini(struct amdgpu_device *adev);
>  int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int num_xcps, int mode);
>  int amdgpu_xcp_query_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, u32 flags);
>  int amdgpu_xcp_switch_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, int mode);
> @@ -168,7 +170,7 @@ int amdgpu_xcp_get_inst_details(struct amdgpu_xcp *xcp,
>  
>  int amdgpu_xcp_dev_register(struct amdgpu_device *adev,
>  				const struct pci_device_id *ent);
> -void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev);
> +void amdgpu_xcp_dev_deregister(struct amdgpu_device *adev);
>  int amdgpu_xcp_open_device(struct amdgpu_device *adev,
>  			   struct amdgpu_fpriv *fpriv,
>  			   struct drm_file *file_priv);



More information about the amd-gfx mailing list