[v3 3/6] drm/amdgpu: fix use after free bug related to amdgpu_driver_release_kms()

Gerry Liu gerry at linux.alibaba.com
Thu Jan 9 03:00:23 UTC 2025



> 2025年1月8日 17:54,Lazar, Lijo <lijo.lazar at amd.com> 写道:
> 
> 
> 
> On 1/8/2025 2:26 PM, Jiang Liu wrote:
>> If some GPU device failed to probe, `rmmod amdgpu` will trigger a use
>> after free bug related to amdgpu_driver_release_kms() as:
>> 2024-12-26 16:17:45 [16002.085540] BUG: kernel NULL pointer dereference, address: 0000000000000000
>> 2024-12-26 16:17:45 [16002.093792] #PF: supervisor read access in kernel mode
>> 2024-12-26 16:17:45 [16002.099993] #PF: error_code(0x0000) - not-present page
>> 2024-12-26 16:17:45 [16002.106188] PGD 0 P4D 0
>> 2024-12-26 16:17:45 [16002.109464] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI
>> 2024-12-26 16:17:45 [16002.115372] CPU: 2 PID: 14375 Comm: rmmod Kdump: loaded Tainted: G        W   E      6.10.0+ #2
>> 2024-12-26 16:17:45 [16002.125577] Hardware name: Alibaba Alibaba Cloud ECS/Alibaba Cloud ECS, BIOS 3.0.ES.AL.P.087.05 04/07/2024
>> 2024-12-26 16:17:45 [16002.136858] RIP: 0010:drm_sched_fini+0x3f/0xe0 [gpu_sched]
>> 2024-12-26 16:17:45 [16002.143463] Code: 60 c6 87 be 00 00 00 01 e8 ce e0 90 d8 48 8d bb 80 00 00 00 e8 c2 e0 90 d8 8b 43 20 85 c0 74 51 45 31 e4 48 8b
>> 43 28 4d 63 ec <4a> 8b 2c e8 48 89 ef e8 f5 0e 59 d9 48 8b 45 10 48 8d 55 10 48 39
>> 2024-12-26 16:17:45 [16002.164992] RSP: 0018:ffffb570dbbb7da8 EFLAGS: 00010246
>> 2024-12-26 16:17:45 [16002.171316] RAX: 0000000000000000 RBX: ffff96b0fdadc878 RCX: 0000000000000000
>> 2024-12-26 16:17:46 [16002.179784] RDX: 000fffffffe00000 RSI: 0000000000000000 RDI: ffff96b0fdadc8f8
>> 2024-12-26 16:17:46 [16002.188252] RBP: ffff96b0fdadc800 R08: ffff97abbd035040 R09: ffffffff9ac52540
>> 2024-12-26 16:17:46 [16002.196722] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
>> 2024-12-26 16:17:46 [16002.205179] R13: 0000000000000000 R14: ffff96b0fdadfc00 R15: 0000000000000000
>> 2024-12-26 16:17:46 [16002.213648] FS:  00007f2737000740(0000) GS:ffff97abbd100000(0000) knlGS:0000000000000000
>> 2024-12-26 16:17:46 [16002.223189] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> 2024-12-26 16:17:46 [16002.230103] CR2: 0000000000000000 CR3: 000000011be3a005 CR4: 0000000000f70ef0
>> 2024-12-26 16:17:46 [16002.238581] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>> 2024-12-26 16:17:46 [16002.247053] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
>> e024se+0x3c/0x90 [amdxcp]
>> 2024-12-26 16:17:46 [16002.337645]  __do_sys_delete_module.constprop.0+0x176/0x310
>> 2024-12-26 16:17:46 [16002.344324]  do_syscall_64+0x5d/0x170
>> 2024-12-26 16:17:46 [16002.348858]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
>> 2024-12-26 16:17:46 [16002.354956] RIP: 0033:0x7f2736a620cb-12-26
>> 
>> Fix it by removing xcp drm devices when failed to probe GPU devices.
>> 
>> Signed-off-by: Jiang Liu <gerry at linux.alibaba.com>
>> Tested-by: Shuo Liu <shuox.liu at linux.alibaba.com>
>> Reviewed-by: Lijo Lazar <lijo.lazar at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  2 +-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |  1 +
>> drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c    | 47 +++++++++++++++++++---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h    |  4 +-
>> 5 files changed, 47 insertions(+), 9 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 5ff53a3b9851..510074a9074e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -6682,7 +6682,7 @@ void amdgpu_device_halt(struct amdgpu_device *adev)
>> 	struct pci_dev *pdev = adev->pdev;
>> 	struct drm_device *ddev = adev_to_drm(adev);
>> 
>> -	amdgpu_xcp_dev_unplug(adev);
>> +	amdgpu_xcp_dev_deregister(adev);
>> 	drm_dev_unplug(ddev);
>> 
>> 	amdgpu_irq_disable_all(adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 62de668e9ff8..41d1b06be600 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -2435,7 +2435,7 @@ amdgpu_pci_remove(struct pci_dev *pdev)
>> 	struct drm_device *dev = pci_get_drvdata(pdev);
>> 	struct amdgpu_device *adev = drm_to_adev(dev);
>> 
>> -	amdgpu_xcp_dev_unplug(adev);
>> +	amdgpu_xcp_dev_deregister(adev);
>> 	amdgpu_gmc_prepare_nps_mode_change(adev);
>> 	drm_dev_unplug(dev);
>> 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> index d2a046736edd..be9147eb8308 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> @@ -1508,6 +1508,7 @@ void amdgpu_driver_release_kms(struct drm_device *dev)
>> 	struct amdgpu_device *adev = drm_to_adev(dev);
>> 
>> 	amdgpu_device_fini_sw(adev);
>> +	amdgpu_xcp_mgr_fini(adev);
> 
> Suggest to move this inside fini_sw()
> 
>> 	pci_set_drvdata(adev->pdev, NULL);
>> }
>> 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
>> index e209b5e101df..62dd5287808b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
>> @@ -283,6 +283,33 @@ static int amdgpu_xcp_dev_alloc(struct amdgpu_device *adev)
>> 	return 0;
>> }
>> 
>> +static void amdgpu_xcp_dev_free(struct amdgpu_device *adev)
>> +{
>> +	struct drm_device *p_ddev;
>> +	int i;
>> +
>> +	if (!adev->xcp_mgr)
>> +		return;
>> +
>> +	for (i = 1; i < MAX_XCP; i++) {
>> +		if (!adev->xcp_mgr->xcp[i].ddev)
>> +			break;
>> +
>> +		// Restore and free the original drm_device.
>> +		p_ddev = adev->xcp_mgr->xcp[i].ddev;
>> +		p_ddev->render->dev = adev->xcp_mgr->xcp[i].rdev;
>> +		p_ddev->primary->dev = adev->xcp_mgr->xcp[i].pdev;
>> +		p_ddev->driver =  adev->xcp_mgr->xcp[i].driver;
>> +		p_ddev->vma_offset_manager = adev->xcp_mgr->xcp[i].vma_offset_manager;
> 
> Now that there are more calls, this doesn't make sense here. What about
> moving the redirection along with register() (I guess it matters from
> that point onwards) and undoing it (restore back saved values) along
> with deregister()? With that, there won't be a need to have registered
> flag. You may only need to check if xcp rdev/pdev is not NULL.
Good point, this makes code more clear.


> 
>> +		amdgpu_xcp_drm_dev_free(p_ddev);
>> +
>> +		adev->xcp_mgr->xcp[i].ddev = NULL;
>> +	}
>> +
>> +	adev->xcp_mgr->xcp->ddev = NULL;
>> +}
>> +
>> +
>> int amdgpu_xcp_mgr_init(struct amdgpu_device *adev, int init_mode,
>> 			int init_num_xcps,
>> 			struct amdgpu_xcp_mgr_funcs *xcp_funcs)
>> @@ -310,6 +337,13 @@ int amdgpu_xcp_mgr_init(struct amdgpu_device *adev, int init_mode,
>> 	return amdgpu_xcp_dev_alloc(adev);
>> }
>> 
>> +void amdgpu_xcp_mgr_fini(struct amdgpu_device *adev)
>> +{
>> +	amdgpu_xcp_dev_free(adev);
>> +	kfree(adev->xcp_mgr);
>> +	adev->xcp_mgr = NULL;
> 
> Thanks for adding this.
> 
> Thanks,
> Lijo
> 
>> +}
>> +
>> int amdgpu_xcp_get_partition(struct amdgpu_xcp_mgr *xcp_mgr,
>> 			     enum AMDGPU_XCP_IP_BLOCK ip, int instance)
>> {
>> @@ -359,12 +393,14 @@ int amdgpu_xcp_dev_register(struct amdgpu_device *adev,
>> 		ret = drm_dev_register(adev->xcp_mgr->xcp[i].ddev, ent->driver_data);
>> 		if (ret)
>> 			return ret;
>> +
>> +		adev->xcp_mgr->xcp[i].registered = true;
>> 	}
>> 
>> 	return 0;
>> }
>> 
>> -void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev)
>> +void amdgpu_xcp_dev_deregister(struct amdgpu_device *adev)
>> {
>> 	struct drm_device *p_ddev;
>> 	int i;
>> @@ -377,11 +413,10 @@ void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev)
>> 			break;
>> 
>> 		p_ddev = adev->xcp_mgr->xcp[i].ddev;
>> -		drm_dev_unplug(p_ddev);
>> -		p_ddev->render->dev = adev->xcp_mgr->xcp[i].rdev;
>> -		p_ddev->primary->dev = adev->xcp_mgr->xcp[i].pdev;
>> -		p_ddev->driver =  adev->xcp_mgr->xcp[i].driver;
>> -		p_ddev->vma_offset_manager = adev->xcp_mgr->xcp[i].vma_offset_manager;
>> +		if (adev->xcp_mgr->xcp[i].registered) {
>> +			drm_dev_unplug(p_ddev);
>> +			adev->xcp_mgr->xcp[i].registered = false;
>> +		}
>> 	}
>> }
>> 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
>> index b63f53242c57..be22d4398463 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
>> @@ -101,6 +101,7 @@ struct amdgpu_xcp {
>> 	uint8_t id;
>> 	uint8_t mem_id;
>> 	bool valid;
>> +	bool registered;
>> 	atomic_t	ref_cnt;
>> 	struct drm_device *ddev;
>> 	struct drm_device *rdev;
>> @@ -155,6 +156,7 @@ int amdgpu_xcp_resume(struct amdgpu_xcp_mgr *xcp_mgr, int xcp_id);
>> 
>> int amdgpu_xcp_mgr_init(struct amdgpu_device *adev, int init_mode,
>> 			int init_xcps, struct amdgpu_xcp_mgr_funcs *xcp_funcs);
>> +void amdgpu_xcp_mgr_fini(struct amdgpu_device *adev);
>> int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int num_xcps, int mode);
>> int amdgpu_xcp_query_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, u32 flags);
>> int amdgpu_xcp_switch_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, int mode);
>> @@ -168,7 +170,7 @@ int amdgpu_xcp_get_inst_details(struct amdgpu_xcp *xcp,
>> 
>> int amdgpu_xcp_dev_register(struct amdgpu_device *adev,
>> 				const struct pci_device_id *ent);
>> -void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev);
>> +void amdgpu_xcp_dev_deregister(struct amdgpu_device *adev);
>> int amdgpu_xcp_open_device(struct amdgpu_device *adev,
>> 			   struct amdgpu_fpriv *fpriv,
>> 			   struct drm_file *file_priv);

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20250109/9741b823/attachment-0001.htm>


More information about the amd-gfx mailing list