[PATCH v4 07/14] drm/amdgpu: Register IOMMU topology notifier per device.

Daniel Vetter daniel at ffwll.ch
Tue Jan 19 13:45:56 UTC 2021


On Tue, Jan 19, 2021 at 09:48:03AM +0100, Christian König wrote:
> Am 18.01.21 um 22:01 schrieb Andrey Grodzovsky:
> > Handle all DMA IOMMU gropup related dependencies before the
> > group is removed.
> > 
> > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  5 ++++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 46 ++++++++++++++++++++++++++++++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   |  2 +-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h   |  1 +
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 10 +++++++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  2 ++
> >   6 files changed, 65 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index 478a7d8..2953420 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -51,6 +51,7 @@
> >   #include <linux/dma-fence.h>
> >   #include <linux/pci.h>
> >   #include <linux/aer.h>
> > +#include <linux/notifier.h>
> >   #include <drm/ttm/ttm_bo_api.h>
> >   #include <drm/ttm/ttm_bo_driver.h>
> > @@ -1041,6 +1042,10 @@ struct amdgpu_device {
> >   	bool                            in_pci_err_recovery;
> >   	struct pci_saved_state          *pci_state;
> > +
> > +	struct notifier_block		nb;
> > +	struct blocking_notifier_head	notifier;
> > +	struct list_head		device_bo_list;
> >   };
> >   static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 45e23e3..e99f4f1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -70,6 +70,8 @@
> >   #include <drm/task_barrier.h>
> >   #include <linux/pm_runtime.h>
> > +#include <linux/iommu.h>
> > +
> >   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
> >   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
> >   MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
> > @@ -3200,6 +3202,39 @@ static const struct attribute *amdgpu_dev_attributes[] = {
> >   };
> > +static int amdgpu_iommu_group_notifier(struct notifier_block *nb,
> > +				     unsigned long action, void *data)
> > +{
> > +	struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, nb);
> > +	struct amdgpu_bo *bo = NULL;
> > +
> > +	/*
> > +	 * Following is a set of IOMMU group dependencies taken care of before
> > +	 * device's IOMMU group is removed
> > +	 */
> > +	if (action == IOMMU_GROUP_NOTIFY_DEL_DEVICE) {
> > +
> > +		spin_lock(&ttm_bo_glob.lru_lock);
> > +		list_for_each_entry(bo, &adev->device_bo_list, bo) {
> > +			if (bo->tbo.ttm)
> > +				ttm_tt_unpopulate(bo->tbo.bdev, bo->tbo.ttm);
> > +		}
> > +		spin_unlock(&ttm_bo_glob.lru_lock);
> 
> That approach won't work. ttm_tt_unpopulate() might sleep on an IOMMU lock.
> 
> You need to use a mutex here or even better make sure you can access the
> device_bo_list without a lock in this moment.

I'd also be worried about the notifier mutex getting really badly in the
way.

Plus I'm worried why we even need this, it sounds a bit like papering over
the iommu subsystem. Assuming we clean up all our iommu mappings in our
device hotunplug/unload code, why do we still need to have an additional
iommu notifier on top, with all kinds of additional headaches? The iommu
shouldn't clean up before the devices in its group have cleaned up.

I think we need more info here on what the exact problem is first.
-Daniel

> 
> Christian.
> 
> > +
> > +		if (adev->irq.ih.use_bus_addr)
> > +			amdgpu_ih_ring_fini(adev, &adev->irq.ih);
> > +		if (adev->irq.ih1.use_bus_addr)
> > +			amdgpu_ih_ring_fini(adev, &adev->irq.ih1);
> > +		if (adev->irq.ih2.use_bus_addr)
> > +			amdgpu_ih_ring_fini(adev, &adev->irq.ih2);
> > +
> > +		amdgpu_gart_dummy_page_fini(adev);
> > +	}
> > +
> > +	return NOTIFY_OK;
> > +}
> > +
> > +
> >   /**
> >    * amdgpu_device_init - initialize the driver
> >    *
> > @@ -3304,6 +3339,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> >   	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> > +	INIT_LIST_HEAD(&adev->device_bo_list);
> > +
> >   	adev->gfx.gfx_off_req_count = 1;
> >   	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
> > @@ -3575,6 +3612,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> >   	if (amdgpu_device_cache_pci_state(adev->pdev))
> >   		pci_restore_state(pdev);
> > +	BLOCKING_INIT_NOTIFIER_HEAD(&adev->notifier);
> > +	adev->nb.notifier_call = amdgpu_iommu_group_notifier;
> > +
> > +	if (adev->dev->iommu_group) {
> > +		r = iommu_group_register_notifier(adev->dev->iommu_group, &adev->nb);
> > +		if (r)
> > +			goto failed;
> > +	}
> > +
> >   	return 0;
> >   failed:
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> > index 0db9330..486ad6d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> > @@ -92,7 +92,7 @@ static int amdgpu_gart_dummy_page_init(struct amdgpu_device *adev)
> >    *
> >    * Frees the dummy page used by the driver (all asics).
> >    */
> > -static void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
> > +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
> >   {
> >   	if (!adev->dummy_page_addr)
> >   		return;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
> > index afa2e28..5678d9c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
> > @@ -61,6 +61,7 @@ int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev);
> >   void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev);
> >   int amdgpu_gart_init(struct amdgpu_device *adev);
> >   void amdgpu_gart_fini(struct amdgpu_device *adev);
> > +void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev);
> >   int amdgpu_gart_unbind(struct amdgpu_device *adev, uint64_t offset,
> >   		       int pages);
> >   int amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset,
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> > index 6cc9919..4a1de69 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> > @@ -94,6 +94,10 @@ static void amdgpu_bo_destroy(struct ttm_buffer_object *tbo)
> >   	}
> >   	amdgpu_bo_unref(&bo->parent);
> > +	spin_lock(&ttm_bo_glob.lru_lock);
> > +	list_del(&bo->bo);
> > +	spin_unlock(&ttm_bo_glob.lru_lock);
> > +
> >   	kfree(bo->metadata);
> >   	kfree(bo);
> >   }
> > @@ -613,6 +617,12 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
> >   	if (bp->type == ttm_bo_type_device)
> >   		bo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
> > +	INIT_LIST_HEAD(&bo->bo);
> > +
> > +	spin_lock(&ttm_bo_glob.lru_lock);
> > +	list_add_tail(&bo->bo, &adev->device_bo_list);
> > +	spin_unlock(&ttm_bo_glob.lru_lock);
> > +
> >   	return 0;
> >   fail_unreserve:
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> > index 9ac3756..5ae8555 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> > @@ -110,6 +110,8 @@ struct amdgpu_bo {
> >   	struct list_head		shadow_list;
> >   	struct kgd_mem                  *kfd_bo;
> > +
> > +	struct list_head		bo;
> >   };
> >   static inline struct amdgpu_bo *ttm_to_amdgpu_bo(struct ttm_buffer_object *tbo)
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


More information about the amd-gfx mailing list