[Intel-xe] [PATCH] drm/xe: fix pvc unload issue

Matthew Auld matthew.william.auld at gmail.com
Fri Mar 31 05:40:06 UTC 2023


On Thu, 30 Mar 2023 at 16:40, Chang, Bruce <yu.bruce.chang at intel.com> wrote:
>
> Currently, unload pvc driver will generate a null dereference
> and the call stack is as below.
>
> [ 4850.618000] Call Trace:
> [ 4850.620740]  <TASK>
> [ 4850.623134]  ttm_bo_cleanup_memtype_use+0x3f/0x50 [ttm]
> [ 4850.628661]  ttm_bo_release+0x154/0x2c0 [ttm]
> [ 4850.633317]  ? drm_buddy_fini+0x62/0x80 [drm_buddy]
> [ 4850.638487]  ? __kmem_cache_free+0x27d/0x2c0
> [ 4850.643054]  ttm_bo_put+0x38/0x60 [ttm]
> [ 4850.647190]  xe_gem_object_free+0x1f/0x30 [xe]
> [ 4850.651945]  drm_gem_object_free+0x1e/0x30 [drm]
> [ 4850.656904]  ggtt_fini_noalloc+0x9d/0xe0 [xe]
> [ 4850.661574]  drm_managed_release+0xb5/0x150 [drm]
> [ 4850.666617]  drm_dev_release+0x30/0x50 [drm]
> [ 4850.671209]  devm_drm_dev_init_release+0x3c/0x60 [drm]
>
> There are a couple issues, but the main one is due to TTM has only
> one TTM_PL_TT region, but since pvc has 2 tiles and tries to setup
> 1 TTM_PL_TT each tile. The second will overwrite the first one.
>
> During unload time, the first tile will reset the TTM_PL_TT manger
> and when the second tile is trying to free Bo and it will generate
> the null reference since the TTM manage is already got reset to 0.
>
> The fix is to share the TTM_PL_TT manager and use a count to only
> allow the last instance to release.

Would it make sense to rather move it out of GT init, and just have
one instance in the xe_device? And with that to rather consider
xe->mem.vram_size, instead of gt->mem.vram_size when considering the
potential "GTT" size? I think PL_TT is just system memory that can be
touched by the GPU device (likely has a gtt mapping), so not sure why
that needs to be GT centric.

>
> Cc: Stuart Summers <stuart.summers at intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> Signed-off-by: Bruce Chang <yu.bruce.chang at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_device_types.h |  3 +++
>  drivers/gpu/drm/xe/xe_gt.c           |  9 +++++++--
>  drivers/gpu/drm/xe/xe_ttm_gtt_mgr.c  | 25 +++++++++++++++----------
>  3 files changed, 25 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index 88f863edc41c..ea771f120c0f 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -195,6 +195,9 @@ struct xe_device {
>                         /** @mapping: pointer to VRAM mappable space */
>                         void *__iomem mapping;
>                 } vram;
> +               /** @gtt_mr: GTT TTM manager */
> +               struct xe_ttm_gtt_mgr *gtt_mgr;
> +               int instance;
>         } mem;
>
>         /** @usm: unified memory state */
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index fd7a5b43ba3e..e7ca141478ef 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -77,8 +77,13 @@ int xe_gt_alloc(struct xe_device *xe, struct xe_gt *gt)
>                 if (!gt->mem.vram_mgr)
>                         return -ENOMEM;
>
> -               gt->mem.gtt_mgr = drmm_kzalloc(drm, sizeof(*gt->mem.gtt_mgr),
> -                                              GFP_KERNEL);
> +               if (!xe->mem.gtt_mgr) {
> +                       xe->mem.gtt_mgr =
> +                               drmm_kzalloc(drm, sizeof(*gt->mem.gtt_mgr),
> +                                            GFP_KERNEL);
> +                       xe->mem.instance = 0;
> +               }
> +               gt->mem.gtt_mgr = xe->mem.gtt_mgr;
>                 if (!gt->mem.gtt_mgr)
>                         return -ENOMEM;
>         } else {
> diff --git a/drivers/gpu/drm/xe/xe_ttm_gtt_mgr.c b/drivers/gpu/drm/xe/xe_ttm_gtt_mgr.c
> index 8075781070f2..05300c71928e 100644
> --- a/drivers/gpu/drm/xe/xe_ttm_gtt_mgr.c
> +++ b/drivers/gpu/drm/xe/xe_ttm_gtt_mgr.c
> @@ -94,6 +94,9 @@ static void ttm_gtt_mgr_fini(struct drm_device *drm, void *arg)
>         struct ttm_resource_manager *man = &mgr->manager;
>         int err;
>
> +       if (--xe->mem.instance)
> +               return;
> +
>         ttm_resource_manager_set_used(man, false);
>
>         err = ttm_resource_manager_evict_all(&xe->ttm, man);
> @@ -113,18 +116,20 @@ int xe_ttm_gtt_mgr_init(struct xe_gt *gt, struct xe_ttm_gtt_mgr *mgr,
>
>         XE_BUG_ON(xe_gt_is_media_type(gt));
>
> -       mgr->gt = gt;
> -       man->use_tt = true;
> -       man->func = &xe_ttm_gtt_mgr_func;
> -
> -       ttm_resource_manager_init(man, &xe->ttm, gtt_size >> PAGE_SHIFT);
> +       if (!xe->mem.instance) {
> +               mgr->gt = gt;
> +               man->use_tt = true;
> +               man->func = &xe_ttm_gtt_mgr_func;
>
> -       ttm_set_driver_manager(&xe->ttm, XE_PL_TT, &mgr->manager);
> -       ttm_resource_manager_set_used(man, true);
> +               ttm_resource_manager_init(man, &xe->ttm, gtt_size >> PAGE_SHIFT);
>
> -       err = drmm_add_action_or_reset(&xe->drm, ttm_gtt_mgr_fini, mgr);
> -       if (err)
> -               return err;
> +               ttm_set_driver_manager(&xe->ttm, XE_PL_TT, &mgr->manager);
> +               ttm_resource_manager_set_used(man, true);
> +               err = drmm_add_action_or_reset(&xe->drm, ttm_gtt_mgr_fini, mgr);
> +               if (err)
> +                       return err;
> +       }
> +       xe->mem.instance ++;
>
>         return 0;
>  }
> --
> 2.25.1
>


More information about the Intel-xe mailing list