[PATCH 1/3] drm/amdgpu: Fix entities for disabled HW blocks.
Bas Nieuwenhuizen
bas at basnieuwenhuizen.nl
Tue Jan 29 10:41:01 UTC 2019
On Tue, Jan 29, 2019 at 11:33 AM Christian König
<ckoenig.leichtzumerken at gmail.com> wrote:
>
> Am 29.01.19 um 11:20 schrieb Bas Nieuwenhuizen:
> > If we have some disabled HW blocks (say VCN), then the rings are
> > not initialized. This results in entities that refer to uninitialized
> > rqs (runqueues?).
> >
> > In normal usage this does not result in issues because userspace
> > generally knows to ignore the unsupported blocks, but e.g. setting
> > the priorities on all the entities resulted in a NULL access while
> > locking the rq spinlock.
> >
> > This could probably also be induced when actually adding a job for
> > the blocks with some less smart userspace.
>
> In general I agree that we need to improve the handling here. But this
> looks completely incorrect to me.
In what sense is this incorrect?
I'm fencing off all access to entities which use an uninitialized ring
(and therefore rq) and do not initialize/deinitialize them.
>
> We should always initialize all entities, but only with rq which are
> initialized as well.
>
> When this results in zero rq then we can later on handle that gracefully.
>
> Regards,
> Christian.
>
> >
> > Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
> > ---
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 31 +++++++++++++++++++------
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 1 +
> > 2 files changed, 25 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> > index d85184b5b35c..6f72ce785b32 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> > @@ -169,9 +169,13 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
> > for (j = 0; j < num_rings; ++j)
> > rqs[j] = &rings[j]->sched.sched_rq[priority];
> >
> > - for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j)
> > - r = drm_sched_entity_init(&ctx->entities[i][j].entity,
> > - rqs, num_rings, &ctx->guilty);
> > + for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
> > + ctx->entities[i][j].enabled = rings[0]->adev != NULL;
> > + if (ctx->entities[i][j].enabled) {
> > + r = drm_sched_entity_init(&ctx->entities[i][j].entity,
> > + rqs, num_rings, &ctx->guilty);
> > + }
> > + }
> > if (r)
> > goto error_cleanup_entities;
> > }
> > @@ -180,7 +184,8 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
> >
> > error_cleanup_entities:
> > for (i = 0; i < num_entities; ++i)
> > - drm_sched_entity_destroy(&ctx->entities[0][i].entity);
> > + if (ctx->entities[0][i].enabled)
> > + drm_sched_entity_destroy(&ctx->entities[0][i].entity);
> > kfree(ctx->entities[0]);
> >
> > error_free_fences:
> > @@ -229,6 +234,11 @@ int amdgpu_ctx_get_entity(struct amdgpu_ctx *ctx, u32 hw_ip, u32 instance,
> > return -EINVAL;
> > }
> >
> > + if (!ctx->entities[hw_ip][ring].enabled) {
> > + DRM_DEBUG("disabled ring: %d %d\n", hw_ip, ring);
> > + return -EINVAL;
> > + }
> > +
> > *entity = &ctx->entities[hw_ip][ring].entity;
> > return 0;
> > }
> > @@ -279,7 +289,8 @@ static void amdgpu_ctx_do_release(struct kref *ref)
> > num_entities += amdgpu_ctx_num_entities[i];
> >
> > for (i = 0; i < num_entities; i++)
> > - drm_sched_entity_destroy(&ctx->entities[0][i].entity);
> > + if (ctx->entities[0][i].enabled)
> > + drm_sched_entity_destroy(&ctx->entities[0][i].entity);
> >
> > amdgpu_ctx_fini(ref);
> > }
> > @@ -505,7 +516,9 @@ void amdgpu_ctx_priority_override(struct amdgpu_ctx *ctx,
> > for (i = 0; i < num_entities; i++) {
> > struct drm_sched_entity *entity = &ctx->entities[0][i].entity;
> >
> > - drm_sched_entity_set_priority(entity, ctx_prio);
> > +
> > +             if (ctx->entities[0][i].enabled)
> > + drm_sched_entity_set_priority(entity, ctx_prio);
> > }
> > }
> >
> > @@ -557,6 +570,9 @@ void amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr)
> > for (i = 0; i < num_entities; i++) {
> > struct drm_sched_entity *entity;
> >
> > + if (!ctx->entities[0][i].enabled)
> > + continue;
> > +
> > entity = &ctx->entities[0][i].entity;
> > max_wait = drm_sched_entity_flush(entity, max_wait);
> > }
> > @@ -584,7 +600,8 @@ void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
> > }
> >
> > for (i = 0; i < num_entities; i++)
> > - drm_sched_entity_fini(&ctx->entities[0][i].entity);
> > + if (ctx->entities[0][i].enabled)
> > + drm_sched_entity_fini(&ctx->entities[0][i].entity);
> > }
> > }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> > index b3b012c0a7da..183a783aedd8 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> > @@ -33,6 +33,7 @@ struct amdgpu_ctx_entity {
> > uint64_t sequence;
> > struct dma_fence **fences;
> > struct drm_sched_entity entity;
> > + bool enabled;
> > };
> >
> > struct amdgpu_ctx {
>
More information about the amd-gfx
mailing list