[PATCH 1/3] drm/amdgpu: Fix entities for disabled HW blocks.
Bas Nieuwenhuizen
bas at basnieuwenhuizen.nl
Tue Jan 29 10:41:01 UTC 2019
On Tue, Jan 29, 2019 at 11:33 AM Christian König
<ckoenig.leichtzumerken at gmail.com> wrote:
>
> Am 29.01.19 um 11:20 schrieb Bas Nieuwenhuizen:
> > If we have some disabled HW blocks (say VCN), then the rings are
> > not initialized. This results in entities that refer to uninitialized
> > rqs (runqueues?).
> >
> > In normal usage this does not result in issues because userspace
> > generally knows to ignore the unsupported blocks, but e.g. setting
> > the priorities on all the entities resulted in a NULL access while
> > locking the rq spinlock.
> >
> > This could probably also be induced when actually adding a job for
> > the blocks with some less smart userspace.
>
> In general I agree that we need to improve the handling here. But this
> looks completely incorrect to me.
In what sense is this incorrect?
I'm fencing off all access to entities which use an uninitialized ring
(and therefore rq) and do not initialize/deinitialize them.
>
> We should always initialize all entities, but only with rq which are
> initialized as well.
>
> When this results in zero rq then we can later on handle that gracefully.
>
> Regards,
> Christian.
>
> >
> > Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
> > ---
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 31 +++++++++++++++++++------
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 1 +
> > 2 files changed, 25 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> > index d85184b5b35c..6f72ce785b32 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> > @@ -169,9 +169,13 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
> > for (j = 0; j < num_rings; ++j)
> > rqs[j] = &rings[j]->sched.sched_rq[priority];
> >
> > - for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j)
> > - r = drm_sched_entity_init(&ctx->entities[i][j].entity,
> > - rqs, num_rings, &ctx->guilty);
> > + for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
> > + ctx->entities[i][j].enabled = rings[0]->adev != NULL;
> > + if (ctx->entities[i][j].enabled) {
> > + r = drm_sched_entity_init(&ctx->entities[i][j].entity,
> > + rqs, num_rings, &ctx->guilty);
> > + }
> > + }
> > if (r)
> > goto error_cleanup_entities;
> > }
> > @@ -180,7 +184,8 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
> >
> > error_cleanup_entities:
> > for (i = 0; i < num_entities; ++i)
> > - drm_sched_entity_destroy(&ctx->entities[0][i].entity);
> > + if (ctx->entities[0][i].enabled)
> > + drm_sched_entity_destroy(&ctx->entities[0][i].entity);
> > kfree(ctx->entities[0]);
> >
> > error_free_fences:
> > @@ -229,6 +234,11 @@ int amdgpu_ctx_get_entity(struct amdgpu_ctx *ctx, u32 hw_ip, u32 instance,
> > return -EINVAL;
> > }
> >
> > + if (!ctx->entities[hw_ip][ring].enabled) {
> > + DRM_DEBUG("disabled ring: %d %d\n", hw_ip, ring);
> > + return -EINVAL;
> > + }
> > +
> > *entity = &ctx->entities[hw_ip][ring].entity;
> > return 0;
> > }
> > @@ -279,7 +289,8 @@ static void amdgpu_ctx_do_release(struct kref *ref)
> > num_entities += amdgpu_ctx_num_entities[i];
> >
> > for (i = 0; i < num_entities; i++)
> > - drm_sched_entity_destroy(&ctx->entities[0][i].entity);
> > + if (ctx->entities[0][i].enabled)
> > + drm_sched_entity_destroy(&ctx->entities[0][i].entity);
> >
> > amdgpu_ctx_fini(ref);
> > }
> > @@ -505,7 +516,9 @@ void amdgpu_ctx_priority_override(struct amdgpu_ctx *ctx,
> > for (i = 0; i < num_entities; i++) {
> > struct drm_sched_entity *entity = &ctx->entities[0][i].entity;
> >
> > - drm_sched_entity_set_priority(entity, ctx_prio);
> > +
> > +             if (ctx->entities[0][i].enabled)
> > + drm_sched_entity_set_priority(entity, ctx_prio);
> > }
> > }
> >
> > @@ -557,6 +570,9 @@ void amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr)
> > for (i = 0; i < num_entities; i++) {
> > struct drm_sched_entity *entity;
> >
> > + if (!ctx->entities[0][i].enabled)
> > + continue;
> > +
> > entity = &ctx->entities[0][i].entity;
> > max_wait = drm_sched_entity_flush(entity, max_wait);
> > }
> > @@ -584,7 +600,8 @@ void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
> > }
> >
> > for (i = 0; i < num_entities; i++)
> > - drm_sched_entity_fini(&ctx->entities[0][i].entity);
> > + if (ctx->entities[0][i].enabled)
> > + drm_sched_entity_fini(&ctx->entities[0][i].entity);
> > }
> > }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> > index b3b012c0a7da..183a783aedd8 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> > @@ -33,6 +33,7 @@ struct amdgpu_ctx_entity {
> > uint64_t sequence;
> > struct dma_fence **fences;
> > struct drm_sched_entity entity;
> > + bool enabled;
> > };
> >
> > struct amdgpu_ctx {
>
More information about the amd-gfx
mailing list