[PATCH] Revert "drm/amdgpu: remove ctx->lock"

Alex Deucher alexdeucher at gmail.com
Mon Jun 27 21:30:46 UTC 2022


On Tue, Jun 21, 2022 at 10:42 AM Luben Tuikov <luben.tuikov at amd.com> wrote:
>
> This reverts commit e68efb27647f2106d6b545667f35b2ea39746b57.
>
> We see that the bo validate list gets corrupted, in
> amdgpu_cs_list_validate(), the lobj->tv.bo is NULL. Then getting usermm on
> the next line, references a NULL bo and we get a koops.
>
> Bisecting leads to the commit being reverted as the cause of the list
> corruption. See the link below for details of the corruption failure.
>
> Cc: Christian König <christian.koenig at amd.com>
> Cc: Andrey Grodzovsky <Andrey.Grodzovsky at amd.com>
> Cc: Alex Deucher <Alexander.Deucher at amd.com>
> Cc: Vitaly Prosyak <Vitaly.Prosyak at amd.com>
> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2048#note_1427539

Looks like this bug is also relevant:

Bug: https://bugzilla.kernel.org/show_bug.cgi?id=216143

> Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 16 +++++-----------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c |  2 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |  1 +
>  3 files changed, 8 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 36ac1f1d11e6b4..e619031753b927 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -128,6 +128,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>                 goto free_chunk;
>         }
>
> +       mutex_lock(&p->ctx->lock);
> +
>         /* skip guilty context job */
>         if (atomic_read(&p->ctx->guilty) == 1) {
>                 ret = -ECANCELED;
> @@ -585,16 +587,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>                 }
>         }
>
> -       /* Move fence waiting after getting reservation lock of
> -        * PD root. Then there is no need on a ctx mutex lock.
> -        */
> -       r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
> -       if (unlikely(r != 0)) {
> -               if (r != -ERESTARTSYS)
> -                       DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
> -               goto error_validate;
> -       }
> -
>         amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
>                                           &p->bytes_moved_vis_threshold);
>         p->bytes_moved = 0;
> @@ -722,6 +714,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
>         dma_fence_put(parser->fence);
>
>         if (parser->ctx) {
> +               mutex_unlock(&parser->ctx->lock);
>                 amdgpu_ctx_put(parser->ctx);
>         }
>         if (parser->bo_list)
> @@ -965,7 +958,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>         if (parser->job->uf_addr && ring->funcs->no_user_fence)
>                 return -EINVAL;
>
> -       return 0;
> +       return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->entity);
>  }
>
>  static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
> @@ -1384,6 +1377,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>                 goto out;
>
>         r = amdgpu_cs_submit(&parser, cs);
> +
>  out:
>         amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index 53f9268366f29e..2ef5296216d64d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -286,6 +286,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,
>         kref_init(&ctx->refcount);
>         ctx->mgr = mgr;
>         spin_lock_init(&ctx->ring_lock);
> +       mutex_init(&ctx->lock);
>
>         ctx->reset_counter = atomic_read(&mgr->adev->gpu_reset_counter);
>         ctx->reset_counter_query = ctx->reset_counter;
> @@ -400,6 +401,7 @@ static void amdgpu_ctx_fini(struct kref *ref)
>                 drm_dev_exit(idx);
>         }
>
> +       mutex_destroy(&ctx->lock);
>         kfree(ctx);
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> index 0fa0e56daf67e9..cc7c8afff4144c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> @@ -53,6 +53,7 @@ struct amdgpu_ctx {
>         bool                            preamble_presented;
>         int32_t                         init_priority;
>         int32_t                         override_priority;
> +       struct mutex                    lock;
>         atomic_t                        guilty;
>         unsigned long                   ras_counter_ce;
>         unsigned long                   ras_counter_ue;
>
> base-commit: f4b3c543a2a759ce657de4b6b7e88eaddee85ec2
> --
> 2.36.1.74.g277cf0bc36
>


More information about the amd-gfx mailing list