[PATCH] drm/amdgpu: allocate entities on demand
Christian König
christian.koenig at amd.com
Fri Jan 24 12:09:42 UTC 2020
On 24.01.20 at 12:53, Nirmoy Das wrote:
> Currently we pre-allocate entities and fences for all the HW IPs on
> context creation, some of which might never be used.
>
> This patch tries to resolve that entity/fence wastage by creating an
> entity only when needed.
>
> Signed-off-by: Nirmoy Das <nirmoy.das at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 244 +++++++++++++-----------
> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 2 +-
> 2 files changed, 135 insertions(+), 111 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index 05c2af61e7de..73f7615df8c1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -42,19 +42,12 @@ const unsigned int amdgpu_ctx_num_entities[AMDGPU_HW_IP_NUM] = {
> [AMDGPU_HW_IP_VCN_JPEG] = 1,
> };
>
> -static int amdgpu_ctx_total_num_entities(void)
> -{
> - unsigned i, num_entities = 0;
> -
> - for (i = 0; i < AMDGPU_HW_IP_NUM; ++i)
> - num_entities += amdgpu_ctx_num_entities[i];
> -
> - return num_entities;
> -}
> -
> static int amdgpu_ctx_priority_permit(struct drm_file *filp,
> enum drm_sched_priority priority)
> {
> + if (priority < 0 || priority >= DRM_SCHED_PRIORITY_MAX)
> + return -EINVAL;
> +
> /* NORMAL and below are accessible by everyone */
> if (priority <= DRM_SCHED_PRIORITY_NORMAL)
> return 0;
> @@ -68,64 +61,44 @@ static int amdgpu_ctx_priority_permit(struct drm_file *filp,
> return -EACCES;
> }
>
> -static int amdgpu_ctx_init(struct amdgpu_device *adev,
> - enum drm_sched_priority priority,
> - struct drm_file *filp,
> - struct amdgpu_ctx *ctx)
> +static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, const u32 hw_ip, const u32 ring)
> {
> - unsigned num_entities = amdgpu_ctx_total_num_entities();
> - unsigned i, j;
> + struct amdgpu_device *adev = ctx->adev;
> + struct amdgpu_ctx_entity *entity;
> + struct drm_gpu_scheduler **scheds;
> + struct drm_gpu_scheduler *sched;
> + unsigned num_scheds = 0;
> + enum drm_sched_priority priority;
> int r;
>
> - if (priority < 0 || priority >= DRM_SCHED_PRIORITY_MAX)
> - return -EINVAL;
> -
> - r = amdgpu_ctx_priority_permit(filp, priority);
> - if (r)
> - return r;
> -
> - memset(ctx, 0, sizeof(*ctx));
> - ctx->adev = adev;
> -
> -
> - ctx->entities[0] = kcalloc(num_entities,
> - sizeof(struct amdgpu_ctx_entity),
> - GFP_KERNEL);
> - if (!ctx->entities[0])
> - return -ENOMEM;
> -
> -
> - for (i = 0; i < num_entities; ++i) {
> - struct amdgpu_ctx_entity *entity = &ctx->entities[0][i];
> -
> - entity->sequence = 1;
> - entity->fences = kcalloc(amdgpu_sched_jobs,
> - sizeof(struct dma_fence*), GFP_KERNEL);
> - if (!entity->fences) {
> - r = -ENOMEM;
> - goto error_cleanup_memory;
> - }
> + if (!ctx->entities[hw_ip]) {
> + ctx->entities[hw_ip] = kcalloc(amdgpu_ctx_num_entities[hw_ip],
> + sizeof(struct amdgpu_ctx_entity *),
> + GFP_KERNEL);
> + if (!ctx->entities[hw_ip])
> + return -ENOMEM;
That's complete overkill; just statically allocate the array in the
amdgpu_ctx structure.
The maximum instance count should be 4 IIRC, so something like "struct
amdgpu_ctx_entity *entities[AMDGPU_HW_IP_NUM][4];" would use at most 288
bytes.
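Untested sketch of what I mean, with AMDGPU_MAX_ENTITY_NUM as a made-up
define here for the maximum of 4 instances:

    #define AMDGPU_MAX_ENTITY_NUM 4

    struct amdgpu_ctx {
            ...
            /* 9 IPs * 4 instances * 8 byte pointers = 288 bytes */
            struct amdgpu_ctx_entity *entities[AMDGPU_HW_IP_NUM]
                                              [AMDGPU_MAX_ENTITY_NUM];
            ...
    };

amdgpu_ctx_init_entity() then only needs the per-entity allocation:

    entity = kzalloc(sizeof(*entity), GFP_KERNEL);
    if (!entity)
            return -ENOMEM;
    ...
    ctx->entities[hw_ip][ring] = entity;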
The only alternative I see would be to allocate the array behind the
structure; see dma_resv_list_alloc() for an example of how to do this.
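For reference, that pattern boils down to a flexible array member sized
with struct_size() at allocation time, roughly (untested):

    struct amdgpu_ctx {
            ...
            struct amdgpu_ctx_entity *entities[];
    };

    /* num_entities: total number of entity slots needed */
    ctx = kzalloc(struct_size(ctx, entities, num_entities), GFP_KERNEL);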
But I don't think that this is worth it.
Christian.
> }
> - for (i = 1; i < AMDGPU_HW_IP_NUM; ++i)
> - ctx->entities[i] = ctx->entities[i - 1] +
> - amdgpu_ctx_num_entities[i - 1];
>
> - kref_init(&ctx->refcount);
> - spin_lock_init(&ctx->ring_lock);
> - mutex_init(&ctx->lock);
> + ctx->entities[hw_ip][ring] = kcalloc(1, sizeof(struct amdgpu_ctx_entity),
> + GFP_KERNEL);
> + if (!ctx->entities[hw_ip][ring]) {
> + r = -ENOMEM;
> + goto error_free_entity;
> + }
>
> - ctx->reset_counter = atomic_read(&adev->gpu_reset_counter);
> - ctx->reset_counter_query = ctx->reset_counter;
> - ctx->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
> - ctx->init_priority = priority;
> - ctx->override_priority = DRM_SCHED_PRIORITY_UNSET;
> + entity = ctx->entities[hw_ip][ring];
>
> - for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
> - struct drm_gpu_scheduler **scheds;
> - struct drm_gpu_scheduler *sched;
> - unsigned num_scheds = 0;
> + entity->sequence = 1;
> + entity->fences = kcalloc(amdgpu_sched_jobs,
> + sizeof(struct dma_fence*), GFP_KERNEL);
> + if (!entity->fences) {
> + r = -ENOMEM;
> + goto error_free_entity;
> + }
>
> - switch (i) {
> + priority = (ctx->override_priority == DRM_SCHED_PRIORITY_UNSET) ?
> + ctx->init_priority : ctx->override_priority;
> + switch (hw_ip) {
> case AMDGPU_HW_IP_GFX:
> sched = &adev->gfx.gfx_ring[0].sched;
> scheds = &sched;
> @@ -166,57 +139,85 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
> scheds = adev->jpeg.jpeg_sched;
> num_scheds = adev->jpeg.num_jpeg_sched;
> break;
> - }
> -
> - for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j)
> - r = drm_sched_entity_init(&ctx->entities[i][j].entity,
> - priority, scheds,
> - num_scheds, &ctx->guilty);
> - if (r)
> - goto error_cleanup_entities;
> }
>
> + r = drm_sched_entity_init(&ctx->entities[hw_ip][ring]->entity,
> + priority, scheds,
> + num_scheds, &ctx->guilty);
> + if (r)
> + goto error_free_entity;
> +
> return 0;
>
> -error_cleanup_entities:
> - for (i = 0; i < num_entities; ++i)
> - drm_sched_entity_destroy(&ctx->entities[0][i].entity);
> +error_free_entity:
> + kfree(ctx->entities[hw_ip][ring]);
> + ctx->entities[hw_ip][ring] = NULL;
> +
> + return r;
> +}
>
> -error_cleanup_memory:
> - for (i = 0; i < num_entities; ++i) {
> - struct amdgpu_ctx_entity *entity = &ctx->entities[0][i];
> +static int amdgpu_ctx_init(struct amdgpu_device *adev,
> + enum drm_sched_priority priority,
> + struct drm_file *filp,
> + struct amdgpu_ctx *ctx)
> +{
> + int r;
>
> - kfree(entity->fences);
> - entity->fences = NULL;
> - }
> + r = amdgpu_ctx_priority_permit(filp, priority);
> + if (r)
> + return r;
> +
> + memset(ctx, 0, sizeof(*ctx));
> +
> + ctx->adev = adev;
> +
> + kref_init(&ctx->refcount);
> + spin_lock_init(&ctx->ring_lock);
> + mutex_init(&ctx->lock);
> +
> + ctx->reset_counter = atomic_read(&adev->gpu_reset_counter);
> + ctx->reset_counter_query = ctx->reset_counter;
> + ctx->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
> + ctx->init_priority = priority;
> + ctx->override_priority = DRM_SCHED_PRIORITY_UNSET;
> +
> + return 0;
>
> - kfree(ctx->entities[0]);
> - ctx->entities[0] = NULL;
> - return r;
> }
>
> static void amdgpu_ctx_fini(struct kref *ref)
> {
> struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx, refcount);
> - unsigned num_entities = amdgpu_ctx_total_num_entities();
> struct amdgpu_device *adev = ctx->adev;
> - unsigned i, j;
> + unsigned i, j, k;
>
> if (!adev)
> return;
>
> - for (i = 0; i < num_entities; ++i) {
> - struct amdgpu_ctx_entity *entity = &ctx->entities[0][i];
> + for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
> + for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
> + struct amdgpu_ctx_entity *entity;
> +
> + if (!ctx->entities[i] || !ctx->entities[i][j])
> + continue;
> +
> + entity = ctx->entities[i][j];
> + if (!entity->fences)
> + continue;
>
> - for (j = 0; j < amdgpu_sched_jobs; ++j)
> - dma_fence_put(entity->fences[j]);
> + for (k = 0; k < amdgpu_sched_jobs; ++k)
> + dma_fence_put(entity->fences[k]);
>
> - kfree(entity->fences);
> + kfree(entity->fences);
> + entity->fences = NULL;
> +
> + kfree(entity);
> + ctx->entities[i][j] = NULL;
> + }
> +
> + kfree(ctx->entities[i]);
> + ctx->entities[i] = NULL;
> }
>
> - kfree(ctx->entities[0]);
> mutex_destroy(&ctx->lock);
> -
> kfree(ctx);
> }
>
> @@ -239,7 +240,11 @@ int amdgpu_ctx_get_entity(struct amdgpu_ctx *ctx, u32 hw_ip, u32 instance,
> return -EINVAL;
> }
>
> - *entity = &ctx->entities[hw_ip][ring].entity;
> + if (!ctx->entities[hw_ip] || !ctx->entities[hw_ip][ring]) {
> + int r = amdgpu_ctx_init_entity(ctx, hw_ip, ring);
> +
> + if (r)
> + return r;
> + }
> +
> + *entity = &ctx->entities[hw_ip][ring]->entity;
> return 0;
> }
>
> @@ -279,14 +284,17 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
> static void amdgpu_ctx_do_release(struct kref *ref)
> {
> struct amdgpu_ctx *ctx;
> - unsigned num_entities;
> - u32 i;
> + u32 i, j;
>
> ctx = container_of(ref, struct amdgpu_ctx, refcount);
> + for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
> + for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
> + if (!ctx->entities[i] || !ctx->entities[i][j])
> + continue;
>
> - num_entities = amdgpu_ctx_total_num_entities();
> - for (i = 0; i < num_entities; i++)
> - drm_sched_entity_destroy(&ctx->entities[0][i].entity);
> + drm_sched_entity_destroy(&ctx->entities[i][j]->entity);
> + }
> + }
>
> amdgpu_ctx_fini(ref);
> }
> @@ -516,19 +524,23 @@ struct dma_fence *amdgpu_ctx_get_fence(struct amdgpu_ctx *ctx,
> void amdgpu_ctx_priority_override(struct amdgpu_ctx *ctx,
> enum drm_sched_priority priority)
> {
> - unsigned num_entities = amdgpu_ctx_total_num_entities();
> enum drm_sched_priority ctx_prio;
> - unsigned i;
> + unsigned i, j;
>
> ctx->override_priority = priority;
>
> ctx_prio = (ctx->override_priority == DRM_SCHED_PRIORITY_UNSET) ?
> ctx->init_priority : ctx->override_priority;
> + for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
> + for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
> + struct drm_sched_entity *entity;
>
> - for (i = 0; i < num_entities; i++) {
> - struct drm_sched_entity *entity = &ctx->entities[0][i].entity;
> + if (!ctx->entities[i] || !ctx->entities[i][j])
> + continue;
>
> - drm_sched_entity_set_priority(entity, ctx_prio);
> + entity = &ctx->entities[i][j]->entity;
> + drm_sched_entity_set_priority(entity, ctx_prio);
> + }
> }
> }
>
> @@ -564,20 +576,24 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr)
>
> long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout)
> {
> - unsigned num_entities = amdgpu_ctx_total_num_entities();
> struct amdgpu_ctx *ctx;
> struct idr *idp;
> - uint32_t id, i;
> + uint32_t id, i, j;
>
> idp = &mgr->ctx_handles;
>
> mutex_lock(&mgr->lock);
> idr_for_each_entry(idp, ctx, id) {
> - for (i = 0; i < num_entities; i++) {
> - struct drm_sched_entity *entity;
> + for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
> + for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
> + struct drm_sched_entity *entity;
> +
> + if (!ctx->entities[i] || !ctx->entities[i][j])
> + continue;
>
> - entity = &ctx->entities[0][i].entity;
> - timeout = drm_sched_entity_flush(entity, timeout);
> + entity = &ctx->entities[i][j]->entity;
> + timeout = drm_sched_entity_flush(entity, timeout);
> + }
> }
> }
> mutex_unlock(&mgr->lock);
> @@ -586,10 +602,9 @@ long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout)
>
> void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
> {
> - unsigned num_entities = amdgpu_ctx_total_num_entities();
> struct amdgpu_ctx *ctx;
> struct idr *idp;
> - uint32_t id, i;
> + uint32_t id, i, j;
>
> idp = &mgr->ctx_handles;
>
> @@ -599,8 +614,17 @@ void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
> continue;
> }
>
> - for (i = 0; i < num_entities; i++)
> - drm_sched_entity_fini(&ctx->entities[0][i].entity);
> + for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
> + for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
> + struct drm_sched_entity *entity;
> +
> + if (!ctx->entities[i] || !ctx->entities[i][j])
> + continue;
> +
> + entity = &ctx->entities[i][j]->entity;
> + drm_sched_entity_fini(entity);
> + }
> + }
> }
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> index a6cd9d4b078c..ca9363e71df5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> @@ -42,7 +42,7 @@ struct amdgpu_ctx {
> unsigned reset_counter_query;
> uint32_t vram_lost_counter;
> spinlock_t ring_lock;
> - struct amdgpu_ctx_entity *entities[AMDGPU_HW_IP_NUM];
> + struct amdgpu_ctx_entity **entities[AMDGPU_HW_IP_NUM];
> bool preamble_presented;
> enum drm_sched_priority init_priority;
> enum drm_sched_priority override_priority;