[PATCH V2] drm/amdgpu: allocate entities on demand
Christian König
christian.koenig at amd.com
Fri Jan 24 15:31:01 UTC 2020
Am 24.01.20 um 16:15 schrieb Nirmoy Das:
> Currently we pre-allocate entities and fences for all the HW IPs on
> context creation, some of which might never be used.
>
> This patch tries to resolve entity/fence wastage by creating entities
> only when needed.
>
> v2: allocate memory for entity and fences together
>
> Signed-off-by: Nirmoy Das <nirmoy.das at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 231 ++++++++++++------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 6 +-
> 2 files changed, 122 insertions(+), 115 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index 05c2af61e7de..d246ae9fe0eb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -42,19 +42,12 @@ const unsigned int amdgpu_ctx_num_entities[AMDGPU_HW_IP_NUM] = {
> [AMDGPU_HW_IP_VCN_JPEG] = 1,
> };
>
> -static int amdgpu_ctx_total_num_entities(void)
> -{
> - unsigned i, num_entities = 0;
> -
> - for (i = 0; i < AMDGPU_HW_IP_NUM; ++i)
> - num_entities += amdgpu_ctx_num_entities[i];
> -
> - return num_entities;
> -}
> -
> static int amdgpu_ctx_priority_permit(struct drm_file *filp,
> enum drm_sched_priority priority)
> {
> + if (priority < 0 || priority >= DRM_SCHED_PRIORITY_MAX)
> + return -EINVAL;
> +
> /* NORMAL and below are accessible by everyone */
> if (priority <= DRM_SCHED_PRIORITY_NORMAL)
> return 0;
> @@ -68,64 +61,26 @@ static int amdgpu_ctx_priority_permit(struct drm_file *filp,
> return -EACCES;
> }
>
> -static int amdgpu_ctx_init(struct amdgpu_device *adev,
> - enum drm_sched_priority priority,
> - struct drm_file *filp,
> - struct amdgpu_ctx *ctx)
> +static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, const u32 hw_ip, const u32 ring)
> {
> - unsigned num_entities = amdgpu_ctx_total_num_entities();
> - unsigned i, j;
> + struct amdgpu_device *adev = ctx->adev;
> + struct amdgpu_ctx_entity *entity;
> + struct drm_gpu_scheduler **scheds = NULL, *sched = NULL;
> + unsigned num_scheds = 0;
> + enum drm_sched_priority priority;
> int r;
>
> - if (priority < 0 || priority >= DRM_SCHED_PRIORITY_MAX)
> - return -EINVAL;
> + entity = kcalloc(1, offsetof(typeof(*entity), fences[amdgpu_sched_jobs]),
> + GFP_KERNEL);
> + if (!entity)
> + return -ENOMEM;
>
> - r = amdgpu_ctx_priority_permit(filp, priority);
> - if (r)
> - return r;
> -
> - memset(ctx, 0, sizeof(*ctx));
> - ctx->adev = adev;
> -
> -
> - ctx->entities[0] = kcalloc(num_entities,
> - sizeof(struct amdgpu_ctx_entity),
> - GFP_KERNEL);
> - if (!ctx->entities[0])
> - return -ENOMEM;
> + entity->sequence = 1;
>
> -
> - for (i = 0; i < num_entities; ++i) {
> - struct amdgpu_ctx_entity *entity = &ctx->entities[0][i];
> -
> - entity->sequence = 1;
> - entity->fences = kcalloc(amdgpu_sched_jobs,
> - sizeof(struct dma_fence*), GFP_KERNEL);
> - if (!entity->fences) {
> - r = -ENOMEM;
> - goto error_cleanup_memory;
> - }
> - }
> - for (i = 1; i < AMDGPU_HW_IP_NUM; ++i)
> - ctx->entities[i] = ctx->entities[i - 1] +
> - amdgpu_ctx_num_entities[i - 1];
> -
> - kref_init(&ctx->refcount);
> - spin_lock_init(&ctx->ring_lock);
> - mutex_init(&ctx->lock);
> -
> - ctx->reset_counter = atomic_read(&adev->gpu_reset_counter);
> - ctx->reset_counter_query = ctx->reset_counter;
> - ctx->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
> - ctx->init_priority = priority;
> - ctx->override_priority = DRM_SCHED_PRIORITY_UNSET;
> -
> - for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
> - struct drm_gpu_scheduler **scheds;
> - struct drm_gpu_scheduler *sched;
> - unsigned num_scheds = 0;
> -
> - switch (i) {
> + ctx->entities[hw_ip][ring] = entity;
> + priority = (ctx->override_priority == DRM_SCHED_PRIORITY_UNSET) ?
> + ctx->init_priority : ctx->override_priority;
> + switch (hw_ip) {
> case AMDGPU_HW_IP_GFX:
> sched = &adev->gfx.gfx_ring[0].sched;
> scheds = &sched;
> @@ -166,63 +121,87 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
> scheds = adev->jpeg.jpeg_sched;
> num_scheds = adev->jpeg.num_jpeg_sched;
> break;
> - }
> -
> - for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j)
> - r = drm_sched_entity_init(&ctx->entities[i][j].entity,
> - priority, scheds,
> - num_scheds, &ctx->guilty);
> - if (r)
> - goto error_cleanup_entities;
> }
>
> + r = drm_sched_entity_init(&ctx->entities[hw_ip][ring]->entity,
> + priority, scheds,
> + num_scheds, &ctx->guilty);
> + if (r)
> + goto error_free_entity;
> +
> return 0;
>
> -error_cleanup_entities:
> - for (i = 0; i < num_entities; ++i)
> - drm_sched_entity_destroy(&ctx->entities[0][i].entity);
> +error_free_entity:
> + kfree(ctx->entities[hw_ip][ring]);
You also need to reset ctx->entities[hw_ip][ring] to NULL here.
Even better would be to use kfree(entity) and only assign
ctx->entities[hw_ip][ring] to entity when everything worked as expected.
> + return r;
> +}
>
> -error_cleanup_memory:
> - for (i = 0; i < num_entities; ++i) {
> - struct amdgpu_ctx_entity *entity = &ctx->entities[0][i];
> +static int amdgpu_ctx_init(struct amdgpu_device *adev,
> + enum drm_sched_priority priority,
> + struct drm_file *filp,
> + struct amdgpu_ctx *ctx)
> +{
> + int r;
>
> - kfree(entity->fences);
> - entity->fences = NULL;
> - }
> + r = amdgpu_ctx_priority_permit(filp, priority);
> + if (r)
> + return r;
> +
> + memset(ctx, 0, sizeof(*ctx));
> +
> + ctx->adev = adev;
> +
> + kref_init(&ctx->refcount);
> + spin_lock_init(&ctx->ring_lock);
> + mutex_init(&ctx->lock);
> +
> + ctx->reset_counter = atomic_read(&adev->gpu_reset_counter);
> + ctx->reset_counter_query = ctx->reset_counter;
> + ctx->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
> + ctx->init_priority = priority;
> + ctx->override_priority = DRM_SCHED_PRIORITY_UNSET;
> +
> + return 0;
>
> - kfree(ctx->entities[0]);
> - ctx->entities[0] = NULL;
> - return r;
> }
>
> static void amdgpu_ctx_fini(struct kref *ref)
> {
> struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx, refcount);
> - unsigned num_entities = amdgpu_ctx_total_num_entities();
> struct amdgpu_device *adev = ctx->adev;
> - unsigned i, j;
> + unsigned i, j, k;
>
> if (!adev)
> return;
>
> - for (i = 0; i < num_entities; ++i) {
> - struct amdgpu_ctx_entity *entity = &ctx->entities[0][i];
> + for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
> + for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
Maybe just use AMDGPU_MAX_ENTITY_NUM here.
> + struct amdgpu_ctx_entity *entity;
> +
> + if (!ctx->entities[i] || !ctx->entities[i][j])
> + continue;
>
> - for (j = 0; j < amdgpu_sched_jobs; ++j)
> - dma_fence_put(entity->fences[j]);
> + entity = ctx->entities[i][j];
> + if (!entity->fences)
> + continue;
Well that won't work, since entity->fences is now embedded into the
structure. I think you can just drop the "if".
>
> - kfree(entity->fences);
> + for (k = 0; k < amdgpu_sched_jobs; ++k)
> + dma_fence_put(entity->fences[k]);
> +
> + kfree(entity);
Maybe move that into an amdgpu_ctx_fini_entity function.
Regards,
Christian.
> + ctx->entities[i][j] = NULL;
> + }
> }
>
> - kfree(ctx->entities[0]);
> mutex_destroy(&ctx->lock);
> -
> kfree(ctx);
> }
>
> int amdgpu_ctx_get_entity(struct amdgpu_ctx *ctx, u32 hw_ip, u32 instance,
> u32 ring, struct drm_sched_entity **entity)
> {
> + int r;
> +
> if (hw_ip >= AMDGPU_HW_IP_NUM) {
> DRM_ERROR("unknown HW IP type: %d\n", hw_ip);
> return -EINVAL;
> @@ -239,7 +218,14 @@ int amdgpu_ctx_get_entity(struct amdgpu_ctx *ctx, u32 hw_ip, u32 instance,
> return -EINVAL;
> }
>
> - *entity = &ctx->entities[hw_ip][ring].entity;
> + if (ctx->entities[hw_ip][ring] == NULL) {
> + r = amdgpu_ctx_init_entity(ctx, hw_ip, ring);
> + if (r)
> + return r;
> + }
> +
> +
> + *entity = &ctx->entities[hw_ip][ring]->entity;
> return 0;
> }
>
> @@ -279,14 +265,17 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
> static void amdgpu_ctx_do_release(struct kref *ref)
> {
> struct amdgpu_ctx *ctx;
> - unsigned num_entities;
> - u32 i;
> + u32 i, j;
>
> ctx = container_of(ref, struct amdgpu_ctx, refcount);
> + for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
> + for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
> + if (!ctx->entities[i][j])
> + continue;
>
> - num_entities = amdgpu_ctx_total_num_entities();
> - for (i = 0; i < num_entities; i++)
> - drm_sched_entity_destroy(&ctx->entities[0][i].entity);
> + drm_sched_entity_destroy(&ctx->entities[i][j]->entity);
> + }
> + }
>
> amdgpu_ctx_fini(ref);
> }
> @@ -516,19 +505,23 @@ struct dma_fence *amdgpu_ctx_get_fence(struct amdgpu_ctx *ctx,
> void amdgpu_ctx_priority_override(struct amdgpu_ctx *ctx,
> enum drm_sched_priority priority)
> {
> - unsigned num_entities = amdgpu_ctx_total_num_entities();
> enum drm_sched_priority ctx_prio;
> - unsigned i;
> + unsigned i, j;
>
> ctx->override_priority = priority;
>
> ctx_prio = (ctx->override_priority == DRM_SCHED_PRIORITY_UNSET) ?
> ctx->init_priority : ctx->override_priority;
> + for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
> + for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
> + struct drm_sched_entity *entity;
>
> - for (i = 0; i < num_entities; i++) {
> - struct drm_sched_entity *entity = &ctx->entities[0][i].entity;
> + if (!ctx->entities[i][j])
> + continue;
>
> - drm_sched_entity_set_priority(entity, ctx_prio);
> + entity = &ctx->entities[i][j]->entity;
> + drm_sched_entity_set_priority(entity, ctx_prio);
> + }
> }
> }
>
> @@ -564,20 +557,24 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr)
>
> long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout)
> {
> - unsigned num_entities = amdgpu_ctx_total_num_entities();
> struct amdgpu_ctx *ctx;
> struct idr *idp;
> - uint32_t id, i;
> + uint32_t id, i, j;
>
> idp = &mgr->ctx_handles;
>
> mutex_lock(&mgr->lock);
> idr_for_each_entry(idp, ctx, id) {
> - for (i = 0; i < num_entities; i++) {
> - struct drm_sched_entity *entity;
> + for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
> + for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
> + struct drm_sched_entity *entity;
> +
> + if (!ctx->entities[i][j])
> + continue;
>
> - entity = &ctx->entities[0][i].entity;
> - timeout = drm_sched_entity_flush(entity, timeout);
> + entity = &ctx->entities[i][j]->entity;
> + timeout = drm_sched_entity_flush(entity, timeout);
> + }
> }
> }
> mutex_unlock(&mgr->lock);
> @@ -586,10 +583,9 @@ long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout)
>
> void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
> {
> - unsigned num_entities = amdgpu_ctx_total_num_entities();
> struct amdgpu_ctx *ctx;
> struct idr *idp;
> - uint32_t id, i;
> + uint32_t id, i, j;
>
> idp = &mgr->ctx_handles;
>
> @@ -599,8 +595,17 @@ void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
> continue;
> }
>
> - for (i = 0; i < num_entities; i++)
> - drm_sched_entity_fini(&ctx->entities[0][i].entity);
> + for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
> + for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
> + struct drm_sched_entity *entity;
> +
> + if (!ctx->entities[i][j])
> + continue;
> +
> + entity = &ctx->entities[i][j]->entity;
> + drm_sched_entity_fini(entity);
> + }
> + }
> }
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> index a6cd9d4b078c..de490f183af2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> @@ -29,10 +29,12 @@ struct drm_device;
> struct drm_file;
> struct amdgpu_fpriv;
>
> +#define AMDGPU_MAX_ENTITY_NUM 4
> +
> struct amdgpu_ctx_entity {
> uint64_t sequence;
> - struct dma_fence **fences;
> struct drm_sched_entity entity;
> + struct dma_fence *fences[];
> };
>
> struct amdgpu_ctx {
> @@ -42,7 +44,7 @@ struct amdgpu_ctx {
> unsigned reset_counter_query;
> uint32_t vram_lost_counter;
> spinlock_t ring_lock;
> - struct amdgpu_ctx_entity *entities[AMDGPU_HW_IP_NUM];
> + struct amdgpu_ctx_entity *entities[AMDGPU_HW_IP_NUM][AMDGPU_MAX_ENTITY_NUM];
> bool preamble_presented;
> enum drm_sched_priority init_priority;
> enum drm_sched_priority override_priority;
> --
> 2.24.1
>
More information about the amd-gfx
mailing list