[PATCH] drm/amdgpu: Use the slab allocator to reduce job allocation fragmentation

Fri May 10 09:30:31 UTC 2024

Am 10.05.24 um 10:11 schrieb Prike Liang:
> Using kzalloc() results in about 50% memory fragmentation, therefore
> use the slab allocator to reproduce memory fragmentation.
>
> Signed-off-by: Prike Liang <Prike.Liang at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 26 ++++++++++++++++++++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 +
>   3 files changed, 23 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index ea14f1c8f430..3de1b42291b6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -3040,6 +3040,7 @@ static void __exit amdgpu_exit(void)
>   	amdgpu_fence_slab_fini();
>   	mmu_notifier_synchronize();
>   	amdgpu_xcp_drv_release();
> +	amdgpue_job_slab_fini();
>   }
>   
>   module_init(amdgpu_init);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index e4742b65032d..8327bf017a0e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -31,6 +31,8 @@
>   #include "amdgpu_trace.h"
>   #include "amdgpu_reset.h"
>   
> +static struct kmem_cache *amdgpu_job_slab;
> +
>   static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>   {
>   	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
> @@ -101,10 +103,19 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   	if (num_ibs == 0)
>   		return -EINVAL;
>   
> -	*job = kzalloc(struct_size(*job, ibs, num_ibs), GFP_KERNEL);
> -	if (!*job)
> +	amdgpu_job_slab = kmem_cache_create("amdgpu_job",
> +				struct_size(*job, ibs, num_ibs), 0,
> +				SLAB_HWCACHE_ALIGN, NULL);

Well you are declaring a global slab cache for a dynamic job size, then 
try to set it up in the job allocation function which can be called 
concurrently with different number of IBs.

To sum it up  this is completely racy and will go boom immediately in 
testing. As far as I can see this suggestion is just utterly nonsense.

Regards,
Christian.

> +	if (!amdgpu_job_slab) {
> +		DRM_ERROR("create amdgpu_job cache failed\n");
>   		return -ENOMEM;
> +	}
>   
> +	*job = kmem_cache_zalloc(amdgpu_job_slab, GFP_KERNEL);
> +	if (!*job) {
> +		kmem_cache_destroy(amdgpu_job_slab);
> +		return -ENOMEM;
> +	}
>   	/*
>   	 * Initialize the scheduler to at least some ring so that we always
>   	 * have a pointer to adev.
> @@ -138,7 +149,7 @@ int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev,
>   	if (r) {
>   		if (entity)
>   			drm_sched_job_cleanup(&(*job)->base);
> -		kfree(*job);
> +		kmem_cache_free(amdgpu_job_slab, job);
>   	}
>   
>   	return r;
> @@ -179,6 +190,11 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
>   		amdgpu_ib_free(ring->adev, &job->ibs[i], f);
>   }
>   
> +void amdgpue_job_slab_fini(void)
> +{
> +	kmem_cache_destroy(amdgpu_job_slab);
> +}
> +
>   static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>   {
>   	struct amdgpu_job *job = to_amdgpu_job(s_job);
> @@ -189,7 +205,7 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>   
>   	/* only put the hw fence if has embedded fence */
>   	if (!job->hw_fence.ops)
> -		kfree(job);
> +		kmem_cache_free(amdgpu_job_slab, job);
>   	else
>   		dma_fence_put(&job->hw_fence);
>   }
> @@ -221,7 +237,7 @@ void amdgpu_job_free(struct amdgpu_job *job)
>   		dma_fence_put(job->gang_submit);
>   
>   	if (!job->hw_fence.ops)
> -		kfree(job);
> +		kmem_cache_free(amdgpu_job_slab, job);
>   	else
>   		dma_fence_put(&job->hw_fence);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index a963a25ddd62..4491d5f9c74d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -103,5 +103,6 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
>   			     struct dma_fence **fence);
>   
>   void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched);
> +void amdgpue_job_slab_fini(void);
>   
>   #endif