[PATCH v2] drm/amdgpu: Fix KFD oversubscription by tracking queues correctly

Andres Rodriguez andresx7 at gmail.com
Thu Jul 13 20:36:48 UTC 2017


Reviewed-by: Andres Rodriguez <andresx7 at gmail.com>

On 2017-07-13 04:23 PM, Jay Cornwall wrote:
> The number of compute queues available to the KFD was erroneously
> calculated as 64. Only the first MEC can execute compute queues, and
> that MEC has only 32 queue slots.
> 
> This caused the oversubscription limit to be calculated incorrectly,
> leading to a missing chained runlist command at the end of an
> oversubscribed runlist.
> 
> v2: Remove unused num_mec field to avoid duplicate logic
> 
> Change-Id: Ic4a139c04b8a6d025fbb831a0a67e98728bfe461
> Signed-off-by: Jay Cornwall <Jay.Cornwall at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c            | 3 +--
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c               | 4 ----
>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 7 -------
>   drivers/gpu/drm/amd/include/kgd_kfd_interface.h       | 3 ---
>   drivers/gpu/drm/radeon/radeon_kfd.c                   | 1 -
>   5 files changed, 1 insertion(+), 17 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 7060daf..8c710f7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -116,7 +116,6 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
>   	if (adev->kfd) {
>   		struct kgd2kfd_shared_resources gpu_resources = {
>   			.compute_vmid_bitmap = global_compute_vmid_bitmap,
> -			.num_mec = adev->gfx.mec.num_mec,
>   			.num_pipe_per_mec = adev->gfx.mec.num_pipe_per_mec,
>   			.num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe,
>   			.gpuvm_size = (uint64_t)amdgpu_vm_size << 30
> @@ -140,7 +139,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
>   		/* According to linux/bitmap.h we shouldn't use bitmap_clear if
>   		 * nbits is not compile time constant
>   		 */
> -		last_valid_bit = adev->gfx.mec.num_mec
> +		last_valid_bit = 1 /* only first MEC can have compute queues */
>   				* adev->gfx.mec.num_pipe_per_mec
>   				* adev->gfx.mec.num_queue_per_pipe;
>   		for (i = last_valid_bit; i < KGD_MAX_QUEUES; ++i)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 1cf00d4..95f9396 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -494,10 +494,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>   	} else
>   		kfd->max_proc_per_quantum = hws_max_conc_proc;
>   
> -	/* We only use the first MEC */
> -	if (kfd->shared_resources.num_mec > 1)
> -		kfd->shared_resources.num_mec = 1;
> -
>   	/* calculate max size of mqds needed for queues */
>   	size = max_num_of_queues_per_device *
>   			kfd->device_info->mqd_size_aligned;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 7607989..306144f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -82,13 +82,6 @@ static bool is_pipe_enabled(struct device_queue_manager *dqm, int mec, int pipe)
>   	return false;
>   }
>   
> -unsigned int get_mec_num(struct device_queue_manager *dqm)
> -{
> -	BUG_ON(!dqm || !dqm->dev);
> -
> -	return dqm->dev->shared_resources.num_mec;
> -}
> -
>   unsigned int get_queues_num(struct device_queue_manager *dqm)
>   {
>   	BUG_ON(!dqm || !dqm->dev);
> diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> index a4d2fee..10794b3 100644
> --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> @@ -107,9 +107,6 @@ struct kgd2kfd_shared_resources {
>   	/* Bit n == 1 means VMID n is available for KFD. */
>   	unsigned int compute_vmid_bitmap;
>   
> -	/* number of mec available from the hardware */
> -	uint32_t num_mec;
> -
>   	/* number of pipes per mec */
>   	uint32_t num_pipe_per_mec;
>   
> diff --git a/drivers/gpu/drm/radeon/radeon_kfd.c b/drivers/gpu/drm/radeon/radeon_kfd.c
> index 719ea51..8f8c7c1 100644
> --- a/drivers/gpu/drm/radeon/radeon_kfd.c
> +++ b/drivers/gpu/drm/radeon/radeon_kfd.c
> @@ -251,7 +251,6 @@ void radeon_kfd_device_init(struct radeon_device *rdev)
>   	if (rdev->kfd) {
>   		struct kgd2kfd_shared_resources gpu_resources = {
>   			.compute_vmid_bitmap = 0xFF00,
> -			.num_mec = 1,
>   			.num_pipe_per_mec = 4,
>   			.num_queue_per_pipe = 8,
>   			.gpuvm_size = (uint64_t)radeon_vm_size << 30
> 


More information about the amd-gfx mailing list