[PATCH v2] drm/amdgpu: Fix KFD oversubscription by tracking queues correctly

Felix Kuehling felix.kuehling at amd.com
Thu Jul 13 21:16:24 UTC 2017


People don't seem to like cross-component changes if they can be
avoided. I'd prefer separate commits for KFD, radeon, amdgpu, and
finally the kgd2kfd interface.

With that fixed, Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>

On 17-07-13 04:36 PM, Andres Rodriguez wrote:
>
> Reviewed-by: Andres Rodriguez <andresx7 at gmail.com>
>
> On 2017-07-13 04:23 PM, Jay Cornwall wrote:
>> The number of compute queues available to the KFD was erroneously
>> calculated as 64. Only the first MEC can execute compute queues and
>> it has 32 queue slots.
>>
>> This caused the oversubscription limit to be calculated incorrectly,
>> leading to a missing chained runlist command at the end of an
>> oversubscribed runlist.
>>
>> v2: Remove unused num_mec field to avoid duplicate logic
>>
>> Change-Id: Ic4a139c04b8a6d025fbb831a0a67e98728bfe461
>> Signed-off-by: Jay Cornwall <Jay.Cornwall at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c            | 3 +--
>>   drivers/gpu/drm/amd/amdkfd/kfd_device.c               | 4 ----
>>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 7 -------
>>   drivers/gpu/drm/amd/include/kgd_kfd_interface.h       | 3 ---
>>   drivers/gpu/drm/radeon/radeon_kfd.c                   | 1 -
>>   5 files changed, 1 insertion(+), 17 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index 7060daf..8c710f7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -116,7 +116,6 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
>>       if (adev->kfd) {
>>           struct kgd2kfd_shared_resources gpu_resources = {
>>               .compute_vmid_bitmap = global_compute_vmid_bitmap,
>> -            .num_mec = adev->gfx.mec.num_mec,
>>               .num_pipe_per_mec = adev->gfx.mec.num_pipe_per_mec,
>>               .num_queue_per_pipe = adev->gfx.mec.num_queue_per_pipe,
>>               .gpuvm_size = (uint64_t)amdgpu_vm_size << 30
>> @@ -140,7 +139,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
>>           /* According to linux/bitmap.h we shouldn't use bitmap_clear if
>>            * nbits is not compile time constant
>>            */
>> -        last_valid_bit = adev->gfx.mec.num_mec
>> +        last_valid_bit = 1 /* only first MEC can have compute queues */
>>                   * adev->gfx.mec.num_pipe_per_mec
>>                   * adev->gfx.mec.num_queue_per_pipe;
>>           for (i = last_valid_bit; i < KGD_MAX_QUEUES; ++i)
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index 1cf00d4..95f9396 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -494,10 +494,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>>       } else
>>           kfd->max_proc_per_quantum = hws_max_conc_proc;
>>
>> -    /* We only use the first MEC */
>> -    if (kfd->shared_resources.num_mec > 1)
>> -        kfd->shared_resources.num_mec = 1;
>> -
>>       /* calculate max size of mqds needed for queues */
>>       size = max_num_of_queues_per_device *
>>               kfd->device_info->mqd_size_aligned;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index 7607989..306144f 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -82,13 +82,6 @@ static bool is_pipe_enabled(struct device_queue_manager *dqm, int mec, int pipe)
>>       return false;
>>   }
>>
>> -unsigned int get_mec_num(struct device_queue_manager *dqm)
>> -{
>> -    BUG_ON(!dqm || !dqm->dev);
>> -
>> -    return dqm->dev->shared_resources.num_mec;
>> -}
>> -
>>   unsigned int get_queues_num(struct device_queue_manager *dqm)
>>   {
>>       BUG_ON(!dqm || !dqm->dev);
>> diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>> index a4d2fee..10794b3 100644
>> --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>> +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>> @@ -107,9 +107,6 @@ struct kgd2kfd_shared_resources {
>>       /* Bit n == 1 means VMID n is available for KFD. */
>>       unsigned int compute_vmid_bitmap;
>>
>> -    /* number of mec available from the hardware */
>> -    uint32_t num_mec;
>> -
>>       /* number of pipes per mec */
>>       uint32_t num_pipe_per_mec;
>>
>> diff --git a/drivers/gpu/drm/radeon/radeon_kfd.c b/drivers/gpu/drm/radeon/radeon_kfd.c
>> index 719ea51..8f8c7c1 100644
>> --- a/drivers/gpu/drm/radeon/radeon_kfd.c
>> +++ b/drivers/gpu/drm/radeon/radeon_kfd.c
>> @@ -251,7 +251,6 @@ void radeon_kfd_device_init(struct radeon_device *rdev)
>>       if (rdev->kfd) {
>>           struct kgd2kfd_shared_resources gpu_resources = {
>>               .compute_vmid_bitmap = 0xFF00,
>> -            .num_mec = 1,
>>               .num_pipe_per_mec = 4,
>>               .num_queue_per_pipe = 8,
>>               .gpuvm_size = (uint64_t)radeon_vm_size << 30
>>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx



More information about the amd-gfx mailing list