[PATCH] Fix Incorrect VMIDs passed to HWS
Felix Kuehling
felix.kuehling at amd.com
Thu Mar 17 19:45:40 UTC 2022
On 2022-03-17 at 15:37, Tushar Patel wrote:
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 +-
> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 21 ++++++++++++---------
> 2 files changed, 13 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 4c20c23d6ba0..bda1b5132ee8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -680,7 +680,7 @@ MODULE_PARM_DESC(sched_policy,
> * Maximum number of processes that HWS can schedule concurrently. The maximum is the
> * number of VMIDs assigned to the HWS, which is also the default.
> */
> -int hws_max_conc_proc = 8;
> +int hws_max_conc_proc = -1;
> module_param(hws_max_conc_proc, int, 0444);
> MODULE_PARM_DESC(hws_max_conc_proc,
> "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))");
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 339e12c94cff..39073f72fe5f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -483,15 +483,18 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
> }
>
> /* Verify module parameters regarding mapped process number*/
> - if ((hws_max_conc_proc < 0)
> - || (hws_max_conc_proc > kfd->vm_info.vmid_num_kfd)) {
> - dev_err(kfd_device,
> - "hws_max_conc_proc %d must be between 0 and %d, use %d instead\n",
> - hws_max_conc_proc, kfd->vm_info.vmid_num_kfd,
> - kfd->vm_info.vmid_num_kfd);
> - kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd;
> - } else
> - kfd->max_proc_per_quantum = hws_max_conc_proc;
> + kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd;
> + if (hws_max_conc_proc != -1) {
> + if ((hws_max_conc_proc > 0)
> + && (hws_max_conc_proc < kfd->vm_info.vmid_num_kfd)) {
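I think this should be <= kfd->vm_info.vmid_num_kfd. With a strict "<", a
value exactly equal to the number of VMIDs falls into the else branch and
is reported as an error, even though it is a valid setting.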
> + kfd->max_proc_per_quantum = hws_max_conc_proc;
> + } else {
> + dev_err(kfd_device,
> + "hws_max_conc_proc %d must be between 0 and %d, use %d instead\n",
> + hws_max_conc_proc, kfd->vm_info.vmid_num_kfd,
> + kfd->vm_info.vmid_num_kfd);
I think this error message is the wrong approach. hws_max_conc_proc is a
global setting that affects all GPUs. Different GPUs may have different
numbers of VMIDs. So we can't treat (hws_max_conc_proc >
kfd->vm_info.vmid_num_kfd) as an error. It may be an error on one GPU
but perfectly fine on another.
I think you can simplify this if-else like this and get rid of the dev_err:

    kfd->max_proc_per_quantum = min(hws_max_conc_proc,
                                    kfd->vm_info.vmid_num_kfd);

Regards,
Felix
> + }
> + }
>
> /* calculate max size of mqds needed for queues */
> size = max_num_of_queues_per_device *
More information about the amd-gfx mailing list