[PATCH v2] drm/amdkfd: Differentiate logging message for driver oversubscription

Wed Nov 20 23:55:19 UTC 2024

On 2024-11-12 12:25, Xiaogang.Chen wrote:
> From: Xiaogang Chen <xiaogang.chen at amd.com>
> 
> To have user better understand the causes triggering runlist oversubscription.
> No function change.
> 
> Signed-off-by: Xiaogang Chen Xiaogang.Chen at amd.com
> ---
>  .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   | 44 +++++++++++++------
>  1 file changed, 30 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index 37930629edc5..1848578dd5a9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -28,6 +28,10 @@
>  #include "kfd_kernel_queue.h"
>  #include "kfd_priv.h"
>  
> +#define OVER_SUBSCRIPTION_PROCESS_COUNT (1 << 0)
> +#define OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT (1 << 1)
> +#define OVER_SUBSCRIPTION_GWS_QUEUE_COUNT (1 << 2)
> +
>  static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
>  				unsigned int buffer_size_bytes)
>  {
> @@ -40,7 +44,7 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
>  
>  static void pm_calc_rlib_size(struct packet_manager *pm,
>  				unsigned int *rlib_size,
> -				bool *over_subscription)
> +				int *over_subscription)
>  {
>  	unsigned int process_count, queue_count, compute_queue_count, gws_queue_count;
>  	unsigned int map_queue_size;
> @@ -58,17 +62,20 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
>  	 * hws_max_conc_proc has been done in
>  	 * kgd2kfd_device_init().
>  	 */
> -	*over_subscription = false;
> +	*over_subscription = 0;
>  
>  	if (node->max_proc_per_quantum > 1)
>  		max_proc_per_quantum = node->max_proc_per_quantum;
>  
> -	if ((process_count > max_proc_per_quantum) ||
> -	    compute_queue_count > get_cp_queues_num(pm->dqm) ||
> -	    gws_queue_count > 1) {
> -		*over_subscription = true;
> +	if (process_count > max_proc_per_quantum)
> +		*over_subscription |= OVER_SUBSCRIPTION_PROCESS_COUNT;
> +	if (compute_queue_count > get_cp_queues_num(pm->dqm))
> +		*over_subscription |= OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT;
> +	if (gws_queue_count > 1)
> +		*over_subscription |= OVER_SUBSCRIPTION_GWS_QUEUE_COUNT;
> +
> +	if (*over_subscription)
>  		dev_dbg(dev, "Over subscribed runlist\n");
> -	}
>  
>  	map_queue_size = pm->pmf->map_queues_size;
>  	/* calculate run list ib allocation size */
> @@ -89,7 +96,7 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm,
>  				unsigned int **rl_buffer,
>  				uint64_t *rl_gpu_buffer,
>  				unsigned int *rl_buffer_size,
> -				bool *is_over_subscription)
> +				int *is_over_subscription)
>  {
>  	struct kfd_node *node = pm->dqm->dev;
>  	struct device *dev = node->adev->dev;
> @@ -134,7 +141,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
>  	struct qcm_process_device *qpd;
>  	struct queue *q;
>  	struct kernel_queue *kq;
> -	bool is_over_subscription;
> +	int is_over_subscription;
>  
>  	rl_wptr = retval = processes_mapped = 0;
>  
> @@ -212,16 +219,25 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
>  	dev_dbg(dev, "Finished map process and queues to runlist\n");
>  
>  	if (is_over_subscription) {
> -		if (!pm->is_over_subscription)
> -			dev_warn(
> -				dev,
> -				"Runlist is getting oversubscribed. Expect reduced ROCm performance.\n");
> +		if (!pm->is_over_subscription) {

Braces are unnecessary here.

> +
> +			dev_warn(dev, "Runlist is getting oversubscribed due to%s%s%s."
> +				" Expect reduced ROCm performance.\n",
> +				is_over_subscription & OVER_SUBSCRIPTION_PROCESS_COUNT ?
> +				" number of processes more than maximum number of processes"
> +				" that HWS can schedule concurrently." : "",

I'd prefer not to break string literals over multiple lines. It makes it impossible to grep the source code for error messages. I've also seen that result checkpatch errors. I'd rather take long lines. checkpatch is OK with that for long string literals. Maybe you can also make the messages more concise. Suggestions:

* " too many processes."
* " too many queues."
* " multiple processes using cooperative launch."

> +				is_over_subscription & OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT ?
> +				" number of queues is more than assigned compute queues." : "",
> +				is_over_subscription & OVER_SUBSCRIPTION_GWS_QUEUE_COUNT ?
> +				" cooperative launch is more than allowed." : "");
> +
> +		}
>  		retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr],
>  					*rl_gpu_addr,
>  					alloc_size_bytes / sizeof(uint32_t),
>  					true);
>  	}
> -	pm->is_over_subscription = is_over_subscription;
> +	pm->is_over_subscription = is_over_subscription ? true : false;

The ?-operator is unnecessary here. It's the same as the implicit conversion to bool. If you want to make it explicit, you can use

	pm->is_over_subscription = !!is_over_subscription;

With these nit-picks fixed, the patch is

Reviewed-by: Felix Kuehling <felix.kuehling at amd.com>

>  
>  	for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++)
>  		pr_debug("0x%2X ", rl_buffer[i]);