[RFC 20/23] cgroup/drm: Introduce weight based scheduling control

Michal Koutný mkoutny at suse.com
Mon Jun 2 16:22:11 UTC 2025


Hello.

On Fri, May 02, 2025 at 01:32:53PM +0100, Tvrtko Ursulin <tvrtko.ursulin at igalia.com> wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> 
> Similar to CPU and IO scheduling, implement a concept of weight in the DRM
> cgroup controller.
> 
> Individual drivers are now able to register with the controller which will
> notify them of the relative scheduling weight for each open DRM client.
> 
> The notifications are triggered on cgroup weight changes and DRM clients
> appearing in and disappearing from cgroups. The latter is done because
> it is handy to ignore groups with no DRM clients in relative weight
> calculations.
> 
> The notifications are also consolidated by using a delayed worker.
> 
> On the userspace API level we use the same range and defaults as the CPU
> controller - CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_DFL and CGROUP_WEIGHT_MAX.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
> Cc: Michal Koutný <mkoutny at suse.com>
> Cc: Tejun Heo <tj at kernel.org>
> ---
>  Documentation/admin-guide/cgroup-v2.rst |  22 ++
>  include/linux/cgroup_drm.h              |   2 +
>  kernel/cgroup/drm.c                     | 313 +++++++++++++++++++++++-
>  3 files changed, 331 insertions(+), 6 deletions(-)
> 
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index 1a16ce68a4d7..095b7dee0151 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -2729,6 +2729,28 @@ HugeTLB Interface Files
>          hugetlb pages of <hugepagesize> in this cgroup.  Only active in
>          use hugetlb pages are included.  The per-node values are in bytes.
>  
> +DRM
> +---
> +
> +The controller allows configuring the scheduling weights of cgroups relative
> +to their siblings.
> +
> +NOTE: This is an optional feature which individual DRM drivers need to
> +      opt into if they want to support it.
> +
> +NOTE: Only single GPU systems will work as expected in the current
> +      implementation.
> +
> +DRM Interface Files
> +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> +
> +  drm.weight
> +        A read-write single value file which exists on non-root cgroups. The
> +        default is "100".

Should this be akin to the IO controller and have subkey granularity
for individual devices? (With a special 'default' entry.)

(Can those devices come and go (hotplug)?)
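
For illustration, io.weight is a keyed file with a "default" entry plus
per-device $MAJ:$MIN overrides; a hypothetical drm.weight of the same
shape (the 226:0 pair below is just an example) could read:

  $ cat drm.weight
  default 100
  226:0 200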

> +
> +        The weights are in the range [1, 10000] and specify the relative
> +        scheduling weights for cgroups in relation to their siblings.
> +
>  Misc
>  ----
>  
> diff --git a/include/linux/cgroup_drm.h b/include/linux/cgroup_drm.h
> index d6a9a2fbdbf5..9961003958b4 100644
> --- a/include/linux/cgroup_drm.h
> +++ b/include/linux/cgroup_drm.h
> @@ -6,6 +6,8 @@
>  
>  #include <drm/drm_file.h>
>  
> +#define DRM_CGROUP_WEIGHT_SHIFT 10
> +
>  #if IS_ENABLED(CONFIG_CGROUP_DRM)
>  void drmcgroup_client_open(struct drm_file *file_priv);
>  void drmcgroup_client_close(struct drm_file *file_priv);
> diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c
> index ea7655edf86a..532702604786 100644
> --- a/kernel/cgroup/drm.c
> +++ b/kernel/cgroup/drm.c
> @@ -13,10 +13,18 @@ struct drm_cgroup_state {
>  	struct cgroup_subsys_state css;
>  
>  	struct list_head clients;
> +	unsigned int num_clients; /* Whole branch */
> +
> +	unsigned int sum_children_weights;
> +
> +	unsigned int weight;
> +	unsigned int effective_weight;

IIUC, this is the weight normalized across siblings (and not scaled by
ancestors)?

It should then be sufficient to notify only the siblings of the
modified cgroup after the recalculation, shouldn't it?
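
I.e. something like this sketch (assuming the caller holds drmcg_mutex
and rcu_read_lock(), and ignoring the css_tryget_online() dance and the
num_clients filtering for brevity):

static void drmcs_update_siblings(struct drm_cgroup_state *drmcs)
{
	struct cgroup_subsys_state *parent_css = drmcs->css.parent;
	struct drm_cgroup_state *parent;
	struct cgroup_subsys_state *css;

	if (!parent_css)
		return;
	parent = css_to_drmcs(parent_css);

	/* Renormalize and notify only the siblings of the changed css. */
	css_for_each_child(css, parent_css) {
		struct drm_cgroup_state *sibling = css_to_drmcs(css);

		sibling->effective_weight =
			DIV_ROUND_CLOSEST(sibling->weight <<
					  DRM_CGROUP_WEIGHT_SHIFT,
					  parent->sum_children_weights);
		drmcs_notify_weight(sibling);
	}
}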

>  };
>  
>  struct drm_root_cgroup_state {
>  	struct drm_cgroup_state drmcs;
> +
> +	struct delayed_work notify_work;
>  };
>  
>  static struct drm_root_cgroup_state root_drmcs = {
> @@ -31,7 +39,7 @@ css_to_drmcs(struct cgroup_subsys_state *css)
>  	return container_of(css, struct drm_cgroup_state, css);
>  }
>  
> -static void __maybe_unused
> +static void
>  drmcs_notify_weight(struct drm_cgroup_state *drmcs)
>  {
>  	struct drm_file *fpriv;
> @@ -43,16 +51,152 @@ drmcs_notify_weight(struct drm_cgroup_state *drmcs)
>  			fpriv->minor->dev->driver->cg_ops;
>  
>  		if (cg_ops && cg_ops->notify_weight)
> -			cg_ops->notify_weight(fpriv, 0);
> +			cg_ops->notify_weight(fpriv, drmcs->effective_weight);
>  	}
>  }
>  
> +static void drmcg_update_weights_locked(void)
> +{
> +	lockdep_assert_held(&drmcg_mutex);
> +
> +	mod_delayed_work(system_wq,
> +			 &root_drmcs.notify_work,
> +			 usecs_to_jiffies(1000));

This value is a little bit magic.
What is this consolidation good for?
(I guess it's there because of clients joining/leaving rather than
cgroup attribute modifications.)
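
If the batching stays, maybe give the delay a name so the intent
(coalescing bursts of client opens/closes) is documented; the name and
value below are just the same arbitrary 1ms:

/* Coalesce bursts of notifications from clients joining/leaving. */
#define DRMCG_NOTIFY_DELAY_US	1000

	mod_delayed_work(system_wq, &root_drmcs.notify_work,
			 usecs_to_jiffies(DRMCG_NOTIFY_DELAY_US));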

> +}
> +
> +static void drmcg_update_weights(void)
> +{
> +	mutex_lock(&drmcg_mutex);
> +	drmcg_update_weights_locked();
> +	mutex_unlock(&drmcg_mutex);
> +}
> +
> +static u64
> +drmcs_read_weight(struct cgroup_subsys_state *css, struct cftype *cft)
> +{
> +	struct drm_cgroup_state *drmcs = css_to_drmcs(css);
> +
> +	return drmcs->weight;
> +}
> +
> +static int
> +drmcs_write_weight(struct cgroup_subsys_state *css, struct cftype *cftype,
> +		   u64 weight)
> +{
> +	struct drm_cgroup_state *drmcs = css_to_drmcs(css);
> +	int ret;
> +
> +	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
> +		return -ERANGE;
> +
> +	ret = mutex_lock_interruptible(&drmcg_mutex);
> +	if (ret)
> +		return ret;
> +	drmcs->weight = weight;
> +	drmcg_update_weights_locked();
> +	mutex_unlock(&drmcg_mutex);
> +
> +	return 0;
> +}
> +
> +static void notify_worker(struct work_struct *work)
> +{
> +	struct drm_cgroup_state *root = &root_drmcs.drmcs;
> +	struct cgroup_subsys_state *node;
> +	bool updated;
> +
> +	mutex_lock(&drmcg_mutex);
> +	rcu_read_lock();
> +
> +	/*
> +	 * Always come back later if we race with core cgroup management.
> +	 */
> +	updated = false;
> +	if (WARN_ON_ONCE(!css_tryget_online(&root->css)))
> +		goto out_unlock;
> +
> +	css_for_each_descendant_post(node, &root->css) {
> +		struct drm_cgroup_state *drmcs = css_to_drmcs(node);
> +
> +		if (!css_tryget_online(node))
> +			goto out_put;
> +
> +		drmcs->sum_children_weights = 0;
> +		css_put(node);
> +	}
> +
> +	css_for_each_descendant_post(node, &root->css) {
> +		struct drm_cgroup_state *drmcs = css_to_drmcs(node);
> +		struct drm_cgroup_state *parent;
> +
> +		if (!css_tryget_online(node))
> +			goto out_put;
> +		if (!node->parent || !drmcs->num_clients) {
> +			css_put(node);
> +			continue;
> +		}
> +		if (!css_tryget_online(node->parent)) {
> +			css_put(node);
> +			goto out_put;
> +		}
> +
> +		parent = css_to_drmcs(node->parent);
> +		parent->sum_children_weights += drmcs->weight;
> +		css_put(node);
> +		css_put(&parent->css);
> +	}
> +
> +	css_for_each_descendant_pre(node, &root->css) {
> +		struct drm_cgroup_state *drmcs = css_to_drmcs(node);
> +		struct cgroup_subsys_state *css;
> +
> +		if (!css_tryget_online(node))
> +			goto out_put;
> +		if (!drmcs->num_clients) {
> +			css_put(node);
> +			continue;
> +		}
> +
> +		css_for_each_child(css, &drmcs->css) {
> +			struct drm_cgroup_state *sibling = css_to_drmcs(css);
> +
> +			if (!css_tryget_online(css)) {
> +				css_put(node);
> +				goto out_put;
> +			}
> +			if (!sibling->num_clients) {
> +				css_put(css);
> +				continue;
> +			}
> +
> +			sibling->effective_weight =
> +				DIV_ROUND_CLOSEST(sibling->weight <<
> +						  DRM_CGROUP_WEIGHT_SHIFT,
> +						  drmcs->sum_children_weights);
> +			drmcs_notify_weight(sibling);
> +			css_put(css);
> +		}
> +
> +		css_put(node);
> +	}
> +
> +	updated = true;
> +
> +out_put:
> +	css_put(&root->css);
> +out_unlock:
> +	rcu_read_unlock();
> +
> +	if (!updated)
> +		drmcg_update_weights_locked();
> +
> +	mutex_unlock(&drmcg_mutex);
> +}
> +
>  static void drmcs_free(struct cgroup_subsys_state *css)
>  {
> -	struct drm_cgroup_state *drmcs = css_to_drmcs(css);
> -
> -	if (drmcs != &root_drmcs.drmcs)
> -		kfree(drmcs);
> +	if (css != &root_drmcs.drmcs.css)
> +		kfree(css_to_drmcs(css));
>  }
>  
>  static struct cgroup_subsys_state *
> @@ -62,6 +206,7 @@ drmcs_alloc(struct cgroup_subsys_state *parent_css)
>  
>  	if (!parent_css) {
>  		drmcs = &root_drmcs.drmcs;
> +		INIT_DELAYED_WORK(&root_drmcs.notify_work, notify_worker);
>  	} else {
>  		drmcs = kzalloc(sizeof(*drmcs), GFP_KERNEL);
>  		if (!drmcs)
> @@ -70,9 +215,147 @@ drmcs_alloc(struct cgroup_subsys_state *parent_css)
>  		INIT_LIST_HEAD(&drmcs->clients);
>  	}
>  
> +	drmcs->weight = CGROUP_WEIGHT_DFL;
> +	drmcs->effective_weight = (1 << DRM_CGROUP_WEIGHT_SHIFT) / 2;
> +
>  	return &drmcs->css;
>  }
>  
> +static int drmcs_online(struct cgroup_subsys_state *css)
> +{
> +	drmcg_update_weights();
> +
> +	return 0;
> +}
> +
> +static void drmcs_offline(struct cgroup_subsys_state *css)
> +{
> +	drmcg_update_weights();
> +}
> +
> +static struct drm_cgroup_state *old_drmcs;
> +
> +static int drmcs_can_attach(struct cgroup_taskset *tset)
> +{
> +	struct cgroup_subsys_state *css;
> +	struct task_struct *task;
> +
> +	task = cgroup_taskset_first(tset, &css);
> +	old_drmcs = css_to_drmcs(task_css(task, drm_cgrp_id));

This is similar to cpuset's cpuset_attach_old_cs.
Beware that when the controller is disabled, the migration happens from
multiple (children) csses.
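
I.e. to be robust, the source css would have to be sampled per task
rather than cached once; a sketch (where to store it per task is left
open):

static int drmcs_can_attach(struct cgroup_taskset *tset)
{
	struct cgroup_subsys_state *dst_css;
	struct task_struct *task;

	cgroup_taskset_for_each(task, dst_css, tset) {
		struct drm_cgroup_state *old =
			css_to_drmcs(task_css(task, drm_cgrp_id));

		/*
		 * 'old' can differ from task to task when children csses
		 * are drained on controller disable, so it would need to
		 * be recorded per task (or per thread group) for
		 * ->attach(), instead of the single old_drmcs.
		 */
	}

	return 0;
}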

> +
> +	return 0;
> +}
> +
> +static void __inc_clients(struct drm_cgroup_state *drmcs)
> +{
> +	struct cgroup_subsys_state *parent = NULL;
> +
> +	rcu_read_lock();
> +	do {
> +		drmcs->num_clients++;
> +		WARN_ON_ONCE(!drmcs->num_clients);
> +
> +		if (parent)
> +			css_put(parent);
> +
> +		parent = drmcs->css.parent;
> +		if (parent) {
> +			if (WARN_ON_ONCE(!css_tryget(parent)))

This should be ensured implicitly thanks to css::online_cnt.
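
So the parent walk could presumably be simplified to (sketch; assumes
the caller holds drmcg_mutex and a reference on the starting css, and
that each css pins its parent):

static void __inc_clients(struct drm_cgroup_state *drmcs)
{
	struct cgroup_subsys_state *css;

	lockdep_assert_held(&drmcg_mutex);

	/* Each css holds a reference on its parent, so no tryget. */
	for (css = &drmcs->css; css; css = css->parent)
		css_to_drmcs(css)->num_clients++;
}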

> +				break;
> +
> +			drmcs = css_to_drmcs(parent);
> +		}
> +	} while (parent);
> +	rcu_read_unlock();
> +}
> +
> +static void __dec_clients(struct drm_cgroup_state *drmcs)
> +{
> +	struct cgroup_subsys_state *parent = NULL;
> +
> +	rcu_read_lock();
> +	do {
> +		WARN_ON_ONCE(!drmcs->num_clients);
> +		drmcs->num_clients--;
> +
> +		if (parent)
> +			css_put(parent);
> +
> +		parent = drmcs->css.parent;
> +		if (parent) {
> +			if (WARN_ON_ONCE(!css_tryget(parent)))
> +				break;
> +
> +			drmcs = css_to_drmcs(parent);
> +		}
> +	} while (parent);
> +	rcu_read_unlock();
> +}
> +
> +static void drmcs_attach(struct cgroup_taskset *tset)
> +{
> +	struct drm_cgroup_state *old = old_drmcs;
> +	struct cgroup_subsys_state *css;
> +	struct drm_file *fpriv, *next;
> +	struct drm_cgroup_state *new;
> +	struct task_struct *task;
> +	bool migrated = false;
> +
> +	if (!old)
> +		return;
> +
> +	task = cgroup_taskset_first(tset, &css);
> +	new = css_to_drmcs(task_css(task, drm_cgrp_id));
> +	if (new == old)
> +		return;

Beware here too; it'd be better to use the css from
cgroup_taskset_for_each(task, css, tset) below as the new css (see the
sketch after the next comment).

> +
> +	mutex_lock(&drmcg_mutex);
> +
> +	list_for_each_entry_safe(fpriv, next, &old->clients, clink) {
> +		cgroup_taskset_for_each(task, css, tset) {
> +			struct cgroup_subsys_state *old_css;
> +			struct drm_cgroup_state *old_;
> +
> +			if (task->flags & PF_KTHREAD)
> +				continue;
> +			if (!thread_group_leader(task))
> +				continue;

Maybe cgroup_taskset_for_each_leader()?
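
Combined with the previous point about taking the new css from the
iterator, the loop could look roughly like this (sketch only;
drmcs_move_client() is a hypothetical helper standing in for the css
get/put, list_move_tail() and num_clients updates of the original
body):

static void drmcs_attach(struct cgroup_taskset *tset)
{
	struct drm_cgroup_state *old = old_drmcs; /* see caveat above */
	struct cgroup_subsys_state *new_css;
	struct drm_file *fpriv, *next;
	struct task_struct *task;
	bool migrated = false;

	if (!old)
		return;

	mutex_lock(&drmcg_mutex);

	/* new_css is the destination css of each task's migration. */
	cgroup_taskset_for_each_leader(task, new_css, tset) {
		struct drm_cgroup_state *new = css_to_drmcs(new_css);

		if (new == old)
			continue;

		list_for_each_entry_safe(fpriv, next, &old->clients, clink) {
			if (rcu_access_pointer(fpriv->pid) != task_tgid(task))
				continue;

			drmcs_move_client(fpriv, old, new);
			migrated = true;
		}
	}

	if (migrated)
		drmcg_update_weights_locked();

	mutex_unlock(&drmcg_mutex);
	old_drmcs = NULL;
}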

> +
> +			new = css_to_drmcs(task_css(task, drm_cgrp_id));
> +			if (WARN_ON_ONCE(new == old))
> +				continue;
> +
> +			if (rcu_access_pointer(fpriv->pid) != task_tgid(task))
> +				continue;
> +
> +			if (WARN_ON_ONCE(fpriv->__css != &old->css))
> +				continue;
> +
> +			old_css = fpriv->__css;
> +			old_ = css_to_drmcs(old_css);
> +			fpriv->__css = &new->css;
> +			css_get(fpriv->__css);
> +			list_move_tail(&fpriv->clink, &new->clients);
> +			__dec_clients(old);
> +			__inc_clients(new);
> +			css_put(old_css);
> +			migrated = true;
> +		}
> +	}
> +
> +	if (migrated)
> +		drmcg_update_weights_locked();
> +
> +	mutex_unlock(&drmcg_mutex);
> +
> +	old_drmcs = NULL;
> +}
> +
> +static void drmcs_cancel_attach(struct cgroup_taskset *tset)
> +{
> +	old_drmcs = NULL;
> +}
> +
>  void drmcgroup_client_open(struct drm_file *file_priv)
>  {
>  	struct drm_cgroup_state *drmcs;
> @@ -85,6 +368,8 @@ void drmcgroup_client_open(struct drm_file *file_priv)
>  	mutex_lock(&drmcg_mutex);
>  	file_priv->__css = &drmcs->css; /* Keeps the reference. */
>  	list_add_tail(&file_priv->clink, &drmcs->clients);
> +	__inc_clients(drmcs);
> +	drmcg_update_weights_locked();
>  	mutex_unlock(&drmcg_mutex);
>  }
>  EXPORT_SYMBOL_GPL(drmcgroup_client_open);
> @@ -100,7 +385,9 @@ void drmcgroup_client_close(struct drm_file *file_priv)
>  
>  	mutex_lock(&drmcg_mutex);
>  	list_del(&file_priv->clink);
> +	__dec_clients(drmcs);
>  	file_priv->__css = NULL;
> +	drmcg_update_weights_locked();
>  	mutex_unlock(&drmcg_mutex);
>  
>  	css_put(&drmcs->css);
> @@ -124,6 +411,9 @@ void drmcgroup_client_migrate(struct drm_file *file_priv)
>  	if (src != dst) {
>  		file_priv->__css = &dst->css; /* Keeps the reference. */
>  		list_move_tail(&file_priv->clink, &dst->clients);
> +		__dec_clients(src);
> +		__inc_clients(dst);
> +		drmcg_update_weights_locked();
>  	}
>  
>  	mutex_unlock(&drmcg_mutex);
> @@ -133,12 +423,23 @@ void drmcgroup_client_migrate(struct drm_file *file_priv)
>  EXPORT_SYMBOL_GPL(drmcgroup_client_migrate);
>  
>  struct cftype files[] = {
> +	{
> +		.name = "weight",
> +		.flags = CFTYPE_NOT_ON_ROOT,
> +		.read_u64 = drmcs_read_weight,
> +		.write_u64 = drmcs_write_weight,
> +	},
>  	{ } /* Zero entry terminates. */
>  };
>  
>  struct cgroup_subsys drm_cgrp_subsys = {
>  	.css_alloc	= drmcs_alloc,
>  	.css_free	= drmcs_free,
> +	.css_online	= drmcs_online,
> +	.css_offline	= drmcs_offline,
> +	.can_attach     = drmcs_can_attach,
> +	.attach		= drmcs_attach,
> +	.cancel_attach  = drmcs_cancel_attach,
>  	.early_init	= false,
>  	.legacy_cftypes	= files,
>  	.dfl_cftypes	= files,
> -- 
> 2.48.0
> 


Regards,
Michal