[RFC 20/23] cgroup/drm: Introduce weight based scheduling control
Michal Koutný
mkoutny at suse.com
Mon Jun 2 16:22:11 UTC 2025
Hello.
On Fri, May 02, 2025 at 01:32:53PM +0100, Tvrtko Ursulin <tvrtko.ursulin at igalia.com> wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>
> Similar to CPU and IO scheduling, implement a concept of weight in the DRM
> cgroup controller.
>
> Individual drivers are now able to register with the controller which will
> notify them of the relative scheduling weight for each open DRM client.
>
> The notifications are triggered on cgroup weight changes and DRM clients
> appearing and disappearing in/from cgroups. Latter is done because it is
> handy to ignore the groups with no DRM clients in relative weight
> calculations.
>
> The notifications are also consolidated by using a delayed worker.
>
> On the userspace API level we use the same range and defaults as the CPU
> controller - CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_DFL and CGROUP_WEIGHT_MAX.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
> Cc: Michal Koutný <mkoutny at suse.com>
> Cc: Tejun Heo <tj at kernel.org>
> ---
> Documentation/admin-guide/cgroup-v2.rst | 22 ++
> include/linux/cgroup_drm.h | 2 +
> kernel/cgroup/drm.c | 313 +++++++++++++++++++++++-
> 3 files changed, 331 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index 1a16ce68a4d7..095b7dee0151 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -2729,6 +2729,28 @@ HugeTLB Interface Files
> hugetlb pages of <hugepagesize> in this cgroup. Only active in
> use hugetlb pages are included. The per-node values are in bytes.
>
> +DRM
> +---
> +
> +The controller allows for configuring of scheduling weights of cgroups relative
> +to their siblings.
> +
> +NOTE: This is an optional feature into which individual DRM drivers need to
> + opt-in if they want to support it.
> +
> +NOTE: Only single GPU systems will work as expected in the current
> + implementation.
> +
> +DRM Interface Files
> +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> +
> + drm.weight
> + A read-write single value file which exists on non-root cgroups. The
> + default is "100".
Should this be akin to the IO controller and have per-device (subkey)
granularity, with a special 'default' entry?
(Can those devices come and go with hotplug?)
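Just to illustrate what I mean (this nested-keyed drm.weight syntax is
purely hypothetical, mirroring io.weight; 226 being the DRM char major):

    default 100
    226:0 300

with per-device overrides written as "$MAJ:$MIN $WEIGHT" lines and the
first line carrying the default applied to devices without an override.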
> +
> + The weights are in the range [1, 10000] and specify the relative
> + scheduling weights for cgroups in relation to their siblings.
> +
> Misc
> ----
>
> diff --git a/include/linux/cgroup_drm.h b/include/linux/cgroup_drm.h
> index d6a9a2fbdbf5..9961003958b4 100644
> --- a/include/linux/cgroup_drm.h
> +++ b/include/linux/cgroup_drm.h
> @@ -6,6 +6,8 @@
>
> #include <drm/drm_file.h>
>
> +#define DRM_CGROUP_WEIGHT_SHIFT 10
> +
> #if IS_ENABLED(CONFIG_CGROUP_DRM)
> void drmcgroup_client_open(struct drm_file *file_priv);
> void drmcgroup_client_close(struct drm_file *file_priv);
> diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c
> index ea7655edf86a..532702604786 100644
> --- a/kernel/cgroup/drm.c
> +++ b/kernel/cgroup/drm.c
> @@ -13,10 +13,18 @@ struct drm_cgroup_state {
> struct cgroup_subsys_state css;
>
> struct list_head clients;
> + unsigned int num_clients; /* Whole branch */
> +
> + unsigned int sum_children_weights;
> +
> + unsigned int weight;
> + unsigned int effective_weight;
IIUC, this is the weight normalized across siblings (and not scaled by
ancestors)?
It should then be sufficient to only notify the siblings of the modified
cgroup after the recalculation, shouldn't it?
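(Spelling out the formula from notify_worker() below for my own benefit:
with DRM_CGROUP_WEIGHT_SHIFT = 10 and two active siblings weighted 100
and 300, they end up with

    DIV_ROUND_CLOSEST(100 << 10, 400) == 256
    DIV_ROUND_CLOSEST(300 << 10, 400) == 768

i.e. fractions of 1024 computed among siblings only, with nothing folded
in from the ancestors.)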
> };
>
> struct drm_root_cgroup_state {
> struct drm_cgroup_state drmcs;
> +
> + struct delayed_work notify_work;
> };
>
> static struct drm_root_cgroup_state root_drmcs = {
> @@ -31,7 +39,7 @@ css_to_drmcs(struct cgroup_subsys_state *css)
> return container_of(css, struct drm_cgroup_state, css);
> }
>
> -static void __maybe_unused
> +static void
> drmcs_notify_weight(struct drm_cgroup_state *drmcs)
> {
> struct drm_file *fpriv;
> @@ -43,16 +51,152 @@ drmcs_notify_weight(struct drm_cgroup_state *drmcs)
> fpriv->minor->dev->driver->cg_ops;
>
> if (cg_ops && cg_ops->notify_weight)
> - cg_ops->notify_weight(fpriv, 0);
> + cg_ops->notify_weight(fpriv, drmcs->effective_weight);
> }
> }
>
> +static void drmcg_update_weights_locked(void)
> +{
> + lockdep_assert_held(&drmcg_mutex);
> +
> + mod_delayed_work(system_wq,
> + &root_drmcs.notify_work,
> + usecs_to_jiffies(1000));
This value is a little bit magic.
What is this consolidation good for?
(I guess it is needed because of clients joining/leaving rather than
because of cgroup attribute modifications.)
> +}
> +
> +static void drmcg_update_weights(void)
> +{
> + mutex_lock(&drmcg_mutex);
> + drmcg_update_weights_locked();
> + mutex_unlock(&drmcg_mutex);
> +}
> +
> +static u64
> +drmcs_read_weight(struct cgroup_subsys_state *css, struct cftype *cft)
> +{
> + struct drm_cgroup_state *drmcs = css_to_drmcs(css);
> +
> + return drmcs->weight;
> +}
> +
> +static int
> +drmcs_write_weight(struct cgroup_subsys_state *css, struct cftype *cftype,
> + u64 weight)
> +{
> + struct drm_cgroup_state *drmcs = css_to_drmcs(css);
> + int ret;
> +
> + if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
> + return -ERANGE;
> +
> + ret = mutex_lock_interruptible(&drmcg_mutex);
> + if (ret)
> + return ret;
> + drmcs->weight = weight;
> + drmcg_update_weights_locked();
> + mutex_unlock(&drmcg_mutex);
> +
> + return 0;
> +}
> +
> +static void notify_worker(struct work_struct *work)
> +{
> + struct drm_cgroup_state *root = &root_drmcs.drmcs;
> + struct cgroup_subsys_state *node;
> + bool updated;
> +
> + mutex_lock(&drmcg_mutex);
> + rcu_read_lock();
> +
> + /*
> + * Always come back later if we race with core cgroup management.
> + */
> + updated = false;
> + if (WARN_ON_ONCE(!css_tryget_online(&root->css)))
> + goto out_unlock;
> +
> + css_for_each_descendant_post(node, &root->css) {
> + struct drm_cgroup_state *drmcs = css_to_drmcs(node);
> +
> + if (!css_tryget_online(node))
> + goto out_put;
> +
> + drmcs->sum_children_weights = 0;
> + css_put(node);
> + }
> +
> + css_for_each_descendant_post(node, &root->css) {
> + struct drm_cgroup_state *drmcs = css_to_drmcs(node);
> + struct drm_cgroup_state *parent;
> +
> + if (!css_tryget_online(node))
> + goto out_put;
> + if (!node->parent || !drmcs->num_clients) {
> + css_put(node);
> + continue;
> + }
> + if (!css_tryget_online(node->parent)) {
> + css_put(node);
> + goto out_put;
> + }
> +
> + parent = css_to_drmcs(node->parent);
> + parent->sum_children_weights += drmcs->weight;
> + css_put(node);
> + css_put(&parent->css);
> + }
> +
> + css_for_each_descendant_pre(node, &root->css) {
> + struct drm_cgroup_state *drmcs = css_to_drmcs(node);
> + struct cgroup_subsys_state *css;
> +
> + if (!css_tryget_online(node))
> + goto out_put;
> + if (!drmcs->num_clients) {
> + css_put(node);
> + continue;
> + }
> +
> + css_for_each_child(css, &drmcs->css) {
> + struct drm_cgroup_state *sibling = css_to_drmcs(css);
> +
> + if (!css_tryget_online(css)) {
> + css_put(node);
> + goto out_put;
> + }
> + if (!sibling->num_clients) {
> + css_put(css);
> + continue;
> + }
> +
> + sibling->effective_weight =
> + DIV_ROUND_CLOSEST(sibling->weight <<
> + DRM_CGROUP_WEIGHT_SHIFT,
> + drmcs->sum_children_weights);
> + drmcs_notify_weight(sibling);
> + css_put(css);
> + }
> +
> + css_put(node);
> + }
> +
> + updated = true;
> +
> +out_put:
> + css_put(&root->css);
> +out_unlock:
> + rcu_read_unlock();
> +
> + if (!updated)
> + drmcg_update_weights_locked();
> +
> + mutex_unlock(&drmcg_mutex);
> +}
> +
> static void drmcs_free(struct cgroup_subsys_state *css)
> {
> - struct drm_cgroup_state *drmcs = css_to_drmcs(css);
> -
> - if (drmcs != &root_drmcs.drmcs)
> - kfree(drmcs);
> + if (css != &root_drmcs.drmcs.css)
> + kfree(css_to_drmcs(css));
> }
>
> static struct cgroup_subsys_state *
> @@ -62,6 +206,7 @@ drmcs_alloc(struct cgroup_subsys_state *parent_css)
>
> if (!parent_css) {
> drmcs = &root_drmcs.drmcs;
> + INIT_DELAYED_WORK(&root_drmcs.notify_work, notify_worker);
> } else {
> drmcs = kzalloc(sizeof(*drmcs), GFP_KERNEL);
> if (!drmcs)
> @@ -70,9 +215,147 @@ drmcs_alloc(struct cgroup_subsys_state *parent_css)
> INIT_LIST_HEAD(&drmcs->clients);
> }
>
> + drmcs->weight = CGROUP_WEIGHT_DFL;
> + drmcs->effective_weight = (1 << DRM_CGROUP_WEIGHT_SHIFT) / 2;
> +
> return &drmcs->css;
> }
>
> +static int drmcs_online(struct cgroup_subsys_state *css)
> +{
> + drmcg_update_weights();
> +
> + return 0;
> +}
> +
> +static void drmcs_offline(struct cgroup_subsys_state *css)
> +{
> + drmcg_update_weights();
> +}
> +
> +static struct drm_cgroup_state *old_drmcs;
> +
> +static int drmcs_can_attach(struct cgroup_taskset *tset)
> +{
> + struct cgroup_subsys_state *css;
> + struct task_struct *task;
> +
> + task = cgroup_taskset_first(tset, &css);
> + old_drmcs = css_to_drmcs(task_css(task, drm_cgrp_id));
This is similar to cpuset's cpuset_attach_old_cs.
Beware that when the controller is disabled, the migration can happen
from multiple (children) csses.
> +
> + return 0;
> +}
> +
> +static void __inc_clients(struct drm_cgroup_state *drmcs)
> +{
> + struct cgroup_subsys_state *parent = NULL;
> +
> + rcu_read_lock();
> + do {
> + drmcs->num_clients++;
> + WARN_ON_ONCE(!drmcs->num_clients);
> +
> + if (parent)
> + css_put(parent);
> +
> + parent = drmcs->css.parent;
> + if (parent) {
> + if (WARN_ON_ONCE(!css_tryget(parent)))
This should be ensured implicitly thanks to css::online_cnt (see the
sketch after this hunk).
> + break;
> +
> + drmcs = css_to_drmcs(parent);
> + }
> + } while (parent);
> + rcu_read_unlock();
> +}
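I.e. if the parent chain is guaranteed stable here (drmcg_mutex held and
the ancestors pinned via online_cnt), the walk could arguably reduce to
something like this untested sketch (and __dec_clients() analogously):

        static void __inc_clients(struct drm_cgroup_state *drmcs)
        {
                lockdep_assert_held(&drmcg_mutex);

                /* Propagate the client count up the whole branch. */
                while (drmcs) {
                        drmcs->num_clients++;
                        WARN_ON_ONCE(!drmcs->num_clients);

                        drmcs = drmcs->css.parent ?
                                css_to_drmcs(drmcs->css.parent) : NULL;
                }
        }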
> +
> +static void __dec_clients(struct drm_cgroup_state *drmcs)
> +{
> + struct cgroup_subsys_state *parent = NULL;
> +
> + rcu_read_lock();
> + do {
> + WARN_ON_ONCE(!drmcs->num_clients);
> + drmcs->num_clients--;
> +
> + if (parent)
> + css_put(parent);
> +
> + parent = drmcs->css.parent;
> + if (parent) {
> + if (WARN_ON_ONCE(!css_tryget(parent)))
> + break;
> +
> + drmcs = css_to_drmcs(parent);
> + }
> + } while (parent);
> + rcu_read_unlock();
> +}
> +
> +static void drmcs_attach(struct cgroup_taskset *tset)
> +{
> + struct drm_cgroup_state *old = old_drmcs;
> + struct cgroup_subsys_state *css;
> + struct drm_file *fpriv, *next;
> + struct drm_cgroup_state *new;
> + struct task_struct *task;
> + bool migrated = false;
> +
> + if (!old)
> + return;
> +
> + task = cgroup_taskset_first(tset, &css);
> + new = css_to_drmcs(task_css(task, drm_cgrp_id));
> + if (new == old)
> + return;
Beware here too; it'd be better to use the css from
cgroup_taskset_for_each(task, css, tset) below as the new css.
> +
> + mutex_lock(&drmcg_mutex);
> +
> + list_for_each_entry_safe(fpriv, next, &old->clients, clink) {
> + cgroup_taskset_for_each(task, css, tset) {
> + struct cgroup_subsys_state *old_css;
> + struct drm_cgroup_state *old_;
> +
> + if (task->flags & PF_KTHREAD)
> + continue;
> + if (!thread_group_leader(task))
> + continue;
Maybe use cgroup_taskset_for_each_leader()? (See the sketch after this
hunk.)
> +
> + new = css_to_drmcs(task_css(task, drm_cgrp_id));
> + if (WARN_ON_ONCE(new == old))
> + continue;
> +
> + if (rcu_access_pointer(fpriv->pid) != task_tgid(task))
> + continue;
> +
> + if (WARN_ON_ONCE(fpriv->__css != &old->css))
> + continue;
> +
> + old_css = fpriv->__css;
> + old_ = css_to_drmcs(old_css);
> + fpriv->__css = &new->css;
> + css_get(fpriv->__css);
> + list_move_tail(&fpriv->clink, &new->clients);
> + __dec_clients(old);
> + __inc_clients(new);
> + css_put(old_css);
> + migrated = true;
> + }
> + }
> +
> + if (migrated)
> + drmcg_update_weights_locked();
> +
> + mutex_unlock(&drmcg_mutex);
> +
> + old_drmcs = NULL;
> +}
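FWIW, taking the two remarks above together (destination css taken from
the iterator, leaders only), the migration loop might look roughly like
this untested sketch; it still relies on the single cached old_drmcs,
with the earlier caveat about multiple source csses left open:

        cgroup_taskset_for_each_leader(task, css, tset) {
                new = css_to_drmcs(css);
                if (new == old)
                        continue;

                /* Move this leader's clients from the old css to the new one. */
                list_for_each_entry_safe(fpriv, next, &old->clients, clink) {
                        if (rcu_access_pointer(fpriv->pid) != task_tgid(task))
                                continue;

                        fpriv->__css = &new->css;
                        css_get(&new->css);
                        list_move_tail(&fpriv->clink, &new->clients);
                        __dec_clients(old);
                        __inc_clients(new);
                        css_put(&old->css);
                        migrated = true;
                }
        }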
> +
> +static void drmcs_cancel_attach(struct cgroup_taskset *tset)
> +{
> + old_drmcs = NULL;
> +}
> +
> void drmcgroup_client_open(struct drm_file *file_priv)
> {
> struct drm_cgroup_state *drmcs;
> @@ -85,6 +368,8 @@ void drmcgroup_client_open(struct drm_file *file_priv)
> mutex_lock(&drmcg_mutex);
> file_priv->__css = &drmcs->css; /* Keeps the reference. */
> list_add_tail(&file_priv->clink, &drmcs->clients);
> + __inc_clients(drmcs);
> + drmcg_update_weights_locked();
> mutex_unlock(&drmcg_mutex);
> }
> EXPORT_SYMBOL_GPL(drmcgroup_client_open);
> @@ -100,7 +385,9 @@ void drmcgroup_client_close(struct drm_file *file_priv)
>
> mutex_lock(&drmcg_mutex);
> list_del(&file_priv->clink);
> + __dec_clients(drmcs);
> file_priv->__css = NULL;
> + drmcg_update_weights_locked();
> mutex_unlock(&drmcg_mutex);
>
> css_put(&drmcs->css);
> @@ -124,6 +411,9 @@ void drmcgroup_client_migrate(struct drm_file *file_priv)
> if (src != dst) {
> file_priv->__css = &dst->css; /* Keeps the reference. */
> list_move_tail(&file_priv->clink, &dst->clients);
> + __dec_clients(src);
> + __inc_clients(dst);
> + drmcg_update_weights_locked();
> }
>
> mutex_unlock(&drmcg_mutex);
> @@ -133,12 +423,23 @@ void drmcgroup_client_migrate(struct drm_file *file_priv)
> EXPORT_SYMBOL_GPL(drmcgroup_client_migrate);
>
> struct cftype files[] = {
> + {
> + .name = "weight",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .read_u64 = drmcs_read_weight,
> + .write_u64 = drmcs_write_weight,
> + },
> { } /* Zero entry terminates. */
> };
>
> struct cgroup_subsys drm_cgrp_subsys = {
> .css_alloc = drmcs_alloc,
> .css_free = drmcs_free,
> + .css_online = drmcs_online,
> + .css_offline = drmcs_offline,
> + .can_attach = drmcs_can_attach,
> + .attach = drmcs_attach,
> + .cancel_attach = drmcs_cancel_attach,
> .early_init = false,
> .legacy_cftypes = files,
> .dfl_cftypes = files,
> --
> 2.48.0
>
Regards,
Michal