[PATCH] dma-buf: Move sysfs work out of DMA-BUF export/release path

Christian König christian.koenig at amd.com
Thu Jan 6 08:59:08 UTC 2022


Am 05.01.22 um 00:51 schrieb Hridya Valsaraju:
> Recently, we noticed an issue where a process went into direct reclaim
> while holding the kernfs rw semaphore for sysfs in write(exclusive)
> mode. This caused processes who were doing DMA-BUF exports and releases
> to go into uninterruptible sleep since they needed to acquire the same
> semaphore for the DMA-BUF sysfs entry creation/deletion. In order to avoid
> blocking DMA-BUF export/release for an indeterminate amount of time
> while another process is holding the sysfs rw semaphore in exclusive
> mode, this patch moves the per-buffer sysfs file creation/deleteion to
> a kthread.

Well I absolutely don't think that this is justified.

You adding tons of complexity here just to avoid the overhead of 
creating the sysfs files while exporting DMA-bufs which is an operation 
which should be done exactly once in the lifecycle for the most common 
use case.

Please explain further why that should be necessary.

Regards,
Christian.

>
> Fixes: bdb8d06dfefd ("dmabuf: Add the capability to expose DMA-BUF stats in sysfs")
> Signed-off-by: Hridya Valsaraju <hridya at google.com>
> ---
>   drivers/dma-buf/dma-buf-sysfs-stats.c | 343 ++++++++++++++++++++++++--
>   include/linux/dma-buf.h               |  46 ++++
>   2 files changed, 366 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/dma-buf/dma-buf-sysfs-stats.c b/drivers/dma-buf/dma-buf-sysfs-stats.c
> index 053baadcada9..3251fdf2f05f 100644
> --- a/drivers/dma-buf/dma-buf-sysfs-stats.c
> +++ b/drivers/dma-buf/dma-buf-sysfs-stats.c
> @@ -7,13 +7,39 @@
>   
>   #include <linux/dma-buf.h>
>   #include <linux/dma-resv.h>
> +#include <linux/freezer.h>
>   #include <linux/kobject.h>
> +#include <linux/kthread.h>
> +#include <linux/list.h>
>   #include <linux/printk.h>
> +#include <linux/sched/signal.h>
>   #include <linux/slab.h>
>   #include <linux/sysfs.h>
>   
>   #include "dma-buf-sysfs-stats.h"
>   
> +struct dmabuf_kobj_work {
> +	struct list_head list;
> +	struct dma_buf_sysfs_entry *sysfs_entry;
> +	struct dma_buf_sysfs_entry_metadata *sysfs_metadata;
> +	unsigned long uid;
> +};
> +
> +/* Both kobject setup and teardown work gets queued on the list. */
> +static LIST_HEAD(dmabuf_kobj_work_list);
> +
> +/* dmabuf_kobj_list_lock protects dmabuf_kobj_work_list. */
> +static DEFINE_SPINLOCK(dmabuf_kobj_list_lock);
> +
> +/*
> + * dmabuf_sysfs_show_lock prevents a race between a DMA-BUF sysfs file being
> + * read and the DMA-BUF being freed by protecting sysfs_entry->dmabuf.
> + */
> +static DEFINE_SPINLOCK(dmabuf_sysfs_show_lock);
> +
> +static struct task_struct *dmabuf_kobject_task;
> +static wait_queue_head_t dmabuf_kobject_waitqueue;
> +
>   #define to_dma_buf_entry_from_kobj(x) container_of(x, struct dma_buf_sysfs_entry, kobj)
>   
>   /**
> @@ -64,15 +90,26 @@ static ssize_t dma_buf_stats_attribute_show(struct kobject *kobj,
>   	struct dma_buf_stats_attribute *attribute;
>   	struct dma_buf_sysfs_entry *sysfs_entry;
>   	struct dma_buf *dmabuf;
> +	int ret;
>   
>   	attribute = to_dma_buf_stats_attr(attr);
>   	sysfs_entry = to_dma_buf_entry_from_kobj(kobj);
> +
> +	/*
> +	 * acquire dmabuf_sysfs_show_lock to prevent a race with the DMA-BUF
> +	 * being freed while sysfs_entry->dmabuf is being accessed.
> +	 */
> +	spin_lock(&dmabuf_sysfs_show_lock);
>   	dmabuf = sysfs_entry->dmabuf;
>   
> -	if (!dmabuf || !attribute->show)
> +	if (!dmabuf || !attribute->show) {
> +		spin_unlock(&dmabuf_sysfs_show_lock);
>   		return -EIO;
> +	}
>   
> -	return attribute->show(dmabuf, attribute, buf);
> +	ret = attribute->show(dmabuf, attribute, buf);
> +	spin_unlock(&dmabuf_sysfs_show_lock);
> +	return ret;
>   }
>   
>   static const struct sysfs_ops dma_buf_stats_sysfs_ops = {
> @@ -118,33 +155,275 @@ static struct kobj_type dma_buf_ktype = {
>   	.default_groups = dma_buf_stats_default_groups,
>   };
>   
> -void dma_buf_stats_teardown(struct dma_buf *dmabuf)
> +/* Statistics files do not need to send uevents. */
> +static int dmabuf_sysfs_uevent_filter(struct kset *kset, struct kobject *kobj)
>   {
> -	struct dma_buf_sysfs_entry *sysfs_entry;
> +	return 0;
> +}
>   
> -	sysfs_entry = dmabuf->sysfs_entry;
> -	if (!sysfs_entry)
> -		return;
> +static const struct kset_uevent_ops dmabuf_sysfs_no_uevent_ops = {
> +	.filter = dmabuf_sysfs_uevent_filter,
> +};
> +
> +/* setup of sysfs entries done asynchronously in the worker thread. */
> +static void dma_buf_sysfs_stats_setup_work(struct dmabuf_kobj_work *kobject_work)
> +{
> +	struct dma_buf_sysfs_entry *sysfs_entry = kobject_work->sysfs_entry;
> +	struct dma_buf_sysfs_entry_metadata *sysfs_metadata =
> +			kobject_work->sysfs_metadata;
> +	bool free_metadata = false;
> +
> +	int ret = kobject_init_and_add(&sysfs_entry->kobj, &dma_buf_ktype, NULL,
> +				       "%lu", kobject_work->uid);
> +	if (ret) {
> +		kobject_put(&sysfs_entry->kobj);
> +
> +		spin_lock(&sysfs_metadata->sysfs_entry_lock);
> +		if (sysfs_metadata->status == SYSFS_ENTRY_INIT_ABORTED) {
> +			/*
> +			 * SYSFS_ENTRY_INIT_ABORTED means that the DMA-BUF has already
> +			 * been freed. At this point, its safe to free the memory for
> +			 * the sysfs_metadata;
> +			 */
> +			free_metadata = true;
> +		} else {
> +			/*
> +			 * The DMA-BUF has not yet been freed, set the status to
> +			 * sysfs_entry_error so that when the DMA-BUF gets
> +			 * freed, we know there is no need to teardown the sysfs
> +			 * entry.
> +			 */
> +			sysfs_metadata->status = SYSFS_ENTRY_ERROR;
> +		}
> +		goto unlock;
> +	}
> +
> +	/*
> +	 * If the DMA-BUF has not yet been released, status would still be
> +	 * SYSFS_ENTRY_INIT_IN_PROGRESS. We set the status as initialized.
> +	 */
> +	spin_lock(&sysfs_metadata->sysfs_entry_lock);
> +	if (sysfs_metadata->status == SYSFS_ENTRY_INIT_IN_PROGRESS) {
> +		sysfs_metadata->status = SYSFS_ENTRY_INITIALIZED;
> +		goto unlock;
> +	}
>   
> +	/*
> +	 * At this point the status is SYSFS_ENTRY_INIT_ABORTED which means
> +	 * that the DMA-BUF has already been freed. Hence, we cleanup the
> +	 * sysfs_entry and its metadata since neither of them are needed
> +	 * anymore.
> +	 */
> +	free_metadata = true;
>   	kobject_del(&sysfs_entry->kobj);
>   	kobject_put(&sysfs_entry->kobj);
> +
> +unlock:
> +	spin_unlock(&sysfs_metadata->sysfs_entry_lock);
> +	if (free_metadata) {
> +		kfree(kobject_work->sysfs_metadata);
> +		kobject_work->sysfs_metadata = NULL;
> +	}
>   }
>   
> +/* teardown of sysfs entries done asynchronously in the worker thread. */
> +static void dma_buf_sysfs_stats_teardown_work(struct dmabuf_kobj_work *kobject_work)
> +{
> +	struct dma_buf_sysfs_entry *sysfs_entry = kobject_work->sysfs_entry;
>   
> -/* Statistics files do not need to send uevents. */
> -static int dmabuf_sysfs_uevent_filter(struct kset *kset, struct kobject *kobj)
> +	kobject_del(&sysfs_entry->kobj);
> +	kobject_put(&sysfs_entry->kobj);
> +
> +	kfree(kobject_work->sysfs_metadata);
> +	kobject_work->sysfs_metadata = NULL;
> +}
> +
> +/* do setup or teardown of sysfs entries as required */
> +static void do_kobject_work(struct dmabuf_kobj_work *kobject_work)
>   {
> +	struct dma_buf_sysfs_entry_metadata *sysfs_metadata;
> +	bool setup_needed = false;
> +	bool teardown_needed = false;
> +
> +	sysfs_metadata = kobject_work->sysfs_metadata;
> +	spin_lock(&sysfs_metadata->sysfs_entry_lock);
> +	if (sysfs_metadata->status == SYSFS_ENTRY_UNINITIALIZED) {
> +		setup_needed = true;
> +		sysfs_metadata->status = SYSFS_ENTRY_INIT_IN_PROGRESS;
> +	} else if (sysfs_metadata->status == SYSFS_ENTRY_INITIALIZED) {
> +		teardown_needed = true;
> +	}
> +
> +	/*
> +	 * It is ok to release the sysfs_entry_lock here.
> +	 *
> +	 * If setup_needed is true, we check the status again after the kobject
> +	 * initialization to see if it has been set to SYSFS_ENTRY_INIT_ABORTED
> +	 * and if so teardown the kobject.
> +	 *
> +	 * If teardown_needed is true, there are no more changes expected to the
> +	 * status.
> +	 *
> +	 * If neither setup_needed nor teardown needed are true, it
> +	 * means the DMA-BUF was freed before we got around to setting up the
> +	 * sysfs entry and hence we just need to release the metadata and
> +	 * return.
> +	 */
> +	spin_unlock(&kobject_work->sysfs_metadata->sysfs_entry_lock);
> +
> +	if (setup_needed)
> +		dma_buf_sysfs_stats_setup_work(kobject_work);
> +	else if (teardown_needed)
> +		dma_buf_sysfs_stats_teardown_work(kobject_work);
> +	else
> +		kfree(kobject_work->sysfs_metadata);
> +
> +	kfree(kobject_work);
> +}
> +
> +static struct dmabuf_kobj_work *get_next_kobj_work(void)
> +{
> +	struct dmabuf_kobj_work *kobject_work;
> +
> +	spin_lock(&dmabuf_kobj_list_lock);
> +	kobject_work = list_first_entry_or_null(&dmabuf_kobj_work_list,
> +						struct dmabuf_kobj_work, list);
> +	if (kobject_work)
> +		list_del(&kobject_work->list);
> +	spin_unlock(&dmabuf_kobj_list_lock);
> +	return kobject_work;
> +}
> +
> +static int kobject_work_thread(void *data)
> +{
> +	struct dmabuf_kobj_work *kobject_work;
> +
> +	while (1) {
> +		wait_event_freezable(dmabuf_kobject_waitqueue,
> +				     (kobject_work = get_next_kobj_work()));
> +		do_kobject_work(kobject_work);
> +	}
> +
>   	return 0;
>   }
>   
> -static const struct kset_uevent_ops dmabuf_sysfs_no_uevent_ops = {
> -	.filter = dmabuf_sysfs_uevent_filter,
> -};
> +static int kobject_worklist_init(void)
> +{
> +	init_waitqueue_head(&dmabuf_kobject_waitqueue);
> +	dmabuf_kobject_task = kthread_run(kobject_work_thread, NULL,
> +					  "%s", "dmabuf-kobject-worker");
> +	if (IS_ERR(dmabuf_kobject_task)) {
> +		pr_err("Creating thread for deferred sysfs entry creation/deletion failed\n");
> +		return PTR_ERR(dmabuf_kobject_task);
> +	}
> +	sched_set_normal(dmabuf_kobject_task, MAX_NICE);
> +
> +	return 0;
> +}
> +
> +static void deferred_kobject_create(struct dmabuf_kobj_work *kobject_work)
> +{
> +	INIT_LIST_HEAD(&kobject_work->list);
> +
> +	spin_lock(&dmabuf_kobj_list_lock);
> +
> +	list_add_tail(&kobject_work->list, &dmabuf_kobj_work_list);
> +
> +	spin_unlock(&dmabuf_kobj_list_lock);
> +
> +	wake_up(&dmabuf_kobject_waitqueue);
> +}
> +
> +
> +void dma_buf_stats_teardown(struct dma_buf *dmabuf)
> +{
> +	struct dma_buf_sysfs_entry *sysfs_entry;
> +	struct dma_buf_sysfs_entry_metadata *sysfs_metadata;
> +	struct dmabuf_kobj_work *kobj_work;
> +
> +	sysfs_entry = dmabuf->sysfs_entry;
> +	if (!sysfs_entry)
> +		return;
> +
> +	sysfs_metadata = dmabuf->sysfs_entry_metadata;
> +	if (!sysfs_metadata)
> +		return;
> +
> +	spin_lock(&sysfs_metadata->sysfs_entry_lock);
> +
> +	if (sysfs_metadata->status == SYSFS_ENTRY_UNINITIALIZED ||
> +	    sysfs_metadata->status == SYSFS_ENTRY_INIT_IN_PROGRESS) {
> +		/*
> +		 * The sysfs entry for this buffer has not yet been initialized,
> +		 * we set the status to SYSFS_ENTRY_INIT_ABORTED to abort the
> +		 * initialization.
> +		 */
> +		sysfs_metadata->status = SYSFS_ENTRY_INIT_ABORTED;
> +		spin_unlock(&sysfs_metadata->sysfs_entry_lock);
> +
> +		/*
> +		 * In case kobject initialization completes right as we release
> +		 * the sysfs_entry_lock, disable show() for the sysfs entry by
> +		 * setting sysfs_entry->dmabuf to NULL to prevent a race.
> +		 */
> +		spin_lock(&dmabuf_sysfs_show_lock);
> +		sysfs_entry->dmabuf = NULL;
> +		spin_unlock(&dmabuf_sysfs_show_lock);
> +
> +		return;
> +	}
> +
> +	if (sysfs_metadata->status == SYSFS_ENTRY_INITIALIZED) {
> +		/*
> +		 * queue teardown work only if sysfs_entry is fully inititalized.
> +		 * It is ok to release the sysfs_entry_lock here since the
> +		 * status can no longer change.
> +		 */
> +		spin_unlock(&sysfs_metadata->sysfs_entry_lock);
> +
> +		/*
> +		 * Meanwhile disable show() for the sysfs entry to avoid a race
> +		 * between teardown and show().
> +		 */
> +		spin_lock(&dmabuf_sysfs_show_lock);
> +		sysfs_entry->dmabuf = NULL;
> +		spin_unlock(&dmabuf_sysfs_show_lock);
> +
> +		kobj_work = kzalloc(sizeof(struct dmabuf_kobj_work), GFP_KERNEL);
> +		if (!kobj_work) {
> +			/* do the teardown immediately. */
> +			kobject_del(&sysfs_entry->kobj);
> +			kobject_put(&sysfs_entry->kobj);
> +			kfree(sysfs_metadata);
> +		} else {
> +			/* queue teardown work. */
> +			kobj_work->sysfs_entry = dmabuf->sysfs_entry;
> +			kobj_work->sysfs_metadata = dmabuf->sysfs_entry_metadata;
> +			deferred_kobject_create(kobj_work);
> +		}
> +
> +		return;
> +	}
> +
> +	/*
> +	 * status is SYSFS_ENTRY_INIT_ERROR so we only need to free the
> +	 * metadata.
> +	 */
> +	spin_unlock(&sysfs_metadata->sysfs_entry_lock);
> +	kfree(dmabuf->sysfs_entry_metadata);
> +	dmabuf->sysfs_entry_metadata = NULL;
> +}
>   
>   static struct kset *dma_buf_stats_kset;
>   static struct kset *dma_buf_per_buffer_stats_kset;
>   int dma_buf_init_sysfs_statistics(void)
>   {
> +	int ret;
> +
> +	ret = kobject_worklist_init();
> +	if (ret)
> +		return ret;
> +
>   	dma_buf_stats_kset = kset_create_and_add("dmabuf",
>   						 &dmabuf_sysfs_no_uevent_ops,
>   						 kernel_kobj);
> @@ -171,7 +450,8 @@ void dma_buf_uninit_sysfs_statistics(void)
>   int dma_buf_stats_setup(struct dma_buf *dmabuf)
>   {
>   	struct dma_buf_sysfs_entry *sysfs_entry;
> -	int ret;
> +	struct dma_buf_sysfs_entry_metadata *sysfs_metadata;
> +	struct dmabuf_kobj_work *kobj_work;
>   
>   	if (!dmabuf || !dmabuf->file)
>   		return -EINVAL;
> @@ -188,18 +468,35 @@ int dma_buf_stats_setup(struct dma_buf *dmabuf)
>   	sysfs_entry->kobj.kset = dma_buf_per_buffer_stats_kset;
>   	sysfs_entry->dmabuf = dmabuf;
>   
> +	sysfs_metadata = kzalloc(sizeof(struct dma_buf_sysfs_entry_metadata),
> +				 GFP_KERNEL);
> +	if (!sysfs_metadata) {
> +		kfree(sysfs_entry);
> +		return -ENOMEM;
> +	}
> +
>   	dmabuf->sysfs_entry = sysfs_entry;
>   
> -	/* create the directory for buffer stats */
> -	ret = kobject_init_and_add(&sysfs_entry->kobj, &dma_buf_ktype, NULL,
> -				   "%lu", file_inode(dmabuf->file)->i_ino);
> -	if (ret)
> -		goto err_sysfs_dmabuf;
> +	sysfs_metadata->status = SYSFS_ENTRY_UNINITIALIZED;
> +	spin_lock_init(&sysfs_metadata->sysfs_entry_lock);
>   
> -	return 0;
> +	dmabuf->sysfs_entry_metadata = sysfs_metadata;
>   
> -err_sysfs_dmabuf:
> -	kobject_put(&sysfs_entry->kobj);
> -	dmabuf->sysfs_entry = NULL;
> -	return ret;
> +	kobj_work = kzalloc(sizeof(struct dmabuf_kobj_work), GFP_KERNEL);
> +	if (!kobj_work) {
> +		kfree(sysfs_entry);
> +		kfree(sysfs_metadata);
> +		return -ENOMEM;
> +	}
> +
> +	kobj_work->sysfs_entry = dmabuf->sysfs_entry;
> +	kobj_work->sysfs_metadata = dmabuf->sysfs_entry_metadata;
> +	/*
> +	 * stash the inode number in struct dmabuf_kobj_work since setup
> +	 * might race with DMA-BUF teardown.
> +	 */
> +	kobj_work->uid = file_inode(dmabuf->file)->i_ino;
> +
> +	deferred_kobject_create(kobj_work);
> +	return 0;
>   }
> diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> index 7ab50076e7a6..0597690023a0 100644
> --- a/include/linux/dma-buf.h
> +++ b/include/linux/dma-buf.h
> @@ -287,6 +287,50 @@ struct dma_buf_ops {
>   	void (*vunmap)(struct dma_buf *dmabuf, struct dma_buf_map *map);
>   };
>   
> +#ifdef CONFIG_DMABUF_SYSFS_STATS
> +enum sysfs_entry_status {
> +	SYSFS_ENTRY_UNINITIALIZED,
> +	SYSFS_ENTRY_INIT_IN_PROGRESS,
> +	SYSFS_ENTRY_ERROR,
> +	SYSFS_ENTRY_INIT_ABORTED,
> +	SYSFS_ENTRY_INITIALIZED,
> +};
> +
> +/*
> + * struct dma_buf_sysfs_entry_metadata - Holds the current status for the
> + * DMA-BUF sysfs entry.
> + *
> + * @status: holds the current status for the DMA-BUF sysfs entry. The status of
> + * the sysfs entry has the following path.
> + *
> + *			SYSFS_ENTRY_UNINITIALIZED
> + *		 __________________|____________________
> + *		|					|
> + *	  SYSFS_ENTRY_INIT_IN_PROGRESS	    SYSFS_ENTRY_INIT_ABORTED (DMA-BUF
> + *		|						      gets freed
> + *		|						      before
> + *		|						      init)
> + *	________|_____________________________________
> + *	|			  |		      |
> + * SYSFS_ENTRY_INITIALIZED	  |	  SYSFS_ENTRY_INIT_ABORTED
> + *				  |		  (DMA-BUF gets freed during kobject
> + *				  |		  init)
> + *				  |
> + *				  |
> + *		      SYSFS_ENTRY_ERROR
> + *		      (error during kobject init)
> + *
> + * @sysfs_entry_lock: protects access to @status.
> + */
> +struct dma_buf_sysfs_entry_metadata {
> +	enum sysfs_entry_status status;
> +	/*
> +	 * Protects sysfs_entry_metadata->status
> +	 */
> +	spinlock_t sysfs_entry_lock;
> +};
> +#endif
> +
>   /**
>    * struct dma_buf - shared buffer object
>    *
> @@ -452,6 +496,8 @@ struct dma_buf {
>   		struct kobject kobj;
>   		struct dma_buf *dmabuf;
>   	} *sysfs_entry;
> +
> +	struct dma_buf_sysfs_entry_metadata *sysfs_entry_metadata;
>   #endif
>   };
>   



More information about the dri-devel mailing list