[PATCH 2/2] i915/pmu: Cleanup pending events on unbind

Umesh Nerlige Ramappa umesh.nerlige.ramappa at intel.com
Tue Feb 13 19:44:18 UTC 2024


On Tue, Feb 13, 2024 at 08:36:43PM +0200, Jani Nikula wrote:
>On Tue, 13 Feb 2024, Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com> wrote:
>> Once a user opens an fd for a perf event, if the driver undergoes a
>> function level reset (FLR), the resources are not cleaned up as
>> expected. For this discussion FLR is defined as a PCI unbind followed by
>> a bind. perf_pmu_unregister() would cleanup everything, but when the user
>> closes the perf fd, perf_release is executed and we encounter null
>> pointer dereferences and/or list corruption in that path which require a
>> reboot to recover.
>>
>> The only approach that worked to resolve this was to close the file
>> associated with the event such that the relevant cleanup happens w.r.t.
>> the open file. To do so, use the event->owner task and find the file
>> relevant to the event and close it. This relies on the
>> file->private_data matching the event object.
>>
>> Note:
>> - Closing the event file is a delayed work that gets queued to system_wq.
>> The close is seen to happen when kernel returns to user space following
>> the unbind.
>>
>> - perf framework will access the pmu object after the last event has
>> been destroyed. The drm device is refcounted in the init and destroy
>> hooks, so this causes a use after free if we are releasing the drm
>> device reference after unbind has been called. To work around this, we
>> take an extra reference in the unbind path and release it using a
>> delayed work in the destroy patch. The delayed work is queued to
>> system_wq.
>>
>> Ref: https://lore.kernel.org/lkml/20240115170120.662220-1-tvrtko.ursulin@linux.intel.com/T/#me72abfa2771e6fc94b167ce47efdbf391cc313ab
>>
>> Opens:
>> - Synchronization may be needed between i915_pmu_unregister and
>> i915_pmu_event_destroy to avoid any races.
>>
>> - If unbind and bind happen from the same process the event fd is closed
>> after bind completes. This means that the cleanup would not happen
>> until bind completes. In this case, i915 loads fine, but pmu
>> registration fails with an error that the sysfs entries are already
>> present. There is no solution feasible here. Since this is not a fatal
>> error (reloading i915 works fine) and the usual case is to have bind and
>> unbind in separate processes, there is no intention to solve this.
>>
>> Other solutions/aspects tried:
>> - Call perf_event_disable() followed by perf_event_release_kernel() in
>> the unbind path to clean up the events. This still causes issues when
>> user closes the fd since perf_event_release_kernel() is called again and
>> fails requiring reboot.
>>
>> - Close all event fds in unbind and wait for the close to complete by
>> checking if list is empty. This wait does not work since the files
>> are actually closed when unbind returns to user space.
>>
>> Testing:
>> - New IGT tests have been added for this and are run with KASAN and
>>   kmemleak enabled.
>>
>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_pmu.c | 96 ++++++++++++++++++++++++++++++++-
>>  drivers/gpu/drm/i915/i915_pmu.h | 15 ++++++
>>  2 files changed, 110 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
>> index 4d2a289f848a..2f365c7f5db7 100644
>> --- a/drivers/gpu/drm/i915/i915_pmu.c
>> +++ b/drivers/gpu/drm/i915/i915_pmu.c
>> @@ -4,6 +4,8 @@
>>   * Copyright © 2017-2018 Intel Corporation
>>   */
>>
>> +#include <linux/fdtable.h>
>> +#include <linux/fs.h>
>>  #include <linux/pm_runtime.h>
>>
>>  #include "gt/intel_engine.h"
>> @@ -573,9 +575,21 @@ static void i915_pmu_event_destroy(struct perf_event *event)
>>  {
>>  	struct i915_pmu *pmu = event_to_pmu(event);
>>  	struct drm_i915_private *i915 = pmu_to_i915(pmu);
>> +	struct i915_event *e = event->pmu_private;
>>
>>  	drm_WARN_ON(&i915->drm, event->parent);
>>
>> +	if (e) {
>> +		event->pmu_private = NULL;
>> +		list_del(&e->link);
>> +		kfree(e);
>> +	}
>> +
>> +	if (i915->pmu.closed && list_empty(&i915->pmu.initialized_events)) {
>> +		pmu_teardown(&i915->pmu);
>> +		mod_delayed_work(system_wq, &i915->pmu.work, 50);
>> +	}
>> +
>>  	drm_dev_put(&i915->drm);
>>  }
>>
>> @@ -684,6 +698,14 @@ static int i915_pmu_event_init(struct perf_event *event)
>>  		return ret;
>>
>>  	if (!event->parent) {
>> +		struct i915_event *e = kzalloc(sizeof(*e), GFP_KERNEL);
>> +
>> +		if (!e)
>> +			return -ENOMEM;
>> +
>> +		e->event = event;
>> +		list_add(&e->link, &pmu->initialized_events);
>> +		event->pmu_private = e;
>>  		drm_dev_get(&i915->drm);
>>  		event->destroy = i915_pmu_event_destroy;
>>  	}
>> @@ -1256,6 +1278,14 @@ void i915_pmu_exit(void)
>>  		cpuhp_remove_multi_state(cpuhp_slot);
>>  }
>>
>> +static void i915_pmu_release(struct work_struct *work)
>> +{
>> +	struct i915_pmu *pmu = container_of(work, typeof(*pmu), work.work);
>> +	struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
>> +
>> +	drm_dev_put(&i915->drm);
>> +}
>> +
>>  void i915_pmu_register(struct drm_i915_private *i915)
>>  {
>>  	struct i915_pmu *pmu = &i915->pmu;
>> @@ -1313,6 +1343,9 @@ void i915_pmu_register(struct drm_i915_private *i915)
>>  	pmu->base.read		= i915_pmu_event_read;
>>  	pmu->base.event_idx	= i915_pmu_event_event_idx;
>>
>> +	INIT_LIST_HEAD(&pmu->initialized_events);
>> +	INIT_DELAYED_WORK(&pmu->work, i915_pmu_release);
>> +
>>  	ret = perf_pmu_register(&pmu->base, pmu->name, -1);
>>  	if (ret)
>>  		goto err_groups;
>> @@ -1337,6 +1370,64 @@ void i915_pmu_register(struct drm_i915_private *i915)
>>  	drm_notice(&i915->drm, "Failed to register PMU!\n");
>>  }
>>
>> +/* Ref: close_fd() */
>> +static unsigned int __open_files(struct fdtable *fdt)
>> +{
>> +	unsigned int size = fdt->max_fds;
>> +	unsigned int i;
>> +
>> +	for (i = size / BITS_PER_LONG; i > 0; ) {
>> +		if (fdt->open_fds[--i])
>> +			break;
>> +	}
>> +	return (i + 1) * BITS_PER_LONG;
>> +}
>> +
>> +static void close_event_file(struct perf_event *event)
>> +{
>> +	unsigned int max_open_fds, fd;
>> +	struct files_struct *files;
>> +	struct task_struct *task;
>> +	struct fdtable *fdt;
>> +
>> +	task = event->owner;
>> +	if (!task)
>> +		return;
>> +
>> +	files = task->files;
>> +	if (!files)
>> +		return;
>> +
>> +	spin_lock(&files->file_lock);
>> +	fdt = files_fdtable(files);
>> +	max_open_fds = __open_files(fdt);
>> +	for (fd = 0; fd < max_open_fds; fd++) {
>> +		struct file *file = fdt->fd[fd];
>> +
>> +		if (!file || file->private_data != event)
>> +			continue;
>> +
>> +		rcu_assign_pointer(fdt->fd[fd], NULL);
>> +		__clear_bit(fd, fdt->open_fds);
>> +		__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
>> +		if (fd < files->next_fd)
>> +			files->next_fd = fd;
>> +		filp_close(file, files);
>> +		break;
>> +	}
>> +	spin_unlock(&files->file_lock);
>> +}
>> +
>> +static void cleanup_events(struct i915_pmu *pmu)
>> +{
>> +	struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
>> +	struct i915_event *e, *tmp;
>> +
>> +	drm_dev_get(&i915->drm);
>> +	list_for_each_entry_safe(e, tmp, &pmu->initialized_events, link)
>> +		close_event_file(e->event);
>> +}
>> +
>>  void i915_pmu_unregister(struct drm_i915_private *i915)
>>  {
>>  	struct i915_pmu *pmu = &i915->pmu;
>> @@ -1354,5 +1445,8 @@ void i915_pmu_unregister(struct drm_i915_private *i915)
>>
>>  	hrtimer_cancel(&pmu->timer);
>>
>> -	pmu_teardown(pmu);
>> +	if (list_empty(&pmu->initialized_events))
>> +		pmu_teardown(pmu);
>> +	else
>> +		cleanup_events(pmu);
>>  }
>> diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
>> index 41af038c3738..6f62e820f34d 100644
>> --- a/drivers/gpu/drm/i915/i915_pmu.h
>> +++ b/drivers/gpu/drm/i915/i915_pmu.h
>> @@ -55,6 +55,11 @@ struct i915_pmu_sample {
>>  	u64 cur;
>>  };
>>
>> +struct i915_event {
>> +	struct perf_event *event;
>> +	struct list_head link;
>> +};
>> +
>
>Nobody needs this outside of i915_pmu.c.

Agree. Will move it to i915_pmu.c

Thanks,
Umesh

>
>>  struct i915_pmu {
>>  	/**
>>  	 * @cpuhp: Struct used for CPU hotplug handling.
>> @@ -152,6 +157,16 @@ struct i915_pmu {
>>  	 * @pmu_attr: Memory block holding device attributes.
>>  	 */
>>  	void *pmu_attr;
>> +
>> +	/**
>> +	 * @initialized_events: List of initialized events
>> +	 */
>> +	struct list_head initialized_events;
>> +
>> +	/**
>> +	 * @work: worker to delay release of drm device reference
>> +	 */
>> +	struct delayed_work work;
>>  };
>>
>>  #ifdef CONFIG_PERF_EVENTS
>
>-- 
>Jani Nikula, Intel


More information about the Intel-gfx mailing list