[PATCH 3/5] perf: Add pmu get/put

Lucas De Marchi lucas.demarchi at intel.com
Fri Oct 18 19:46:31 UTC 2024


On Wed, Oct 16, 2024 at 02:03:02PM +0200, Peter Zijlstra wrote:
>On Mon, Oct 14, 2024 at 09:25:19PM +0200, Peter Zijlstra wrote:
>
>> Let me ponder that a little bit.
>
>So I did the thing on top of the get/put thing that would allow you to
>get rid of the ->closed thing, and before I was finished I already hated
>all of it :-(
>
>The thing is, if you're going to the effort of adding get/put and
>putting the responsibility on the implementation, then the ->closed
>thing is only a little extra ask.
>
>So... I wondered, how hard it would be for perf_pmu_unregister() to do
>what you want.
>
>Time passed, hacks were done...
>
>and while what I have is still riddled with holes (more work is
>definitely required), it does pass your dummy_pmu test scipt.
>
>I'll poke at this a little longer. Afaict it should be possible to make
>this good enough for what you want. Just needs more holes filled.
>
>---
> include/linux/perf_event.h |  13 ++-
> kernel/events/core.c       | 260 ++++++++++++++++++++++++++++++++++++++-------
> 2 files changed, 228 insertions(+), 45 deletions(-)
>
>diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>index fb908843f209..20995d33d27e 100644
>--- a/include/linux/perf_event.h
>+++ b/include/linux/perf_event.h
>@@ -318,6 +318,9 @@ struct perf_output_handle;
> struct pmu {
> 	struct list_head		entry;
>
>+	spinlock_t			events_lock;
>+	struct list_head		events;

oh... I looked at several lists and was wondering which one would
contain the events of our pmu. Now I see we didn't have that :)

>+
> 	struct module			*module;
> 	struct device			*dev;
> 	struct device			*parent;
>@@ -612,9 +615,10 @@ struct perf_addr_filter_range {
>  * enum perf_event_state - the states of an event:
>  */
> enum perf_event_state {
>-	PERF_EVENT_STATE_DEAD		= -4,
>-	PERF_EVENT_STATE_EXIT		= -3,
>-	PERF_EVENT_STATE_ERROR		= -2,
>+	PERF_EVENT_STATE_DEAD		= -5,
>+	PERF_EVENT_STATE_REVOKED	= -4, /* pmu gone, must not touch */
>+	PERF_EVENT_STATE_EXIT		= -3, /* task died, still inherit */
>+	PERF_EVENT_STATE_ERROR		= -2, /* scheduling error, can enable */
> 	PERF_EVENT_STATE_OFF		= -1,
> 	PERF_EVENT_STATE_INACTIVE	=  0,
> 	PERF_EVENT_STATE_ACTIVE		=  1,
>@@ -853,6 +857,7 @@ struct perf_event {
> 	void *security;
> #endif
> 	struct list_head		sb_list;
>+	struct list_head		pmu_list;
>
> 	/*
> 	 * Certain events gets forwarded to another pmu internally by over-
>@@ -1103,7 +1108,7 @@ extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
> extern void perf_event_itrace_started(struct perf_event *event);
>
> extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
>-extern void perf_pmu_unregister(struct pmu *pmu);
>+extern int perf_pmu_unregister(struct pmu *pmu);
>
> extern void __perf_event_task_sched_in(struct task_struct *prev,
> 				       struct task_struct *task);
>diff --git a/kernel/events/core.c b/kernel/events/core.c
>index cdd09769e6c5..e66827367a97 100644
>--- a/kernel/events/core.c
>+++ b/kernel/events/core.c
>@@ -2406,7 +2406,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
>
> #define DETACH_GROUP	0x01UL
> #define DETACH_CHILD	0x02UL
>-#define DETACH_DEAD	0x04UL
>+#define DETACH_EXIT	0x04UL
>+#define DETACH_REVOKE	0x08UL
>+#define DETACH_DEAD	0x10UL
>
> /*
>  * Cross CPU call to remove a performance event
>@@ -2421,6 +2423,7 @@ __perf_remove_from_context(struct perf_event *event,
> 			   void *info)
> {
> 	struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
>+	enum perf_event_state state = PERF_EVENT_STATE_OFF;
> 	unsigned long flags = (unsigned long)info;
>
> 	ctx_time_update(cpuctx, ctx);
>@@ -2429,16 +2432,22 @@ __perf_remove_from_context(struct perf_event *event,
> 	 * Ensure event_sched_out() switches to OFF, at the very least
> 	 * this avoids raising perf_pending_task() at this time.
> 	 */
>-	if (flags & DETACH_DEAD)
>+	if (flags & DETACH_EXIT)
>+		state = PERF_EVENT_STATE_EXIT;
>+	if (flags & DETACH_REVOKE)
>+		state = PERF_EVENT_STATE_REVOKED;
>+	if (flags & DETACH_DEAD) {
> 		event->pending_disable = 1;
>+		state = PERF_EVENT_STATE_DEAD;
>+	}
> 	event_sched_out(event, ctx);
> 	if (flags & DETACH_GROUP)
> 		perf_group_detach(event);
> 	if (flags & DETACH_CHILD)
> 		perf_child_detach(event);
> 	list_del_event(event, ctx);
>-	if (flags & DETACH_DEAD)
>-		event->state = PERF_EVENT_STATE_DEAD;
>+
>+	event->state = state;
>
> 	if (!pmu_ctx->nr_events) {
> 		pmu_ctx->rotate_necessary = 0;
>@@ -4508,7 +4517,8 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
>
> static void perf_remove_from_owner(struct perf_event *event);
> static void perf_event_exit_event(struct perf_event *event,
>-				  struct perf_event_context *ctx);
>+				  struct perf_event_context *ctx,
>+				  bool revoke);
>
> /*
>  * Removes all events from the current task that have been marked
>@@ -4535,7 +4545,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
>
> 		modified = true;
>
>-		perf_event_exit_event(event, ctx);
>+		perf_event_exit_event(event, ctx, false);
> 	}
>
> 	raw_spin_lock_irqsave(&ctx->lock, flags);
>@@ -5121,6 +5131,7 @@ static bool is_sb_event(struct perf_event *event)
> 	    attr->context_switch || attr->text_poke ||
> 	    attr->bpf_event)
> 		return true;
>+
> 	return false;
> }
>
>@@ -5321,6 +5332,8 @@ static void perf_pending_task_sync(struct perf_event *event)
>
> static void _free_event(struct perf_event *event)
> {
>+	struct pmu *pmu = event->pmu;
>+
> 	irq_work_sync(&event->pending_irq);
> 	irq_work_sync(&event->pending_disable_irq);
> 	perf_pending_task_sync(event);
>@@ -5330,6 +5343,7 @@ static void _free_event(struct perf_event *event)
> 	security_perf_event_free(event);
>
> 	if (event->rb) {
>+		WARN_ON_ONCE(!pmu);
> 		/*
> 		 * Can happen when we close an event with re-directed output.
> 		 *
>@@ -5349,12 +5363,16 @@ static void _free_event(struct perf_event *event)
> 			put_callchain_buffers();
> 	}
>
>-	perf_event_free_bpf_prog(event);
>-	perf_addr_filters_splice(event, NULL);
>-	kfree(event->addr_filter_ranges);
>+	if (pmu) {
>+		perf_event_free_bpf_prog(event);
>+		perf_addr_filters_splice(event, NULL);
>+		kfree(event->addr_filter_ranges);
>+	}
>
>-	if (event->destroy)
>+	if (event->destroy) {
>+		WARN_ON_ONCE(!pmu);
> 		event->destroy(event);
>+	}
>
> 	/*
> 	 * Must be after ->destroy(), due to uprobe_perf_close() using
>@@ -5363,8 +5381,10 @@ static void _free_event(struct perf_event *event)
> 	if (event->hw.target)
> 		put_task_struct(event->hw.target);
>
>-	if (event->pmu_ctx)
>+	if (event->pmu_ctx) {
>+		WARN_ON_ONCE(!pmu);
> 		put_pmu_ctx(event->pmu_ctx);
>+	}
>
> 	/*
> 	 * perf_event_free_task() relies on put_ctx() being 'last', in particular
>@@ -5373,8 +5393,14 @@ static void _free_event(struct perf_event *event)
> 	if (event->ctx)
> 		put_ctx(event->ctx);
>
>-	exclusive_event_destroy(event);
>-	module_put(event->pmu->module);
>+	if (pmu) {
>+		exclusive_event_destroy(event);
>+		module_put(pmu->module);
>+		scoped_guard(spinlock, &pmu->events_lock) {
>+			list_del(&event->pmu_list);
>+			wake_up_var(pmu);
>+		}
>+	}
>
> 	call_rcu(&event->rcu_head, free_event_rcu);
> }
>@@ -5492,7 +5518,11 @@ int perf_event_release_kernel(struct perf_event *event)
> 	 * Thus this guarantees that we will in fact observe and kill _ALL_
> 	 * child events.
> 	 */
>-	perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
>+	if (event->state > PERF_EVENT_STATE_REVOKED) {
>+		perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
>+	} else {
>+		event->state = PERF_EVENT_STATE_DEAD;
>+	}
>
> 	perf_event_ctx_unlock(event, ctx);
>
>@@ -5803,7 +5833,7 @@ __perf_read(struct perf_event *event, char __user *buf, size_t count)
> 	 * error state (i.e. because it was pinned but it couldn't be
> 	 * scheduled on to the CPU at some point).
> 	 */
>-	if (event->state == PERF_EVENT_STATE_ERROR)
>+	if (event->state <= PERF_EVENT_STATE_ERROR)
> 		return 0;
>
> 	if (count < event->read_size)
>@@ -5842,8 +5872,14 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
> 	struct perf_buffer *rb;
> 	__poll_t events = EPOLLHUP;
>
>+	if (event->state <= PERF_EVENT_STATE_REVOKED)
>+		return EPOLLERR;
>+
> 	poll_wait(file, &event->waitq, wait);
>
>+	if (event->state <= PERF_EVENT_STATE_REVOKED)
>+		return EPOLLERR;
>+
> 	if (is_event_hup(event))
> 		return events;
>
>@@ -6013,7 +6049,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
> }
>
> static int perf_event_set_output(struct perf_event *event,
>-				 struct perf_event *output_event);
>+				 struct perf_event *output_event, bool force);
> static int perf_event_set_filter(struct perf_event *event, void __user *arg);
> static int perf_copy_attr(struct perf_event_attr __user *uattr,
> 			  struct perf_event_attr *attr);
>@@ -6023,6 +6059,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
> 	void (*func)(struct perf_event *);
> 	u32 flags = arg;
>
>+	if (event->state <= PERF_EVENT_STATE_REVOKED)
>+		return -ENODEV;
>+
> 	switch (cmd) {
> 	case PERF_EVENT_IOC_ENABLE:
> 		func = _perf_event_enable;
>@@ -6065,10 +6104,10 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
> 			if (ret)
> 				return ret;
> 			output_event = fd_file(output)->private_data;
>-			ret = perf_event_set_output(event, output_event);
>+			ret = perf_event_set_output(event, output_event, false);
> 			fdput(output);
> 		} else {
>-			ret = perf_event_set_output(event, NULL);
>+			ret = perf_event_set_output(event, NULL, false);
> 		}
> 		return ret;
> 	}
>@@ -6472,6 +6511,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> 	unsigned long size = perf_data_size(rb);
> 	bool detach_rest = false;
>
>+	/* FIXIES vs perf_pmu_unregister() */
> 	if (event->pmu->event_unmapped)
> 		event->pmu->event_unmapped(event, vma->vm_mm);
>
>@@ -6591,7 +6631,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> 	unsigned long vma_size;
> 	unsigned long nr_pages;
> 	long user_extra = 0, extra = 0;
>-	int ret = 0, flags = 0;
>+	int ret, flags = 0;
>+
>+	ret = security_perf_event_read(event);
>+	if (ret)
>+		return ret;
>+
>+	/* XXX needs event->mmap_mutex? */
>+	if (event->state <= PERF_EVENT_STATE_REVOKED)
>+		return -ENODEV;
>
> 	/*
> 	 * Don't allow mmap() of inherited per-task counters. This would
>@@ -6604,10 +6652,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> 	if (!(vma->vm_flags & VM_SHARED))
> 		return -EINVAL;
>
>-	ret = security_perf_event_read(event);
>-	if (ret)
>-		return ret;
>-
> 	vma_size = vma->vm_end - vma->vm_start;
>
> 	if (vma->vm_pgoff == 0) {
>@@ -6810,6 +6854,9 @@ static int perf_fasync(int fd, struct file *filp, int on)
> 	struct perf_event *event = filp->private_data;
> 	int retval;
>
>+	if (event->state <= PERF_EVENT_STATE_REVOKED)
>+		return -ENODEV;
>+
> 	inode_lock(inode);
> 	retval = fasync_helper(fd, filp, on, &event->fasync);
> 	inode_unlock(inode);
>@@ -11513,6 +11560,7 @@ static int perf_event_idx_default(struct perf_event *event)
>
> static void free_pmu_context(struct pmu *pmu)
> {
>+	free_percpu(pmu->pmu_disable_count);
> 	free_percpu(pmu->cpu_pmu_context);
> }
>
>@@ -11753,6 +11801,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
> 	if (type >= 0)
> 		max = type;
>
>+	// XXX broken vs perf_init_event(), this publishes before @pmu is finalized
> 	ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
> 	if (ret < 0)
> 		goto free_pdc;
>@@ -11809,8 +11858,14 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
> 	if (!pmu->event_idx)
> 		pmu->event_idx = perf_event_idx_default;
>
>-	list_add_rcu(&pmu->entry, &pmus);
> 	atomic_set(&pmu->exclusive_cnt, 0);
>+	INIT_LIST_HEAD(&pmu->events);
>+	spin_lock_init(&pmu->events_lock);
>+
>+	/*
>+	 * Publish the pmu after it is fully initialized.
>+	 */
>+	list_add_rcu(&pmu->entry, &pmus);
> 	ret = 0;
> unlock:
> 	mutex_unlock(&pmus_lock);
>@@ -11832,10 +11887,110 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
> }
> EXPORT_SYMBOL_GPL(perf_pmu_register);
>
>-void perf_pmu_unregister(struct pmu *pmu)
>+static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
>+			       struct perf_event_context *ctx)
> {
>+	/*
>+	 * De-schedule the event and mark it EXIT_PMU.
>+	 */
>+	perf_event_exit_event(event, ctx, true);
>+
>+	/*
>+	 * All _free_event() bits that rely on event->pmu:
>+	 */
>+	perf_event_set_output(event, NULL, true);
>+
>+	perf_event_free_bpf_prog(event);
>+	perf_addr_filters_splice(event, NULL);
>+	kfree(event->addr_filter_ranges);
>+
>+	if (event->destroy) {
>+		event->destroy(event);
>+		event->destroy = NULL;
>+	}
>+
>+	if (event->pmu_ctx) {
>+		/*
>+		 * Make sure pmu->cpu_pmu_context is unused. An alternative is
>+		 * to have it be pointers and make put_pmu_ctx()'s
>+		 * epc->embedded case be another RCU free case.
>+		 */
>+		put_pmu_ctx(event->pmu_ctx);
>+		event->pmu_ctx = NULL;
>+	}
>+
>+	exclusive_event_destroy(event);
>+	module_put(pmu->module);
>+
>+	event->pmu = NULL; /* force fault instead of UAF */
>+}
>+
>+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
>+{
>+	struct perf_event_context *ctx;
>+
>+	ctx = perf_event_ctx_lock(event);
>+	__pmu_detach_event(pmu, event, ctx);
>+	perf_event_ctx_unlock(event, ctx);
>+
>+	scoped_guard(spinlock, &pmu->events_lock)
>+		list_del(&event->pmu_list);
>+}
>+
>+static struct perf_event *pmu_get_event(struct pmu *pmu)
>+{
>+	struct perf_event *event;
>+
>+	guard(spinlock)(&pmu->events_lock);
>+	list_for_each_entry(event, &pmu->events, pmu_list) {
>+		if (atomic_long_inc_not_zero(&event->refcount))
>+			return event;
>+	}
>+
>+	return NULL;
>+}
>+
>+static bool pmu_empty(struct pmu *pmu)
>+{
>+	guard(spinlock)(&pmu->events_lock);
>+	return list_empty(&pmu->events);
>+}
>+
>+static void pmu_detach_events(struct pmu *pmu)
>+{
>+	struct perf_event *event;
>+
>+	for (;;) {
>+		event = pmu_get_event(pmu);
>+		if (!event)
>+			break;
>+
>+		pmu_detach_event(pmu, event);
>+		put_event(event);
>+	}
>+
>+	/*
>+	 * wait for pending _free_event()s
>+	 */
>+	wait_var_event(pmu, pmu_empty(pmu));


ok... so now we are synchronosly moving the events to a revoked state
during unregister, so we wouldn't need the refcount on the driver side
anymore and can just free the objects right after return.

I will give this a try with i915 and/or xe.

thanks
Lucas De Marchi


>+}
>+
>+int perf_pmu_unregister(struct pmu *pmu)
>+{
>+	/*
>+	 * FIXME!
>+	 *
>+	 *   perf_mmap_close(); event->pmu->event_unmapped()
>+	 *
>+	 * XXX this check is racy vs perf_event_alloc()
>+	 */
>+	if (pmu->event_unmapped && !pmu_empty(pmu))
>+		return -EBUSY;
>+
> 	mutex_lock(&pmus_lock);
> 	list_del_rcu(&pmu->entry);
>+	idr_remove(&pmu_idr, pmu->type);
>+	mutex_unlock(&pmus_lock);
>
> 	/*
> 	 * We dereference the pmu list under both SRCU and regular RCU, so
>@@ -11844,16 +11999,29 @@ void perf_pmu_unregister(struct pmu *pmu)
> 	synchronize_srcu(&pmus_srcu);
> 	synchronize_rcu();
>
>-	free_percpu(pmu->pmu_disable_count);
>-	idr_remove(&pmu_idr, pmu->type);
>+	/*
>+	 * PMU is removed from the pmus list, so no new events will
>+	 * be created, now take care of the existing ones.
>+	 */
>+	pmu_detach_events(pmu);
>+
>+	/*
>+	 * PMU is unused, make it go away.
>+	 */
> 	if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
> 		if (pmu->nr_addr_filters)
> 			device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
> 		device_del(pmu->dev);
> 		put_device(pmu->dev);
> 	}
>+
>+	/*
>+	 * XXX -- broken? can still contain SW events at this point?
>+	 * audit more, make sure DETACH_GROUP DTRT
>+	 */
> 	free_pmu_context(pmu);
>-	mutex_unlock(&pmus_lock);
>+
>+	return 0;
> }
> EXPORT_SYMBOL_GPL(perf_pmu_unregister);
>
>@@ -12330,6 +12498,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
> 	/* symmetric to unaccount_event() in _free_event() */
> 	account_event(event);
>
>+	scoped_guard(spinlock, &pmu->events_lock)
>+		list_add(&event->pmu_list, &pmu->events);
>+
> 	return event;
>
> err_callchain_buffer:
>@@ -12493,7 +12664,7 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
> }
>
> static int
>-perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
>+perf_event_set_output(struct perf_event *event, struct perf_event *output_event, bool force)
> {
> 	struct perf_buffer *rb = NULL;
> 	int ret = -EINVAL;
>@@ -12549,8 +12720,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
> 	mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
> set:
> 	/* Can't redirect output if we've got an active mmap() */
>-	if (atomic_read(&event->mmap_count))
>-		goto unlock;
>+	if (atomic_read(&event->mmap_count)) {
>+		if (!force)
>+			goto unlock;
>+
>+		WARN_ON_ONCE(event->pmu->event_unmapped);
>+	}
>
> 	if (output_event) {
> 		/* get the rb we want to redirect to */
>@@ -12740,6 +12915,10 @@ SYSCALL_DEFINE5(perf_event_open,
> 		if (err)
> 			goto err_fd;
> 		group_leader = fd_file(group)->private_data;
>+		if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
>+			err = -ENODEV;
>+			goto err_group_fd;
>+		}
> 		if (flags & PERF_FLAG_FD_OUTPUT)
> 			output_event = group_leader;
> 		if (flags & PERF_FLAG_FD_NO_GROUP)
>@@ -12916,7 +13095,7 @@ SYSCALL_DEFINE5(perf_event_open,
> 	event->pmu_ctx = pmu_ctx;
>
> 	if (output_event) {
>-		err = perf_event_set_output(event, output_event);
>+		err = perf_event_set_output(event, output_event, false);
> 		if (err)
> 			goto err_context;
> 	}
>@@ -13287,10 +13466,11 @@ static void sync_child_event(struct perf_event *child_event)
> }
>
> static void
>-perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
>+perf_event_exit_event(struct perf_event *event,
>+		      struct perf_event_context *ctx, bool revoke)
> {
> 	struct perf_event *parent_event = event->parent;
>-	unsigned long detach_flags = 0;
>+	unsigned long detach_flags = DETACH_EXIT;
>
> 	if (parent_event) {
> 		/*
>@@ -13305,16 +13485,14 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
> 		 * Do destroy all inherited groups, we don't care about those
> 		 * and being thorough is better.
> 		 */
>-		detach_flags = DETACH_GROUP | DETACH_CHILD;
>+		detach_flags |= DETACH_GROUP | DETACH_CHILD;
> 		mutex_lock(&parent_event->child_mutex);
> 	}
>
>-	perf_remove_from_context(event, detach_flags);
>+	if (revoke)
>+		detach_flags |= DETACH_GROUP | DETACH_REVOKE;
>
>-	raw_spin_lock_irq(&ctx->lock);
>-	if (event->state > PERF_EVENT_STATE_EXIT)
>-		perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
>-	raw_spin_unlock_irq(&ctx->lock);
>+	perf_remove_from_context(event, detach_flags);
>
> 	/*
> 	 * Child events can be freed.
>@@ -13390,7 +13568,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
> 	perf_event_task(child, child_ctx, 0);
>
> 	list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
>-		perf_event_exit_event(child_event, child_ctx);
>+		perf_event_exit_event(child_event, child_ctx, false);
>
> 	mutex_unlock(&child_ctx->mutex);
>


More information about the dri-devel mailing list