[Intel-xe] [PATCH v6 3/3] drm/xe/pmu: Enable PMU interface

Dixit, Ashutosh ashutosh.dixit at intel.com
Sat Sep 2 05:00:25 UTC 2023


On Fri, 01 Sep 2023 00:06:48 -0700, Aravind Iddamsetty wrote:
>

Hi Aravind,

> +static u64 __engine_group_busyness_read(struct xe_gt *gt, int sample_type)
> +{
> +	u64 val;
> +
> +	switch (sample_type) {
> +	case __XE_SAMPLE_RENDER_GROUP_BUSY:
> +		val = xe_mmio_read32(gt, XE_OAG_RENDER_BUSY_FREE);
> +		break;
> +	case __XE_SAMPLE_COPY_GROUP_BUSY:
> +		val = xe_mmio_read32(gt, XE_OAG_BLT_BUSY_FREE);
> +		break;
> +	case __XE_SAMPLE_MEDIA_GROUP_BUSY:
> +		/*
> +		 * This GSI offset is not replicated in media specific range
> +		 * BSPEC: 67789, 46559
> +		 */
> +		if (gt->info.type == XE_GT_TYPE_MEDIA)
> +			val = xe_mmio_read32(gt->tile->primary_gt, XE_OAG_ANY_MEDIA_FF_BUSY_FREE);

Hmm, this appeared in this version. Is this really needed? Did you test
this on MTL media gt?

E.g. if this is needed we'd need to do similar stuff in freq_act_show()
when reading MTL_MIRROR_TARGET_WP1, but we don't do this there. So we don't
see this done anywhere else in XE.

I don't understand this multicast steering but appears XE mmio read/write
functions might already be taking care of this via:

	if (reg.addr < gt->mmio.adj_limit)

So could you please check this out and take out this code if not needed.

> +		else
> +			val = xe_mmio_read32(gt, XE_OAG_ANY_MEDIA_FF_BUSY_FREE);
> +		break;
> +	case __XE_SAMPLE_ANY_ENGINE_GROUP_BUSY:
> +		val = xe_mmio_read32(gt, XE_OAG_RC0_ANY_ENGINE_BUSY_FREE);
> +		break;
> +	default:
> +		drm_warn(&gt->tile->xe->drm, "unknown pmu event\n");
> +	}
> +
> +	return xe_gt_clock_cycles_to_ns(gt, val * 16);
> +}
> +
> +static u64 engine_group_busyness_read(struct xe_gt *gt, u64 config)
> +{
> +	int sample_type = config_counter(config) - 1;
> +	const unsigned int gt_id = gt->info.id;
> +	struct xe_device *xe = gt->tile->xe;
> +	struct xe_pmu *pmu = &xe->pmu;
> +	unsigned long flags;
> +	bool device_awake;
> +	u64 val;
> +
> +	device_awake = xe_device_mem_access_get_if_ongoing(xe);
> +	if (device_awake) {
> +		XE_WARN_ON(xe_force_wake_get(gt_to_fw(gt), XE_FW_GT));
> +		val = __engine_group_busyness_read(gt, sample_type);
> +		XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FW_GT));
> +		xe_device_mem_access_put(xe);
> +	}
> +
> +	spin_lock_irqsave(&pmu->lock, flags);
> +
> +	if (device_awake)
> +		pmu->sample[gt_id][sample_type] = val;
> +	else
> +		val = pmu->sample[gt_id][sample_type];
> +
> +	spin_unlock_irqrestore(&pmu->lock, flags);
> +
> +	return val;
> +}
> +
> +static void engine_group_busyness_store(struct xe_gt *gt)
> +{
> +	struct xe_pmu *pmu = &gt->tile->xe->pmu;
> +	unsigned int gt_id = gt->info.id;
> +	unsigned long flags;
> +	int i;
> +
> +	spin_lock_irqsave(&pmu->lock, flags);
> +
> +	for (i = __XE_SAMPLE_RENDER_GROUP_BUSY; i <= __XE_SAMPLE_ANY_ENGINE_GROUP_BUSY; i++)
> +		pmu->sample[gt_id][i] = __engine_group_busyness_read(gt, i);

Just a reminder about what we discussed about this earlier. Here we are
saving all these samples unconditionally, whether or not PMU will ever be
used. We have deferred the optimization to only save enabled counters to a
later patch. So we might still come back and do that.

> +static void xe_pmu_event_read(struct perf_event *event)
> +{
> +	struct xe_device *xe =
> +		container_of(event->pmu, typeof(*xe), pmu.base);
> +	struct hw_perf_event *hwc = &event->hw;
> +	struct xe_pmu *pmu = &xe->pmu;
> +	u64 prev, new;
> +
> +	if (pmu->closed) {
> +		event->hw.state = PERF_HES_STOPPED;
> +		return;
> +	}
> +again:
> +	prev = local64_read(&hwc->prev_count);
> +	new = __xe_pmu_event_read(event);
> +
> +	if (local64_cmpxchg(&hwc->prev_count, prev, new) != prev)
> +		goto again;
> +
> +	local64_add(new - prev, &event->count);

Just curious, in case you know, why do we do this here rather than just a:

	local64_set(&event->count, new);

The code is the same as i915 so is likely correct, so no issue, if you
don't know just ignore this comment. No changes needed here.

> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> index 86f16d50e9cc..8a59350b3cea 100644
> --- a/include/uapi/drm/xe_drm.h
> +++ b/include/uapi/drm/xe_drm.h
> @@ -1056,6 +1056,45 @@ struct drm_xe_vm_madvise {
>	__u64 reserved[2];
>  };
>
> +/* PMU event config IDs */
> +
> +/*
> + * The below config IDs needs to be filled in the struct perf_even_attr field
> + * as part of perf_event_open call to read a particular event.
> + *
> + * check man perf_event_open.
> + *
> + * example on how to open an event:
> + * .. code-block:: C
> + *	struct perf_event_attr pe;
> + *	long long count;
> + *	int cpu = 0;
> + *	int fd;
> + *
> + *	memset(&pe, 0, sizeof(struct perf_event_attr));
> + *	pe.type = type // eg: /sys/bus/event_source/devices/xe_0000_56_00.0/type
> + *	pe.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED;
> + *	pe.use_clockid = 1;
> + *	pe.clockid = CLOCK_MONOTONIC;
> + *	pe.config = XE_PMU_INTERRUPTS(0);
> + *
> + *	fd = perf_event_open(&pe, -1, cpu, -1, 0);
> + */

Pretty good, let me fix up the formatting, style and some typos. But please
check.

/**
 * XE PMU event config IDs
 *
 * Check 'man perf_event_open' to use these IDs in 'struct
 * perf_event_attr' as part of perf_event_open syscall to read particular
 * event counts.
 *
 * For example, to open the XE_PMU_INTERRUPTS(0) event:
 *
 * .. code-block:: C
 *
 *	struct perf_event_attr attr = {};
 *	long long count;
 *	int cpu = 0;
 *	int fd;
 *
 *	attr.type = type; // /sys/bus/event_source/devices/xe_0000_56_00.0/type
 *	attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED;
 *	attr.use_clockid = 1;
 *	attr.clockid = CLOCK_MONOTONIC;
 *	attr.config = XE_PMU_INTERRUPTS(0);
 *
 *	fd = syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
 */

Apart from the couple of things above to be fixed up, I don't want to hold
onto this review any longer, so this is now:

Reviewed-by: Ashutosh Dixit <ashutosh.dixit at intel.com>


More information about the Intel-xe mailing list