[Intel-gfx] [PATCH 5/6] drm/i915/pmu: Prepare for multi-tile non-engine counters

Fri May 12 22:37:48 UTC 2023

On Fri, May 12, 2023 at 01:57:59PM -0700, Umesh Nerlige Ramappa wrote:
>On Fri, May 12, 2023 at 11:56:18AM +0100, Tvrtko Ursulin wrote:
>>
>>On 12/05/2023 02:08, Dixit, Ashutosh wrote:
>>>On Fri, 05 May 2023 17:58:15 -0700, Umesh Nerlige Ramappa wrote:
>>>>
>>>>From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>>>
>>>>Reserve some bits in the counter config namespace which will carry the
>>>>tile id and prepare the code to handle this.
>>>>
>>>>No per tile counters have been added yet.
>>>>
>>>>v2:
>>>>- Fix checkpatch issues
>>>>- Use 4 bits for gt id in non-engine counters. Drop FIXME.
>>>>- Set MAX GTs to 4. Drop FIXME.
>>>>
>>>>Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>>>>---
>>>> drivers/gpu/drm/i915/i915_pmu.c | 150 +++++++++++++++++++++++---------
>>>> drivers/gpu/drm/i915/i915_pmu.h |   9 +-
>>>> include/uapi/drm/i915_drm.h     |  17 +++-
>>>> 3 files changed, 129 insertions(+), 47 deletions(-)
>>>>
>>>>diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
>>>>index 669a42e44082..12b2f3169abf 100644
>>>>--- a/drivers/gpu/drm/i915/i915_pmu.c
>>>>+++ b/drivers/gpu/drm/i915/i915_pmu.c
>>>>@@ -56,11 +56,21 @@ static bool is_engine_config(u64 config)
>>>>	return config < __I915_PMU_OTHER(0);
>>>> }
>>>>
>>>>+static unsigned int config_gt_id(const u64 config)
>>>>+{
>>>>+	return config >> __I915_PMU_GT_SHIFT;
>>>>+}
>>>>+
>>>>+static u64 config_counter(const u64 config)
>>>>+{
>>>>+	return config & ~(~0ULL << __I915_PMU_GT_SHIFT);
>>>
>>>ok, but another possibility:
>>>
>>>	return config & ~REG_GENMASK64(63, __I915_PMU_GT_SHIFT);
>>
>>It's not a register so no. :) GENMASK_ULL maybe but meh.
>
>leaving as is.
>
>>
>>>>+}
>>>>+
>>>> static unsigned int other_bit(const u64 config)
>>>> {
>>>>	unsigned int val;
>>>>
>>>>-	switch (config) {
>>>>+	switch (config_counter(config)) {
>>>>	case I915_PMU_ACTUAL_FREQUENCY:
>>>>		val =  __I915_PMU_ACTUAL_FREQUENCY_ENABLED;
>>>>		break;
>>>>@@ -78,15 +88,20 @@ static unsigned int other_bit(const u64 config)
>>>>		return -1;
>>>>	}
>>>>
>>>>-	return I915_ENGINE_SAMPLE_COUNT + val;
>>>>+	return I915_ENGINE_SAMPLE_COUNT +
>>>>+	       config_gt_id(config) * __I915_PMU_TRACKED_EVENT_COUNT +
>>>>+	       val;
>>>> }
>>>>
>>>> static unsigned int config_bit(const u64 config)
>>>> {
>>>>-	if (is_engine_config(config))
>>>>+	if (is_engine_config(config)) {
>>>>+		GEM_BUG_ON(config_gt_id(config));
>>>
>>>This GEM_BUG_ON is not needed since:
>>>
>>>	static bool is_engine_config(u64 config)
>>>	{
>>>	        return config < __I915_PMU_OTHER(0);
>>>	}
>>
>>True!
>
>dropping BUG_ON
>
>>
>>>>+
>>>>		return engine_config_sample(config);
>>>>-	else
>>>>+	} else {
>>>>		return other_bit(config);
>>>>+	}
>>>> }
>>>>
>>>> static u64 config_mask(u64 config)
>>>>@@ -104,6 +119,18 @@ static unsigned int event_bit(struct perf_event *event)
>>>>	return config_bit(event->attr.config);
>>>> }
>>>>
>>>>+static u64 frequency_enabled_mask(void)
>>>>+{
>>>>+	unsigned int i;
>>>>+	u64 mask = 0;
>>>>+
>>>>+	for (i = 0; i < I915_PMU_MAX_GTS; i++)
>>>>+		mask |= config_mask(__I915_PMU_ACTUAL_FREQUENCY(i)) |
>>>>+			config_mask(__I915_PMU_REQUESTED_FREQUENCY(i));
>>>>+
>>>>+	return mask;
>>>>+}
>>>>+
>>>> static bool pmu_needs_timer(struct i915_pmu *pmu, bool gpu_active)
>>>> {
>>>>	struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
>>>>@@ -120,9 +147,7 @@ static bool pmu_needs_timer(struct i915_pmu *pmu, bool gpu_active)
>>>>	 * Mask out all the ones which do not need the timer, or in
>>>>	 * other words keep all the ones that could need the timer.
>>>>	 */
>>>>-	enable &= config_mask(I915_PMU_ACTUAL_FREQUENCY) |
>>>>-		  config_mask(I915_PMU_REQUESTED_FREQUENCY) |
>>>>-		  ENGINE_SAMPLE_MASK;
>>>>+	enable &= frequency_enabled_mask() | ENGINE_SAMPLE_MASK;
>>>
>>>u32 enable & u64 frequency_enabled_mask
>>>
>>>ugly but ok I guess? Or change enable to u64?
>
>making pmu->enable u64 as well as other places where it is assigned to 
>local variables.
>
>>
>>Hmm.. yes very ugly. Could have been an accident which happened to 
>>work because there is a single timer (not per tile).
>
>Happened to work because the frequency mask does not spill over to the 
>upper 32 bits (even for multi tile).
>
>--------------------- START_SECTION ----------------
>>
>>Similar issue in frequency_sampling_enabled too. Gt_id argument to 
>>it seems pointless.
>
>Not sure why it's pointless. We need the gt_id to determine the right 
>mask for that specific gt. If it's not enabled, then we just return 
>without pm_get and async put (like you mention later).
>
>And this piece of code is called within for_each_gt.
>
>>
>>So I now think whole frequency_enabled_mask() is just pointless and 
>>should be removed. And then pmu_needs_time code can stay as is. 
>>Possibly add a config_mask_32 helper which ensures no bits in upper 
>>32 bits are returned.
>>
>>That is if we are happy for the frequency_sampling_enabled returning 
>>true for all gts, regardless of which ones actually have frequency 
>>sampling enabled.
>>
>>Or if we want to implement it as I probably have intended, we will 
>>need to add some gt bits into pmu->enable. Maybe reserve top four 
>>same as with config counters.
>
>Nope. What you have here works just fine. pmu->enable should not 
>include any gt id info. gt_id[63:60] is only a concept for pmu config 
>sent by user.  config_mask and pmu->enable are i915 internal 
>bookkeeping (bit masks) just to track what events need to be sampled.  
>The 'other' bit masks are a function of gt_id because we use gt_id to 
>calculate a contiguous numerical value for these 'other' events. 
>That's all. Once the numerical value is calculated, there is no need 
>for gt_id because config_mask is BIT_ULL(numerical_value). Since the 
>numerical values never exceeded 31 (even for multi-gts), everything 
>worked even with 32 bit pmu->enable.
>
>>
>>In this case the config_mask needs to be updated to translate not 
>>just the config counter into the pmu tracked event bits, but config 
>>counter gt id into the pmu->enabled gt id.
>>
>>Sounds easily doable on a first thought.
>
>------------------------ END_SECTION ----------------
>
>
>>>>
>>>>	/*
>>>>	 * When the GPU is idle per-engine counters do not need to be
>>>>@@ -164,9 +189,37 @@ static inline s64 ktime_since_raw(const ktime_t kt)
>>>>	return ktime_to_ns(ktime_sub(ktime_get_raw(), kt));
>>>> }
>>>>
>>>>+static unsigned int
>>>>+__sample_idx(struct i915_pmu *pmu, unsigned int gt_id, int sample)
>>>>+{
>>>>+	unsigned int idx = gt_id * __I915_NUM_PMU_SAMPLERS + sample;
>>>>+
>>>>+	GEM_BUG_ON(idx >= ARRAY_SIZE(pmu->sample));
>>>
>>>Does this GEM_BUG_ON need to be split up as follows:
>>>
>>>	GEM_BUG_ON(gt_id >= I915_PMU_MAX_GTS);
>>>	GEM_BUG_ON(sample >= __I915_NUM_PMU_SAMPLERS);
>>>
>>>Since that is what we really mean here isn't it?
>>
>>ARRAY_SIZE check seems the safest option to me, given it is defined as:
>>
>>sample[I915_PMU_MAX_GTS * __I915_NUM_PMU_SAMPLERS];
>>
>>What problem do you see here?
>>
>>>>+
>>>>+	return idx;
>>>>+}
>>>>+
>>>>+static u64 read_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample)
>>>>+{
>>>>+	return pmu->sample[__sample_idx(pmu, gt_id, sample)].cur;
>>>>+}
>>>>+
>>>>+static void
>>>>+store_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample, u64 val)
>>>>+{
>>>>+	pmu->sample[__sample_idx(pmu, gt_id, sample)].cur = val;
>>>>+}
>>>>+
>>>>+static void
>>>>+add_sample_mult(struct i915_pmu *pmu, unsigned int gt_id, int sample, u32 val, u32 mul)
>>>>+{
>>>>+	pmu->sample[__sample_idx(pmu, gt_id, sample)].cur += mul_u32_u32(val, mul);
>>>>+}
>>>
>>>Gripe: I think this code should have per event data structures which store
>>>all information about a particular event. Rather than storing it in these
>>>arrays common to all events (and in bit-fields common to all events) which
>>>results in the kind of dance we have to do here. Anyway too big a change to
>>>make now but something to consider if we ever do this for xe.
>>
>>Could do a two dimensional array like:
>>
>>sample[I915_PMU_MAX_GTS][__I915_NUM_PMU_SAMPLERS];
>>
>>Any better? Honestly I don't remember if there was a special reason 
>>I went for a flat array back then.
>
>Maybe we improve it in XE. I am looking for the shortest path to get 
>this merged without any functional issues.
>
>>
>>>
>>>>+
>>>> static u64 get_rc6(struct intel_gt *gt)
>>>> {
>>>>	struct drm_i915_private *i915 = gt->i915;
>>>>+	const unsigned int gt_id = gt->info.id;
>>>>	struct i915_pmu *pmu = &i915->pmu;
>>>>	unsigned long flags;
>>>>	bool awake = false;
>>>>@@ -181,7 +234,7 @@ static u64 get_rc6(struct intel_gt *gt)
>>>>	spin_lock_irqsave(&pmu->lock, flags);
>>>>
>>>>	if (awake) {
>>>>-		pmu->sample[__I915_SAMPLE_RC6].cur = val;
>>>>+		store_sample(pmu, gt_id, __I915_SAMPLE_RC6, val);
>>>>	} else {
>>>>		/*
>>>>		 * We think we are runtime suspended.
>>>>@@ -190,14 +243,14 @@ static u64 get_rc6(struct intel_gt *gt)
>>>>		 * on top of the last known real value, as the approximated RC6
>>>>		 * counter value.
>>>>		 */
>>>>-		val = ktime_since_raw(pmu->sleep_last);
>>>>-		val += pmu->sample[__I915_SAMPLE_RC6].cur;
>>>>+		val = ktime_since_raw(pmu->sleep_last[gt_id]);
>>>>+		val += read_sample(pmu, gt_id, __I915_SAMPLE_RC6);
>>>>	}
>>>>
>>>>-	if (val < pmu->sample[__I915_SAMPLE_RC6_LAST_REPORTED].cur)
>>>>-		val = pmu->sample[__I915_SAMPLE_RC6_LAST_REPORTED].cur;
>>>>+	if (val < read_sample(pmu, gt_id, __I915_SAMPLE_RC6_LAST_REPORTED))
>>>>+		val = read_sample(pmu, gt_id, __I915_SAMPLE_RC6_LAST_REPORTED);
>>>>	else
>>>>-		pmu->sample[__I915_SAMPLE_RC6_LAST_REPORTED].cur = val;
>>>>+		store_sample(pmu, gt_id, __I915_SAMPLE_RC6_LAST_REPORTED, val);
>>>>
>>>>	spin_unlock_irqrestore(&pmu->lock, flags);
>>>>
>>>>@@ -207,13 +260,20 @@ static u64 get_rc6(struct intel_gt *gt)
>>>> static void init_rc6(struct i915_pmu *pmu)
>>>> {
>>>>	struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
>>>>-	intel_wakeref_t wakeref;
>>>>+	struct intel_gt *gt;
>>>>+	unsigned int i;
>>>>+
>>>>+	for_each_gt(gt, i915, i) {
>>>>+		intel_wakeref_t wakeref;
>>>>
>>>>-	with_intel_runtime_pm(to_gt(i915)->uncore->rpm, wakeref) {
>>>>-		pmu->sample[__I915_SAMPLE_RC6].cur = __get_rc6(to_gt(i915));
>>>>-		pmu->sample[__I915_SAMPLE_RC6_LAST_REPORTED].cur =
>>>>-					pmu->sample[__I915_SAMPLE_RC6].cur;
>>>>-		pmu->sleep_last = ktime_get_raw();
>>>>+		with_intel_runtime_pm(gt->uncore->rpm, wakeref) {
>>>>+			u64 val = __get_rc6(gt);
>>>>+
>>>>+			store_sample(pmu, i, __I915_SAMPLE_RC6, val);
>>>>+			store_sample(pmu, i, __I915_SAMPLE_RC6_LAST_REPORTED,
>>>>+				     val);
>>>>+			pmu->sleep_last[i] = ktime_get_raw();
>>>>+		}
>>>>	}
>>>> }
>>>>
>>>>@@ -221,8 +281,8 @@ static void park_rc6(struct intel_gt *gt)
>>>> {
>>>>	struct i915_pmu *pmu = &gt->i915->pmu;
>>>>
>>>>-	pmu->sample[__I915_SAMPLE_RC6].cur = __get_rc6(gt);
>>>>-	pmu->sleep_last = ktime_get_raw();
>>>>+	store_sample(pmu, gt->info.id, __I915_SAMPLE_RC6, __get_rc6(gt));
>>>>+	pmu->sleep_last[gt->info.id] = ktime_get_raw();
>>>> }
>>>>
>>>> static void __i915_pmu_maybe_start_timer(struct i915_pmu *pmu)
>>>>@@ -362,34 +422,30 @@ engines_sample(struct intel_gt *gt, unsigned int period_ns)
>>>>	}
>>>> }
>>>>
>>>>-static void
>>>>-add_sample_mult(struct i915_pmu_sample *sample, u32 val, u32 mul)
>>>>-{
>>>>-	sample->cur += mul_u32_u32(val, mul);
>>>>-}
>>>>-
>>>>-static bool frequency_sampling_enabled(struct i915_pmu *pmu)
>>>>+static bool
>>>>+frequency_sampling_enabled(struct i915_pmu *pmu, unsigned int gt)
>>>> {
>>>>	return pmu->enable &
>>>>-	       (config_mask(I915_PMU_ACTUAL_FREQUENCY) |
>>>>-		config_mask(I915_PMU_REQUESTED_FREQUENCY));
>>>>+	       (config_mask(__I915_PMU_ACTUAL_FREQUENCY(gt)) |
>>>>+		config_mask(__I915_PMU_REQUESTED_FREQUENCY(gt)));
>>>
>>>Here again:
>>>
>>>	u32 pmu->enable & u64 config_mask
>>>
>>>Probably ok?
>>>
>>>And also in i915_pmu_enable() we have:
>>>
>>>	pmu->enable |= BIT_ULL(bit);
>>>
>>>So change pmu->enable to u64?
>
>Right, changing to u64
>
>>>
>>>> }
>>>>
>>>> static void
>>>> frequency_sample(struct intel_gt *gt, unsigned int period_ns)
>>>> {
>>>>	struct drm_i915_private *i915 = gt->i915;
>>>>+	const unsigned int gt_id = gt->info.id;
>>>>	struct i915_pmu *pmu = &i915->pmu;
>>>>	struct intel_rps *rps = &gt->rps;
>>>>
>>>>-	if (!frequency_sampling_enabled(pmu))
>>>>+	if (!frequency_sampling_enabled(pmu, gt_id))
>>>>		return;
>>>
>>>Pre-existing issue, but why do we need this check? This is already checked
>>>in the two individual checks for actual and requested freq below:
>>>
>>>	if (pmu->enable & config_mask(__I915_PMU_ACTUAL_FREQUENCY(gt_id)))
>>>
>>>	and
>>>
>>>	if (pmu->enable & config_mask(__I915_PMU_REQUESTED_FREQUENCY(gt_id)))
>>>
>>>So can we delete frequency_sampling_enabled()? Or is it there to avoid the
>>>overhead of intel_gt_pm_get_if_awake() (which doesn't seem to be much)?
>>
>>I think it was to avoid even getting an already active pm ref if 
>>frequency events are not enabled. Timer could be running for 
>>instance if only engine wait/sema is enabled. So yeah, just a little 
>>bit cheaper than pm get + async put and avoid prolonging the delayed 
>>put for no reason. (As the timer races with regular GT pm activities 
>>(see mod_delayed_work in __intel_wakeref_put_last).)
>
>leaving as is.
>
>>
>>>
>>>>
>>>>	/* Report 0/0 (actual/requested) frequency while parked. */
>>>>	if (!intel_gt_pm_get_if_awake(gt))
>>>>		return;
>>>>
>>>>-	if (pmu->enable & config_mask(I915_PMU_ACTUAL_FREQUENCY)) {
>>>>+	if (pmu->enable & config_mask(__I915_PMU_ACTUAL_FREQUENCY(gt_id))) {
>>>>		u32 val;
>>>>
>>>>		/*
>>>>@@ -405,12 +461,12 @@ frequency_sample(struct intel_gt *gt, unsigned int period_ns)
>>>>		if (!val)
>>>>			val = intel_gpu_freq(rps, rps->cur_freq);
>>>>
>>>>-		add_sample_mult(&pmu->sample[__I915_SAMPLE_FREQ_ACT],
>>>>+		add_sample_mult(pmu, gt_id, __I915_SAMPLE_FREQ_ACT,
>>>>				val, period_ns / 1000);
>>>>	}
>>>>
>>>>-	if (pmu->enable & config_mask(I915_PMU_REQUESTED_FREQUENCY)) {
>>>>-		add_sample_mult(&pmu->sample[__I915_SAMPLE_FREQ_REQ],
>>>>+	if (pmu->enable & config_mask(__I915_PMU_REQUESTED_FREQUENCY(gt_id))) {
>>>>+		add_sample_mult(pmu, gt_id, __I915_SAMPLE_FREQ_REQ,
>>>>				intel_rps_get_requested_frequency(rps),
>>>>				period_ns / 1000);
>>>>	}
>>>>@@ -447,9 +503,7 @@ static enum hrtimer_restart i915_sample(struct hrtimer *hrtimer)
>>>>			continue;
>>>>
>>>>		engines_sample(gt, period_ns);
>>>>-
>>>>-		if (i == 0) /* FIXME */
>>>>-			frequency_sample(gt, period_ns);
>>>>+		frequency_sample(gt, period_ns);
>>>>	}
>>>>
>>>>	hrtimer_forward(hrtimer, now, ns_to_ktime(PERIOD));
>>>>@@ -491,7 +545,12 @@ config_status(struct drm_i915_private *i915, u64 config)
>>>> {
>>>>	struct intel_gt *gt = to_gt(i915);
>>>>
>>>>-	switch (config) {
>>>>+	unsigned int gt_id = config_gt_id(config);
>>>>+
>>>>+	if (gt_id)
>>>>+		return -ENOENT;
>>>
>>>This is just wrong. It is fixed in the next patch:
>>>
>>>	if (gt_id > max_gt_id)
>>>		return -ENOENT;
>>>
>>>But probably should be fixed in this patch itself. Or dropped from this
>>>patch and let it come in in Patch 6, since it's confusing. Though it
>>>probably belongs in this patch.
>>
>>Hmm my thinking was probably to reject gt > 0 in this patch since 
>>only the last patch was supposed to be exposing the other tiles. 
>>Granted that is not entirely true since this patch already makes 
>>access to them available via i915_drm.h. Last patch only makes then 
>>discoverable via sysfs.
>>
>>In this case yes, I'd pull in "gt_id > max_gt_id" into this patch. 
>>And this hunk from the next patch too:
>>
>>	case I915_PMU_INTERRUPTS:
>>+		if (gt_id)
>>+			return -ENOENT;
>>
>
>pulling in the above snippets from patch 6 to patch 5
>
>>>
>>>>+
>>>>+	switch (config_counter(config)) {
>>>>	case I915_PMU_ACTUAL_FREQUENCY:
>>>>		if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915))
>>>>			/* Requires a mutex for sampling! */
>>>>@@ -599,22 +658,27 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
>>>>			val = engine->pmu.sample[sample].cur;
>>>>		}
>>>>	} else {
>>>>-		switch (event->attr.config) {
>>>>+		const unsigned int gt_id = config_gt_id(event->attr.config);
>>>>+		const u64 config = config_counter(event->attr.config);
>>>>+
>>>>+		switch (config) {
>>>>		case I915_PMU_ACTUAL_FREQUENCY:
>>>>			val =
>>>>-			   div_u64(pmu->sample[__I915_SAMPLE_FREQ_ACT].cur,
>>>>+			   div_u64(read_sample(pmu, gt_id,
>>>>+					       __I915_SAMPLE_FREQ_ACT),
>>>>				   USEC_PER_SEC /* to MHz */);
>>>>			break;
>>>>		case I915_PMU_REQUESTED_FREQUENCY:
>>>>			val =
>>>>-			   div_u64(pmu->sample[__I915_SAMPLE_FREQ_REQ].cur,
>>>>+			   div_u64(read_sample(pmu, gt_id,
>>>>+					       __I915_SAMPLE_FREQ_REQ),
>>>>				   USEC_PER_SEC /* to MHz */);
>>>>			break;
>>>>		case I915_PMU_INTERRUPTS:
>>>>			val = READ_ONCE(pmu->irq_count);
>>>>			break;
>>>>		case I915_PMU_RC6_RESIDENCY:
>>>>-			val = get_rc6(to_gt(i915));
>>>>+			val = get_rc6(i915->gt[gt_id]);
>>>>			break;
>>>>		case I915_PMU_SOFTWARE_GT_AWAKE_TIME:
>>>>			val = ktime_to_ns(intel_gt_get_awake_time(to_gt(i915)));
>>>>diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
>>>>index 3a811266ac6a..d47846f21ddf 100644
>>>>--- a/drivers/gpu/drm/i915/i915_pmu.h
>>>>+++ b/drivers/gpu/drm/i915/i915_pmu.h
>>>>@@ -38,13 +38,16 @@ enum {
>>>>	__I915_NUM_PMU_SAMPLERS
>>>> };
>>>>
>>>>+#define I915_PMU_MAX_GTS (4)
>>>
>>>4 or (4)? :-)
>>
>>Bike shed was strong with you on the day of review I see. :)
>>
>>I would rather get rid of this define altogether if we could use the 
>>"normal" MAX_GT define. As I was saying earlier, I think this one 
>>was here just because header dependencies were too convulted back 
>>then. Maybe today things are better? Worth I try probably.
>
>dropping this and using I915_MAX_GTS

Okay, I see what you mentioned here about the header dependencies. It's 
still the same. i915_drv.h includes intel_engine.h which includes 
i915_pmu.h. I cannot use i915_drv.h to use I915_MAX_GTS, it's causing 
all sorts of compile errors (likely cyclic). I don't see a quick way to 
resolve that, so I am going to leave it as I915_PMU_MAX_GTS.

Thanks,
Umesh

>
>>
>>>
>>>>+
>>>> /*
>>>>  * How many different events we track in the global PMU mask.
>>>>  *
>>>>  * It is also used to know to needed number of event reference counters.
>>>>  */
>>>> #define I915_PMU_MASK_BITS \
>>>>-	(I915_ENGINE_SAMPLE_COUNT + __I915_PMU_TRACKED_EVENT_COUNT)
>>>>+	(I915_ENGINE_SAMPLE_COUNT + \
>>>>+	 I915_PMU_MAX_GTS * __I915_PMU_TRACKED_EVENT_COUNT)
>>>>
>>>> #define I915_ENGINE_SAMPLE_COUNT (I915_SAMPLE_SEMA + 1)
>>>>
>>>>@@ -124,11 +127,11 @@ struct i915_pmu {
>>>>	 * Only global counters are held here, while the per-engine ones are in
>>>>	 * struct intel_engine_cs.
>>>>	 */
>>>>-	struct i915_pmu_sample sample[__I915_NUM_PMU_SAMPLERS];
>>>>+	struct i915_pmu_sample sample[I915_PMU_MAX_GTS * __I915_NUM_PMU_SAMPLERS];
>>>>	/**
>>>>	 * @sleep_last: Last time GT parked for RC6 estimation.
>>>>	 */
>>>>-	ktime_t sleep_last;
>>>>+	ktime_t sleep_last[I915_PMU_MAX_GTS];
>>>>	/**
>>>>	 * @irq_count: Number of interrupts
>>>>	 *
>>>>diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>>>>index dba7c5a5b25e..d5ac1fdeb2b1 100644
>>>>--- a/include/uapi/drm/i915_drm.h
>>>>+++ b/include/uapi/drm/i915_drm.h
>>>>@@ -280,7 +280,16 @@ enum drm_i915_pmu_engine_sample {
>>>> #define I915_PMU_ENGINE_SEMA(class, instance) \
>>>>	__I915_PMU_ENGINE(class, instance, I915_SAMPLE_SEMA)
>>>>
>>>>-#define __I915_PMU_OTHER(x) (__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x))
>>>>+/*
>>>>+ * Top 4 bits of every non-engine counter are GT id.
>>>>+ */
>>>>+#define __I915_PMU_GT_SHIFT (60)
>>>
>>>REG_GENMASK64 or GENMASK_ULL would be nicer but of course we can't put in
>>>the uapi header, so ok.
>>
>>Yep.
>
>leaving as is.
>
>>
>>>>+
>>>>+#define ___I915_PMU_OTHER(gt, x) \
>>>>+	(((__u64)__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x)) | \
>>>>+	((__u64)(gt) << __I915_PMU_GT_SHIFT))
>>>>+
>>>>+#define __I915_PMU_OTHER(x) ___I915_PMU_OTHER(0, x)
>>>>
>>>> #define I915_PMU_ACTUAL_FREQUENCY	__I915_PMU_OTHER(0)
>>>> #define I915_PMU_REQUESTED_FREQUENCY	__I915_PMU_OTHER(1)
>>>>@@ -290,6 +299,12 @@ enum drm_i915_pmu_engine_sample {
>>>>
>>>> #define I915_PMU_LAST /* Deprecated - do not use */ I915_PMU_RC6_RESIDENCY
>>>>
>>>>+#define __I915_PMU_ACTUAL_FREQUENCY(gt)		___I915_PMU_OTHER(gt, 0)
>>>>+#define __I915_PMU_REQUESTED_FREQUENCY(gt)	___I915_PMU_OTHER(gt, 1)
>>>>+#define __I915_PMU_INTERRUPTS(gt)		___I915_PMU_OTHER(gt, 2)
>>>>+#define __I915_PMU_RC6_RESIDENCY(gt)		___I915_PMU_OTHER(gt, 3)
>>>>+#define __I915_PMU_SOFTWARE_GT_AWAKE_TIME(gt)	___I915_PMU_OTHER(gt, 4)
>>>>+
>>>> /* Each region is a minimum of 16k, and there are at most 255 of them.
>>>>  */
>>>> #define I915_NR_TEX_REGIONS 255	/* table size 2k - maximum due to use
>>>>--
>>>>2.36.1
>>>>
>>>
>>>Above comments are mostly nits so after addressing the above comments, this
>>>is:
>>>
>>>Reviewed-by: Ashutosh Dixit <ashutosh.dixit at intel.com>
>>
>>Well you found some ugly bits (or I got confused, double check me 
>>please) so I'd say hold off with r-b just yet. Sadly it's on Umesh 
>>now to fix up my mess. :I
>
>I don't see anything wrong with the SECTION I marked above. As in, the 
>pmu_needs_timer and the sampling code for events that need to be 
>sampled. If you agree, I can spin the next revision.
>
>Thanks,
>Umesh
>
>>
>>Regards,
>>
>>Tvrtko