[Intel-gfx] [PATCH 5/6] drm/i915/pmu: Prepare for multi-tile non-engine counters
Umesh Nerlige Ramappa
umesh.nerlige.ramappa at intel.com
Fri May 12 22:37:48 UTC 2023
On Fri, May 12, 2023 at 01:57:59PM -0700, Umesh Nerlige Ramappa wrote:
>On Fri, May 12, 2023 at 11:56:18AM +0100, Tvrtko Ursulin wrote:
>>
>>On 12/05/2023 02:08, Dixit, Ashutosh wrote:
>>>On Fri, 05 May 2023 17:58:15 -0700, Umesh Nerlige Ramappa wrote:
>>>>
>>>>From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>>>
>>>>Reserve some bits in the counter config namespace which will carry the
>>>>tile id and prepare the code to handle this.
>>>>
>>>>No per tile counters have been added yet.
>>>>
>>>>v2:
>>>>- Fix checkpatch issues
>>>>- Use 4 bits for gt id in non-engine counters. Drop FIXME.
>>>>- Set MAX GTs to 4. Drop FIXME.
>>>>
>>>>Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>>>>---
>>>> drivers/gpu/drm/i915/i915_pmu.c | 150 +++++++++++++++++++++++---------
>>>> drivers/gpu/drm/i915/i915_pmu.h | 9 +-
>>>> include/uapi/drm/i915_drm.h | 17 +++-
>>>> 3 files changed, 129 insertions(+), 47 deletions(-)
>>>>
>>>>diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
>>>>index 669a42e44082..12b2f3169abf 100644
>>>>--- a/drivers/gpu/drm/i915/i915_pmu.c
>>>>+++ b/drivers/gpu/drm/i915/i915_pmu.c
>>>>@@ -56,11 +56,21 @@ static bool is_engine_config(u64 config)
>>>> return config < __I915_PMU_OTHER(0);
>>>> }
>>>>
>>>>+static unsigned int config_gt_id(const u64 config)
>>>>+{
>>>>+ return config >> __I915_PMU_GT_SHIFT;
>>>>+}
>>>>+
>>>>+static u64 config_counter(const u64 config)
>>>>+{
>>>>+ return config & ~(~0ULL << __I915_PMU_GT_SHIFT);
>>>
>>>ok, but another possibility:
>>>
>>> return config & ~REG_GENMASK64(63, __I915_PMU_GT_SHIFT);
>>
>>It's not a register so no. :) GENMASK_ULL maybe but meh.
>
>leaving as is.
>
>>
>>>>+}
>>>>+
>>>> static unsigned int other_bit(const u64 config)
>>>> {
>>>> unsigned int val;
>>>>
>>>>- switch (config) {
>>>>+ switch (config_counter(config)) {
>>>> case I915_PMU_ACTUAL_FREQUENCY:
>>>> val = __I915_PMU_ACTUAL_FREQUENCY_ENABLED;
>>>> break;
>>>>@@ -78,15 +88,20 @@ static unsigned int other_bit(const u64 config)
>>>> return -1;
>>>> }
>>>>
>>>>- return I915_ENGINE_SAMPLE_COUNT + val;
>>>>+ return I915_ENGINE_SAMPLE_COUNT +
>>>>+ config_gt_id(config) * __I915_PMU_TRACKED_EVENT_COUNT +
>>>>+ val;
>>>> }
>>>>
>>>> static unsigned int config_bit(const u64 config)
>>>> {
>>>>- if (is_engine_config(config))
>>>>+ if (is_engine_config(config)) {
>>>>+ GEM_BUG_ON(config_gt_id(config));
>>>
>>>This GEM_BUG_ON is not needed since:
>>>
>>> static bool is_engine_config(u64 config)
>>> {
>>> return config < __I915_PMU_OTHER(0);
>>> }
>>
>>True!
>
>dropping BUG_ON
>
>>
>>>>+
>>>> return engine_config_sample(config);
>>>>- else
>>>>+ } else {
>>>> return other_bit(config);
>>>>+ }
>>>> }
>>>>
>>>> static u64 config_mask(u64 config)
>>>>@@ -104,6 +119,18 @@ static unsigned int event_bit(struct perf_event *event)
>>>> return config_bit(event->attr.config);
>>>> }
>>>>
>>>>+static u64 frequency_enabled_mask(void)
>>>>+{
>>>>+ unsigned int i;
>>>>+ u64 mask = 0;
>>>>+
>>>>+ for (i = 0; i < I915_PMU_MAX_GTS; i++)
>>>>+ mask |= config_mask(__I915_PMU_ACTUAL_FREQUENCY(i)) |
>>>>+ config_mask(__I915_PMU_REQUESTED_FREQUENCY(i));
>>>>+
>>>>+ return mask;
>>>>+}
>>>>+
>>>> static bool pmu_needs_timer(struct i915_pmu *pmu, bool gpu_active)
>>>> {
>>>> struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
>>>>@@ -120,9 +147,7 @@ static bool pmu_needs_timer(struct i915_pmu *pmu, bool gpu_active)
>>>> * Mask out all the ones which do not need the timer, or in
>>>> * other words keep all the ones that could need the timer.
>>>> */
>>>>- enable &= config_mask(I915_PMU_ACTUAL_FREQUENCY) |
>>>>- config_mask(I915_PMU_REQUESTED_FREQUENCY) |
>>>>- ENGINE_SAMPLE_MASK;
>>>>+ enable &= frequency_enabled_mask() | ENGINE_SAMPLE_MASK;
>>>
>>>u32 enable & u64 frequency_enabled_mask
>>>
>>>ugly but ok I guess? Or change enable to u64?
>
>making pmu->enable u64 as well as other places where it is assigned to
>local variables.
>
>>
>>Hmm.. yes very ugly. Could have been an accident which happened to
>>work because there is a single timer (not per tile).
>
>Happened to work because the frequency mask does not spill over to the
>upper 32 bits (even for multi tile).
>
>--------------------- START_SECTION ----------------
>>
>>Similar issue in frequency_sampling_enabled too. Gt_id argument to
>>it seems pointless.
>
>Not sure why it's pointless. We need the gt_id to determine the right
>mask for that specific gt. If it's not enabled, then we just return
>without pm_get and async put (like you mention later).
>
>And this piece of code is called within for_each_gt.
>
>>
>>So I now think whole frequency_enabled_mask() is just pointless and
>>should be removed. And then pmu_needs_time code can stay as is.
>>Possibly add a config_mask_32 helper which ensures no bits in upper
>>32 bits are returned.
>>
>>That is if we are happy for the frequency_sampling_enabled returning
>>true for all gts, regardless of which ones actually have frequency
>>sampling enabled.
>>
>>Or if we want to implement it as I probably have intended, we will
>>need to add some gt bits into pmu->enable. Maybe reserve top four
>>same as with config counters.
>
>Nope. What you have here works just fine. pmu->enable should not
>include any gt id info. gt_id[63:60] is only a concept for pmu config
>sent by user. config_mask and pmu->enable are i915 internal
>bookkeeping (bit masks) just to track what events need to be sampled.
>The 'other' bit masks are a function of gt_id because we use gt_id to
>calculate a contiguous numerical value for these 'other' events.
>That's all. Once the numerical value is calculated, there is no need
>for gt_id because config_mask is BIT_ULL(numerical_value). Since the
>numerical values never exceeded 31 (even for multi-gts), everything
>worked even with 32 bit pmu->enable.
>
>>
>>In this case the config_mask needs to be updated to translate not
>>just the config counter into the pmu tracked event bits, but config
>>counter gt id into the pmu->enabled gt id.
>>
>>Sounds easily doable on a first thought.
>
>------------------------ END_SECTION ----------------
>
>
>>>>
>>>> /*
>>>> * When the GPU is idle per-engine counters do not need to be
>>>>@@ -164,9 +189,37 @@ static inline s64 ktime_since_raw(const ktime_t kt)
>>>> return ktime_to_ns(ktime_sub(ktime_get_raw(), kt));
>>>> }
>>>>
>>>>+static unsigned int
>>>>+__sample_idx(struct i915_pmu *pmu, unsigned int gt_id, int sample)
>>>>+{
>>>>+ unsigned int idx = gt_id * __I915_NUM_PMU_SAMPLERS + sample;
>>>>+
>>>>+ GEM_BUG_ON(idx >= ARRAY_SIZE(pmu->sample));
>>>
>>>Does this GEM_BUG_ON need to be split up as follows:
>>>
>>> GEM_BUG_ON(gt_id >= I915_PMU_MAX_GTS);
>>> GEM_BUG_ON(sample >= __I915_NUM_PMU_SAMPLERS);
>>>
>>>Since that is what we really mean here isn't it?
>>
>>ARRAY_SIZE check seems the safest option to me, given it is defined as:
>>
>>sample[I915_PMU_MAX_GTS * __I915_NUM_PMU_SAMPLERS];
>>
>>What problem do you see here?
>>
>>>>+
>>>>+ return idx;
>>>>+}
>>>>+
>>>>+static u64 read_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample)
>>>>+{
>>>>+ return pmu->sample[__sample_idx(pmu, gt_id, sample)].cur;
>>>>+}
>>>>+
>>>>+static void
>>>>+store_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample, u64 val)
>>>>+{
>>>>+ pmu->sample[__sample_idx(pmu, gt_id, sample)].cur = val;
>>>>+}
>>>>+
>>>>+static void
>>>>+add_sample_mult(struct i915_pmu *pmu, unsigned int gt_id, int sample, u32 val, u32 mul)
>>>>+{
>>>>+ pmu->sample[__sample_idx(pmu, gt_id, sample)].cur += mul_u32_u32(val, mul);
>>>>+}
>>>
>>>Gripe: I think this code should have per event data structures which store
>>>all information about a particular event. Rather than storing it in these
>>>arrays common to all events (and in bit-fields common to all events) which
>>>results in the kind of dance we have to do here. Anyway too big a change to
>>>make now but something to consider if we ever do this for xe.
>>
>>Could do a two dimensional array like:
>>
>>sample[I915_PMU_MAX_GTS][__I915_NUM_PMU_SAMPLERS];
>>
>>Any better? Honestly I don't remember if there was a special reason
>>I went for a flat array back then.
>
>Maybe we improve it in XE. I am looking for the shortest path to get
>this merged without any functional issues.
>
>>
>>>
>>>>+
>>>> static u64 get_rc6(struct intel_gt *gt)
>>>> {
>>>> struct drm_i915_private *i915 = gt->i915;
>>>>+ const unsigned int gt_id = gt->info.id;
>>>> struct i915_pmu *pmu = &i915->pmu;
>>>> unsigned long flags;
>>>> bool awake = false;
>>>>@@ -181,7 +234,7 @@ static u64 get_rc6(struct intel_gt *gt)
>>>> spin_lock_irqsave(&pmu->lock, flags);
>>>>
>>>> if (awake) {
>>>>- pmu->sample[__I915_SAMPLE_RC6].cur = val;
>>>>+ store_sample(pmu, gt_id, __I915_SAMPLE_RC6, val);
>>>> } else {
>>>> /*
>>>> * We think we are runtime suspended.
>>>>@@ -190,14 +243,14 @@ static u64 get_rc6(struct intel_gt *gt)
>>>> * on top of the last known real value, as the approximated RC6
>>>> * counter value.
>>>> */
>>>>- val = ktime_since_raw(pmu->sleep_last);
>>>>- val += pmu->sample[__I915_SAMPLE_RC6].cur;
>>>>+ val = ktime_since_raw(pmu->sleep_last[gt_id]);
>>>>+ val += read_sample(pmu, gt_id, __I915_SAMPLE_RC6);
>>>> }
>>>>
>>>>- if (val < pmu->sample[__I915_SAMPLE_RC6_LAST_REPORTED].cur)
>>>>- val = pmu->sample[__I915_SAMPLE_RC6_LAST_REPORTED].cur;
>>>>+ if (val < read_sample(pmu, gt_id, __I915_SAMPLE_RC6_LAST_REPORTED))
>>>>+ val = read_sample(pmu, gt_id, __I915_SAMPLE_RC6_LAST_REPORTED);
>>>> else
>>>>- pmu->sample[__I915_SAMPLE_RC6_LAST_REPORTED].cur = val;
>>>>+ store_sample(pmu, gt_id, __I915_SAMPLE_RC6_LAST_REPORTED, val);
>>>>
>>>> spin_unlock_irqrestore(&pmu->lock, flags);
>>>>
>>>>@@ -207,13 +260,20 @@ static u64 get_rc6(struct intel_gt *gt)
>>>> static void init_rc6(struct i915_pmu *pmu)
>>>> {
>>>> struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
>>>>- intel_wakeref_t wakeref;
>>>>+ struct intel_gt *gt;
>>>>+ unsigned int i;
>>>>+
>>>>+ for_each_gt(gt, i915, i) {
>>>>+ intel_wakeref_t wakeref;
>>>>
>>>>- with_intel_runtime_pm(to_gt(i915)->uncore->rpm, wakeref) {
>>>>- pmu->sample[__I915_SAMPLE_RC6].cur = __get_rc6(to_gt(i915));
>>>>- pmu->sample[__I915_SAMPLE_RC6_LAST_REPORTED].cur =
>>>>- pmu->sample[__I915_SAMPLE_RC6].cur;
>>>>- pmu->sleep_last = ktime_get_raw();
>>>>+ with_intel_runtime_pm(gt->uncore->rpm, wakeref) {
>>>>+ u64 val = __get_rc6(gt);
>>>>+
>>>>+ store_sample(pmu, i, __I915_SAMPLE_RC6, val);
>>>>+ store_sample(pmu, i, __I915_SAMPLE_RC6_LAST_REPORTED,
>>>>+ val);
>>>>+ pmu->sleep_last[i] = ktime_get_raw();
>>>>+ }
>>>> }
>>>> }
>>>>
>>>>@@ -221,8 +281,8 @@ static void park_rc6(struct intel_gt *gt)
>>>> {
>>>> struct i915_pmu *pmu = >->i915->pmu;
>>>>
>>>>- pmu->sample[__I915_SAMPLE_RC6].cur = __get_rc6(gt);
>>>>- pmu->sleep_last = ktime_get_raw();
>>>>+ store_sample(pmu, gt->info.id, __I915_SAMPLE_RC6, __get_rc6(gt));
>>>>+ pmu->sleep_last[gt->info.id] = ktime_get_raw();
>>>> }
>>>>
>>>> static void __i915_pmu_maybe_start_timer(struct i915_pmu *pmu)
>>>>@@ -362,34 +422,30 @@ engines_sample(struct intel_gt *gt, unsigned int period_ns)
>>>> }
>>>> }
>>>>
>>>>-static void
>>>>-add_sample_mult(struct i915_pmu_sample *sample, u32 val, u32 mul)
>>>>-{
>>>>- sample->cur += mul_u32_u32(val, mul);
>>>>-}
>>>>-
>>>>-static bool frequency_sampling_enabled(struct i915_pmu *pmu)
>>>>+static bool
>>>>+frequency_sampling_enabled(struct i915_pmu *pmu, unsigned int gt)
>>>> {
>>>> return pmu->enable &
>>>>- (config_mask(I915_PMU_ACTUAL_FREQUENCY) |
>>>>- config_mask(I915_PMU_REQUESTED_FREQUENCY));
>>>>+ (config_mask(__I915_PMU_ACTUAL_FREQUENCY(gt)) |
>>>>+ config_mask(__I915_PMU_REQUESTED_FREQUENCY(gt)));
>>>
>>>Here again:
>>>
>>> u32 pmu->enable & u64 config_mask
>>>
>>>Probably ok?
>>>
>>>And also in i915_pmu_enable() we have:
>>>
>>> pmu->enable |= BIT_ULL(bit);
>>>
>>>So change pmu->enable to u64?
>
>Right, changing to u64
>
>>>
>>>> }
>>>>
>>>> static void
>>>> frequency_sample(struct intel_gt *gt, unsigned int period_ns)
>>>> {
>>>> struct drm_i915_private *i915 = gt->i915;
>>>>+ const unsigned int gt_id = gt->info.id;
>>>> struct i915_pmu *pmu = &i915->pmu;
>>>> struct intel_rps *rps = >->rps;
>>>>
>>>>- if (!frequency_sampling_enabled(pmu))
>>>>+ if (!frequency_sampling_enabled(pmu, gt_id))
>>>> return;
>>>
>>>Pre-existing issue, but why do we need this check? This is already checked
>>>in the two individual checks for actual and requested freq below:
>>>
>>> if (pmu->enable & config_mask(__I915_PMU_ACTUAL_FREQUENCY(gt_id)))
>>>
>>> and
>>>
>>> if (pmu->enable & config_mask(__I915_PMU_REQUESTED_FREQUENCY(gt_id)))
>>>
>>>So can we delete frequency_sampling_enabled()? Or is it there to avoid the
>>>overhead of intel_gt_pm_get_if_awake() (which doesn't seem to be much)?
>>
>>I think it was to avoid even getting an already active pm ref if
>>frequency events are not enabled. Timer could be running for
>>instance if only engine wait/sema is enabled. So yeah, just a little
>>bit cheaper than pm get + async put and avoid prolonging the delayed
>>put for no reason. (As the timer races with regular GT pm activities
>>(see mod_delayed_work in __intel_wakeref_put_last).)
>
>leaving as is.
>
>>
>>>
>>>>
>>>> /* Report 0/0 (actual/requested) frequency while parked. */
>>>> if (!intel_gt_pm_get_if_awake(gt))
>>>> return;
>>>>
>>>>- if (pmu->enable & config_mask(I915_PMU_ACTUAL_FREQUENCY)) {
>>>>+ if (pmu->enable & config_mask(__I915_PMU_ACTUAL_FREQUENCY(gt_id))) {
>>>> u32 val;
>>>>
>>>> /*
>>>>@@ -405,12 +461,12 @@ frequency_sample(struct intel_gt *gt, unsigned int period_ns)
>>>> if (!val)
>>>> val = intel_gpu_freq(rps, rps->cur_freq);
>>>>
>>>>- add_sample_mult(&pmu->sample[__I915_SAMPLE_FREQ_ACT],
>>>>+ add_sample_mult(pmu, gt_id, __I915_SAMPLE_FREQ_ACT,
>>>> val, period_ns / 1000);
>>>> }
>>>>
>>>>- if (pmu->enable & config_mask(I915_PMU_REQUESTED_FREQUENCY)) {
>>>>- add_sample_mult(&pmu->sample[__I915_SAMPLE_FREQ_REQ],
>>>>+ if (pmu->enable & config_mask(__I915_PMU_REQUESTED_FREQUENCY(gt_id))) {
>>>>+ add_sample_mult(pmu, gt_id, __I915_SAMPLE_FREQ_REQ,
>>>> intel_rps_get_requested_frequency(rps),
>>>> period_ns / 1000);
>>>> }
>>>>@@ -447,9 +503,7 @@ static enum hrtimer_restart i915_sample(struct hrtimer *hrtimer)
>>>> continue;
>>>>
>>>> engines_sample(gt, period_ns);
>>>>-
>>>>- if (i == 0) /* FIXME */
>>>>- frequency_sample(gt, period_ns);
>>>>+ frequency_sample(gt, period_ns);
>>>> }
>>>>
>>>> hrtimer_forward(hrtimer, now, ns_to_ktime(PERIOD));
>>>>@@ -491,7 +545,12 @@ config_status(struct drm_i915_private *i915, u64 config)
>>>> {
>>>> struct intel_gt *gt = to_gt(i915);
>>>>
>>>>- switch (config) {
>>>>+ unsigned int gt_id = config_gt_id(config);
>>>>+
>>>>+ if (gt_id)
>>>>+ return -ENOENT;
>>>
>>>This is just wrong. It is fixed in the next patch:
>>>
>>> if (gt_id > max_gt_id)
>>> return -ENOENT;
>>>
>>>But probably should be fixed in this patch itself. Or dropped from this
>>>patch and let it come in in Patch 6, since it's confusing. Though it
>>>probably belongs in this patch.
>>
>>Hmm my thinking was probably to reject gt > 0 in this patch since
>>only the last patch was supposed to be exposing the other tiles.
>>Granted that is not entirely true since this patch already makes
>>access to them available via i915_drm.h. Last patch only makes then
>>discoverable via sysfs.
>>
>>In this case yes, I'd pull in "gt_id > max_gt_id" into this patch.
>>And this hunk from the next patch too:
>>
>> case I915_PMU_INTERRUPTS:
>>+ if (gt_id)
>>+ return -ENOENT;
>>
>
>pulling in the above snippets from patch 6 to patch 5
>
>>>
>>>>+
>>>>+ switch (config_counter(config)) {
>>>> case I915_PMU_ACTUAL_FREQUENCY:
>>>> if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915))
>>>> /* Requires a mutex for sampling! */
>>>>@@ -599,22 +658,27 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
>>>> val = engine->pmu.sample[sample].cur;
>>>> }
>>>> } else {
>>>>- switch (event->attr.config) {
>>>>+ const unsigned int gt_id = config_gt_id(event->attr.config);
>>>>+ const u64 config = config_counter(event->attr.config);
>>>>+
>>>>+ switch (config) {
>>>> case I915_PMU_ACTUAL_FREQUENCY:
>>>> val =
>>>>- div_u64(pmu->sample[__I915_SAMPLE_FREQ_ACT].cur,
>>>>+ div_u64(read_sample(pmu, gt_id,
>>>>+ __I915_SAMPLE_FREQ_ACT),
>>>> USEC_PER_SEC /* to MHz */);
>>>> break;
>>>> case I915_PMU_REQUESTED_FREQUENCY:
>>>> val =
>>>>- div_u64(pmu->sample[__I915_SAMPLE_FREQ_REQ].cur,
>>>>+ div_u64(read_sample(pmu, gt_id,
>>>>+ __I915_SAMPLE_FREQ_REQ),
>>>> USEC_PER_SEC /* to MHz */);
>>>> break;
>>>> case I915_PMU_INTERRUPTS:
>>>> val = READ_ONCE(pmu->irq_count);
>>>> break;
>>>> case I915_PMU_RC6_RESIDENCY:
>>>>- val = get_rc6(to_gt(i915));
>>>>+ val = get_rc6(i915->gt[gt_id]);
>>>> break;
>>>> case I915_PMU_SOFTWARE_GT_AWAKE_TIME:
>>>> val = ktime_to_ns(intel_gt_get_awake_time(to_gt(i915)));
>>>>diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
>>>>index 3a811266ac6a..d47846f21ddf 100644
>>>>--- a/drivers/gpu/drm/i915/i915_pmu.h
>>>>+++ b/drivers/gpu/drm/i915/i915_pmu.h
>>>>@@ -38,13 +38,16 @@ enum {
>>>> __I915_NUM_PMU_SAMPLERS
>>>> };
>>>>
>>>>+#define I915_PMU_MAX_GTS (4)
>>>
>>>4 or (4)? :-)
>>
>>Bike shed was strong with you on the day of review I see. :)
>>
>>I would rather get rid of this define altogether if we could use the
>>"normal" MAX_GT define. As I was saying earlier, I think this one
>>was here just because header dependencies were too convulted back
>>then. Maybe today things are better? Worth I try probably.
>
>dropping this and using I915_MAX_GTS
Okay, I see what you mentioned here about the header dependencies. It's
still the same. i915_drv.h includes intel_engine.h which includes
i915_pmu.h. I cannot use i915_drv.h to use I915_MAX_GTS, it's causing
all sorts of compile errors (likely cyclic). I don't see a quick way to
resolve that, so I am going to leave it as I915_PMU_MAX_GTS.
Thanks,
Umesh
>
>>
>>>
>>>>+
>>>> /*
>>>> * How many different events we track in the global PMU mask.
>>>> *
>>>> * It is also used to know to needed number of event reference counters.
>>>> */
>>>> #define I915_PMU_MASK_BITS \
>>>>- (I915_ENGINE_SAMPLE_COUNT + __I915_PMU_TRACKED_EVENT_COUNT)
>>>>+ (I915_ENGINE_SAMPLE_COUNT + \
>>>>+ I915_PMU_MAX_GTS * __I915_PMU_TRACKED_EVENT_COUNT)
>>>>
>>>> #define I915_ENGINE_SAMPLE_COUNT (I915_SAMPLE_SEMA + 1)
>>>>
>>>>@@ -124,11 +127,11 @@ struct i915_pmu {
>>>> * Only global counters are held here, while the per-engine ones are in
>>>> * struct intel_engine_cs.
>>>> */
>>>>- struct i915_pmu_sample sample[__I915_NUM_PMU_SAMPLERS];
>>>>+ struct i915_pmu_sample sample[I915_PMU_MAX_GTS * __I915_NUM_PMU_SAMPLERS];
>>>> /**
>>>> * @sleep_last: Last time GT parked for RC6 estimation.
>>>> */
>>>>- ktime_t sleep_last;
>>>>+ ktime_t sleep_last[I915_PMU_MAX_GTS];
>>>> /**
>>>> * @irq_count: Number of interrupts
>>>> *
>>>>diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>>>>index dba7c5a5b25e..d5ac1fdeb2b1 100644
>>>>--- a/include/uapi/drm/i915_drm.h
>>>>+++ b/include/uapi/drm/i915_drm.h
>>>>@@ -280,7 +280,16 @@ enum drm_i915_pmu_engine_sample {
>>>> #define I915_PMU_ENGINE_SEMA(class, instance) \
>>>> __I915_PMU_ENGINE(class, instance, I915_SAMPLE_SEMA)
>>>>
>>>>-#define __I915_PMU_OTHER(x) (__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x))
>>>>+/*
>>>>+ * Top 4 bits of every non-engine counter are GT id.
>>>>+ */
>>>>+#define __I915_PMU_GT_SHIFT (60)
>>>
>>>REG_GENMASK64 or GENMASK_ULL would be nicer but of course we can't put in
>>>the uapi header, so ok.
>>
>>Yep.
>
>leaving as is.
>
>>
>>>>+
>>>>+#define ___I915_PMU_OTHER(gt, x) \
>>>>+ (((__u64)__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x)) | \
>>>>+ ((__u64)(gt) << __I915_PMU_GT_SHIFT))
>>>>+
>>>>+#define __I915_PMU_OTHER(x) ___I915_PMU_OTHER(0, x)
>>>>
>>>> #define I915_PMU_ACTUAL_FREQUENCY __I915_PMU_OTHER(0)
>>>> #define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1)
>>>>@@ -290,6 +299,12 @@ enum drm_i915_pmu_engine_sample {
>>>>
>>>> #define I915_PMU_LAST /* Deprecated - do not use */ I915_PMU_RC6_RESIDENCY
>>>>
>>>>+#define __I915_PMU_ACTUAL_FREQUENCY(gt) ___I915_PMU_OTHER(gt, 0)
>>>>+#define __I915_PMU_REQUESTED_FREQUENCY(gt) ___I915_PMU_OTHER(gt, 1)
>>>>+#define __I915_PMU_INTERRUPTS(gt) ___I915_PMU_OTHER(gt, 2)
>>>>+#define __I915_PMU_RC6_RESIDENCY(gt) ___I915_PMU_OTHER(gt, 3)
>>>>+#define __I915_PMU_SOFTWARE_GT_AWAKE_TIME(gt) ___I915_PMU_OTHER(gt, 4)
>>>>+
>>>> /* Each region is a minimum of 16k, and there are at most 255 of them.
>>>> */
>>>> #define I915_NR_TEX_REGIONS 255 /* table size 2k - maximum due to use
>>>>--
>>>>2.36.1
>>>>
>>>
>>>Above comments are mostly nits so after addressing the above comments, this
>>>is:
>>>
>>>Reviewed-by: Ashutosh Dixit <ashutosh.dixit at intel.com>
>>
>>Well you found some ugly bits (or I got confused, double check me
>>please) so I'd say hold off with r-b just yet. Sadly it's on Umesh
>>now to fix up my mess. :I
>
>I don't see anything wrong with the SECTION I marked above. As in, the
>pmu_needs_timer and the sampling code for events that need to be
>sampled. If you agree, I can spin the next revision.
>
>Thanks,
>Umesh
>
>>
>>Regards,
>>
>>Tvrtko
More information about the Intel-gfx
mailing list