[Intel-gfx] [PATCH 5/6] drm/i915/pmu: Prepare for multi-tile non-engine counters
Umesh Nerlige Ramappa
umesh.nerlige.ramappa at intel.com
Mon May 15 06:16:01 UTC 2023
On Fri, May 12, 2023 at 09:41:56PM -0700, Dixit, Ashutosh wrote:
>On Fri, 12 May 2023 18:55:44 -0700, Umesh Nerlige Ramappa wrote:
>>
>> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>
>> Reserve some bits in the counter config namespace which will carry the
>> tile id and prepare the code to handle this.
>>
>> No per tile counters have been added yet.
>>
>> v2:
>> - Fix checkpatch issues
>> - Use 4 bits for gt id in non-engine counters. Drop FIXME.
>> - Set MAX GTs to 4. Drop FIXME.
>>
>> v3: (Ashutosh, Tvrtko)
>> - Drop BUG_ON that would never fire
>> - Make enable u64
>> - Pull in some code from next patch
>
>Just a reminder in case you want to do something like:
>
>#define I915_PMU_MAX_GTS I915_MAX_GT
>
>Or replace I915_PMU_MAX_GTS by I915_MAX_GT.
Hmmm, I thought I had sent out a separate response about that in the
previous series, but I am not able to locate it, strange. Anyway, I did
try that and ran into the issues that Tvrtko was mentioning w.r.t. header
dependencies. I think i915_drv.h includes intel_engine.h and that
includes i915_pmu.h, so including i915_drv.h in i915_pmu.h for the
definition of I915_MAX_GT just wreaks havoc at compile time.
Hence, I gave up on that and am using what already existed in Tvrtko's
patch.
Thanks,
Umesh
>
>But otherwise v3 LGTM:
>
>Reviewed-by: Ashutosh Dixit <ashutosh.dixit at intel.com>
>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>> ---
>> drivers/gpu/drm/i915/i915_pmu.c | 148 +++++++++++++++++++++++---------
>> drivers/gpu/drm/i915/i915_pmu.h | 11 ++-
>> include/uapi/drm/i915_drm.h | 17 +++-
>> 3 files changed, 129 insertions(+), 47 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
>> index 725b01b00775..b3dd9e51c5cc 100644
>> --- a/drivers/gpu/drm/i915/i915_pmu.c
>> +++ b/drivers/gpu/drm/i915/i915_pmu.c
>> @@ -56,11 +56,21 @@ static bool is_engine_config(u64 config)
>> return config < __I915_PMU_OTHER(0);
>> }
>>
>> +static unsigned int config_gt_id(const u64 config)
>> +{
>> + return config >> __I915_PMU_GT_SHIFT;
>> +}
>> +
>> +static u64 config_counter(const u64 config)
>> +{
>> + return config & ~(~0ULL << __I915_PMU_GT_SHIFT);
>> +}
>> +
>> static unsigned int other_bit(const u64 config)
>> {
>> unsigned int val;
>>
>> - switch (config) {
>> + switch (config_counter(config)) {
>> case I915_PMU_ACTUAL_FREQUENCY:
>> val = __I915_PMU_ACTUAL_FREQUENCY_ENABLED;
>> break;
>> @@ -78,7 +88,9 @@ static unsigned int other_bit(const u64 config)
>> return -1;
>> }
>>
>> - return I915_ENGINE_SAMPLE_COUNT + val;
>> + return I915_ENGINE_SAMPLE_COUNT +
>> + config_gt_id(config) * __I915_PMU_TRACKED_EVENT_COUNT +
>> + val;
>> }
>>
>> static unsigned int config_bit(const u64 config)
>> @@ -104,10 +116,22 @@ static unsigned int event_bit(struct perf_event *event)
>> return config_bit(event->attr.config);
>> }
>>
>> +static u64 frequency_enabled_mask(void)
>> +{
>> + unsigned int i;
>> + u64 mask = 0;
>> +
>> + for (i = 0; i < I915_PMU_MAX_GTS; i++)
>> + mask |= config_mask(__I915_PMU_ACTUAL_FREQUENCY(i)) |
>> + config_mask(__I915_PMU_REQUESTED_FREQUENCY(i));
>> +
>> + return mask;
>> +}
>> +
>> static bool pmu_needs_timer(struct i915_pmu *pmu, bool gpu_active)
>> {
>> struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
>> - u32 enable;
>> + u64 enable;
>>
>> /*
>> * Only some counters need the sampling timer.
>> @@ -120,9 +144,7 @@ static bool pmu_needs_timer(struct i915_pmu *pmu, bool gpu_active)
>> * Mask out all the ones which do not need the timer, or in
>> * other words keep all the ones that could need the timer.
>> */
>> - enable &= config_mask(I915_PMU_ACTUAL_FREQUENCY) |
>> - config_mask(I915_PMU_REQUESTED_FREQUENCY) |
>> - ENGINE_SAMPLE_MASK;
>> + enable &= frequency_enabled_mask() | ENGINE_SAMPLE_MASK;
>>
>> /*
>> * When the GPU is idle per-engine counters do not need to be
>> @@ -164,9 +186,37 @@ static inline s64 ktime_since_raw(const ktime_t kt)
>> return ktime_to_ns(ktime_sub(ktime_get_raw(), kt));
>> }
>>
>> +static unsigned int
>> +__sample_idx(struct i915_pmu *pmu, unsigned int gt_id, int sample)
>> +{
>> + unsigned int idx = gt_id * __I915_NUM_PMU_SAMPLERS + sample;
>> +
>> + GEM_BUG_ON(idx >= ARRAY_SIZE(pmu->sample));
>> +
>> + return idx;
>> +}
>> +
>> +static u64 read_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample)
>> +{
>> + return pmu->sample[__sample_idx(pmu, gt_id, sample)].cur;
>> +}
>> +
>> +static void
>> +store_sample(struct i915_pmu *pmu, unsigned int gt_id, int sample, u64 val)
>> +{
>> + pmu->sample[__sample_idx(pmu, gt_id, sample)].cur = val;
>> +}
>> +
>> +static void
>> +add_sample_mult(struct i915_pmu *pmu, unsigned int gt_id, int sample, u32 val, u32 mul)
>> +{
>> + pmu->sample[__sample_idx(pmu, gt_id, sample)].cur += mul_u32_u32(val, mul);
>> +}
>> +
>> static u64 get_rc6(struct intel_gt *gt)
>> {
>> struct drm_i915_private *i915 = gt->i915;
>> + const unsigned int gt_id = gt->info.id;
>> struct i915_pmu *pmu = &i915->pmu;
>> unsigned long flags;
>> bool awake = false;
>> @@ -181,7 +231,7 @@ static u64 get_rc6(struct intel_gt *gt)
>> spin_lock_irqsave(&pmu->lock, flags);
>>
>> if (awake) {
>> - pmu->sample[__I915_SAMPLE_RC6].cur = val;
>> + store_sample(pmu, gt_id, __I915_SAMPLE_RC6, val);
>> } else {
>> /*
>> * We think we are runtime suspended.
>> @@ -190,14 +240,14 @@ static u64 get_rc6(struct intel_gt *gt)
>> * on top of the last known real value, as the approximated RC6
>> * counter value.
>> */
>> - val = ktime_since_raw(pmu->sleep_last);
>> - val += pmu->sample[__I915_SAMPLE_RC6].cur;
>> + val = ktime_since_raw(pmu->sleep_last[gt_id]);
>> + val += read_sample(pmu, gt_id, __I915_SAMPLE_RC6);
>> }
>>
>> - if (val < pmu->sample[__I915_SAMPLE_RC6_LAST_REPORTED].cur)
>> - val = pmu->sample[__I915_SAMPLE_RC6_LAST_REPORTED].cur;
>> + if (val < read_sample(pmu, gt_id, __I915_SAMPLE_RC6_LAST_REPORTED))
>> + val = read_sample(pmu, gt_id, __I915_SAMPLE_RC6_LAST_REPORTED);
>> else
>> - pmu->sample[__I915_SAMPLE_RC6_LAST_REPORTED].cur = val;
>> + store_sample(pmu, gt_id, __I915_SAMPLE_RC6_LAST_REPORTED, val);
>>
>> spin_unlock_irqrestore(&pmu->lock, flags);
>>
>> @@ -207,13 +257,20 @@ static u64 get_rc6(struct intel_gt *gt)
>> static void init_rc6(struct i915_pmu *pmu)
>> {
>> struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
>> - intel_wakeref_t wakeref;
>> + struct intel_gt *gt;
>> + unsigned int i;
>> +
>> + for_each_gt(gt, i915, i) {
>> + intel_wakeref_t wakeref;
>> +
>> + with_intel_runtime_pm(gt->uncore->rpm, wakeref) {
>> + u64 val = __get_rc6(gt);
>>
>> - with_intel_runtime_pm(to_gt(i915)->uncore->rpm, wakeref) {
>> - pmu->sample[__I915_SAMPLE_RC6].cur = __get_rc6(to_gt(i915));
>> - pmu->sample[__I915_SAMPLE_RC6_LAST_REPORTED].cur =
>> - pmu->sample[__I915_SAMPLE_RC6].cur;
>> - pmu->sleep_last = ktime_get_raw();
>> + store_sample(pmu, i, __I915_SAMPLE_RC6, val);
>> + store_sample(pmu, i, __I915_SAMPLE_RC6_LAST_REPORTED,
>> + val);
>> + pmu->sleep_last[i] = ktime_get_raw();
>> + }
>> }
>> }
>>
>> @@ -221,8 +278,8 @@ static void park_rc6(struct intel_gt *gt)
>> {
>> struct i915_pmu *pmu = &gt->i915->pmu;
>>
>> - pmu->sample[__I915_SAMPLE_RC6].cur = __get_rc6(gt);
>> - pmu->sleep_last = ktime_get_raw();
>> + store_sample(pmu, gt->info.id, __I915_SAMPLE_RC6, __get_rc6(gt));
>> + pmu->sleep_last[gt->info.id] = ktime_get_raw();
>> }
>>
>> static void __i915_pmu_maybe_start_timer(struct i915_pmu *pmu)
>> @@ -362,34 +419,30 @@ engines_sample(struct intel_gt *gt, unsigned int period_ns)
>> }
>> }
>>
>> -static void
>> -add_sample_mult(struct i915_pmu_sample *sample, u32 val, u32 mul)
>> -{
>> - sample->cur += mul_u32_u32(val, mul);
>> -}
>> -
>> -static bool frequency_sampling_enabled(struct i915_pmu *pmu)
>> +static bool
>> +frequency_sampling_enabled(struct i915_pmu *pmu, unsigned int gt)
>> {
>> return pmu->enable &
>> - (config_mask(I915_PMU_ACTUAL_FREQUENCY) |
>> - config_mask(I915_PMU_REQUESTED_FREQUENCY));
>> + (config_mask(__I915_PMU_ACTUAL_FREQUENCY(gt)) |
>> + config_mask(__I915_PMU_REQUESTED_FREQUENCY(gt)));
>> }
>>
>> static void
>> frequency_sample(struct intel_gt *gt, unsigned int period_ns)
>> {
>> struct drm_i915_private *i915 = gt->i915;
>> + const unsigned int gt_id = gt->info.id;
>> struct i915_pmu *pmu = &i915->pmu;
>> struct intel_rps *rps = &gt->rps;
>>
>> - if (!frequency_sampling_enabled(pmu))
>> + if (!frequency_sampling_enabled(pmu, gt_id))
>> return;
>>
>> /* Report 0/0 (actual/requested) frequency while parked. */
>> if (!intel_gt_pm_get_if_awake(gt))
>> return;
>>
>> - if (pmu->enable & config_mask(I915_PMU_ACTUAL_FREQUENCY)) {
>> + if (pmu->enable & config_mask(__I915_PMU_ACTUAL_FREQUENCY(gt_id))) {
>> u32 val;
>>
>> /*
>> @@ -405,12 +458,12 @@ frequency_sample(struct intel_gt *gt, unsigned int period_ns)
>> if (!val)
>> val = intel_gpu_freq(rps, rps->cur_freq);
>>
>> - add_sample_mult(&pmu->sample[__I915_SAMPLE_FREQ_ACT],
>> + add_sample_mult(pmu, gt_id, __I915_SAMPLE_FREQ_ACT,
>> val, period_ns / 1000);
>> }
>>
>> - if (pmu->enable & config_mask(I915_PMU_REQUESTED_FREQUENCY)) {
>> - add_sample_mult(&pmu->sample[__I915_SAMPLE_FREQ_REQ],
>> + if (pmu->enable & config_mask(__I915_PMU_REQUESTED_FREQUENCY(gt_id))) {
>> + add_sample_mult(pmu, gt_id, __I915_SAMPLE_FREQ_REQ,
>> intel_rps_get_requested_frequency(rps),
>> period_ns / 1000);
>> }
>> @@ -444,9 +497,7 @@ static enum hrtimer_restart i915_sample(struct hrtimer *hrtimer)
>>
>> for_each_gt(gt, i915, i) {
>> engines_sample(gt, period_ns);
>> -
>> - if (i == 0) /* FIXME */
>> - frequency_sample(gt, period_ns);
>> + frequency_sample(gt, period_ns);
>> }
>>
>> hrtimer_forward(hrtimer, now, ns_to_ktime(PERIOD));
>> @@ -488,7 +539,13 @@ config_status(struct drm_i915_private *i915, u64 config)
>> {
>> struct intel_gt *gt = to_gt(i915);
>>
>> - switch (config) {
>> + unsigned int gt_id = config_gt_id(config);
>> + unsigned int max_gt_id = HAS_EXTRA_GT_LIST(i915) ? 1 : 0;
>> +
>> + if (gt_id > max_gt_id)
>> + return -ENOENT;
>> +
>> + switch (config_counter(config)) {
>> case I915_PMU_ACTUAL_FREQUENCY:
>> if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915))
>> /* Requires a mutex for sampling! */
>> @@ -499,6 +556,8 @@ config_status(struct drm_i915_private *i915, u64 config)
>> return -ENODEV;
>> break;
>> case I915_PMU_INTERRUPTS:
>> + if (gt_id)
>> + return -ENOENT;
>> break;
>> case I915_PMU_RC6_RESIDENCY:
>> if (!gt->rc6.supported)
>> @@ -596,22 +655,27 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
>> val = engine->pmu.sample[sample].cur;
>> }
>> } else {
>> - switch (event->attr.config) {
>> + const unsigned int gt_id = config_gt_id(event->attr.config);
>> + const u64 config = config_counter(event->attr.config);
>> +
>> + switch (config) {
>> case I915_PMU_ACTUAL_FREQUENCY:
>> val =
>> - div_u64(pmu->sample[__I915_SAMPLE_FREQ_ACT].cur,
>> + div_u64(read_sample(pmu, gt_id,
>> + __I915_SAMPLE_FREQ_ACT),
>> USEC_PER_SEC /* to MHz */);
>> break;
>> case I915_PMU_REQUESTED_FREQUENCY:
>> val =
>> - div_u64(pmu->sample[__I915_SAMPLE_FREQ_REQ].cur,
>> + div_u64(read_sample(pmu, gt_id,
>> + __I915_SAMPLE_FREQ_REQ),
>> USEC_PER_SEC /* to MHz */);
>> break;
>> case I915_PMU_INTERRUPTS:
>> val = READ_ONCE(pmu->irq_count);
>> break;
>> case I915_PMU_RC6_RESIDENCY:
>> - val = get_rc6(to_gt(i915));
>> + val = get_rc6(i915->gt[gt_id]);
>> break;
>> case I915_PMU_SOFTWARE_GT_AWAKE_TIME:
>> val = ktime_to_ns(intel_gt_get_awake_time(to_gt(i915)));
>> diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
>> index 3a811266ac6a..ea2d24ef5664 100644
>> --- a/drivers/gpu/drm/i915/i915_pmu.h
>> +++ b/drivers/gpu/drm/i915/i915_pmu.h
>> @@ -38,13 +38,16 @@ enum {
>> __I915_NUM_PMU_SAMPLERS
>> };
>>
>> +#define I915_PMU_MAX_GTS 4
>> +
>> /*
>> * How many different events we track in the global PMU mask.
>> *
>> * It is also used to know to needed number of event reference counters.
>> */
>> #define I915_PMU_MASK_BITS \
>> - (I915_ENGINE_SAMPLE_COUNT + __I915_PMU_TRACKED_EVENT_COUNT)
>> + (I915_ENGINE_SAMPLE_COUNT + \
>> + I915_PMU_MAX_GTS * __I915_PMU_TRACKED_EVENT_COUNT)
>>
>> #define I915_ENGINE_SAMPLE_COUNT (I915_SAMPLE_SEMA + 1)
>>
>> @@ -95,7 +98,7 @@ struct i915_pmu {
>> *
>> * Low bits are engine samplers and other events continue from there.
>> */
>> - u32 enable;
>> + u64 enable;
>>
>> /**
>> * @timer_last:
>> @@ -124,11 +127,11 @@ struct i915_pmu {
>> * Only global counters are held here, while the per-engine ones are in
>> * struct intel_engine_cs.
>> */
>> - struct i915_pmu_sample sample[__I915_NUM_PMU_SAMPLERS];
>> + struct i915_pmu_sample sample[I915_PMU_MAX_GTS * __I915_NUM_PMU_SAMPLERS];
>> /**
>> * @sleep_last: Last time GT parked for RC6 estimation.
>> */
>> - ktime_t sleep_last;
>> + ktime_t sleep_last[I915_PMU_MAX_GTS];
>> /**
>> * @irq_count: Number of interrupts
>> *
>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>> index dba7c5a5b25e..d5ac1fdeb2b1 100644
>> --- a/include/uapi/drm/i915_drm.h
>> +++ b/include/uapi/drm/i915_drm.h
>> @@ -280,7 +280,16 @@ enum drm_i915_pmu_engine_sample {
>> #define I915_PMU_ENGINE_SEMA(class, instance) \
>> __I915_PMU_ENGINE(class, instance, I915_SAMPLE_SEMA)
>>
>> -#define __I915_PMU_OTHER(x) (__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x))
>> +/*
>> + * Top 4 bits of every non-engine counter are GT id.
>> + */
>> +#define __I915_PMU_GT_SHIFT (60)
>> +
>> +#define ___I915_PMU_OTHER(gt, x) \
>> + (((__u64)__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x)) | \
>> + ((__u64)(gt) << __I915_PMU_GT_SHIFT))
>> +
>> +#define __I915_PMU_OTHER(x) ___I915_PMU_OTHER(0, x)
>>
>> #define I915_PMU_ACTUAL_FREQUENCY __I915_PMU_OTHER(0)
>> #define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1)
>> @@ -290,6 +299,12 @@ enum drm_i915_pmu_engine_sample {
>>
>> #define I915_PMU_LAST /* Deprecated - do not use */ I915_PMU_RC6_RESIDENCY
>>
>> +#define __I915_PMU_ACTUAL_FREQUENCY(gt) ___I915_PMU_OTHER(gt, 0)
>> +#define __I915_PMU_REQUESTED_FREQUENCY(gt) ___I915_PMU_OTHER(gt, 1)
>> +#define __I915_PMU_INTERRUPTS(gt) ___I915_PMU_OTHER(gt, 2)
>> +#define __I915_PMU_RC6_RESIDENCY(gt) ___I915_PMU_OTHER(gt, 3)
>> +#define __I915_PMU_SOFTWARE_GT_AWAKE_TIME(gt) ___I915_PMU_OTHER(gt, 4)
>> +
>> /* Each region is a minimum of 16k, and there are at most 255 of them.
>> */
>> #define I915_NR_TEX_REGIONS 255 /* table size 2k - maximum due to use
>> --
>> 2.36.1
>>
More information about the Intel-gfx
mailing list