[Mesa-dev] [PATCH 08/11] i965: perf: snapshot RPSTAT1 register
Lionel Landwerlin
lionel.g.landwerlin at intel.com
Tue Apr 3 13:59:28 UTC 2018
On 23/03/18 18:00, Kenneth Graunke wrote:
> On Thursday, March 8, 2018 7:42:53 AM PDT Lionel Landwerlin wrote:
>> This register contains the frequency of the GT; it's one of the values
>> GPA would like to have as part of their queries.
>>
>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
>> ---
>> src/mesa/drivers/dri/i965/brw_defines.h | 10 +++++
>> src/mesa/drivers/dri/i965/brw_performance_query.c | 45 +++++++++++++++++++++++
>> src/mesa/drivers/dri/i965/brw_performance_query.h | 5 +++
>> 3 files changed, 60 insertions(+)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
>> index 8bf6f68b67c..ead44ebc5e8 100644
>> --- a/src/mesa/drivers/dri/i965/brw_defines.h
>> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
>> @@ -1656,6 +1656,16 @@ enum brw_pixel_shader_coverage_mask_mode {
>> #define CS_DEBUG_MODE2 0x20d8 /* Gen9+ */
>> # define CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE (1 << 4)
>>
>> +#define GEN6_RPSTAT1 0xA01C
>> +#define GEN6_RPSTAT1_CURR_GT_FREQ_SHIFT 7
>> +#define GEN6_RPSTAT1_CURR_GT_FREQ_MASK INTEL_MASK(13, 7)
>> +#define GEN6_RPSTAT1_PREV_GT_FREQ_SHIFT 0
>> +#define GEN6_RPSTAT1_PREV_GT_FREQ_MASK INTEL_MASK(6, 0)
>> +#define GEN9_RPSTAT1_CURR_GT_FREQ_SHIFT 23
>> +#define GEN9_RPSTAT1_CURR_GT_FREQ_MASK INTEL_MASK(31, 23)
>> +#define GEN9_RPSTAT1_PREV_GT_FREQ_SHIFT 0
>> +#define GEN9_RPSTAT1_PREV_GT_FREQ_MASK INTEL_MASK(8, 0)
>> +
> I can confirm that Haswell->Broadwell use 13:7 and 6:0, while
> Skylake and Cannonlake use 31:23 and 8:0. They apparently call this
> RPSTAT1 on Haswell and RP_STATUS0 on Gen8+.
>
> These are the wrong masks for Sandybridge, so I would not call them
> GEN6_*. The kernel has code for Sandybridge if we wanted to handle it,
> but it looks like we don't expose OA on Sandybridge anyway, so there's
> likely little point.
>
> Baytrail and Cherryview should both be excluded, as you have to read the
> current frequency from the PUnit. Broxton and all others should work.
Thanks, updating.
>
>> #define SLICE_COMMON_ECO_CHICKEN1 0x731c /* Gen9+ */
>> # define GLK_SCEC_BARRIER_MODE_GPGPU (0 << 7)
>> # define GLK_SCEC_BARRIER_MODE_3D_HULL (1 << 7)
>> diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
>> index 98666759d75..7d5b44cf61d 100644
>> --- a/src/mesa/drivers/dri/i965/brw_performance_query.c
>> +++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
>> @@ -227,6 +227,8 @@ brw_perf_query(struct gl_perf_query_object *o)
>>
>> #define MI_RPC_BO_SIZE 4096
>> #define MI_RPC_BO_END_OFFSET_BYTES (MI_RPC_BO_SIZE / 2)
>> +#define MI_FREQ_START_OFFSET_BYTES (3072)
>> +#define MI_FREQ_END_OFFSET_BYTES (3076)
> Why these?
Those are the byte offsets in the OA BO where I store the RPSTAT1 copies (before/after the workload).
>
>> /******************************************************************************/
>>
>> @@ -1150,6 +1152,9 @@ brw_begin_perf_query(struct gl_context *ctx,
>> /* Take a starting OA counter snapshot. */
>> brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo, 0,
>> obj->oa.begin_report_id);
>> + brw_store_register_mem32(brw, obj->oa.bo, GEN6_RPSTAT1,
>> + MI_FREQ_START_OFFSET_BYTES);
>> +
>> ++brw->perfquery.n_active_oa_queries;
>>
>> /* No already-buffered samples can possibly be associated with this query
>> @@ -1233,6 +1238,8 @@ brw_end_perf_query(struct gl_context *ctx,
>> */
>> if (!obj->oa.results_accumulated) {
>> /* Take an ending OA counter snapshot. */
>> + brw_store_register_mem32(brw, obj->oa.bo, GEN6_RPSTAT1,
>> + MI_FREQ_END_OFFSET_BYTES);
>> brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo,
>> MI_RPC_BO_END_OFFSET_BYTES,
>> obj->oa.begin_report_id + 1);
>> @@ -1333,6 +1340,43 @@ brw_is_perf_query_ready(struct gl_context *ctx,
>> return false;
>> }
>>
>> +static void
>> +read_gt_frequency(struct brw_context *brw,
>> + struct brw_perf_query_object *obj)
>> +{
>> + const struct gen_device_info *devinfo = &brw->screen->devinfo;
>> + uint32_t *start_reg = obj->oa.map + MI_FREQ_START_OFFSET_BYTES,
>> + *end_reg = obj->oa.map + MI_FREQ_END_OFFSET_BYTES;
>> +
>> + switch (devinfo->gen) {
>> + case 7:
>> + case 8:
>> + obj->oa.gt_frequency[0] =
>> + ((start_reg[0] & GEN6_RPSTAT1_CURR_GT_FREQ_MASK) >>
>> + GEN6_RPSTAT1_CURR_GT_FREQ_SHIFT) * 50ULL;
> You can just do:
>
> GET_FIELD(start_reg[0], GEN6_RPSTAT1_CURR_GT_FREQ)
>
> instead of shifting and masking.
>
> I think your conversions may be wrong. In particular, you don't handle
> Gen9LP and Gen9 differently, while in the kernel, GT_PM_INTERVAL_TO_US
> does:
>
> Gen9 LP: 0.833 -> usec
> Gen9+ non-LP: 1.33 -> usec
> other: 1.28 -> usec
>
> #define INTERVAL_1_28_TO_US(interval) (((interval) << 7) / 100)
> #define INTERVAL_1_33_TO_US(interval) (((interval) << 2) / 3)
> #define INTERVAL_0_833_TO_US(interval) (((interval) * 5) / 6)
> #define GT_PM_INTERVAL_TO_US(dev_priv, interval) (INTEL_GEN(dev_priv) >= 9 ? \
> (IS_GEN9_LP(dev_priv) ? \
> INTERVAL_0_833_TO_US(interval) : \
> INTERVAL_1_33_TO_US(interval)) : \
> INTERVAL_1_28_TO_US(interval))
>
> I could be mistaken, though.
Actually, the kernel already reads RPSTAT1 and computes the frequency value.
I think the current code is equivalent to what the kernel does on big
cores and on small cores >= gen9.
On Cherryview/Valleyview, we need to read another register to figure out
the multipliers...
So I'll just leave it out for those small-core gens for now.
>
>> + obj->oa.gt_frequency[1] =
>> + ((end_reg[0] & GEN6_RPSTAT1_CURR_GT_FREQ_MASK) >>
>> + GEN6_RPSTAT1_CURR_GT_FREQ_SHIFT) * 50ULL;
>> + break;
>> + case 9:
>> + case 10:
>> + case 11:
>> + obj->oa.gt_frequency[0] =
>> + ((start_reg[0] & GEN9_RPSTAT1_CURR_GT_FREQ_MASK) >>
>> + GEN9_RPSTAT1_CURR_GT_FREQ_SHIFT) * 100ULL / 6ULL;
>> + obj->oa.gt_frequency[1] =
>> + ((end_reg[0] & GEN9_RPSTAT1_CURR_GT_FREQ_MASK) >>
>> + GEN9_RPSTAT1_CURR_GT_FREQ_SHIFT) * 100ULL / 6ULL;
>> + break;
>> + default:
>> + unreachable("unexpected gen");
>> + }
>> +
>> + /* Put the numbers into Hz. */
>> + obj->oa.gt_frequency[0] *= 1000000ULL;
>> + obj->oa.gt_frequency[1] *= 1000000ULL;
>> +}
>> +
>> static int
>> get_oa_counter_data(struct brw_context *brw,
>> struct brw_perf_query_object *obj,
>> @@ -1344,6 +1388,7 @@ get_oa_counter_data(struct brw_context *brw,
>> int written = 0;
>>
>> if (!obj->oa.results_accumulated) {
>> + read_gt_frequency(brw, obj);
>> accumulate_oa_reports(brw, obj);
>> assert(obj->oa.results_accumulated);
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.h b/src/mesa/drivers/dri/i965/brw_performance_query.h
>> index f62786f7f1c..f8732738b4e 100644
>> --- a/src/mesa/drivers/dri/i965/brw_performance_query.h
>> +++ b/src/mesa/drivers/dri/i965/brw_performance_query.h
>> @@ -113,6 +113,11 @@ struct brw_perf_query_object
>> * Number of reports accumulated to produce the results.
>> */
>> uint32_t reports_accumulated;
>> +
>> + /**
>> + * Frequency of the GT at begin and end of the query.
>> + */
>> + uint64_t gt_frequency[2];
>> } oa;
>>
>> struct {
>>
More information about the mesa-dev
mailing list