[Mesa-dev] [PATCH 08/11] i965: perf: snapshot RPSTAT1 register

Lionel Landwerlin lionel.g.landwerlin at intel.com
Tue Apr 3 13:59:28 UTC 2018


On 23/03/18 18:00, Kenneth Graunke wrote:
> On Thursday, March 8, 2018 7:42:53 AM PDT Lionel Landwerlin wrote:
>> This register contains the frequency of the GT; it's one of the values
>> GPA would like to have as part of their queries.
>>
>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
>> ---
>>   src/mesa/drivers/dri/i965/brw_defines.h           | 10 +++++
>>   src/mesa/drivers/dri/i965/brw_performance_query.c | 45 +++++++++++++++++++++++
>>   src/mesa/drivers/dri/i965/brw_performance_query.h |  5 +++
>>   3 files changed, 60 insertions(+)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
>> index 8bf6f68b67c..ead44ebc5e8 100644
>> --- a/src/mesa/drivers/dri/i965/brw_defines.h
>> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
>> @@ -1656,6 +1656,16 @@ enum brw_pixel_shader_coverage_mask_mode {
>>   #define CS_DEBUG_MODE2                     0x20d8 /* Gen9+ */
>>   # define CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE (1 << 4)
>>   
>> +#define GEN6_RPSTAT1                       0xA01C
>> +#define  GEN6_RPSTAT1_CURR_GT_FREQ_SHIFT   7
>> +#define  GEN6_RPSTAT1_CURR_GT_FREQ_MASK    INTEL_MASK(13, 7)
>> +#define  GEN6_RPSTAT1_PREV_GT_FREQ_SHIFT   0
>> +#define  GEN6_RPSTAT1_PREV_GT_FREQ_MASK    INTEL_MASK(6, 0)
>> +#define  GEN9_RPSTAT1_CURR_GT_FREQ_SHIFT   23
>> +#define  GEN9_RPSTAT1_CURR_GT_FREQ_MASK    INTEL_MASK(31, 23)
>> +#define  GEN9_RPSTAT1_PREV_GT_FREQ_SHIFT   0
>> +#define  GEN9_RPSTAT1_PREV_GT_FREQ_MASK    INTEL_MASK(8, 0)
>> +
> I can confirm that Haswell->Broadwell use 13:7 and 6:0, while
> Skylake and Cannonlake use 31:23 and 8:0.  They apparently call this
> RPSTAT1 on Haswell and RP_STATUS0 on Gen8+.
>
> These are the wrong masks for Sandybridge, so I would not call them
> GEN6_*.  The kernel has code for Sandybridge if we wanted to handle it,
> but it looks like we don't expose OA on Sandybridge anyway, so there's
> likely little point.
>
> Baytrail and Cherryview should both be excluded, as you have to read the
> current frequency from the PUnit.  Broxton and all others should work.

Thanks, updating.

>
>>   #define SLICE_COMMON_ECO_CHICKEN1          0x731c /* Gen9+ */
>>   # define GLK_SCEC_BARRIER_MODE_GPGPU       (0 << 7)
>>   # define GLK_SCEC_BARRIER_MODE_3D_HULL     (1 << 7)
>> diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
>> index 98666759d75..7d5b44cf61d 100644
>> --- a/src/mesa/drivers/dri/i965/brw_performance_query.c
>> +++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
>> @@ -227,6 +227,8 @@ brw_perf_query(struct gl_perf_query_object *o)
>>   
>>   #define MI_RPC_BO_SIZE              4096
>>   #define MI_RPC_BO_END_OFFSET_BYTES  (MI_RPC_BO_SIZE / 2)
>> +#define MI_FREQ_START_OFFSET_BYTES  (3072)
>> +#define MI_FREQ_END_OFFSET_BYTES    (3076)
> Why these?

That's where I store the RPSTAT copy (before/after the workload).

>
>>   /******************************************************************************/
>>   
>> @@ -1150,6 +1152,9 @@ brw_begin_perf_query(struct gl_context *ctx,
>>         /* Take a starting OA counter snapshot. */
>>         brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo, 0,
>>                                             obj->oa.begin_report_id);
>> +      brw_store_register_mem32(brw, obj->oa.bo, GEN6_RPSTAT1,
>> +                               MI_FREQ_START_OFFSET_BYTES);
>> +
>>         ++brw->perfquery.n_active_oa_queries;
>>   
>>         /* No already-buffered samples can possibly be associated with this query
>> @@ -1233,6 +1238,8 @@ brw_end_perf_query(struct gl_context *ctx,
>>          */
>>         if (!obj->oa.results_accumulated) {
>>            /* Take an ending OA counter snapshot. */
>> +         brw_store_register_mem32(brw, obj->oa.bo, GEN6_RPSTAT1,
>> +                                  MI_FREQ_END_OFFSET_BYTES);
>>            brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo,
>>                                                MI_RPC_BO_END_OFFSET_BYTES,
>>                                                obj->oa.begin_report_id + 1);
>> @@ -1333,6 +1340,43 @@ brw_is_perf_query_ready(struct gl_context *ctx,
>>      return false;
>>   }
>>   
>> +static void
>> +read_gt_frequency(struct brw_context *brw,
>> +                  struct brw_perf_query_object *obj)
>> +{
>> +   const struct gen_device_info *devinfo = &brw->screen->devinfo;
>> +   uint32_t *start_reg = obj->oa.map + MI_FREQ_START_OFFSET_BYTES,
>> +      *end_reg = obj->oa.map + MI_FREQ_END_OFFSET_BYTES;
>> +
>> +   switch (devinfo->gen) {
>> +   case 7:
>> +   case 8:
>> +      obj->oa.gt_frequency[0] =
>> +         ((start_reg[0] & GEN6_RPSTAT1_CURR_GT_FREQ_MASK) >>
>> +          GEN6_RPSTAT1_CURR_GT_FREQ_SHIFT) * 50ULL;
> You can just do:
>
>    GET_FIELD(start_reg[0], GEN6_RPSTAT1_CURR_GT_FREQ)
>
> instead of shifting and masking.
>
> I think your conversions may be wrong.  In particular, you don't handle
> Gen9LP and Gen9 differently, while in the kernel, GT_PM_INTERVAL_TO_US
> does:
>
>    Gen9 LP:      0.833 -> usec
>    Gen9+ non-LP: 1.33  -> usec
>    other:        1.28  -> usec
>
> #define INTERVAL_1_28_TO_US(interval)  (((interval) << 7) / 100)
> #define INTERVAL_1_33_TO_US(interval)  (((interval) << 2) / 3)
> #define INTERVAL_0_833_TO_US(interval) (((interval) * 5)  / 6)
> #define GT_PM_INTERVAL_TO_US(dev_priv, interval) (INTEL_GEN(dev_priv) >= 9 ? \
>                             (IS_GEN9_LP(dev_priv) ? \
>                             INTERVAL_0_833_TO_US(interval) : \
>                             INTERVAL_1_33_TO_US(interval)) : \
>                             INTERVAL_1_28_TO_US(interval))
>
> I could be mistaken, though.

Actually, the kernel already reads RPSTAT1 and computes the frequency value.
I think the current code is equivalent to what the kernel does on big
cores and on small cores >= gen9.

On Cherryview/Valleyview, we need to read another register to figure out
the multipliers...

So I'll just leave it out for those small-core gens for now.

>
>> +      obj->oa.gt_frequency[1] =
>> +         ((end_reg[0] & GEN6_RPSTAT1_CURR_GT_FREQ_MASK) >>
>> +          GEN6_RPSTAT1_CURR_GT_FREQ_SHIFT) * 50ULL;
>> +      break;
>> +   case 9:
>> +   case 10:
>> +   case 11:
>> +      obj->oa.gt_frequency[0] =
>> +         ((start_reg[0] & GEN9_RPSTAT1_CURR_GT_FREQ_MASK) >>
>> +          GEN9_RPSTAT1_CURR_GT_FREQ_SHIFT) * 100ULL / 6ULL;
>> +      obj->oa.gt_frequency[1] =
>> +         ((end_reg[0] & GEN9_RPSTAT1_CURR_GT_FREQ_MASK) >>
>> +          GEN9_RPSTAT1_CURR_GT_FREQ_SHIFT) * 100ULL / 6ULL;
>> +      break;
>> +   default:
>> +      unreachable("unexpected gen");
>> +   }
>> +
>> +   /* Put the numbers into Hz. */
>> +   obj->oa.gt_frequency[0] *= 1000000ULL;
>> +   obj->oa.gt_frequency[1] *= 1000000ULL;
>> +}
>> +
>>   static int
>>   get_oa_counter_data(struct brw_context *brw,
>>                       struct brw_perf_query_object *obj,
>> @@ -1344,6 +1388,7 @@ get_oa_counter_data(struct brw_context *brw,
>>      int written = 0;
>>   
>>      if (!obj->oa.results_accumulated) {
>> +      read_gt_frequency(brw, obj);
>>         accumulate_oa_reports(brw, obj);
>>         assert(obj->oa.results_accumulated);
>>   
>> diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.h b/src/mesa/drivers/dri/i965/brw_performance_query.h
>> index f62786f7f1c..f8732738b4e 100644
>> --- a/src/mesa/drivers/dri/i965/brw_performance_query.h
>> +++ b/src/mesa/drivers/dri/i965/brw_performance_query.h
>> @@ -113,6 +113,11 @@ struct brw_perf_query_object
>>             * Number of reports accumulated to produce the results.
>>             */
>>            uint32_t reports_accumulated;
>> +
>> +         /**
>> +          * Frequency of the GT at begin and end of the query.
>> +          */
>> +         uint64_t gt_frequency[2];
>>         } oa;
>>   
>>         struct {
>>



More information about the mesa-dev mailing list