[Mesa-dev] [PATCH] i965: Allow a per gen timebase scale factor
Matt Turner
mattst88 at gmail.com
Fri Jan 6 21:23:45 UTC 2017
On Fri, Jan 6, 2017 at 1:17 PM, Kenneth Graunke <kenneth at whitecape.org> wrote:
> From: Robert Bragg <robert at sixbynine.org>
>
> v2: (Ken) Update timebase_scale for platforms past Skylake/Broxton too.
> ---
> src/intel/common/gen_device_info.c | 13 ++++++--
> src/intel/common/gen_device_info.h | 24 ++++++++++++++
> src/mesa/drivers/dri/i965/brw_context.c | 15 +++++++++
> src/mesa/drivers/dri/i965/brw_context.h | 3 ++
> src/mesa/drivers/dri/i965/brw_queryobj.c | 53 ++++++++++++++++++++++++++++---
> src/mesa/drivers/dri/i965/gen6_queryobj.c | 28 +++++-----------
> 6 files changed, 109 insertions(+), 27 deletions(-)
>
> diff --git a/src/intel/common/gen_device_info.c b/src/intel/common/gen_device_info.c
> index 9bf3cd5cc42..209b293e510 100644
> --- a/src/intel/common/gen_device_info.c
> +++ b/src/intel/common/gen_device_info.c
> @@ -36,6 +36,7 @@ static const struct gen_device_info gen_device_info_i965 = {
> .urb = {
> .size = 256,
> },
> + .timebase_scale = 80,
> };
>
> static const struct gen_device_info gen_device_info_g4x = {
> @@ -51,6 +52,7 @@ static const struct gen_device_info gen_device_info_g4x = {
> .urb = {
> .size = 384,
> },
> + .timebase_scale = 80,
> };
>
> static const struct gen_device_info gen_device_info_ilk = {
> @@ -65,6 +67,7 @@ static const struct gen_device_info gen_device_info_ilk = {
> .urb = {
> .size = 1024,
> },
> + .timebase_scale = 80,
> };
>
> static const struct gen_device_info gen_device_info_snb_gt1 = {
> @@ -89,6 +92,7 @@ static const struct gen_device_info gen_device_info_snb_gt1 = {
> [MESA_SHADER_GEOMETRY] = 256,
> },
> },
> + .timebase_scale = 80,
> };
>
> static const struct gen_device_info gen_device_info_snb_gt2 = {
> @@ -113,6 +117,7 @@ static const struct gen_device_info gen_device_info_snb_gt2 = {
> [MESA_SHADER_GEOMETRY] = 256,
> },
> },
> + .timebase_scale = 80,
> };
>
> #define GEN7_FEATURES \
> @@ -121,7 +126,8 @@ static const struct gen_device_info gen_device_info_snb_gt2 = {
> .must_use_separate_stencil = true, \
> .has_llc = true, \
> .has_pln = true, \
> - .has_surface_tile_offset = true
> + .has_surface_tile_offset = true, \
> + .timebase_scale = 80
Trailing comma
>
> static const struct gen_device_info gen_device_info_ivb_gt1 = {
> GEN7_FEATURES, .is_ivybridge = true, .gt = 1,
> @@ -287,7 +293,8 @@ static const struct gen_device_info gen_device_info_hsw_gt3 = {
> .max_tcs_threads = 504, \
> .max_tes_threads = 504, \
> .max_gs_threads = 504, \
> - .max_wm_threads = 384
> + .max_wm_threads = 384, \
> + .timebase_scale = 80
Trailing comma
>
> static const struct gen_device_info gen_device_info_bdw_gt1 = {
> GEN8_FEATURES, .gt = 1,
> @@ -385,6 +392,7 @@ static const struct gen_device_info gen_device_info_chv = {
> .max_tcs_threads = 336, \
> .max_tes_threads = 336, \
> .max_cs_threads = 56, \
> + .timebase_scale = 1000000000.0 / 12000000.0, \
> .urb = { \
> .size = 384, \
> .min_entries = { \
> @@ -410,6 +418,7 @@ static const struct gen_device_info gen_device_info_chv = {
> .max_tes_threads = 112, \
> .max_gs_threads = 112, \
> .max_cs_threads = 6 * 6, \
> + .timebase_scale = 1000000000.0 / 19200123.0, \
> .urb = { \
> .size = 192, \
> .min_entries = { \
> diff --git a/src/intel/common/gen_device_info.h b/src/intel/common/gen_device_info.h
> index f0e8750d0ea..80676d0e003 100644
> --- a/src/intel/common/gen_device_info.h
> +++ b/src/intel/common/gen_device_info.h
> @@ -147,6 +147,30 @@ struct gen_device_info
> */
> unsigned max_entries[4];
> } urb;
> +
> + /**
> + * For the longest time the timestamp frequency for Gen's timestamp counter
> + * could be assumed to be 12.5MHz, where the least significant bit neatly
> + * corresponded to 80 nanoseconds.
> + *
> + * Since Gen9 the numbers aren't so round, with a frequency of 12MHz for
> + * SKL (or scale factor of 83.33333333) and a frequency of 19200123Hz for
> + * BXT.
> + *
> + * For simplicity, to fit with the current code that scales by a single
> + * constant to map from raw timestamps to nanoseconds, we now do the
> + * conversion in floating point instead of integer arithmetic.
> + *
> + * In general it's probably worth noting that the documented constants we
> + * have for the per-platform timestamp frequencies aren't perfect and
> + * shouldn't be trusted for scaling and comparing timestamps with a large
> + * delta.
> + *
> + * E.g. with crude testing on my system using the 'correct' scale factor I'm
> + * seeing a drift of ~2 milliseconds per second.
> + */
> + double timebase_scale;
> +
> /** @} */
> };
>
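Not a blocker, but since the new constants are a little opaque: the Gen9 scale factors work out to ~83.333 ns/tick for SKL and ~52.083 ns/tick for BXT. A trivial standalone sanity check of the arithmetic (my own sketch, not patch code):

   #include <inttypes.h>
   #include <stdio.h>

   int main(void)
   {
      /* SKL: 12 MHz timestamp -> 1e9 / 12e6 = ~83.333 ns per tick */
      const double skl_scale = 1000000000.0 / 12000000.0;
      /* BXT: documented 19200123 Hz -> ~52.083 ns per tick */
      const double bxt_scale = 1000000000.0 / 19200123.0;
      /* e.g. 1,000,000 raw ticks on SKL is ~83.33 ms */
      const uint64_t skl_ns = (uint64_t)(1000000.0 * skl_scale);

      printf("%.3f ns, %.3f ns, %" PRIu64 " ns\n", skl_scale, bxt_scale, skl_ns);
      return 0;
   }

And a drift of ~2 ms per second with the 'correct' factor is roughly a 0.2% frequency error, so the caveat in the comment about not trusting these constants over large deltas seems well warranted.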
> diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
> index f939463539c..b071d08e95d 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.c
> +++ b/src/mesa/drivers/dri/i965/brw_context.c
> @@ -524,6 +524,21 @@ brw_initialize_context_constants(struct brw_context *brw)
> ctx->Const.MaxCombinedShaderOutputResources =
> MAX_IMAGE_UNITS + BRW_MAX_DRAW_BUFFERS;
>
> + /* The timestamp register we can read for glGetTimestamp() is
> + * sometimes only 32 bits, before scaling to nanoseconds (depending
> + * on the kernel).
> + *
> + * Once scaled to nanoseconds the timestamp would roll over at a
> + * non-power-of-two, so an application couldn't use
> + * GL_QUERY_COUNTER_BITS to handle rollover correctly. Instead, we
> + * report 36 bits and truncate at that (rolling over 5 times as
> + * often as the HW counter), and when the 32-bit counter rolls
> + * over, it happens to also be at a rollover in the reported value
> + * from near (1<<36) to 0.
> + *
> + * The low 32 bits roll over in ~343 seconds. Our 36-bit result
> + * rolls over every ~69 seconds.
> + */
> ctx->Const.QueryCounterBits.Timestamp = 36;
>
> ctx->Const.MaxTextureCoordUnits = 8; /* Mesa limit */
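Double-checking the numbers in the moved comment, for the 80 ns case:

   2^32 ticks * 80 ns/tick = 343,597,383,680 ns ~= 343.6 s
   2^36 ns                 =  68,719,476,736 ns ~=  68.7 s

and since 80 * 2^32 == 5 * 2^36, a rollover of the clipped 32-bit HW value lands exactly on a rollover of the 36-bit reported value, which is the "5 times as often" property the comment relies on.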
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index ff3f861a147..b6ac1a2e67f 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -1300,6 +1300,9 @@ void brw_emit_query_begin(struct brw_context *brw);
> void brw_emit_query_end(struct brw_context *brw);
> void brw_query_counter(struct gl_context *ctx, struct gl_query_object *q);
> bool brw_is_query_pipelined(struct brw_query_object *query);
> +uint64_t brw_timebase_scale(struct brw_context *brw, uint64_t gpu_timestamp);
> +uint64_t brw_raw_timestamp_delta(struct brw_context *brw,
> + uint64_t time0, uint64_t time1);
>
> /** gen6_queryobj.c */
> void gen6_init_queryobj_functions(struct dd_function_table *functions);
> diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
> index dda17de7157..ebd5776d0aa 100644
> --- a/src/mesa/drivers/dri/i965/brw_queryobj.c
> +++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
> @@ -42,6 +42,37 @@
> #include "brw_state.h"
> #include "intel_batchbuffer.h"
>
> +uint64_t
> +brw_timebase_scale(struct brw_context *brw, uint64_t gpu_timestamp)
> +{
> + const struct gen_device_info *devinfo = &brw->screen->devinfo;
> +
> + return (double)gpu_timestamp * devinfo->timebase_scale;
> +}
> +
> +/* As best we currently know, the Gen HW timestamps are 36 bits across
> + * all platforms, which we need to account for when calculating a
> + * delta to measure elapsed time.
> + *
> + * The timestamps read via glGetTimestamp() / brw_get_timestamp() sometimes
> + * only have 32 bits due to a kernel bug, so in that case we make sure to
> + * treat all raw timestamps as 32 bits so they overflow consistently and
> + * remain comparable.
> + */
> +uint64_t
> +brw_raw_timestamp_delta(struct brw_context *brw, uint64_t time0, uint64_t time1)
> +{
> + if (brw->screen->hw_has_timestamp == 2) {
> + /* Kernel clips timestamps to 32 bits in this case */
> + return (uint32_t)time1 - (uint32_t)time0;
> + } else {
> + if (time0 > time1)
> + return (1ULL << 36) + time1 - time0;
> + else
> + return time1 - time0;
Braces around the nested control flow, please; suggested form just below.
> + }
> +}
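i.e. something like this (same logic as the hunk above, just with braces on both arms of the nested if/else):

   uint64_t
   brw_raw_timestamp_delta(struct brw_context *brw, uint64_t time0, uint64_t time1)
   {
      if (brw->screen->hw_has_timestamp == 2) {
         /* Kernel clips timestamps to 32 bits in this case */
         return (uint32_t)time1 - (uint32_t)time0;
      } else {
         if (time0 > time1) {
            /* The 36-bit raw counter wrapped between the two reads */
            return (1ULL << 36) + time1 - time0;
         } else {
            return time1 - time0;
         }
      }
   }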
> +
> /**
> * Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer.
> */
> @@ -117,12 +148,18 @@ brw_queryobj_get_results(struct gl_context *ctx,
> /* The query BO contains the starting and ending timestamps.
> * Subtract the two and convert to nanoseconds.
> */
> - query->Base.Result += 1000 * ((results[1] >> 32) - (results[0] >> 32));
> + query->Base.Result = brw_raw_timestamp_delta(brw, results[0], results[1]);
> + query->Base.Result = brw_timebase_scale(brw, query->Base.Result);
> break;
>
> case GL_TIMESTAMP:
> /* The query BO contains a single timestamp value in results[0]. */
> - query->Base.Result = 1000 * (results[0] >> 32);
> + query->Base.Result = brw_timebase_scale(brw, results[0]);
> +
> + /* Ensure the scaled timestamp overflows according to
> + * GL_QUERY_COUNTER_BITS
> + */
> + query->Base.Result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1;
> break;
>
> case GL_SAMPLES_PASSED_ARB:
> @@ -508,9 +545,15 @@ brw_get_timestamp(struct gl_context *ctx)
> break;
> }
>
> - /* See logic in brw_queryobj_get_results() */
> - result *= 80;
> - result &= (1ull << 36) - 1;
> + /* Scale to nanosecond units */
> + result = brw_timebase_scale(brw, result);
> +
> + /* Ensure the scaled timestamp overflows according to
> + * GL_QUERY_COUNTER_BITS. Technically this isn't required if
> + * querying GL_TIMESTAMP via glGetInteger but it seems best to keep
> + * QueryObject and GetInteger timestamps consistent.
> + */
> + result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1;
> return result;
> }
>
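For what it's worth, masking with QueryCounterBits like this means the reported value wraps at exactly the advertised width, which is what lets applications handle rollover. A trivial standalone illustration (my own sketch, not patch code):

   #include <inttypes.h>
   #include <stdio.h>

   int main(void)
   {
      const unsigned timestamp_bits = 36;  /* ctx->Const.QueryCounterBits.Timestamp */
      const uint64_t mask = (1ull << timestamp_bits) - 1;

      /* A scaled timestamp just past the 36-bit boundary wraps back near zero,
       * which is exactly the behaviour GL_QUERY_COUNTER_BITS advertises. */
      const uint64_t scaled = (1ull << 36) + 123;

      printf("mask 0x%" PRIx64 ", reported %" PRIu64 "\n", mask, scaled & mask);
      return 0;
   }

Keeping the same mask here and in the query-object paths also keeps the two timestamp sources mutually comparable, which I agree is worth the extra AND.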
> diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
> index bbd3c44fb0f..3f6edf1da1a 100644
> --- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
> +++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
> @@ -171,30 +171,18 @@ gen6_queryobj_get_results(struct gl_context *ctx,
> /* The query BO contains the starting and ending timestamps.
> * Subtract the two and convert to nanoseconds.
> */
> - query->Base.Result += 80 * (results[1] - results[0]);
> + query->Base.Result = brw_raw_timestamp_delta(brw, results[0], results[1]);
> + query->Base.Result = brw_timebase_scale(brw, query->Base.Result);
> break;
>
> case GL_TIMESTAMP:
> - /* Our timer is a clock that increments every 80ns (regardless of
> - * other clock scaling in the system). The timestamp register we can
> - * read for glGetTimestamp() masks out the top 32 bits, so we do that
> - * here too to let the two counters be compared against each other.
> - *
> - * If we just multiplied that 32 bits of data by 80, it would roll
> - * over at a non-power-of-two, so an application couldn't use
> - * GL_QUERY_COUNTER_BITS to handle rollover correctly. Instead, we
> - * report 36 bits and truncate at that (rolling over 5 times as often
> - * as the HW counter), and when the 32-bit counter rolls over, it
> - * happens to also be at a rollover in the reported value from near
> - * (1<<36) to 0.
> - *
> - * The low 32 bits rolls over in ~343 seconds. Our 36-bit result
> - * rolls over every ~69 seconds.
> - *
> - * The query BO contains a single timestamp value in results[0].
> + /* The query BO contains a single timestamp value in results[0]. */
> + query->Base.Result = brw_timebase_scale(brw, results[0]);
> +
> + /* Ensure the scaled timestamp overflows according to
> + * GL_QUERY_COUNTER_BITS
> */
> - query->Base.Result = 80 * (results[0] & 0xffffffff);
> - query->Base.Result &= (1ull << 36) - 1;
> + query->Base.Result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1;
> break;
>
> case GL_SAMPLES_PASSED_ARB:
> --
> 2.11.0
Looks good to me.
Reviewed-by: Matt Turner <mattst88 at gmail.com>