[Mesa-dev] [PATCH] i965: Allow a per gen timebase scale factor
Robert Bragg
robert at sixbynine.org
Fri Oct 28 00:44:34 UTC 2016
typo fixed (thanks)
--- >8 ---
Prior to Skylake the Gen HW timestamps were driven by a 12.5MHz clock
with the convenient property of being able to scale by an integer (80)
to nanosecond units.
For Skylake the frequency is 12MHz or a scale factor of 83.333333
This updates gen_device_info to track a floating point timebase_scale
factor and makes corresponding _queryobj.c changes to no longer assume a
scale factor of 80 works across all gens.
Although the gen6_ code could have been been left alone, the changes
keep the code more comparable, and it now shares a few utility functions
for scaling raw timestamps and calculating deltas. The utility for
calculating deltas takes into account 32 or 36bit overflow depending on
the current kernel version.
Note: this leaves the timestamp handling of ARB_query_buffer_object
untouched, which continues to use an incorrect scale of 80 on Skylake
for now. This is more awkward to solve since the scaling is currently
done using a very limited uint64 ALU available to the command parser
that doesn't support multiply or divide where it's already taking a
large number of instructions just to effectively multiple by 80.
This fixes piglit arb_timer_query-timestamp-get on Skylake
Signed-off-by: Robert Bragg <robert at sixbynine.org>
---
src/intel/common/gen_device_info.c | 21 +++++++++---
src/intel/common/gen_device_info.h | 24 ++++++++++++++
src/mesa/drivers/dri/i965/brw_context.c | 15 +++++++++
src/mesa/drivers/dri/i965/brw_context.h | 3 ++
src/mesa/drivers/dri/i965/brw_queryobj.c | 53 ++++++++++++++++++++++++++++---
src/mesa/drivers/dri/i965/gen6_queryobj.c | 28 +++++-----------
6 files changed, 115 insertions(+), 29 deletions(-)
diff --git a/src/intel/common/gen_device_info.c b/src/intel/common/gen_device_info.c
index 30df0b2..20594b0 100644
--- a/src/intel/common/gen_device_info.c
+++ b/src/intel/common/gen_device_info.c
@@ -35,6 +35,7 @@ static const struct gen_device_info gen_device_info_i965 = {
.urb = {
.size = 256,
},
+ .timebase_scale = 80,
};
static const struct gen_device_info gen_device_info_g4x = {
@@ -50,6 +51,7 @@ static const struct gen_device_info gen_device_info_g4x = {
.urb = {
.size = 384,
},
+ .timebase_scale = 80,
};
static const struct gen_device_info gen_device_info_ilk = {
@@ -64,6 +66,7 @@ static const struct gen_device_info gen_device_info_ilk = {
.urb = {
.size = 1024,
},
+ .timebase_scale = 80,
};
static const struct gen_device_info gen_device_info_snb_gt1 = {
@@ -84,6 +87,7 @@ static const struct gen_device_info gen_device_info_snb_gt1 = {
.max_vs_entries = 256,
.max_gs_entries = 256,
},
+ .timebase_scale = 80,
};
static const struct gen_device_info gen_device_info_snb_gt2 = {
@@ -104,6 +108,7 @@ static const struct gen_device_info gen_device_info_snb_gt2 = {
.max_vs_entries = 256,
.max_gs_entries = 256,
},
+ .timebase_scale = 80,
};
#define GEN7_FEATURES \
@@ -112,7 +117,8 @@ static const struct gen_device_info gen_device_info_snb_gt2 = {
.must_use_separate_stencil = true, \
.has_llc = true, \
.has_pln = true, \
- .has_surface_tile_offset = true
+ .has_surface_tile_offset = true, \
+ .timebase_scale = 80
static const struct gen_device_info gen_device_info_ivb_gt1 = {
GEN7_FEATURES, .is_ivybridge = true, .gt = 1,
@@ -254,7 +260,8 @@ static const struct gen_device_info gen_device_info_hsw_gt3 = {
.max_tcs_threads = 504, \
.max_tes_threads = 504, \
.max_gs_threads = 504, \
- .max_wm_threads = 384
+ .max_wm_threads = 384, \
+ .timebase_scale = 80
static const struct gen_device_info gen_device_info_bdw_gt1 = {
GEN8_FEATURES, .gt = 1,
@@ -351,16 +358,19 @@ static const struct gen_device_info gen_device_info_skl_gt1 = {
GEN9_FEATURES, .gt = 1,
.num_slices = 1,
.urb.size = 192,
+ .timebase_scale = 1000000000.0 / 12000000.0,
};
static const struct gen_device_info gen_device_info_skl_gt2 = {
GEN9_FEATURES, .gt = 2,
.num_slices = 1,
+ .timebase_scale = 1000000000.0 / 12000000.0,
};
static const struct gen_device_info gen_device_info_skl_gt3 = {
GEN9_FEATURES, .gt = 3,
.num_slices = 2,
+ .timebase_scale = 1000000000.0 / 12000000.0,
};
static const struct gen_device_info gen_device_info_skl_gt4 = {
@@ -375,6 +385,7 @@ static const struct gen_device_info gen_device_info_skl_gt4 = {
* only 1008KB of this will be used."
*/
.urb.size = 1008 / 3,
+ .timebase_scale = 1000000000.0 / 12000000.0,
};
static const struct gen_device_info gen_device_info_bxt = {
@@ -397,7 +408,8 @@ static const struct gen_device_info gen_device_info_bxt = {
.max_tcs_entries = 256,
.max_tes_entries = 416,
.max_gs_entries = 256,
- }
+ },
+ .timebase_scale = 1000000000.0 / 19200123.0,
};
static const struct gen_device_info gen_device_info_bxt_2x6 = {
@@ -420,7 +432,8 @@ static const struct gen_device_info gen_device_info_bxt_2x6 = {
.max_tcs_entries = 128,
.max_tes_entries = 208,
.max_gs_entries = 128,
- }
+ },
+ .timebase_scale = 1000000000.0 / 19200123.0,
};
/*
* Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+.
diff --git a/src/intel/common/gen_device_info.h b/src/intel/common/gen_device_info.h
index 10324e6..d17931e 100644
--- a/src/intel/common/gen_device_info.h
+++ b/src/intel/common/gen_device_info.h
@@ -142,6 +142,30 @@ struct gen_device_info
unsigned max_tes_entries;
unsigned max_gs_entries;
} urb;
+
+ /**
+ * For the longest time the timestamp frequency for Gen's timestamp counter
+ * could be assumed to be 12.5MHz, where the least significant bit neatly
+ * corresponded to 80 nanoseconds.
+ *
+ * Since Gen9 the numbers aren't so round, with a a frequency of 12MHz for
+ * SKL (or scale factor of 83.33333333) and a frequency of 19200123Hz for
+ * BXT.
+ *
+ * For simplicty to fit with the current code scaling by a single constant
+ * to map from raw timestamps to nanoseconds we now do the conversion in
+ * floating point instead of integer arithmetic.
+ *
+ * In general it's probably worth noting that the documented constants we
+ * have for the per-platform timestamp frequencies aren't perfect and
+ * shouldn't be trusted for scaling and comparing timestamps with a large
+ * delta.
+ *
+ * E.g. with crude testing on my system using the 'correct' scale factor I'm
+ * seeing a drift of ~2 milliseconds per second.
+ */
+ double timebase_scale;
+
/** @} */
};
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index af8ed2c..2f71f52 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -519,6 +519,21 @@ brw_initialize_context_constants(struct brw_context *brw)
ctx->Const.MaxCombinedShaderOutputResources =
MAX_IMAGE_UNITS + BRW_MAX_DRAW_BUFFERS;
+ /* The timestamp register we can read for glGetTimestamp() is
+ * sometimes only 32 bits, before scaling to nanoseconds (depending
+ * on kernel).
+ *
+ * Once scaled to nanoseconds the timestamp would roll over at a
+ * non-power-of-two, so an application couldn't use
+ * GL_QUERY_COUNTER_BITS to handle rollover correctly. Instead, we
+ * report 36 bits and truncate at that (rolling over 5 times as
+ * often as the HW counter), and when the 32-bit counter rolls
+ * over, it happens to also be at a rollover in the reported value
+ * from near (1<<36) to 0.
+ *
+ * The low 32 bits rolls over in ~343 seconds. Our 36-bit result
+ * rolls over every ~69 seconds.
+ */
ctx->Const.QueryCounterBits.Timestamp = 36;
ctx->Const.MaxTextureCoordUnits = 8; /* Mesa limit */
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 5c64c2f..f2dcff5 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1349,6 +1349,9 @@ void brw_emit_query_begin(struct brw_context *brw);
void brw_emit_query_end(struct brw_context *brw);
void brw_query_counter(struct gl_context *ctx, struct gl_query_object *q);
bool brw_is_query_pipelined(struct brw_query_object *query);
+uint64_t brw_timebase_scale(struct brw_context *brw, uint64_t gpu_timestamp);
+uint64_t brw_raw_timestamp_delta(struct brw_context *brw,
+ uint64_t time0, uint64_t time1);
/** gen6_queryobj.c */
void gen6_init_queryobj_functions(struct dd_function_table *functions);
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index dda17de..ebd5776 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -42,6 +42,37 @@
#include "brw_state.h"
#include "intel_batchbuffer.h"
+uint64_t
+brw_timebase_scale(struct brw_context *brw, uint64_t gpu_timestamp)
+{
+ const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+ return (double)gpu_timestamp * devinfo->timebase_scale;
+}
+
+/* As best we know currently, the Gen HW timestamps are 36bits across
+ * all platforms, which we need to account for when calculating a
+ * delta to measure elapsed time.
+ *
+ * The timestamps read via glGetTimestamp() / brw_get_timestamp() sometimes
+ * only have 32bits due to a kernel bug and so in that case we make sure to
+ * treat all raw timestamps as 32bits so they overflow consistently and remain
+ * comparable.
+ */
+uint64_t
+brw_raw_timestamp_delta(struct brw_context *brw, uint64_t time0, uint64_t time1)
+{
+ if (brw->screen->hw_has_timestamp == 2) {
+ /* Kernel clips timestamps to 32bits in this case */
+ return (uint32_t)time1 - (uint32_t)time0;
+ } else {
+ if (time0 > time1)
+ return (1ULL << 36) + time1 - time0;
+ else
+ return time1 - time0;
+ }
+}
+
/**
* Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer.
*/
@@ -117,12 +148,18 @@ brw_queryobj_get_results(struct gl_context *ctx,
/* The query BO contains the starting and ending timestamps.
* Subtract the two and convert to nanoseconds.
*/
- query->Base.Result += 1000 * ((results[1] >> 32) - (results[0] >> 32));
+ query->Base.Result = brw_raw_timestamp_delta(brw, results[0], results[1]);
+ query->Base.Result = brw_timebase_scale(brw, query->Base.Result);
break;
case GL_TIMESTAMP:
/* The query BO contains a single timestamp value in results[0]. */
- query->Base.Result = 1000 * (results[0] >> 32);
+ query->Base.Result = brw_timebase_scale(brw, results[0]);
+
+ /* Ensure the scaled timestamp overflows according to
+ * GL_QUERY_COUNTER_BITS
+ */
+ query->Base.Result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1;
break;
case GL_SAMPLES_PASSED_ARB:
@@ -508,9 +545,15 @@ brw_get_timestamp(struct gl_context *ctx)
break;
}
- /* See logic in brw_queryobj_get_results() */
- result *= 80;
- result &= (1ull << 36) - 1;
+ /* Scale to nanosecond units */
+ result = brw_timebase_scale(brw, result);
+
+ /* Ensure the scaled timestamp overflows according to
+ * GL_QUERY_COUNTER_BITS. Technically this isn't required if
+ * querying GL_TIMESTAMP via glGetInteger but it seems best to keep
+ * QueryObject and GetInteger timestamps consistent.
+ */
+ result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1;
return result;
}
diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
index bbd3c44..3f6edf1 100644
--- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
+++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
@@ -171,30 +171,18 @@ gen6_queryobj_get_results(struct gl_context *ctx,
/* The query BO contains the starting and ending timestamps.
* Subtract the two and convert to nanoseconds.
*/
- query->Base.Result += 80 * (results[1] - results[0]);
+ query->Base.Result = brw_raw_timestamp_delta(brw, results[0], results[1]);
+ query->Base.Result = brw_timebase_scale(brw, query->Base.Result);
break;
case GL_TIMESTAMP:
- /* Our timer is a clock that increments every 80ns (regardless of
- * other clock scaling in the system). The timestamp register we can
- * read for glGetTimestamp() masks out the top 32 bits, so we do that
- * here too to let the two counters be compared against each other.
- *
- * If we just multiplied that 32 bits of data by 80, it would roll
- * over at a non-power-of-two, so an application couldn't use
- * GL_QUERY_COUNTER_BITS to handle rollover correctly. Instead, we
- * report 36 bits and truncate at that (rolling over 5 times as often
- * as the HW counter), and when the 32-bit counter rolls over, it
- * happens to also be at a rollover in the reported value from near
- * (1<<36) to 0.
- *
- * The low 32 bits rolls over in ~343 seconds. Our 36-bit result
- * rolls over every ~69 seconds.
- *
- * The query BO contains a single timestamp value in results[0].
+ /* The query BO contains a single timestamp value in results[0]. */
+ query->Base.Result = brw_timebase_scale(brw, results[0]);
+
+ /* Ensure the scaled timestamp overflows according to
+ * GL_QUERY_COUNTER_BITS
*/
- query->Base.Result = 80 * (results[0] & 0xffffffff);
- query->Base.Result &= (1ull << 36) - 1;
+ query->Base.Result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1;
break;
case GL_SAMPLES_PASSED_ARB:
--
2.10.1
More information about the mesa-dev
mailing list