Mesa (main): intel/ds: drop timestamp correlation code
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Mon Nov 22 12:17:08 UTC 2021
Module: Mesa
Branch: main
Commit: 4ef6698a265fd900da80d80fd42044e642d1b641
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=4ef6698a265fd900da80d80fd42044e642d1b641
Author: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
Date: Mon Nov 8 00:09:08 2021 +0200
intel/ds: drop timestamp correlation code
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
Reviewed-by: Antonio Caggiano <antonio.caggiano at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13831>
---
src/intel/ds/intel_pps_driver.cc | 202 ++++++++++++++-------------------------
src/intel/ds/intel_pps_driver.h | 45 ++++-----
2 files changed, 88 insertions(+), 159 deletions(-)
diff --git a/src/intel/ds/intel_pps_driver.cc b/src/intel/ds/intel_pps_driver.cc
index 6966db8fdc1..031097e22df 100644
--- a/src/intel/ds/intel_pps_driver.cc
+++ b/src/intel/ds/intel_pps_driver.cc
@@ -37,6 +37,45 @@ uint64_t IntelDriver::get_min_sampling_period_ns()
return (2.f * perf->devinfo.timestamp_frequency) / 1000000000ull;
}
+uint64_t scale_gpu_timestamp(uint64_t ts, uint64_t timestamp_frequency)
+{
+ // Try to avoid going over the 64bits when doing the scaling
+ uint64_t lower_ts = ts >> 6;
+ uint64_t scaled_ts = lower_ts * 1000000000ull / timestamp_frequency;
+ scaled_ts <<= 6;
+ scaled_ts += (ts & 0x3f) * 1000000000ull / timestamp_frequency;
+ return scaled_ts;
+}
+
+uint64_t read_gpu_timestamp(int drm_fd)
+{
+ drm_i915_reg_read reg_read = {};
+ const uint64_t render_ring_timestamp = 0x2358;
+ reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA;
+
+ if (perf_ioctl(drm_fd, DRM_IOCTL_I915_REG_READ, &reg_read) < 0) {
+ PPS_LOG_ERROR("Unable to read GPU clock");
+ return 0;
+ }
+
+ return reg_read.val;
+
+}
+
+IntelDriver::IntelDriver()
+{
+ /* Note: clock_id's below 128 are reserved.. for custom clock sources,
+ * using the hash of a namespaced string is the recommended approach.
+ * See: https://perfetto.dev/docs/concepts/clock-sync
+ */
+ this->clock_id =
+ _mesa_hash_string("org.freedesktop.mesa.intel") | 0x80000000;
+}
+
+IntelDriver::~IntelDriver()
+{
+}
+
void IntelDriver::enable_counter(uint32_t counter_id)
{
auto &counter = counters[counter_id];
@@ -75,71 +114,6 @@ void IntelDriver::enable_all_counters()
}
}
-static uint64_t timespec_diff(timespec *begin, timespec *end)
-{
- return 1000000000ull * (end->tv_sec - begin->tv_sec) + end->tv_nsec - begin->tv_nsec;
-}
-
-/// @brief This function tries to correlate CPU time with GPU time
-std::optional<TimestampCorrelation> IntelDriver::query_correlation_timestamps() const
-{
- TimestampCorrelation corr = {};
-
- clock_t correlation_clock_id = CLOCK_BOOTTIME;
-
- drm_i915_reg_read reg_read = {};
- const uint64_t render_ring_timestamp = 0x2358;
- reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA;
-
- constexpr size_t attempt_count = 3;
- struct {
- timespec cpu_ts_begin;
- timespec cpu_ts_end;
- uint64_t gpu_ts;
- } attempts[attempt_count] = {};
-
- uint32_t best = 0;
-
- // Gather 3 correlations
- for (uint32_t i = 0; i < attempt_count; i++) {
- clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_begin);
- if (perf_ioctl(drm_device.fd, DRM_IOCTL_I915_REG_READ, &reg_read) < 0) {
- return std::nullopt;
- }
- clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_end);
-
- attempts[i].gpu_ts = reg_read.val;
- }
-
- // Now select the best
- for (uint32_t i = 1; i < attempt_count; i++) {
- if (timespec_diff(&attempts[i].cpu_ts_begin, &attempts[i].cpu_ts_end) <
- timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end)) {
- best = i;
- }
- }
-
- corr.cpu_timestamp =
- (attempts[best].cpu_ts_begin.tv_sec * 1000000000ull + attempts[best].cpu_ts_begin.tv_nsec) +
- timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end) / 2;
- corr.gpu_timestamp = attempts[best].gpu_ts;
-
- return corr;
-}
-
-void IntelDriver::get_new_correlation()
-{
- // Rotate left correlations by one position so to make space at the end
- std::rotate(correlations.begin(), correlations.begin() + 1, correlations.end());
-
- // Then we overwrite the last correlation with a new one
- if (auto corr = query_correlation_timestamps()) {
- correlations.back() = *corr;
- } else {
- PPS_LOG_FATAL("Failed to get correlation timestamps");
- }
-}
-
bool IntelDriver::init_perfcnt()
{
assert(!perf && "Intel perf should not be initialized at this point");
@@ -201,41 +175,11 @@ void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)
{
this->sampling_period_ns = sampling_period_ns;
- // Fill correlations with an initial one
- if (auto corr = query_correlation_timestamps()) {
- correlations.fill(*corr);
- } else {
- PPS_LOG_FATAL("Failed to get correlation timestamps");
- }
-
if (!perf->open(sampling_period_ns)) {
PPS_LOG_FATAL("Failed to open intel perf");
}
}
-/// @brief Transforms the GPU timestop into a CPU timestamp equivalent
-uint64_t IntelDriver::correlate_gpu_timestamp(const uint32_t gpu_ts)
-{
- auto &corr_a = correlations[0];
- auto &corr_b = correlations[correlations.size() - 1];
-
- // A correlation timestamp has 36 bits, so get the first 32 to make it work with gpu_ts
- uint64_t mask = 0xffffffff;
- uint32_t corr_a_gpu_ts = corr_a.gpu_timestamp & mask;
- uint32_t corr_b_gpu_ts = corr_b.gpu_timestamp & mask;
-
- // Make sure it is within the interval [a,b)
- assert(gpu_ts >= corr_a_gpu_ts && "GPU TS < Corr a");
- assert(gpu_ts < corr_b_gpu_ts && "GPU TS >= Corr b");
-
- uint32_t gpu_delta = gpu_ts - corr_a_gpu_ts;
- // Factor to convert gpu time to cpu time
- double gpu_to_cpu = (corr_b.cpu_timestamp - corr_a.cpu_timestamp) /
- double(corr_b.gpu_timestamp - corr_a.gpu_timestamp);
- uint64_t cpu_delta = gpu_delta * gpu_to_cpu;
- return corr_a.cpu_timestamp + cpu_delta;
-}
-
void IntelDriver::disable_perfcnt()
{
perf = nullptr;
@@ -244,12 +188,6 @@ void IntelDriver::disable_perfcnt()
enabled_counters.clear();
}
-struct Report {
- uint32_t version;
- uint32_t timestamp;
- uint32_t id;
-};
-
/// @brief Some perf record durations can be really short
/// @return True if the duration is at least close to the sampling period
static bool close_enough(uint64_t duration, uint64_t sampling_period)
@@ -265,12 +203,12 @@ std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_
records.reserve(128);
PerfRecord record;
- record.reserve(512);
+ record.data.reserve(512);
const uint8_t *iter = data.data();
const uint8_t *end = iter + byte_count;
- uint64_t prev_cpu_timestamp = last_cpu_timestamp;
+ uint64_t prev_gpu_timestamp = last_gpu_timestamp;
while (iter < end) {
// Iterate a record at a time
@@ -278,18 +216,32 @@ std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_
if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
// Report is next to the header
- auto report = reinterpret_cast<const Report *>(header + 1);
- auto cpu_timestamp = correlate_gpu_timestamp(report->timestamp);
- auto duration = cpu_timestamp - prev_cpu_timestamp;
+ const uint32_t *report = reinterpret_cast<const uint32_t *>(header + 1);
+ uint64_t gpu_timestamp_ldw =
+ intel_perf_report_timestamp(&perf->query.value(), report);
+
+ /* Our HW only provides us with the lower 32 bits of the 36bits
+ * timestamp counter value. If we haven't captured the top bits yet,
+ * do it now. If we see a roll over the lower 32bits capture it
+ * again.
+ */
+ if (gpu_timestamp_udw == 0 || (gpu_timestamp_udw + gpu_timestamp_ldw) < last_gpu_timestamp)
+ gpu_timestamp_udw = read_gpu_timestamp(drm_device.fd) & 0xffffffff00000000;
+
+ uint64_t gpu_timestamp = gpu_timestamp_udw + gpu_timestamp_ldw;
+
+ auto duration = scale_gpu_timestamp(gpu_timestamp - prev_gpu_timestamp,
+ perf->devinfo.timestamp_frequency);
// Skip perf-records that are too short by checking
// the distance between last report and this one
if (close_enough(duration, sampling_period_ns)) {
- prev_cpu_timestamp = cpu_timestamp;
+ prev_gpu_timestamp = gpu_timestamp;
// Add the new record to the list
- record.resize(header->size); // Possibly 264?
- memcpy(record.data(), iter, header->size);
+ record.timestamp = gpu_timestamp;
+ record.data.resize(header->size); // Possibly 264?
+ memcpy(record.data.data(), iter, header->size);
records.emplace_back(record);
}
}
@@ -329,8 +281,6 @@ bool IntelDriver::dump_perfcnt()
read_data_from_metric_set();
- get_new_correlation();
-
auto new_records = parse_perf_records(metric_buffer, total_bytes_read);
if (new_records.empty()) {
PPS_LOG("No new records");
@@ -353,7 +303,7 @@ bool IntelDriver::dump_perfcnt()
return true;
}
-uint32_t IntelDriver::gpu_next()
+uint64_t IntelDriver::gpu_next()
{
if (records.size() < 2) {
// Not enough records to accumulate
@@ -361,8 +311,8 @@ uint32_t IntelDriver::gpu_next()
}
// Get first and second
- auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data());
- auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data());
+ auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data.data());
+ auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data.data());
intel_perf_query_result_accumulate_fields(&result,
&perf->query.value(),
@@ -372,42 +322,30 @@ uint32_t IntelDriver::gpu_next()
false /* no_oa_accumulate */);
// Get last timestamp
- auto report_b = reinterpret_cast<const Report *>(record_b + 1);
- auto gpu_timestamp = report_b->timestamp;
+ auto gpu_timestamp = records[1].timestamp;
// Consume first record
records.erase(std::begin(records), std::begin(records) + 1);
- return gpu_timestamp;
-}
-
-uint64_t IntelDriver::cpu_next()
-{
- if (auto gpu_timestamp = gpu_next()) {
- auto cpu_timestamp = correlate_gpu_timestamp(gpu_timestamp);
-
- last_cpu_timestamp = cpu_timestamp;
- return cpu_timestamp;
- }
-
- return 0;
+ return scale_gpu_timestamp(gpu_timestamp, perf->devinfo.timestamp_frequency);
}
uint64_t IntelDriver::next()
{
// Reset accumulation
intel_perf_query_result_clear(&result);
- return cpu_next();
+ return gpu_next();
}
uint32_t IntelDriver::gpu_clock_id() const
{
- return perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME;
+ return this->clock_id;
}
uint64_t IntelDriver::gpu_timestamp() const
{
- return perfetto::base::GetBootTimeNs().count();
+ return scale_gpu_timestamp(read_gpu_timestamp(drm_device.fd),
+ perf->devinfo.timestamp_frequency);
}
} // namespace pps
diff --git a/src/intel/ds/intel_pps_driver.h b/src/intel/ds/intel_pps_driver.h
index cac029a973e..4f0754c732a 100644
--- a/src/intel/ds/intel_pps_driver.h
+++ b/src/intel/ds/intel_pps_driver.h
@@ -13,17 +13,15 @@
namespace pps
{
-/// Timestamp correlation between CPU/GPU.
-struct TimestampCorrelation {
- /// In CLOCK_MONOTONIC
- uint64_t cpu_timestamp;
-
- /// Engine timestamp associated with the OA unit
- uint64_t gpu_timestamp;
-};
/// @brief Variable length sequence of bytes generated by Intel Obstervation Architecture (OA)
-using PerfRecord = std::vector<uint8_t>;
+struct PerfRecord {
+ /// Timestamp in the GPU clock domain
+ uint64_t timestamp;
+
+ /// drm_i915_perf_record_header + report data
+ std::vector<uint8_t> data;
+};
/// @brief PPS Driver implementation for Intel graphics devices.
/// When sampling it may collect multiple perf-records at once. Each perf-record holds multiple
@@ -34,14 +32,8 @@ using PerfRecord = std::vector<uint8_t>;
class IntelDriver : public Driver
{
public:
- std::optional<TimestampCorrelation> query_correlation_timestamps() const;
- void get_new_correlation();
-
- /// @brief OA reports only have the lower 32 bits of the timestamp
- /// register, while correlation data has the whole 36 bits.
- /// @param gpu_ts a 32 bit OA report GPU timestamp
- /// @return The CPU timestamp relative to the argument
- uint64_t correlate_gpu_timestamp(uint32_t gpu_ts);
+ IntelDriver();
+ ~IntelDriver();
uint64_t get_min_sampling_period_ns() override;
bool init_perfcnt() override;
@@ -57,12 +49,7 @@ class IntelDriver : public Driver
private:
/// @brief Requests the next perf sample
/// @return The sample GPU timestamp
- uint32_t gpu_next();
-
- /// @brief Requests the next perf sample accumulating those which
- /// which duration is shorter than the requested sampling period
- /// @return The sample CPU timestamp
- uint64_t cpu_next();
+ uint64_t gpu_next();
/// @param data Buffer of bytes to parse
/// @param byte_count Number of bytes to parse
@@ -75,11 +62,12 @@ class IntelDriver : public Driver
/// Sampling period in nanoseconds requested by the datasource
uint64_t sampling_period_ns = 0;
- /// Keep track of the timestamp of the last sample generated
- uint64_t last_cpu_timestamp = 0;
+ /// Last upper 32bits of the GPU timestamp in the parsed reports
+ uint64_t gpu_timestamp_udw = 0;
- /// This is used to correlate CPU and GPU timestamps
- std::array<TimestampCorrelation, 64> correlations;
+ /// Keep track of the timestamp of the last sample generated (upper & lower
+ /// 32bits)
+ uint64_t last_gpu_timestamp = 0;
/// Data buffer used to store data read from the metric set
std::vector<uint8_t> metric_buffer = std::vector<uint8_t>(1024, 0);
@@ -94,6 +82,9 @@ class IntelDriver : public Driver
// Accumulations are stored here
struct intel_perf_query_result result = {};
+
+ // Gpu clock ID used to correlate GPU/CPU timestamps
+ uint32_t clock_id = 0;
};
} // namespace pps
More information about the mesa-commit
mailing list