Mesa (main): intel/ds: drop timestamp correlation code

Mon Nov 22 12:17:08 UTC 2021

Module: Mesa
Branch: main
Commit: 4ef6698a265fd900da80d80fd42044e642d1b641
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=4ef6698a265fd900da80d80fd42044e642d1b641

Author: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
Date:   Mon Nov  8 00:09:08 2021 +0200

intel/ds: drop timestamp correlation code

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
Reviewed-by: Antonio Caggiano <antonio.caggiano at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13831>

---

 src/intel/ds/intel_pps_driver.cc | 202 ++++++++++++++-------------------------
 src/intel/ds/intel_pps_driver.h  |  45 ++++-----
 2 files changed, 88 insertions(+), 159 deletions(-)

diff --git a/src/intel/ds/intel_pps_driver.cc b/src/intel/ds/intel_pps_driver.cc
index 6966db8fdc1..031097e22df 100644
--- a/src/intel/ds/intel_pps_driver.cc
+++ b/src/intel/ds/intel_pps_driver.cc
@@ -37,6 +37,45 @@ uint64_t IntelDriver::get_min_sampling_period_ns()
    return (2.f * perf->devinfo.timestamp_frequency) / 1000000000ull;
 }
 
+uint64_t scale_gpu_timestamp(uint64_t ts, uint64_t timestamp_frequency)
+{
+   // Try to avoid going over the 64bits when doing the scaling
+   uint64_t lower_ts = ts >> 6;
+   uint64_t scaled_ts = lower_ts * 1000000000ull / timestamp_frequency;
+   scaled_ts <<= 6;
+   scaled_ts += (ts & 0x3f) * 1000000000ull / timestamp_frequency;
+   return scaled_ts;
+}
+
+uint64_t read_gpu_timestamp(int drm_fd)
+{
+   drm_i915_reg_read reg_read = {};
+   const uint64_t render_ring_timestamp = 0x2358;
+   reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA;
+
+   if (perf_ioctl(drm_fd, DRM_IOCTL_I915_REG_READ, &reg_read) < 0) {
+      PPS_LOG_ERROR("Unable to read GPU clock");
+      return 0;
+   }
+
+   return reg_read.val;
+
+}
+
+IntelDriver::IntelDriver()
+{
+   /* Note: clock_id's below 128 are reserved. For custom clock sources,
+    * using the hash of a namespaced string is the recommended approach.
+    * See: https://perfetto.dev/docs/concepts/clock-sync
+    */
+   this->clock_id =
+      _mesa_hash_string("org.freedesktop.mesa.intel") | 0x80000000;
+}
+
+IntelDriver::~IntelDriver()
+{
+}
+
 void IntelDriver::enable_counter(uint32_t counter_id)
 {
    auto &counter = counters[counter_id];
@@ -75,71 +114,6 @@ void IntelDriver::enable_all_counters()
    }
 }
 
-static uint64_t timespec_diff(timespec *begin, timespec *end)
-{
-   return 1000000000ull * (end->tv_sec - begin->tv_sec) + end->tv_nsec - begin->tv_nsec;
-}
-
-/// @brief This function tries to correlate CPU time with GPU time
-std::optional<TimestampCorrelation> IntelDriver::query_correlation_timestamps() const
-{
-   TimestampCorrelation corr = {};
-
-   clock_t correlation_clock_id = CLOCK_BOOTTIME;
-
-   drm_i915_reg_read reg_read = {};
-   const uint64_t render_ring_timestamp = 0x2358;
-   reg_read.offset = render_ring_timestamp | I915_REG_READ_8B_WA;
-
-   constexpr size_t attempt_count = 3;
-   struct {
-      timespec cpu_ts_begin;
-      timespec cpu_ts_end;
-      uint64_t gpu_ts;
-   } attempts[attempt_count] = {};
-
-   uint32_t best = 0;
-
-   // Gather 3 correlations
-   for (uint32_t i = 0; i < attempt_count; i++) {
-      clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_begin);
-      if (perf_ioctl(drm_device.fd, DRM_IOCTL_I915_REG_READ, &reg_read) < 0) {
-         return std::nullopt;
-      }
-      clock_gettime(correlation_clock_id, &attempts[i].cpu_ts_end);
-
-      attempts[i].gpu_ts = reg_read.val;
-   }
-
-   // Now select the best
-   for (uint32_t i = 1; i < attempt_count; i++) {
-      if (timespec_diff(&attempts[i].cpu_ts_begin, &attempts[i].cpu_ts_end) <
-         timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end)) {
-         best = i;
-      }
-   }
-
-   corr.cpu_timestamp =
-      (attempts[best].cpu_ts_begin.tv_sec * 1000000000ull + attempts[best].cpu_ts_begin.tv_nsec) +
-      timespec_diff(&attempts[best].cpu_ts_begin, &attempts[best].cpu_ts_end) / 2;
-   corr.gpu_timestamp = attempts[best].gpu_ts;
-
-   return corr;
-}
-
-void IntelDriver::get_new_correlation()
-{
-   // Rotate left correlations by one position so to make space at the end
-   std::rotate(correlations.begin(), correlations.begin() + 1, correlations.end());
-
-   // Then we overwrite the last correlation with a new one
-   if (auto corr = query_correlation_timestamps()) {
-      correlations.back() = *corr;
-   } else {
-      PPS_LOG_FATAL("Failed to get correlation timestamps");
-   }
-}
-
 bool IntelDriver::init_perfcnt()
 {
    assert(!perf && "Intel perf should not be initialized at this point");
@@ -201,41 +175,11 @@ void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)
 {
    this->sampling_period_ns = sampling_period_ns;
 
-   // Fill correlations with an initial one
-   if (auto corr = query_correlation_timestamps()) {
-      correlations.fill(*corr);
-   } else {
-      PPS_LOG_FATAL("Failed to get correlation timestamps");
-   }
-
    if (!perf->open(sampling_period_ns)) {
       PPS_LOG_FATAL("Failed to open intel perf");
    }
 }
 
-/// @brief Transforms the GPU timestop into a CPU timestamp equivalent
-uint64_t IntelDriver::correlate_gpu_timestamp(const uint32_t gpu_ts)
-{
-   auto &corr_a = correlations[0];
-   auto &corr_b = correlations[correlations.size() - 1];
-
-   // A correlation timestamp has 36 bits, so get the first 32 to make it work with gpu_ts
-   uint64_t mask = 0xffffffff;
-   uint32_t corr_a_gpu_ts = corr_a.gpu_timestamp & mask;
-   uint32_t corr_b_gpu_ts = corr_b.gpu_timestamp & mask;
-
-   // Make sure it is within the interval [a,b)
-   assert(gpu_ts >= corr_a_gpu_ts && "GPU TS < Corr a");
-   assert(gpu_ts < corr_b_gpu_ts && "GPU TS >= Corr b");
-
-   uint32_t gpu_delta = gpu_ts - corr_a_gpu_ts;
-   // Factor to convert gpu time to cpu time
-   double gpu_to_cpu = (corr_b.cpu_timestamp - corr_a.cpu_timestamp) /
-      double(corr_b.gpu_timestamp - corr_a.gpu_timestamp);
-   uint64_t cpu_delta = gpu_delta * gpu_to_cpu;
-   return corr_a.cpu_timestamp + cpu_delta;
-}
-
 void IntelDriver::disable_perfcnt()
 {
    perf = nullptr;
@@ -244,12 +188,6 @@ void IntelDriver::disable_perfcnt()
    enabled_counters.clear();
 }
 
-struct Report {
-   uint32_t version;
-   uint32_t timestamp;
-   uint32_t id;
-};
-
 /// @brief Some perf record durations can be really short
 /// @return True if the duration is at least close to the sampling period
 static bool close_enough(uint64_t duration, uint64_t sampling_period)
@@ -265,12 +203,12 @@ std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_
    records.reserve(128);
 
    PerfRecord record;
-   record.reserve(512);
+   record.data.reserve(512);
 
    const uint8_t *iter = data.data();
    const uint8_t *end = iter + byte_count;
 
-   uint64_t prev_cpu_timestamp = last_cpu_timestamp;
+   uint64_t prev_gpu_timestamp = last_gpu_timestamp;
 
    while (iter < end) {
       // Iterate a record at a time
@@ -278,18 +216,32 @@ std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_
 
       if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
          // Report is next to the header
-         auto report = reinterpret_cast<const Report *>(header + 1);
-         auto cpu_timestamp = correlate_gpu_timestamp(report->timestamp);
-         auto duration = cpu_timestamp - prev_cpu_timestamp;
+         const uint32_t *report = reinterpret_cast<const uint32_t *>(header + 1);
+         uint64_t gpu_timestamp_ldw =
+            intel_perf_report_timestamp(&perf->query.value(), report);
+
+         /* Our HW only provides us with the lower 32 bits of the 36-bit
+          * timestamp counter value. If we haven't captured the top bits yet,
+          * do it now. If we see the lower 32 bits roll over, capture the
+          * top bits again.
+          */
+         if (gpu_timestamp_udw == 0 || (gpu_timestamp_udw + gpu_timestamp_ldw) < last_gpu_timestamp)
+            gpu_timestamp_udw = read_gpu_timestamp(drm_device.fd) & 0xffffffff00000000;
+
+         uint64_t gpu_timestamp = gpu_timestamp_udw + gpu_timestamp_ldw;
+
+         auto duration = scale_gpu_timestamp(gpu_timestamp - prev_gpu_timestamp,
+                                             perf->devinfo.timestamp_frequency);
 
          // Skip perf-records that are too short by checking
          // the distance between last report and this one
          if (close_enough(duration, sampling_period_ns)) {
-            prev_cpu_timestamp = cpu_timestamp;
+            prev_gpu_timestamp = gpu_timestamp;
 
             // Add the new record to the list
-            record.resize(header->size); // Possibly 264?
-            memcpy(record.data(), iter, header->size);
+            record.timestamp = gpu_timestamp;
+            record.data.resize(header->size); // Possibly 264?
+            memcpy(record.data.data(), iter, header->size);
             records.emplace_back(record);
          }
       }
@@ -329,8 +281,6 @@ bool IntelDriver::dump_perfcnt()
 
    read_data_from_metric_set();
 
-   get_new_correlation();
-
    auto new_records = parse_perf_records(metric_buffer, total_bytes_read);
    if (new_records.empty()) {
       PPS_LOG("No new records");
@@ -353,7 +303,7 @@ bool IntelDriver::dump_perfcnt()
    return true;
 }
 
-uint32_t IntelDriver::gpu_next()
+uint64_t IntelDriver::gpu_next()
 {
    if (records.size() < 2) {
       // Not enough records to accumulate
@@ -361,8 +311,8 @@ uint32_t IntelDriver::gpu_next()
    }
 
    // Get first and second
-   auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data());
-   auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data());
+   auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data.data());
+   auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data.data());
 
    intel_perf_query_result_accumulate_fields(&result,
       &perf->query.value(),
@@ -372,42 +322,30 @@ uint32_t IntelDriver::gpu_next()
       false /* no_oa_accumulate */);
 
    // Get last timestamp
-   auto report_b = reinterpret_cast<const Report *>(record_b + 1);
-   auto gpu_timestamp = report_b->timestamp;
+   auto gpu_timestamp = records[1].timestamp;
 
    // Consume first record
    records.erase(std::begin(records), std::begin(records) + 1);
 
-   return gpu_timestamp;
-}
-
-uint64_t IntelDriver::cpu_next()
-{
-   if (auto gpu_timestamp = gpu_next()) {
-      auto cpu_timestamp = correlate_gpu_timestamp(gpu_timestamp);
-
-      last_cpu_timestamp = cpu_timestamp;
-      return cpu_timestamp;
-   }
-
-   return 0;
+   return scale_gpu_timestamp(gpu_timestamp, perf->devinfo.timestamp_frequency);
 }
 
 uint64_t IntelDriver::next()
 {
    // Reset accumulation
    intel_perf_query_result_clear(&result);
-   return cpu_next();
+   return gpu_next();
 }
 
 uint32_t IntelDriver::gpu_clock_id() const
 {
-   return perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME;
+   return this->clock_id;
 }
 
 uint64_t IntelDriver::gpu_timestamp() const
 {
-   return perfetto::base::GetBootTimeNs().count();
+   return scale_gpu_timestamp(read_gpu_timestamp(drm_device.fd),
+                              perf->devinfo.timestamp_frequency);
 }
 
 } // namespace pps
diff --git a/src/intel/ds/intel_pps_driver.h b/src/intel/ds/intel_pps_driver.h
index cac029a973e..4f0754c732a 100644
--- a/src/intel/ds/intel_pps_driver.h
+++ b/src/intel/ds/intel_pps_driver.h
@@ -13,17 +13,15 @@
 
 namespace pps
 {
-/// Timestamp correlation between CPU/GPU.
-struct TimestampCorrelation {
-   /// In CLOCK_MONOTONIC
-   uint64_t cpu_timestamp;
-
-   /// Engine timestamp associated with the OA unit
-   uint64_t gpu_timestamp;
-};
 
 /// @brief Variable length sequence of bytes generated by Intel Obstervation Architecture (OA)
-using PerfRecord = std::vector<uint8_t>;
+struct PerfRecord {
+   /// Timestamp in the GPU clock domain
+   uint64_t timestamp;
+
+   /// drm_i915_perf_record_header + report data
+   std::vector<uint8_t> data;
+};
 
 /// @brief PPS Driver implementation for Intel graphics devices.
 /// When sampling it may collect multiple perf-records at once. Each perf-record holds multiple
@@ -34,14 +32,8 @@ using PerfRecord = std::vector<uint8_t>;
 class IntelDriver : public Driver
 {
    public:
-   std::optional<TimestampCorrelation> query_correlation_timestamps() const;
-   void get_new_correlation();
-
-   /// @brief OA reports only have the lower 32 bits of the timestamp
-   /// register, while correlation data has the whole 36 bits.
-   /// @param gpu_ts a 32 bit OA report GPU timestamp
-   /// @return The CPU timestamp relative to the argument
-   uint64_t correlate_gpu_timestamp(uint32_t gpu_ts);
+   IntelDriver();
+   ~IntelDriver();
 
    uint64_t get_min_sampling_period_ns() override;
    bool init_perfcnt() override;
@@ -57,12 +49,7 @@ class IntelDriver : public Driver
    private:
    /// @brief Requests the next perf sample
    /// @return The sample GPU timestamp
-   uint32_t gpu_next();
-
-   /// @brief Requests the next perf sample accumulating those which
-   /// which duration is shorter than the requested sampling period
-   /// @return The sample CPU timestamp
-   uint64_t cpu_next();
+   uint64_t gpu_next();
 
    /// @param data Buffer of bytes to parse
    /// @param byte_count Number of bytes to parse
@@ -75,11 +62,12 @@ class IntelDriver : public Driver
    /// Sampling period in nanoseconds requested by the datasource
    uint64_t sampling_period_ns = 0;
 
-   /// Keep track of the timestamp of the last sample generated
-   uint64_t last_cpu_timestamp = 0;
+   /// Last upper 32bits of the GPU timestamp in the parsed reports
+   uint64_t gpu_timestamp_udw = 0;
 
-   /// This is used to correlate CPU and GPU timestamps
-   std::array<TimestampCorrelation, 64> correlations;
+   /// Keep track of the timestamp of the last sample generated (upper & lower
+   /// 32bits)
+   uint64_t last_gpu_timestamp = 0;
 
    /// Data buffer used to store data read from the metric set
    std::vector<uint8_t> metric_buffer = std::vector<uint8_t>(1024, 0);
@@ -94,6 +82,9 @@ class IntelDriver : public Driver
 
    // Accumulations are stored here
    struct intel_perf_query_result result = {};
+
+   // Gpu clock ID used to correlate GPU/CPU timestamps
+   uint32_t clock_id = 0;
 };
 
 } // namespace pps