[Mesa-dev] [PATCH 2/4] i965: perf: query topology

Lionel Landwerlin lionel.g.landwerlin at intel.com
Thu Feb 22 17:24:38 UTC 2018


With the introduction of asymmetric slices in CNL, we cannot rely on
the previous SUBSLICE_MASK getparam to tell userspace what subslices
are available.

We introduce a new uAPI in the kernel driver to report exactly which
parts of the GPU are fused off, and we require this to be available on Gen10+.

Prior generations can continue to rely on GETPARAM on older kernels.

This patch is quite a lot of code because we have to support lots of
different kernel versions, ranging from not providing any information
(for Haswell on 4.13 through 4.17), to being able to query through
GETPARAM (for gen8/9 on 4.13 through 4.17), to finally requiring 4.17
for Gen10+.

This change stores topology information in a unified way on
brw_context.topology from the various kernel APIs, and then generates
the appropriate values for the equations from that unified topology.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
---
 src/mesa/drivers/dri/i965/brw_context.h           |  14 ++
 src/mesa/drivers/dri/i965/brw_performance_query.c | 267 ++++++++++++++++------
 2 files changed, 208 insertions(+), 73 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 050b656e3da..69bf7530fbc 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1160,6 +1160,20 @@ struct brw_context
       bool supported;
    } predicate;
 
+   /* Fused topology of the GPU, filled from the kernel's query uAPI,
+    * GETPARAM, or static device info (see brw_performance_query.c).
+    */
+   struct {
+      /* Bitmask of enabled slices, 1 bit per slice. */
+      uint8_t slice_mask[4];
+      /* Per-slice subslice bitmasks; slice s starts at byte
+       * s * subslice_slice_stride.
+       */
+      uint8_t subslice_mask[100];
+      /* Per-subslice EU bitmasks; slice s / subslice ss starts at byte
+       * s * eu_slice_stride + ss * eu_subslice_stride.
+       */
+      uint8_t eu_mask[100];
+
+      /* Maximum possible counts (fused-off units included). */
+      uint16_t max_slices;
+      uint16_t max_subslices;
+      uint16_t max_eus_per_subslice;
+
+      /* Byte strides into the mask arrays above. */
+      uint16_t subslice_slice_stride;
+      uint16_t eu_slice_stride;
+      uint16_t eu_subslice_stride;
+   } topology;
+
+
    struct {
       /* Variables referenced in the XML meta data for OA performance
        * counters, e.g in the normalization equations.
diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
index c0bb4442bec..10f519a757f 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_query.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
@@ -1888,6 +1888,192 @@ init_oa_configs(struct brw_context *brw, const char *sysfs_dev_dir)
    }
 }
 
+/* Query the GPU topology through the i915 DRM_I915_QUERY_TOPOLOGY_INFO
+ * uAPI (kernel 4.17+) and store it into brw->topology.
+ *
+ * Returns false if the kernel doesn't support the query (caller falls
+ * back to GETPARAM / static device info).
+ */
+static bool
+query_topology(struct brw_context *brw)
+{
+   __DRIscreen *screen = brw->screen->driScrnPriv;
+   struct drm_i915_query_item item = {
+      .query_id = DRM_I915_QUERY_TOPOLOGY_INFO,
+   };
+   struct drm_i915_query query = {
+      .num_items = 1,
+      .items_ptr = (uintptr_t) &item,
+   };
+
+   /* First call with data_ptr == 0: the kernel reports the required
+    * buffer size in item.length (negative on error).
+    */
+   if (drmIoctl(screen->fd, DRM_IOCTL_I915_QUERY, &query))
+      return false;
+
+   if (item.length <= 0)
+      return false;
+
+   struct drm_i915_query_topology_info *topo_info = calloc(1, item.length);
+   if (topo_info == NULL)
+      return false;
+   item.data_ptr = (uintptr_t) topo_info;
+
+   /* Second call fills the buffer; don't leak it on failure. */
+   if (drmIoctl(screen->fd, DRM_IOCTL_I915_QUERY, &query) ||
+       item.length <= 0) {
+      free(topo_info);
+      return false;
+   }
+
+   brw->topology.max_slices = topo_info->max_slices;
+   brw->topology.max_subslices = topo_info->max_subslices;
+   brw->topology.max_eus_per_subslice = topo_info->max_eus_per_subslice;
+
+   brw->topology.subslice_slice_stride =
+      DIV_ROUND_UP(brw->topology.max_subslices, 8);
+   brw->topology.eu_subslice_stride =
+      DIV_ROUND_UP(brw->topology.max_eus_per_subslice, 8);
+   brw->topology.eu_slice_stride = brw->topology.max_subslices *
+      brw->topology.eu_subslice_stride;
+
+   assert(DIV_ROUND_UP(topo_info->max_slices, 8) <=
+          sizeof(brw->topology.slice_mask));
+   memcpy(brw->topology.slice_mask, topo_info->data,
+          DIV_ROUND_UP(topo_info->max_slices, 8));
+
+   assert(DIV_ROUND_UP(topo_info->max_slices * topo_info->max_subslices, 8) <=
+          sizeof(brw->topology.subslice_mask));
+   memcpy(brw->topology.subslice_mask,
+          &topo_info->data[topo_info->subslice_offset],
+          topo_info->max_slices * topo_info->subslice_stride);
+
+   assert(DIV_ROUND_UP(topo_info->max_slices * topo_info->max_subslices *
+                       topo_info->max_eus_per_subslice, 8) <=
+          sizeof(brw->topology.eu_mask));
+   memcpy(brw->topology.eu_mask,
+          &topo_info->data[topo_info->eu_offset],
+          topo_info->max_slices * topo_info->max_subslices * topo_info->eu_stride);
+
+   free(topo_info);
+
+   return true;
+}
+
+/* Reconstruct brw->topology from the I915_PARAM_SLICE_MASK /
+ * I915_PARAM_SUBSLICE_MASK getparams (gen8/9 on kernels 4.13-4.17).
+ *
+ * GETPARAM only reports a single subslice mask applying to all slices,
+ * and no EU mask at all, so we assume uniform subslices per slice and a
+ * uniform distribution of the total EU count across enabled subslices.
+ */
+static bool
+getparam_topology(struct brw_context *brw)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   __DRIscreen *screen = brw->screen->driScrnPriv;
+   drm_i915_getparam_t gp;
+   int ret;
+
+   /* On CNL+ we need to use the query ioctl(). */
+   assert(devinfo->gen < 10);
+
+   int slice_mask = 0;
+   gp.param = I915_PARAM_SLICE_MASK;
+   gp.value = &slice_mask;
+   ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
+   if (ret)
+      return false;
+
+   int subslice_mask = 0;
+   gp.param = I915_PARAM_SUBSLICE_MASK;
+   gp.value = &subslice_mask;
+   ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
+   if (ret)
+      return false;
+
+   brw->topology.max_slices = util_last_bit(slice_mask);
+   brw->topology.max_subslices = util_last_bit(subslice_mask);
+   brw->topology.max_eus_per_subslice = devinfo->is_haswell ? 10 : 8;
+
+   brw->topology.subslice_slice_stride =
+      DIV_ROUND_UP(brw->topology.max_subslices, 8);
+   brw->topology.eu_subslice_stride =
+      DIV_ROUND_UP(brw->topology.max_eus_per_subslice, 8);
+   brw->topology.eu_slice_stride = brw->topology.max_subslices *
+      brw->topology.eu_subslice_stride;
+
+   int n_subslices = __builtin_popcount(slice_mask) * __builtin_popcount(subslice_mask);
+   int eus_per_subslice = brw->screen->eu_total / n_subslices;
+
+   for (int s = 0; s < brw->topology.max_slices; s++) {
+      if (slice_mask & (1U << s))
+         brw->topology.slice_mask[s / 8] |= 1U << (s % 8);
+
+      for (int ss = 0; ss < brw->topology.max_subslices; ss++) {
+         if (subslice_mask & (1U << ss)) {
+            brw->topology.subslice_mask[s * brw->topology.subslice_slice_stride +
+                                        ss / 8] |= 1U << (ss % 8);
+         }
+
+         /* Spread eus_per_subslice bits over this subslice's bytes.
+          * Note: loop over eu_subslice_stride (bytes per subslice), not
+          * eu_slice_stride, which would overwrite neighboring subslices.
+          */
+         for (int eug = 0; eug < brw->topology.eu_subslice_stride; eug++) {
+            brw->topology.eu_mask[s * brw->topology.eu_slice_stride +
+                                  ss * brw->topology.eu_subslice_stride +
+                                  eug] =
+               (((1UL << eus_per_subslice) - 1) >> (eug * 8)) & 0xff;
+         }
+      }
+   }
+
+   return true;
+}
+
+/* Last-resort fallback (Haswell on kernels without the slice/subslice
+ * getparams): assume a fully enabled topology from static device info.
+ */
+static void
+devinfo_topology(struct brw_context *brw)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+   assert(devinfo->is_haswell);
+
+   brw->topology.max_slices = devinfo->num_slices;
+   brw->topology.max_subslices = devinfo->num_subslices[0];
+   brw->topology.max_eus_per_subslice = 10;
+
+   /* Store the strides in brw->topology: compute_topology_builtins()
+    * reads them from there (they were previously left 0, being only
+    * computed into locals here).
+    */
+   brw->topology.subslice_slice_stride =
+      DIV_ROUND_UP(brw->topology.max_subslices, 8);
+   brw->topology.eu_subslice_stride =
+      DIV_ROUND_UP(brw->topology.max_eus_per_subslice, 8);
+   brw->topology.eu_slice_stride = brw->topology.max_subslices *
+      brw->topology.eu_subslice_stride;
+
+   for (int s = 0; s < brw->topology.max_slices; s++) {
+      brw->topology.slice_mask[s / 8] |= 1UL << (s % 8);
+
+      for (int ss = 0; ss < brw->topology.max_subslices; ss++) {
+         /* Byte index is s * stride + ss / 8 (matching the other
+          * topology paths), not (s * stride + ss) / 8.
+          */
+         brw->topology.subslice_mask[s * brw->topology.subslice_slice_stride +
+                                     ss / 8] |= 1UL << (ss % 8);
+
+         for (int eug = 0; eug < brw->topology.eu_subslice_stride; eug++) {
+            brw->topology.eu_mask[s * brw->topology.eu_slice_stride +
+                                  ss * brw->topology.eu_subslice_stride +
+                                  eug] =
+               (((1UL << brw->topology.max_eus_per_subslice) - 1) >> (eug * 8)) & 0xff;
+         }
+      }
+   }
+}
+
+/* Derive the performance-equation builtin variables ($SliceMask,
+ * $SubsliceMask, $EuCoresTotalCount, ...) from the unified
+ * brw->topology masks.
+ */
+static void
+compute_topology_builtins(struct brw_context *brw)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+   /* The equations only deal with an 8bit wide slice mask. */
+   assert(brw->topology.max_slices <= 8);
+   brw->perfquery.sys_vars.slice_mask = brw->topology.slice_mask[0];
+   brw->perfquery.sys_vars.n_eu_slices =
+      __builtin_popcount(brw->perfquery.sys_vars.slice_mask);
+
+   /* Count enabled subslices and EUs by popcounting the whole mask
+    * arrays byte by byte.
+    */
+   for (size_t b = 0; b < sizeof(brw->topology.subslice_mask); b++)
+      brw->perfquery.sys_vars.n_eu_sub_slices +=
+         __builtin_popcount(brw->topology.subslice_mask[b]);
+
+   for (size_t b = 0; b < sizeof(brw->topology.eu_mask); b++)
+      brw->perfquery.sys_vars.n_eus +=
+         __builtin_popcount(brw->topology.eu_mask[b]);
+
+   brw->perfquery.sys_vars.eu_threads_count =
+      brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;
+
+   /* At the moment the subslice mask builtin has groups of 3bits for
+    * each slice.
+    *
+    * Ideally equations would be updated to have a slice/subslice query
+    * function/operator.
+    */
+   brw->perfquery.sys_vars.subslice_mask = 0;
+   for (int s = 0; s < brw->topology.max_slices; s++) {
+      for (int ss = 0; ss < brw->topology.max_subslices; ss++) {
+         int byte = s * brw->topology.subslice_slice_stride + ss / 8;
+         if (brw->topology.subslice_mask[byte] & (1UL << (ss % 8)))
+            brw->perfquery.sys_vars.subslice_mask |= 1UL << (s * 3 + ss);
+      }
+   }
+}
+
 static bool
 init_oa_sys_vars(struct brw_context *brw, const char *sysfs_dev_dir)
 {
@@ -1905,83 +2091,18 @@ init_oa_sys_vars(struct brw_context *brw, const char *sysfs_dev_dir)
                                           &max_freq_mhz))
       return false;
 
+   memset(&brw->topology, 0, sizeof(brw->topology));
+   if (!query_topology(brw)) {
+      if (!getparam_topology(brw))
+         devinfo_topology(brw);
+   }
+
+   memset(&brw->perfquery.sys_vars, 0, sizeof(brw->perfquery.sys_vars));
    brw->perfquery.sys_vars.gt_min_freq = min_freq_mhz * 1000000;
    brw->perfquery.sys_vars.gt_max_freq = max_freq_mhz * 1000000;
    brw->perfquery.sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
-
    brw->perfquery.sys_vars.revision = intel_device_get_revision(screen->fd);
-   brw->perfquery.sys_vars.n_eu_slices = devinfo->num_slices;
-   /* Assuming uniform distribution of subslices per slices. */
-   brw->perfquery.sys_vars.n_eu_sub_slices = devinfo->num_subslices[0];
-
-   if (devinfo->is_haswell) {
-      brw->perfquery.sys_vars.slice_mask = 0;
-      brw->perfquery.sys_vars.subslice_mask = 0;
-
-      for (int s = 0; s < devinfo->num_slices; s++)
-         brw->perfquery.sys_vars.slice_mask |= 1U << s;
-      for (int ss = 0; ss < devinfo->num_subslices[0]; ss++)
-         brw->perfquery.sys_vars.subslice_mask |= 1U << ss;
-
-      if (devinfo->gt == 1) {
-         brw->perfquery.sys_vars.n_eus = 10;
-      } else if (devinfo->gt == 2) {
-         brw->perfquery.sys_vars.n_eus = 20;
-      } else if (devinfo->gt == 3) {
-         brw->perfquery.sys_vars.n_eus = 40;
-      } else
-         unreachable("not reached");
-   } else {
-      drm_i915_getparam_t gp;
-      int ret;
-      int slice_mask = 0;
-      int ss_mask = 0;
-      /* maximum number of slices */
-      int s_max = devinfo->num_slices;
-      /* maximum number of subslices per slice (assuming uniform subslices per
-       * slices)
-       */
-      int ss_max = devinfo->num_subslices[0];
-      uint64_t subslice_mask = 0;
-      int s;
-
-      gp.param = I915_PARAM_SLICE_MASK;
-      gp.value = &slice_mask;
-      ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
-      if (ret)
-         return false;
-
-      gp.param = I915_PARAM_SUBSLICE_MASK;
-      gp.value = &ss_mask;
-      ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
-      if (ret)
-         return false;
-
-      brw->perfquery.sys_vars.n_eus = brw->screen->eu_total;
-      brw->perfquery.sys_vars.n_eu_slices = __builtin_popcount(slice_mask);
-      brw->perfquery.sys_vars.slice_mask = slice_mask;
-
-      /* Note: the _SUBSLICE_MASK param only reports a global subslice mask
-       * which applies to all slices.
-       *
-       * Note: some of the metrics we have (as described in XML) are
-       * conditional on a $SubsliceMask variable which is expected to also
-       * reflect the slice mask by packing together subslice masks for each
-       * slice in one value..
-       */
-      for (s = 0; s < s_max; s++) {
-         if (slice_mask & (1<<s)) {
-            subslice_mask |= ss_mask << (ss_max * s);
-         }
-      }
-
-      brw->perfquery.sys_vars.subslice_mask = subslice_mask;
-      brw->perfquery.sys_vars.n_eu_sub_slices =
-         __builtin_popcount(subslice_mask);
-   }
-
-   brw->perfquery.sys_vars.eu_threads_count =
-      brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;
+   compute_topology_builtins(brw);
 
    return true;
 }
-- 
2.16.1



More information about the mesa-dev mailing list