[Mesa-dev] [PATCH 2/4] i965: perf: query topology
Rafael Antognolli
rafael.antognolli at intel.com
Tue Mar 6 01:07:23 UTC 2018
On Thu, Feb 22, 2018 at 05:24:38PM +0000, Lionel Landwerlin wrote:
> With the introduction of asymmetric slices in CNL, we cannot rely on
> the previous SUBSLICE_MASK getparam to tell userspace what subslices
> are available.
>
> We introduce a new uAPI in the kernel driver to report exactly what
> parts of the GPU are fused and require this to be available on Gen10+.
>
> Prior generations can continue to rely on GETPARAM on older kernels.
>
> This patch is quite a lot of code because we have to support lots of
> different kernel versions, ranging from not providing any information
> (for Haswell on 4.13 through 4.17), to being able to query through
> GETPARAM (for gen8/9 on 4.13 through 4.17), to finally requiring 4.17
> for Gen10+.
I don't think it's that much code. It's reasonable given how many
interfaces we have to query such data.
> This change stores topology information in a unified way on
> brw_context.topology from the various kernel APIs. And then generates
> the appropriate values for the equations from that unified topology.
>
> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
> ---
> src/mesa/drivers/dri/i965/brw_context.h | 14 ++
> src/mesa/drivers/dri/i965/brw_performance_query.c | 267 ++++++++++++++++------
> 2 files changed, 208 insertions(+), 73 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index 050b656e3da..69bf7530fbc 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -1160,6 +1160,20 @@ struct brw_context
> bool supported;
> } predicate;
>
> + struct {
> + uint8_t slice_mask[4];
> + uint8_t subslice_mask[100];
> + uint8_t eu_mask[100];
> +
> + uint16_t max_slices;
> + uint16_t max_subslices;
> + uint16_t max_eus_per_subslice;
> +
> + uint16_t subslice_slice_stride;
> + uint16_t eu_slice_stride;
> + uint16_t eu_subslice_stride;
> + } topology;
> +
I wonder if such information shouldn't be stored in gen_device_info. But
the rest of the OA code seems to be tied to i965 anyway, so I guess this
should be fine.
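For my own understanding, here is how I read the flattened mask layout
above; hypothetical helpers just to spell it out, nothing the patch needs
to add:

static inline bool
topology_subslice_available(const struct brw_context *brw, int s, int ss)
{
   /* subslice_slice_stride bytes of subslice mask per slice. */
   return (brw->topology.subslice_mask[s * brw->topology.subslice_slice_stride +
                                       ss / 8] >> (ss % 8)) & 1;
}

static inline bool
topology_eu_available(const struct brw_context *brw, int s, int ss, int eu)
{
   /* eu_slice_stride bytes of EU mask per slice, eu_subslice_stride bytes
    * per subslice.
    */
   return (brw->topology.eu_mask[s * brw->topology.eu_slice_stride +
                                 ss * brw->topology.eu_subslice_stride +
                                 eu / 8] >> (eu % 8)) & 1;
}

If that matches your intent, the indexing in compute_topology_builtins()
below reads consistently to me.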
In any case, series is:
Acked-by: Rafael Antognolli <rafael.antognolli at intel.com>
> struct {
> /* Variables referenced in the XML meta data for OA performance
> * counters, e.g in the normalization equations.
> diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
> index c0bb4442bec..10f519a757f 100644
> --- a/src/mesa/drivers/dri/i965/brw_performance_query.c
> +++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
> @@ -1888,6 +1888,192 @@ init_oa_configs(struct brw_context *brw, const char *sysfs_dev_dir)
> }
> }
>
> +static bool
> +query_topology(struct brw_context *brw)
> +{
> + __DRIscreen *screen = brw->screen->driScrnPriv;
> + struct drm_i915_query_item item = {
> + .query_id = DRM_I915_QUERY_TOPOLOGY_INFO,
> + };
> + struct drm_i915_query query = {
> + .num_items = 1,
> + .items_ptr = (uintptr_t) &item,
> + };
> +
> + if (drmIoctl(screen->fd, DRM_IOCTL_I915_QUERY, &query))
> + return false;
> +
> + struct drm_i915_query_topology_info *topo_info =
> + (struct drm_i915_query_topology_info *) calloc(1, item.length);
> + item.data_ptr = (uintptr_t) topo_info;
> +
> +   if (drmIoctl(screen->fd, DRM_IOCTL_I915_QUERY, &query)) {
> +      free(topo_info);
> +      return false;
> +   }
> +
> + brw->topology.max_slices = topo_info->max_slices;
> + brw->topology.max_subslices = topo_info->max_subslices;
> + brw->topology.max_eus_per_subslice = topo_info->max_eus_per_subslice;
> +
> + brw->topology.subslice_slice_stride =
> + DIV_ROUND_UP(brw->topology.max_subslices, 8);
> + brw->topology.eu_subslice_stride =
> + DIV_ROUND_UP(brw->topology.max_eus_per_subslice, 8);
> + brw->topology.eu_slice_stride = brw->topology.max_subslices *
> + brw->topology.eu_subslice_stride;
> +
> + assert(DIV_ROUND_UP(topo_info->max_slices, 8) <=
> + sizeof(brw->topology.slice_mask));
> + memcpy(brw->topology.slice_mask, topo_info->data,
> + DIV_ROUND_UP(topo_info->max_slices, 8));
> +
> + assert(DIV_ROUND_UP(topo_info->max_slices * topo_info->max_subslices, 8) <=
> + sizeof(brw->topology.subslice_mask));
> + memcpy(brw->topology.subslice_mask,
> + &topo_info->data[topo_info->subslice_offset],
> + topo_info->max_slices * topo_info->subslice_stride);
> +
> + assert(DIV_ROUND_UP(topo_info->max_slices * topo_info->max_subslices *
> + topo_info->max_eus_per_subslice, 8) <=
> + sizeof(brw->topology.eu_mask));
> + memcpy(brw->topology.eu_mask,
> + &topo_info->data[topo_info->eu_offset],
> + topo_info->max_slices * topo_info->max_subslices * topo_info->eu_stride);
> +
> + free(topo_info);
> +
> + return true;
> +}
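One small note on query_topology(): if I understand the i915 query uAPI
correctly, the first DRM_IOCTL_I915_QUERY call (with item.length == 0)
only asks the kernel for the required buffer size, and the second call
fills the buffer we allocate. Since item.length can also come back as a
negative error code, maybe (untested suggestion):

   if (drmIoctl(screen->fd, DRM_IOCTL_I915_QUERY, &query) || item.length <= 0)
      return false;

before the calloc().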
> +
> +static bool
> +getparam_topology(struct brw_context *brw)
> +{
> + const struct gen_device_info *devinfo = &brw->screen->devinfo;
> + __DRIscreen *screen = brw->screen->driScrnPriv;
> + drm_i915_getparam_t gp;
> + int ret;
> +
> + /* On CNL+ we need to use the query ioctl(). */
> + assert(devinfo->gen < 10);
> +
> + int slice_mask = 0;
> + gp.param = I915_PARAM_SLICE_MASK;
> + gp.value = &slice_mask;
> + ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
> + if (ret)
> + return false;
> +
> + int subslice_mask = 0;
> + gp.param = I915_PARAM_SUBSLICE_MASK;
> + gp.value = &subslice_mask;
> + ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
> + if (ret)
> + return false;
> +
> + brw->topology.max_slices = util_last_bit(slice_mask);
> + brw->topology.max_subslices = util_last_bit(subslice_mask);
> + brw->topology.max_eus_per_subslice = devinfo->is_haswell ? 10 : 8;
> +
> + brw->topology.subslice_slice_stride =
> + DIV_ROUND_UP(brw->topology.max_subslices, 8);
> + brw->topology.eu_subslice_stride =
> + DIV_ROUND_UP(brw->topology.max_eus_per_subslice, 8);
> + brw->topology.eu_slice_stride = brw->topology.max_subslices *
> + brw->topology.eu_subslice_stride;
> +
> + int n_subslices = __builtin_popcount(slice_mask) * __builtin_popcount(subslice_mask);
> + int eus_per_subslice = brw->screen->eu_total / n_subslices;
> +
> + for (int s = 0; s < brw->topology.max_slices; s++) {
> + brw->topology.slice_mask[s / 8] |= (1UL << (s % 8)) & slice_mask;
> +
> + for (int ss = 0; ss < brw->topology.max_subslices; ss++) {
> + brw->topology.subslice_mask[s * brw->topology.subslice_slice_stride +
> + ss / 8] |=
> + (1UL << (ss % 8)) & subslice_mask;
> +
> +         for (int eug = 0; eug < brw->topology.eu_subslice_stride; eug++) {
> + brw->topology.eu_mask[s * brw->topology.eu_slice_stride +
> + ss * brw->topology.eu_subslice_stride +
> + eug] =
> + (((1UL << eus_per_subslice) - 1) >> (eug * 8)) & 0xff;
> + }
> + }
> + }
> +
> + return true;
> +}
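Also, checking my reading of getparam_topology() with a made-up gen9-like
example (1 slice, 3 subslices, 24 EUs, so the getparams return
slice_mask = 0x1 and subslice_mask = 0x7):

   /* max_slices = 1, max_subslices = 3, max_eus_per_subslice = 8
    * subslice_slice_stride = 1, eu_subslice_stride = 1, eu_slice_stride = 3
    * eus_per_subslice = 24 / (1 * 3) = 8
    *
    * topology.slice_mask[0]    = 0x01
    * topology.subslice_mask[0] = 0x07
    * topology.eu_mask[0..2]    = { 0xff, 0xff, 0xff }
    */

which looks like what the query path would report on an unfused part.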
> +
> +static void
> +devinfo_topology(struct brw_context *brw)
> +{
> + const struct gen_device_info *devinfo = &brw->screen->devinfo;
> +
> + assert(devinfo->is_haswell);
> +
> + brw->topology.max_slices = devinfo->num_slices;
> + brw->topology.max_subslices = devinfo->num_subslices[0];
> + brw->topology.max_eus_per_subslice = 10;
> +
> + int subslice_stride = DIV_ROUND_UP(brw->topology.max_subslices, 8);
> + int eu_subslice_stride = DIV_ROUND_UP(brw->topology.max_eus_per_subslice, 8);
> + int eu_slice_stride = brw->topology.max_subslices * eu_subslice_stride;
> +
> + for (int s = 0; s < brw->topology.max_slices; s++) {
> + brw->topology.slice_mask[s / 8] |= 1UL << (s % 8);
> +
> + for (int ss = 0; ss < brw->topology.max_subslices; ss++) {
> + brw->topology.subslice_mask[(s * subslice_stride + ss) / 8] |=
> + 1UL << (ss % 8);
> +
> + for (int eug = 0; eug < eu_subslice_stride; eug++) {
> + brw->topology.eu_mask[s * eu_slice_stride + ss * eu_subslice_stride + eug] =
> + (((1UL << brw->topology.max_eus_per_subslice) - 1) >> (eug * 8)) & 0xff;
> + }
> + }
> + }
> +}
> +
> +static void
> +compute_topology_builtins(struct brw_context *brw)
> +{
> + const struct gen_device_info *devinfo = &brw->screen->devinfo;
> +
> + assert(brw->topology.max_slices <= 8);
> + brw->perfquery.sys_vars.slice_mask = brw->topology.slice_mask[0];
> + brw->perfquery.sys_vars.n_eu_slices =
> + __builtin_popcount(brw->perfquery.sys_vars.slice_mask);
> +
> + for (int i = 0; i < sizeof(brw->topology.subslice_mask); i++) {
> + brw->perfquery.sys_vars.n_eu_sub_slices +=
> + __builtin_popcount(brw->topology.subslice_mask[i]);
> + }
> +
> + for (int i = 0; i < sizeof(brw->topology.eu_mask); i++) {
> + brw->perfquery.sys_vars.n_eus +=
> + __builtin_popcount(brw->topology.eu_mask[i]);
> + }
> +
> + brw->perfquery.sys_vars.eu_threads_count =
> + brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;
> +
> +   /* At the moment the subslice mask builtin has groups of 3 bits for each
> + * slice.
> + *
> + * Ideally equations would be updated to have a slice/subslice query
> + * function/operator.
> + */
> + brw->perfquery.sys_vars.subslice_mask = 0;
> + for (int s = 0; s < brw->topology.max_slices; s++) {
> + for (int ss = 0; ss < brw->topology.max_subslices; ss++) {
> + if (brw->topology.subslice_mask[s * brw->topology.subslice_slice_stride + ss / 8] &
> + (1UL << (ss % 8)))
> + brw->perfquery.sys_vars.subslice_mask |= 1UL << (s * 3 + ss);
> + }
> + }
> +}
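The 3-bits-per-slice packing of $SubsliceMask took me a moment; a made-up
example to check I am reading it right:

   /* 2 slices, max 3 subslices per slice;
    * slice 0 has subslices 0 and 2 enabled, slice 1 has subslice 1 enabled.
    *
    * slice 0 -> bits 0..2 -> 0b101
    * slice 1 -> bits 3..5 -> 0b010
    * sys_vars.subslice_mask = 0b010101 = 0x15
    */

So asymmetric configurations do survive into the equations, with the
limitation that only 3 subslice bits per slice fit in this packing.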
> +
> static bool
> init_oa_sys_vars(struct brw_context *brw, const char *sysfs_dev_dir)
> {
> @@ -1905,83 +2091,18 @@ init_oa_sys_vars(struct brw_context *brw, const char *sysfs_dev_dir)
> &max_freq_mhz))
> return false;
>
> + memset(&brw->topology, 0, sizeof(brw->topology));
> + if (!query_topology(brw)) {
> + if (!getparam_topology(brw))
> + devinfo_topology(brw);
> + }
> +
> + memset(&brw->perfquery.sys_vars, 0, sizeof(brw->perfquery.sys_vars));
> brw->perfquery.sys_vars.gt_min_freq = min_freq_mhz * 1000000;
> brw->perfquery.sys_vars.gt_max_freq = max_freq_mhz * 1000000;
> brw->perfquery.sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
> -
> brw->perfquery.sys_vars.revision = intel_device_get_revision(screen->fd);
> - brw->perfquery.sys_vars.n_eu_slices = devinfo->num_slices;
> - /* Assuming uniform distribution of subslices per slices. */
> - brw->perfquery.sys_vars.n_eu_sub_slices = devinfo->num_subslices[0];
> -
> - if (devinfo->is_haswell) {
> - brw->perfquery.sys_vars.slice_mask = 0;
> - brw->perfquery.sys_vars.subslice_mask = 0;
> -
> - for (int s = 0; s < devinfo->num_slices; s++)
> - brw->perfquery.sys_vars.slice_mask |= 1U << s;
> - for (int ss = 0; ss < devinfo->num_subslices[0]; ss++)
> - brw->perfquery.sys_vars.subslice_mask |= 1U << ss;
> -
> - if (devinfo->gt == 1) {
> - brw->perfquery.sys_vars.n_eus = 10;
> - } else if (devinfo->gt == 2) {
> - brw->perfquery.sys_vars.n_eus = 20;
> - } else if (devinfo->gt == 3) {
> - brw->perfquery.sys_vars.n_eus = 40;
> - } else
> - unreachable("not reached");
> - } else {
> - drm_i915_getparam_t gp;
> - int ret;
> - int slice_mask = 0;
> - int ss_mask = 0;
> - /* maximum number of slices */
> - int s_max = devinfo->num_slices;
> - /* maximum number of subslices per slice (assuming uniform subslices per
> - * slices)
> - */
> - int ss_max = devinfo->num_subslices[0];
> - uint64_t subslice_mask = 0;
> - int s;
> -
> - gp.param = I915_PARAM_SLICE_MASK;
> - gp.value = &slice_mask;
> - ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
> - if (ret)
> - return false;
> -
> - gp.param = I915_PARAM_SUBSLICE_MASK;
> - gp.value = &ss_mask;
> - ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
> - if (ret)
> - return false;
> -
> - brw->perfquery.sys_vars.n_eus = brw->screen->eu_total;
> - brw->perfquery.sys_vars.n_eu_slices = __builtin_popcount(slice_mask);
> - brw->perfquery.sys_vars.slice_mask = slice_mask;
> -
> - /* Note: the _SUBSLICE_MASK param only reports a global subslice mask
> - * which applies to all slices.
> - *
> - * Note: some of the metrics we have (as described in XML) are
> - * conditional on a $SubsliceMask variable which is expected to also
> - * reflect the slice mask by packing together subslice masks for each
> - * slice in one value..
> - */
> - for (s = 0; s < s_max; s++) {
> - if (slice_mask & (1<<s)) {
> - subslice_mask |= ss_mask << (ss_max * s);
> - }
> - }
> -
> - brw->perfquery.sys_vars.subslice_mask = subslice_mask;
> - brw->perfquery.sys_vars.n_eu_sub_slices =
> - __builtin_popcount(subslice_mask);
> - }
> -
> - brw->perfquery.sys_vars.eu_threads_count =
> - brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;
> + compute_topology_builtins(brw);
>
> return true;
> }
> --
> 2.16.1
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev