[Mesa-dev] [PATCH 2/4] i965: perf: query topology
Lionel Landwerlin
lionel.g.landwerlin at intel.com
Tue Mar 6 10:07:37 UTC 2018
On 06/03/18 01:07, Rafael Antognolli wrote:
> On Thu, Feb 22, 2018 at 05:24:38PM +0000, Lionel Landwerlin wrote:
>> With the introduction of asymmetric slices in CNL, we cannot rely on
>> the previous SUBSLICE_MASK getparam to tell userspace what subslices
>> are available.
>>
>> We introduce a new uAPI in the kernel driver to report exactly what
>> We introduce a new uAPI in the kernel driver to report exactly which
>>
>> Prior generations can continue to rely on GETPARAM on older kernels.
>>
>> This patch is quite a lot of code because we have to support lots of
>> different kernel versions, ranging from not providing any information
>> (for Haswell on 4.13 through 4.17), to being able to query through
>> GETPARAM (for gen8/9 on 4.13 through 4.17), to finally requiring 4.17
>> for Gen10+.
> I don't think it's that much code. It's reasonable given how many
> interfaces we have to query such data.
>
>> This change stores topology information in a unified way on
>> brw_context.topology from the various kernel APIs. And then generates
>> the appropriate values for the equations from that unified topology.
>>
>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
>> ---
>> src/mesa/drivers/dri/i965/brw_context.h | 14 ++
>> src/mesa/drivers/dri/i965/brw_performance_query.c | 267 ++++++++++++++++------
>> 2 files changed, 208 insertions(+), 73 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
>> index 050b656e3da..69bf7530fbc 100644
>> --- a/src/mesa/drivers/dri/i965/brw_context.h
>> +++ b/src/mesa/drivers/dri/i965/brw_context.h
>> @@ -1160,6 +1160,20 @@ struct brw_context
>> bool supported;
>> } predicate;
>>
>> + struct {
>> + uint8_t slice_mask[4];
>> + uint8_t subslice_mask[100];
>> + uint8_t eu_mask[100];
>> +
>> + uint16_t max_slices;
>> + uint16_t max_subslices;
>> + uint16_t max_eus_per_subslice;
>> +
>> + uint16_t subslice_slice_stride;
>> + uint16_t eu_slice_stride;
>> + uint16_t eu_subslice_stride;
>> + } topology;
>> +
> I wonder if such information shouldn't be stored in gen_device_info. But
> it seems the rest of the OA code seems to be tied to i965 anyways, so I
> guess this should be fine.
Actually putting that into gen_device_info makes a lot of sense.
Thanks for the suggestion, I'll update the series.
>
> In any case, series is:
>
> Acked-by: Rafael Antognolli <rafael.antognolli at intel.com>
>
>> struct {
>> /* Variables referenced in the XML meta data for OA performance
>> * counters, e.g in the normalization equations.
>> diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
>> index c0bb4442bec..10f519a757f 100644
>> --- a/src/mesa/drivers/dri/i965/brw_performance_query.c
>> +++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
>> @@ -1888,6 +1888,192 @@ init_oa_configs(struct brw_context *brw, const char *sysfs_dev_dir)
>> }
>> }
>>
>> +static bool
>> +query_topology(struct brw_context *brw)
>> +{
>> + __DRIscreen *screen = brw->screen->driScrnPriv;
>> + struct drm_i915_query_item item = {
>> + .query_id = DRM_I915_QUERY_TOPOLOGY_INFO,
>> + };
>> + struct drm_i915_query query = {
>> + .num_items = 1,
>> + .items_ptr = (uintptr_t) &item,
>> + };
>> +
>> + return false;
>> +
>> + if (drmIoctl(screen->fd, DRM_IOCTL_I915_QUERY, &query))
>> + return false;
>> +
>> + struct drm_i915_query_topology_info *topo_info =
>> + (struct drm_i915_query_topology_info *) calloc(1, item.length);
>> + item.data_ptr = (uintptr_t) topo_info;
>> +
>> + if (drmIoctl(screen->fd, DRM_IOCTL_I915_QUERY, &query))
>> + return false;
>> +
>> + brw->topology.max_slices = topo_info->max_slices;
>> + brw->topology.max_subslices = topo_info->max_subslices;
>> + brw->topology.max_eus_per_subslice = topo_info->max_eus_per_subslice;
>> +
>> + brw->topology.subslice_slice_stride =
>> + DIV_ROUND_UP(brw->topology.max_subslices, 8);
>> + brw->topology.eu_subslice_stride =
>> + DIV_ROUND_UP(brw->topology.max_eus_per_subslice, 8);
>> + brw->topology.eu_slice_stride = brw->topology.max_subslices *
>> + brw->topology.eu_subslice_stride;
>> +
>> + assert(DIV_ROUND_UP(topo_info->max_slices, 8) <=
>> + sizeof(brw->topology.slice_mask));
>> + memcpy(brw->topology.slice_mask, topo_info->data,
>> + DIV_ROUND_UP(topo_info->max_slices, 8));
>> +
>> + assert(DIV_ROUND_UP(topo_info->max_slices * topo_info->max_subslices, 8) <=
>> + sizeof(brw->topology.subslice_mask));
>> + memcpy(brw->topology.subslice_mask,
>> + &topo_info->data[topo_info->subslice_offset],
>> + topo_info->max_slices * topo_info->subslice_stride);
>> +
>> + assert(DIV_ROUND_UP(topo_info->max_slices * topo_info->max_subslices *
>> + topo_info->max_eus_per_subslice, 8) <=
>> + sizeof(brw->topology.eu_mask));
>> + memcpy(brw->topology.eu_mask,
>> + &topo_info->data[topo_info->eu_offset],
>> + topo_info->max_slices * topo_info->max_subslices * topo_info->eu_stride);
>> +
>> + free(topo_info);
>> +
>> + return true;
>> +}
>> +
>> +static bool
>> +getparam_topology(struct brw_context *brw)
>> +{
>> + const struct gen_device_info *devinfo = &brw->screen->devinfo;
>> + __DRIscreen *screen = brw->screen->driScrnPriv;
>> + drm_i915_getparam_t gp;
>> + int ret;
>> +
>> + /* On CNL+ we need to use the query ioctl(). */
>> + assert(devinfo->gen < 10);
>> +
>> + int slice_mask = 0;
>> + gp.param = I915_PARAM_SLICE_MASK;
>> + gp.value = &slice_mask;
>> + ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
>> + if (ret)
>> + return false;
>> +
>> + int subslice_mask = 0;
>> + gp.param = I915_PARAM_SUBSLICE_MASK;
>> + gp.value = &subslice_mask;
>> + ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
>> + if (ret)
>> + return false;
>> +
>> + brw->topology.max_slices = util_last_bit(slice_mask);
>> + brw->topology.max_subslices = util_last_bit(subslice_mask);
>> + brw->topology.max_eus_per_subslice = devinfo->is_haswell ? 10 : 8;
>> +
>> + brw->topology.subslice_slice_stride =
>> + DIV_ROUND_UP(brw->topology.max_subslices, 8);
>> + brw->topology.eu_subslice_stride =
>> + DIV_ROUND_UP(brw->topology.max_eus_per_subslice, 8);
>> + brw->topology.eu_slice_stride = brw->topology.max_subslices *
>> + brw->topology.eu_subslice_stride;
>> +
>> + int n_subslices = __builtin_popcount(slice_mask) * __builtin_popcount(subslice_mask);
>> + int eus_per_subslice = brw->screen->eu_total / n_subslices;
>> +
>> + for (int s = 0; s < brw->topology.max_slices; s++) {
>> + brw->topology.slice_mask[s / 8] |= (1UL << (s % 8)) & slice_mask;
>> +
>> + for (int ss = 0; ss < brw->topology.max_subslices; ss++) {
>> + brw->topology.subslice_mask[s * brw->topology.subslice_slice_stride +
>> + ss / 8] |=
>> + (1UL << (ss % 8)) & subslice_mask;
>> +
>> + for (int eug = 0; eug < brw->topology.eu_slice_stride; eug++) {
>> + brw->topology.eu_mask[s * brw->topology.eu_slice_stride +
>> + ss * brw->topology.eu_subslice_stride +
>> + eug] =
>> + (((1UL << eus_per_subslice) - 1) >> (eug * 8)) & 0xff;
>> + }
>> + }
>> + }
>> +
>> + return true;
>> +}
>> +
>> +static void
>> +devinfo_topology(struct brw_context *brw)
>> +{
>> + const struct gen_device_info *devinfo = &brw->screen->devinfo;
>> +
>> + assert(devinfo->is_haswell);
>> +
>> + brw->topology.max_slices = devinfo->num_slices;
>> + brw->topology.max_subslices = devinfo->num_subslices[0];
>> + brw->topology.max_eus_per_subslice = 10;
>> +
>> + int subslice_stride = DIV_ROUND_UP(brw->topology.max_subslices, 8);
>> + int eu_subslice_stride = DIV_ROUND_UP(brw->topology.max_eus_per_subslice, 8);
>> + int eu_slice_stride = brw->topology.max_subslices * eu_subslice_stride;
>> +
>> + for (int s = 0; s < brw->topology.max_slices; s++) {
>> + brw->topology.slice_mask[s / 8] |= 1UL << (s % 8);
>> +
>> + for (int ss = 0; ss < brw->topology.max_subslices; ss++) {
>> + brw->topology.subslice_mask[(s * subslice_stride + ss) / 8] |=
>> + 1UL << (ss % 8);
>> +
>> + for (int eug = 0; eug < eu_subslice_stride; eug++) {
>> + brw->topology.eu_mask[s * eu_slice_stride + ss * eu_subslice_stride + eug] =
>> + (((1UL << brw->topology.max_eus_per_subslice) - 1) >> (eug * 8)) & 0xff;
>> + }
>> + }
>> + }
>> +}
>> +
>> +static void
>> +compute_topology_builtins(struct brw_context *brw)
>> +{
>> + const struct gen_device_info *devinfo = &brw->screen->devinfo;
>> +
>> + assert(brw->topology.max_slices <= 8);
>> + brw->perfquery.sys_vars.slice_mask = brw->topology.slice_mask[0];
>> + brw->perfquery.sys_vars.n_eu_slices =
>> + __builtin_popcount(brw->perfquery.sys_vars.slice_mask);
>> +
>> + for (int i = 0; i < sizeof(brw->topology.subslice_mask); i++) {
>> + brw->perfquery.sys_vars.n_eu_sub_slices +=
>> + __builtin_popcount(brw->topology.subslice_mask[i]);
>> + }
>> +
>> + for (int i = 0; i < sizeof(brw->topology.eu_mask); i++) {
>> + brw->perfquery.sys_vars.n_eus +=
>> + __builtin_popcount(brw->topology.eu_mask[i]);
>> + }
>> +
>> + brw->perfquery.sys_vars.eu_threads_count =
>> + brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;
>> +
>> + /* At the moment the subslice mask builtin has groups of 3bits for each
>> + * slice.
>> + *
>> + * Ideally equations would be updated to have a slice/subslice query
>> + * function/operator.
>> + */
>> + brw->perfquery.sys_vars.subslice_mask = 0;
>> + for (int s = 0; s < brw->topology.max_slices; s++) {
>> + for (int ss = 0; ss < brw->topology.max_subslices; ss++) {
>> + if (brw->topology.subslice_mask[s * brw->topology.subslice_slice_stride + ss / 8] &
>> + (1UL << (ss % 8)))
>> + brw->perfquery.sys_vars.subslice_mask |= 1UL << (s * 3 + ss);
>> + }
>> + }
>> +}
>> +
>> static bool
>> init_oa_sys_vars(struct brw_context *brw, const char *sysfs_dev_dir)
>> {
>> @@ -1905,83 +2091,18 @@ init_oa_sys_vars(struct brw_context *brw, const char *sysfs_dev_dir)
>> &max_freq_mhz))
>> return false;
>>
>> + memset(&brw->topology, 0, sizeof(brw->topology));
>> + if (!query_topology(brw)) {
>> + if (!getparam_topology(brw))
>> + devinfo_topology(brw);
>> + }
>> +
>> + memset(&brw->perfquery.sys_vars, 0, sizeof(brw->perfquery.sys_vars));
>> brw->perfquery.sys_vars.gt_min_freq = min_freq_mhz * 1000000;
>> brw->perfquery.sys_vars.gt_max_freq = max_freq_mhz * 1000000;
>> brw->perfquery.sys_vars.timestamp_frequency = devinfo->timestamp_frequency;
>> -
>> brw->perfquery.sys_vars.revision = intel_device_get_revision(screen->fd);
>> - brw->perfquery.sys_vars.n_eu_slices = devinfo->num_slices;
>> - /* Assuming uniform distribution of subslices per slices. */
>> - brw->perfquery.sys_vars.n_eu_sub_slices = devinfo->num_subslices[0];
>> -
>> - if (devinfo->is_haswell) {
>> - brw->perfquery.sys_vars.slice_mask = 0;
>> - brw->perfquery.sys_vars.subslice_mask = 0;
>> -
>> - for (int s = 0; s < devinfo->num_slices; s++)
>> - brw->perfquery.sys_vars.slice_mask |= 1U << s;
>> - for (int ss = 0; ss < devinfo->num_subslices[0]; ss++)
>> - brw->perfquery.sys_vars.subslice_mask |= 1U << ss;
>> -
>> - if (devinfo->gt == 1) {
>> - brw->perfquery.sys_vars.n_eus = 10;
>> - } else if (devinfo->gt == 2) {
>> - brw->perfquery.sys_vars.n_eus = 20;
>> - } else if (devinfo->gt == 3) {
>> - brw->perfquery.sys_vars.n_eus = 40;
>> - } else
>> - unreachable("not reached");
>> - } else {
>> - drm_i915_getparam_t gp;
>> - int ret;
>> - int slice_mask = 0;
>> - int ss_mask = 0;
>> - /* maximum number of slices */
>> - int s_max = devinfo->num_slices;
>> - /* maximum number of subslices per slice (assuming uniform subslices per
>> - * slices)
>> - */
>> - int ss_max = devinfo->num_subslices[0];
>> - uint64_t subslice_mask = 0;
>> - int s;
>> -
>> - gp.param = I915_PARAM_SLICE_MASK;
>> - gp.value = &slice_mask;
>> - ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
>> - if (ret)
>> - return false;
>> -
>> - gp.param = I915_PARAM_SUBSLICE_MASK;
>> - gp.value = &ss_mask;
>> - ret = drmIoctl(screen->fd, DRM_IOCTL_I915_GETPARAM, &gp);
>> - if (ret)
>> - return false;
>> -
>> - brw->perfquery.sys_vars.n_eus = brw->screen->eu_total;
>> - brw->perfquery.sys_vars.n_eu_slices = __builtin_popcount(slice_mask);
>> - brw->perfquery.sys_vars.slice_mask = slice_mask;
>> -
>> - /* Note: the _SUBSLICE_MASK param only reports a global subslice mask
>> - * which applies to all slices.
>> - *
>> - * Note: some of the metrics we have (as described in XML) are
>> - * conditional on a $SubsliceMask variable which is expected to also
>> - * reflect the slice mask by packing together subslice masks for each
>> - * slice in one value..
>> - */
>> - for (s = 0; s < s_max; s++) {
>> - if (slice_mask & (1<<s)) {
>> - subslice_mask |= ss_mask << (ss_max * s);
>> - }
>> - }
>> -
>> - brw->perfquery.sys_vars.subslice_mask = subslice_mask;
>> - brw->perfquery.sys_vars.n_eu_sub_slices =
>> - __builtin_popcount(subslice_mask);
>> - }
>> -
>> - brw->perfquery.sys_vars.eu_threads_count =
>> - brw->perfquery.sys_vars.n_eus * devinfo->num_thread_per_eu;
>> + compute_topology_builtins(brw);
>>
>> return true;
>> }
>> --
>> 2.16.1
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list