[RESEND PATCH v2 2/2] drm/panfrost: Expose perf counters through debugfs

Thu May 16 16:21:39 UTC 2019

On 14/05/2019 14:31, Rob Herring wrote:
> On Tue, May 14, 2019 at 5:48 AM Boris Brezillon
> <boris.brezillon at collabora.com> wrote:
>>
>> Add a way to dump perf counters through debugfs. The implementation is
>> kept simple and has a number of limitations:
>>
>> * it's not designed for multi-user usage as the counter values are
>>   reset after each dump and there's no per-user context
>> * only accessible to root users
>> * no counters naming/position abstraction. Things are dumped in a raw
>>   format that has to be parsed by the user who has to know where the
>>   relevant values are and what they mean
>>
>> This implementation is intended to be used by mesa developers to help
>> debug perf-related issues while we work on a more generic approach that
>> would allow all GPU drivers to expose their counters in a consistent
>> way. As a result, this debugfs interface is considered unstable and
>> might be deprecated in the future.
>>
>> Signed-off-by: Boris Brezillon <boris.brezillon at collabora.com>
>> ---
>> Changes in v2:
>> * Expose counters through debugfs and keep things simple for now (we'll
>>   work on a generic solution in parallel)
>> ---
>>  drivers/gpu/drm/panfrost/Makefile           |   3 +-
>>  drivers/gpu/drm/panfrost/panfrost_device.c  |   9 +
>>  drivers/gpu/drm/panfrost/panfrost_device.h  |   3 +
>>  drivers/gpu/drm/panfrost/panfrost_drv.c     |   7 +
>>  drivers/gpu/drm/panfrost/panfrost_gpu.c     |   7 +
>>  drivers/gpu/drm/panfrost/panfrost_perfcnt.c | 339 ++++++++++++++++++++
>>  drivers/gpu/drm/panfrost/panfrost_perfcnt.h |  15 +
>>  drivers/gpu/drm/panfrost/panfrost_regs.h    |  19 ++
>>  8 files changed, 401 insertions(+), 1 deletion(-)
>>  create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.c
>>  create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.h
>>
[...]
>> new file mode 100644
>> index 000000000000..8093783a5a26
>> --- /dev/null
>> +++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
>> @@ -0,0 +1,339 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Copyright 2019 Collabora Ltd */
>> +
>> +#include <drm/drm_file.h>
>> +#include <drm/drm_gem_shmem_helper.h>
>> +#include <drm/panfrost_drm.h>
>> +#include <linux/completion.h>
>> +#include <linux/debugfs.h>
>> +#include <linux/iopoll.h>
>> +#include <linux/pm_runtime.h>
>> +#include <linux/slab.h>
>> +#include <linux/uaccess.h>
>> +
>> +#include "panfrost_device.h"
>> +#include "panfrost_features.h"
>> +#include "panfrost_gem.h"
>> +#include "panfrost_issues.h"
>> +#include "panfrost_job.h"
>> +#include "panfrost_mmu.h"
>> +#include "panfrost_regs.h"
>> +
>> +#define COUNTERS_PER_BLOCK             64
>> +#define BYTES_PER_COUNTER              4
>> +#define BLOCKS_PER_COREGROUP           8
>> +#define V4_SHADERS_PER_COREGROUP       4
>> +
>> +struct panfrost_perfcnt {
>> +       struct panfrost_gem_object *bo;
>> +       size_t bosize;
>> +       void *buf;
>> +       bool enabled;
>> +       struct mutex lock;
>> +       struct completion dump_comp;
>> +};
>> +
>> +void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev)
>> +{
>> +       complete(&pfdev->perfcnt->dump_comp);
>> +}
>> +
>> +void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev)
>> +{
>> +       gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_CACHES);
>> +}
>> +
>> +static void panfrost_perfcnt_setup(struct panfrost_device *pfdev)
>> +{
>> +       u32 cfg;
>> +
>> +       /*
>> +        * Always use address space 0 for now.
>> +        * FIXME: this needs to be updated when we start using different
>> +        * address space.
>> +        */
>> +       cfg = GPU_PERFCNT_CFG_AS(0);
>> +       if (panfrost_model_cmp(pfdev, 0x1000) >= 0)
> 
> You've got a couple of these. Perhaps we should add either a
> model_is_bifrost() helper or an is_bifrost variable to use.
> 
>> +               cfg |= GPU_PERFCNT_CFG_SETSEL(1);

mali_kbase only sets this if CONFIG_MALI_PRFCNT_SET_SECONDARY is defined
- i.e. this selects a different selection of counters. At at the very
least this should be an optional flag that can be set, but I suspect the
primary set are the ones you are interested in.

>> +
>> +       gpu_write(pfdev, GPU_PERFCNT_CFG,
>> +                 cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF));
>> +
>> +       if (!pfdev->perfcnt->enabled)
>> +               return;
>> +
>> +       gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0xffffffff);
>> +       gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0xffffffff);
>> +       gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0xffffffff);
>> +
>> +       /*
>> +        * Due to PRLAM-8186 we need to disable the Tiler before we enable HW
>> +        * counters.
> 
> Seems like you could just always apply the work-around? It doesn't
> look like it would be much different.

Technically the workaround causes the driver to perform the operation in
the wrong order below (write TILER_EN when the mode is MANUAL) - this is
a workaround for the hardware issue (and therefore works on that
hardware). But I wouldn't like to say it works on other hardware which
doesn't have the issue.

>> +        */
>> +       if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
>> +               gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
>> +       else
>> +               gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff);
>> +
>> +       gpu_write(pfdev, GPU_PERFCNT_CFG,
>> +                 cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_MANUAL));
>> +
>> +       if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
>> +               gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff);
>> +}
>> +
>> +static int panfrost_perfcnt_dump(struct panfrost_device *pfdev)
>> +{
>> +       u64 gpuva;
>> +       int ret;
>> +
>> +       reinit_completion(&pfdev->perfcnt->dump_comp);
>> +       gpuva = pfdev->perfcnt->bo->node.start << PAGE_SHIFT;
>> +       gpu_write(pfdev, GPU_PERFCNT_BASE_LO, gpuva);
>> +       gpu_write(pfdev, GPU_PERFCNT_BASE_HI, gpuva >> 32);
>> +       gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_SAMPLE);
>> +       ret = wait_for_completion_interruptible_timeout(&pfdev->perfcnt->dump_comp,
>> +                                                       msecs_to_jiffies(1000));
>> +       if (!ret)
>> +               ret = -ETIMEDOUT;
>> +       else if (ret > 0)
>> +               ret = 0;
>> +
>> +       return ret;
>> +}
>> +
>> +void panfrost_perfcnt_resume(struct panfrost_device *pfdev)
> 
> No suspend handling needed?
> 
>> +{
>> +       if (pfdev->perfcnt)
>> +               panfrost_perfcnt_setup(pfdev);
>> +}
>> +
>> +static ssize_t panfrost_perfcnt_enable_read(struct file *file,
>> +                                           char __user *user_buf,
>> +                                           size_t count, loff_t *ppos)
>> +{
>> +       struct panfrost_device *pfdev = file->private_data;
>> +       ssize_t ret;
>> +
>> +       mutex_lock(&pfdev->perfcnt->lock);
>> +       ret = simple_read_from_buffer(user_buf, count, ppos,
>> +                                     pfdev->perfcnt->enabled ? "Y\n" : "N\n",
>> +                                     2);
>> +       mutex_unlock(&pfdev->perfcnt->lock);
>> +
>> +       return ret;
>> +}
>> +
>> +static ssize_t panfrost_perfcnt_enable_write(struct file *file,
>> +                                            const char __user *user_buf,
>> +                                            size_t count, loff_t *ppos)
>> +{
>> +       struct panfrost_device *pfdev = file->private_data;
>> +       bool enable;
>> +       int ret;
>> +
>> +       ret = kstrtobool_from_user(user_buf, count, &enable);
>> +       if (ret)
>> +               return ret;
>> +
>> +       ret = pm_runtime_get_sync(pfdev->dev);
>> +       if (ret < 0)
>> +               return ret;
>> +
>> +       mutex_lock(&pfdev->perfcnt->lock);
>> +       if (enable != pfdev->perfcnt->enabled) {
>> +               pfdev->perfcnt->enabled = enable;
>> +               panfrost_perfcnt_setup(pfdev);
>> +       }
>> +       mutex_unlock(&pfdev->perfcnt->lock);
>> +
>> +       pm_runtime_mark_last_busy(pfdev->dev);
>> +       pm_runtime_put_autosuspend(pfdev->dev);
>> +
>> +       return count;
>> +}
>> +
>> +static const struct file_operations panfrost_perfcnt_enable_fops = {
>> +       .read = panfrost_perfcnt_enable_read,
>> +       .write = panfrost_perfcnt_enable_write,
>> +       .open = simple_open,
>> +       .llseek = default_llseek,
>> +};
>> +
>> +static ssize_t panfrost_perfcnt_dump_read(struct file *file,
>> +                                         char __user *user_buf,
>> +                                         size_t count, loff_t *ppos)
>> +{
>> +       struct panfrost_device *pfdev = file->private_data;
>> +       ssize_t ret;
>> +
>> +       ret = pm_runtime_get_sync(pfdev->dev);
>> +       if (ret < 0)
>> +               return ret;
>> +
>> +       mutex_lock(&pfdev->perfcnt->lock);
>> +       if (!pfdev->perfcnt->enabled) {
>> +               ret = -EINVAL;
>> +               goto out;
>> +       }
>> +
>> +       ret = panfrost_perfcnt_dump(pfdev);
>> +       if (ret)
>> +               goto out;
>> +
>> +       ret = simple_read_from_buffer(user_buf, count, ppos,
>> +                                     pfdev->perfcnt->buf,
>> +                                     pfdev->perfcnt->bosize);
>> +
>> +out:
>> +       mutex_unlock(&pfdev->perfcnt->lock);
>> +       pm_runtime_mark_last_busy(pfdev->dev);
>> +       pm_runtime_put_autosuspend(pfdev->dev);
>> +
>> +       return ret;
>> +}
>> +
>> +static const struct file_operations panfrost_perfcnt_dump_fops = {
>> +       .read = panfrost_perfcnt_dump_read,
>> +       .open = simple_open,
>> +       .llseek = default_llseek,
>> +};
>> +
>> +int panfrost_perfcnt_debugfs_init(struct drm_minor *minor)
>> +{
>> +       struct panfrost_device *pfdev = to_panfrost_device(minor->dev);
>> +       struct dentry *file, *dir;
>> +
>> +       dir = debugfs_create_dir("perfcnt", minor->debugfs_root);
>> +       if (IS_ERR(dir))
>> +               return PTR_ERR(dir);
>> +
>> +       file = debugfs_create_file("dump", 0400, dir, pfdev,
>> +                                  &panfrost_perfcnt_dump_fops);
>> +       if (IS_ERR(file))
>> +               return PTR_ERR(file);
>> +
>> +       file = debugfs_create_file("enable", 0600, dir, pfdev,
>> +                                  &panfrost_perfcnt_enable_fops);
>> +       if (IS_ERR(file))
>> +               return PTR_ERR(file);
>> +
>> +       return 0;
>> +}
>> +
>> +int panfrost_perfcnt_init(struct panfrost_device *pfdev)
>> +{
>> +       struct panfrost_perfcnt *perfcnt;
>> +       struct drm_gem_shmem_object *bo;
>> +       size_t size;
>> +       u32 status;
>> +       int ret;
>> +
>> +       if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) {
>> +               unsigned int ncoregroups;
>> +
>> +               ncoregroups = hweight64(pfdev->features.l2_present);
>> +               size = ncoregroups * BLOCKS_PER_COREGROUP *
>> +                      COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
>> +       } else {
>> +               unsigned int nl2c, ncores;
>> +
>> +               /*
>> +                * TODO: define a macro to extract the number of l2 caches from
>> +                * mem_features.
>> +                */
>> +               nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
>> +
>> +               /*
>> +                * The ARM driver is grouping cores per core group and then
>> +                * only using the number of cores in group 0 to calculate the
>> +                * size. Not sure why this is done like that, but I guess
>> +                * shader_present will only show cores in the first group
>> +                * anyway.
>> +                */

I'm not sure I understand this comment. I'm not sure which version of
the driver you are looking at, but shader_present should contain all the
cores (in every core group).

The kbase driver (in panfrost's git tree[1]), contains:

	base_gpu_props *props = &kbdev->gpu_props.props;
	u32 nr_l2 = props->l2_props.num_l2_slices;
	u64 core_mask = props->coherency_info.group[0].core_mask;

	u32 nr_blocks = fls64(core_mask);

	/* JM and tiler counter blocks are always present */
	dump_size = (2 + nr_l2 + nr_blocks) *
			NR_CNT_PER_BLOCK *
			NR_BYTES_PER_CNT;

[1]
https://gitlab.freedesktop.org/panfrost/mali_kbase/blob/master/driver/product/kernel/drivers/gpu/arm/midgard/mali_kbase_vinstr.c

i.e. the number of cores + number of L2 caches (plus 2 for JM/tiler)
multiplied by the block size.

Also another difference with the below is that fls64 and hweight64 don't
return the same numbers. If the core mask is non-contiguous this will
return different values.

>> +               ncores = hweight64(pfdev->features.shader_present);
>> +
>> +               /*
>> +                * There's always one JM and one Tiler block, hence the '+ 2'
>> +                * here.
>> +                */
>> +               size = (nl2c + ncores + 2) *
>> +                      COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
>> +       }
>> +
>> +       perfcnt = devm_kzalloc(pfdev->dev, sizeof(*perfcnt), GFP_KERNEL);
>> +       if (!perfcnt)
>> +               return -ENOMEM;
[...]

Steve