[RESEND PATCH v2 2/2] drm/panfrost: Expose perf counters through debugfs
Steven Price
steven.price at arm.com
Thu May 16 16:21:39 UTC 2019
On 14/05/2019 14:31, Rob Herring wrote:
> On Tue, May 14, 2019 at 5:48 AM Boris Brezillon
> <boris.brezillon at collabora.com> wrote:
>>
>> Add a way to dump perf counters through debugfs. The implementation is
>> kept simple and has a number of limitations:
>>
>> * it's not designed for multi-user usage as the counter values are
>> reset after each dump and there's no per-user context
>> * only accessible to root users
>> * no counters naming/position abstraction. Things are dumped in a raw
>> format that has to be parsed by the user who has to know where the
>> relevant values are and what they mean
>>
>> This implementation is intended to be used by mesa developers to help
>> debug perf-related issues while we work on a more generic approach that
>> would allow all GPU drivers to expose their counters in a consistent
>> way. As a result, this debugfs interface is considered unstable and
>> might be deprecated in the future.
>>
>> Signed-off-by: Boris Brezillon <boris.brezillon at collabora.com>
>> ---
>> Changes in v2:
>> * Expose counters through debugfs and keep things simple for now (we'll
>> work on a generic solution in parallel)
>> ---
>> drivers/gpu/drm/panfrost/Makefile | 3 +-
>> drivers/gpu/drm/panfrost/panfrost_device.c | 9 +
>> drivers/gpu/drm/panfrost/panfrost_device.h | 3 +
>> drivers/gpu/drm/panfrost/panfrost_drv.c | 7 +
>> drivers/gpu/drm/panfrost/panfrost_gpu.c | 7 +
>> drivers/gpu/drm/panfrost/panfrost_perfcnt.c | 339 ++++++++++++++++++++
>> drivers/gpu/drm/panfrost/panfrost_perfcnt.h | 15 +
>> drivers/gpu/drm/panfrost/panfrost_regs.h | 19 ++
>> 8 files changed, 401 insertions(+), 1 deletion(-)
>> create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.c
>> create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.h
>>
[...]
>> new file mode 100644
>> index 000000000000..8093783a5a26
>> --- /dev/null
>> +++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
>> @@ -0,0 +1,339 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Copyright 2019 Collabora Ltd */
>> +
>> +#include <drm/drm_file.h>
>> +#include <drm/drm_gem_shmem_helper.h>
>> +#include <drm/panfrost_drm.h>
>> +#include <linux/completion.h>
>> +#include <linux/debugfs.h>
>> +#include <linux/iopoll.h>
>> +#include <linux/pm_runtime.h>
>> +#include <linux/slab.h>
>> +#include <linux/uaccess.h>
>> +
>> +#include "panfrost_device.h"
>> +#include "panfrost_features.h"
>> +#include "panfrost_gem.h"
>> +#include "panfrost_issues.h"
>> +#include "panfrost_job.h"
>> +#include "panfrost_mmu.h"
>> +#include "panfrost_regs.h"
>> +
>> +#define COUNTERS_PER_BLOCK 64
>> +#define BYTES_PER_COUNTER 4
>> +#define BLOCKS_PER_COREGROUP 8
>> +#define V4_SHADERS_PER_COREGROUP 4
>> +
>> +struct panfrost_perfcnt {
>> + struct panfrost_gem_object *bo;
>> + size_t bosize;
>> + void *buf;
>> + bool enabled;
>> + struct mutex lock;
>> + struct completion dump_comp;
>> +};
>> +
>> +void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev)
>> +{
>> + complete(&pfdev->perfcnt->dump_comp);
>> +}
>> +
>> +void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev)
>> +{
>> + gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_CACHES);
>> +}
>> +
>> +static void panfrost_perfcnt_setup(struct panfrost_device *pfdev)
>> +{
>> + u32 cfg;
>> +
>> + /*
>> + * Always use address space 0 for now.
>> + * FIXME: this needs to be updated when we start using different
>> + * address space.
>> + */
>> + cfg = GPU_PERFCNT_CFG_AS(0);
>> + if (panfrost_model_cmp(pfdev, 0x1000) >= 0)
>
> You've got a couple of these. Perhaps we should add either a
> model_is_bifrost() helper or an is_bifrost variable to use.
>
>> + cfg |= GPU_PERFCNT_CFG_SETSEL(1);
mali_kbase only sets this if CONFIG_MALI_PRFCNT_SET_SECONDARY is defined
- i.e. this selects a different selection of counters. At at the very
least this should be an optional flag that can be set, but I suspect the
primary set are the ones you are interested in.
>> +
>> + gpu_write(pfdev, GPU_PERFCNT_CFG,
>> + cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF));
>> +
>> + if (!pfdev->perfcnt->enabled)
>> + return;
>> +
>> + gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0xffffffff);
>> + gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0xffffffff);
>> + gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0xffffffff);
>> +
>> + /*
>> + * Due to PRLAM-8186 we need to disable the Tiler before we enable HW
>> + * counters.
>
> Seems like you could just always apply the work-around? It doesn't
> look like it would be much different.
Technically the workaround causes the driver to perform the operation in
the wrong order below (write TILER_EN when the mode is MANUAL) - this is
a workaround for the hardware issue (and therefore works on that
hardware). But I wouldn't like to say it works on other hardware which
doesn't have the issue.
>> + */
>> + if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
>> + gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
>> + else
>> + gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff);
>> +
>> + gpu_write(pfdev, GPU_PERFCNT_CFG,
>> + cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_MANUAL));
>> +
>> + if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
>> + gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff);
>> +}
>> +
>> +static int panfrost_perfcnt_dump(struct panfrost_device *pfdev)
>> +{
>> + u64 gpuva;
>> + int ret;
>> +
>> + reinit_completion(&pfdev->perfcnt->dump_comp);
>> + gpuva = pfdev->perfcnt->bo->node.start << PAGE_SHIFT;
>> + gpu_write(pfdev, GPU_PERFCNT_BASE_LO, gpuva);
>> + gpu_write(pfdev, GPU_PERFCNT_BASE_HI, gpuva >> 32);
>> + gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_SAMPLE);
>> + ret = wait_for_completion_interruptible_timeout(&pfdev->perfcnt->dump_comp,
>> + msecs_to_jiffies(1000));
>> + if (!ret)
>> + ret = -ETIMEDOUT;
>> + else if (ret > 0)
>> + ret = 0;
>> +
>> + return ret;
>> +}
>> +
>> +void panfrost_perfcnt_resume(struct panfrost_device *pfdev)
>
> No suspend handling needed?
>
>> +{
>> + if (pfdev->perfcnt)
>> + panfrost_perfcnt_setup(pfdev);
>> +}
>> +
>> +static ssize_t panfrost_perfcnt_enable_read(struct file *file,
>> + char __user *user_buf,
>> + size_t count, loff_t *ppos)
>> +{
>> + struct panfrost_device *pfdev = file->private_data;
>> + ssize_t ret;
>> +
>> + mutex_lock(&pfdev->perfcnt->lock);
>> + ret = simple_read_from_buffer(user_buf, count, ppos,
>> + pfdev->perfcnt->enabled ? "Y\n" : "N\n",
>> + 2);
>> + mutex_unlock(&pfdev->perfcnt->lock);
>> +
>> + return ret;
>> +}
>> +
>> +static ssize_t panfrost_perfcnt_enable_write(struct file *file,
>> + const char __user *user_buf,
>> + size_t count, loff_t *ppos)
>> +{
>> + struct panfrost_device *pfdev = file->private_data;
>> + bool enable;
>> + int ret;
>> +
>> + ret = kstrtobool_from_user(user_buf, count, &enable);
>> + if (ret)
>> + return ret;
>> +
>> + ret = pm_runtime_get_sync(pfdev->dev);
>> + if (ret < 0)
>> + return ret;
>> +
>> + mutex_lock(&pfdev->perfcnt->lock);
>> + if (enable != pfdev->perfcnt->enabled) {
>> + pfdev->perfcnt->enabled = enable;
>> + panfrost_perfcnt_setup(pfdev);
>> + }
>> + mutex_unlock(&pfdev->perfcnt->lock);
>> +
>> + pm_runtime_mark_last_busy(pfdev->dev);
>> + pm_runtime_put_autosuspend(pfdev->dev);
>> +
>> + return count;
>> +}
>> +
>> +static const struct file_operations panfrost_perfcnt_enable_fops = {
>> + .read = panfrost_perfcnt_enable_read,
>> + .write = panfrost_perfcnt_enable_write,
>> + .open = simple_open,
>> + .llseek = default_llseek,
>> +};
>> +
>> +static ssize_t panfrost_perfcnt_dump_read(struct file *file,
>> + char __user *user_buf,
>> + size_t count, loff_t *ppos)
>> +{
>> + struct panfrost_device *pfdev = file->private_data;
>> + ssize_t ret;
>> +
>> + ret = pm_runtime_get_sync(pfdev->dev);
>> + if (ret < 0)
>> + return ret;
>> +
>> + mutex_lock(&pfdev->perfcnt->lock);
>> + if (!pfdev->perfcnt->enabled) {
>> + ret = -EINVAL;
>> + goto out;
>> + }
>> +
>> + ret = panfrost_perfcnt_dump(pfdev);
>> + if (ret)
>> + goto out;
>> +
>> + ret = simple_read_from_buffer(user_buf, count, ppos,
>> + pfdev->perfcnt->buf,
>> + pfdev->perfcnt->bosize);
>> +
>> +out:
>> + mutex_unlock(&pfdev->perfcnt->lock);
>> + pm_runtime_mark_last_busy(pfdev->dev);
>> + pm_runtime_put_autosuspend(pfdev->dev);
>> +
>> + return ret;
>> +}
>> +
>> +static const struct file_operations panfrost_perfcnt_dump_fops = {
>> + .read = panfrost_perfcnt_dump_read,
>> + .open = simple_open,
>> + .llseek = default_llseek,
>> +};
>> +
>> +int panfrost_perfcnt_debugfs_init(struct drm_minor *minor)
>> +{
>> + struct panfrost_device *pfdev = to_panfrost_device(minor->dev);
>> + struct dentry *file, *dir;
>> +
>> + dir = debugfs_create_dir("perfcnt", minor->debugfs_root);
>> + if (IS_ERR(dir))
>> + return PTR_ERR(dir);
>> +
>> + file = debugfs_create_file("dump", 0400, dir, pfdev,
>> + &panfrost_perfcnt_dump_fops);
>> + if (IS_ERR(file))
>> + return PTR_ERR(file);
>> +
>> + file = debugfs_create_file("enable", 0600, dir, pfdev,
>> + &panfrost_perfcnt_enable_fops);
>> + if (IS_ERR(file))
>> + return PTR_ERR(file);
>> +
>> + return 0;
>> +}
>> +
>> +int panfrost_perfcnt_init(struct panfrost_device *pfdev)
>> +{
>> + struct panfrost_perfcnt *perfcnt;
>> + struct drm_gem_shmem_object *bo;
>> + size_t size;
>> + u32 status;
>> + int ret;
>> +
>> + if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) {
>> + unsigned int ncoregroups;
>> +
>> + ncoregroups = hweight64(pfdev->features.l2_present);
>> + size = ncoregroups * BLOCKS_PER_COREGROUP *
>> + COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
>> + } else {
>> + unsigned int nl2c, ncores;
>> +
>> + /*
>> + * TODO: define a macro to extract the number of l2 caches from
>> + * mem_features.
>> + */
>> + nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
>> +
>> + /*
>> + * The ARM driver is grouping cores per core group and then
>> + * only using the number of cores in group 0 to calculate the
>> + * size. Not sure why this is done like that, but I guess
>> + * shader_present will only show cores in the first group
>> + * anyway.
>> + */
I'm not sure I understand this comment. I'm not sure which version of
the driver you are looking at, but shader_present should contain all the
cores (in every core group).
The kbase driver (in panfrost's git tree[1]), contains:
base_gpu_props *props = &kbdev->gpu_props.props;
u32 nr_l2 = props->l2_props.num_l2_slices;
u64 core_mask = props->coherency_info.group[0].core_mask;
u32 nr_blocks = fls64(core_mask);
/* JM and tiler counter blocks are always present */
dump_size = (2 + nr_l2 + nr_blocks) *
NR_CNT_PER_BLOCK *
NR_BYTES_PER_CNT;
[1]
https://gitlab.freedesktop.org/panfrost/mali_kbase/blob/master/driver/product/kernel/drivers/gpu/arm/midgard/mali_kbase_vinstr.c
i.e. the number of cores + number of L2 caches (plus 2 for JM/tiler)
multiplied by the block size.
Also another difference with the below is that fls64 and hweight64 don't
return the same numbers. If the core mask is non-contiguous this will
return different values.
>> + ncores = hweight64(pfdev->features.shader_present);
>> +
>> + /*
>> + * There's always one JM and one Tiler block, hence the '+ 2'
>> + * here.
>> + */
>> + size = (nl2c + ncores + 2) *
>> + COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
>> + }
>> +
>> + perfcnt = devm_kzalloc(pfdev->dev, sizeof(*perfcnt), GFP_KERNEL);
>> + if (!perfcnt)
>> + return -ENOMEM;
[...]
Steve
More information about the dri-devel
mailing list