[PATCH 2/3] drm/panfrost: Expose HW counters to userspace
Boris Brezillon
boris.brezillon at collabora.com
Thu Apr 4 15:20:50 UTC 2019
Add the necessary infrastructure to expose GPU counters to userspace.
This takes the form of 4 new ioctls to:
- query the available counters
- create/destroy a performance monitor
- retrieve its values
The drm_panfrost_submit struct is extended to pass a list of perfmons
to attach to a job, which means perfmons will only track changes caused
by the jobs they are attached to.
Signed-off-by: Boris Brezillon <boris.brezillon at collabora.com>
---
drivers/gpu/drm/panfrost/Makefile | 3 +-
drivers/gpu/drm/panfrost/panfrost_device.c | 8 +
drivers/gpu/drm/panfrost/panfrost_device.h | 11 +
drivers/gpu/drm/panfrost/panfrost_drv.c | 22 +-
drivers/gpu/drm/panfrost/panfrost_gpu.c | 43 +-
drivers/gpu/drm/panfrost/panfrost_job.c | 24 +
drivers/gpu/drm/panfrost/panfrost_job.h | 4 +
drivers/gpu/drm/panfrost/panfrost_perfcnt.c | 954 ++++++++++++++++++++
drivers/gpu/drm/panfrost/panfrost_perfcnt.h | 54 ++
drivers/gpu/drm/panfrost/panfrost_regs.h | 19 +
include/uapi/drm/panfrost_drm.h | 122 +++
11 files changed, 1260 insertions(+), 4 deletions(-)
create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.c
create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.h
diff --git a/drivers/gpu/drm/panfrost/Makefile b/drivers/gpu/drm/panfrost/Makefile
index d07e0971b687..31cfb9d25682 100644
--- a/drivers/gpu/drm/panfrost/Makefile
+++ b/drivers/gpu/drm/panfrost/Makefile
@@ -6,6 +6,7 @@ panfrost-y := \
panfrost_gem.o \
panfrost_gpu.o \
panfrost_job.o \
- panfrost_mmu.o
+ panfrost_mmu.o \
+ panfrost_perfcnt.o
obj-$(CONFIG_DRM_PANFROST) += panfrost.o
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c
index 148b5caa2322..f6a87bfa486b 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.c
+++ b/drivers/gpu/drm/panfrost/panfrost_device.c
@@ -13,6 +13,7 @@
#include "panfrost_gpu.h"
#include "panfrost_job.h"
#include "panfrost_mmu.h"
+#include "panfrost_perfcnt.h"
static int panfrost_reset_init(struct panfrost_device *pfdev)
{
@@ -147,7 +148,13 @@ int panfrost_device_init(struct panfrost_device *pfdev)
pm_runtime_mark_last_busy(pfdev->dev);
pm_runtime_put_autosuspend(pfdev->dev);
+ err = panfrost_perfcnt_init(pfdev);
+ if (err)
+ goto err_out5;
+
return 0;
+err_out5:
+ panfrost_job_fini(pfdev);
err_out4:
panfrost_mmu_fini(pfdev);
err_out3:
@@ -163,6 +170,7 @@ int panfrost_device_init(struct panfrost_device *pfdev)
void panfrost_device_fini(struct panfrost_device *pfdev)
{
+ panfrost_perfcnt_fini(pfdev);
panfrost_job_fini(pfdev);
panfrost_mmu_fini(pfdev);
panfrost_gpu_fini(pfdev);
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h
index a821b50a14c3..f7c4e9e55f1b 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.h
+++ b/drivers/gpu/drm/panfrost/panfrost_device.h
@@ -9,11 +9,13 @@
#include <drm/drm_device.h>
#include <drm/drm_mm.h>
#include <drm/gpu_scheduler.h>
+#include <drm/panfrost_drm.h>
struct panfrost_device;
struct panfrost_mmu;
struct panfrost_job_slot;
struct panfrost_job;
+struct panfrost_perfcnt;
#define NUM_JOB_SLOTS 3
@@ -45,6 +47,8 @@ struct panfrost_features {
unsigned long hw_features[64 / BITS_PER_LONG];
unsigned long hw_issues[64 / BITS_PER_LONG];
+
+ struct drm_panfrost_block_perfcounters perfcnt_layout[PANFROST_NUM_BLOCKS];
};
struct panfrost_device {
@@ -70,6 +74,8 @@ struct panfrost_device {
struct panfrost_job *jobs[NUM_JOB_SLOTS];
struct list_head scheduled_jobs;
+ struct panfrost_perfcnt *perfcnt;
+
struct mutex sched_lock;
};
@@ -77,6 +83,11 @@ struct panfrost_file_priv {
struct panfrost_device *pfdev;
struct drm_sched_entity sched_entity[NUM_JOB_SLOTS];
+
+ struct {
+ struct idr idr;
+ struct mutex lock;
+ } perfmon;
};
static inline struct panfrost_device *to_panfrost_device(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
index 8cffb70a3548..e5375b31627f 100644
--- a/drivers/gpu/drm/panfrost/panfrost_drv.c
+++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
@@ -19,6 +19,7 @@
#include "panfrost_mmu.h"
#include "panfrost_job.h"
#include "panfrost_gpu.h"
+#include "panfrost_perfcnt.h"
static int panfrost_ioctl_get_param(struct drm_device *ddev, void *data, struct drm_file *file)
{
@@ -219,6 +220,10 @@ static int panfrost_ioctl_submit(struct drm_device *dev, void *data,
if (ret)
goto fail;
+ ret = panfrost_perfcnt_create_job_ctx(job, file, args);
+ if (ret)
+ goto fail;
+
ret = panfrost_job_push(job);
if (ret)
goto fail;
@@ -313,6 +318,7 @@ panfrost_open(struct drm_device *dev, struct drm_file *file)
{
struct panfrost_device *pfdev = dev->dev_private;
struct panfrost_file_priv *panfrost_priv;
+ int ret;
panfrost_priv = kzalloc(sizeof(*panfrost_priv), GFP_KERNEL);
if (!panfrost_priv)
@@ -321,7 +327,16 @@ panfrost_open(struct drm_device *dev, struct drm_file *file)
panfrost_priv->pfdev = pfdev;
file->driver_priv = panfrost_priv;
- return panfrost_job_open(panfrost_priv);
+ ret = panfrost_job_open(panfrost_priv);
+ if (ret)
+ goto err_free_priv;
+
+ panfrost_perfcnt_open(panfrost_priv);
+ return 0;
+
+err_free_priv:
+ kfree(panfrost_priv);
+ return ret;
}
static void
@@ -329,6 +344,7 @@ panfrost_postclose(struct drm_device *dev, struct drm_file *file)
{
struct panfrost_file_priv *panfrost_priv = file->driver_priv;
+ panfrost_perfcnt_close(panfrost_priv);
panfrost_job_close(panfrost_priv);
kfree(panfrost_priv);
@@ -348,6 +364,10 @@ static const struct drm_ioctl_desc panfrost_drm_driver_ioctls[] = {
PANFROST_IOCTL(MMAP_BO, mmap_bo, DRM_RENDER_ALLOW),
PANFROST_IOCTL(GET_PARAM, get_param, DRM_RENDER_ALLOW),
PANFROST_IOCTL(GET_BO_OFFSET, get_bo_offset, DRM_RENDER_ALLOW),
+ PANFROST_IOCTL(GET_PERFCNT_LAYOUT, get_perfcnt_layout, DRM_RENDER_ALLOW),
+ PANFROST_IOCTL(CREATE_PERFMON, create_perfmon, DRM_RENDER_ALLOW),
+ PANFROST_IOCTL(DESTROY_PERFMON, destroy_perfmon, DRM_RENDER_ALLOW),
+ PANFROST_IOCTL(GET_PERFMON_VALUES, get_perfmon_values, DRM_RENDER_ALLOW),
};
DEFINE_DRM_GEM_SHMEM_FOPS(panfrost_drm_driver_fops);
diff --git a/drivers/gpu/drm/panfrost/panfrost_gpu.c b/drivers/gpu/drm/panfrost/panfrost_gpu.c
index d46d36170e18..c28a31c547cc 100644
--- a/drivers/gpu/drm/panfrost/panfrost_gpu.c
+++ b/drivers/gpu/drm/panfrost/panfrost_gpu.c
@@ -13,6 +13,7 @@
#include "panfrost_features.h"
#include "panfrost_issues.h"
#include "panfrost_gpu.h"
+#include "panfrost_perfcnt.h"
#include "panfrost_regs.h"
static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data)
@@ -42,6 +43,12 @@ static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data)
done = true;
}
+ if (state & GPU_IRQ_PERFCNT_SAMPLE_COMPLETED)
+ panfrost_perfcnt_sample_done(pfdev);
+
+ if (state & GPU_IRQ_CLEAN_CACHES_COMPLETED)
+ panfrost_perfcnt_clean_cache_done(pfdev);
+
gpu_write(pfdev, GPU_INT_CLEAR, state);
return IRQ_HANDLED;
@@ -152,14 +159,16 @@ struct panfrost_model {
u32 revision;
u64 issues;
} revs[MAX_HW_REVS];
+ u64 perfcnt[PANFROST_NUM_BLOCKS];
};
#define GPU_MODEL(_name, _id, ...) \
-{\
+{ \
.name = __stringify(_name), \
.id = _id, \
.features = hw_features_##_name, \
.issues = hw_issues_##_name, \
+ .perfcnt = hw_perfcnt_##_name, \
.revs = { __VA_ARGS__ }, \
}
@@ -198,13 +207,17 @@ static const struct panfrost_model gpu_models[] = {
static void panfrost_gpu_init_features(struct panfrost_device *pfdev)
{
+ struct drm_panfrost_block_perfcounters *perfcnt_layout;
u32 gpu_id, num_js, major, minor, status, rev;
const char *name = "unknown";
u64 hw_feat = 0;
- u64 hw_issues = hw_issues_all;
+ u64 hw_issues = hw_issues_all, mask;
const struct panfrost_model *model;
+ unsigned int num;
int i;
+ perfcnt_layout = pfdev->features.perfcnt_layout;
+
pfdev->features.l2_features = gpu_read(pfdev, GPU_L2_FEATURES);
pfdev->features.core_features = gpu_read(pfdev, GPU_CORE_FEATURES);
pfdev->features.tiler_features = gpu_read(pfdev, GPU_TILER_FEATURES);
@@ -272,9 +285,35 @@ static void panfrost_gpu_init_features(struct panfrost_device *pfdev)
if (best >= 0)
hw_issues |= model->revs[best].issues;
+ for (i = 0; i < PANFROST_NUM_BLOCKS; i++)
+ perfcnt_layout[i].counters = model->perfcnt[i];
+
break;
}
+ /* Only one Job Manager. */
+ perfcnt_layout[PANFROST_JM_BLOCK].instances = BIT(0);
+ perfcnt_layout[PANFROST_SHADER_BLOCK].instances =
+ pfdev->features.shader_present;
+
+ /*
+ * In v4 HW we have one tiler per core group, with the number
+ * of core groups being equal to the number of L2 caches. Other
+ * HW versions just have one tiler and the number of L2 caches
+ * can be extracted from the mem_features field.
+ */
+ if (hw_feat & HW_FEATURE_V4) {
+ num = hweight64(pfdev->features.l2_present);
+ mask = GENMASK(num - 1, 0);
+ perfcnt_layout[PANFROST_MMU_L2_BLOCK].instances = mask;
+ perfcnt_layout[PANFROST_TILER_BLOCK].instances = mask;
+ } else {
+ perfcnt_layout[PANFROST_TILER_BLOCK].instances = BIT(0);
+ num = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
+ mask = GENMASK(num - 1, 0);
+ perfcnt_layout[PANFROST_MMU_L2_BLOCK].instances = mask;
+ }
+
bitmap_from_u64(pfdev->features.hw_features, hw_feat);
bitmap_from_u64(pfdev->features.hw_issues, hw_issues);
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index 8d570c3f15d0..c2be61a9ebff 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -15,6 +15,7 @@
#include "panfrost_features.h"
#include "panfrost_issues.h"
#include "panfrost_gem.h"
+#include "panfrost_perfcnt.h"
#include "panfrost_regs.h"
#include "panfrost_gpu.h"
#include "panfrost_mmu.h"
@@ -153,6 +154,7 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
goto end;
spin_lock_irqsave(&pfdev->hwaccess_lock, flags);
+ panfrost_perfcnt_run_job(job);
job_write(pfdev, JS_HEAD_NEXT_LO(js), jc_head & 0xFFFFFFFF);
job_write(pfdev, JS_HEAD_NEXT_HI(js), jc_head >> 32);
@@ -233,6 +235,12 @@ int panfrost_job_push(struct panfrost_job *job)
goto unlock;
}
+ ret = panfrost_perfcnt_push_job(job);
+ if (ret) {
+ mutex_unlock(&pfdev->sched_lock);
+ goto unlock;
+ }
+
job->render_done_fence = dma_fence_get(&job->base.s_fence->finished);
kref_get(&job->refcount); /* put by scheduler job completion */
@@ -272,6 +280,9 @@ static void panfrost_job_cleanup(struct kref *ref)
for (i = 0; i < job->bo_count; i++)
drm_gem_object_put_unlocked(job->bos[i]);
+
+ panfrost_perfcnt_clean_job_ctx(job);
+
kvfree(job->bos);
kfree(job);
@@ -316,6 +327,13 @@ static struct dma_fence *panfrost_job_dependency(struct drm_sched_job *sched_job
}
}
+ /* Return the perfmon wait fence if any. */
+ if (job->perfcnt_fence) {
+ fence = job->perfcnt_fence;
+ job->perfcnt_fence = NULL;
+ return fence;
+ }
+
return NULL;
}
@@ -399,6 +417,11 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
/* restart scheduler after GPU is usable again */
for (i = 0; i < NUM_JOB_SLOTS; i++)
drm_sched_start(&pfdev->js->queue[i].sched, true);
+
+ /* For now, just say we're done. No reset and retry. */
+// job_write(pfdev, JS_COMMAND(js), JS_COMMAND_HARD_STOP);
+ dma_fence_signal(job->done_fence);
+ panfrost_perfcnt_finish_job(job, true);
}
static const struct drm_sched_backend_ops panfrost_sched_ops = {
@@ -442,6 +465,7 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data)
if (status & JOB_INT_MASK_DONE(j)) {
dma_fence_signal(pfdev->jobs[j]->done_fence);
+ panfrost_perfcnt_finish_job(pfdev->jobs[j], false);
}
status &= ~mask;
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.h b/drivers/gpu/drm/panfrost/panfrost_job.h
index 62454128a792..18646cc5eebb 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.h
+++ b/drivers/gpu/drm/panfrost/panfrost_job.h
@@ -37,6 +37,10 @@ struct panfrost_job {
/* Fence to be signaled by drm-sched once its done with the job */
struct dma_fence *render_done_fence;
+
+ /* Perfcnt context */
+ struct panfrost_perfcnt_job_ctx *perfcnt_ctx;
+ struct dma_fence *perfcnt_fence;
};
int panfrost_job_init(struct panfrost_device *pfdev);
diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.c b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
new file mode 100644
index 000000000000..4491f153ad48
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
@@ -0,0 +1,954 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2019 Collabora Ltd */
+
+#include <drm/drm_file.h>
+#include <drm/drm_gem_shmem_helper.h>
+#include <drm/panfrost_drm.h>
+#include <linux/iopoll.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "panfrost_device.h"
+#include "panfrost_features.h"
+#include "panfrost_gem.h"
+#include "panfrost_issues.h"
+#include "panfrost_job.h"
+#include "panfrost_mmu.h"
+#include "panfrost_regs.h"
+
+#define COUNTERS_PER_BLOCK 64
+#define BYTES_PER_COUNTER 4
+#define BLOCKS_PER_COREGROUP 8
+#define V4_SHADERS_PER_COREGROUP 4
+
+struct panfrost_perfcnt_job_ctx {
+ refcount_t refcount;
+ struct panfrost_device *pfdev;
+ struct dma_fence *wait_fence;
+ struct dma_fence *done_fence;
+ struct panfrost_perfmon **perfmons;
+ u32 perfmon_count;
+};
+
+struct panfrost_perfcnt {
+ struct work_struct dumpwork;
+ u64 fence_context;
+ u64 emit_seqno;
+ spinlock_t fence_lock;
+ struct mutex cfg_lock;
+ u32 cur_cfg[PANFROST_NUM_BLOCKS];
+ struct panfrost_gem_object *bo;
+ void *buf;
+ spinlock_t ctx_lock;
+ struct panfrost_perfcnt_job_ctx *last_ctx;
+ struct panfrost_perfcnt_job_ctx *dump_ctx;
+};
+
+struct panfrost_perfcnt_fence {
+ struct dma_fence base;
+ struct drm_device *dev;
+ u64 seqno;
+};
+
+struct panfrost_perfmon {
+ refcount_t refcnt;
+ atomic_t busycnt;
+ struct wait_queue_head wq;
+ struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
+ u32 *values[PANFROST_NUM_BLOCKS];
+};
+
+static inline struct panfrost_perfcnt_fence *
+to_panfrost_perfcnt_fence(struct dma_fence *fence)
+{
+ return container_of(fence, struct panfrost_perfcnt_fence, base);
+}
+
+static const char *
+panfrost_perfcnt_fence_get_driver_name(struct dma_fence *fence)
+{
+ return "panfrost";
+}
+
+static const char *
+panfrost_perfcnt_fence_get_timeline_name(struct dma_fence *fence)
+{
+ return "panfrost-perfcnt";
+}
+
+static const struct dma_fence_ops panfrost_perfcnt_fence_ops = {
+ .get_driver_name = panfrost_perfcnt_fence_get_driver_name,
+ .get_timeline_name = panfrost_perfcnt_fence_get_timeline_name,
+};
+
+static struct dma_fence *
+panfrost_perfcnt_fence_create(struct panfrost_device *pfdev)
+{
+ struct panfrost_perfcnt_fence *fence;
+
+ fence = kzalloc(sizeof(*fence), GFP_KERNEL);
+ if (!fence)
+ return ERR_PTR(-ENOMEM);
+
+ fence->dev = pfdev->ddev;
+ fence->seqno = ++pfdev->perfcnt->emit_seqno;
+ dma_fence_init(&fence->base, &panfrost_perfcnt_fence_ops,
+ &pfdev->perfcnt->fence_lock,
+ pfdev->perfcnt->fence_context, fence->seqno);
+
+ return &fence->base;
+}
+
+static void panfrost_perfmon_get(struct panfrost_perfmon *perfmon)
+{
+ if (perfmon)
+ refcount_inc(&perfmon->refcnt);
+}
+
+static void panfrost_perfmon_put(struct panfrost_perfmon *perfmon)
+{
+ if (perfmon && refcount_dec_and_test(&perfmon->refcnt)) {
+ unsigned int i;
+
+ for (i = 0; i < PANFROST_NUM_BLOCKS; i++)
+ kfree(perfmon->values[i]);
+
+ kfree(perfmon);
+ }
+}
+
+static struct panfrost_perfmon *
+panfrost_perfcnt_find_perfmon(struct panfrost_file_priv *pfile, int id)
+{
+ struct panfrost_perfmon *perfmon;
+
+ mutex_lock(&pfile->perfmon.lock);
+ perfmon = idr_find(&pfile->perfmon.idr, id);
+ panfrost_perfmon_get(perfmon);
+ mutex_unlock(&pfile->perfmon.lock);
+
+ return perfmon;
+}
+
+void panfrost_perfcnt_open(struct panfrost_file_priv *pfile)
+{
+ mutex_init(&pfile->perfmon.lock);
+ idr_init(&pfile->perfmon.idr);
+}
+
+static int panfrost_perfcnt_idr_del(int id, void *elem, void *data)
+{
+ struct panfrost_perfmon *perfmon = elem;
+
+ panfrost_perfmon_put(perfmon);
+
+ return 0;
+}
+
+void panfrost_perfcnt_close(struct panfrost_file_priv *pfile)
+{
+ mutex_lock(&pfile->perfmon.lock);
+ idr_for_each(&pfile->perfmon.idr, panfrost_perfcnt_idr_del, NULL);
+ idr_destroy(&pfile->perfmon.idr);
+ mutex_unlock(&pfile->perfmon.lock);
+}
+
+int panfrost_ioctl_get_perfcnt_layout(struct drm_device *dev, void *data,
+ struct drm_file *file_priv)
+{
+ struct panfrost_file_priv *pfile = file_priv->driver_priv;
+ struct panfrost_device *pfdev = pfile->pfdev;
+ struct drm_panfrost_get_perfcnt_layout *layout = data;
+
+ memcpy(layout->counters, pfdev->features.perfcnt_layout,
+ sizeof(layout->counters));
+
+ return 0;
+}
+
+int panfrost_ioctl_create_perfmon(struct drm_device *dev, void *data,
+ struct drm_file *file_priv)
+{
+ struct panfrost_file_priv *pfile = file_priv->driver_priv;
+ struct panfrost_device *pfdev = pfile->pfdev;
+ struct drm_panfrost_create_perfmon *req = data;
+ struct drm_panfrost_block_perfcounters *layout;
+ struct panfrost_perfmon *perfmon;
+ unsigned int i;
+ int ret;
+
+ if (req->padding)
+ return -EINVAL;
+
+ perfmon = kzalloc(sizeof(*perfmon), GFP_KERNEL);
+ if (!perfmon)
+ return -ENOMEM;
+
+ layout = pfdev->features.perfcnt_layout;
+ for (i = 0; i < PANFROST_NUM_BLOCKS; i++) {
+ unsigned int ncounters;
+
+ /* Make sure the request matches the available counters. */
+ if (~layout[i].instances & req->counters[i].instances ||
+ ~layout[i].counters & req->counters[i].counters) {
+ ret = -EINVAL;
+ goto err_free_perfmon;
+ }
+
+ ncounters = hweight64(req->counters[i].instances) *
+ hweight64(req->counters[i].counters);
+ if (!ncounters)
+ continue;
+
+ perfmon->counters[i] = req->counters[i];
+ perfmon->values[i] = kcalloc(ncounters, sizeof(u32), GFP_KERNEL);
+ if (!perfmon->values[i]) {
+ ret = -ENOMEM;
+ goto err_free_perfmon;
+ }
+ }
+
+ refcount_set(&perfmon->refcnt, 1);
+ init_waitqueue_head(&perfmon->wq);
+
+ mutex_lock(&pfile->perfmon.lock);
+ ret = idr_alloc(&pfile->perfmon.idr, perfmon, 1, U32_MAX, GFP_KERNEL);
+ mutex_unlock(&pfile->perfmon.lock);
+
+ if (ret < 0)
+ goto err_free_perfmon;
+
+ req->id = ret;
+ return 0;
+
+err_free_perfmon:
+ for (i = 0; i < PANFROST_NUM_BLOCKS; i++)
+ kfree(perfmon->values[i]);
+
+ kfree(perfmon);
+ return ret;
+}
+
+int panfrost_ioctl_destroy_perfmon(struct drm_device *dev, void *data,
+ struct drm_file *file_priv)
+{
+ struct panfrost_file_priv *pfile = file_priv->driver_priv;
+ struct drm_panfrost_destroy_perfmon *req = data;
+ struct panfrost_perfmon *perfmon;
+
+ mutex_lock(&pfile->perfmon.lock);
+ perfmon = idr_remove(&pfile->perfmon.idr, req->id);
+ mutex_unlock(&pfile->perfmon.lock);
+
+ if (!perfmon)
+ return -EINVAL;
+
+ panfrost_perfmon_put(perfmon);
+ return 0;
+}
+
+int panfrost_ioctl_get_perfmon_values(struct drm_device *dev, void *data,
+ struct drm_file *file_priv)
+{
+ struct panfrost_file_priv *pfile = file_priv->driver_priv;
+ struct drm_panfrost_get_perfmon_values *req = data;
+ struct panfrost_perfmon *perfmon;
+ unsigned int i;
+ int ret = 0;
+
+ mutex_lock(&pfile->perfmon.lock);
+ perfmon = idr_find(&pfile->perfmon.idr, req->id);
+ panfrost_perfmon_get(perfmon);
+ mutex_unlock(&pfile->perfmon.lock);
+
+ if (!perfmon)
+ return -EINVAL;
+
+ if (!(req->flags & DRM_PANFROST_GET_PERFMON_VALS_DONT_WAIT))
+ ret = wait_event_interruptible(perfmon->wq,
+ !atomic_read(&perfmon->busycnt));
+ else if (atomic_read(&perfmon->busycnt))
+ ret = -EBUSY;
+
+ if (ret)
+ goto out;
+
+ for (i = 0; i < PANFROST_NUM_BLOCKS; i++) {
+ unsigned int ncounters;
+
+ ncounters = hweight64(perfmon->counters[i].instances) *
+ hweight64(perfmon->counters[i].counters);
+ if (!ncounters)
+ continue;
+
+ if (copy_to_user(u64_to_user_ptr(req->values_ptrs[i]),
+ perfmon->values[i],
+ ncounters * sizeof(u32))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ if (req->flags & DRM_PANFROST_GET_PERFMON_VALS_RESET)
+ memset(perfmon->values[i], 0, ncounters * sizeof(u32));
+ }
+
+out:
+ panfrost_perfmon_put(perfmon);
+ return ret;
+}
+
+/*
+ * Returns true if the two perfcnt job contexts track exactly the same set of
+ * perfmons, false otherwise.
+ */
+static bool panfrost_perfcnt_job_ctx_cmp(struct panfrost_perfcnt_job_ctx *a,
+ struct panfrost_perfcnt_job_ctx *b)
+{
+ unsigned int i, j;
+
+ if (a->perfmon_count != b->perfmon_count)
+ return false;
+
+ for (i = 0; i < a->perfmon_count; i++) {
+ for (j = 0; j < b->perfmon_count; j++) {
+ if (a->perfmons[i] == b->perfmons[j])
+ break;
+ }
+
+ if (j == b->perfmon_count)
+ return false;
+ }
+
+ return true;
+}
+
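+/*
+ * The PRFCNT_*_EN registers expose one enable bit per group of 4 counters:
+ * bit N covers counters [4 * N, 4 * N + 3]. Collapse the 64-bit per-counter
+ * selection mask into that format.
+ */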
+static u32 counters_u64_to_u32(u64 in)
+{
+ unsigned int i;
+ u32 out = 0;
+
+ for (i = 0; i < 64; i += 4) {
+ if (GENMASK(i + 3, i) & in)
+ out |= BIT(i / 4);
+ }
+
+ return out;
+}
+
+void panfrost_perfcnt_run_job(struct panfrost_job *job)
+{
+ struct panfrost_perfcnt_job_ctx *ctx = job->perfcnt_ctx;
+ struct panfrost_device *pfdev = job->pfdev;
+ u32 perfcnt_en[PANFROST_NUM_BLOCKS] = { };
+ bool disable_perfcnt = true, config_changed = false;
+ unsigned int i, j;
+ u64 gpuva;
+ u32 cfg;
+
+ mutex_lock(&pfdev->perfcnt->cfg_lock);
+ for (i = 0; i < PANFROST_NUM_BLOCKS; i++) {
+ for (j = 0; j < ctx->perfmon_count; j++) {
+ u64 counters = ctx->perfmons[j]->counters[i].counters;
+
+ perfcnt_en[i] |= counters_u64_to_u32(counters);
+ }
+
+ if (perfcnt_en[i])
+ disable_perfcnt = false;
+
+ if (perfcnt_en[i] != pfdev->perfcnt->cur_cfg[i]) {
+ pfdev->perfcnt->cur_cfg[i] = perfcnt_en[i];
+ config_changed = true;
+ }
+ }
+ mutex_unlock(&pfdev->perfcnt->cfg_lock);
+
+ if (!config_changed)
+ return;
+
+ /*
+ * Always use address space 0 for now.
+ * FIXME: this needs to be updated when we start using different
+ * address spaces.
+ */
+ cfg = GPU_PERFCNT_CFG_AS(0);
+ if (panfrost_model_cmp(pfdev, 0x1000) >= 0)
+ cfg |= GPU_PERFCNT_CFG_SETSEL(1);
+
+ gpu_write(pfdev, GPU_PERFCNT_CFG,
+ cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF));
+
+ if (disable_perfcnt)
+ return;
+
+ gpu_write(pfdev, GPU_PRFCNT_JM_EN, perfcnt_en[PANFROST_JM_BLOCK]);
+ gpu_write(pfdev, GPU_PRFCNT_SHADER_EN,
+ perfcnt_en[PANFROST_SHADER_BLOCK]);
+ gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN,
+ perfcnt_en[PANFROST_MMU_L2_BLOCK]);
+ gpuva = pfdev->perfcnt->bo->node.start << PAGE_SHIFT;
+ gpu_write(pfdev, GPU_PERFCNT_BASE_LO, gpuva);
+ gpu_write(pfdev, GPU_PERFCNT_BASE_HI, gpuva >> 32);
+
+ /*
+ * Due to PRLAM-8186 we need to disable the Tiler before we enable HW
+ * counters.
+ */
+ if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
+ gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
+ else
+ gpu_write(pfdev, GPU_PRFCNT_TILER_EN,
+ perfcnt_en[PANFROST_TILER_BLOCK]);
+
+ gpu_write(pfdev, GPU_PERFCNT_CFG,
+ cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_MANUAL));
+
+ if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
+ gpu_write(pfdev, GPU_PRFCNT_TILER_EN,
+ perfcnt_en[PANFROST_TILER_BLOCK]);
+}
+
+static void
+panfrost_perfcnt_release_job_ctx(struct panfrost_perfcnt_job_ctx *ctx)
+{
+ unsigned int i;
+
+ WARN_ON(refcount_read(&ctx->refcount));
+ for (i = 0; i < ctx->perfmon_count; i++) {
+ /* Entries might be missing if job ctx creation failed midway. */
+ if (!ctx->perfmons || !ctx->perfmons[i])
+ continue;
+
+ if (atomic_dec_and_test(&ctx->perfmons[i]->busycnt))
+ wake_up(&ctx->perfmons[i]->wq);
+ panfrost_perfmon_put(ctx->perfmons[i]);
+ }
+
+ dma_fence_put(ctx->wait_fence);
+ dma_fence_put(ctx->done_fence);
+ kfree(ctx->perfmons);
+ kfree(ctx);
+}
+
+static void panfrost_perfcnt_put_job_ctx(struct panfrost_perfcnt_job_ctx *ctx)
+{
+ if (!IS_ERR_OR_NULL(ctx) && refcount_dec_and_test(&ctx->refcount))
+ panfrost_perfcnt_release_job_ctx(ctx);
+}
+
+struct panfrost_perfcnt_job_ctx *
+panfrost_perfcnt_get_job_ctx(struct panfrost_perfcnt_job_ctx *ctx)
+{
+ if (ctx)
+ refcount_inc(&ctx->refcount);
+
+ return ctx;
+}
+
+static void panfrost_perfcnt_dump_done(struct panfrost_perfcnt_job_ctx *ctx)
+{
+ struct panfrost_device *pfdev;
+ unsigned long flags;
+
+ pfdev = ctx->pfdev;
+ spin_lock_irqsave(&pfdev->perfcnt->ctx_lock, flags);
+ pfdev->perfcnt->dump_ctx = NULL;
+ if (pfdev->perfcnt->last_ctx == ctx)
+ pfdev->perfcnt->last_ctx = NULL;
+ spin_unlock_irqrestore(&pfdev->perfcnt->ctx_lock, flags);
+
+ dma_fence_signal(ctx->done_fence);
+ panfrost_perfcnt_release_job_ctx(ctx);
+}
+
+static void
+panfrost_perfcnt_get_counter_vals(struct panfrost_device *pfdev,
+ enum drm_panfrost_block_id block,
+ unsigned int instance, u32 *vals)
+{
+ u64 shader_present = pfdev->features.shader_present;
+ unsigned int bufoffs, shaderid, shadernum;
+
+ if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) {
+ unsigned int ncoregroups;
+
+ ncoregroups = hweight64(pfdev->features.l2_present);
+
+ switch (block) {
+ case PANFROST_SHADER_BLOCK:
+ for (shaderid = 0, shadernum = 0; shaderid < 64;
+ shaderid++) {
+ if (!(BIT_ULL(shaderid) & shader_present))
+ continue;
+
+ if (shadernum == instance)
+ break;
+
+ shadernum++;
+ }
+
+ if (WARN_ON(shaderid == 64))
+ return;
+
+ /* 4 shaders per core group. */
+ bufoffs = ((shaderid / V4_SHADERS_PER_COREGROUP) *
+ 2048) +
+ ((shaderid % V4_SHADERS_PER_COREGROUP) *
+ 256);
+ break;
+
+ case PANFROST_TILER_BLOCK:
+ if (WARN_ON(instance >= ncoregroups))
+ return;
+
+ bufoffs = (instance * 2048) + 1024;
+ break;
+ case PANFROST_MMU_L2_BLOCK:
+ if (WARN_ON(instance >= ncoregroups))
+ return;
+
+ bufoffs = (instance * 2048) + 1280;
+ break;
+ case PANFROST_JM_BLOCK:
+ if (WARN_ON(instance))
+ return;
+ bufoffs = 1792;
+ break;
+ default:
+ WARN_ON(1);
+ return;
+ }
+ } else {
+ unsigned int nl2c, ncores;
+
+ /*
+ * TODO: define a macro to extract the number of l2 caches from
+ * mem_features.
+ */
+ nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
+
+ /*
+ * The ARM driver is grouping cores per core group and then
+ * only using the number of cores in group 0 to calculate the
+ * size. Not sure why this is done like that, but I guess
+ * shader_present will only show cores in the first group
+ * anyway.
+ */
+ ncores = hweight64(pfdev->features.shader_present);
+
+ switch (block) {
+ case PANFROST_SHADER_BLOCK:
+ for (shaderid = 0, shadernum = 0; shaderid < 64;
+ shaderid++) {
+ if (!(BIT_ULL(shaderid) & shader_present))
+ continue;
+
+ if (shadernum == instance)
+ break;
+
+ shadernum++;
+ }
+
+ if (WARN_ON(shaderid == 64))
+ return;
+
+ /* Shader blocks are placed after the JM, Tiler and L2 blocks. */
+ bufoffs = 512 + ((nl2c + shaderid) * 256);
+ break;
+
+ case PANFROST_TILER_BLOCK:
+ if (WARN_ON(instance))
+ return;
+
+ bufoffs = 256;
+ break;
+ case PANFROST_MMU_L2_BLOCK:
+ if (WARN_ON(instance >= nl2c))
+ return;
+
+ bufoffs = 512 + (instance * 256);
+ break;
+ case PANFROST_JM_BLOCK:
+ if (WARN_ON(instance))
+ return;
+ bufoffs = 0;
+ break;
+ default:
+ WARN_ON(1);
+ return;
+ }
+ }
+
+ memcpy(vals, pfdev->perfcnt->buf + bufoffs, 256);
+}
+
+static void
+panfrost_perfmon_upd_counter_vals(struct panfrost_perfmon *perfmon,
+ enum drm_panfrost_block_id block,
+ unsigned int instance, u32 *invals)
+{
+ u32 *outvals = perfmon->values[block];
+ unsigned int inidx, outidx;
+
+ if (WARN_ON(instance >= hweight64(perfmon->counters[block].instances)))
+ return;
+
+ if (!(perfmon->counters[block].instances & BIT_ULL(instance)))
+ return;
+
+ outvals += instance * hweight64(perfmon->counters[block].counters);
+ for (inidx = 0, outidx = 0; inidx < 64; inidx++) {
+ if (!(perfmon->counters[block].counters & BIT_ULL(inidx)))
+ continue;
+
+ if (U32_MAX - outvals[outidx] < invals[inidx])
+ outvals[outidx] = U32_MAX;
+ else
+ outvals[outidx] += invals[inidx];
+ outidx++;
+ }
+}
+
+static void panfrost_perfcnt_dump_work(struct work_struct *w)
+{
+ struct panfrost_perfcnt *perfcnt = container_of(w,
+ struct panfrost_perfcnt,
+ dumpwork);
+ struct panfrost_perfcnt_job_ctx *ctx = perfcnt->dump_ctx;
+ unsigned int block, instance, pmonidx, num;
+
+ if (!ctx)
+ return;
+
+ for (block = 0; block < PANFROST_NUM_BLOCKS; block++) {
+ struct panfrost_perfmon *perfmon;
+ u32 vals[COUNTERS_PER_BLOCK];
+ u64 instances = 0;
+
+ for (pmonidx = 0; pmonidx < ctx->perfmon_count; pmonidx++) {
+ perfmon = ctx->perfmons[pmonidx];
+ instances |= perfmon->counters[block].instances;
+ }
+
+ for (instance = 0, num = 0; instance < 64; instance++) {
+ if (!(instances & BIT_ULL(instance)))
+ continue;
+
+ panfrost_perfcnt_get_counter_vals(ctx->pfdev, block,
+ instance, vals);
+
+ for (pmonidx = 0; pmonidx < ctx->perfmon_count;
+ pmonidx++) {
+ perfmon = ctx->perfmons[pmonidx];
+ panfrost_perfmon_upd_counter_vals(perfmon,
+ block,
+ num,
+ vals);
+ }
+ num++;
+ }
+ }
+
+ panfrost_perfcnt_dump_done(ctx);
+}
+
+void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev)
+{
+ schedule_work(&pfdev->perfcnt->dumpwork);
+}
+
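+/*
+ * A PERFCNT_SAMPLE_COMPLETED IRQ means the GPU has finished dumping the
+ * counters to memory, but the dump buffer might still sit in the GPU caches:
+ * issue a clean before reading it from the CPU. The dump work is scheduled
+ * from the CLEAN_CACHES_COMPLETED handler above.
+ */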
+void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev)
+{
+ gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_CACHES);
+}
+
+void panfrost_perfcnt_clean_job_ctx(struct panfrost_job *job)
+{
+ panfrost_perfcnt_put_job_ctx(job->perfcnt_ctx);
+}
+
+int panfrost_perfcnt_create_job_ctx(struct panfrost_job *job,
+ struct drm_file *file_priv,
+ struct drm_panfrost_submit *args)
+{
+ struct panfrost_device *pfdev = job->pfdev;
+ struct panfrost_file_priv *pfile = file_priv->driver_priv;
+ struct panfrost_perfcnt_job_ctx *ctx;
+ unsigned int i, j;
+ u32 *handles;
+ int ret;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->pfdev = pfdev;
+ refcount_set(&ctx->refcount, 1);
+
+ ctx->perfmon_count = args->perfmon_handle_count;
+ if (!ctx->perfmon_count) {
+ job->perfcnt_ctx = ctx;
+ return 0;
+ }
+
+ handles = kcalloc(ctx->perfmon_count, sizeof(u32), GFP_KERNEL);
+ if (!handles) {
+ ret = -ENOMEM;
+ goto err_put_ctx;
+ }
+
+ if (copy_from_user(handles,
+ u64_to_user_ptr(args->perfmon_handles),
+ ctx->perfmon_count * sizeof(u32))) {
+ ret = -EFAULT;
+ DRM_DEBUG("Failed to copy in perfmon handles\n");
+ goto err_free_handles;
+ }
+
+ /* Make sure each perfmon only appears once. */
+ for (i = 0; i < ctx->perfmon_count - 1; i++) {
+ for (j = i + 1; j < ctx->perfmon_count; j++) {
+ if (handles[i] == handles[j]) {
+ ret = -EINVAL;
+ goto err_free_handles;
+ }
+ }
+ }
+
+ ctx->perfmons = kcalloc(ctx->perfmon_count, sizeof(*ctx->perfmons),
+ GFP_KERNEL);
+ if (!ctx->perfmons) {
+ ret = -ENOMEM;
+ goto err_free_handles;
+ }
+
+ for (i = 0; i < ctx->perfmon_count; i++) {
+ ctx->perfmons[i] = panfrost_perfcnt_find_perfmon(pfile,
+ handles[i]);
+ if (!ctx->perfmons[i]) {
+ ret = -EINVAL;
+ goto err_free_handles;
+ }
+ atomic_inc(&ctx->perfmons[i]->busycnt);
+ }
+
+ job->perfcnt_ctx = ctx;
+ kfree(handles);
+ return 0;
+
+err_free_handles:
+ kfree(handles);
+
+err_put_ctx:
+ panfrost_perfcnt_put_job_ctx(ctx);
+ return ret;
+}
+
+void panfrost_perfcnt_finish_job(struct panfrost_job *job, bool skip_dump)
+{
+ struct panfrost_perfcnt_job_ctx *ctx = job->perfcnt_ctx;
+
+ if (WARN_ON(!ctx))
+ return;
+
+ job->perfcnt_ctx = NULL;
+ if (!refcount_dec_and_test(&ctx->refcount))
+ return;
+
+ if (!ctx->perfmon_count || skip_dump) {
+ panfrost_perfcnt_dump_done(ctx);
+ return;
+ }
+
+ ctx->pfdev->perfcnt->dump_ctx = ctx;
+ gpu_write(ctx->pfdev, GPU_CMD, GPU_CMD_PERFCNT_SAMPLE);
+}
+
+static bool panfrost_perfcnt_try_reuse_last_job_ctx(struct panfrost_job *job)
+{
+ struct panfrost_perfcnt_job_ctx *prev_ctx, *new_ctx;
+ struct panfrost_device *pfdev = job->pfdev;
+ unsigned int i;
+
+ new_ctx = job->perfcnt_ctx;
+ prev_ctx = pfdev->perfcnt->last_ctx;
+ if (!prev_ctx)
+ return false;
+
+ if (!refcount_inc_not_zero(&prev_ctx->refcount))
+ return false;
+
+ if (!panfrost_perfcnt_job_ctx_cmp(prev_ctx, new_ctx)) {
+ refcount_dec(&prev_ctx->refcount);
+ return false;
+ }
+
+ /*
+ * Make sure we increment busycnt, as panfrost_perfcnt_put_job_ctx()
+ * will decrement it.
+ */
+ for (i = 0; i < prev_ctx->perfmon_count; i++)
+ atomic_inc(&prev_ctx->perfmons[i]->busycnt);
+
+ panfrost_perfcnt_put_job_ctx(new_ctx);
+ job->perfcnt_ctx = prev_ctx;
+ job->perfcnt_fence = dma_fence_get(prev_ctx->wait_fence);
+ return true;
+}
+
+int panfrost_perfcnt_push_job(struct panfrost_job *job)
+{
+ struct panfrost_perfcnt_job_ctx *prev_ctx, *new_ctx;
+ struct panfrost_device *pfdev = job->pfdev;
+ struct dma_fence *done_fence;
+ unsigned long flags;
+ int ret = 0;
+
+ /*
+ * Create the done fence before taking ctx_lock: fence creation
+ * allocates memory with GFP_KERNEL and thus must not happen under a
+ * spinlock. If we end up reusing the previous context, the fence is
+ * simply released.
+ */
+ done_fence = panfrost_perfcnt_fence_create(pfdev);
+ if (IS_ERR(done_fence))
+ return PTR_ERR(done_fence);
+
+ spin_lock_irqsave(&pfdev->perfcnt->ctx_lock, flags);
+ new_ctx = job->perfcnt_ctx;
+ prev_ctx = pfdev->perfcnt->last_ctx;
+ /*
+ * In order to keep things relatively fast even when HW counters are
+ * enabled, we try to avoid dumping perfcounters at the end of each
+ * job (which implies making other jobs wait for this dump to finish)
+ * when that's possible.
+ * This is only acceptable if all queued jobs share the same perfctx,
+ * that is, they have the same list of perfmons attached to them. In
+ * this condition we are guaranteed that nothing will increment the
+ * counters behind our back.
+ */
+ if (panfrost_perfcnt_try_reuse_last_job_ctx(job)) {
+ dma_fence_put(done_fence);
+ goto out;
+ }
+
+ new_ctx->done_fence = done_fence;
+
+ /*
+ * The previous job has a different perfmon ctx, so we must wait for it
+ * to be done dumping the counters before we can schedule this new job,
+ * otherwise we might corrupt the counter values.
+ */
+ if (prev_ctx)
+ new_ctx->wait_fence = dma_fence_get(prev_ctx->done_fence);
+
+ job->perfcnt_fence = dma_fence_get(new_ctx->wait_fence);
+ pfdev->perfcnt->last_ctx = new_ctx;
+
+out:
+ spin_unlock_irqrestore(&pfdev->perfcnt->ctx_lock, flags);
+ return ret;
+}
+
+int panfrost_perfcnt_init(struct panfrost_device *pfdev)
+{
+ struct panfrost_perfcnt *perfcnt;
+ struct drm_gem_shmem_object *bo;
+ size_t size;
+ u32 status;
+ int ret;
+
+ if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) {
+ unsigned int ncoregroups;
+
+ ncoregroups = hweight64(pfdev->features.l2_present);
+ size = ncoregroups * BLOCKS_PER_COREGROUP *
+ COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
+ } else {
+ unsigned int nl2c, ncores;
+
+ /*
+ * TODO: define a macro to extract the number of l2 caches from
+ * mem_features.
+ */
+ nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
+
+ /*
+ * The ARM driver is grouping cores per core group and then
+ * only using the number of cores in group 0 to calculate the
+ * size. Not sure why this is done like that, but I guess
+ * shader_present will only show cores in the first group
+ * anyway.
+ */
+ ncores = hweight64(pfdev->features.shader_present);
+
+ /*
+ * There's always one JM and one Tiler block, hence the '+ 2'
+ * here.
+ */
+ size = (nl2c + ncores + 2) *
+ COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
+ }
+
+ perfcnt = devm_kzalloc(pfdev->dev, sizeof(*perfcnt), GFP_KERNEL);
+ if (!perfcnt)
+ return -ENOMEM;
+
+ bo = drm_gem_shmem_create(pfdev->ddev, size);
+ if (IS_ERR(bo))
+ return PTR_ERR(bo);
+
+ perfcnt->bo = to_panfrost_bo(&bo->base);
+
+ /*
+ * We always use the same buffer, so let's map it once and keep it
+ * mapped until the driver is unloaded. This might be a problem if
+ * we start using different AS and the perfcnt BO is not mapped at
+ * the same GPU virtual address.
+ */
+ ret = panfrost_mmu_map(perfcnt->bo);
+ if (ret)
+ goto err_put_bo;
+
+ /* Disable everything. */
+ gpu_write(pfdev, GPU_PERFCNT_CFG,
+ GPU_PERFCNT_CFG_AS(0) |
+ GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF) |
+ (panfrost_model_cmp(pfdev, 0x1000) >= 0 ?
+ GPU_PERFCNT_CFG_SETSEL(1) : 0));
+ gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0);
+ gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0);
+ gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0);
+ gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
+
+ perfcnt->buf = drm_gem_vmap(&bo->base);
+ if (IS_ERR(perfcnt->buf)) {
+ ret = PTR_ERR(perfcnt->buf);
+ goto err_put_bo;
+ }
+
+ INIT_WORK(&perfcnt->dumpwork, panfrost_perfcnt_dump_work);
+ mutex_init(&perfcnt->cfg_lock);
+ spin_lock_init(&perfcnt->fence_lock);
+ spin_lock_init(&perfcnt->ctx_lock);
+ perfcnt->fence_context = dma_fence_context_alloc(1);
+ pfdev->perfcnt = perfcnt;
+
+ /*
+ * Invalidate the cache and clear the counters to start from a fresh
+ * state.
+ */
+ gpu_write(pfdev, GPU_INT_MASK, 0);
+ gpu_write(pfdev, GPU_INT_CLEAR, GPU_IRQ_CLEAN_CACHES_COMPLETED);
+
+ gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_CLEAR);
+ gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_INV_CACHES);
+ ret = readl_relaxed_poll_timeout(pfdev->iomem + GPU_INT_RAWSTAT,
+ status,
+ status &
+ GPU_IRQ_CLEAN_CACHES_COMPLETED,
+ 100, 10000);
+ if (ret)
+ goto err_gem_vunmap;
+
+ gpu_write(pfdev, GPU_INT_MASK, GPU_IRQ_MASK_ALL);
+
+ return 0;
+
+err_gem_vunmap:
+ drm_gem_vunmap(&pfdev->perfcnt->bo->base.base, pfdev->perfcnt->buf);
+
+err_put_bo:
+ drm_gem_object_put_unlocked(&bo->base);
+ return ret;
+}
+
+void panfrost_perfcnt_fini(struct panfrost_device *pfdev)
+{
+ drm_gem_vunmap(&pfdev->perfcnt->bo->base.base, pfdev->perfcnt->buf);
+ drm_gem_object_put_unlocked(&pfdev->perfcnt->bo->base.base);
+}
diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.h b/drivers/gpu/drm/panfrost/panfrost_perfcnt.h
new file mode 100644
index 000000000000..7cbfeb072aa1
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2019 Collabora Ltd */
+#ifndef __PANFROST_PERFCNT_H__
+#define __PANFROST_PERFCNT_H__
+
+#include <linux/bitops.h>
+
+struct panfrost_perfcnt_job_ctx;
+
+#define PERFCNT(_shader, _tiler, _mmu_l2, _jm) \
+ { _shader, _tiler, _mmu_l2, _jm }
+#define NO_PERFCNT PERFCNT(0, 0, 0, 0)
+
+/* FIXME: Declare counters for all models */
+#define hw_perfcnt_t600 NO_PERFCNT
+#define hw_perfcnt_t620 NO_PERFCNT
+#define hw_perfcnt_t720 NO_PERFCNT
+#define hw_perfcnt_t760 NO_PERFCNT
+#define hw_perfcnt_t820 NO_PERFCNT
+#define hw_perfcnt_t830 NO_PERFCNT
+#define hw_perfcnt_t860 NO_PERFCNT
+#define hw_perfcnt_t880 NO_PERFCNT
+#define hw_perfcnt_g76 NO_PERFCNT
+#define hw_perfcnt_g71 NO_PERFCNT
+#define hw_perfcnt_g72 NO_PERFCNT
+#define hw_perfcnt_g51 NO_PERFCNT
+#define hw_perfcnt_g52 NO_PERFCNT
+#define hw_perfcnt_g31 NO_PERFCNT
+
+void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev);
+void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev);
+int panfrost_perfcnt_push_job(struct panfrost_job *job);
+void panfrost_perfcnt_run_job(struct panfrost_job *job);
+void panfrost_perfcnt_finish_job(struct panfrost_job *job,
+ bool skip_dump);
+void panfrost_perfcnt_clean_job_ctx(struct panfrost_job *job);
+int panfrost_perfcnt_create_job_ctx(struct panfrost_job *job,
+ struct drm_file *file_priv,
+ struct drm_panfrost_submit *args);
+void panfrost_perfcnt_open(struct panfrost_file_priv *pfile);
+void panfrost_perfcnt_close(struct panfrost_file_priv *pfile);
+int panfrost_perfcnt_init(struct panfrost_device *pfdev);
+void panfrost_perfcnt_fini(struct panfrost_device *pfdev);
+
+int panfrost_ioctl_get_perfcnt_layout(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
+int panfrost_ioctl_create_perfmon(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
+int panfrost_ioctl_destroy_perfmon(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
+int panfrost_ioctl_get_perfmon_values(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
+
+#endif
diff --git a/drivers/gpu/drm/panfrost/panfrost_regs.h b/drivers/gpu/drm/panfrost/panfrost_regs.h
index 42d08860fd76..ea38ac60581c 100644
--- a/drivers/gpu/drm/panfrost/panfrost_regs.h
+++ b/drivers/gpu/drm/panfrost/panfrost_regs.h
@@ -44,12 +44,31 @@
GPU_IRQ_MULTIPLE_FAULT)
#define GPU_CMD 0x30
#define GPU_CMD_SOFT_RESET 0x01
+#define GPU_CMD_PERFCNT_CLEAR 0x03
+#define GPU_CMD_PERFCNT_SAMPLE 0x04
+#define GPU_CMD_CLEAN_CACHES 0x07
+#define GPU_CMD_CLEAN_INV_CACHES 0x08
#define GPU_STATUS 0x34
+#define GPU_STATUS_PRFCNT_ACTIVE BIT(2)
#define GPU_LATEST_FLUSH_ID 0x38
#define GPU_FAULT_STATUS 0x3C
#define GPU_FAULT_ADDRESS_LO 0x40
#define GPU_FAULT_ADDRESS_HI 0x44
+#define GPU_PERFCNT_BASE_LO 0x60
+#define GPU_PERFCNT_BASE_HI 0x64
+#define GPU_PERFCNT_CFG 0x68
+#define GPU_PERFCNT_CFG_MODE(x) (x)
+#define GPU_PERFCNT_CFG_MODE_OFF 0
+#define GPU_PERFCNT_CFG_MODE_MANUAL 1
+#define GPU_PERFCNT_CFG_MODE_TILE 2
+#define GPU_PERFCNT_CFG_AS(x) ((x) << 4)
+#define GPU_PERFCNT_CFG_SETSEL(x) ((x) << 8)
+#define GPU_PRFCNT_JM_EN 0x6c
+#define GPU_PRFCNT_SHADER_EN 0x70
+#define GPU_PRFCNT_TILER_EN 0x74
+#define GPU_PRFCNT_MMU_L2_EN 0x7c
+
#define GPU_THREAD_MAX_THREADS 0x0A0 /* (RO) Maximum number of threads per core */
#define GPU_THREAD_MAX_WORKGROUP_SIZE 0x0A4 /* (RO) Maximum workgroup size */
#define GPU_THREAD_MAX_BARRIER_SIZE 0x0A8 /* (RO) Maximum threads waiting at a barrier */
diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h
index 508b9621d9db..e09b35bf6035 100644
--- a/include/uapi/drm/panfrost_drm.h
+++ b/include/uapi/drm/panfrost_drm.h
@@ -18,6 +18,10 @@ extern "C" {
#define DRM_PANFROST_MMAP_BO 0x03
#define DRM_PANFROST_GET_PARAM 0x04
#define DRM_PANFROST_GET_BO_OFFSET 0x05
+#define DRM_PANFROST_GET_PERFCNT_LAYOUT 0x06
+#define DRM_PANFROST_CREATE_PERFMON 0x07
+#define DRM_PANFROST_DESTROY_PERFMON 0x08
+#define DRM_PANFROST_GET_PERFMON_VALUES 0x09
#define DRM_IOCTL_PANFROST_SUBMIT DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_SUBMIT, struct drm_panfrost_submit)
#define DRM_IOCTL_PANFROST_WAIT_BO DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_WAIT_BO, struct drm_panfrost_wait_bo)
@@ -25,6 +29,10 @@ extern "C" {
#define DRM_IOCTL_PANFROST_MMAP_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_MMAP_BO, struct drm_panfrost_mmap_bo)
#define DRM_IOCTL_PANFROST_GET_PARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PARAM, struct drm_panfrost_get_param)
#define DRM_IOCTL_PANFROST_GET_BO_OFFSET DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_BO_OFFSET, struct drm_panfrost_get_bo_offset)
+#define DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PERFCNT_LAYOUT, struct drm_panfrost_get_perfcnt_layout)
+#define DRM_IOCTL_PANFROST_CREATE_PERFMON DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_CREATE_PERFMON, struct drm_panfrost_create_perfmon)
+#define DRM_IOCTL_PANFROST_DESTROY_PERFMON DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_DESTROY_PERFMON, struct drm_panfrost_destroy_perfmon)
+#define DRM_IOCTL_PANFROST_GET_PERFMON_VALUES DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PERFMON_VALUES, struct drm_panfrost_get_perfmon_values)
#define PANFROST_JD_REQ_FS (1 << 0)
/**
@@ -55,6 +63,15 @@ struct drm_panfrost_submit {
/** A combination of PANFROST_JD_REQ_* */
__u32 requirements;
+
+ /** Pointer to a u32 array of perfmon handles to attach to the job. */
+ __u64 perfmon_handles;
+
+ /** Number of perfmon handles passed in (size is that times 4). */
+ __u32 perfmon_handle_count;
+
+ /** Unused field, should be set to 0. */
+ __u32 padding;
};
/**
@@ -133,6 +150,111 @@ struct drm_panfrost_get_bo_offset {
__u64 offset;
};
+/**
+ * Panfrost HW block ids used to group HW counters. There might be several
+ * shader, tiler and MMU/L2 blocks in a given GPU. How many of them are
+ * available is exposed through the instances field of
+ * drm_panfrost_block_perfcounters.
+ */
+enum drm_panfrost_block_id {
+ PANFROST_SHADER_BLOCK,
+ PANFROST_TILER_BLOCK,
+ PANFROST_MMU_L2_BLOCK,
+ PANFROST_JM_BLOCK,
+ PANFROST_NUM_BLOCKS,
+};
+
+struct drm_panfrost_block_perfcounters {
+ /*
+ * For DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, encodes the available
+ * instances for a given block type.
+ * For DRM_IOCTL_PANFROST_CREATE_PERFMON, encodes the instances the
+ * user wants to monitor.
+ * Note: the bitmap might be sparse.
+ */
+ __u64 instances;
+
+ /*
+ * For DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, encodes the available
+ * counters attached to a specific block type.
+ * For DRM_IOCTL_PANFROST_CREATE_PERFMON, encodes the counters the user
+ * wants to monitor.
+ * Note: the bitmap might be sparse.
+ */
+ __u64 counters;
+};
+
+/**
+ * Used to retrieve available HW counters.
+ */
+struct drm_panfrost_get_perfcnt_layout {
+ struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
+};
+
+/**
+ * Used to create a performance monitor. Each performance monitor is assigned
+ * an ID that can later be passed when submitting a job to capture hardware
+ * counter values (and thus count things related to this specific job).
+ * Performance monitors are attached to the GPU file descriptor and IDs are
+ * unique within this context, not across all GPU users.
+ * This implies that:
+ * - perfmons are automatically released when the FD is closed
+ * - perfmons can't be shared across GPU contexts
+ */
+struct drm_panfrost_create_perfmon {
+ /* Input Fields. */
+ /* List all HW counters this performance monitor should track. */
+ struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
+
+ /* Output fields. */
+ /* ID of the newly created perfmon. */
+ __u32 id;
+
+ /* Padding: must be set to 0. */
+ __u32 padding;
+};
+
+/**
+ * Destroy an existing performance monitor.
+ */
+struct drm_panfrost_destroy_perfmon {
+ /*
+ * ID of the perfmon to destroy (the one returned by
+ * DRM_IOCTL_PANFROST_CREATE_PERFMON)
+ */
+ __u32 id;
+};
+
+/*
+ * Don't wait when trying to get perfmon values. If the perfmon is still active
+ * (still attached to a queued or running job), EBUSY is returned.
+ */
+#define DRM_PANFROST_GET_PERFMON_VALS_DONT_WAIT 0x1
+
+/* Reset all perfmon values to zero after reading them. */
+#define DRM_PANFROST_GET_PERFMON_VALS_RESET 0x2
+
+/**
+ * Used to query values collected by a performance monitor.
+ */
+struct drm_panfrost_get_perfmon_values {
+ /* ID of the perfmon to query values on. */
+ __u32 id;
+
+ /* See DRM_PANFROST_GET_PERFMON_VALS_XXX flags */
+ __u32 flags;
+
+ /*
+ * An array of userspace pointers to u32 buffers that counter values
+ * will be copied to.
+ * The array sizes depend on the counters/instances activated at
+ * perfmon creation time: hweight64(instances) * hweight64(counters).
+ * Note that some entries in values_ptrs[] might be NULL if no counters
+ * on a specific block were activated.
+ */
+ __u64 values_ptrs[PANFROST_NUM_BLOCKS];
+};
+
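+/*
+ * Illustrative example (not part of the uAPI itself): a perfmon created with
+ * counters[PANFROST_JM_BLOCK].instances = 0x1 and
+ * counters[PANFROST_JM_BLOCK].counters = 0x30 monitors 2 counters on a
+ * single instance, so values_ptrs[PANFROST_JM_BLOCK] must point to an array
+ * of hweight64(0x1) * hweight64(0x30) = 2 u32 values.
+ */
+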
#if defined(__cplusplus)
}
#endif
--
2.20.1