[PATCH 2/3] drm/panfrost: Expose HW counters to userspace
Boris Brezillon
boris.brezillon at collabora.com
Thu Apr 4 15:20:50 UTC 2019
Add the necessary infrastructure to expose GPU counters to userspace.
This takes the form of 4 new ioctls to:
- query the available counters
- create/destroy a performance monitor
- retrieve its values
The drm_panfrost_submit struct is extended to pass a list of perfmons
to attach to a job, which means perfmons will only track changes caused
by the jobs they are attached to.
Signed-off-by: Boris Brezillon <boris.brezillon at collabora.com>
---
drivers/gpu/drm/panfrost/Makefile | 3 +-
drivers/gpu/drm/panfrost/panfrost_device.c | 8 +
drivers/gpu/drm/panfrost/panfrost_device.h | 11 +
drivers/gpu/drm/panfrost/panfrost_drv.c | 22 +-
drivers/gpu/drm/panfrost/panfrost_gpu.c | 43 +-
drivers/gpu/drm/panfrost/panfrost_job.c | 24 +
drivers/gpu/drm/panfrost/panfrost_job.h | 4 +
drivers/gpu/drm/panfrost/panfrost_perfcnt.c | 954 ++++++++++++++++++++
drivers/gpu/drm/panfrost/panfrost_perfcnt.h | 54 ++
drivers/gpu/drm/panfrost/panfrost_regs.h | 19 +
include/uapi/drm/panfrost_drm.h | 122 +++
11 files changed, 1260 insertions(+), 4 deletions(-)
create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.c
create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.h
diff --git a/drivers/gpu/drm/panfrost/Makefile b/drivers/gpu/drm/panfrost/Makefile
index d07e0971b687..31cfb9d25682 100644
--- a/drivers/gpu/drm/panfrost/Makefile
+++ b/drivers/gpu/drm/panfrost/Makefile
@@ -6,6 +6,7 @@ panfrost-y := \
panfrost_gem.o \
panfrost_gpu.o \
panfrost_job.o \
- panfrost_mmu.o
+ panfrost_mmu.o \
+ panfrost_perfcnt.o
obj-$(CONFIG_DRM_PANFROST) += panfrost.o
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c
index 148b5caa2322..f6a87bfa486b 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.c
+++ b/drivers/gpu/drm/panfrost/panfrost_device.c
@@ -13,6 +13,7 @@
#include "panfrost_gpu.h"
#include "panfrost_job.h"
#include "panfrost_mmu.h"
+#include "panfrost_perfcnt.h"
static int panfrost_reset_init(struct panfrost_device *pfdev)
{
@@ -147,7 +148,13 @@ int panfrost_device_init(struct panfrost_device *pfdev)
pm_runtime_mark_last_busy(pfdev->dev);
pm_runtime_put_autosuspend(pfdev->dev);
+ err = panfrost_perfcnt_init(pfdev);
+ if (err)
+ goto err_out5;
+
return 0;
+err_out5:
+ panfrost_job_fini(pfdev);
err_out4:
panfrost_mmu_fini(pfdev);
err_out3:
@@ -163,6 +170,7 @@ int panfrost_device_init(struct panfrost_device *pfdev)
void panfrost_device_fini(struct panfrost_device *pfdev)
{
+ panfrost_perfcnt_fini(pfdev);
panfrost_job_fini(pfdev);
panfrost_mmu_fini(pfdev);
panfrost_gpu_fini(pfdev);
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h
index a821b50a14c3..f7c4e9e55f1b 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.h
+++ b/drivers/gpu/drm/panfrost/panfrost_device.h
@@ -9,11 +9,13 @@
#include <drm/drm_device.h>
#include <drm/drm_mm.h>
#include <drm/gpu_scheduler.h>
+#include <drm/panfrost_drm.h>
struct panfrost_device;
struct panfrost_mmu;
struct panfrost_job_slot;
struct panfrost_job;
+struct panfrost_perfcnt;
#define NUM_JOB_SLOTS 3
@@ -45,6 +47,8 @@ struct panfrost_features {
unsigned long hw_features[64 / BITS_PER_LONG];
unsigned long hw_issues[64 / BITS_PER_LONG];
+
+ struct drm_panfrost_block_perfcounters perfcnt_layout[PANFROST_NUM_BLOCKS];
};
struct panfrost_device {
@@ -70,6 +74,8 @@ struct panfrost_device {
struct panfrost_job *jobs[NUM_JOB_SLOTS];
struct list_head scheduled_jobs;
+ struct panfrost_perfcnt *perfcnt;
+
struct mutex sched_lock;
};
@@ -77,6 +83,11 @@ struct panfrost_file_priv {
struct panfrost_device *pfdev;
struct drm_sched_entity sched_entity[NUM_JOB_SLOTS];
+
+ struct {
+ struct idr idr;
+ struct mutex lock;
+ } perfmon;
};
static inline struct panfrost_device *to_panfrost_device(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
index 8cffb70a3548..e5375b31627f 100644
--- a/drivers/gpu/drm/panfrost/panfrost_drv.c
+++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
@@ -19,6 +19,7 @@
#include "panfrost_mmu.h"
#include "panfrost_job.h"
#include "panfrost_gpu.h"
+#include "panfrost_perfcnt.h"
static int panfrost_ioctl_get_param(struct drm_device *ddev, void *data, struct drm_file *file)
{
@@ -219,6 +220,10 @@ static int panfrost_ioctl_submit(struct drm_device *dev, void *data,
if (ret)
goto fail;
+ ret = panfrost_perfcnt_create_job_ctx(job, file, args);
+ if (ret)
+ goto fail;
+
ret = panfrost_job_push(job);
if (ret)
goto fail;
@@ -313,6 +318,7 @@ panfrost_open(struct drm_device *dev, struct drm_file *file)
{
struct panfrost_device *pfdev = dev->dev_private;
struct panfrost_file_priv *panfrost_priv;
+ int ret;
panfrost_priv = kzalloc(sizeof(*panfrost_priv), GFP_KERNEL);
if (!panfrost_priv)
@@ -321,7 +327,16 @@ panfrost_open(struct drm_device *dev, struct drm_file *file)
panfrost_priv->pfdev = pfdev;
file->driver_priv = panfrost_priv;
- return panfrost_job_open(panfrost_priv);
+ ret = panfrost_job_open(panfrost_priv);
+ if (ret)
+ goto err_free_priv;
+
+ panfrost_perfcnt_open(panfrost_priv);
+ return 0;
+
+err_free_priv:
+ kfree(panfrost_priv);
+ return ret;
}
static void
@@ -329,6 +344,7 @@ panfrost_postclose(struct drm_device *dev, struct drm_file *file)
{
struct panfrost_file_priv *panfrost_priv = file->driver_priv;
+ panfrost_perfcnt_close(panfrost_priv);
panfrost_job_close(panfrost_priv);
kfree(panfrost_priv);
@@ -348,6 +364,10 @@ static const struct drm_ioctl_desc panfrost_drm_driver_ioctls[] = {
PANFROST_IOCTL(MMAP_BO, mmap_bo, DRM_RENDER_ALLOW),
PANFROST_IOCTL(GET_PARAM, get_param, DRM_RENDER_ALLOW),
PANFROST_IOCTL(GET_BO_OFFSET, get_bo_offset, DRM_RENDER_ALLOW),
+ PANFROST_IOCTL(GET_PERFCNT_LAYOUT, get_perfcnt_layout, DRM_RENDER_ALLOW),
+ PANFROST_IOCTL(CREATE_PERFMON, create_perfmon, DRM_RENDER_ALLOW),
+ PANFROST_IOCTL(DESTROY_PERFMON, destroy_perfmon, DRM_RENDER_ALLOW),
+ PANFROST_IOCTL(GET_PERFMON_VALUES, get_perfmon_values, DRM_RENDER_ALLOW),
};
DEFINE_DRM_GEM_SHMEM_FOPS(panfrost_drm_driver_fops);
diff --git a/drivers/gpu/drm/panfrost/panfrost_gpu.c b/drivers/gpu/drm/panfrost/panfrost_gpu.c
index d46d36170e18..c28a31c547cc 100644
--- a/drivers/gpu/drm/panfrost/panfrost_gpu.c
+++ b/drivers/gpu/drm/panfrost/panfrost_gpu.c
@@ -13,6 +13,7 @@
#include "panfrost_features.h"
#include "panfrost_issues.h"
#include "panfrost_gpu.h"
+#include "panfrost_perfcnt.h"
#include "panfrost_regs.h"
static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data)
@@ -42,6 +43,12 @@ static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data)
done = true;
}
+ if (state & GPU_IRQ_PERFCNT_SAMPLE_COMPLETED)
+ panfrost_perfcnt_sample_done(pfdev);
+
+ if (state & GPU_IRQ_CLEAN_CACHES_COMPLETED)
+ panfrost_perfcnt_clean_cache_done(pfdev);
+
gpu_write(pfdev, GPU_INT_CLEAR, state);
return IRQ_HANDLED;
@@ -152,14 +159,16 @@ struct panfrost_model {
u32 revision;
u64 issues;
} revs[MAX_HW_REVS];
+ u64 perfcnt[PANFROST_NUM_BLOCKS];
};
#define GPU_MODEL(_name, _id, ...) \
-{\
+{ \
.name = __stringify(_name), \
.id = _id, \
.features = hw_features_##_name, \
.issues = hw_issues_##_name, \
+ .perfcnt = hw_perfcnt_##_name, \
.revs = { __VA_ARGS__ }, \
}
@@ -198,13 +207,17 @@ static const struct panfrost_model gpu_models[] = {
static void panfrost_gpu_init_features(struct panfrost_device *pfdev)
{
+ struct drm_panfrost_block_perfcounters *perfcnt_layout;
u32 gpu_id, num_js, major, minor, status, rev;
const char *name = "unknown";
u64 hw_feat = 0;
- u64 hw_issues = hw_issues_all;
+ u64 hw_issues = hw_issues_all, mask;
const struct panfrost_model *model;
+ unsigned int num;
int i;
+ perfcnt_layout = pfdev->features.perfcnt_layout;
+
pfdev->features.l2_features = gpu_read(pfdev, GPU_L2_FEATURES);
pfdev->features.core_features = gpu_read(pfdev, GPU_CORE_FEATURES);
pfdev->features.tiler_features = gpu_read(pfdev, GPU_TILER_FEATURES);
@@ -272,9 +285,35 @@ static void panfrost_gpu_init_features(struct panfrost_device *pfdev)
if (best >= 0)
hw_issues |= model->revs[best].issues;
+ for (i = 0; i < PANFROST_NUM_BLOCKS; i++)
+ perfcnt_layout[i].counters = model->perfcnt[i];
+
break;
}
+ /* Only one Job Manager. */
+ perfcnt_layout[PANFROST_JM_BLOCK].instances = BIT(0);
+ perfcnt_layout[PANFROST_SHADER_BLOCK].instances =
+ pfdev->features.shader_present;
+
+ /*
+ * In v4 HW we have one tiler per core group, with the number
+ * of core groups being equal to the number of L2 caches. Other
+ * HW versions just have one tiler and the number of L2 caches
+ * can be extracted from the mem_features field.
+ */
+ if (hw_feat & HW_FEATURE_V4) {
+ num = hweight64(pfdev->features.l2_present);
+ mask = GENMASK(num - 1, 0);
+ perfcnt_layout[PANFROST_MMU_L2_BLOCK].instances = mask;
+ perfcnt_layout[PANFROST_TILER_BLOCK].instances = mask;
+ } else {
+ perfcnt_layout[PANFROST_TILER_BLOCK].instances = BIT(0);
+ num = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
+ mask = GENMASK(num - 1, 0);
+ perfcnt_layout[PANFROST_MMU_L2_BLOCK].instances = mask;
+ }
+
bitmap_from_u64(pfdev->features.hw_features, hw_feat);
bitmap_from_u64(pfdev->features.hw_issues, hw_issues);
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index 8d570c3f15d0..c2be61a9ebff 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -15,6 +15,7 @@
#include "panfrost_features.h"
#include "panfrost_issues.h"
#include "panfrost_gem.h"
+#include "panfrost_perfcnt.h"
#include "panfrost_regs.h"
#include "panfrost_gpu.h"
#include "panfrost_mmu.h"
@@ -153,6 +154,7 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
goto end;
spin_lock_irqsave(&pfdev->hwaccess_lock, flags);
+ panfrost_perfcnt_run_job(job);
job_write(pfdev, JS_HEAD_NEXT_LO(js), jc_head & 0xFFFFFFFF);
job_write(pfdev, JS_HEAD_NEXT_HI(js), jc_head >> 32);
@@ -233,6 +235,12 @@ int panfrost_job_push(struct panfrost_job *job)
goto unlock;
}
+ ret = panfrost_perfcnt_push_job(job);
+ if (ret) {
+ mutex_unlock(&pfdev->sched_lock);
+ goto unlock;
+ }
+
job->render_done_fence = dma_fence_get(&job->base.s_fence->finished);
kref_get(&job->refcount); /* put by scheduler job completion */
@@ -272,6 +280,9 @@ static void panfrost_job_cleanup(struct kref *ref)
for (i = 0; i < job->bo_count; i++)
drm_gem_object_put_unlocked(job->bos[i]);
+
+ panfrost_perfcnt_clean_job_ctx(job);
+
kvfree(job->bos);
kfree(job);
@@ -316,6 +327,13 @@ static struct dma_fence *panfrost_job_dependency(struct drm_sched_job *sched_job
}
}
+ /* Return the perfmon wait fence if any. */
+ if (job->perfcnt_fence) {
+ fence = job->perfcnt_fence;
+ job->perfcnt_fence = NULL;
+ return fence;
+ }
+
return NULL;
}
@@ -399,6 +417,11 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
/* restart scheduler after GPU is usable again */
for (i = 0; i < NUM_JOB_SLOTS; i++)
drm_sched_start(&pfdev->js->queue[i].sched, true);
+
+ /* For now, just say we're done. No reset and retry. */
+// job_write(pfdev, JS_COMMAND(js), JS_COMMAND_HARD_STOP);
+ dma_fence_signal(job->done_fence);
+ panfrost_perfcnt_finish_job(job, true);
}
static const struct drm_sched_backend_ops panfrost_sched_ops = {
@@ -442,6 +465,7 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data)
if (status & JOB_INT_MASK_DONE(j)) {
dma_fence_signal(pfdev->jobs[j]->done_fence);
+ panfrost_perfcnt_finish_job(pfdev->jobs[j], false);
}
status &= ~mask;
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.h b/drivers/gpu/drm/panfrost/panfrost_job.h
index 62454128a792..18646cc5eebb 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.h
+++ b/drivers/gpu/drm/panfrost/panfrost_job.h
@@ -37,6 +37,10 @@ struct panfrost_job {
/* Fence to be signaled by drm-sched once its done with the job */
struct dma_fence *render_done_fence;
+
+ /* Perfcnt context */
+ struct panfrost_perfcnt_job_ctx *perfcnt_ctx;
+ struct dma_fence *perfcnt_fence;
};
int panfrost_job_init(struct panfrost_device *pfdev);
diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.c b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
new file mode 100644
index 000000000000..4491f153ad48
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
@@ -0,0 +1,954 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2019 Collabora Ltd */
+
+#include <drm/drm_file.h>
+#include <drm/drm_gem_shmem_helper.h>
+#include <drm/panfrost_drm.h>
+#include <linux/iopoll.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "panfrost_device.h"
+#include "panfrost_features.h"
+#include "panfrost_gem.h"
+#include "panfrost_issues.h"
+#include "panfrost_job.h"
+#include "panfrost_mmu.h"
+#include "panfrost_regs.h"
+
+#define COUNTERS_PER_BLOCK 64
+#define BYTES_PER_COUNTER 4
+#define BLOCKS_PER_COREGROUP 8
+#define V4_SHADERS_PER_COREGROUP 4
+
+struct panfrost_perfcnt_job_ctx {
+ refcount_t refcount;
+ struct panfrost_device *pfdev;
+ struct dma_fence *wait_fence;
+ struct dma_fence *done_fence;
+ struct panfrost_perfmon **perfmons;
+ u32 perfmon_count;
+};
+
+struct panfrost_perfcnt {
+ struct work_struct dumpwork;
+ u64 fence_context;
+ u64 emit_seqno;
+ spinlock_t fence_lock;
+ struct mutex cfg_lock;
+ u32 cur_cfg[PANFROST_NUM_BLOCKS];
+ struct panfrost_gem_object *bo;
+ void *buf;
+ spinlock_t ctx_lock;
+ struct panfrost_perfcnt_job_ctx *last_ctx;
+ struct panfrost_perfcnt_job_ctx *dump_ctx;
+};
+
+struct panfrost_perfcnt_fence {
+ struct dma_fence base;
+ struct drm_device *dev;
+ u64 seqno;
+};
+
+struct panfrost_perfmon {
+ refcount_t refcnt;
+ atomic_t busycnt;
+ struct wait_queue_head wq;
+ struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
+ u32 *values[PANFROST_NUM_BLOCKS];
+};
+
+static inline struct panfrost_perfcnt_fence *
+to_panfrost_perfcnt_fence(struct dma_fence *fence)
+{
+ return container_of(fence, struct panfrost_perfcnt_fence, base);
+}
+
+static const char *
+panfrost_perfcnt_fence_get_driver_name(struct dma_fence *fence)
+{
+ return "panfrost";
+}
+
+static const char *
+panfrost_perfcnt_fence_get_timeline_name(struct dma_fence *fence)
+{
+ return "panfrost-perfcnt";
+}
+
+static const struct dma_fence_ops panfrost_perfcnt_fence_ops = {
+ .get_driver_name = panfrost_perfcnt_fence_get_driver_name,
+ .get_timeline_name = panfrost_perfcnt_fence_get_timeline_name,
+};
+
+static struct dma_fence *
+panfrost_perfcnt_fence_create(struct panfrost_device *pfdev)
+{
+ struct panfrost_perfcnt_fence *fence;
+
+ fence = kzalloc(sizeof(*fence), GFP_KERNEL);
+ if (!fence)
+ return ERR_PTR(-ENOMEM);
+
+ fence->dev = pfdev->ddev;
+ fence->seqno = ++pfdev->perfcnt->emit_seqno;
+ dma_fence_init(&fence->base, &panfrost_perfcnt_fence_ops,
+ &pfdev->perfcnt->fence_lock,
+ pfdev->perfcnt->fence_context, fence->seqno);
+
+ return &fence->base;
+}
+
+static void panfrost_perfmon_get(struct panfrost_perfmon *perfmon)
+{
+ if (perfmon)
+ refcount_inc(&perfmon->refcnt);
+}
+
+static void panfrost_perfmon_put(struct panfrost_perfmon *perfmon)
+{
+ if (perfmon && refcount_dec_and_test(&perfmon->refcnt)) {
+ unsigned int i;
+
+ for (i = 0; i < PANFROST_NUM_BLOCKS; i++)
+ kfree(perfmon->values[i]);
+
+ kfree(perfmon);
+ }
+}
+
+static struct panfrost_perfmon *
+panfrost_perfcnt_find_perfmon(struct panfrost_file_priv *pfile, int id)
+{
+ struct panfrost_perfmon *perfmon;
+
+ mutex_lock(&pfile->perfmon.lock);
+ perfmon = idr_find(&pfile->perfmon.idr, id);
+ panfrost_perfmon_get(perfmon);
+ mutex_unlock(&pfile->perfmon.lock);
+
+ return perfmon;
+}
+
+void panfrost_perfcnt_open(struct panfrost_file_priv *pfile)
+{
+ mutex_init(&pfile->perfmon.lock);
+ idr_init(&pfile->perfmon.idr);
+}
+
+static int panfrost_perfcnt_idr_del(int id, void *elem, void *data)
+{
+ struct panfrost_perfmon *perfmon = elem;
+
+ panfrost_perfmon_put(perfmon);
+
+ return 0;
+}
+
+void panfrost_perfcnt_close(struct panfrost_file_priv *pfile)
+{
+ mutex_lock(&pfile->perfmon.lock);
+ idr_for_each(&pfile->perfmon.idr, panfrost_perfcnt_idr_del, NULL);
+ idr_destroy(&pfile->perfmon.idr);
+ mutex_unlock(&pfile->perfmon.lock);
+}
+
+int panfrost_ioctl_get_perfcnt_layout(struct drm_device *dev, void *data,
+ struct drm_file *file_priv)
+{
+ struct panfrost_file_priv *pfile = file_priv->driver_priv;
+ struct panfrost_device *pfdev = pfile->pfdev;
+ struct drm_panfrost_get_perfcnt_layout *layout = data;
+
+ memcpy(layout->counters, pfdev->features.perfcnt_layout,
+ sizeof(layout->counters));
+
+ return 0;
+}
+
+int panfrost_ioctl_create_perfmon(struct drm_device *dev, void *data,
+ struct drm_file *file_priv)
+{
+ struct panfrost_file_priv *pfile = file_priv->driver_priv;
+ struct panfrost_device *pfdev = pfile->pfdev;
+ struct drm_panfrost_create_perfmon *req = data;
+ struct drm_panfrost_block_perfcounters *layout;
+ struct panfrost_perfmon *perfmon;
+ unsigned int i;
+ int ret;
+
+ if (req->padding)
+ return -EINVAL;
+
+ perfmon = kzalloc(sizeof(*perfmon), GFP_KERNEL);
+ if (!perfmon)
+ return -ENOMEM;
+
+ layout = pfdev->features.perfcnt_layout;
+ for (i = 0; i < PANFROST_NUM_BLOCKS; i++) {
+ unsigned int ncounters;
+
+ /* Make sure the request matches the available counters. */
+ if (~layout[i].instances & req->counters[i].instances ||
+ ~layout[i].counters & req->counters[i].counters) {
+ ret = -EINVAL;
+ goto err_free_perfmon;
+ }
+
+ ncounters = hweight64(req->counters[i].instances) *
+ hweight64(req->counters[i].counters);
+ if (!ncounters)
+ continue;
+
+ perfmon->counters[i] = req->counters[i];
+ perfmon->values[i] = kcalloc(ncounters, sizeof(u32), GFP_KERNEL);
+ if (!perfmon->values[i]) {
+ ret = -ENOMEM;
+ goto err_free_perfmon;
+ }
+ }
+
+ refcount_set(&perfmon->refcnt, 1);
+ init_waitqueue_head(&perfmon->wq);
+
+ mutex_lock(&pfile->perfmon.lock);
+ ret = idr_alloc(&pfile->perfmon.idr, perfmon, 1, U32_MAX, GFP_KERNEL);
+ mutex_unlock(&pfile->perfmon.lock);
+
+ if (ret < 0)
+ goto err_free_perfmon;
+
+ req->id = ret;
+ return 0;
+
+err_free_perfmon:
+ for (i = 0; i < PANFROST_NUM_BLOCKS; i++)
+ kfree(perfmon->values[i]);
+
+ kfree(perfmon);
+ return ret;
+}
+
+int panfrost_ioctl_destroy_perfmon(struct drm_device *dev, void *data,
+ struct drm_file *file_priv)
+{
+ struct panfrost_file_priv *pfile = file_priv->driver_priv;
+ struct drm_panfrost_destroy_perfmon *req = data;
+ struct panfrost_perfmon *perfmon;
+
+ mutex_lock(&pfile->perfmon.lock);
+ perfmon = idr_remove(&pfile->perfmon.idr, req->id);
+ mutex_unlock(&pfile->perfmon.lock);
+
+ if (!perfmon)
+ return -EINVAL;
+
+ panfrost_perfmon_put(perfmon);
+ return 0;
+}
+
+int panfrost_ioctl_get_perfmon_values(struct drm_device *dev, void *data,
+ struct drm_file *file_priv)
+{
+ struct panfrost_file_priv *pfile = file_priv->driver_priv;
+ struct drm_panfrost_get_perfmon_values *req = data;
+ struct panfrost_perfmon *perfmon;
+ unsigned int i;
+ int ret = 0;
+
+ mutex_lock(&pfile->perfmon.lock);
+ perfmon = idr_find(&pfile->perfmon.idr, req->id);
+ panfrost_perfmon_get(perfmon);
+ mutex_unlock(&pfile->perfmon.lock);
+
+ if (!perfmon)
+ return -EINVAL;
+
+ if (!(req->flags & DRM_PANFROST_GET_PERFMON_VALS_DONT_WAIT))
+ ret = wait_event_interruptible(perfmon->wq,
+ !atomic_read(&perfmon->busycnt));
+ else if (atomic_read(&perfmon->busycnt))
+ ret = -EBUSY;
+
+ if (ret)
+ goto out;
+
+ for (i = 0; i < PANFROST_NUM_BLOCKS; i++) {
+ unsigned int ncounters;
+
+ ncounters = hweight64(perfmon->counters[i].instances) *
+ hweight64(perfmon->counters[i].counters);
+ if (!ncounters)
+ continue;
+
+ if (copy_to_user(u64_to_user_ptr(req->values_ptrs[i]),
+ perfmon->values[i],
+ ncounters * sizeof(u32))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ if (req->flags & DRM_PANFROST_GET_PERFMON_VALS_RESET)
+ memset(perfmon->values[i], 0, ncounters * sizeof(u32));
+ }
+
+out:
+ panfrost_perfmon_put(perfmon);
+ return ret;
+}
+
+/*
+ * Returns true if the two perfcnt job contexts track exactly the same set of
+ * perfmons, false otherwise.
+ */
+static bool panfrost_perfcnt_job_ctx_cmp(struct panfrost_perfcnt_job_ctx *a,
+ struct panfrost_perfcnt_job_ctx *b)
+{
+ unsigned int i, j;
+
+ if (a->perfmon_count != b->perfmon_count)
+ return false;
+
+ for (i = 0; i < a->perfmon_count; i++) {
+ for (j = 0; j < b->perfmon_count; j++) {
+ if (a->perfmons[i] == b->perfmons[j])
+ break;
+ }
+
+ if (j == b->perfmon_count)
+ return false;
+ }
+
+ return true;
+}
+
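+/*
+ * The PRFCNT_*_EN registers expose one enable bit per group of 4 counters:
+ * bit N covers counters [4 * N, 4 * N + 3]. Collapse the 64-bit per-counter
+ * selection mask into that format.
+ */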
+static u32 counters_u64_to_u32(u64 in)
+{
+ unsigned int i;
+ u32 out = 0;
+
+ for (i = 0; i < 64; i += 4) {
+ if (GENMASK(i + 3, i) & in)
+ out |= BIT(i / 4);
+ }
+
+ return out;
+}
+
+void panfrost_perfcnt_run_job(struct panfrost_job *job)
+{
+ struct panfrost_perfcnt_job_ctx *ctx = job->perfcnt_ctx;
+ struct panfrost_device *pfdev = job->pfdev;
+ u32 perfcnt_en[PANFROST_NUM_BLOCKS] = { };
+ bool disable_perfcnt = true, config_changed = false;
+ unsigned int i, j;
+ u64 gpuva;
+ u32 cfg;
+
+ mutex_lock(&pfdev->perfcnt->cfg_lock);
+ for (i = 0; i < PANFROST_NUM_BLOCKS; i++) {
+ for (j = 0; j < ctx->perfmon_count; j++) {
+ u64 counters = ctx->perfmons[j]->counters[i].counters;
+
+ perfcnt_en[i] |= counters_u64_to_u32(counters);
+ }
+
+ if (perfcnt_en[i])
+ disable_perfcnt = false;
+
+ if (perfcnt_en[i] != pfdev->perfcnt->cur_cfg[i]) {
+ pfdev->perfcnt->cur_cfg[i] = perfcnt_en[i];
+ config_changed = true;
+ }
+ }
+ mutex_unlock(&pfdev->perfcnt->cfg_lock);
+
+ if (!config_changed)
+ return;
+
+ /*
+ * Always use address space 0 for now.
+ * FIXME: this needs to be updated when we start using different
+ * address spaces.
+ */
+ cfg = GPU_PERFCNT_CFG_AS(0);
+ if (panfrost_model_cmp(pfdev, 0x1000) >= 0)
+ cfg |= GPU_PERFCNT_CFG_SETSEL(1);
+
+ gpu_write(pfdev, GPU_PERFCNT_CFG,
+ cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF));
+
+ if (disable_perfcnt)
+ return;
+
+ gpu_write(pfdev, GPU_PRFCNT_JM_EN, perfcnt_en[PANFROST_JM_BLOCK]);
+ gpu_write(pfdev, GPU_PRFCNT_SHADER_EN,
+ perfcnt_en[PANFROST_SHADER_BLOCK]);
+ gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN,
+ perfcnt_en[PANFROST_MMU_L2_BLOCK]);
+ gpuva = pfdev->perfcnt->bo->node.start << PAGE_SHIFT;
+ gpu_write(pfdev, GPU_PERFCNT_BASE_LO, gpuva);
+ gpu_write(pfdev, GPU_PERFCNT_BASE_HI, gpuva >> 32);
+
+ /*
+ * Due to PRLAM-8186 we need to disable the Tiler before we enable HW
+ * counters.
+ */
+ if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
+ gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
+ else
+ gpu_write(pfdev, GPU_PRFCNT_TILER_EN,
+ perfcnt_en[PANFROST_TILER_BLOCK]);
+
+ gpu_write(pfdev, GPU_PERFCNT_CFG,
+ cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_MANUAL));
+
+ if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
+ gpu_write(pfdev, GPU_PRFCNT_TILER_EN,
+ perfcnt_en[PANFROST_TILER_BLOCK]);
+}
+
+static void
+panfrost_perfcnt_release_job_ctx(struct panfrost_perfcnt_job_ctx *ctx)
+{
+ unsigned int i;
+
+ WARN_ON(refcount_read(&ctx->refcount));
+ for (i = 0; i < ctx->perfmon_count; i++) {
+ /* Entries might be missing if job ctx creation failed midway. */
+ if (!ctx->perfmons || !ctx->perfmons[i])
+ continue;
+
+ if (atomic_dec_and_test(&ctx->perfmons[i]->busycnt))
+ wake_up(&ctx->perfmons[i]->wq);
+ panfrost_perfmon_put(ctx->perfmons[i]);
+ }
+
+ dma_fence_put(ctx->wait_fence);
+ dma_fence_put(ctx->done_fence);
+ kfree(ctx->perfmons);
+ kfree(ctx);
+}
+
+static void panfrost_perfcnt_put_job_ctx(struct panfrost_perfcnt_job_ctx *ctx)
+{
+ if (!IS_ERR_OR_NULL(ctx) && refcount_dec_and_test(&ctx->refcount))
+ panfrost_perfcnt_release_job_ctx(ctx);
+}
+
+struct panfrost_perfcnt_job_ctx *
+panfrost_perfcnt_get_job_ctx(struct panfrost_perfcnt_job_ctx *ctx)
+{
+ if (ctx)
+ refcount_inc(&ctx->refcount);
+
+ return ctx;
+}
+
+static void panfrost_perfcnt_dump_done(struct panfrost_perfcnt_job_ctx *ctx)
+{
+ struct panfrost_device *pfdev;
+ unsigned long flags;
+
+ pfdev = ctx->pfdev;
+ spin_lock_irqsave(&pfdev->perfcnt->ctx_lock, flags);
+ pfdev->perfcnt->dump_ctx = NULL;
+ if (pfdev->perfcnt->last_ctx == ctx)
+ pfdev->perfcnt->last_ctx = NULL;
+ spin_unlock_irqrestore(&pfdev->perfcnt->ctx_lock, flags);
+
+ dma_fence_signal(ctx->done_fence);
+ panfrost_perfcnt_release_job_ctx(ctx);
+}
+
+static void
+panfrost_perfcnt_get_counter_vals(struct panfrost_device *pfdev,
+ enum drm_panfrost_block_id block,
+ unsigned int instance, u32 *vals)
+{
+ u64 shader_present = pfdev->features.shader_present;
+ unsigned int bufoffs, shaderid, shadernum;
+
+ if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) {
+ unsigned int ncoregroups;
+
+ ncoregroups = hweight64(pfdev->features.l2_present);
+
+ switch (block) {
+ case PANFROST_SHADER_BLOCK:
+ for (shaderid = 0, shadernum = 0; shaderid < 64;
+ shaderid++) {
+ if (!(BIT_ULL(shaderid) & shader_present))
+ continue;
+
+ if (shadernum == instance)
+ break;
+
+ shadernum++;
+ }
+
+ if (WARN_ON(shaderid == 64))
+ return;
+
+ /* 4 shaders per core group. */
+ bufoffs = ((shaderid / V4_SHADERS_PER_COREGROUP) *
+ 2048) +
+ ((shaderid % V4_SHADERS_PER_COREGROUP) *
+ 256);
+ break;
+
+ case PANFROST_TILER_BLOCK:
+ if (WARN_ON(instance >= ncoregroups))
+ return;
+
+ bufoffs = (instance * 2048) + 1024;
+ break;
+ case PANFROST_MMU_L2_BLOCK:
+ if (WARN_ON(instance >= ncoregroups))
+ return;
+
+ bufoffs = (instance * 2048) + 1280;
+ break;
+ case PANFROST_JM_BLOCK:
+ if (WARN_ON(instance))
+ return;
+ bufoffs = 1792;
+ break;
+ default:
+ WARN_ON(1);
+ return;
+ }
+ } else {
+ unsigned int nl2c, ncores;
+
+ /*
+ * TODO: define a macro to extract the number of l2 caches from
+ * mem_features.
+ */
+ nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
+
+ /*
+ * The ARM driver is grouping cores per core group and then
+ * only using the number of cores in group 0 to calculate the
+ * size. Not sure why this is done like that, but I guess
+ * shader_present will only show cores in the first group
+ * anyway.
+ */
+ ncores = hweight64(pfdev->features.shader_present);
+
+ switch (block) {
+ case PANFROST_SHADER_BLOCK:
+ for (shaderid = 0, shadernum = 0; shaderid < 64;
+ shaderid++) {
+ if (!(BIT_ULL(shaderid) & shader_present))
+ continue;
+
+ if (shadernum == instance)
+ break;
+
+ shadernum++;
+ }
+
+ if (WARN_ON(shaderid == 64))
+ return;
+
+ /* Shader blocks are placed after the JM, Tiler and L2 blocks. */
+ bufoffs = 512 + ((nl2c + shaderid) * 256);
+ break;
+
+ case PANFROST_TILER_BLOCK:
+ if (WARN_ON(instance))
+ return;
+
+ bufoffs = 256;
+ break;
+ case PANFROST_MMU_L2_BLOCK:
+ if (WARN_ON(instance >= nl2c))
+ return;
+
+ bufoffs = 512 + (instance * 256);
+ break;
+ case PANFROST_JM_BLOCK:
+ if (WARN_ON(instance))
+ return;
+ bufoffs = 0;
+ break;
+ default:
+ WARN_ON(1);
+ return;
+ }
+ }
+
+ memcpy(vals, pfdev->perfcnt->buf + bufoffs, 256);
+}
+
+static void
+panfrost_perfmon_upd_counter_vals(struct panfrost_perfmon *perfmon,
+ enum drm_panfrost_block_id block,
+ unsigned int instance, u32 *invals)
+{
+ u32 *outvals = perfmon->values[block];
+ unsigned int inidx, outidx;
+
+ if (WARN_ON(instance >= hweight64(perfmon->counters[block].instances)))
+ return;
+
+ if (!(perfmon->counters[block].instances & BIT_ULL(instance)))
+ return;
+
+ outvals += instance * hweight64(perfmon->counters[block].counters);
+ for (inidx = 0, outidx = 0; inidx < 64; inidx++) {
+ if (!(perfmon->counters[block].counters & BIT_ULL(inidx)))
+ continue;
+
+ if (U32_MAX - outvals[outidx] < invals[inidx])
+ outvals[outidx] = U32_MAX;
+ else
+ outvals[outidx] += invals[inidx];
+ outidx++;
+ }
+}
+
+static void panfrost_perfcnt_dump_work(struct work_struct *w)
+{
+ struct panfrost_perfcnt *perfcnt = container_of(w,
+ struct panfrost_perfcnt,
+ dumpwork);
+ struct panfrost_perfcnt_job_ctx *ctx = perfcnt->dump_ctx;
+ unsigned int block, instance, pmonidx, num;
+
+ if (!ctx)
+ return;
+
+ for (block = 0; block < PANFROST_NUM_BLOCKS; block++) {
+ struct panfrost_perfmon *perfmon;
+ u32 vals[COUNTERS_PER_BLOCK];
+ u64 instances = 0;
+
+ for (pmonidx = 0; pmonidx < ctx->perfmon_count; pmonidx++) {
+ perfmon = ctx->perfmons[pmonidx];
+ instances |= perfmon->counters[block].instances;
+ }
+
+ for (instance = 0, num = 0; instance < 64; instance++) {
+ if (!(instances & BIT_ULL(instance)))
+ continue;
+
+ panfrost_perfcnt_get_counter_vals(ctx->pfdev, block,
+ instance, vals);
+
+ for (pmonidx = 0; pmonidx < ctx->perfmon_count;
+ pmonidx++) {
+ perfmon = ctx->perfmons[pmonidx];
+ panfrost_perfmon_upd_counter_vals(perfmon,
+ block,
+ num,
+ vals);
+ }
+ num++;
+ }
+ }
+
+ panfrost_perfcnt_dump_done(ctx);
+}
+
+void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev)
+{
+ schedule_work(&pfdev->perfcnt->dumpwork);
+}
+
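+/*
+ * A PERFCNT_SAMPLE_COMPLETED IRQ means the GPU has finished dumping the
+ * counters to memory, but the dump buffer might still sit in the GPU caches:
+ * issue a clean before reading it from the CPU. The dump work is scheduled
+ * from the CLEAN_CACHES_COMPLETED handler above.
+ */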
+void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev)
+{
+ gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_CACHES);
+}
+
+void panfrost_perfcnt_clean_job_ctx(struct panfrost_job *job)
+{
+ panfrost_perfcnt_put_job_ctx(job->perfcnt_ctx);
+}
+
+int panfrost_perfcnt_create_job_ctx(struct panfrost_job *job,
+ struct drm_file *file_priv,
+ struct drm_panfrost_submit *args)
+{
+ struct panfrost_device *pfdev = job->pfdev;
+ struct panfrost_file_priv *pfile = file_priv->driver_priv;
+ struct panfrost_perfcnt_job_ctx *ctx;
+ unsigned int i, j;
+ u32 *handles;
+ int ret;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->pfdev = pfdev;
+ refcount_set(&ctx->refcount, 1);
+
+ ctx->perfmon_count = args->perfmon_handle_count;
+ if (!ctx->perfmon_count) {
+ job->perfcnt_ctx = ctx;
+ return 0;
+ }
+
+ handles = kcalloc(ctx->perfmon_count, sizeof(u32), GFP_KERNEL);
+ if (!handles) {
+ ret = -ENOMEM;
+ goto err_put_ctx;
+ }
+
+ if (copy_from_user(handles,
+ u64_to_user_ptr(args->perfmon_handles),
+ ctx->perfmon_count * sizeof(u32))) {
+ ret = -EFAULT;
+ DRM_DEBUG("Failed to copy in perfmon handles\n");
+ goto err_free_handles;
+ }
+
+ /* Make sure each perfmon only appears once. */
+ for (i = 0; i < ctx->perfmon_count - 1; i++) {
+ for (j = i + 1; j < ctx->perfmon_count; j++) {
+ if (handles[i] == handles[j]) {
+ ret = -EINVAL;
+ goto err_free_handles;
+ }
+ }
+ }
+
+ ctx->perfmons = kcalloc(ctx->perfmon_count, sizeof(*ctx->perfmons),
+ GFP_KERNEL);
+ if (!ctx->perfmons) {
+ ret = -ENOMEM;
+ goto err_free_handles;
+ }
+
+ for (i = 0; i < ctx->perfmon_count; i++) {
+ ctx->perfmons[i] = panfrost_perfcnt_find_perfmon(pfile,
+ handles[i]);
+ if (!ctx->perfmons[i]) {
+ ret = -EINVAL;
+ goto err_free_handles;
+ }
+ atomic_inc(&ctx->perfmons[i]->busycnt);
+ }
+
+ job->perfcnt_ctx = ctx;
+ kfree(handles);
+ return 0;
+
+err_free_handles:
+ kfree(handles);
+
+err_put_ctx:
+ panfrost_perfcnt_put_job_ctx(ctx);
+ return ret;
+}
+
+void panfrost_perfcnt_finish_job(struct panfrost_job *job, bool skip_dump)
+{
+ struct panfrost_perfcnt_job_ctx *ctx = job->perfcnt_ctx;
+
+ if (WARN_ON(!ctx))
+ return;
+
+ job->perfcnt_ctx = NULL;
+ if (!refcount_dec_and_test(&ctx->refcount))
+ return;
+
+ if (!ctx->perfmon_count || skip_dump) {
+ panfrost_perfcnt_dump_done(ctx);
+ return;
+ }
+
+ ctx->pfdev->perfcnt->dump_ctx = ctx;
+ gpu_write(ctx->pfdev, GPU_CMD, GPU_CMD_PERFCNT_SAMPLE);
+}
+
+static bool panfrost_perfcnt_try_reuse_last_job_ctx(struct panfrost_job *job)
+{
+ struct panfrost_perfcnt_job_ctx *prev_ctx, *new_ctx;
+ struct panfrost_device *pfdev = job->pfdev;
+ unsigned int i;
+
+ new_ctx = job->perfcnt_ctx;
+ prev_ctx = pfdev->perfcnt->last_ctx;
+ if (!prev_ctx)
+ return false;
+
+ if (!refcount_inc_not_zero(&prev_ctx->refcount))
+ return false;
+
+ if (!panfrost_perfcnt_job_ctx_cmp(prev_ctx, new_ctx)) {
+ refcount_dec(&prev_ctx->refcount);
+ return false;
+ }
+
+ /*
+ * Make sure we increment busycnt, as panfrost_perfcnt_put_job_ctx()
+ * will decrement it.
+ */
+ for (i = 0; i < prev_ctx->perfmon_count; i++)
+ atomic_inc(&prev_ctx->perfmons[i]->busycnt);
+
+ panfrost_perfcnt_put_job_ctx(new_ctx);
+ job->perfcnt_ctx = prev_ctx;
+ job->perfcnt_fence = dma_fence_get(prev_ctx->wait_fence);
+ return true;
+}
+
+int panfrost_perfcnt_push_job(struct panfrost_job *job)
+{
+ struct panfrost_perfcnt_job_ctx *prev_ctx, *new_ctx;
+ struct panfrost_device *pfdev = job->pfdev;
+ struct dma_fence *done_fence;
+ unsigned long flags;
+ int ret = 0;
+
+ /*
+ * Create the done fence before taking ctx_lock: fence creation
+ * allocates memory with GFP_KERNEL and thus must not happen under a
+ * spinlock. If we end up reusing the previous context, the fence is
+ * simply released.
+ */
+ done_fence = panfrost_perfcnt_fence_create(pfdev);
+ if (IS_ERR(done_fence))
+ return PTR_ERR(done_fence);
+
+ spin_lock_irqsave(&pfdev->perfcnt->ctx_lock, flags);
+ new_ctx = job->perfcnt_ctx;
+ prev_ctx = pfdev->perfcnt->last_ctx;
+ /*
+ * In order to keep things relatively fast even when HW counters are
+ * enabled, we try to avoid dumping perfcounters at the end of each
+ * job (which implies making other jobs wait for this dump to finish)
+ * when that's possible.
+ * This is only acceptable if all queued jobs share the same perfctx,
+ * that is, they have the same list of perfmons attached to them. In
+ * this condition we are guaranteed that nothing will increment the
+ * counters behind our back.
+ */
+ if (panfrost_perfcnt_try_reuse_last_job_ctx(job)) {
+ dma_fence_put(done_fence);
+ goto out;
+ }
+
+ new_ctx->done_fence = done_fence;
+
+ /*
+ * The previous job has a different perfmon ctx, so we must wait for it
+ * to be done dumping the counters before we can schedule this new job,
+ * otherwise we might corrupt the counter values.
+ */
+ if (prev_ctx)
+ new_ctx->wait_fence = dma_fence_get(prev_ctx->done_fence);
+
+ job->perfcnt_fence = dma_fence_get(new_ctx->wait_fence);
+ pfdev->perfcnt->last_ctx = new_ctx;
+
+out:
+ spin_unlock_irqrestore(&pfdev->perfcnt->ctx_lock, flags);
+ return ret;
+}
+
+int panfrost_perfcnt_init(struct panfrost_device *pfdev)
+{
+ struct panfrost_perfcnt *perfcnt;
+ struct drm_gem_shmem_object *bo;
+ size_t size;
+ u32 status;
+ int ret;
+
+ if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) {
+ unsigned int ncoregroups;
+
+ ncoregroups = hweight64(pfdev->features.l2_present);
+ size = ncoregroups * BLOCKS_PER_COREGROUP *
+ COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
+ } else {
+ unsigned int nl2c, ncores;
+
+ /*
+ * TODO: define a macro to extract the number of l2 caches from
+ * mem_features.
+ */
+ nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
+
+ /*
+ * The ARM driver is grouping cores per core group and then
+ * only using the number of cores in group 0 to calculate the
+ * size. Not sure why this is done like that, but I guess
+ * shader_present will only show cores in the first group
+ * anyway.
+ */
+ ncores = hweight64(pfdev->features.shader_present);
+
+ /*
+ * There's always one JM and one Tiler block, hence the '+ 2'
+ * here.
+ */
+ size = (nl2c + ncores + 2) *
+ COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
+ }
+
+ perfcnt = devm_kzalloc(pfdev->dev, sizeof(*perfcnt), GFP_KERNEL);
+ if (!perfcnt)
+ return -ENOMEM;
+
+ bo = drm_gem_shmem_create(pfdev->ddev, size);
+ if (IS_ERR(bo))
+ return PTR_ERR(bo);
+
+ perfcnt->bo = to_panfrost_bo(&bo->base);
+
+ /*
+ * We always use the same buffer, so let's map it once and keep it
+ * mapped until the driver is unloaded. This might be a problem if
+ * we start using different AS and the perfcnt BO is not mapped at
+ * the same GPU virtual address.
+ */
+ ret = panfrost_mmu_map(perfcnt->bo);
+ if (ret)
+ goto err_put_bo;
+
+ /* Disable everything. */
+ gpu_write(pfdev, GPU_PERFCNT_CFG,
+ GPU_PERFCNT_CFG_AS(0) |
+ GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF) |
+ (panfrost_model_cmp(pfdev, 0x1000) >= 0 ?
+ GPU_PERFCNT_CFG_SETSEL(1) : 0));
+ gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0);
+ gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0);
+ gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0);
+ gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
+
+ perfcnt->buf = drm_gem_vmap(&bo->base);
+ if (IS_ERR(perfcnt->buf)) {
+ ret = PTR_ERR(perfcnt->buf);
+ goto err_put_bo;
+ }
+
+ INIT_WORK(&perfcnt->dumpwork, panfrost_perfcnt_dump_work);
+ mutex_init(&perfcnt->cfg_lock);
+ spin_lock_init(&perfcnt->fence_lock);
+ spin_lock_init(&perfcnt->ctx_lock);
+ perfcnt->fence_context = dma_fence_context_alloc(1);
+ pfdev->perfcnt = perfcnt;
+
+ /*
+ * Invalidate the cache and clear the counters to start from a fresh
+ * state.
+ */
+ gpu_write(pfdev, GPU_INT_MASK, 0);
+ gpu_write(pfdev, GPU_INT_CLEAR, GPU_IRQ_CLEAN_CACHES_COMPLETED);
+
+ gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_CLEAR);
+ gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_INV_CACHES);
+ ret = readl_relaxed_poll_timeout(pfdev->iomem + GPU_INT_RAWSTAT,
+ status,
+ status &
+ GPU_IRQ_CLEAN_CACHES_COMPLETED,
+ 100, 10000);
+ if (ret)
+ goto err_gem_vunmap;
+
+ gpu_write(pfdev, GPU_INT_MASK, GPU_IRQ_MASK_ALL);
+
+ return 0;
+
+err_gem_vunmap:
+ drm_gem_vunmap(&pfdev->perfcnt->bo->base.base, pfdev->perfcnt->buf);
+
+err_put_bo:
+ drm_gem_object_put_unlocked(&bo->base);
+ return ret;
+}
+
+void panfrost_perfcnt_fini(struct panfrost_device *pfdev)
+{
+ drm_gem_vunmap(&pfdev->perfcnt->bo->base.base, pfdev->perfcnt->buf);
+ drm_gem_object_put_unlocked(&pfdev->perfcnt->bo->base.base);
+}
diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.h b/drivers/gpu/drm/panfrost/panfrost_perfcnt.h
new file mode 100644
index 000000000000..7cbfeb072aa1
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2019 Collabora Ltd */
+#ifndef __PANFROST_PERFCNT_H__
+#define __PANFROST_PERFCNT_H__
+
+#include <linux/bitops.h>
+
+struct panfrost_perfcnt_job_ctx;
+
+#define PERFCNT(_shader, _tiler, _mmu_l2, _jm) \
+ { _shader, _tiler, _mmu_l2, _jm }
+#define NO_PERFCNT PERFCNT(0, 0, 0, 0)
+
+/* FIXME: Declare counters for all models */
+#define hw_perfcnt_t600 NO_PERFCNT
+#define hw_perfcnt_t620 NO_PERFCNT
+#define hw_perfcnt_t720 NO_PERFCNT
+#define hw_perfcnt_t760 NO_PERFCNT
+#define hw_perfcnt_t820 NO_PERFCNT
+#define hw_perfcnt_t830 NO_PERFCNT
+#define hw_perfcnt_t860 NO_PERFCNT
+#define hw_perfcnt_t880 NO_PERFCNT
+#define hw_perfcnt_g76 NO_PERFCNT
+#define hw_perfcnt_g71 NO_PERFCNT
+#define hw_perfcnt_g72 NO_PERFCNT
+#define hw_perfcnt_g51 NO_PERFCNT
+#define hw_perfcnt_g52 NO_PERFCNT
+#define hw_perfcnt_g31 NO_PERFCNT
+
+void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev);
+void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev);
+int panfrost_perfcnt_push_job(struct panfrost_job *job);
+void panfrost_perfcnt_run_job(struct panfrost_job *job);
+void panfrost_perfcnt_finish_job(struct panfrost_job *job,
+ bool skip_dump);
+void panfrost_perfcnt_clean_job_ctx(struct panfrost_job *job);
+int panfrost_perfcnt_create_job_ctx(struct panfrost_job *job,
+ struct drm_file *file_priv,
+ struct drm_panfrost_submit *args);
+void panfrost_perfcnt_open(struct panfrost_file_priv *pfile);
+void panfrost_perfcnt_close(struct panfrost_file_priv *pfile);
+int panfrost_perfcnt_init(struct panfrost_device *pfdev);
+void panfrost_perfcnt_fini(struct panfrost_device *pfdev);
+
+int panfrost_ioctl_get_perfcnt_layout(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
+int panfrost_ioctl_create_perfmon(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
+int panfrost_ioctl_destroy_perfmon(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
+int panfrost_ioctl_get_perfmon_values(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
+
+#endif
diff --git a/drivers/gpu/drm/panfrost/panfrost_regs.h b/drivers/gpu/drm/panfrost/panfrost_regs.h
index 42d08860fd76..ea38ac60581c 100644
--- a/drivers/gpu/drm/panfrost/panfrost_regs.h
+++ b/drivers/gpu/drm/panfrost/panfrost_regs.h
@@ -44,12 +44,31 @@
GPU_IRQ_MULTIPLE_FAULT)
#define GPU_CMD 0x30
#define GPU_CMD_SOFT_RESET 0x01
+#define GPU_CMD_PERFCNT_CLEAR 0x03
+#define GPU_CMD_PERFCNT_SAMPLE 0x04
+#define GPU_CMD_CLEAN_CACHES 0x07
+#define GPU_CMD_CLEAN_INV_CACHES 0x08
#define GPU_STATUS 0x34
+#define GPU_STATUS_PRFCNT_ACTIVE BIT(2)
#define GPU_LATEST_FLUSH_ID 0x38
#define GPU_FAULT_STATUS 0x3C
#define GPU_FAULT_ADDRESS_LO 0x40
#define GPU_FAULT_ADDRESS_HI 0x44
+#define GPU_PERFCNT_BASE_LO 0x60
+#define GPU_PERFCNT_BASE_HI 0x64
+#define GPU_PERFCNT_CFG 0x68
+#define GPU_PERFCNT_CFG_MODE(x) (x)
+#define GPU_PERFCNT_CFG_MODE_OFF 0
+#define GPU_PERFCNT_CFG_MODE_MANUAL 1
+#define GPU_PERFCNT_CFG_MODE_TILE 2
+#define GPU_PERFCNT_CFG_AS(x) ((x) << 4)
+#define GPU_PERFCNT_CFG_SETSEL(x) ((x) << 8)
+#define GPU_PRFCNT_JM_EN 0x6c
+#define GPU_PRFCNT_SHADER_EN 0x70
+#define GPU_PRFCNT_TILER_EN 0x74
+#define GPU_PRFCNT_MMU_L2_EN 0x7c
+
#define GPU_THREAD_MAX_THREADS 0x0A0 /* (RO) Maximum number of threads per core */
#define GPU_THREAD_MAX_WORKGROUP_SIZE 0x0A4 /* (RO) Maximum workgroup size */
#define GPU_THREAD_MAX_BARRIER_SIZE 0x0A8 /* (RO) Maximum threads waiting at a barrier */
diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h
index 508b9621d9db..e09b35bf6035 100644
--- a/include/uapi/drm/panfrost_drm.h
+++ b/include/uapi/drm/panfrost_drm.h
@@ -18,6 +18,10 @@ extern "C" {
#define DRM_PANFROST_MMAP_BO 0x03
#define DRM_PANFROST_GET_PARAM 0x04
#define DRM_PANFROST_GET_BO_OFFSET 0x05
+#define DRM_PANFROST_GET_PERFCNT_LAYOUT 0x06
+#define DRM_PANFROST_CREATE_PERFMON 0x07
+#define DRM_PANFROST_DESTROY_PERFMON 0x08
+#define DRM_PANFROST_GET_PERFMON_VALUES 0x09
#define DRM_IOCTL_PANFROST_SUBMIT DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_SUBMIT, struct drm_panfrost_submit)
#define DRM_IOCTL_PANFROST_WAIT_BO DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_WAIT_BO, struct drm_panfrost_wait_bo)
@@ -25,6 +29,10 @@ extern "C" {
#define DRM_IOCTL_PANFROST_MMAP_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_MMAP_BO, struct drm_panfrost_mmap_bo)
#define DRM_IOCTL_PANFROST_GET_PARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PARAM, struct drm_panfrost_get_param)
#define DRM_IOCTL_PANFROST_GET_BO_OFFSET DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_BO_OFFSET, struct drm_panfrost_get_bo_offset)
+#define DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PERFCNT_LAYOUT, struct drm_panfrost_get_perfcnt_layout)
+#define DRM_IOCTL_PANFROST_CREATE_PERFMON DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_CREATE_PERFMON, struct drm_panfrost_create_perfmon)
+#define DRM_IOCTL_PANFROST_DESTROY_PERFMON DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_DESTROY_PERFMON, struct drm_panfrost_destroy_perfmon)
+#define DRM_IOCTL_PANFROST_GET_PERFMON_VALUES DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PERFMON_VALUES, struct drm_panfrost_get_perfmon_values)
#define PANFROST_JD_REQ_FS (1 << 0)
/**
@@ -55,6 +63,15 @@ struct drm_panfrost_submit {
/** A combination of PANFROST_JD_REQ_* */
__u32 requirements;
+
+ /** Pointer to a u32 array of perfmon handles to attach to the job. */
+ __u64 perfmon_handles;
+
+ /** Number of perfmon handles passed in (size is that times 4). */
+ __u32 perfmon_handle_count;
+
+ /** Unused field, should be set to 0. */
+ __u32 padding;
};
/**
@@ -133,6 +150,111 @@ struct drm_panfrost_get_bo_offset {
__u64 offset;
};
+/**
+ * Panfrost HW block ids used to group HW counters. There might be several
+ * shader, tiler and MMU/L2 blocks in a given GPU. How many of them are
+ * available is exposed through the instances field of
+ * drm_panfrost_block_perfcounters.
+ */
+enum drm_panfrost_block_id {
+ PANFROST_SHADER_BLOCK,
+ PANFROST_TILER_BLOCK,
+ PANFROST_MMU_L2_BLOCK,
+ PANFROST_JM_BLOCK,
+ PANFROST_NUM_BLOCKS,
+};
+
+struct drm_panfrost_block_perfcounters {
+ /*
+ * For DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, encodes the available
+ * instances for a given block type.
+ * For DRM_IOCTL_PANFROST_CREATE_PERFMON, encodes the instances the
+ * user wants to monitor.
+ * Note: the bitmap might be sparse.
+ */
+ __u64 instances;
+
+ /*
+ * For DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, encodes the available
+ * counters attached to a specific block type.
+ * For DRM_IOCTL_PANFROST_CREATE_PERFMON, encodes the counters the user
+ * wants to monitor.
+ * Note: the bitmap might be sparse.
+ */
+ __u64 counters;
+};
+
+/**
+ * Used to retrieve available HW counters.
+ */
+struct drm_panfrost_get_perfcnt_layout {
+ struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
+};
+
+/**
+ * Used to create a performance monitor. Each performance monitor is assigned
+ * an ID that can later be passed when submitting a job to capture hardware
+ * counter values (and thus count things related to this specific job).
+ * Performance monitors are attached to the GPU file descriptor and IDs are
+ * unique within this context, not across all GPU users.
+ * This implies that:
+ * - perfmons are automatically released when the FD is closed
+ * - perfmons can't be shared across GPU contexts
+ */
+struct drm_panfrost_create_perfmon {
+ /* Input Fields. */
+ /* List all HW counters this performance monitor should track. */
+ struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
+
+ /* Output fields. */
+ /* ID of the newly created perfmon. */
+ __u32 id;
+
+ /* Padding: must be set to 0. */
+ __u32 padding;
+};
+
+/**
+ * Destroy an existing performance monitor.
+ */
+struct drm_panfrost_destroy_perfmon {
+ /*
+ * ID of the perfmon to destroy (the one returned by
+ * DRM_IOCTL_PANFROST_CREATE_PERFMON)
+ */
+ __u32 id;
+};
+
+/*
+ * Don't wait when trying to get perfmon values. If the perfmon is still active
+ * (still attached to a queued or running job), EBUSY is returned.
+ */
+#define DRM_PANFROST_GET_PERFMON_VALS_DONT_WAIT 0x1
+
+/* Reset all perfmon values to zero after reading them. */
+#define DRM_PANFROST_GET_PERFMON_VALS_RESET 0x2
+
+/**
+ * Used to query values collected by a performance monitor.
+ */
+struct drm_panfrost_get_perfmon_values {
+ /* ID of the perfmon to query values on. */
+ __u32 id;
+
+ /* See DRM_PANFROST_GET_PERFMON_VALS_XXX flags */
+ __u32 flags;
+
+ /*
+ * An array of userspace pointers to u32 buffers that counter values
+ * will be copied to.
+ * The array sizes depend on the counters/instances activated at
+ * perfmon creation time: hweight64(instances) * hweight64(counters).
+ * Note that some entries in values_ptrs[] might be NULL if no counters
+ * on a specific block were activated.
+ */
+ __u64 values_ptrs[PANFROST_NUM_BLOCKS];
+};
+
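+/*
+ * Illustrative example (not part of the uAPI itself): a perfmon created with
+ * counters[PANFROST_JM_BLOCK].instances = 0x1 and
+ * counters[PANFROST_JM_BLOCK].counters = 0x30 monitors 2 counters on a
+ * single instance, so values_ptrs[PANFROST_JM_BLOCK] must point to an array
+ * of hweight64(0x1) * hweight64(0x30) = 2 u32 values.
+ */
+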
#if defined(__cplusplus)
}
#endif
--
2.20.1