[PATCH] drm/panthor: Add support for performance counters

Adrián Larumbe adrian.larumbe at collabora.com
Tue Mar 5 16:58:16 UTC 2024


This brings in support for Panthor's HW performance counters and querying
them from UM through a specific ioctl(). The code is inspired by existing
functionality for the Panfrost driver, with some noteworthy differences:

 - Sample size is now reported by the firmware rather than having to reckon
 it by hand
 - Counter samples are chained in a ring buffer that can be accessed
 concurrently, but only from threads within a single context (this is
 because of a HW limitation).
 - The list of enabled counters must be explicitly provided by UM
 - Rather than allocating the BO that will contain the perfcounter values
 in the render context's address space, the samples ring buffer is mapped
 onto the MCU's VM.
 - If more than one thread within the same context tries to dump a sample,
 then the kernel will copy the same frame to every single thread that was
 able to join the dump queue right before the FW finished processing the
 sample request.
 - UM must provide a BO handle for retrieval of perfcnt values rather
 than passing a user virtual address.

The reason multicontext access to the driver's perfcnt ioctl interface
isn't tolerated is that toggling a different set of counters than the
current one implies a counter reset, which also messes up the ring
buffer's extraction and insertion pointers. This is an unfortunate
hardware limitation.

Signed-off-by: Adrián Larumbe <adrian.larumbe at collabora.com>
---
 drivers/gpu/drm/panthor/Makefile          |   3 +-
 drivers/gpu/drm/panthor/panthor_device.c  |   6 +
 drivers/gpu/drm/panthor/panthor_device.h  |   6 +
 drivers/gpu/drm/panthor/panthor_drv.c     |  61 +++
 drivers/gpu/drm/panthor/panthor_fw.c      |  27 ++
 drivers/gpu/drm/panthor/panthor_fw.h      |  12 +
 drivers/gpu/drm/panthor/panthor_perfcnt.c | 551 ++++++++++++++++++++++
 drivers/gpu/drm/panthor/panthor_perfcnt.h |  31 ++
 drivers/gpu/drm/panthor/panthor_sched.c   |   1 +
 include/uapi/drm/panthor_drm.h            |  72 +++
 10 files changed, 769 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/panthor/panthor_perfcnt.c
 create mode 100644 drivers/gpu/drm/panthor/panthor_perfcnt.h

diff --git a/drivers/gpu/drm/panthor/Makefile b/drivers/gpu/drm/panthor/Makefile
index 15294719b09c..7f841fd053d4 100644
--- a/drivers/gpu/drm/panthor/Makefile
+++ b/drivers/gpu/drm/panthor/Makefile
@@ -9,6 +9,7 @@ panthor-y := \
 	panthor_gpu.o \
 	panthor_heap.o \
 	panthor_mmu.o \
-	panthor_sched.o
+	panthor_sched.o \
+	panthor_perfcnt.o
 
 obj-$(CONFIG_DRM_PANTHOR) += panthor.o
diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
index bfe8da4a6e4c..5dfd82891063 100644
--- a/drivers/gpu/drm/panthor/panthor_device.c
+++ b/drivers/gpu/drm/panthor/panthor_device.c
@@ -20,6 +20,7 @@
 #include "panthor_mmu.h"
 #include "panthor_regs.h"
 #include "panthor_sched.h"
+#include "panthor_perfcnt.h"
 
 static int panthor_clk_init(struct panthor_device *ptdev)
 {
@@ -78,6 +79,7 @@ void panthor_device_unplug(struct panthor_device *ptdev)
 	/* Now, try to cleanly shutdown the GPU before the device resources
 	 * get reclaimed.
 	 */
+	panthor_perfcnt_unplug(ptdev);
 	panthor_sched_unplug(ptdev);
 	panthor_fw_unplug(ptdev);
 	panthor_mmu_unplug(ptdev);
@@ -233,6 +235,10 @@ int panthor_device_init(struct panthor_device *ptdev)
 	if (ret)
 		goto err_unplug_fw;
 
+	ret = panthor_perfcnt_init(ptdev);
+	if (ret)
+		goto err_rpm_put;
+
 	/* ~3 frames */
 	pm_runtime_set_autosuspend_delay(ptdev->base.dev, 50);
 	pm_runtime_use_autosuspend(ptdev->base.dev);
diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
index 51c9d61b6796..adf0bd29deb0 100644
--- a/drivers/gpu/drm/panthor/panthor_device.h
+++ b/drivers/gpu/drm/panthor/panthor_device.h
@@ -100,6 +100,9 @@ struct panthor_device {
 	/** @csif_info: Command stream interface information. */
 	struct drm_panthor_csif_info csif_info;
 
+	/** @perfcnt_info: Performance counters interface information. */
+	struct drm_panthor_perfcnt_info perfcnt_info;
+
 	/** @gpu: GPU management data. */
 	struct panthor_gpu *gpu;
 
@@ -127,6 +130,9 @@ struct panthor_device {
 		struct completion done;
 	} unplug;
 
+	/** @perfcnt: Device performance counters data. */
+	struct panthor_perfcnt *perfcnt;
+
 	/** @reset: Reset related fields. */
 	struct {
 		/** @wq: Ordered worqueud used to schedule reset operations. */
diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
index ff484506229f..6cb9ea0aa553 100644
--- a/drivers/gpu/drm/panthor/panthor_drv.c
+++ b/drivers/gpu/drm/panthor/panthor_drv.c
@@ -27,6 +27,7 @@
 #include "panthor_mmu.h"
 #include "panthor_regs.h"
 #include "panthor_sched.h"
+#include "panthor_perfcnt.h"
 
 /**
  * DOC: user <-> kernel object copy helpers.
@@ -164,6 +165,7 @@ panthor_get_uobj_array(const struct drm_panthor_obj_array *in, u32 min_stride,
 	_Generic(_obj_name, \
 		 PANTHOR_UOBJ_DECL(struct drm_panthor_gpu_info, tiler_present), \
 		 PANTHOR_UOBJ_DECL(struct drm_panthor_csif_info, pad), \
+		 PANTHOR_UOBJ_DECL(struct drm_panthor_perfcnt_info, fw_size),	\
 		 PANTHOR_UOBJ_DECL(struct drm_panthor_sync_op, timeline_value), \
 		 PANTHOR_UOBJ_DECL(struct drm_panthor_queue_submit, syncs), \
 		 PANTHOR_UOBJ_DECL(struct drm_panthor_queue_create, ringbuf_size), \
@@ -765,6 +767,10 @@ static int panthor_ioctl_dev_query(struct drm_device *ddev, void *data, struct d
 			args->size = sizeof(ptdev->csif_info);
 			return 0;
 
+		case DRM_PANTHOR_DEV_QUERY_PERFCNT_INFO:
+			args->size = sizeof(ptdev->perfcnt_info);
+			return 0;
+
 		default:
 			return -EINVAL;
 		}
@@ -777,6 +783,9 @@ static int panthor_ioctl_dev_query(struct drm_device *ddev, void *data, struct d
 	case DRM_PANTHOR_DEV_QUERY_CSIF_INFO:
 		return PANTHOR_UOBJ_SET(args->pointer, args->size, ptdev->csif_info);
 
+	case DRM_PANTHOR_DEV_QUERY_PERFCNT_INFO:
+		return PANTHOR_UOBJ_SET(args->pointer, args->size, ptdev->perfcnt_info);
+
 	default:
 		return -EINVAL;
 	}
@@ -1245,6 +1254,55 @@ static int panthor_ioctl_vm_get_state(struct drm_device *ddev, void *data,
 	return 0;
 }
 
+static bool perf_masks_zero(struct drm_panthor_perfcnt_config *req)
+{
+	/* True when userspace left every counter selection/enable mask at zero. */
+	u32 any_bits = req->csg_select | req->fw_enable | req->csg_enable;
+
+	any_bits |= req->csf_enable | req->shader_enable;
+	any_bits |= req->tiler_enable | req->mmu_l2_enable;
+	return any_bits == 0;
+}
+
+static int panthor_ioctl_perfcnt_config(struct drm_device *dev, void *data,
+				  struct drm_file *file_priv)
+{
+	struct panthor_file *pfile = file_priv->driver_priv;
+	struct panthor_device *ptdev = pfile->ptdev;
+	struct drm_panthor_perfcnt_config *req = data;
+
+	/*
+	 * The UAPI documents @pad as MBZ, so reject anything else to keep the
+	 * field usable for future extensions. GLB_PRFCNT_CONFIG.SET_SELECT picks
+	 * one of up to four sets of event count inputs for the counter blocks
+	 * that support it; all counter blocks support counter set 0.
+	 */
+	if (req->pad || req->counterset > 3)
+		return -EINVAL;
+
+	return panthor_perfcnt_config(ptdev, req, pfile, perf_masks_zero(req));
+}
+
+static int panthor_ioctl_perfcnt_dump(struct drm_device *dev, void *data,
+				      struct drm_file *file_priv)
+{
+	struct panthor_file *pfile = file_priv->driver_priv;
+	struct panthor_device *ptdev = pfile->ptdev;
+	struct drm_panthor_perfcnt_dump *req = data;
+	struct drm_gem_object *obj;
+	int ret;
+
+	if (req->pad) /* MBZ per the UAPI header */
+		return -EINVAL;
+
+	obj = drm_gem_object_lookup(file_priv, req->handle);
+	if (!obj)
+		return -ENOENT;
+	ret = panthor_perfcnt_dump(ptdev, obj, pfile);
+	drm_gem_object_put(obj);
+	return ret;
+}
+
 static int
 panthor_open(struct drm_device *ddev, struct drm_file *file)
 {
@@ -1290,6 +1348,7 @@ panthor_postclose(struct drm_device *ddev, struct drm_file *file)
 {
 	struct panthor_file *pfile = file->driver_priv;
 
+	panthor_perfcnt_close(file);
 	panthor_group_pool_destroy(pfile);
 	panthor_vm_pool_destroy(pfile);
 
@@ -1314,6 +1373,8 @@ static const struct drm_ioctl_desc panthor_drm_driver_ioctls[] = {
 	PANTHOR_IOCTL(TILER_HEAP_CREATE, tiler_heap_create, DRM_RENDER_ALLOW),
 	PANTHOR_IOCTL(TILER_HEAP_DESTROY, tiler_heap_destroy, DRM_RENDER_ALLOW),
 	PANTHOR_IOCTL(GROUP_SUBMIT, group_submit, DRM_RENDER_ALLOW),
+	PANTHOR_IOCTL(PERFCNT_CONFIG, perfcnt_config, DRM_RENDER_ALLOW),
+	PANTHOR_IOCTL(PERFCNT_DUMP, perfcnt_dump, DRM_RENDER_ALLOW),
 };
 
 static int panthor_mmap(struct file *filp, struct vm_area_struct *vma)
diff --git a/drivers/gpu/drm/panthor/panthor_fw.c b/drivers/gpu/drm/panthor/panthor_fw.c
index 33c87a59834e..7b31bb6c21b9 100644
--- a/drivers/gpu/drm/panthor/panthor_fw.c
+++ b/drivers/gpu/drm/panthor/panthor_fw.c
@@ -23,6 +23,7 @@
 #include "panthor_mmu.h"
 #include "panthor_regs.h"
 #include "panthor_sched.h"
+#include "panthor_perfcnt.h"
 
 #define CSF_FW_NAME "mali_csffw.bin"
 
@@ -947,6 +948,7 @@ static void panthor_fw_init_global_iface(struct panthor_device *ptdev)
 					 GLB_PING |
 					 GLB_CFG_PROGRESS_TIMER |
 					 GLB_CFG_POWEROFF_TIMER |
+					 GLB_PERFCNT_SAMPLE |
 					 GLB_IDLE_EN |
 					 GLB_IDLE;
 
@@ -975,6 +977,10 @@ static void panthor_job_irq_handler(struct panthor_device *ptdev, u32 status)
 		return;
 
 	panthor_sched_report_fw_events(ptdev, status);
+
+	/* Let the perfcnt layer figure out if there are PERFCNT events to process. */
+	if (status & JOB_INT_GLOBAL_IF)
+		panthor_perfcnt_report_fw_events(ptdev, status);
 }
 PANTHOR_IRQ_HANDLER(job, JOB, panthor_job_irq_handler);
 
@@ -1213,6 +1219,26 @@ int panthor_fw_glb_wait_acks(struct panthor_device *ptdev,
 				    req_mask, acked, timeout_ms);
 }
 
+/**
+ * panthor_fw_glb_state_change() - Check for global requests whose req/ack bits differ
+ * @ptdev: Device.
+ * @req_mask: Mask of requests to inspect.
+ * @flipped: Output. Set to the bits of @req_mask for which the request and
+ * acknowledge words currently disagree; zero when the function returns false.
+ *
+ * Return: true if at least one masked bit differs, false otherwise.
+ */
+bool panthor_fw_glb_state_change(struct panthor_device *ptdev,
+				 u32 req_mask, u32 *flipped)
+{
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+	u32 pending = READ_ONCE(glb_iface->input->req);
+
+	pending = (pending ^ READ_ONCE(glb_iface->output->ack)) & req_mask;
+	*flipped = pending;
+	return pending != 0;
+}
+
 /**
  * panthor_fw_csg_wait_acks() - Wait for command stream group requests to be acknowledged.
  * @ptdev: Device.
@@ -1352,6 +1378,7 @@ int panthor_fw_init(struct panthor_device *ptdev)
 		goto err_unplug_fw;
 
 	panthor_fw_init_global_iface(ptdev);
+
 	return 0;
 
 err_unplug_fw:
diff --git a/drivers/gpu/drm/panthor/panthor_fw.h b/drivers/gpu/drm/panthor/panthor_fw.h
index 22448abde992..682a02118077 100644
--- a/drivers/gpu/drm/panthor/panthor_fw.h
+++ b/drivers/gpu/drm/panthor/panthor_fw.h
@@ -11,6 +11,7 @@ struct panthor_kernel_bo;
 
 #define MAX_CSGS				31
 #define MAX_CS_PER_CSG                          32
+#define MAX_PERFCNT_BUF_SLOTS                   128
 
 struct panthor_fw_ringbuf_input_iface {
 	u64 insert;
@@ -197,6 +198,8 @@ struct panthor_fw_global_control_iface {
 	u32 output_va;
 	u32 group_num;
 	u32 group_stride;
+#define GLB_PERFCNT_FW_SIZE(x)                 ((((x) >> 16) << 8))
+#define GLB_PERFCNT_HW_SIZE(x)                 (((x) & GENMASK(15, 0)) << 8)
 	u32 perfcnt_size;
 	u32 instr_features;
 };
@@ -240,6 +243,8 @@ struct panthor_fw_global_input_iface {
 	u64 perfcnt_base;
 	u32 perfcnt_extract;
 	u32 reserved3[3];
+#define GLB_PERFCNT_CFG_SIZE(x)			((x) & GENMASK(7, 0))
+#define GLB_PERFCNT_CFG_SET(x)			((GENMASK(1, 0) & (x)) << 8)
 	u32 perfcnt_config;
 	u32 perfcnt_csg_select;
 	u32 perfcnt_fw_enable;
@@ -264,6 +269,11 @@ struct panthor_fw_global_output_iface {
 	u32 doorbell_ack;
 	u32 reserved2;
 	u32 halt_status;
+
+#define GLB_PERFCNT_STATUS_FAILED            BIT(0)
+#define GLB_PERFCNT_STATUS_POWERON           BIT(1)
+#define GLB_PERFCNT_STATUS_POWEROFF          BIT(2)
+#define GLB_PERFCNT_STATUS_PROTSESSION       BIT(3)
 	u32 perfcnt_status;
 	u32 perfcnt_insert;
 };
@@ -472,6 +482,8 @@ int panthor_fw_csg_wait_acks(struct panthor_device *ptdev, u32 csg_id, u32 req_m
 int panthor_fw_glb_wait_acks(struct panthor_device *ptdev, u32 req_mask, u32 *acked,
 			     u32 timeout_ms);
 
+bool panthor_fw_glb_state_change(struct panthor_device *ptdev, u32 req_mask, u32 *flipped);
+
 void panthor_fw_ring_csg_doorbells(struct panthor_device *ptdev, u32 csg_slot);
 
 struct panthor_kernel_bo *
diff --git a/drivers/gpu/drm/panthor/panthor_perfcnt.c b/drivers/gpu/drm/panthor/panthor_perfcnt.c
new file mode 100644
index 000000000000..e223e44e3f35
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_perfcnt.c
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2023 Collabora Ltd */
+
+#include "linux/mutex.h"
+#include <linux/completion.h>
+#include <linux/iopoll.h>
+#include <linux/iosys-map.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/spinlock.h>
+
+#include <drm/drm_file.h>
+#include <drm/drm_gem_shmem_helper.h>
+#include <drm/drm_managed.h>
+#include <drm/panthor_drm.h>
+
+#include "panthor_device.h"
+#include "panthor_gem.h"
+#include "panthor_mmu.h"
+#include "panthor_perfcnt.h"
+#include "panthor_regs.h"
+#include "panthor_gpu.h"
+#include "panthor_fw.h"
+
+#define SAMPLE_TIMEOUT_MS		1000
+#define SAMPLE_HDR_SIZE                 12
+#define SAMPLE_BLOCK_SIZE		0x100
+
+#define PERFCNT_OP_AFFECTED \
+	(GLB_PERFCNT_STATUS_POWEROFF | \
+	 GLB_PERFCNT_STATUS_POWERON | \
+	 GLB_PERFCNT_STATUS_PROTSESSION)
+
+enum perfcnt_status {
+	PERFCNT_STATUS_STARTED,
+	PERFCNT_STATUS_SUCCEEDED,
+	PERFCNT_STATUS_FAILED,
+	PERFCNT_STATUS_OVERFLOW,
+};
+
+struct panthor_perfcnt {
+	struct panthor_device *ptdev;
+	struct panthor_file *user;	/* file currently owning the counters, NULL if none */
+	struct mutex lock;		/* taken around @user and @dumper_list updates */
+	/* Sample ring buffer: backing BO, size of one sample, number of slots. */
+	struct panthor_kernel_bo *bo;
+	size_t sample_size;
+	u32 ringslots;
+	/* Work item (and its queue) that processes a sample once FW has acked it. */
+	struct workqueue_struct *dumper_wkq;
+	struct work_struct work;
+	atomic_t dump_requested;
+	/* Dumpers waiting for a sample, and the waitqueue woken after they are processed. */
+	struct list_head dumper_list;
+	wait_queue_head_t wq;
+};
+
+struct panthor_perfcnt_dumper {
+	struct list_head list;
+	struct completion comp;
+	void *user_bo;
+	int last_status;
+};
+
+struct perfcnt_counters {
+	u32 counterset;
+	u32 csg_select;
+	u32 fw_enable;
+	u32 csg_enable;
+	u32 csf_enable;
+	u32 shader_enable;
+	u32 tiler_enable;
+	u32 mmu_l2_enable;
+};
+
+static int panthor_perfcnt_enable_counters(struct panthor_device *ptdev,
+					   struct perfcnt_counters *counters)
+{
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+	u32 acked, cfg = glb_iface->input->perfcnt_config & ~GLB_PERFCNT_CFG_SET(~0);
+
+	if (glb_iface->input->req & GLB_PERFCNT_EN) {
+		drm_info(&ptdev->base, "Performance counters aren't disabled!\n");
+		return -EBUSY;
+	}
+
+	/* Replace SET_SELECT rather than OR into it, or a previous counter set lingers. */
+	glb_iface->input->perfcnt_config = cfg | GLB_PERFCNT_CFG_SET(counters->counterset);
+	glb_iface->input->perfcnt_csg_select = counters->csg_select;
+	glb_iface->input->perfcnt_mmu_l2_enable = counters->mmu_l2_enable;
+	glb_iface->input->perfcnt_tiler_enable = counters->tiler_enable;
+	glb_iface->input->perfcnt_shader_enable = counters->shader_enable;
+	glb_iface->input->perfcnt_csf_enable = counters->csf_enable;
+	glb_iface->input->perfcnt_csg_enable = counters->csg_enable;
+	glb_iface->input->perfcnt_fw_enable = counters->fw_enable;
+	/* Enable/Disabled status is value-based, rather than change-of-value */
+	panthor_fw_update_reqs(glb_iface, req, GLB_PERFCNT_EN, GLB_PERFCNT_EN);
+	gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
+	return panthor_fw_glb_wait_acks(ptdev, GLB_PERFCNT_EN, &acked, 100);
+}
+
+static int
+panthor_perfcnt_disable_counters(struct panthor_device *ptdev)
+{
+	struct panthor_fw_global_iface *glb = panthor_fw_get_glb_iface(ptdev);
+	u32 ack_bits;
+	int err;
+
+	if ((glb->input->req & GLB_PERFCNT_EN) == 0) {
+		drm_info(&ptdev->base, "Performance counters were already disabled\n");
+		return 0;
+	}
+
+	/* PERFCNT_EN is value-based (not change-of-value): write it as 0 and wait. */
+	panthor_fw_update_reqs(glb, req, 0, GLB_PERFCNT_EN);
+	gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
+	err = panthor_fw_glb_wait_acks(ptdev, GLB_PERFCNT_EN, &ack_bits, 100);
+	if (err) {
+		drm_err(&ptdev->base, "Could not disable performance counters\n");
+		return err;
+	}
+
+	/* The disable request went through: wipe the per-block selections. */
+	glb->input->perfcnt_csg_select = 0;
+	glb->input->perfcnt_mmu_l2_enable = 0;
+	glb->input->perfcnt_tiler_enable = 0;
+	glb->input->perfcnt_shader_enable = 0;
+	glb->input->perfcnt_csf_enable = 0;
+	glb->input->perfcnt_csg_enable = 0;
+	glb->input->perfcnt_fw_enable = 0;
+	return 0;
+}
+
+static void perfcnt_copy_sample(struct panthor_device *ptdev,
+				struct panthor_perfcnt *perfcnt,
+				void *bo_va, unsigned int idx)
+{
+	/*
+	 * The slot count is always a power of two, hence wrapping the index
+	 * into the ring with a mask instead of a modulo.
+	 */
+	unsigned int slot = idx & (perfcnt->ringslots - 1);
+
+	memcpy(bo_va, perfcnt->bo->kmap + slot * perfcnt->sample_size, perfcnt->sample_size);
+}
+
+static void clear_slot_headers(struct panthor_device *ptdev, u32 ext_idx, u32 ins_idx)
+{
+	struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+	unsigned int slot_idx;
+	unsigned int hdr_off;
+
+	if (WARN_ON(ext_idx >= ins_idx))  {
+		drm_warn(&ptdev->base, "Extraction index is greater or equal than insertion index %u-%u\n",
+			 ext_idx, ins_idx);
+		return;
+	}
+
+	drm_dbg(&ptdev->base, "Cleaning perfcnt ring buffer slots %u-%u\n", ext_idx, ins_idx);
+	/* Wipe the header opening each SAMPLE_BLOCK_SIZE chunk of every consumed slot. */
+	for (slot_idx = ext_idx; slot_idx < ins_idx; slot_idx++) {
+		void *slot_va = perfcnt->bo->kmap;
+
+		slot_va += (slot_idx & (ptdev->perfcnt->ringslots - 1)) * perfcnt->sample_size;
+		for (hdr_off = 0; hdr_off < perfcnt->sample_size; hdr_off += SAMPLE_BLOCK_SIZE)
+			memset(slot_va + hdr_off, 0, SAMPLE_HDR_SIZE);
+	}
+}
+
+static void clean_dumper_list(struct panthor_device *ptdev, unsigned int status)
+{
+	struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+	struct panthor_perfcnt_dumper *dumper, *dumper_tmp;
+	/* Snapshot once: every waiter must get the same frame even if FW moves on. */
+	u32 last = READ_ONCE(glb_iface->output->perfcnt_insert) - 1;
+
+	mutex_lock(&perfcnt->lock);
+	list_for_each_entry_safe(dumper, dumper_tmp, &perfcnt->dumper_list, list) {
+		if (status == PERFCNT_STATUS_SUCCEEDED)
+			perfcnt_copy_sample(ptdev, perfcnt, dumper->user_bo, last);
+		list_del_init(&dumper->list);
+		dumper->last_status = status;
+		complete(&dumper->comp);
+	}
+	mutex_unlock(&perfcnt->lock);
+}
+
+static void perfcnt_process_sample(struct work_struct *work)
+{
+	struct panthor_perfcnt *perfcnt =
+		container_of(work, struct panthor_perfcnt, work);
+	struct panthor_device *ptdev = perfcnt->ptdev;
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+	u32 acked, flipped;
+	int ret;
+	/* Worker body; it is queued by panthor_perfcnt_report_fw_events(). */
+	if (panthor_fw_glb_state_change(ptdev, GLB_PERFCNT_THRESHOLD, &flipped)) {
+		drm_dbg(&ptdev->base, "Performance counter buffer has reached 50%% capacity\n");
+		panthor_fw_toggle_reqs(glb_iface, req, ack, flipped);
+		gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
+		ret = panthor_fw_glb_wait_acks(ptdev, flipped, &acked, 100);
+		if (ret)
+			drm_warn(&ptdev->base, "Resetting Threshold flags failed\n");
+	}
+	/* On a FW-reported failure, fail every waiter and leave the ring indices untouched. */
+	if (glb_iface->output->perfcnt_status & GLB_PERFCNT_STATUS_FAILED) {
+		drm_err(&ptdev->base, "Perfcounter sampling failed\n");
+		clean_dumper_list(ptdev, PERFCNT_STATUS_FAILED);
+		goto worker_exit;
+	}
+	/* On overflow, waiters are completed with OVERFLOW but slots are still released below. */
+	if (panthor_fw_glb_state_change(ptdev, GLB_PERFCNT_OVERFLOW, &flipped)) {
+		drm_info(&ptdev->base, "The performance counter buffer has overflowed. Some samples may have been lost\n");
+		panthor_fw_toggle_reqs(glb_iface, req, ack, flipped);
+		gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
+		ret = panthor_fw_glb_wait_acks(ptdev, flipped, &acked, 100);
+		if (ret)
+			drm_err(&ptdev->base, "Resetting Overflow flags failed\n");
+		clean_dumper_list(ptdev, PERFCNT_STATUS_OVERFLOW);
+		goto clear_inc_idx;
+	}
+
+	if (glb_iface->output->perfcnt_status & PERFCNT_OP_AFFECTED)
+		drm_warn(&ptdev->base, "Perfcnt sample operation might have been impacted by a power transition or protected session exec\n");
+	/* Normal case: copy the newest sample to every waiter and complete them. */
+	clean_dumper_list(ptdev, PERFCNT_STATUS_SUCCEEDED);
+
+clear_inc_idx:
+	clear_slot_headers(ptdev, glb_iface->input->perfcnt_extract,
+			   glb_iface->output->perfcnt_insert);
+	/*
+	 * TRM recommends increasing the extract pointer by one after every sample
+	 * operation, but because sample requests are processed sequentially and we
+	 * discard samples triggered by the HW automatically, it's best if we simply
+	 * set it to the next insert slot index.
+	 */
+	WRITE_ONCE(glb_iface->input->perfcnt_extract,
+		   READ_ONCE(glb_iface->output->perfcnt_insert));
+worker_exit:
+	wake_up_all(&perfcnt->wq);
+}
+
+int panthor_perfcnt_dump(struct panthor_device *ptdev,
+			 struct drm_gem_object *obj,
+			 struct panthor_file *pfile)
+{
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+	struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+	struct panthor_perfcnt_dumper dumper;
+	struct iosys_map map;
+	long wret;
+	int ret;
+
+	mutex_lock(&perfcnt->lock);
+	/* Only the perfcnt owner may dump, and the BO must be able to hold a sample. */
+	if (perfcnt->user != pfile || obj->size < perfcnt->sample_size) {
+		ret = -EINVAL;
+		goto err_dump;
+	}
+
+	ret = drm_gem_vmap_unlocked(obj, &map);
+	if (ret) {
+		drm_err(&ptdev->base, "Could not map the target BO\n");
+		goto err_dump;
+	}
+
+	dumper.user_bo = map.vaddr;
+	dumper.last_status = PERFCNT_STATUS_STARTED;
+	init_completion(&dumper.comp);
+	list_add_tail(&dumper.list, &perfcnt->dumper_list);
+
+	/* Start the sampling if list were empty */
+	if (list_is_first(&dumper.list, &perfcnt->dumper_list)) {
+		panthor_fw_toggle_reqs(glb_iface, req, ack, GLB_PERFCNT_SAMPLE);
+		atomic_set(&ptdev->perfcnt->dump_requested, 1);
+		gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
+	}
+	mutex_unlock(&perfcnt->lock);
+
+	wret = wait_for_completion_interruptible_timeout(&dumper.comp,
+							 msecs_to_jiffies(SAMPLE_TIMEOUT_MS));
+	if (wret <= 0)
+		/* Let's give the worker thread a chance to finish */
+		flush_work(&perfcnt->work);
+
+	/*
+	 * @dumper lives on our stack, so on timeout or signal it must leave the
+	 * list before we return. The worker re-inits the node under the lock.
+	 */
+	mutex_lock(&perfcnt->lock);
+	if (!list_empty(&dumper.list)) {
+		list_del(&dumper.list);
+		if (list_empty(&perfcnt->dumper_list)) {
+			atomic_set(&ptdev->perfcnt->dump_requested, 0);
+			wake_up_all(&perfcnt->wq);
+		}
+		ret = wret < 0 ? wret : -ETIMEDOUT;
+	} else {
+		ret = (dumper.last_status >= PERFCNT_STATUS_FAILED) ? -EIO : 0;
+	}
+	mutex_unlock(&perfcnt->lock);
+
+	drm_gem_vunmap_unlocked(obj, &map);
+	return ret;
+
+err_dump:
+	mutex_unlock(&perfcnt->lock);
+	return ret;
+}
+
+
+/*
+ * Caller holds perfcnt->lock. Makes @pfile the perfcnt owner and starts the
+ * counters in @req, reallocating the ring buffer when the slot count changes.
+ */
+static int panthor_perfcnt_enable_locked(struct panthor_device *ptdev,
+				  struct panthor_file *pfile,
+				  struct drm_panthor_perfcnt_config *req)
+{
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+	unsigned int perfcnt_ringbuf_slots = req->ringslots;
+	struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+	struct panthor_kernel_bo *bo;
+	int ret;
+
+	if (pfile == perfcnt->user)
+		return 0;
+	else if (perfcnt->user)
+		return -EBUSY;
+
+	/* Sanitise before comparing, so we compare against the value really used. */
+	if (!perfcnt_ringbuf_slots)
+		return -EINVAL;
+	perfcnt_ringbuf_slots = min_t(unsigned int, perfcnt_ringbuf_slots, MAX_PERFCNT_BUF_SLOTS);
+	perfcnt_ringbuf_slots = rounddown_pow_of_two(perfcnt_ringbuf_slots);
+
+	if (perfcnt_ringbuf_slots != perfcnt->ringslots || !perfcnt->bo) {
+		if (perfcnt->bo) {
+			panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), perfcnt->bo);
+			perfcnt->bo = NULL;
+		}
+
+		/*
+		 * Create the perfcnt dump BO in the FW's VM: GLB_PRFCNT_JASID's maximum
+		 * implementation defined value is 7, and the way AS are assigned in
+		 * panthor_vm_active means an AS between 1 and 7 can't be guaranteed.
+		 * The counters being global, binding the ringbuf to the FW AS is simplest.
+		 */
+		bo = panthor_kernel_bo_create(ptdev, panthor_fw_vm(ptdev),
+					      perfcnt->sample_size * perfcnt_ringbuf_slots,
+					      DRM_PANTHOR_BO_NO_MMAP,
+					      DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
+					      DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
+					      PANTHOR_VM_KERNEL_AUTO_VA);
+		if (IS_ERR(bo))
+			return PTR_ERR(bo);
+
+		ret = panthor_kernel_bo_vmap(bo);
+		if (ret) {
+			panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), bo);
+			return ret;
+		}
+
+		perfcnt->bo = bo;
+		perfcnt->ringslots = perfcnt_ringbuf_slots;
+		glb_iface->input->perfcnt_base = perfcnt->bo->va_node.start;
+		glb_iface->input->perfcnt_config &= ~GLB_PERFCNT_CFG_SIZE(~0);
+		glb_iface->input->perfcnt_config |= GLB_PERFCNT_CFG_SIZE(perfcnt->ringslots);
+	}
+
+	ret = pm_runtime_get_sync(ptdev->base.dev);
+	if (ret < 0)
+		goto err_rpm_put;
+
+	ret = panthor_perfcnt_disable_counters(ptdev);
+	if (ret)
+		goto err_rpm_put;
+
+	ret = panthor_perfcnt_enable_counters(ptdev,
+					      (struct perfcnt_counters *) &req->counterset);
+	if (ret)
+		goto err_rpm_put;
+
+	perfcnt->user =	pfile;
+	return 0;
+
+err_rpm_put:
+	/* The ring buffer is kept; it is reused on retry and freed at unplug. */
+	pm_runtime_put(ptdev->base.dev);
+	return ret;
+}
+
+static int panthor_perfcnt_disable_locked(struct panthor_device *ptdev,
+					  struct panthor_file *pfile)
+{
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+	struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+	int ret;
+
+	if (perfcnt->user != pfile)
+		return -EINVAL;
+	/* Lock is held on entry; drop it while in-flight dumps drain (bounded wait). */
+	if (!list_empty(&perfcnt->dumper_list)) {
+		drm_warn(&ptdev->base, "A perfcnt dump is still running, let it finnish\n");
+		mutex_unlock(&perfcnt->lock);
+		ret  = wait_event_timeout(perfcnt->wq,
+				   list_empty(&perfcnt->dumper_list),
+				   msecs_to_jiffies(SAMPLE_TIMEOUT_MS));
+		mutex_lock(&perfcnt->lock);
+		if (!ret)
+			drm_warn(&ptdev->base, "Dump didn't finish, results will be undefined\n");
+	}
+
+	panthor_perfcnt_disable_counters(ptdev);
+	glb_iface->input->perfcnt_extract = 0;
+	perfcnt->user = NULL;
+	/* Drops the runtime PM reference that panthor_perfcnt_enable_locked() kept. */
+	pm_runtime_mark_last_busy(ptdev->base.dev);
+	pm_runtime_put_autosuspend(ptdev->base.dev);
+
+	return 0;
+}
+
+/* Enable the counters for @pfile, or disable them when @disable is true. */
+int panthor_perfcnt_config(struct panthor_device *ptdev,
+			   struct drm_panthor_perfcnt_config *req,
+			   struct panthor_file *pfile,
+			   bool disable)
+{
+	struct mutex *lock = &ptdev->perfcnt->lock;
+	int status;
+
+	/* Both helpers expect perfcnt->lock to be held. */
+	mutex_lock(lock);
+	status = disable ? panthor_perfcnt_disable_locked(ptdev, pfile) :
+			   panthor_perfcnt_enable_locked(ptdev, pfile, req);
+	mutex_unlock(lock);
+
+	return status;
+}
+
+void panthor_perfcnt_close(struct drm_file *file_priv)
+{
+	struct panthor_file *closing = file_priv->driver_priv;
+	struct panthor_device *ptdev = closing->ptdev;
+	struct device *dev = ptdev->base.dev;
+
+	/* Keep the device powered while the counters are (possibly) being stopped. */
+	pm_runtime_get_sync(dev);
+	mutex_lock(&ptdev->perfcnt->lock);
+	if (closing == ptdev->perfcnt->user)
+		panthor_perfcnt_disable_locked(ptdev, closing);
+	mutex_unlock(&ptdev->perfcnt->lock);
+
+	pm_runtime_mark_last_busy(dev);
+	pm_runtime_put_autosuspend(dev);
+}
+
+void panthor_perfcnt_report_fw_events(struct panthor_device *ptdev, u32 status)
+{
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+	struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+	u32 pending;
+
+	if (!perfcnt)
+		return;
+
+	/* A set bit in @pending means req and ack still differ for that request. */
+	pending = READ_ONCE(glb_iface->input->req);
+	pending ^= READ_ONCE(glb_iface->output->ack);
+	if (!(~pending & GLB_PERFCNT_SAMPLE))
+		return;
+	if (panthor_device_reset_is_pending(ptdev))
+		return;
+	if (atomic_cmpxchg(&perfcnt->dump_requested, 1, 0))
+		queue_work(perfcnt->dumper_wkq, &perfcnt->work);
+}
+
+/* Allocate the perfcnt state, read sample sizes from FW and start with counters off. */
+int panthor_perfcnt_init(struct panthor_device *ptdev)
+{
+	struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev);
+	struct panthor_perfcnt *perfcnt;
+	int ret;
+
+	perfcnt = devm_kzalloc(ptdev->base.dev, sizeof(*perfcnt), GFP_KERNEL);
+	if (!perfcnt)
+		return -ENOMEM;
+
+	ptdev->perfcnt_info.fw_size = GLB_PERFCNT_FW_SIZE(glb_iface->control->perfcnt_size);
+	ptdev->perfcnt_info.hw_size = GLB_PERFCNT_HW_SIZE(glb_iface->control->perfcnt_size);
+
+	/* devm_kzalloc() zeroed @perfcnt: no ring buffer and zero slots until first enable. */
+	perfcnt->sample_size = ptdev->perfcnt_info.fw_size + ptdev->perfcnt_info.hw_size;
+
+	perfcnt->dumper_wkq = alloc_workqueue("perfcnt-dumper", WQ_UNBOUND, 0);
+	if (!perfcnt->dumper_wkq) {
+		drm_err(&ptdev->base, "Failed to allocate perfcnt workqueue\n");
+		return -ENOMEM;
+	}
+	INIT_WORK(&perfcnt->work, perfcnt_process_sample);
+
+	/* Perfcnt configuration */
+	glb_iface->input->perfcnt_config |= GLB_PERFCNT_CFG_SIZE(perfcnt->ringslots);
+	glb_iface->input->perfcnt_as = panthor_vm_as(panthor_fw_vm(ptdev));
+	glb_iface->input->perfcnt_extract = 0;
+
+	/* Start with everything disabled. */
+	ret = panthor_perfcnt_disable_counters(ptdev);
+	if (ret)
+		goto err_dealloc_workqueue;
+
+	INIT_LIST_HEAD(&perfcnt->dumper_list);
+	init_waitqueue_head(&perfcnt->wq);
+	mutex_init(&perfcnt->lock);
+
+	perfcnt->ptdev = ptdev;
+	ptdev->perfcnt = perfcnt;
+
+	drm_info(&ptdev->base,
+		 "Perfcnt params: Sample size: %#zx Slots: %u\n",
+		 perfcnt->sample_size, perfcnt->ringslots);
+
+	return 0;
+
+err_dealloc_workqueue:
+	destroy_workqueue(perfcnt->dumper_wkq);
+
+	return ret;
+}
+
+void panthor_perfcnt_unplug(struct panthor_device *ptdev)
+{
+	struct panthor_perfcnt *perfcnt = ptdev->perfcnt;
+
+	/* Nothing to tear down if panthor_perfcnt_init() never completed. */
+	if (!perfcnt)
+		return;
+
+	WARN_ON(perfcnt->user);
+	panthor_perfcnt_disable_counters(ptdev);
+	cancel_work_sync(&perfcnt->work);
+	destroy_workqueue(perfcnt->dumper_wkq);
+	mutex_destroy(&perfcnt->lock);
+	if (perfcnt->bo) {
+		panthor_kernel_bo_vunmap(perfcnt->bo);
+		panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), perfcnt->bo);
+	}
+}
diff --git a/drivers/gpu/drm/panthor/panthor_perfcnt.h b/drivers/gpu/drm/panthor/panthor_perfcnt.h
new file mode 100644
index 000000000000..6edcbe256f4a
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_perfcnt.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2023 Collabora Ltd */
+#ifndef __PANTHOR_PERFCNT_H__
+#define __PANTHOR_PERFCNT_H__
+
+#include <linux/types.h>
+
+struct panthor_device;
+struct panthor_file;
+struct drm_device;
+struct drm_file;
+struct drm_gem_object;
+struct drm_panthor_perfcnt_config;
+
+int panthor_perfcnt_init(struct panthor_device *ptdev);
+void panthor_perfcnt_unplug(struct panthor_device *ptdev);
+void panthor_perfcnt_close(struct drm_file *file_priv);
+
+int panthor_perfcnt_config(struct panthor_device *ptdev,
+			   struct drm_panthor_perfcnt_config *req,
+			   struct panthor_file *pfile,
+			   bool disable);
+int panthor_perfcnt_dump(struct panthor_device *ptdev,
+			 struct drm_gem_object *obj,
+			 struct panthor_file *pfile);
+
+void panthor_perfcnt_report_fw_events(struct panthor_device *ptdev,
+				      u32 status);
+
+
+#endif
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 5f7803b6fc48..cbd0ab77a3cd 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -31,6 +31,7 @@
 #include "panthor_mmu.h"
 #include "panthor_regs.h"
 #include "panthor_sched.h"
+#include "panthor_perfcnt.h"
 
 /**
  * DOC: Scheduler
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index 373df80f41ed..0ca940529be4 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -127,6 +127,12 @@ enum drm_panthor_ioctl_id {
 
 	/** @DRM_PANTHOR_TILER_HEAP_DESTROY: Destroy a tiler heap. */
 	DRM_PANTHOR_TILER_HEAP_DESTROY,
+
+	/** @DRM_PANTHOR_PERFCNT_CONFIG: Enable or disable performance counters. */
+	DRM_PANTHOR_PERFCNT_CONFIG,
+
+	/** @DRM_PANTHOR_PERFCNT_DUMP: Sample and retrieve performance counters. */
+	DRM_PANTHOR_PERFCNT_DUMP,
 };
 
 /**
@@ -170,6 +176,10 @@ enum drm_panthor_ioctl_id {
 	DRM_IOCTL_PANTHOR(WR, TILER_HEAP_CREATE, tiler_heap_create)
 #define DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY \
 	DRM_IOCTL_PANTHOR(WR, TILER_HEAP_DESTROY, tiler_heap_destroy)
+#define DRM_IOCTL_PANTHOR_PERFCNT_CONFIG \
+	DRM_IOCTL_PANTHOR(WR, PERFCNT_CONFIG, perfcnt_config)
+#define DRM_IOCTL_PANTHOR_PERFCNT_DUMP \
+	DRM_IOCTL_PANTHOR(WR, PERFCNT_DUMP, perfcnt_dump)
 
 /**
  * DOC: IOCTL arguments
@@ -260,6 +270,9 @@ enum drm_panthor_dev_query_type {
 
 	/** @DRM_PANTHOR_DEV_QUERY_CSIF_INFO: Query command-stream interface information. */
 	DRM_PANTHOR_DEV_QUERY_CSIF_INFO,
+
+	/** @DRM_PANTHOR_DEV_QUERY_PERFCNT_INFO: Query perf counters interface information. */
+	DRM_PANTHOR_DEV_QUERY_PERFCNT_INFO,
 };
 
 /**
@@ -377,6 +390,19 @@ struct drm_panthor_csif_info {
 	__u32 pad;
 };
 
+/**
+ * struct drm_panthor_perfcnt_info - Performance counters interface information
+ *
+ * Structure grouping all queryable information relating to the perfcnt interface.
+ */
+struct drm_panthor_perfcnt_info {
+	/** @hw_size: Size of HW related performance counters. */
+	__u32 hw_size;
+
+	/** @fw_size: Size of FW related performance counters. */
+	__u32 fw_size;
+};
+
 /**
  * struct drm_panthor_dev_query - Arguments passed to DRM_PANTHOR_IOCTL_DEV_QUERY
  */
@@ -938,6 +964,52 @@ struct drm_panthor_tiler_heap_destroy {
 	__u32 pad;
 };
 
+/**
+ * struct drm_panthor_perfcnt_config - Arguments passed to DRM_IOCTL_PANTHOR_PERFCNT_CONFIG
+ */
+struct drm_panthor_perfcnt_config {
+	/** @ringslots: Size of the perfcnt ring buffer in slot count. */
+	__u32 ringslots;
+
+	/** @counterset: Counter set to enable in Panthor. */
+	__u32 counterset;
+
+	/** @csg_select: List of CSG instances enabled for perf counting. */
+	__u32 csg_select;
+
+	/** @fw_enable: FW counters to be enabled. */
+	__u32 fw_enable;
+
+	/** @csg_enable: CSG counters to be enabled. */
+	__u32 csg_enable;
+
+	/** @csf_enable: CSF counters to be enabled. */
+	__u32 csf_enable;
+
+	/** @shader_enable: Shader unit counters to be enabled. */
+	__u32 shader_enable;
+
+	/** @tiler_enable: Tiler unit counters to be enabled. */
+	__u32 tiler_enable;
+
+	/** @mmu_l2_enable: L2 cache MMU counters to be enabled. */
+	__u32 mmu_l2_enable;
+
+	/** @pad: Padding field, MBZ. */
+	__u32 pad;
+};
+
+/**
+ * struct drm_panthor_perfcnt_dump - Arguments passed to DRM_IOCTL_PANTHOR_PERFCNT_DUMP
+ */
+struct drm_panthor_perfcnt_dump {
+	/** @handle: Handle of the BO to write perfcnt dump into */
+	__u32 handle;
+
+	/** @pad: Padding field, MBZ. */
+	__u32 pad;
+};
+
 #if defined(__cplusplus)
 }
 #endif

base-commit: e635b7eb7062b464bbd9795308b1a80eac0b01f5
-- 
2.43.0



More information about the dri-devel mailing list