[PATCH 08/17] drm/xe/oa: OA stream initialization (OAG)

Umesh Nerlige Ramappa umesh.nerlige.ramappa at intel.com
Wed Dec 20 02:31:45 UTC 2023


On Thu, Dec 07, 2023 at 10:43:20PM -0800, Ashutosh Dixit wrote:
>Implement the majority of OA stream initialization (done as part of the OA
>stream open ioctl). The OAG buffer is allocated for receiving perf counter
>samples from HW. The OAG unit is initialized and the selected OA metric
>configuration is programmed into the OAG unit HW using a command/batch buffer.
>
>Signed-off-by: Ashutosh Dixit <ashutosh.dixit at intel.com>
>---
> drivers/gpu/drm/xe/regs/xe_gt_regs.h |   3 +
> drivers/gpu/drm/xe/xe_oa.c           | 397 +++++++++++++++++++++++++++
> drivers/gpu/drm/xe/xe_oa_types.h     |  82 ++++++
> 3 files changed, 482 insertions(+)
>
>diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>index d318ec0efd7db..1b98b609f7fda 100644
>--- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>+++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
>@@ -156,6 +156,8 @@
>
> #define SQCNT1					XE_REG_MCR(0x8718)
> #define XELPMP_SQCNT1				XE_REG(0x8718)
>+#define   SQCNT1_PMON_ENABLE			REG_BIT(30)
>+#define   SQCNT1_OABPC				REG_BIT(29)
> #define   ENFORCE_RAR				REG_BIT(23)

REG_BIT(29) indentation seems to be off
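
Presumably it should line up with the neighbouring bit definitions, i.e.
something like:

#define   SQCNT1_PMON_ENABLE			REG_BIT(30)
#define   SQCNT1_OABPC				REG_BIT(29)
#define   ENFORCE_RAR				REG_BIT(23)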

>
> #define XEHP_SQCM				XE_REG_MCR(0x8724)
>@@ -365,6 +367,7 @@
> #define ROW_CHICKEN				XE_REG_MCR(0xe4f0, XE_REG_OPTION_MASKED)
> #define   UGM_BACKUP_MODE			REG_BIT(13)
> #define   MDQ_ARBITRATION_MODE			REG_BIT(12)
>+#define   STALL_DOP_GATING_DISABLE		REG_BIT(5)
> #define   EARLY_EOT_DIS				REG_BIT(1)
>
> #define ROW_CHICKEN2				XE_REG_MCR(0xe4f4, XE_REG_OPTION_MASKED)
>diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c
>index 9b0bd58fcbc06..d898610322d50 100644
>--- a/drivers/gpu/drm/xe/xe_oa.c
>+++ b/drivers/gpu/drm/xe/xe_oa.c
>@@ -6,15 +6,26 @@
> #include <linux/nospec.h>
> #include <linux/sysctl.h>
>
>+#include <drm/drm_drv.h>
>+#include <drm/xe_drm.h>
>+
>+#include "instructions/xe_mi_commands.h"
> #include "regs/xe_gt_regs.h"
> #include "regs/xe_oa_regs.h"
> #include "xe_device.h"
> #include "xe_exec_queue.h"
>+#include "xe_bb.h"
>+#include "xe_bo.h"
> #include "xe_gt.h"
>+#include "xe_gt_mcr.h"
> #include "xe_mmio.h"
> #include "xe_oa.h"
>+#include "xe_sched_job.h"
> #include "xe_perf.h"
>
>+#define DEFAULT_POLL_FREQUENCY_HZ 200
>+#define DEFAULT_POLL_PERIOD_NS (NSEC_PER_SEC / DEFAULT_POLL_FREQUENCY_HZ)
>+
> static int xe_oa_sample_rate_hard_limit;
> static u32 xe_oa_max_sample_rate = 100000;
>
>@@ -63,6 +74,13 @@ struct xe_oa_open_param {
> 	struct xe_hw_engine *hwe;
> };
>
>+struct xe_oa_config_bo {
>+	struct llist_node node;
>+
>+	struct xe_oa_config *oa_config;
>+	struct xe_bb *bb;
>+};
>+
> #define DRM_FMT(x) DRM_XE_OA_FMT_TYPE_##x
>
> static const struct xe_oa_format oa_formats[] = {
>@@ -105,6 +123,381 @@ static void xe_oa_config_put(struct xe_oa_config *oa_config)
> 	kref_put(&oa_config->ref, xe_oa_config_release);
> }
>
>+static struct xe_oa_config *xe_oa_config_get(struct xe_oa_config *oa_config)
>+{
>+	return kref_get_unless_zero(&oa_config->ref) ? oa_config : NULL;
>+}
>+
>+static struct xe_oa_config *xe_oa_get_oa_config(struct xe_oa *oa, int metrics_set)
>+{
>+	struct xe_oa_config *oa_config;
>+
>+	rcu_read_lock();
>+	oa_config = idr_find(&oa->metrics_idr, metrics_set);
>+	if (oa_config)
>+		oa_config = xe_oa_config_get(oa_config);
>+	rcu_read_unlock();
>+
>+	return oa_config;
>+}
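
(Just noting for readers: the rcu_read_lock() + kref_get_unless_zero()
pairing is the standard lookup-side pattern. A concurrent removal may
already have dropped the last reference, so the lookup only elevates the
refcount if it is still non-zero. This assumes the removal side does
something along the lines of (sketch, not from this patch):

	idr_remove(&oa->metrics_idr, metrics_set);
	xe_oa_config_put(oa_config);	/* final put must defer the free, e.g. via kfree_rcu() */

so the object stays valid for any reader still inside the RCU section.)
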
>+
>+static void free_oa_config_bo(struct xe_oa_config_bo *oa_bo)
>+{
>+	xe_oa_config_put(oa_bo->oa_config);
>+	xe_bb_free(oa_bo->bb, NULL);
>+	kfree(oa_bo);
>+}
>+
>+static const struct xe_oa_regs *__oa_regs(struct xe_oa_stream *stream)
>+{
>+	return &stream->hwe->oa_unit->regs;
>+}
>+
>+static int xe_oa_submit_bb(struct xe_oa_stream *stream, struct xe_bb *bb)
>+{
>+	struct xe_sched_job *job;
>+	struct dma_fence *fence;
>+	long timeout;
>+	int err = 0;
>+
>+	/* Kernel configuration is issued on stream->k_exec_q, not stream->exec_q */
>+	job = xe_bb_create_job(stream->k_exec_q, bb);
>+	if (IS_ERR(job)) {
>+		err = PTR_ERR(job);
>+		goto exit;
>+	}
>+
>+	xe_sched_job_arm(job);
>+	fence = dma_fence_get(&job->drm.s_fence->finished);
>+	xe_sched_job_push(job);
>+
>+	timeout = dma_fence_wait_timeout(fence, false, HZ);
>+	dma_fence_put(fence);
>+	if (timeout < 0)
>+		err = timeout;
>+	else if (!timeout)
>+		err = -ETIME;
>+exit:
>+	return err;
>+}
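
For readers following along: this submission is fully synchronous. The job
is pushed and then we block on the scheduler's finished fence for up to a
second (HZ jiffies). The dma_fence_wait_timeout() return value mapping is:

	timeout < 0	-> err = timeout	(error from the wait)
	timeout == 0	-> err = -ETIME		(job did not complete in time)
	timeout > 0	-> success		(remaining jiffies, discarded)

which is what the error handling above implements.
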
>+
>+static void xe_oa_free_oa_buffer(struct xe_oa_stream *stream)
>+{
>+	xe_bo_unpin_map_no_vm(stream->oa_buffer.bo);
>+}
>+
>+static void xe_oa_free_configs(struct xe_oa_stream *stream)
>+{
>+	struct xe_oa_config_bo *oa_bo, *tmp;
>+
>+	xe_oa_config_put(stream->oa_config);
>+	llist_for_each_entry_safe(oa_bo, tmp, stream->oa_config_bos.first, node)
>+		free_oa_config_bo(oa_bo);
>+}
>+
>+#define HAS_OA_BPC_REPORTING(xe) (GRAPHICS_VERx100(xe) >= 1255)
>+
>+static void xe_oa_disable_metric_set(struct xe_oa_stream *stream)
>+{
>+	u32 sqcnt1;
>+
>+	/*
>+	 * Wa_1508761755:xehpsdv, dg2
>+	 * Enable thread stall DOP gating and EU DOP gating.
>+	 */
>+	if (stream->oa->xe->info.platform == XE_DG2) {
>+		xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN,
>+					  _MASKED_BIT_DISABLE(STALL_DOP_GATING_DISABLE));
>+		xe_gt_mcr_multicast_write(stream->gt, ROW_CHICKEN2,
>+					  _MASKED_BIT_DISABLE(DISABLE_DOP_GATING));
>+	}
>+
>+	/* Make sure we disable noa to save power. */
>+	xe_mmio_rmw32(stream->gt, RPM_CONFIG1, GT_NOA_ENABLE, 0);
>+
>+	sqcnt1 = SQCNT1_PMON_ENABLE |
>+		 (HAS_OA_BPC_REPORTING(stream->oa->xe) ? SQCNT1_OABPC : 0);
>+
>+	/* Reset PMON Enable to save power. */
>+	xe_mmio_rmw32(stream->gt, XELPMP_SQCNT1, sqcnt1, 0);
>+}
>+
>+static int xe_oa_alloc_oa_buffer(struct xe_oa_stream *stream)
>+{
>+	struct xe_bo *bo;
>+
>+	BUILD_BUG_ON_NOT_POWER_OF_2(XE_OA_BUFFER_SIZE);
>+	BUILD_BUG_ON(XE_OA_BUFFER_SIZE < SZ_128K || XE_OA_BUFFER_SIZE > SZ_16M);
>+
>+	bo = xe_bo_create_pin_map(stream->oa->xe, stream->gt->tile, NULL,
>+				  XE_OA_BUFFER_SIZE, ttm_bo_type_kernel,
>+				  XE_BO_CREATE_SYSTEM_BIT | XE_BO_CREATE_GGTT_BIT);
>+	if (IS_ERR(bo))
>+		return PTR_ERR(bo);
>+
>+	stream->oa_buffer.bo = bo;
>+	stream->oa_buffer.vaddr = bo->vmap.vaddr;
>+	return 0;
>+}
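
The power-of-2 BUILD_BUG_ON might deserve a comment at some point: with a
power-of-2 buffer the read/write pointer bookkeeping can use a simple mask
rather than a conditional reset, along the lines of (sketch, not from this
patch):

	/* advance past one report, wrapping at the buffer boundary */
	tail = (tail + report_size) & (XE_OA_BUFFER_SIZE - 1);

and the 128K..16M bounds track the sizes the OA unit's buffer size field
can encode.
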
>+
>+static void write_cs_mi_lri(struct xe_bb *bb, const struct xe_oa_reg *reg_data, u32 n_regs)
>+{
>+	u32 i;
>+
>+#define MI_LOAD_REGISTER_IMM_MAX_REGS (126)
>+
>+	for (i = 0; i < n_regs; i++) {
>+		if ((i % MI_LOAD_REGISTER_IMM_MAX_REGS) == 0) {
>+			u32 n_lri = min_t(u32, n_regs - i,
>+					  MI_LOAD_REGISTER_IMM_MAX_REGS);
>+
>+			bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(n_lri);
>+		}
>+		bb->cs[bb->len++] = reg_data[i].addr.addr;
>+		bb->cs[bb->len++] = reg_data[i].value;
>+	}
>+}
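
For reference, each MI_LOAD_REGISTER_IMM packet emitted here is one header
dword followed by (address, value) pairs, capped at 126 pairs per packet.
So e.g. three registers emit seven dwords:

	MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(3)
	addr0, value0
	addr1, value1
	addr2, value2
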
>+
>+static int num_lri_dwords(int num_regs)
>+{
>+	int count = 0;
>+
>+	if (num_regs > 0) {
>+		count += DIV_ROUND_UP(num_regs, MI_LOAD_REGISTER_IMM_MAX_REGS);
>+		count += num_regs * 2;
>+	}
>+
>+	return count;
>+}
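
Which makes the sizing easy to sanity-check, e.g. for 200 registers:
DIV_ROUND_UP(200, 126) = 2 header dwords plus 200 * 2 = 400 address/value
dwords, 402 dwords total, which __xe_oa_alloc_config_buffer() below then
pads out to a whole page.
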
>+
>+static struct xe_oa_config_bo *
>+__xe_oa_alloc_config_buffer(struct xe_oa_stream *stream, struct xe_oa_config *oa_config)
>+{
>+	struct xe_oa_config_bo *oa_bo;
>+	size_t config_length;
>+	struct xe_bb *bb;
>+
>+	oa_bo = kzalloc(sizeof(*oa_bo), GFP_KERNEL);
>+	if (!oa_bo)
>+		return ERR_PTR(-ENOMEM);
>+
>+	config_length = num_lri_dwords(oa_config->regs_len);
>+	config_length = ALIGN(sizeof(u32) * config_length, XE_PAGE_SIZE) / sizeof(u32);
>+
>+	bb = xe_bb_new(stream->gt, config_length, false);
>+	if (IS_ERR(bb))
>+		goto err_free;
>+
>+	write_cs_mi_lri(bb, oa_config->regs, oa_config->regs_len);
>+
>+	oa_bo->bb = bb;
>+	oa_bo->oa_config = xe_oa_config_get(oa_config);
>+	llist_add(&oa_bo->node, &stream->oa_config_bos);
>+
>+	return oa_bo;
>+err_free:
>+	kfree(oa_bo);
>+	return ERR_CAST(bb);
>+}
>+
>+static struct xe_oa_config_bo *xe_oa_alloc_config_buffer(struct xe_oa_stream *stream)
>+{
>+	struct xe_oa_config *oa_config = stream->oa_config;
>+	struct xe_oa_config_bo *oa_bo;
>+
>+	/* Look for the buffer in the already allocated BOs attached to the stream */
>+	llist_for_each_entry(oa_bo, stream->oa_config_bos.first, node) {
>+		if (oa_bo->oa_config == oa_config &&
>+		    memcmp(oa_bo->oa_config->uuid, oa_config->uuid,
>+			   sizeof(oa_config->uuid)) == 0)
>+			goto out;
>+	}
>+
>+	oa_bo = __xe_oa_alloc_config_buffer(stream, oa_config);
>+out:
>+	return oa_bo;
>+}
>+
>+static int xe_oa_emit_oa_config(struct xe_oa_stream *stream)
>+{
>+#define NOA_PROGRAM_ADDITIONAL_DELAY_US 500
>+	struct xe_oa_config_bo *oa_bo;
>+	int err, us = NOA_PROGRAM_ADDITIONAL_DELAY_US;
>+
>+	oa_bo = xe_oa_alloc_config_buffer(stream);
>+	if (IS_ERR(oa_bo)) {
>+		err = PTR_ERR(oa_bo);
>+		goto exit;
>+	}
>+
>+	err = xe_oa_submit_bb(stream, oa_bo->bb);
>+
>+	/* Additional empirical delay needed for NOA programming after registers are written */
>+	usleep_range(us, 2 * us);

Are we planning to signal a user fence or something similar to indicate
completion of the OA config programming? I haven't tracked that aspect much.

The rest is familiar and lgtm,

Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>

Umesh

