[PATCH 11/11] HAX: drm/xe/oa: Incomplete features and FIXME's

Ashutosh Dixit ashutosh.dixit at intel.com
Tue Aug 8 01:21:55 UTC 2023


This last HAX patch is added as an aid to reviewing the Xe OA patchset.

The following i915 features have not been included in the Xe OA patchset:
* Inline batch submission on stream exec_queue/hw_engine
* NOA wait
* GuC ctx id (guc_sw_ctx_id)
* CTX_R_PWR_CLK_STATE/GEN8_R_PWR_CLK_STATE
* hold_preemption (DRM_XE_OA_PROP_HOLD_PREEMPTION)
* sseu_config (DRM_XE_OA_PROP_GLOBAL_SSEU)
* Override gucrc (override_gucrc_mode)
* MTL bios_c6_setup
* ratelimits
* compat ioctl

This HAX patch contains:
a. Incomplete ports of the features listed above which are not included in
   the Xe OA patchset
b. FIXME highlighting significant changes between i915 and xe, and
c. FIXME containing author comments about implementation caveats

Therefore the FIXME's (and associated comments) in this HAX patch should
serve as a guide when reviewing the Xe OA patchset.

Signed-off-by: Ashutosh Dixit <ashutosh.dixit at intel.com>
---
 drivers/gpu/drm/xe/xe_oa.c       | 783 +++++++++++++++++++++++++++++--
 drivers/gpu/drm/xe/xe_oa.h       |   7 +
 drivers/gpu/drm/xe/xe_oa_types.h |  65 +++
 include/uapi/drm/xe_drm.h        |   2 +
 4 files changed, 828 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c
index f320e5cd76a13..3b43b91a79a65 100644
--- a/drivers/gpu/drm/xe/xe_oa.c
+++ b/drivers/gpu/drm/xe/xe_oa.c
@@ -3,6 +3,12 @@
  * Copyright © 2023 Intel Corporation
  */
 
+/*
+ * Current list of features missing in xe kmd:
+ * - get_default_sseu_config
+ * - xe_engine_set_nopreempt
+ */
+
 #include <linux/anon_inodes.h>
 #include <linux/nospec.h>
 #include <linux/sizes.h>
@@ -31,6 +37,8 @@
 #include "xe_sched_job.h"
 #include "xe_vm.h"
 
+#define __UNUSED__ __attribute__((unused))
+
 #define OA_BUFFER_SIZE		SZ_16M
 #define OA_TAKEN(tail, head)	(((tail) - (head)) & (OA_BUFFER_SIZE - 1))
 #define DEFAULT_POLL_FREQUENCY_HZ 200
@@ -59,15 +67,18 @@ static const struct xe_oa_format oa_formats[] = {
 };
 
 struct xe_oa_open_properties {
-	bool sample;
-	bool single_exec_q;
-	u64 exec_q_id;
+	bool sample; // FIXME: previously sample_flags, changed to bool
+	bool single_exec_q; // FIXME: single_context
+	u64 exec_q_id; // FIXME: ctx_handle
+	bool hold_preemption;
 
 	int metrics_set;
 	int oa_format;
 	bool oa_periodic;
 	int oa_period_exponent;
 
+	// struct intel_sseu sseu; // FIXME: support in xe kmd?
+
 	struct xe_hw_engine *hwe;
 
 	u64 poll_oa_period;
@@ -77,7 +88,7 @@ struct xe_oa_config_bo {
 	struct llist_node node;
 
 	struct xe_oa_config *oa_config;
-	struct xe_bb *bb;
+	struct xe_bb *bb; // FIXME: check
 };
 
 static struct ctl_table_header *sysctl_header;
@@ -250,7 +261,8 @@ static bool xe_oa_buffer_check_unlocked(struct xe_oa_stream *stream)
 		tail = OA_TAKEN(tail, report_size);
 	}
 
-	if (OA_TAKEN(hw_tail, tail) > report_size)
+	if (OA_TAKEN(hw_tail, tail) > report_size &&
+	    __ratelimit(&stream->oa->tail_pointer_race))
 		drm_dbg(&stream->oa->xe->drm,
 			"unlanded report(s) head=0x%x tail=0x%x hw_tail=0x%x\n",
 			stream->oa_buffer.head, tail, hw_tail);
@@ -419,14 +431,15 @@ static int xe_oa_append_reports(struct xe_oa_stream *stream, char __user *buf,
 		 * since it's not-uncommon for periodic samples to identify a switch
 		 * before any 'context switch' report.
 		 */
-		if (!stream->exec_q || stream->specific_ctx_id == ctx_id ||
+		if (!stream->exec_q || // FIXME: check
+		    stream->specific_ctx_id == ctx_id ||
 		    stream->oa_buffer.last_ctx_id == stream->specific_ctx_id ||
 		    reason & OAREPORT_REASON_CTX_SWITCH) {
 			/*
 			 * While filtering for a single context we avoid
 			 * leaking the IDs of other contexts.
 			 */
-			if (stream->exec_q && stream->specific_ctx_id != ctx_id)
+			if (stream->exec_q && stream->specific_ctx_id != ctx_id) // FIXME: check
 				oa_context_id_squash(stream, report32);
 
 			ret = xe_oa_append_sample(stream, buf, count, offset, report);
@@ -694,6 +707,94 @@ static __poll_t xe_oa_poll(struct file *file, poll_table *wait)
 	return ret;
 }
 
+#if 0
+// If this is needed need to look into further
+static int xe_oa_submit_bb(struct xe_oa_stream *stream, struct xe_bb *bb)
+{
+	struct xe_hw_engine *hwe = stream->hwe;
+	struct xe_engine *e;
+	struct xe_sched_job *job;
+	struct dma_fence *fence;
+	struct xe_vm *vm;
+	u64 batch_ofs;
+	long timeout;
+	int err = 0;
+
+	if (stream->engine) {
+		/*
+		 * FIXME: can we send kernel bb in e->vm context? Seems to be
+		 * causing big problems (cat err) which need to be investigated
+		*/
+		e = stream->engine;
+		XE_BUG_ON(!e->vm);
+		err = dma_resv_lock_interruptible(&e->vm->resv, NULL);
+		if (err)
+			goto exit;
+		down_write(&e->vm->lock);
+		job = xe_bb_create_job(e, bb);
+		if (IS_ERR(job)) {
+			err = PTR_ERR(job);
+			goto vm_unlock;
+		}
+	} else {
+		vm = xe_migrate_get_vm(stream->gt->tile->migrate);
+		e = xe_engine_create(hwe->gt->tile->xe, vm, BIT(hwe->logical_instance), 1,
+				     hwe, ENGINE_FLAG_WA);
+		if (IS_ERR(e)) {
+			err = PTR_ERR(e);
+			drm_err(&stream->oa->xe->drm, "gt%d, hwe %s, xe_engine_create,e failed=%d",
+				stream->gt->info.id, hwe->name, err);
+			goto put_vm;
+		}
+
+		batch_ofs = xe_bo_ggtt_addr(stream->gt->tile->mem.kernel_bb_pool->bo);
+		/* Will add MI_BATCH_BUFFER_END */
+		job = xe_bb_create_wa_job(e, bb, batch_ofs);
+		if (IS_ERR(job)) {
+			err = PTR_ERR(job);
+			goto put_engine;
+		}
+	}
+
+	xe_sched_job_arm(job);
+	fence = dma_fence_get(&job->drm.s_fence->finished);
+	xe_sched_job_push(job);
+
+	timeout = dma_fence_wait_timeout(fence, false, HZ);
+	dma_fence_put(fence);
+	if (timeout < 0)
+		err = timeout;
+	else if (!timeout)
+		err = -ETIME;
+put_engine:
+	if (!stream->engine)
+		xe_engine_put(e);
+put_vm:
+	if (!stream->engine)
+		xe_vm_put(vm);
+vm_unlock:
+	if (stream->engine) {
+		dma_resv_unlock(&e->vm->resv);
+		up_write(&e->vm->lock);
+	}
+exit:
+	return err;
+}
+#endif
+
+/*
+  FIXME: Currently submits only to stream->engine or new engine for
+  stream->hwe. If needed, add 'struct xe_engine *' argument
+
+  For now unconditionally create engine otherwise we hit BUG_ON in
+  xe_bb_create_wa_job. If jobs need to be sent to the same engine for
+  serialization may need to replace xe_bb_create_wa_job with a similar
+  function.
+
+  Also the code is wrong for xe_oa_guc_sw_ctx_id because there we need to
+  submit against the real engine/context rather than the new engine created
+  below.
+*/
 static int xe_oa_submit_bb(struct xe_oa_stream *stream, struct xe_bb *bb)
 {
 	struct xe_hw_engine *hwe = stream->hwe;
@@ -706,14 +807,16 @@ static int xe_oa_submit_bb(struct xe_oa_stream *stream, struct xe_bb *bb)
 	int err = 0;
 
 	vm = xe_migrate_get_vm(stream->gt->tile->migrate);
-	q = xe_exec_queue_create(hwe->gt->tile->xe, vm, BIT(hwe->logical_instance), 1,
-				 hwe, EXEC_QUEUE_FLAG_WA);
-	if (IS_ERR(q)) {
-		err = PTR_ERR(q);
-		drm_err(&stream->oa->xe->drm, "gt%d, hwe %s, xe_exec_queue_create failed=%d",
-			stream->gt->info.id, hwe->name, err);
-		goto put_vm;
-	}
+	// if (!stream->exec_q) {
+		q = xe_exec_queue_create(hwe->gt->tile->xe, vm, BIT(hwe->logical_instance), 1,
+					 hwe, EXEC_QUEUE_FLAG_WA);
+		if (IS_ERR(q)) {
+			err = PTR_ERR(q);
+			drm_err(&stream->oa->xe->drm, "gt%d, hwe %s, xe_exec_queue_create failed=%d",
+				stream->gt->info.id, hwe->name, err);
+			goto put_vm;
+		}
+	// }
 
 	batch_ofs = xe_bo_ggtt_addr(stream->gt->tile->mem.kernel_bb_pool->bo);
 	/* Will add MI_BATCH_BUFFER_END */
@@ -734,7 +837,8 @@ static int xe_oa_submit_bb(struct xe_oa_stream *stream, struct xe_bb *bb)
 	else if (!timeout)
 		err = -ETIME;
 put_exec_q:
-	xe_exec_queue_put(q);
+	// if (!stream->exec_q)
+		xe_exec_queue_put(q);
 put_vm:
 	xe_vm_put(vm);
 
@@ -750,11 +854,17 @@ static void xe_oa_free_configs(struct xe_oa_stream *stream)
 {
 	struct xe_oa_config_bo *oa_bo, *tmp;
 
+	// FIXME: check functions below
 	xe_oa_config_put(stream->oa_config);
 	llist_for_each_entry_safe(oa_bo, tmp, stream->oa_config_bos.first, node)
 		free_oa_config_bo(oa_bo);
 }
 
+static void xe_oa_free_noa_wait(struct xe_oa_stream *stream)
+{
+	xe_bo_unpin_map_no_vm(stream->noa_wait);
+}
+
 static void xe_oa_store_flex(struct xe_oa_stream *stream, struct xe_lrc *lrc,
 			     struct xe_bb *bb, const struct flex *flex, u32 count)
 {
@@ -825,6 +935,90 @@ static int xe_oa_modify_self(struct xe_oa_stream *stream,
 	return err;
 }
 
+static int xe_oa_configure_context(struct xe_oa_stream *stream,
+				   struct xe_exec_queue *q,
+				   struct flex *flex, u32 count)
+{
+	int i, err = 0;
+
+	for (i = 0; i < q->width; i++) {
+		// flex->value = intel_sseu_make_rpcs(ce->engine->gt, &ce->sseu); // FIXME
+		err = xe_oa_modify_context(stream, &q->lrc[i], flex, count);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+static int __xe_oa_configure_all_contexts(struct xe_oa_stream *stream,
+					  struct flex *regs,
+					  size_t num_regs, bool enable)
+{
+	struct xe_file *xef = stream->xef;
+	struct xe_hw_engine *hwe;
+	enum xe_hw_engine_id id;
+	struct xe_exec_queue *q;
+	unsigned long idx;
+	int err = 0;
+
+	// FIXME: below crashes during close, need to check xef mutex
+	return 0;
+
+	// FIXME: we can't use xef to find all engines since there may be multiple such files
+	mutex_lock(&xef->exec_queue.lock);
+	xa_for_each(&xef->exec_queue.xa, idx, q) {
+		xe_exec_queue_get(q);
+		err = xe_oa_configure_context(stream, q, regs, num_regs);
+		xe_exec_queue_put(q);
+		if (err)
+			break; /* leave via the unlock below, never with the lock held */
+	}
+	mutex_unlock(&xef->exec_queue.lock);
+
+	/*
+	 * After updating all other contexts, we need to modify ourselves.  If
+	 * we don't modify the kernel_context, we do not get events while idle.
+	 */
+	for_each_hw_engine(hwe, stream->gt, id) {
+		/*
+		 * FIXME: at present there is no way to create an engine using
+		 * hwe->kernel_lrc. Also in xe we don't use kernel_lrc when idle,
+		 * though we would need a 'context' restored to get events when idle
+		 * to make sure registers are programmed correctly.
+		 */
+	}
+
+	return err; /* first xe_oa_configure_context() failure, or 0 */
+}
+
+static __UNUSED__ int
+lrc_configure_all_contexts(struct xe_oa_stream *stream,
+			   const struct xe_oa_config *oa_config)
+{
+	return 0; // FIXME: not used for gen12+
+}
+
+static int xe_oa_configure_all_contexts(struct xe_oa_stream *stream, bool enable)
+{
+#define GEN8_R_PWR_CLK_STATE(base)	XE_REG((base) + 0xc8)
+#define CTX_R_PWR_CLK_STATE		(0x42 + 1)
+
+	struct flex regs[] = {
+		{
+			GEN8_R_PWR_CLK_STATE(RENDER_RING_BASE),
+			CTX_R_PWR_CLK_STATE,
+		},
+	};
+
+	if (stream->hwe->class != XE_ENGINE_CLASS_RENDER)
+		return 0;
+
+	// FIXME: what should this do when enable == false?
+
+	return __xe_oa_configure_all_contexts(stream, regs, ARRAY_SIZE(regs), enable);
+}
+
 static int xe_oa_configure_oar_context(struct xe_oa_stream *stream, bool enable)
 {
 	int err;
@@ -857,7 +1051,10 @@ static int xe_oa_configure_oar_context(struct xe_oa_stream *stream, bool enable)
 		},
 	};
 
-	/* Modify stream hwe context image with regs_context */
+	/* Modify stream hwe context image with regs_context
+	 * FIXME: for now only modifying engine->lrc[0], but maybe this should
+	 * be changed to modify all lrc's underlying the engine?
+	 */
 	err = xe_oa_modify_context(stream, &stream->exec_q->lrc[0],
 				   regs_context, ARRAY_SIZE(regs_context));
 	if (err)
@@ -884,6 +1081,9 @@ static void xe_oa_disable_metric_set(struct xe_oa_stream *stream)
 				_MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING));
 	}
 
+	/* Reset all contexts' slices/subslices configurations. */
+	xe_oa_configure_all_contexts(stream, false);
+
 	/* disable the context save/restore or OAR counters */
 	if (stream->exec_q)
 		xe_oa_configure_oar_context(stream, false);
@@ -898,10 +1098,21 @@ static void xe_oa_disable_metric_set(struct xe_oa_stream *stream)
 	xe_mmio_rmw32(stream->gt, GEN12_SQCNT1, sqcnt1, 0);
 }
 
+static int intel_guc_slpc_override_gucrc_mode(struct xe_gt *gt, u32 mode)
+{
+	return 0; // FIXME
+}
+
+static int intel_guc_slpc_unset_gucrc_mode(struct xe_gt *gt)
+{
+	return 0; // FIXME
+}
+
 static void xe_oa_stream_destroy(struct xe_oa_stream *stream)
 {
 	struct xe_oa_group *g = stream->hwe->oa_group;
 	struct xe_gt *gt = stream->hwe->gt;
+	struct xe_oa *oa = stream->oa;
 
 	if (WARN_ON(stream != g->exclusive_stream))
 		return;
@@ -912,10 +1123,21 @@ static void xe_oa_stream_destroy(struct xe_oa_stream *stream)
 
 	xe_oa_free_oa_buffer(stream);
 
+	/* Wa_16011777198:dg2: Unset the override of GUCRC mode to enable rc6 */
+	if (stream->override_gucrc)
+		drm_WARN_ON(&stream->oa->xe->drm, intel_guc_slpc_unset_gucrc_mode(gt));
+
 	XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL));
 	xe_device_mem_access_put(stream->oa->xe);
 
 	xe_oa_free_configs(stream);
+	xe_oa_free_noa_wait(stream);
+
+	if (oa->spurious_report_rs.missed) {
+		drm_notice(&stream->oa->xe->drm,
+			   "%d spurious OA report notices suppressed due to ratelimiting\n",
+			   oa->spurious_report_rs.missed);
+	}
 }
 
 static int xe_oa_alloc_oa_buffer(struct xe_oa_stream *stream)
@@ -937,6 +1159,197 @@ static int xe_oa_alloc_oa_buffer(struct xe_oa_stream *stream)
 	return 0;
 }
 
+static u32 *save_restore_register(struct xe_oa_stream *stream, u32 *cs,
+				  bool save, struct xe_reg reg, u32 offset,
+				  u32 dword_count)
+{
+	u32 cmd;
+	u32 d;
+
+	cmd = save ? MI_STORE_REGISTER_MEM : MI_LOAD_REGISTER_MEM;
+	cmd |= MI_SRM_LRM_GLOBAL_GTT;
+	cmd++;
+
+	for (d = 0; d < dword_count; d++) {
+		*cs++ = cmd;
+		*cs++ = reg.addr + 4 * d;
+		*cs++ = xe_bo_ggtt_addr(stream->noa_wait) + offset + 4 * d;
+		*cs++ = 0;
+	}
+
+	return cs;
+}
+
+static u64 xe_oa_ns_to_clock_interval(const struct xe_gt *gt, u64 ns)
+{
+	return DIV64_U64_ROUND_UP(gt->info.clock_freq * ns, NSEC_PER_SEC);
+}
+
+static int xe_oa_alloc_noa_wait(struct xe_oa_stream *stream)
+{
+	struct xe_bo *bo;
+	const u64 delay_ticks = 0xffffffffffffffff -
+		xe_oa_ns_to_clock_interval(stream->gt,
+			      atomic64_read(&stream->oa->noa_programming_delay));
+	const u32 base = stream->hwe->mmio_base;
+#define HAS_MI_SET_PREDICATE(xe) (GRAPHICS_VERx100(xe) >= 1270)
+#define CS_GPR(x) GEN8_RING_CS_GPR(base, x)
+	u32 *batch, *ts0, *cs, *jump;
+	int i;
+	enum {
+		START_TS,
+		NOW_TS,
+		DELTA_TS,
+		JUMP_PREDICATE,
+		DELTA_TARGET,
+		N_CS_GPR
+	};
+	struct xe_reg mi_predicate_result = HAS_MI_SET_PREDICATE(stream->gt->tile->xe) ?
+					MI_PREDICATE_RESULT_2(base) :
+					MI_PREDICATE_RESULT_1(RENDER_RING_BASE);
+
+	bo = xe_bo_create_pin_map(stream->gt->tile->xe, stream->gt->tile, NULL,
+				  8192, ttm_bo_type_kernel,
+				  // XE_BO_CREATE_VRAM_IF_DGFX(gt) |
+				  XE_BO_CREATE_SYSTEM_BIT | // FIXME: check
+				  XE_BO_CREATE_GGTT_BIT);
+	if (IS_ERR(bo))
+		return PTR_ERR(bo);
+
+	batch = cs = bo->vmap.is_iomem ? bo->vmap.vaddr_iomem : bo->vmap.vaddr;
+	stream->noa_wait = bo;
+
+#define GPR_SAVE_OFFSET 4096
+#define PREDICATE_SAVE_OFFSET 4160
+
+	/* Save registers. */
+	for (i = 0; i < N_CS_GPR; i++)
+		cs = save_restore_register(
+			stream, cs, true /* save */, CS_GPR(i),
+			GPR_SAVE_OFFSET + 8 * i, 2);
+	cs = save_restore_register(
+		stream, cs, true /* save */, mi_predicate_result,
+		PREDICATE_SAVE_OFFSET, 1);
+
+	/* First timestamp snapshot location. */
+	ts0 = cs;
+
+	/*
+	 * Initial snapshot of the timestamp register to implement the wait.
+	 * We work with 32b values, so clear out the top 32 bits of the
+	 * register because the ALU works on 64 bits.
+	 */
+	*cs++ = MI_LOAD_REGISTER_IMM(1);
+	*cs++ = CS_GPR(START_TS).addr + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+	*cs++ = RING_TIMESTAMP(base).addr;
+	*cs++ = CS_GPR(START_TS).addr;
+
+	/*
+	 * This is the location we're going to jump back into until the
+	 * required amount of time has passed.
+	 */
+	jump = cs;
+
+	/*
+	 * Take another snapshot of the timestamp register. Take care to clear
+	 * up the top 32bits of CS_GPR(1) as we're using it for other
+	 * operations below.
+	 */
+	*cs++ = MI_LOAD_REGISTER_IMM(1);
+	*cs++ = CS_GPR(NOW_TS).addr + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+	*cs++ = RING_TIMESTAMP(base).addr;
+	*cs++ = CS_GPR(NOW_TS).addr;
+
+	/*
+	 * Do a diff between the 2 timestamps and store the result in
+	 * CS_GPR(DELTA_TS).
+	 */
+	*cs++ = MI_MATH(5);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(DELTA_TS), MI_MATH_REG_ACCU);
+	*cs++ = MI_MATH_STORE(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);
+
+	/*
+	 * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the
+	 * timestamp has rolled over the 32bits) into the predicate register
+	 * to be used for the predicated jump.
+	 */
+	*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+	*cs++ = CS_GPR(JUMP_PREDICATE).addr;
+	*cs++ = mi_predicate_result.addr;
+
+	if (HAS_MI_SET_PREDICATE(stream->gt->tile->xe))
+		*cs++ = MI_SET_PREDICATE | 1;
+
+	/* Restart from the beginning if we had timestamps roll over. */
+	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_PREDICATE;
+	// *cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4; // FIXME
+	*cs++ = 0;
+
+	if (HAS_MI_SET_PREDICATE(stream->gt->tile->xe))
+		*cs++ = MI_SET_PREDICATE;
+
+	/*
+	 * Now take the diff between the two previous timestamps and add it to:
+	 *      (((1 << 64) - 1) - delay, in timestamp clock ticks)
+	 *
+	 * When the Carry Flag contains 1 this means the elapsed time is
+	 * longer than the expected delay, and we can exit the wait loop.
+	 */
+	*cs++ = MI_LOAD_REGISTER_IMM(2);
+	*cs++ = CS_GPR(DELTA_TARGET).addr;
+	*cs++ = lower_32_bits(delay_ticks);
+	*cs++ = CS_GPR(DELTA_TARGET).addr + 4;
+	*cs++ = upper_32_bits(delay_ticks);
+
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(DELTA_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(DELTA_TARGET));
+	*cs++ = MI_MATH_ADD;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);
+
+	*cs++ = MI_ARB_CHECK;
+
+	/*
+	 * Transfer the result into the predicate register to be used for the
+	 * predicated jump.
+	 */
+	*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+	*cs++ = CS_GPR(JUMP_PREDICATE).addr;
+	*cs++ = mi_predicate_result.addr;
+
+	if (HAS_MI_SET_PREDICATE(stream->gt->tile->xe))
+		*cs++ = MI_SET_PREDICATE | 1;
+
+	/* Predicate the jump.  */
+	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_PREDICATE;
+	// *cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4; // FIXME
+	*cs++ = 0;
+
+	if (HAS_MI_SET_PREDICATE(stream->gt->tile->xe))
+		*cs++ = MI_SET_PREDICATE;
+
+	/* Restore registers. */
+	for (i = 0; i < N_CS_GPR; i++)
+		cs = save_restore_register(
+			stream, cs, false /* restore */, CS_GPR(i),
+			GPR_SAVE_OFFSET + 8 * i, 2);
+	cs = save_restore_register(
+		stream, cs, false /* restore */, mi_predicate_result,
+		PREDICATE_SAVE_OFFSET, 1);
+
+	/* And return to the ring. */
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	return 0; /* nothing after the BO allocation can fail */
+}
+
 static void write_cs_mi_lri(struct xe_bb *bb, const struct xe_oa_reg *reg_data, u32 n_regs)
 {
 	u32 i;
@@ -981,7 +1394,11 @@ __xe_oa_alloc_config_buffer(struct xe_oa_stream *stream, struct xe_oa_config *oa
 	config_length += num_lri_dwords(oa_config->mux_regs_len);
 	config_length += num_lri_dwords(oa_config->b_counter_regs_len);
 	config_length += num_lri_dwords(oa_config->flex_regs_len);
+#if 1 // FIXME: noa_wait (see 93937659dc64)
 	config_length++; /* MI_BATCH_BUFFER_END */
+#else
+	config_length += 4; /* MI_BATCH_BUFFER_START */
+#endif
 	config_length = ALIGN(sizeof(u32) * config_length, XE_PAGE_SIZE) / sizeof(u32);
 
 	bb = xe_bb_new(stream->gt, config_length, false);
@@ -992,6 +1409,15 @@ __xe_oa_alloc_config_buffer(struct xe_oa_stream *stream, struct xe_oa_config *oa
 	write_cs_mi_lri(bb, oa_config->b_counter_regs, oa_config->b_counter_regs_len);
 	write_cs_mi_lri(bb, oa_config->flex_regs, oa_config->flex_regs_len);
 
+#if 0 // FIXME: noa_wait (see 93937659dc64)
+	// xe_bb_create_job adds MI_BATCH_BUFFER_END
+	// TBD: how to handle noa_wait in xe_bb_create_job
+
+	/* Jump into the active wait. */
+	bb->cs[bb->len++] = MI_BATCH_BUFFER_START;
+	bb->cs[bb->len++] = xe_bo_ggtt_addr(stream->noa_wait);
+	bb->cs[bb->len++] = 0;
+#endif
 	oa_bo->bb = bb;
 	oa_bo->oa_config = xe_oa_config_get(oa_config);
 	llist_add(&oa_bo->node, &stream->oa_config_bos);
@@ -1020,6 +1446,7 @@ static struct xe_oa_config_bo *xe_oa_alloc_config_buffer(struct xe_oa_stream *st
 	return oa_bo;
 }
 
+// FIXME: check entire function and called functions
 static int xe_oa_emit_oa_config(struct xe_oa_stream *stream)
 {
 	struct xe_oa_config_bo *oa_bo;
@@ -1036,6 +1463,32 @@ static int xe_oa_emit_oa_config(struct xe_oa_stream *stream)
 	return err;
 }
 
+static __UNUSED__ void oa_context(struct xe_oa_stream *stream) {}
+
+static __UNUSED__ u32 oa_config_flex_reg(const struct xe_oa_config *oa_config,
+					 struct xe_reg reg)
+{
+	u32 mmio = reg.addr;
+	int i;
+
+	/*
+	 * This arbitrary default will select the 'EU FPU0 Pipeline
+	 * Active' event. In the future it's anticipated that there
+	 * will be an explicit 'No Event' we can select, but not yet...
+	 */
+	if (!oa_config)
+		return 0;
+
+	for (i = 0; i < oa_config->flex_regs_len; i++) {
+		if (oa_config->flex_regs[i].addr.addr == mmio)
+			return oa_config->flex_regs[i].value;
+	}
+
+	return 0;
+}
+
+static __UNUSED__ void gen8_update_reg_state_unlocked(const struct xe_oa_stream *stream) {}
+
 static u32 oag_report_ctx_switches(const struct xe_oa_stream *stream)
 {
 	return _MASKED_FIELD(GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS,
@@ -1054,7 +1507,7 @@ static int xe_oa_enable_metric_set(struct xe_oa_stream *stream)
 	 * Disable thread stall DOP gating and EU DOP gating.
 	 */
 	if (stream->gt->tile->xe->info.platform == XE_DG2) {
-		xe_gt_mcr_multicast_write(stream->gt, GEN8_ROW_CHICKEN,
+		xe_gt_mcr_multicast_write(stream->gt, GEN8_ROW_CHICKEN,  // FIXME: check
 					  _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
 		xe_mmio_write32(stream->gt, GEN7_ROW_CHICKEN2,
 				_MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING));
@@ -1086,6 +1539,15 @@ static int xe_oa_enable_metric_set(struct xe_oa_stream *stream)
 
 	xe_mmio_rmw32(stream->gt, GEN12_SQCNT1, 0, sqcnt1);
 
+	/* FIXME: do this later if needed
+	 *
+	 * Update all contexts prior writing the mux configurations as we need to make
+	 * sure all slices/subslices are ON before writing to NOA registers.
+	 */
+	ret = xe_oa_configure_all_contexts(stream, true);
+	if (ret)
+		return ret;
+
 	/*
 	 * For Gen12, performance counters are context saved/restored. Only enable it
 	 * for the context that requested this.
@@ -1099,6 +1561,19 @@ static int xe_oa_enable_metric_set(struct xe_oa_stream *stream)
 	return xe_oa_emit_oa_config(stream);
 }
 
+static __UNUSED__ void get_default_sseu_config(void) {}
+static __UNUSED__ void get_sseu_config(void) {}
+
+static void xe_engine_set_nopreempt(struct xe_exec_queue *q)
+{
+	// FIXME
+}
+
+static void xe_engine_clear_nopreempt(struct xe_exec_queue *q)
+{
+	// FIXME
+}
+
 static void xe_oa_stream_enable(struct xe_oa_stream *stream)
 {
 	stream->pollin = false;
@@ -1127,6 +1602,9 @@ static void xe_oa_enable_locked(struct xe_oa_stream *stream)
 	stream->enabled = true;
 
 	xe_oa_stream_enable(stream);
+
+	if (stream->hold_preemption)
+		xe_engine_set_nopreempt(stream->exec_q);
 }
 
 static void xe_oa_disable_locked(struct xe_oa_stream *stream)
@@ -1136,6 +1614,9 @@ static void xe_oa_disable_locked(struct xe_oa_stream *stream)
 
 	stream->enabled = false;
 
+	if (stream->hold_preemption)
+		xe_engine_clear_nopreempt(stream->exec_q);
+
 	xe_oa_stream_disable(stream);
 }
 
@@ -1152,6 +1633,7 @@ static long xe_oa_config_locked(struct xe_oa_stream *stream,
 	if (config != stream->oa_config) {
 		int err;
 
+		// FIXME: check: does the config have to be emitted on the stream engine?
 		/*
 		 * If OA is bound to a specific engine, emit the reconfiguration
 		 * inline from that engine. The update will then be ordered with
@@ -1209,7 +1691,7 @@ static void xe_oa_destroy_locked(struct xe_oa_stream *stream)
 	xe_oa_stream_destroy(stream);
 
 	if (stream->exec_q)
-		xe_exec_queue_put(stream->exec_q);
+		xe_exec_queue_put(stream->exec_q); // FIXME: check
 
 	kfree(stream);
 }
@@ -1240,6 +1722,8 @@ static const struct file_operations xe_oa_fops = {
 	.poll		= xe_oa_poll,
 	.read		= xe_oa_read,
 	.unlocked_ioctl	= xe_oa_ioctl,
+	// FIXME: check .compat_ioctl later, maybe skip for now
+	.compat_ioctl   = xe_oa_ioctl,
 };
 
 static bool engine_supports_mi_query(struct xe_hw_engine *hwe)
@@ -1247,6 +1731,11 @@ static bool engine_supports_mi_query(struct xe_hw_engine *hwe)
 	return hwe->class == XE_ENGINE_CLASS_RENDER;
 }
 
+static void xe_oa_pin_context(struct xe_oa_stream *stream)
+{
+	// contexts are already pinned for now, there's no unpin
+}
+
 #define MI_LRI_LEN(x) (((x) & 0xff) + 1)
 
 static bool xe_oa_find_reg_in_lri(u32 *state, u32 reg, u32 *offset, u32 end)
@@ -1269,10 +1758,17 @@ static bool xe_oa_find_reg_in_lri(u32 *state, u32 reg, u32 *offset, u32 end)
 
 static u32 xe_oa_context_image_offset(struct xe_oa_stream *stream, u32 reg)
 {
+	// FIXME: check len and state assignments below
 	u32 len = (xe_lrc_size(stream->gt->tile->xe, stream->hwe->class) - PAGE_SIZE) / 4;
 	u32 *state = stream->gt->default_lrc[stream->hwe->class];
 	u32 offset;
 
+	/*
+	 * FIXME: maybe ok but really __xe_lrc_regs_offset should be added to
+	 * state. The same offset should be used in xe_oa_configure_oar_context
+	 * where ctx_oactxctrl_offset is consumed. Also instead of default_lrc
+	 * we could use stream->engine->lrc or stream->hwe->kernel_lrc
+	 */
 	if (drm_WARN_ON(&stream->oa->xe->drm, !state))
 		return U32_MAX;
 
@@ -1313,6 +1809,126 @@ static int xe_oa_set_ctx_ctrl_offset(struct xe_oa_stream *stream)
 	return offset && offset != U32_MAX ? 0 : -ENODEV;
 }
 
+static void __store_reg_to_mem(struct xe_bb *bb, struct xe_reg reg, u32 ggtt_offset)
+{
+	u32 cmd;
+
+	cmd = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
+	cmd++;
+
+	bb->cs[bb->len++] = cmd;
+	bb->cs[bb->len++] = reg.addr;
+	bb->cs[bb->len++] = ggtt_offset;
+	bb->cs[bb->len++] = 0;
+}
+
+static int __read_reg(struct xe_oa_stream *stream, struct xe_reg reg, u32 ggtt_offset)
+{
+	struct xe_bb *bb;
+	int err = 0;
+
+	bb = xe_bb_new(stream->gt, 4 + 1, false);
+	if (IS_ERR(bb)) {
+		err = PTR_ERR(bb);
+		goto exit;
+	}
+
+	__store_reg_to_mem(bb, reg, ggtt_offset);
+
+	err = xe_oa_submit_bb(stream, bb);
+	xe_bb_free(bb, NULL);
+exit:
+	return err;
+}
+
+static int xe_oa_guc_sw_ctx_id(struct xe_oa_stream *stream, u32 *ctx_id)
+{
+	struct xe_bo *bo;
+	u32 *ptr;
+	int err = 0;
+
+	bo = xe_bo_create_pin_map(stream->gt->tile->xe, stream->gt->tile, NULL,
+				  4096, ttm_bo_type_kernel,
+				  // XE_BO_CREATE_VRAM_IF_DGFX(gt) |
+				  XE_BO_CREATE_SYSTEM_BIT | // FIXME: check
+				  XE_BO_CREATE_GGTT_BIT);
+	if (IS_ERR(bo)) {
+		err = PTR_ERR(bo);
+		goto exit;
+	}
+
+	err = __read_reg(stream, RING_EXECLIST_STATUS_HI(stream->hwe->mmio_base),
+			 xe_bo_ggtt_addr(bo));
+	if (err)
+		goto unpin;
+
+	ptr = bo->vmap.is_iomem ? bo->vmap.vaddr_iomem : bo->vmap.vaddr;
+
+	*ctx_id = *ptr;
+unpin:
+	xe_bo_unpin_map_no_vm(bo);
+exit:
+	return err;
+}
+
+static int __xe_oa_get_render_context_id(struct xe_oa_stream *stream)
+{
+	u32 ctx_id, mask;
+	int ret;
+
+// FIXME: only retain the GuC case here, we only support GuC
+
+#define GEN12_GUC_SW_CTX_ID_MASK		GENMASK(22, 7)
+
+#define GEN11_SW_CTX_ID_SHIFT			37
+#define GEN11_SW_CTX_ID_WIDTH			11
+#define XEHP_SW_CTX_ID_SHIFT			39
+#define XEHP_SW_CTX_ID_WIDTH			16
+#define XEHP_SW_COUNTER_SHIFT			58
+#define XEHP_SW_COUNTER_WIDTH			6
+#define MAX_CONTEXT_HW_ID	(1 << 21) /* exclusive */
+#define GEN11_MAX_CONTEXT_HW_ID	(1 << 11) /* exclusive */
+/* in Gen12 ID 0x7FF is reserved to indicate idle */
+#define GEN12_MAX_CONTEXT_HW_ID	(GEN11_MAX_CONTEXT_HW_ID - 1)
+/* in Xe_HP ID 0xFFFF is reserved to indicate "invalid context" */
+#define XEHP_MAX_CONTEXT_HW_ID	0xFFFF
+
+	if (xe_device_guc_submission_enabled(stream->gt->tile->xe)) {
+		ret = xe_oa_guc_sw_ctx_id(stream, &ctx_id);
+		if (ret)
+			return ret;
+
+		mask = GEN12_GUC_SW_CTX_ID_MASK;
+	} else if (GRAPHICS_VERx100(stream->gt->tile->xe) >= 1250) {
+		ctx_id = (XEHP_MAX_CONTEXT_HW_ID - 1) <<
+			(XEHP_SW_CTX_ID_SHIFT - 32);
+
+		mask = ((1U << XEHP_SW_CTX_ID_WIDTH) - 1) <<
+			(XEHP_SW_CTX_ID_SHIFT - 32);
+	} else {
+		ctx_id = (GEN12_MAX_CONTEXT_HW_ID - 1) <<
+			 (GEN11_SW_CTX_ID_SHIFT - 32);
+
+		mask = ((1U << GEN11_SW_CTX_ID_WIDTH) - 1) <<
+			(GEN11_SW_CTX_ID_SHIFT - 32);
+	}
+	stream->specific_ctx_id = ctx_id & mask;
+	stream->specific_ctx_id_mask = mask;
+
+	return 0;
+}
+
+static int xe_oa_get_render_ctx_id(struct xe_oa_stream *stream)
+{
+	int ret = __xe_oa_get_render_context_id(stream);
+
+	drm_dbg(&stream->oa->xe->drm,
+		"filtering on ctx_id=0x%x ctx_id_mask=0x%x\n",
+		stream->specific_ctx_id, stream->specific_ctx_id_mask);
+
+	return ret;
+}
+
 static int xe_oa_stream_init(struct xe_oa_stream *stream,
 			     struct xe_oa_open_properties *props)
 {
@@ -1329,25 +1945,43 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream,
 
 	stream->sample = props->sample;
 	stream->sample_size += stream->oa_buffer.format->size;
+	stream->hold_preemption = props->hold_preemption;
 	stream->periodic = props->oa_periodic;
 	stream->period_exponent = props->oa_period_exponent;
 
-	if (stream->exec_q && engine_supports_mi_query(stream->hwe)) {
-		/* If we don't find the context offset, just return error */
-		ret = xe_oa_set_ctx_ctrl_offset(stream);
+	if (stream->exec_q) {
+		xe_oa_pin_context(stream); // FIXME: empty function, no unpin
+
+		if (engine_supports_mi_query(stream->hwe)) {
+			/* If we don't find the context offset, just return error */
+			ret = xe_oa_set_ctx_ctrl_offset(stream);
+			if (ret) {
+				drm_err(&stream->gt->tile->xe->drm,
+					"xe_oa_set_ctx_ctrl_offset failed for %s\n",
+					stream->hwe->name);
+				goto exit;
+			}
+		}
+
+		 // FIXME: do later, also put_render_ctx_id not needed, deleted, check
+		ret = xe_oa_get_render_ctx_id(stream);
 		if (ret) {
-			drm_err(&stream->gt->tile->xe->drm,
-				"xe_oa_set_ctx_ctrl_offset failed for %s\n",
-				stream->hwe->name);
+			drm_dbg(&oa->xe->drm, "Invalid context id to filter with\n");
 			goto exit;
 		}
 	}
 
+	ret = xe_oa_alloc_noa_wait(stream); // FIXME: do later
+	if (ret) {
+		drm_dbg(&oa->xe->drm, "Unable to allocate NOA wait batch buffer\n");
+		goto exit;
+	}
+
 	stream->oa_config = xe_oa_get_oa_config(oa, props->metrics_set);
 	if (!stream->oa_config) {
 		drm_dbg(&oa->xe->drm, "Invalid OA config id=%i\n", props->metrics_set);
 		ret = -EINVAL;
-		goto exit;
+		goto err_free_noa_wait;
 	}
 
 	ret = xe_oa_alloc_oa_buffer(stream);
@@ -1358,10 +1992,30 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream,
 	xe_device_mem_access_get(stream->oa->xe);
 	XE_WARN_ON(xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL));
 
+	/* FIXME: Do later if needed (DG2 not POR for xe)
+	 *
+	 * Wa_16011777198:dg2: GuC resets render as part of the Wa. This causes
+	 * OA to lose the configuration state. Prevent this by overriding GUCRC
+	 * mode.
+	 */
+	if (xe_device_guc_submission_enabled(oa->xe) &&
+	    (IS_SUBPLATFORM_STEP(oa->xe, XE_DG2, XE_SUBPLATFORM_DG2_G10, STEP_A0, STEP_C0) ||
+	     IS_SUBPLATFORM_STEP(oa->xe, XE_DG2, XE_SUBPLATFORM_DG2_G11, STEP_A0, STEP_B0))) {
+		ret = intel_guc_slpc_override_gucrc_mode(gt, 0); // FIXME
+		if (ret) {
+			drm_dbg(&oa->xe->drm, "Unable to override gucrc mode\n");
+			goto err_fw_put;
+		}
+
+		stream->override_gucrc = true;
+	}
+
+	// stream->engine->gt->perf.sseu = props->sseu; // FIXME
+
 	ret = xe_oa_enable_metric_set(stream);
 	if (ret) {
 		drm_dbg(&oa->xe->drm, "Unable to enable metric set\n");
-		goto err_fw_put;
+		goto err_unset_gucrc;
 	}
 
 	drm_dbg(&oa->xe->drm, "opening stream oa config uuid=%s\n",
@@ -1378,17 +2032,24 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream,
 
 	return 0;
 
-err_fw_put:
+err_unset_gucrc:
 	xe_oa_disable_metric_set(stream);
+	if (stream->override_gucrc)
+		intel_guc_slpc_unset_gucrc_mode(gt);
+err_fw_put:
 	XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL));
 	xe_device_mem_access_put(stream->oa->xe);
 	xe_oa_free_oa_buffer(stream);
 err_free_configs:
 	xe_oa_free_configs(stream);
+err_free_noa_wait:
+	xe_oa_free_noa_wait(stream);
 exit:
 	return ret;
 }
 
+__UNUSED__ void xe_oa_init_reg_state(void) {} /* FIXME: empty stub, not called from other parts of xe */
+
 static int
 xe_oa_stream_open_ioctl_locked(struct xe_oa *oa,
 			       struct drm_xe_oa_open_param *param,
@@ -1403,6 +2064,9 @@ xe_oa_stream_open_ioctl_locked(struct xe_oa *oa,
 	int stream_fd;
 	int ret;
 
+	// FIXME: check: some checks and initialization have been moved
+	// between stream_open_ioctl_locked, xe_oa_stream_init and read_properties
+
 	if (props->single_exec_q) {
 		q = xe_exec_queue_lookup(xef, props->exec_q_id);
 		if (XE_IOCTL_DBG(oa->xe, !q)) {
@@ -1419,6 +2083,17 @@ xe_oa_stream_open_ioctl_locked(struct xe_oa *oa,
 	if (q && !props->sample)
 		privileged_op = false;
 
+	if (props->hold_preemption) {
+		if (!q) {
+			drm_dbg(&oa->xe->drm, "preemption disable with no engine\n");
+			ret = -EINVAL;
+			goto err_exec_q;
+		}
+		privileged_op = true;
+	}
+
+	// get_default_sseu_config(&props->sseu, props->engine); // FIXME
+
 	if (privileged_op && xe_oa_stream_paranoid && !perfmon_capable()) {
 		drm_dbg(&oa->xe->drm, "Insufficient privileges to open xe perf stream\n");
 		ret = -EACCES;
@@ -1444,6 +2119,7 @@ xe_oa_stream_open_ioctl_locked(struct xe_oa *oa,
 		goto err_exec_q;
 	}
 
+	stream->xef = xef;
 	stream->oa = oa;
 	stream->exec_q = q;
 
@@ -1496,7 +2172,7 @@ u32 xe_oa_timestamp_frequency(struct xe_device *xe)
 	switch (xe->info.platform) {
 	case XE_DG2:
 	case XE_METEORLAKE:
-		xe_device_mem_access_get(xe);
+		xe_device_mem_access_get(xe); // FIXME: check
 		XE_WARN_ON(xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL));
 		reg = xe_mmio_read32(xe_root_mmio_gt(xe), RPM_CONFIG0);
 		XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL));
@@ -1508,6 +2184,9 @@ u32 xe_oa_timestamp_frequency(struct xe_device *xe)
 	default:
 		return xe_root_mmio_gt(xe)->info.clock_freq;
 	}
+
+	// FIXME: should this be per gt, even in i915?
+
 }
 
 static u64 oa_exponent_to_ns(struct xe_oa *oa, int exponent)
@@ -1617,6 +2296,17 @@ static int xe_oa_read_properties_unlocked(struct xe_oa *oa, u64 __user *uprops,
 			props->oa_periodic = true;
 			props->oa_period_exponent = value;
 			break;
+		case DRM_XE_OA_PROP_HOLD_PREEMPTION:
+			// FIXME: do this later
+			props->hold_preemption = value;
+			break;
+		case DRM_XE_OA_PROP_GLOBAL_SSEU:
+			/*
+			 * FIXME: Confirm this, on i915 supported only for < 12.5
+			 * xe_oa_open_properties.has_sseu is removed (always false)
+			 */
+			drm_dbg(&oa->xe->drm, "SSEU config not supported\n");
+			return -ENODEV;
 		case DRM_XE_OA_PROP_POLL_OA_PERIOD:
 			if (value < 100000 /* 100us */) {
 				drm_dbg(&oa->xe->drm, "OA timer too small (%lluns < 100us)\n",
@@ -1664,6 +2354,19 @@ static int xe_oa_read_properties_unlocked(struct xe_oa *oa, u64 __user *uprops,
 		return -EINVAL;
 	}
 
+#if 0 // FIXME: Do this later
+	/*
+	 * Wa_14017512683: mtl[a0..c0): Use of OAM must be preceded with Media
+	 * C6 disable in BIOS. Fail if Media C6 is enabled on steppings where OAM
+	 * does not work as expected.
+	 */
+	if (IS_MTL_MEDIA_STEP(props->engine->xe, STEP_A0, STEP_C0) &&
+	    props->engine->oa_group->type == TYPE_OAM &&
+	    intel_check_bios_c6_setup(&props->engine->gt->rc6)) {
+		drm_dbg(&oa->xe->drm, "OAM requires media C6 to be disabled in BIOS\n");
+		return -EINVAL;
+	}
+#endif
 	f = &oa->oa_formats[props->oa_format];
 	if (!props->oa_format || !f->size ||
 	    !engine_supports_oa_format(props->hwe, f->type)) {
@@ -1768,6 +2471,7 @@ static const struct xe_mmio_range mtl_oam_b_counters[] = {
 	{}
 };
 
+/* FIXME: Checks below have been simplified/loosened for now compared with i915 */
 static bool xe_oa_is_valid_b_counter_addr(struct xe_oa *oa, u32 addr)
 {
 	return xe_oa_reg_in_range_table(addr, xehp_oa_b_counters) ||
@@ -2281,6 +2985,27 @@ int xe_oa_init(struct xe_device *xe)
 	mutex_init(&oa->metrics_lock);
 	idr_init_base(&oa->metrics_idr, 1);
 
+	/* We set up some ratelimit state to potentially throttle any
+	 * _NOTES about spurious, invalid OA reports which we don't
+	 * forward to userspace.
+	 *
+	 * We print a _NOTE about any throttling when closing the
+	 * stream instead of waiting until driver _fini which no one
+	 * would ever see.
+	 *
+	 * Using the same limiting factors as printk_ratelimit()
+	 */
+	ratelimit_state_init(&oa->spurious_report_rs, 5 * HZ, 10);
+	/* Since we use a DRM_NOTE for spurious reports it would be
+	 * inconsistent to let __ratelimit() automatically print a
+	 * warning for throttling.
+	 */
+	ratelimit_set_flags(&oa->spurious_report_rs, RATELIMIT_MSG_ON_RELEASE);
+	ratelimit_state_init(&oa->tail_pointer_race, 5 * HZ, 10);
+	ratelimit_set_flags(&oa->tail_pointer_race,RATELIMIT_MSG_ON_RELEASE);
+
+	atomic64_set(&oa->noa_programming_delay, 500 * 1000 /* 500us */);
+
 	ret = xe_oa_init_engine_groups(oa);
 	if (ret) {
 		drm_err(&xe->drm, "OA initialization failed %d\n", ret);
diff --git a/drivers/gpu/drm/xe/xe_oa.h b/drivers/gpu/drm/xe/xe_oa.h
index 41a7d8b0f10e1..cb716ffc4f7c9 100644
--- a/drivers/gpu/drm/xe/xe_oa.h
+++ b/drivers/gpu/drm/xe/xe_oa.h
@@ -8,6 +8,7 @@
 
 #include "xe_oa_types.h"
 
+/* Below __UNUSED__ refers to exported oa functions not called from other parts of xe */
 int xe_oa_init(struct xe_device *xe);
 void xe_oa_fini(struct xe_device *xe);
 void xe_oa_register(struct xe_device *xe);
@@ -22,6 +23,12 @@ int xe_oa_add_config_ioctl(struct drm_device *dev, void *data,
 			   struct drm_file *file);
 int xe_oa_remove_config_ioctl(struct drm_device *dev, void *data,
 			      struct drm_file *file);
+void xe_oa_init_reg_state(void); // __UNUSED__
+
+// struct xe_oa_config *xe_oa_get_oa_config(struct xe_oa *oa, int metrics_set);  // __UNUSED__
+// struct xe_oa_config *xe_oa_config_get(struct xe_oa_config *oa_config); // __UNUSED__
+// void xe_oa_config_put(struct xe_oa_config *oa_config); // __UNUSED__
+
 u32 xe_oa_timestamp_frequency(struct xe_device *xe);
 u32 xe_oa_unit_id(struct xe_hw_engine *hwe);
 
diff --git a/drivers/gpu/drm/xe/xe_oa_types.h b/drivers/gpu/drm/xe/xe_oa_types.h
index 58164ff0b6a48..382566c85f75e 100644
--- a/drivers/gpu/drm/xe/xe_oa_types.h
+++ b/drivers/gpu/drm/xe/xe_oa_types.h
@@ -110,6 +110,12 @@ struct xe_oa_gt {
 	/** @lock: lock associated with anything below within this structure */
 	struct mutex lock;
 
+	/** FIXME
+	 * @sseu: sseu configuration selected to run while perf is active,
+	 * applies to all contexts.
+	 */
+	// struct intel_sseu sseu;
+
 	/** @num_oa_groups: number of oa groups per gt */
 	u32 num_oa_groups;
 
@@ -138,9 +144,23 @@ struct xe_oa {
 	 */
 	struct idr metrics_idr;
 
+	/**
+	 * @spurious_report_rs: For rate limiting notifications of spurious
+	 * invalid OA reports
+	 */
+	struct ratelimit_state spurious_report_rs;
+
+	/** @tail_pointer_race: For rate limiting notifications of tail pointer race */
+	struct ratelimit_state tail_pointer_race;
+
 	/** @ctx_oactxctrl_offset: offset of OACTXCONTROL register in context image */
 	u32 ctx_oactxctrl_offset;
 
+	// u32 gen7_latched_oastatus1; // FIXME
+	// u32 ctx_flexeu0_offset;
+	// u32 gen8_valid_ctx_bit; // FIXME: deleted
+	// struct i915_oa_ops ops; // FIXME: these are deleted
+
 	/** @oa_formats: tracks all OA formats across platforms */
 	const struct xe_oa_format *oa_formats;
 
@@ -149,6 +169,9 @@ struct xe_oa {
 	/** @format_mask: tracks valid OA formats for a platform */
 	unsigned long format_mask[FORMAT_MASK_SIZE];
 
+	/** @noa_programming_delay: NOA wait programmed delay value */
+	atomic64_t noa_programming_delay;
+
 	/** @oa_unit_ids: tracks oa unit ids assigned across gt's */
 	u32 oa_unit_ids;
 };
@@ -157,12 +180,17 @@ struct xe_oa {
  * struct xe_oa_stream - state for a single open stream FD
  */
 struct xe_oa_stream {
+	// FIXME: xef doesn't work, see __xe_oa_configure_all_contexts
+	/** @xef: xe_file associated with oa stream */
+	struct xe_file *xef;
+
 	/** @oa: xe_oa backpointer */
 	struct xe_oa *oa;
 
 	/** @gt: gt associated with the oa stream */
 	struct xe_gt *gt;
 
+	// FIXME: struct xe_hw_engine instead of intel_engine_cs
 	/**
 	 * @hwe: hardware engine associated with this performance stream.
 	 */
@@ -185,6 +213,7 @@ struct xe_oa_stream {
 	 */
 	int sample_size;
 
+	/* FIXME: struct xe_engine instead of i915_gem_context */
 	/**
 	 * @exec_q: %NULL if measuring system-wide across all exec_q's or a
 	 * specific exec_q that is being monitored.
@@ -198,6 +227,21 @@ struct xe_oa_stream {
 	 */
 	bool enabled;
 
+	/**
+	 * @hold_preemption: Whether preemption is put on hold for command
+	 * submissions done on the @exec_q. This is useful for some drivers that
+	 * cannot easily post process the OA buffer context to subtract delta
+	 * of performance counters not associated with @exec_q.
+	 */
+	bool hold_preemption;
+
+	// FIXME: these are deleted
+	/**
+	 * @ops: The callbacks providing the implementation of this specific
+	 * type of configured stream.
+	 */
+	// const struct xe_perf_stream_ops *ops;
+
 	/** @oa_config: The OA configuration used by the stream */
 	struct xe_oa_config *oa_config;
 
@@ -207,6 +251,12 @@ struct xe_oa_stream {
 	 */
 	struct llist_head oa_config_bos;
 
+	// FIXME: not needed for xe, should be 'struct xe_lrc *' if needed
+	/**
+	 * @pinned_ctx: The OA context specific information.
+	 */
+	// struct intel_context *pinned_ctx;
+
 	/** @specific_ctx_id: id of the context used for filtering reports */
 	u32 specific_ctx_id;
 
@@ -249,6 +299,9 @@ struct xe_oa_stream {
 		/** @last_ctx_id: last context id for OA data added */
 		u32 last_ctx_id;
 
+		// struct xe_vma *vma;
+		// int size_exponent;
+
 		/**
 		 * @ptr_lock: Locks reads and writes to all head/tail state
 		 *
@@ -286,10 +339,22 @@ struct xe_oa_stream {
 		u32 tail;
 	} oa_buffer;
 
+	/**
+	 * @noa_wait: A batch buffer doing a wait on the GPU for the NOA
+	 * logic to be reprogrammed.
+	 */
+	struct xe_bo *noa_wait;
+
 	/**
 	 * @poll_oa_period: The period in nanoseconds at which the OA
 	 * buffer should be checked for available data.
 	 */
 	u64 poll_oa_period;
+
+	/**
+	 * @override_gucrc: GuC RC has been overridden for the perf stream,
+	 * and we need to restore the default configuration on release.
+	 */
+	bool override_gucrc;
 };
 #endif
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index b4ab07c285245..ec2af7244e1de 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -1146,6 +1146,8 @@ enum drm_xe_oa_property_id {
 	 */
 	DRM_XE_OA_PROP_POLL_OA_PERIOD,
 
+	 /* FIXME: Should the OA unit be identified by OA unit id? In that case
+	    what happens when multiple engines are connected to an OA unit? */
 	/**
 	 * Multiple engines may be mapped to the same OA unit. The OA unit is
 	 * identified by class:instance of any engine mapped to it.
-- 
2.41.0



More information about the Intel-gfx-trybot mailing list