[PATCH 17/17] drm/xe/oa: Enable Xe2+ overrun mode

Umesh Nerlige Ramappa umesh.nerlige.ramappa at intel.com
Tue Mar 12 20:17:23 UTC 2024


On Tue, Mar 12, 2024 at 01:14:14PM -0700, Umesh Nerlige Ramappa wrote:
>On Mon, Mar 11, 2024 at 08:40:03PM -0700, Ashutosh Dixit wrote:
>>Enable Xe2+ overrun mode. For Xe2+, when overrun mode is enabled, there are
>>no partial reports at the end of buffer, making the OA buffer effectively a
>>non-power-of-2 size circular buffer whose size, circ_size, is a multiple of
>>the report size.
>>
>>Signed-off-by: Ashutosh Dixit <ashutosh.dixit at intel.com>
>>---
>>drivers/gpu/drm/xe/xe_oa.c       | 36 +++++++++++++++++++++++++-------
>>drivers/gpu/drm/xe/xe_oa_types.h |  3 +++
>>2 files changed, 31 insertions(+), 8 deletions(-)
>>
>>diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c
>>index 6f5bbb0787d9..6a0d2e229254 100644
>>--- a/drivers/gpu/drm/xe/xe_oa.c
>>+++ b/drivers/gpu/drm/xe/xe_oa.c
>>@@ -106,7 +106,15 @@ static const struct xe_oa_format oa_formats[] = {
>>
>>static u32 xe_oa_circ_diff(struct xe_oa_stream *stream, u32 tail, u32 head)
>>{
>>-	return (tail - head) & (XE_OA_BUFFER_SIZE - 1);
>>+	if (stream->oa_buffer.circ_size == XE_OA_BUFFER_SIZE)
>>+		return (tail - head) & (XE_OA_BUFFER_SIZE - 1);
>>+	else
>>+		return (tail - head) % stream->oa_buffer.circ_size;
>>+}
>
>For example, consider a 16 MB buffer with a report size of 384 bytes. 
>At the end of the buffer, you would have an empty space of 256 bytes 
>(16 MB % 384 = 256).
>
>(For ref: 16 MB = 0x1000000, 384 = 0x180)
>In this case circ_size = 0xFFFF00
>
>Let's say your head is pointing to 0xFFFD80 and tail is pointing to 
>0x180 (essentially there is one unread report at the end of the buffer 
>and one unread report at the beginning of the buffer).
>
>In this case, (tail - head) % stream->oa_buffer.circ_size is not 
>calculating the correct size. It should be 0x300, but I am not getting 
>that (with u32 arithmetic, tail - head wraps to 0xFF000400, and 
>0xFF000400 % 0xFFFF00 is 0x10300). Can you please check/verify?
>
>I am thinking we need something like this (roughly). We don't need the 
>mod operation.
>
>static u32 xe_oa_circ_diff(struct xe_oa_stream *stream, u32 tail, u32 head)
>{
>	u32 available = (tail - head) & (XE_OA_BUFFER_SIZE - 1);
>
>	/* head is always aligned to report size, but tail may not be */
>	available = available % format_size;

Correction: the line above should instead be

available -= available % format_size;

>
>	return tail >= head ? available : available - empty_space;
>}
>
>>+
>>+static u32 xe_oa_circ_incr(struct xe_oa_stream *stream, u32 ptr, u32 n)
>>+{
>>+	return xe_oa_circ_diff(stream, ptr, -n);
>>}
>>
>>static void xe_oa_config_release(struct kref *ref)
>>@@ -280,7 +288,7 @@ static int xe_oa_append_report(struct xe_oa_stream *stream, char __user *buf,
>>
>>	buf += *offset;
>>
>>-	oa_buf_end = stream->oa_buffer.vaddr + XE_OA_BUFFER_SIZE;
>>+	oa_buf_end = stream->oa_buffer.vaddr + stream->oa_buffer.circ_size;
>>	report_size_partial = oa_buf_end - report;
>>
>>	if (report_size_partial < report_size) {
>>@@ -306,7 +314,6 @@ static int xe_oa_append_reports(struct xe_oa_stream *stream, char __user *buf,
>>	int report_size = stream->oa_buffer.format->size;
>>	u8 *oa_buf_base = stream->oa_buffer.vaddr;
>>	u32 gtt_offset = xe_bo_ggtt_addr(stream->oa_buffer.bo);
>>-	u32 mask = (XE_OA_BUFFER_SIZE - 1);
>>	size_t start_offset = *offset;
>>	unsigned long flags;
>>	u32 head, tail;
>>@@ -317,21 +324,23 @@ static int xe_oa_append_reports(struct xe_oa_stream *stream, char __user *buf,
>>	tail = stream->oa_buffer.tail;
>>	spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
>>
>>-	xe_assert(stream->oa->xe, head < XE_OA_BUFFER_SIZE && tail < XE_OA_BUFFER_SIZE);
>>+	xe_assert(stream->oa->xe,
>>+		  head < stream->oa_buffer.circ_size && tail < stream->oa_buffer.circ_size);
>>
>>-	for (; xe_oa_circ_diff(stream, tail, head); head = (head + report_size) & mask) {
>>+	for (; xe_oa_circ_diff(stream, tail, head);
>>+	     head = xe_oa_circ_incr(stream, head, report_size)) {
>>		u8 *report = oa_buf_base + head;
>>
>>		ret = xe_oa_append_report(stream, buf, count, offset, report);
>>		if (ret)
>>			break;
>>
>>-		if (is_power_of_2(report_size)) {
>>+		if (!(stream->oa_buffer.circ_size % report_size)) {
>>			/* Clear out report id and timestamp to detect unlanded reports */
>>			oa_report_id_clear(stream, (void *)report);
>>			oa_timestamp_clear(stream, (void *)report);
>>		} else {
>>-			u8 *oa_buf_end = stream->oa_buffer.vaddr + XE_OA_BUFFER_SIZE;
>>+			u8 *oa_buf_end = stream->oa_buffer.vaddr + stream->oa_buffer.circ_size;
>>			u32 part = oa_buf_end - report;
>>
>>			/* Zero out the entire report */
>>@@ -369,7 +378,6 @@ static void xe_oa_init_oa_buffer(struct xe_oa_stream *stream)
>>	xe_mmio_write32(stream->gt, __oa_regs(stream)->oa_head_ptr,
>>			gtt_offset & OAG_OAHEADPTR_MASK);
>>	stream->oa_buffer.head = 0;
>>-
>>	/*
>>	 * PRM says: "This MMIO must be set before the OATAILPTR register and after the
>>	 * OAHEADPTR register. This is to enable proper functionality of the overflow bit".
>>@@ -1256,6 +1264,18 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream,
>>	stream->periodic = param->period_exponent > 0;
>>	stream->period_exponent = param->period_exponent;
>>
>>+	/*
>>+	 * For Xe2+, when overrun mode is enabled, there are no partial reports at the end
>>+	 * of buffer, making the OA buffer effectively a non-power-of-2 size circular
>>+	 * buffer whose size, circ_size, is a multiple of the report size
>>+	 */
>>+	if (GRAPHICS_VER(stream->oa->xe) >= 20 &&
>>+	    stream->hwe->oa_unit->type == DRM_XE_OA_UNIT_TYPE_OAG && stream->sample)
>>+		stream->oa_buffer.circ_size =
>>+			XE_OA_BUFFER_SIZE - XE_OA_BUFFER_SIZE % stream->oa_buffer.format->size;
>>+	else
>>+		stream->oa_buffer.circ_size = XE_OA_BUFFER_SIZE;
>>+
>>	if (stream->exec_q && engine_supports_mi_query(stream->hwe)) {
>>		/* If we don't find the context offset, just return error */
>>		ret = xe_oa_set_ctx_ctrl_offset(stream);
>>diff --git a/drivers/gpu/drm/xe/xe_oa_types.h b/drivers/gpu/drm/xe/xe_oa_types.h
>>index 6984e7d04be5..d8d5c9d8c22e 100644
>>--- a/drivers/gpu/drm/xe/xe_oa_types.h
>>+++ b/drivers/gpu/drm/xe/xe_oa_types.h
>>@@ -163,6 +163,9 @@ struct xe_oa_buffer {
>>
>>	/** @tail: The last verified cached tail where HW has completed writing */
>>	u32 tail;
>>+
>>+	/** @circ_size: The effective circular buffer size, for Xe2+ */
>>+	u32 circ_size;
>
>You could store the difference (XE_OA_BUFFER_SIZE - circ_size, i.e. the
>unused space at the end of the buffer) here instead.
>
>	/** @empty_space: empty space at end of buffer */
>	u32 empty_space;
>
>Regards,
>Umesh
>
>>};
>>
>>/**
>>-- 
>>2.41.0
>>


More information about the Intel-xe mailing list