[PATCH v23 3/6] drm/xe/guc: Add capture size check in GuC log buffer

Wed Oct 2 23:44:35 UTC 2024

Capture nodes generated by GuC are placed in the GuC capture ring
buffer, which is a sub-region of the larger GuC log buffer.
Add a capture output size check before allocating the shared buffer.

Signed-off-by: Zhanjun Dong <zhanjun.dong at intel.com>
Reviewed-by: Alan Previn <alan.previn.teres.alexis at intel.com>
---
 drivers/gpu/drm/xe/abi/guc_log_abi.h | 20 +++++++
 drivers/gpu/drm/xe/xe_guc_capture.c  | 83 +++++++++++++++++++++++++++-
 drivers/gpu/drm/xe/xe_guc_log.c      | 65 ++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_guc_log.h      |  7 ++-
 4 files changed, 173 insertions(+), 2 deletions(-)
 create mode 100644 drivers/gpu/drm/xe/abi/guc_log_abi.h

diff --git a/drivers/gpu/drm/xe/abi/guc_log_abi.h b/drivers/gpu/drm/xe/abi/guc_log_abi.h
new file mode 100644
index 000000000000..10db4ffaa17f
--- /dev/null
+++ b/drivers/gpu/drm/xe/abi/guc_log_abi.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2024 Intel Corporation
+ */
+
+#ifndef _ABI_GUC_LOG_ABI_H
+#define _ABI_GUC_LOG_ABI_H
+
+#include <linux/types.h>
+
+/* GuC logging buffer types, in the order their sections appear in the log buffer */
+enum guc_log_buffer_type {
+	GUC_LOG_BUFFER_CRASH_DUMP,
+	GUC_LOG_BUFFER_DEBUG,
+	GUC_LOG_BUFFER_CAPTURE,
+};
+
+#define GUC_LOG_BUFFER_TYPE_MAX		3	/* number of buffer types above */
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_guc_capture.c b/drivers/gpu/drm/xe/xe_guc_capture.c
index b84f9f30609f..7090985e9cdf 100644
--- a/drivers/gpu/drm/xe/xe_guc_capture.c
+++ b/drivers/gpu/drm/xe/xe_guc_capture.c
@@ -22,6 +22,7 @@
 #include "xe_gt_mcr.h"
 #include "xe_gt_printk.h"
 #include "xe_guc.h"
+#include "xe_guc_ads.h"
 #include "xe_guc_capture.h"
 #include "xe_guc_capture_types.h"
 #include "xe_guc_ct.h"
@@ -670,6 +671,85 @@ size_t xe_guc_capture_ads_input_worst_size(struct xe_guc *guc)
 	return PAGE_ALIGN(total_size);
 }
 
+static int guc_capture_output_size_est(struct xe_guc *guc)
+{
+	/* Per-engine overhead: one group header plus one header per register list */
+	const size_t hdr_size = sizeof(struct guc_state_capture_group_header_t) +
+				3 * sizeof(struct guc_state_capture_header_t);
+	struct xe_gt *gt = guc_to_gt(guc);
+	enum xe_hw_engine_id id;
+	struct xe_hw_engine *hwe;
+	size_t list_size = 0;
+	int total = 0;
+
+	if (!guc->capture)
+		return -ENODEV;
+
+	/*
+	 * Worst case: every engine instance fails in quick succession for
+	 * unrelated reasons, so GuC emits one error capture per instance,
+	 * one at a time, repeating the global registers in each of them.
+	 *
+	 * Each capture consists of one group header followed by three
+	 * register lists (global, engine class and engine instance), each
+	 * with its own header. Lists for which guc_capture_getlistsize()
+	 * returns non-zero are left out of the estimate.
+	 */
+	for_each_hw_engine(hwe, gt, id) {
+		enum guc_capture_list_class_type class_id =
+			xe_engine_class_to_guc_capture_class(hwe->class);
+
+		total += hdr_size;
+
+		if (!guc_capture_getlistsize(guc, 0, GUC_STATE_CAPTURE_TYPE_GLOBAL,
+					     0, &list_size, true))
+			total += list_size;
+		if (!guc_capture_getlistsize(guc, 0, GUC_STATE_CAPTURE_TYPE_ENGINE_CLASS,
+					     class_id, &list_size, true))
+			total += list_size;
+		if (!guc_capture_getlistsize(guc, 0, GUC_STATE_CAPTURE_TYPE_ENGINE_INSTANCE,
+					     class_id, &list_size, true))
+			total += list_size;
+	}
+
+	return total;
+}
+
+/*
+ * Add on a 3x multiplier to allow for multiple back-to-back captures occurring
+ * before Xe can read the data out and process it
+ */
+#define GUC_CAPTURE_OVERBUFFER_MULTIPLIER 3
+
+static void check_guc_capture_size(struct xe_guc *guc)
+{
+	int capture_size = guc_capture_output_size_est(guc);
+	int spare_size = capture_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER;
+	u32 buffer_size = xe_guc_log_section_size_capture(&guc->log);
+
+	/*
+	 * Debug-only sanity check of the capture region size; a negative
+	 * estimate is an errno and must not reach the unsigned comparisons.
+	 * NOTE: capture_size is much smaller than the capture region
+	 * allocation (DG2: <80K vs 1MB) and assumes every engine is reset
+	 * within the same G2H handler task slot, which is very unlikely.
+	 * If GuC does run out of space anyway, a separate warning is emitted
+	 * when processing the G2H capture-notification event, search for:
+	 * xe_guc_STATE_CAPTURE_EVENT_STATUS_NOSPACE.
+	 */
+	if (capture_size < 0)
+		xe_gt_dbg(guc_to_gt(guc),
+			  "Failed to calculate error state capture buffer minimum size: %d!\n",
+			  capture_size);
+	else if (capture_size > buffer_size)
+		xe_gt_dbg(guc_to_gt(guc), "Error state capture buffer maybe small: %d < %d\n",
+			  buffer_size, capture_size);
+	else if (spare_size > buffer_size)
+		xe_gt_dbg(guc_to_gt(guc),
+			  "Error state capture buffer lacks spare size: %d < %d (min = %d)\n",
+			  buffer_size, spare_size, capture_size);
+}
+
 /*
  * xe_guc_capture_steered_list_init - Init steering register list
  * @guc: The GuC object
@@ -685,9 +765,10 @@ void xe_guc_capture_steered_list_init(struct xe_guc *guc)
 	 * the end of the pre-populated render list.
 	 */
 	guc_capture_alloc_steered_lists(guc);
+	check_guc_capture_size(guc);
 }
 
-/**
+/*
  * xe_guc_capture_init - Init for GuC register capture
  * @guc: The GuC object
  *
diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
index a37ee3419428..d6b5ac522b6c 100644
--- a/drivers/gpu/drm/xe/xe_guc_log.c
+++ b/drivers/gpu/drm/xe/xe_guc_log.c
@@ -96,3 +96,68 @@ int xe_guc_log_init(struct xe_guc_log *log)
 
 	return 0;
 }
+
+static u32 xe_guc_log_section_size_crash(struct xe_guc_log *log)
+{
+	return CRASH_BUFFER_SIZE; /* @log is not consulted */
+}
+
+static u32 xe_guc_log_section_size_debug(struct xe_guc_log *log)
+{
+	return DEBUG_BUFFER_SIZE; /* @log is not consulted */
+}
+
+/**
+ * xe_guc_log_section_size_capture - Get capture buffer size within log sections.
+ * @log: The log object (not consulted by the current implementation).
+ *
+ * The capture section size is the fixed CAPTURE_BUFFER_SIZE, in bytes.
+ *
+ * Return: capture buffer size.
+ */
+u32 xe_guc_log_section_size_capture(struct xe_guc_log *log)
+{
+	return CAPTURE_BUFFER_SIZE;
+}
+
+/**
+ * xe_guc_get_log_buffer_size - Get log buffer size for a type.
+ * @log: The log object.
+ * @type: The log buffer type
+ *
+ * Return: buffer size, or 0 if @type is not a known buffer type.
+ */
+u32 xe_guc_get_log_buffer_size(struct xe_guc_log *log, enum guc_log_buffer_type type)
+{
+	switch (type) {
+	case GUC_LOG_BUFFER_CRASH_DUMP:
+		return xe_guc_log_section_size_crash(log);
+	case GUC_LOG_BUFFER_DEBUG:
+		return xe_guc_log_section_size_debug(log);
+	case GUC_LOG_BUFFER_CAPTURE:
+		return xe_guc_log_section_size_capture(log);
+	}
+	return 0;
+}
+
+/**
+ * xe_guc_get_log_buffer_offset - Get offset in log buffer for a type.
+ * @log: The log object.
+ * @type: The log buffer type
+ *
+ * Sections follow the leading state page in enum guc_log_buffer_type order.
+ * Return: buffer offset.
+ */
+u32 xe_guc_get_log_buffer_offset(struct xe_guc_log *log, enum guc_log_buffer_type type)
+{
+	/* The first page is for the log_buffer_states */
+	u32 offset = PAGE_SIZE;
+	enum guc_log_buffer_type section;
+
+	/* Sum the sizes of all sections laid out before @type */
+	for (section = GUC_LOG_BUFFER_CRASH_DUMP;
+	     section < GUC_LOG_BUFFER_TYPE_MAX && section != type; ++section)
+		offset += xe_guc_get_log_buffer_size(log, section);
+
+	return offset;
+}
diff --git a/drivers/gpu/drm/xe/xe_guc_log.h b/drivers/gpu/drm/xe/xe_guc_log.h
index 2d25ab28b4b3..87ecd1814854 100644
--- a/drivers/gpu/drm/xe/xe_guc_log.h
+++ b/drivers/gpu/drm/xe/xe_guc_log.h
@@ -7,6 +7,7 @@
 #define _XE_GUC_LOG_H_
 
 #include "xe_guc_log_types.h"
+#include "abi/guc_log_abi.h"
 
 struct drm_printer;
 
@@ -17,7 +18,7 @@ struct drm_printer;
 #else
 #define CRASH_BUFFER_SIZE	SZ_8K
 #define DEBUG_BUFFER_SIZE	SZ_64K
-#define CAPTURE_BUFFER_SIZE	SZ_16K
+#define CAPTURE_BUFFER_SIZE	SZ_1M
 #endif
 /*
  * While we're using plain log level in i915, GuC controls are much more...
@@ -45,4 +46,8 @@ xe_guc_log_get_level(struct xe_guc_log *log)
 	return log->level;
 }
 
+u32 xe_guc_log_section_size_capture(struct xe_guc_log *log);
+u32 xe_guc_get_log_buffer_size(struct xe_guc_log *log, enum guc_log_buffer_type type);
+u32 xe_guc_get_log_buffer_offset(struct xe_guc_log *log, enum guc_log_buffer_type type);
+
 #endif
-- 
2.34.1