<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body>
On 5/16/2023 12:17, Belgaumkar, Vinay wrote:<br>
<blockquote type="cite" cite="mid:696ba7f2-4353-e154-af0b-83604dda2546@intel.com">
On 4/18/2023 11:17 AM, <a class="moz-txt-link-abbreviated" href="mailto:John.C.Harrison@Intel.com">John.C.Harrison@Intel.com</a> wrote:
<br>
<blockquote type="cite">From: John Harrison
<a class="moz-txt-link-rfc2396E" href="mailto:John.C.Harrison@Intel.com"><John.C.Harrison@Intel.com></a>
<br>
<br>
This is useful for getting debug information out in certain
<br>
situations, such as failing kernel selftests and CI runs that
don't
<br>
log error captures. It is especially useful for things like
retrieving
<br>
GuC logs as GuC operation can't be tracked by adding printk or
ftrace
<br>
entries.
<br>
<br>
v2: Add CONFIG_DRM_I915_DEBUG_GEM wrapper (review feedback by
Rodrigo).
<br>
</blockquote>
<br>
Do the CI sparse warnings hold water? With that looked at,
<br>
</blockquote>
You mean this totally fatal, absolutely-must-be-fixed error?<br>
<blockquote>
<pre class="moz-quote-pre" wrap="">Fast mode used, each commit won't be checked separately.</pre>
</blockquote>
<br>
Does anyone even know what that means or why it (apparently totally
randomly) hits some patch sets and not others?<br>
<br>
If you mean the checkpatch warnings: one is about not reporting
out-of-memory errors (because you are supposed to return -ENOMEM and let
the user handle it instead), but that doesn't apply to an internal,
kernel-only thing which is already just a debug print. The other is
about macro argument re-use, which is not an issue in this case and
not worth re-writing the code to avoid.<br>
<br>
John.<br>
<br>
<br>
<blockquote type="cite" cite="mid:696ba7f2-4353-e154-af0b-83604dda2546@intel.com">
<br>
LGTM,
<br>
<br>
Reviewed-by: Vinay Belgaumkar <a class="moz-txt-link-rfc2396E" href="mailto:vinay.belgaumkar@intel.com"><vinay.belgaumkar@intel.com></a>
<br>
<br>
<blockquote type="cite">
<br>
Signed-off-by: John Harrison <a class="moz-txt-link-rfc2396E" href="mailto:John.C.Harrison@Intel.com"><John.C.Harrison@Intel.com></a>
<br>
---
<br>
drivers/gpu/drm/i915/i915_gpu_error.c | 132
++++++++++++++++++++++++++
<br>
drivers/gpu/drm/i915/i915_gpu_error.h | 10 ++
<br>
2 files changed, 142 insertions(+)
<br>
<br>
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c
b/drivers/gpu/drm/i915/i915_gpu_error.c
<br>
index f020c0086fbcd..03d62c250c465 100644
<br>
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
<br>
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
<br>
@@ -2219,3 +2219,135 @@ void i915_disable_error_state(struct
drm_i915_private *i915, int err)
<br>
i915->gpu_error.first_error = ERR_PTR(err);
<br>
spin_unlock_irq(&i915->gpu_error.lock);
<br>
}
<br>
+
<br>
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
<br>
+void intel_klog_error_capture(struct intel_gt *gt,
<br>
+ intel_engine_mask_t engine_mask)
<br>
+{
<br>
+ static int g_count;
<br>
+ struct drm_i915_private *i915 = gt->i915;
<br>
+ struct i915_gpu_coredump *error;
<br>
+ intel_wakeref_t wakeref;
<br>
+ size_t buf_size = PAGE_SIZE * 128;
<br>
+ size_t pos_err;
<br>
+ char *buf, *ptr, *next;
<br>
+ int l_count = g_count++;
<br>
+ int line = 0;
<br>
+
<br>
+ /* Can't allocate memory during a reset */
<br>
+ if (test_bit(I915_RESET_BACKOFF, &gt->reset.flags))
{
<br>
+ drm_err(&gt->i915->drm, "[Capture/%d.%d]
Inside GT reset, skipping error capture :(\n",
<br>
+ l_count, line++);
<br>
+ return;
<br>
+ }
<br>
+
<br>
+ error = READ_ONCE(i915->gpu_error.first_error);
<br>
+ if (error) {
<br>
+ drm_err(&i915->drm, "[Capture/%d.%d] Clearing
existing error capture first...\n",
<br>
+ l_count, line++);
<br>
+ i915_reset_error_state(i915);
<br>
+ }
<br>
+
<br>
+ with_intel_runtime_pm(&i915->runtime_pm, wakeref)
<br>
+ error = i915_gpu_coredump(gt, engine_mask,
CORE_DUMP_FLAG_NONE);
<br>
+
<br>
+ if (IS_ERR(error)) {
<br>
+ drm_err(&i915->drm, "[Capture/%d.%d] Failed to
capture error capture: %ld!\n",
<br>
+ l_count, line++, PTR_ERR(error));
<br>
+ return;
<br>
+ }
<br>
+
<br>
+ buf = kvmalloc(buf_size, GFP_KERNEL);
<br>
+ if (!buf) {
<br>
+ drm_err(&i915->drm, "[Capture/%d.%d] Failed to
allocate buffer for error capture!\n",
<br>
+ l_count, line++);
<br>
+ i915_gpu_coredump_put(error);
<br>
+ return;
<br>
+ }
<br>
+
<br>
+ drm_info(&i915->drm, "[Capture/%d.%d] Dumping i915
error capture for %ps...\n",
<br>
+ l_count, line++, __builtin_return_address(0));
<br>
+
<br>
+ /* Largest string length safe to print via dmesg */
<br>
+# define MAX_CHUNK 800
<br>
+
<br>
+ pos_err = 0;
<br>
+ while (1) {
<br>
+ ssize_t got = i915_gpu_coredump_copy_to_buffer(error,
buf, pos_err, buf_size - 1);
<br>
+
<br>
+ if (got <= 0)
<br>
+ break;
<br>
+
<br>
+ buf[got] = 0;
<br>
+ pos_err += got;
<br>
+
<br>
+ ptr = buf;
<br>
+ while (got > 0) {
<br>
+ size_t count;
<br>
+ char tag[2];
<br>
+
<br>
+ next = strnchr(ptr, got, '\n');
<br>
+ if (next) {
<br>
+ count = next - ptr;
<br>
+ *next = 0;
<br>
+ tag[0] = '>';
<br>
+ tag[1] = '<';
<br>
+ } else {
<br>
+ count = got;
<br>
+ tag[0] = '}';
<br>
+ tag[1] = '{';
<br>
+ }
<br>
+
<br>
+ if (count > MAX_CHUNK) {
<br>
+ size_t pos;
<br>
+ char *ptr2 = ptr;
<br>
+
<br>
+ for (pos = MAX_CHUNK; pos < count; pos +=
MAX_CHUNK) {
<br>
+ char chr = ptr[pos];
<br>
+
<br>
+ ptr[pos] = 0;
<br>
+ drm_info(&i915->drm,
"[Capture/%d.%d] }%s{\n",
<br>
+ l_count, line++, ptr2);
<br>
+ ptr[pos] = chr;
<br>
+ ptr2 = ptr + pos;
<br>
+
<br>
+ /*
<br>
+ * If spewing large amounts of data via a
serial console,
<br>
+ * this can be a very slow process. So be
friendly and try
<br>
+ * not to cause 'softlockup on CPU'
problems.
<br>
+ */
<br>
+ cond_resched();
<br>
+ }
<br>
+
<br>
+ if (ptr2 < (ptr + count))
<br>
+ drm_info(&i915->drm,
"[Capture/%d.%d] %c%s%c\n",
<br>
+ l_count, line++, tag[0], ptr2,
tag[1]);
<br>
+ else if (tag[0] == '>')
<br>
+ drm_info(&i915->drm,
"[Capture/%d.%d] ><\n",
<br>
+ l_count, line++);
<br>
+ } else {
<br>
+ drm_info(&i915->drm, "[Capture/%d.%d]
%c%s%c\n",
<br>
+ l_count, line++, tag[0], ptr, tag[1]);
<br>
+ }
<br>
+
<br>
+ ptr = next;
<br>
+ got -= count;
<br>
+ if (next) {
<br>
+ ptr++;
<br>
+ got--;
<br>
+ }
<br>
+
<br>
+ /* As above. */
<br>
+ cond_resched();
<br>
+ }
<br>
+
<br>
+ if (got)
<br>
+ drm_info(&i915->drm, "[Capture/%d.%d] Got
%zd bytes remaining!\n",
<br>
+ l_count, line++, got);
<br>
+ }
<br>
+
<br>
+ kvfree(buf);
<br>
+
<br>
+ drm_info(&i915->drm, "[Capture/%d.%d] Dumped %zd
bytes\n", l_count, line++, pos_err);
<br>
+}
<br>
+#endif
<br>
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h
b/drivers/gpu/drm/i915/i915_gpu_error.h
<br>
index a91932cc65317..a78c061ce26fb 100644
<br>
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
<br>
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
<br>
@@ -258,6 +258,16 @@ static inline u32
i915_reset_engine_count(struct i915_gpu_error *error,
<br>
#define CORE_DUMP_FLAG_NONE 0x0
<br>
#define CORE_DUMP_FLAG_IS_GUC_CAPTURE BIT(0)
<br>
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) &&
IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
<br>
+void intel_klog_error_capture(struct intel_gt *gt,
<br>
+ intel_engine_mask_t engine_mask);
<br>
+#else
<br>
+static inline void intel_klog_error_capture(struct intel_gt
*gt,
<br>
+ intel_engine_mask_t engine_mask)
<br>
+{
<br>
+}
<br>
+#endif
<br>
+
<br>
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
<br>
__printf(2, 3)
<br>
</blockquote>
</blockquote>
<br>
</body>
</html>