[Intel-gfx] [PATCH v2 1/2] drm/i915: Dump error capture to kernel log

Belgaumkar, Vinay vinay.belgaumkar at intel.com
Tue May 16 19:17:08 UTC 2023


On 4/18/2023 11:17 AM, John.C.Harrison at Intel.com wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
>
> This is useful for getting debug information out in certain
> situations, such as failing kernel selftests and CI runs that don't
> log error captures. It is especially useful for things like retrieving
> GuC logs as GuC operation can't be tracked by adding printk or ftrace
> entries.
>
> v2: Add CONFIG_DRM_I915_DEBUG_GEM wrapper (review feedback by Rodrigo).

Do the CI sparse warnings hold water? With that looked at,

LGTM,

Reviewed-by: Vinay Belgaumkar <vinay.belgaumkar at intel.com>

>
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gpu_error.c | 132 ++++++++++++++++++++++++++
>   drivers/gpu/drm/i915/i915_gpu_error.h |  10 ++
>   2 files changed, 142 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index f020c0086fbcd..03d62c250c465 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -2219,3 +2219,135 @@ void i915_disable_error_state(struct drm_i915_private *i915, int err)
>   		i915->gpu_error.first_error = ERR_PTR(err);
>   	spin_unlock_irq(&i915->gpu_error.lock);
>   }
> +
> +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
> +void intel_klog_error_capture(struct intel_gt *gt,
> +			      intel_engine_mask_t engine_mask)
> +{
> +	static int g_count;
> +	struct drm_i915_private *i915 = gt->i915;
> +	struct i915_gpu_coredump *error;
> +	intel_wakeref_t wakeref;
> +	size_t buf_size = PAGE_SIZE * 128;
> +	size_t pos_err;
> +	char *buf, *ptr, *next;
> +	int l_count = g_count++;
> +	int line = 0;
> +
> +	/* Can't allocate memory during a reset */
> +	if (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
> +		drm_err(&gt->i915->drm, "[Capture/%d.%d] Inside GT reset, skipping error capture :(\n",
> +			l_count, line++);
> +		return;
> +	}
> +
> +	error = READ_ONCE(i915->gpu_error.first_error);
> +	if (error) {
> +		drm_err(&i915->drm, "[Capture/%d.%d] Clearing existing error capture first...\n",
> +			l_count, line++);
> +		i915_reset_error_state(i915);
> +	}
> +
> +	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
> +		error = i915_gpu_coredump(gt, engine_mask, CORE_DUMP_FLAG_NONE);
> +
> +	if (IS_ERR(error)) {
> +		drm_err(&i915->drm, "[Capture/%d.%d] Failed to capture error capture: %ld!\n",
> +			l_count, line++, PTR_ERR(error));
> +		return;
> +	}
> +
> +	buf = kvmalloc(buf_size, GFP_KERNEL);
> +	if (!buf) {
> +		drm_err(&i915->drm, "[Capture/%d.%d] Failed to allocate buffer for error capture!\n",
> +			l_count, line++);
> +		i915_gpu_coredump_put(error);
> +		return;
> +	}
> +
> +	drm_info(&i915->drm, "[Capture/%d.%d] Dumping i915 error capture for %ps...\n",
> +		 l_count, line++, __builtin_return_address(0));
> +
> +	/* Largest string length safe to print via dmesg */
> +#	define MAX_CHUNK	800
> +
> +	pos_err = 0;
> +	while (1) {
> +		ssize_t got = i915_gpu_coredump_copy_to_buffer(error, buf, pos_err, buf_size - 1);
> +
> +		if (got <= 0)
> +			break;
> +
> +		buf[got] = 0;
> +		pos_err += got;
> +
> +		ptr = buf;
> +		while (got > 0) {
> +			size_t count;
> +			char tag[2];
> +
> +			next = strnchr(ptr, got, '\n');
> +			if (next) {
> +				count = next - ptr;
> +				*next = 0;
> +				tag[0] = '>';
> +				tag[1] = '<';
> +			} else {
> +				count = got;
> +				tag[0] = '}';
> +				tag[1] = '{';
> +			}
> +
> +			if (count > MAX_CHUNK) {
> +				size_t pos;
> +				char *ptr2 = ptr;
> +
> +				for (pos = MAX_CHUNK; pos < count; pos += MAX_CHUNK) {
> +					char chr = ptr[pos];
> +
> +					ptr[pos] = 0;
> +					drm_info(&i915->drm, "[Capture/%d.%d] }%s{\n",
> +						 l_count, line++, ptr2);
> +					ptr[pos] = chr;
> +					ptr2 = ptr + pos;
> +
> +					/*
> +					 * If spewing large amounts of data via a serial console,
> +					 * this can be a very slow process. So be friendly and try
> +					 * not to cause 'softlockup on CPU' problems.
> +					 */
> +					cond_resched();
> +				}
> +
> +				if (ptr2 < (ptr + count))
> +					drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n",
> +						 l_count, line++, tag[0], ptr2, tag[1]);
> +				else if (tag[0] == '>')
> +					drm_info(&i915->drm, "[Capture/%d.%d] ><\n",
> +						 l_count, line++);
> +			} else {
> +				drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n",
> +					 l_count, line++, tag[0], ptr, tag[1]);
> +			}
> +
> +			ptr = next;
> +			got -= count;
> +			if (next) {
> +				ptr++;
> +				got--;
> +			}
> +
> +			/* As above. */
> +			cond_resched();
> +		}
> +
> +		if (got)
> +			drm_info(&i915->drm, "[Capture/%d.%d] Got %zd bytes remaining!\n",
> +				 l_count, line++, got);
> +	}
> +
> +	kvfree(buf);
> +
> +	drm_info(&i915->drm, "[Capture/%d.%d] Dumped %zd bytes\n", l_count, line++, pos_err);
> +}
> +#endif
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index a91932cc65317..a78c061ce26fb 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -258,6 +258,16 @@ static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
>   #define CORE_DUMP_FLAG_NONE           0x0
>   #define CORE_DUMP_FLAG_IS_GUC_CAPTURE BIT(0)
>   
> +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) && IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
> +void intel_klog_error_capture(struct intel_gt *gt,
> +			      intel_engine_mask_t engine_mask);
> +#else
> +static inline void intel_klog_error_capture(struct intel_gt *gt,
> +					    intel_engine_mask_t engine_mask)
> +{
> +}
> +#endif
> +
>   #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
>   
>   __printf(2, 3)


More information about the Intel-gfx mailing list