[Intel-gfx] [PATCH 6/7] drm/i915/guc: Copy new GuC error capture logs upon G2H notification.
Teres Alexis, Alan Previn
alan.previn.teres.alexis at intel.com
Wed Jan 19 01:36:18 UTC 2022
A fresh round of offline design discussions with internal team has decided
that:
- we don't want to use an interim circular buffer to copy all of the GuC
generated register dumps of one or more 'engine-capture-groups'.
- instead, we shall dynamically allocate multiple nodes, each node being
a single "engine-capture-dump". A linked list of one or many engine-capture-
dumps would result from a single engine-capture-group.
- this dynamic allocation should happen during the G2H error-capture
notification event which happens before the corresponding G2H context-
reset that triggers the i915_gpu_coredump (where we want to avoid
memory allocation moving forward).
- we also realize that during the linked-list allocation we would need
a first-parse of the guc-log-error-state-capture head-to-tail entries
in order to duplicate global and engine-class register dumps for
each engine instance register dump if we find dependent-engine resets
in an engine-capture-group.
- later when i915_gpu_coredump calls into capture_engine, we finally
attach the corresponding node from the linked list above (detaching it
from that linked list) into i915_gpu_coredump's intel_engine_coredump
structure when we have matching LRCA/guc-id/engine-instance.
- we would also have to add a flag, plumbed from the i915_gpu_coredump top-level
trigger through to capture_engine, to indicate if the capture was triggered
via a guc context reset or a forced user reset or gt-reset. In the latter
case (user/gt reset), we should capture the register values directly
from the HW, i.e. the pre-guc behavior without matching against GuC.
...alan
On Tue, 2022-01-18 at 02:03 -0800, Alan Previn wrote:
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> index b637628905ec..fc80c5f31915 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c+static void __guc_capture_store_snapshot_work(struct intel_guc *guc)
..
..
> +{
> + struct drm_i915_private *i915 = guc_to_gt(guc)->i915;
> + unsigned int buffer_size, read_offset, write_offset, bytes_to_copy, full_count;
> + struct guc_log_buffer_state *log_buf_state;
> + struct guc_log_buffer_state log_buf_state_local;
> + void *src_data, *dst_data = NULL;
> + bool new_overflow;
> +
> + /* Lock to get the pointer to GuC capture-log-buffer-state */
> + mutex_lock(&guc->log_state[GUC_CAPTURE_LOG_BUFFER].lock);
> + log_buf_state = guc->log.buf_addr +
> + (sizeof(struct guc_log_buffer_state) * GUC_CAPTURE_LOG_BUFFER);
> + src_data = guc->log.buf_addr + intel_guc_get_log_buffer_offset(GUC_CAPTURE_LOG_BUFFER);
> +
> + /*
> + * Make a copy of the state structure, inside GuC log buffer
> + * (which is uncached mapped), on the stack to avoid reading
> + * from it multiple times.
> + */
> + memcpy(&log_buf_state_local, log_buf_state, sizeof(struct guc_log_buffer_state));
> + buffer_size = intel_guc_get_log_buffer_size(GUC_CAPTURE_LOG_BUFFER);
> + read_offset = log_buf_state_local.read_ptr;
> + write_offset = log_buf_state_local.sampled_write_ptr;
> + full_count = log_buf_state_local.buffer_full_cnt;
> +
> + /* Bookkeeping stuff */
> + guc->log_state[GUC_CAPTURE_LOG_BUFFER].flush += log_buf_state_local.flush_to_file;
> + new_overflow = intel_guc_check_log_buf_overflow(guc,
> + &guc->log_state[GUC_CAPTURE_LOG_BUFFER],
> + full_count);
> +
> + /* Update the state of shared log buffer */
> + log_buf_state->read_ptr = write_offset;
> + log_buf_state->flush_to_file = 0;
> +
> + mutex_unlock(&guc->log_state[GUC_CAPTURE_LOG_BUFFER].lock);
> +
> + dst_data = guc->capture.priv->out_store.addr;
> + if (dst_data) {
> + mutex_lock(&guc->capture.priv->out_store.lock);
> +
> + /* Now copy the actual logs. */
> + if (unlikely(new_overflow)) {
> + /* copy the whole buffer in case of overflow */
> + read_offset = 0;
> + write_offset = buffer_size;
> + } else if (unlikely((read_offset > buffer_size) ||
> + (write_offset > buffer_size))) {
> + drm_err(&i915->drm, "invalid GuC log capture buffer state!\n");
> + /* copy whole buffer as offsets are unreliable */
> + read_offset = 0;
> + write_offset = buffer_size;
> + }
> +
> + /* first copy from the tail end of the GuC log capture buffer */
> + if (read_offset > write_offset) {
> + guc_capture_store_insert(guc, &guc->capture.priv->out_store, src_data,
> + write_offset);
> + bytes_to_copy = buffer_size - read_offset;
> + } else {
> + bytes_to_copy = write_offset - read_offset;
> + }
> + guc_capture_store_insert(guc, &guc->capture.priv->out_store, src_data + read_offset,
> + bytes_to_copy);
> +
> + mutex_unlock(&guc->capture.priv->out_store.lock);
> + }
> +}
> +
> +void intel_guc_capture_store_snapshot(struct intel_guc *guc)
> +{
> + if (guc->capture.priv)
> + __guc_capture_store_snapshot_work(guc);
> +}
> +
> +static void guc_capture_store_destroy(struct intel_guc *guc)
> +{
> + mutex_destroy(&guc->capture.priv->out_store.lock);
> + guc->capture.priv->out_store.size = 0;
> + kfree(guc->capture.priv->out_store.addr);
> + guc->capture.priv->out_store.addr = NULL;
> +}
> +
> +static int guc_capture_store_create(struct intel_guc *guc)
> +{
> + /*
> + * Make this interim buffer larger than GuC capture output buffer so that we can absorb
> + * a little delay when processing the raw capture dumps into text friendly logs
> + * for the i915_gpu_coredump output
> + */
> + size_t max_dump_size;
> +
> + GEM_BUG_ON(guc->capture.priv->out_store.addr);
> +
> + max_dump_size = PAGE_ALIGN(intel_guc_capture_output_min_size_est(guc));
> + max_dump_size = roundup_pow_of_two(max_dump_size);
> +
> + guc->capture.priv->out_store.addr = kzalloc(max_dump_size, GFP_KERNEL);
> + if (!guc->capture.priv->out_store.addr)
> + return -ENOMEM;
> +
> + guc->capture.priv->out_store.size = max_dump_size;
> + mutex_init(&guc->capture.priv->out_store.lock);
> +
> + return 0;
> +}
> +
More information about the Intel-gfx
mailing list