[Intel-gfx] [PATCH v2 1/2] drm/i915: Dump error capture to kernel log

Tue May 16 22:06:03 UTC 2023

On 5/16/2023 13:52, Rodrigo Vivi wrote:
> On Tue, May 16, 2023 at 12:21:05PM -0700, John Harrison wrote:
>>     On 5/16/2023 12:17, Belgaumkar, Vinay wrote:
>>                                                                                  
>>     > On 4/18/2023 11:17 AM, [1]John.C.Harrison at Intel.com wrote:
>>                                                                                  
>>     >> From: John Harrison [2]<John.C.Harrison at Intel.com>
>>                                                                                  
>>     >> This is useful for getting debug information out in certain
>>     >> situations, such as failing kernel selftests and CI runs that don't
>>     >> log error captures. It is especially useful for things like retrieving
>>     >> GuC logs as GuC operation can't be tracked by adding printk or ftrace
>>     >> entries.
>>                                                                                  
>>     >> v2: Add CONFIG_DRM_I915_DEBUG_GEM wrapper (review feedback by Rodrigo).
> Thanks
>
>>                                                                                  
>>     > Do the CI sparse warnings hold water? With that looked at,
>>                                                                                  
>>     You mean this one totally fatal and absolutely must be fixed error?
>>                                                                                  
>>     Fast mode used, each commit won't be checked separately.
> You should never rely on this assumption. There are bisects and autobisects
> out there. Also every patch needs to be independently available for backport.
>
> So, if there's any issue it needs to be fix before we merge.
What assumption? That sparse claims failure even when there are no 
errors at all, just a notice about 'fast mode used'?  If there are 
errors, please point out where I can find them. AFAICT, the sparse email 
is actually saying the patch set is clean despite the fact it has a big 
red cross on it.

John.

>
>>                                                                                  
>>     Does anyone even know what that means or why it (apparently totally
>>     randomly) hits some patch sets and not others?
>>                                                                                  
>>     If you mean the checkpatch warnings. One is about not reporting out of
>>     memory errors (because you are supposed to return -ENOMEM and let the user
>>     handle it instead), but that doesn't apply for an internal kernel only
>>     thing which is already just a debug print. The other is about macro
>>     argument re-use, which is not an issue in this case and not worth
>>     re-writing the code to avoid.
>>                                                                                  
>>     John.
>>                                                                                  
>>     > LGTM,
>>                                                                                  
>>     > Reviewed-by: Vinay Belgaumkar [3]<vinay.belgaumkar at intel.com>
>>                                                                                  
>>     >> Signed-off-by: John Harrison [4]<John.C.Harrison at Intel.com>
>>     >> ---
>>     >>   drivers/gpu/drm/i915/i915_gpu_error.c | 132
>>     >> ++++++++++++++++++++++++++
>>     >>   drivers/gpu/drm/i915/i915_gpu_error.h |  10 ++
>>     >>   2 files changed, 142 insertions(+)
>>                                                                                  
>>     >> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c
>>     >> b/drivers/gpu/drm/i915/i915_gpu_error.c
>>     >> index f020c0086fbcd..03d62c250c465 100644
>>     >> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>>     >> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>>     >> @@ -2219,3 +2219,135 @@ void i915_disable_error_state(struct
>>     >> drm_i915_private *i915, int err)
>>     >>           i915->gpu_error.first_error = ERR_PTR(err);
>>     >>       spin_unlock_irq(&i915->gpu_error.lock);
>>     >>   }
>>     >> +
>>     >> +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
>>     >> +void intel_klog_error_capture(struct intel_gt *gt,
>>     >> +                  intel_engine_mask_t engine_mask)
>>     >> +{
>>     >> +    static int g_count;
>>     >> +    struct drm_i915_private *i915 = gt->i915;
>>     >> +    struct i915_gpu_coredump *error;
>>     >> +    intel_wakeref_t wakeref;
>>     >> +    size_t buf_size = PAGE_SIZE * 128;
>>     >> +    size_t pos_err;
>>     >> +    char *buf, *ptr, *next;
>>     >> +    int l_count = g_count++;
>>     >> +    int line = 0;
>>     >> +
>>     >> +    /* Can't allocate memory during a reset */
>>     >> +    if (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
>>     >> +        drm_err(&gt->i915->drm, "[Capture/%d.%d] Inside GT reset,
>>     >> skipping error capture :(\n",
>>     >> +            l_count, line++);
>>     >> +        return;
>>     >> +    }
>>     >> +
>>     >> +    error = READ_ONCE(i915->gpu_error.first_error);
>>     >> +    if (error) {
>>     >> +        drm_err(&i915->drm, "[Capture/%d.%d] Clearing existing error
>>     >> capture first...\n",
>>     >> +            l_count, line++);
>>     >> +        i915_reset_error_state(i915);
>>     >> +    }
>>     >> +
>>     >> +    with_intel_runtime_pm(&i915->runtime_pm, wakeref)
>>     >> +        error = i915_gpu_coredump(gt, engine_mask,
>>     >> CORE_DUMP_FLAG_NONE);
>>     >> +
>>     >> +    if (IS_ERR(error)) {
>>     >> +        drm_err(&i915->drm, "[Capture/%d.%d] Failed to capture error
>>     >> capture: %ld!\n",
>>     >> +            l_count, line++, PTR_ERR(error));
>>     >> +        return;
>>     >> +    }
>>     >> +
>>     >> +    buf = kvmalloc(buf_size, GFP_KERNEL);
>>     >> +    if (!buf) {
>>     >> +        drm_err(&i915->drm, "[Capture/%d.%d] Failed to allocate buffer
>>     >> for error capture!\n",
>>     >> +            l_count, line++);
>>     >> +        i915_gpu_coredump_put(error);
>>     >> +        return;
>>     >> +    }
>>     >> +
>>     >> +    drm_info(&i915->drm, "[Capture/%d.%d] Dumping i915 error capture
>>     >> for %ps...\n",
>>     >> +         l_count, line++, __builtin_return_address(0));
>>     >> +
>>     >> +    /* Largest string length safe to print via dmesg */
>>     >> +#    define MAX_CHUNK    800
>>     >> +
>>     >> +    pos_err = 0;
>>     >> +    while (1) {
>>     >> +        ssize_t got = i915_gpu_coredump_copy_to_buffer(error, buf,
>>     >> pos_err, buf_size - 1);
>>     >> +
>>     >> +        if (got <= 0)
>>     >> +            break;
>>     >> +
>>     >> +        buf[got] = 0;
>>     >> +        pos_err += got;
>>     >> +
>>     >> +        ptr = buf;
>>     >> +        while (got > 0) {
>>     >> +            size_t count;
>>     >> +            char tag[2];
>>     >> +
>>     >> +            next = strnchr(ptr, got, '\n');
>>     >> +            if (next) {
>>     >> +                count = next - ptr;
>>     >> +                *next = 0;
>>     >> +                tag[0] = '>';
>>     >> +                tag[1] = '<';
>>     >> +            } else {
>>     >> +                count = got;
>>     >> +                tag[0] = '}';
>>     >> +                tag[1] = '{';
>>     >> +            }
>>     >> +
>>     >> +            if (count > MAX_CHUNK) {
>>     >> +                size_t pos;
>>     >> +                char *ptr2 = ptr;
>>     >> +
>>     >> +                for (pos = MAX_CHUNK; pos < count; pos += MAX_CHUNK) {
>>     >> +                    char chr = ptr[pos];
>>     >> +
>>     >> +                    ptr[pos] = 0;
>>     >> +                    drm_info(&i915->drm, "[Capture/%d.%d] }%s{\n",
>>     >> +                         l_count, line++, ptr2);
>>     >> +                    ptr[pos] = chr;
>>     >> +                    ptr2 = ptr + pos;
>>     >> +
>>     >> +                    /*
>>     >> +                     * If spewing large amounts of data via a serial
>>     >> console,
>>     >> +                     * this can be a very slow process. So be friendly
>>     >> and try
>>     >> +                     * not to cause 'softlockup on CPU' problems.
>>     >> +                     */
>>     >> +                    cond_resched();
>>     >> +                }
>>     >> +
>>     >> +                if (ptr2 < (ptr + count))
>>     >> +                    drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n",
>>     >> +                         l_count, line++, tag[0], ptr2, tag[1]);
>>     >> +                else if (tag[0] == '>')
>>     >> +                    drm_info(&i915->drm, "[Capture/%d.%d] ><\n",
>>     >> +                         l_count, line++);
>>     >> +            } else {
>>     >> +                drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n",
>>     >> +                     l_count, line++, tag[0], ptr, tag[1]);
>>     >> +            }
>>     >> +
>>     >> +            ptr = next;
>>     >> +            got -= count;
>>     >> +            if (next) {
>>     >> +                ptr++;
>>     >> +                got--;
>>     >> +            }
>>     >> +
>>     >> +            /* As above. */
>>     >> +            cond_resched();
>>     >> +        }
>>     >> +
>>     >> +        if (got)
>>     >> +            drm_info(&i915->drm, "[Capture/%d.%d] Got %zd bytes
>>     >> remaining!\n",
>>     >> +                 l_count, line++, got);
>>     >> +    }
>>     >> +
>>     >> +    kvfree(buf);
>>     >> +
>>     >> +    drm_info(&i915->drm, "[Capture/%d.%d] Dumped %zd bytes\n",
>>     >> l_count, line++, pos_err);
>>     >> +}
>>     >> +#endif
>>     >> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h
>>     >> b/drivers/gpu/drm/i915/i915_gpu_error.h
>>     >> index a91932cc65317..a78c061ce26fb 100644
>>     >> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
>>     >> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
>>     >> @@ -258,6 +258,16 @@ static inline u32 i915_reset_engine_count(struct
>>     >> i915_gpu_error *error,
>>     >>   #define CORE_DUMP_FLAG_NONE           0x0
>>     >>   #define CORE_DUMP_FLAG_IS_GUC_CAPTURE BIT(0)
>>     >>   +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) &&
>>     >> IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
>>     >> +void intel_klog_error_capture(struct intel_gt *gt,
>>     >> +                  intel_engine_mask_t engine_mask);
>>     >> +#else
>>     >> +static inline void intel_klog_error_capture(struct intel_gt *gt,
>>     >> +                        intel_engine_mask_t engine_mask)
>>     >> +{
>>     >> +}
>>     >> +#endif
>>     >> +
>>     >>   #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
>>     >>     __printf(2, 3)
>>
>> References
>>
>>     Visible links
>>     1. mailto:John.C.Harrison at intel.com
>>     2. mailto:John.C.Harrison at intel.com
>>     3. mailto:vinay.belgaumkar at intel.com
>>     4. mailto:John.C.Harrison at intel.com