[PATCH v5 1/4] drm/xe: Add devcoredump chunking

John Harrison john.c.harrison at intel.com
Thu Apr 17 19:59:35 UTC 2025


On 4/16/2025 5:04 PM, Matthew Brost wrote:
> On Wed, Apr 16, 2025 at 02:35:42PM -0700, John Harrison wrote:
>> On 4/16/2025 1:35 PM, Matthew Brost wrote:
>>> On Wed, Apr 16, 2025 at 12:30:27PM -0700, John Harrison wrote:
>>>> On 4/15/2025 10:44 PM, Matthew Brost wrote:
>>>>> On Tue, Apr 15, 2025 at 04:18:29PM -0700, John Harrison wrote:
>>>>>> On 4/15/2025 11:39 AM, Matthew Brost wrote:
>>>>>>> Chunk the devcoredump into 1.5G pieces to avoid hitting the kvmalloc
>>>>>>> limit of 2G. A simple algorithm reads 1.5G at a time in the
>>>>>>> xe_devcoredump_read callback as needed.
>>>>>>>
>>>>>>> Some memory allocations are changed to GFP_ATOMIC as they are done
>>>>>>> in xe_devcoredump_read, which holds a lock in the reclaim path. The
>>>>>>> allocations are small, so in practice they should never fail.
>>>>>>>
>>>>>>> v2:
>>>>>>>      - Update commit message wrt gfp atomic (John H)
>>>>>>>
>>>>>>> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
>>>>>>> Reviewed-by: Jonathan Cavitt <jonathan.cavitt at intel.com>
>>>>>>> ---
>>>>>>>      drivers/gpu/drm/xe/xe_devcoredump.c       | 59 ++++++++++++++++++-----
>>>>>>>      drivers/gpu/drm/xe/xe_devcoredump_types.h |  2 +
>>>>>>>      drivers/gpu/drm/xe/xe_guc_hwconfig.c      |  2 +-
>>>>>>>      3 files changed, 50 insertions(+), 13 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
>>>>>>> index 81b9d9bb3f57..a9e618abf8ac 100644
>>>>>>> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
>>>>>>> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
>>>>>>> @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
>>>>>>>      	return &q->gt->uc.guc;
>>>>>>>      }
>>>>>>> -static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
>>>>>>> +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count,
>>>>>>> +				     ssize_t start,
>>>>>>>      				     struct xe_devcoredump *coredump)
>>>>>>>      {
>>>>>>>      	struct xe_device *xe;
>>>>>>> @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
>>>>>>>      	ss = &coredump->snapshot;
>>>>>>>      	iter.data = buffer;
>>>>>>> -	iter.start = 0;
>>>>>>> +	iter.start = start;
>>>>>>>      	iter.remain = count;
>>>>>>>      	p = drm_coredump_printer(&iter);
>>>>>>> @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
>>>>>>>      	ss->vm = NULL;
>>>>>>>      }
>>>>>>> +#define XE_DEVCOREDUMP_CHUNK_MAX	(SZ_512M + SZ_1G)
>>>>>>> +
>>>>>>>      static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>>>>>>>      				   size_t count, void *data, size_t datalen)
>>>>>>>      {
>>>>>>> @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>>>>>>>      	/* Ensure delayed work is captured before continuing */
>>>>>>>      	flush_work(&ss->work);
>>>>>>> +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
>>>>>>> +		xe_pm_runtime_get(gt_to_xe(ss->gt));
>>>>>>> +
>>>>>>>      	mutex_lock(&coredump->lock);
>>>>>>>      	if (!ss->read.buffer) {
>>>>>>> @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>>>>>>>      		return 0;
>>>>>>>      	}
>>>>>>> +	if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX ||
>>>>>>> +	    offset < ss->read.chunk_position) {
>>>>>>> +		ss->read.chunk_position =
>>>>>>> +			ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX);
>>>>>>> +
>>>>>>> +		__xe_devcoredump_read(ss->read.buffer,
>>>>>>> +				      XE_DEVCOREDUMP_CHUNK_MAX,
>>>>>>> +				      ss->read.chunk_position, coredump);
>>>>>>> +	}
>>>>>>> +
>>>>>>>      	byte_copied = count < ss->read.size - offset ? count :
>>>>>>>      		ss->read.size - offset;
>>>>>>> -	memcpy(buffer, ss->read.buffer + offset, byte_copied);
>>>>>>> +	memcpy(buffer, ss->read.buffer +
>>>>>>> +	       (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied);
>>>>>>>      	mutex_unlock(&coredump->lock);
>>>>>>> +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX)
>>>>>>> +		xe_pm_runtime_put(gt_to_xe(ss->gt));
>>>>>>> +
>>>>>>>      	return byte_copied;
>>>>>>>      }
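
(Aside: the offset handling in the hunk above boils down to the arithmetic
below. A minimal standalone sketch, not driver code; CHUNK_MAX mirrors
XE_DEVCOREDUMP_CHUNK_MAX and everything else is illustrative.)

#include <stdio.h>

#define CHUNK_MAX (1536UL * 1024 * 1024)	/* SZ_512M + SZ_1G */

int main(void)
{
	unsigned long offsets[] = { 0, CHUNK_MAX - 1, CHUNK_MAX,
				    2 * CHUNK_MAX + 4096 };

	for (int i = 0; i < 4; i++) {
		unsigned long off = offsets[i];
		/* Which chunk does this read fall in? (ALIGN_DOWN) */
		unsigned long chunk_position = off - (off % CHUNK_MAX);
		/* Index into the re-rendered chunk buffer. */
		unsigned long buffer_index = off % CHUNK_MAX;

		printf("offset %lu -> chunk %lu, buffer index %lu\n",
		       off, chunk_position, buffer_index);
	}

	return 0;
}

A read whose offset falls outside [chunk_position, chunk_position +
CHUNK_MAX) re-renders that chunk into ss->read.buffer, and the memcpy()
then indexes the buffer with offset % CHUNK_MAX rather than the absolute
offset.
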
>>>>>>> @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
>>>>>>>      	xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
>>>>>>>      	xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
>>>>>>> -	xe_pm_runtime_put(xe);
>>>>>>> +	ss->read.chunk_position = 0;
>>>>>>>      	/* Calculate devcoredump size */
>>>>>>> -	ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
>>>>>>> -
>>>>>>> -	ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
>>>>>>> -	if (!ss->read.buffer)
>>>>>>> -		return;
>>>>>>> +	ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump);
>>>>>>> +
>>>>>>> +	if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) {
>>>>>>> +		ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX,
>>>>>>> +					   GFP_USER);
>>>>>>> +		if (!ss->read.buffer)
>>>>>>> +			goto put_pm;
>>>>>>> +
>>>>>>> +		__xe_devcoredump_read(ss->read.buffer,
>>>>>>> +				      XE_DEVCOREDUMP_CHUNK_MAX,
>>>>>>> +				      0, coredump);
>>>>>>> +	} else {
>>>>>>> +		ss->read.buffer = kvmalloc(ss->read.size, GFP_USER);
>>>>>>> +		if (!ss->read.buffer)
>>>>>>> +			goto put_pm;
>>>>>>> +
>>>>>>> +		__xe_devcoredump_read(ss->read.buffer, ss->read.size, 0,
>>>>>>> +				      coredump);
>>>>>>> +		xe_devcoredump_snapshot_free(ss);
>>>>>>> +	}
>>>>>>> -	__xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump);
>>>>>>> -	xe_devcoredump_snapshot_free(ss);
>>>>>>> +put_pm:
>>>>>>> +	xe_pm_runtime_put(xe);
>>>>>>>      }
>>>>>>>      static void devcoredump_snapshot(struct xe_devcoredump *coredump,
>>>>>>> @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi
>>>>>>>      	if (offset & 3)
>>>>>>>      		drm_printf(p, "Offset not word aligned: %zu", offset);
>>>>>>> -	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL);
>>>>>>> +	line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC);
>>>>>>>      	if (!line_buff) {
>>>>>>>      		drm_printf(p, "Failed to allocate line buffer\n");
>>>>>>>      		return;
>>>>>>> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
>>>>>>> index 1a1d16a96b2d..a174385a6d83 100644
>>>>>>> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
>>>>>>> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
>>>>>>> @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot {
>>>>>>>      	struct {
>>>>>>>      		/** @read.size: size of devcoredump in human readable format */
>>>>>>>      		ssize_t size;
>>>>>>> +		/** @read.chunk_position: position of devcoredump chunk */
>>>>>>> +		ssize_t chunk_position;
>>>>>>>      		/** @read.buffer: buffer of devcoredump in human readable format */
>>>>>>>      		char *buffer;
>>>>>>>      	} read;
>>>>>>> diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
>>>>>>> index af2c817d552c..21403a250834 100644
>>>>>>> --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c
>>>>>>> +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c
>>>>>>> @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val)
>>>>>>>      	if (num_dw == 0)
>>>>>>>      		return -EINVAL;
>>>>>>> -	hwconfig = kzalloc(size, GFP_KERNEL);
>>>>>>> +	hwconfig = kzalloc(size, GFP_ATOMIC);
>>>>>> As per earlier comments, this should not be happening while dumping a core
>>>>>> dump. The only code which uses this is steering related. That implies we are
>>>>>> reading EU registers while printing out a previously saved core dump. That
>>>>>> either means we are unnecessarily reading registers during the print or we
>>>>>> are not capturing state in advance and only actually reading it out when
>>>>>> printing. Neither of which is good.
>>>>>>
>>>>> Ok, I don't have bandwidth to fix that. I'll let this series die then.
>>>>>
>>>>> Matt
>>>> Or maybe just add a FIXME comment? I've been meaning to take a look at this
>>>> myself but just haven't had the time.
>>>>
>>> I guess that is a better option. I can add a FIXME when merging.
>>>
>>>> If we need the large file support to allow people to debug issues then we
>>>> should get it in. Your patch is not causing the bad register accesses (I
>>> Yes, this is an ask. And no, this patch isn't causing the bad register
>>> access; rather, it exposes it via a lockdep splat.
>>>
>>>> don't think), that is something that is already broken in the driver. But I
>>>> don't think you should make this change with a comment suggesting it is all
>>>> fine and happy. The commit description should be more like "FIXME: This
>>>> should not be happening, but until the unexpected register reads are
>>>> removed/moved make the alloc atomic so it doesn't go boom" and add a similar
>>>> comment to the allocation itself.
>>>>
>>> Ok, can fix commit too when merging.
>>>
>>> Matt
>> Sounds good.
>>
>> Although I've worked out where the dodgy reads are coming from. It's the GuC
>> backend for the register capture code. And it's not actually doing register
>> reads, but just re-calculating the steering for the register value that it
>> had previously read at capture time. It should be possible to cache that
>> information as well, rather than re-calculate it at dump time. And that will
>> get rid of the need for the atomic alloc. Will try to get something posted
>> later today. Up to you if you want to hold off merging for another day or
>> not.
>>
> I'll wait then I guess.
>
> Matt
Posted: https://patchwork.freedesktop.org/series/147940/

Turns out the capture of steered registers was just completely broken 
anyway! So it was definitely worth chasing this down.
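
(Illustration only: the caching approach described in the quoted discussion
above, i.e. resolve the steering once at capture time so the print path is
pure formatting, can be modelled roughly as below. Standalone sketch with
made-up names (snapshot_reg, resolve_group, read_steered, ...), not the
actual xe capture structures or helpers.)

#include <stdio.h>
#include <stdint.h>

/* Made-up snapshot entry: value *and* steering resolved at capture time. */
struct snapshot_reg {
	uint32_t addr;
	uint32_t value;
	uint8_t group;		/* steering group, cached at capture */
	uint8_t instance;	/* steering instance, cached at capture */
};

/* Stand-ins for the capture-time steering/hardware accesses. */
static uint8_t resolve_group(uint32_t addr)    { return (addr >> 16) & 0x3; }
static uint8_t resolve_instance(uint32_t addr) { return (addr >> 8) & 0x3; }
static uint32_t read_steered(uint32_t addr, uint8_t grp, uint8_t inst)
{
	/* Fake register value, stands in for a steered MMIO read. */
	return addr ^ ((uint32_t)grp << 24) ^ ((uint32_t)inst << 16);
}

/* Capture path: the only place that touches "hardware" or steering. */
static void capture_reg(struct snapshot_reg *snap, uint32_t addr)
{
	snap->addr = addr;
	snap->group = resolve_group(addr);
	snap->instance = resolve_instance(addr);
	snap->value = read_steered(addr, snap->group, snap->instance);
}

/* Print path: pure formatting, no lookups, safe in the read/print phase. */
static void print_reg(const struct snapshot_reg *snap)
{
	printf("[%#x] (%u, %u) = %#x\n", snap->addr,
	       (unsigned int)snap->group, (unsigned int)snap->instance,
	       snap->value);
}

int main(void)
{
	struct snapshot_reg snap;

	capture_reg(&snap, 0x123456);
	print_reg(&snap);
	return 0;
}
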

John.

>
>> John.
>>
>>
>>>> John.
>>>>
>>>>>> Rather than making allocations atomic when they don't need to be, we
>>>>>> should be fixing the fact that the coredump does not cleanly separate
>>>>>> save from print.
>>>>>>
>>>>>> John.
>>>>>>
>>>>>>
>>>>>>>      	if (!hwconfig)
>>>>>>>      		return -ENOMEM;


