[PATCH v4 4/4] drm/xe: Implement VM snapshot support for BOs and userptr

Maarten Lankhorst maarten.lankhorst at linux.intel.com
Tue Feb 20 13:36:35 UTC 2024



On 2024-02-16 18:43, Souza, Jose wrote:
> On Tue, 2024-02-13 at 15:52 +0100, Maarten Lankhorst wrote:
>> Since we cannot immediately capture the BOs and userptrs, perform
>> the capture in two stages. The immediate stage takes a reference to
>> each BO and userptr, while a delayed worker captures the contents
>> and then drops the references.
>>
>> This is required because in signaling context, no locks can be taken, no
>> memory can be allocated, and no waits on userspace can be performed.
>>
>> With the delayed worker, all of this can be performed very easily,
>> without having to resort to hacks.
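
(Aside for anyone reading along: the two-stage pattern described above
boils down to roughly the sketch below. This is a simplified
illustration, not the patch code; xe_vm_get()/xe_vm_put() are the
driver's real refcount helpers, but the struct and the
copy_bo_and_userptr_contents() helper are made-up stand-ins.)

	struct vm_capture {
		struct xe_vm *vm;		/* referenced in stage 1 */
		struct work_struct work;	/* runs stage 2 */
	};

	/* Stage 1: fence-signaling context. No locks may be taken and
	 * no memory allocated, so only grab references and kick off
	 * the worker; queue_work() is safe from atomic context. */
	static void capture_immediate(struct vm_capture *cap, struct xe_vm *vm)
	{
		cap->vm = xe_vm_get(vm);
		queue_work(system_unbound_wq, &cap->work);
	}

	/* Stage 2: worker context, where sleeping, locking and
	 * allocating are all allowed. Copy out the contents, then
	 * drop the stage-1 references. */
	static void capture_delayed(struct work_struct *work)
	{
		struct vm_capture *cap = container_of(work, typeof(*cap), work);

		copy_bo_and_userptr_contents(cap->vm);	/* hypothetical */
		xe_vm_put(cap->vm);
	}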
>>
>> Changes since v1:
>> - Fix crash on NULL captured vm.
>> - Use ascii85_encode to capture BO contents and save some space.
>> - Add length to coredump output for each captured area.
>> Changes since v2:
>> - Dump each mapping on its own line, to simplify tooling.
>> - Fix null pointer deref in xe_vm_snapshot_free.
>> Changes since v3:
>> - Don't add uninitialized value to snap->ofs. (Souza)
>> - Use kernel types for u32 and u64.
>> - Move snap_mutex destruction to final vm destruction. (Souza)
>>
>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst at linux.intel.com>
>> ---
>>   drivers/gpu/drm/xe/xe_devcoredump.c       |  33 ++++-
>>   drivers/gpu/drm/xe/xe_devcoredump_types.h |   8 +
>>   drivers/gpu/drm/xe/xe_vm.c                | 169 +++++++++++++++++++++-
>>   drivers/gpu/drm/xe/xe_vm.h                |   5 +
>>   4 files changed, 211 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
>> index 08d3f6cb72292..3e863e51b9d4d 100644
>> --- a/drivers/gpu/drm/xe/xe_devcoredump.c
>> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c
>> @@ -17,6 +17,7 @@
>>   #include "xe_guc_submit.h"
>>   #include "xe_hw_engine.h"
>>   #include "xe_sched_job.h"
>> +#include "xe_vm.h"
>>   
>>   /**
>>    * DOC: Xe device coredump
>> @@ -59,12 +60,22 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
>>   	return &q->gt->uc.guc;
>>   }
>>   
>> +static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
>> +{
>> +	struct xe_devcoredump_snapshot *ss = container_of(work, typeof(*ss), work);
>> +
>> +	xe_force_wake_get(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL);
>> +	if (ss->vm)
>> +		xe_vm_snapshot_capture_delayed(ss->vm);
>> +	xe_force_wake_put(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL);
>> +}
>> +
>>   static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>>   				   size_t count, void *data, size_t datalen)
>>   {
>>   	struct xe_devcoredump *coredump = data;
>>   	struct xe_device *xe = coredump_to_xe(coredump);
>> -	struct xe_devcoredump_snapshot *ss;
>> +	struct xe_devcoredump_snapshot *ss = &coredump->snapshot;
>>   	struct drm_printer p;
>>   	struct drm_print_iterator iter;
>>   	struct timespec64 ts;
>> @@ -74,12 +85,14 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>>   	if (!data || !coredump_to_xe(coredump))
>>   		return -ENODEV;
>>   
>> +	/* Ensure delayed work is captured before continuing */
>> +	flush_work(&ss->work);
>> +
>>   	iter.data = buffer;
>>   	iter.offset = 0;
>>   	iter.start = offset;
>>   	iter.remain = count;
>>   
>> -	ss = &coredump->snapshot;
>>   	p = drm_coredump_printer(&iter);
>>   
>>   	drm_printf(&p, "**** Xe Device Coredump ****\n");
>> @@ -104,6 +117,10 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
>>   		if (coredump->snapshot.hwe[i])
>>   			xe_hw_engine_snapshot_print(coredump->snapshot.hwe[i],
>>   						    &p);
>> +	if (coredump->snapshot.vm) {
>> +		drm_printf(&p, "\n**** VM state ****\n");
>> +		xe_vm_snapshot_print(coredump->snapshot.vm, &p);
>> +	}
>>   
>>   	return count - iter.remain;
>>   }
>> @@ -117,12 +134,16 @@ static void xe_devcoredump_free(void *data)
>>   	if (!data || !coredump_to_xe(coredump))
>>   		return;
>>   
>> +	cancel_work_sync(&coredump->snapshot.work);
>> +
>>   	xe_guc_ct_snapshot_free(coredump->snapshot.ct);
>>   	xe_guc_exec_queue_snapshot_free(coredump->snapshot.ge);
>>   	xe_sched_job_snapshot_free(coredump->snapshot.job);
>>   	for (i = 0; i < XE_NUM_HW_ENGINES; i++)
>>   		if (coredump->snapshot.hwe[i])
>>   			xe_hw_engine_snapshot_free(coredump->snapshot.hwe[i]);
>> +	xe_vm_snapshot_free(coredump->snapshot.vm);
>> +	memset(&coredump->snapshot, 0, sizeof(coredump->snapshot));
> 
> why this memset()?
I wanted to ensure the next snapshot starts in the same state as the
first; the snapshot struct itself is not freed here and gets re-used.
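
In other words, roughly this (a sketch, with the struct layout assumed
from the patch context; only the sub-snapshots are allocated, the
snapshot struct itself is embedded in the devcoredump):

	struct xe_devcoredump {
		bool captured;
		struct xe_devcoredump_snapshot snapshot; /* embedded, re-used */
	};

	static void devcoredump_free(struct xe_devcoredump *coredump)
	{
		/* frees what the snapshot points at... */
		xe_vm_snapshot_free(coredump->snapshot.vm);
		/* ...but the embedded struct stays around, so zero it,
		 * otherwise the next capture inherits stale pointers
		 * such as snapshot.vm from the previous hang. */
		memset(&coredump->snapshot, 0, sizeof(coredump->snapshot));
	}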

Cheers,
~Maarten

