[PATCH v4 4/4] drm/xe: Fix use after free when client stats are captured

Lucas De Marchi lucas.demarchi at intel.com
Thu Aug 8 20:06:32 UTC 2024


On Mon, Jul 22, 2024 at 03:00:25PM GMT, Rodrigo Vivi wrote:
>On Thu, Jul 18, 2024 at 02:05:48PM -0700, Umesh Nerlige Ramappa wrote:
>> xe_file_close triggers an asynchronous queue cleanup and then frees up
>> the xef object. Since queue cleanup flushes all pending jobs and the KMD
>> stores client usage stats into the xef object after jobs are flushed, we
>> see a use-after-free for the xef object. Resolve this by taking a
>> reference to xef from xe_exec_queue.
>>
>> While at it, revert an earlier change that contained a partial
>> workaround for this issue.
>>
>> v2:
>> - Take a ref to xef even for the VM bind queue (Matt)
>> - Squash patches relevant to that fix and workaround (Lucas)
>>
>> v3: Fix typo (Lucas)
>>
>> Fixes: ce62827bc294 ("drm/xe: Do not access xe file when updating exec queue run_ticks")
>> Fixes: 6109f24f87d7 ("drm/xe: Add helper to accumulate exec queue runtime")
>> Closes: https://gitlab.freedesktop.org/drm/xe/kernel/issues/1908
>
>Hi Umesh,
>
>first of all thanks for this fix.
>
>But I'd like to ask that next time you provide the fix in a single patch,
>or have the first patch be a dirty fix with a refactor on top, so it gets
>easily backported to the fixes and stable branches.

I think the most important thing is to have them in 6.11. The worst-case
scenario is to bring them to 6.10, where the bug was introduced, but I
don't think that's really needed: there was already a stopgap solution
in 6.10.
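
For whoever ends up doing that manual backport: the essence of the fix
is that the exec queue now holds its own reference on the xe_file,
taken at creation and dropped in __xe_exec_queue_free(), so the
deferred run-ticks update can no longer touch freed memory. Below is a
minimal userspace model of that pattern, just my sketch to illustrate
the lifetime rule, not the driver code (the real series adds
xe_file_get()/xe_file_put() on top of a kref):

#include <stdio.h>
#include <stdlib.h>

struct file_obj {			/* stand-in for struct xe_file */
	int ref;			/* stand-in for the kref */
	unsigned long run_ticks[8];	/* per-engine-class totals */
};

static struct file_obj *file_get(struct file_obj *f)
{
	f->ref++;
	return f;
}

static void file_put(struct file_obj *f)
{
	if (--f->ref == 0)
		free(f);
}

struct queue {				/* stand-in for struct xe_exec_queue */
	struct file_obj *xef;		/* back pointer, now refcounted */
	int class;
	int width;
};

/* Mirrors xe_exec_queue_update_run_ticks(): accumulate directly into
 * the file object, which the queue's own reference keeps alive. */
static void queue_update_run_ticks(struct queue *q, unsigned long delta)
{
	q->xef->run_ticks[q->class] += delta * q->width;
}

int main(void)
{
	struct file_obj *f = calloc(1, sizeof(*f));
	struct queue q = { .class = 0, .width = 1 };

	if (!f)
		return 1;

	f->ref = 1;			/* ref held by the open file itself */
	q.xef = file_get(f);		/* queue creation takes its own ref */

	file_put(f);			/* "file close": f survives, ref == 1 */
	queue_update_run_ticks(&q, 100);/* deferred stats update: no UAF */
	printf("class 0 ticks: %lu\n", q.xef->run_ticks[0]);

	file_put(q.xef);		/* queue teardown drops the last ref */
	return 0;
}

In the real driver the stats path reaches xef through q->vm->xef, which
is why the series also takes a reference when the user creates a VM
(bf5ef88cebdb).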

>
>For drm-xe-next-fixes I'm now picking this entire series. But for any
>stable we will need to provide a backport that applies there.

Did the pull request you mentioned fall through the cracks? I don't see
any of the commits in 627a24f5f25d..drm-xe/drm-xe-next-fixes in 6.11-rc2:

$ git log --oneline v6.11-rc2..drm-xe/drm-xe-next-fixes
0cf3f7297d01 (drm-xe/drm-xe-next-fixes, drm-xe-next-fixes) drm/xe/vf: Fix register value lookup
5beb9a114a42 drm/xe: Fix use after free when client stats are captured
bf5ef88cebdb drm/xe: Take a ref to xe file when user creates a VM
6896cc5479b0 drm/xe: Add ref counting for xe_file
676a56fdbcf8 drm/xe: Move part of xe_file cleanup to a helper
ea0cabb53f9a drm/xe: Validate user fence during creation

And I don't see them there via other branches either.
Could you include them via drm-xe-fixes?

thanks
Lucas De Marchi

>
>Thanks,
>Rodrigo.
>
>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>> Reviewed-by: Matthew Brost <matthew.brost at intel.com>
>> Reviewed-by: Lucas De Marchi <lucas.demarchi at intel.com>
>> ---
>>  drivers/gpu/drm/xe/xe_drm_client.c       |  5 +----
>>  drivers/gpu/drm/xe/xe_exec_queue.c       | 10 +++++++++-
>>  drivers/gpu/drm/xe/xe_exec_queue_types.h |  7 +++----
>>  3 files changed, 13 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c
>> index 6a26923fa10e..7ddd59908334 100644
>> --- a/drivers/gpu/drm/xe/xe_drm_client.c
>> +++ b/drivers/gpu/drm/xe/xe_drm_client.c
>> @@ -251,11 +251,8 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
>>
>>  	/* Accumulate all the exec queues from this client */
>>  	mutex_lock(&xef->exec_queue.lock);
>> -	xa_for_each(&xef->exec_queue.xa, i, q) {
>> +	xa_for_each(&xef->exec_queue.xa, i, q)
>>  		xe_exec_queue_update_run_ticks(q);
>> -		xef->run_ticks[q->class] += q->run_ticks - q->old_run_ticks;
>> -		q->old_run_ticks = q->run_ticks;
>> -	}
>>  	mutex_unlock(&xef->exec_queue.lock);
>>
>>  	/* Get the total GPU cycles */
>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
>> index 3336a01a1006..69867a7b7c77 100644
>> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
>> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
>> @@ -37,6 +37,10 @@ static void __xe_exec_queue_free(struct xe_exec_queue *q)
>>  {
>>  	if (q->vm)
>>  		xe_vm_put(q->vm);
>> +
>> +	if (q->xef)
>> +		xe_file_put(q->xef);
>> +
>>  	kfree(q);
>>  }
>>
>> @@ -649,6 +653,7 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
>>  		goto kill_exec_queue;
>>
>>  	args->exec_queue_id = id;
>> +	q->xef = xe_file_get(xef);
>>
>>  	return 0;
>>
>> @@ -762,6 +767,7 @@ bool xe_exec_queue_is_idle(struct xe_exec_queue *q)
>>   */
>>  void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q)
>>  {
>> +	struct xe_file *xef;
>>  	struct xe_lrc *lrc;
>>  	u32 old_ts, new_ts;
>>
>> @@ -773,6 +779,8 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q)
>>  	if (!q->vm || !q->vm->xef)
>>  		return;
>>
>> +	xef = q->vm->xef;
>> +
>>  	/*
>>  	 * Only sample the first LRC. For parallel submission, all of them are
>>  	 * scheduled together and we compensate that below by multiplying by
>> @@ -783,7 +791,7 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q)
>>  	 */
>>  	lrc = q->lrc[0];
>>  	new_ts = xe_lrc_update_timestamp(lrc, &old_ts);
>> -	q->run_ticks += (new_ts - old_ts) * q->width;
>> +	xef->run_ticks[q->class] += (new_ts - old_ts) * q->width;
>>  }
>>
>>  void xe_exec_queue_kill(struct xe_exec_queue *q)
>> diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
>> index ded9f9396429..1408b02eea53 100644
>> --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
>> +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
>> @@ -38,6 +38,9 @@ enum xe_exec_queue_priority {
>>   * a kernel object.
>>   */
>>  struct xe_exec_queue {
>> +	/** @xef: Back pointer to xe file if this is user created exec queue */
>> +	struct xe_file *xef;
>> +
>>  	/** @gt: graphics tile this exec queue can submit to */
>>  	struct xe_gt *gt;
>>  	/**
>> @@ -139,10 +142,6 @@ struct xe_exec_queue {
>>  	 * Protected by @vm's resv. Unused if @vm == NULL.
>>  	 */
>>  	u64 tlb_flush_seqno;
>> -	/** @old_run_ticks: prior hw engine class run time in ticks for this exec queue */
>> -	u64 old_run_ticks;
>> -	/** @run_ticks: hw engine class run time in ticks for this exec queue */
>> -	u64 run_ticks;
>>  	/** @lrc: logical ring context for this exec queue */
>>  	struct xe_lrc *lrc[];
>>  };
>> --
>> 2.38.1
>>

