[RFC PATCH] drm/amdkfd: Run restore_workers on freezable WQs

Felix Kuehling felix.kuehling at amd.com
Mon Oct 30 15:16:20 UTC 2023


On 2023-10-30 4:23, Christian König wrote:
> Am 28.10.23 um 00:39 schrieb Felix Kuehling:
>> Make restore workers freezable so we don't have to explicitly flush them
>> in suspend and GPU reset code paths, and we don't accidentally try to
>> restore BOs while the GPU is suspended. Not having to flush restore_work
>> also helps avoid lock/fence dependencies in the GPU reset case where 
>> we're
>> not allowed to wait for fences.
>>
>> This is an RFC and request for testing. I have not tested this myself 
>> yet.
>> My notes below:
>>
>> Restore work won't be frozen during GPU reset. Does it matter? Queues 
>> will
>> stay evicted until resume in any case. But restore_work may be in 
>> trouble
>> if it gets run in the middle of a GPU reset. In that case, if anything
>> fails, it will just reschedule itself, so should be fine as long as it
>> doesn't interfere with the reset itself (respects any mechanisms in 
>> place
>> to prevent HW access during the reset).
>>
>> What HW access does restore_work perform:
>> - migrating buffers: uses GPU scheduler, should be suspended during 
>> reset
>> - TLB flushes in userptr restore worker: not called directly, relies on
>>    HWS to flush TLBs on VMID attachment
>> - TLB flushes in SVM restore worker: Does TLB flush in the mapping code
>> - Resuming user mode queues: should not happen while GPU reset keeps 
>> queue
>>    eviction counter elevated
>> Ergo: Except for the SVM case, it's OK to not flush restore work before
>> GPU resets. I'll need to rethink the interaction of SVM restore_work 
>> with
>> GPU resets.
>
> It also sounds like the restore work is some high-level work and 
> shouldn't interact with the lower-level GPU reset.

That was my hope as well. Just trying to think through to make sure I'm 
not making any incorrect assumptions.

Do you know if there is anything preventing a TLB flush using MMIO from 
messing up a GPU reset in progress? That's the only thing in the SVM 
restore work that tries to touch HW directly.


>
>>
>> What about cancelling p->eviction_work? Eviction work would normally be
>> needed to signal eviction fences, but we're doing that explicitly in
>> suspend_all_processes. Does eviction_work wait for fences anywhere? Yes,
>> indirectly by flushing restore_work. So we should not try to cancel it
>> during a GPU reset.
>>
>> Problem: accessing p->ef concurrently in evict_process_worker and
>> suspend_all_processes. Need a spinlock to use and update it safely.
>> Problem: What if evict_process_worker gets stuck in flushing 
>> restore_work?
>> We can skip all of that if p->ef is NULL (which it is during the reset).
>> Even if it gets stuck, it's no problem if the reset doesn't depend on 
>> it.
>> It should get unstuck after the reset.
>>
>> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>> ---
>>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  9 ++--
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  1 +
>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 49 +++++++++++++------
>>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c          |  4 +-
>>   4 files changed, 44 insertions(+), 19 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 54f31a420229..89e632257663 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -1644,7 +1644,8 @@ int amdgpu_amdkfd_criu_resume(void *p)
>>           goto out_unlock;
>>       }
>>       WRITE_ONCE(pinfo->block_mmu_notifications, false);
>> -    schedule_delayed_work(&pinfo->restore_userptr_work, 0);
>> +    queue_delayed_work(system_freezable_wq,
>> +               &pinfo->restore_userptr_work, 0);
>>     out_unlock:
>>       mutex_unlock(&pinfo->lock);
>> @@ -2458,7 +2459,8 @@ int amdgpu_amdkfd_evict_userptr(struct 
>> mmu_interval_notifier *mni,
>>                          KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
>>           if (r)
>>               pr_err("Failed to quiesce KFD\n");
>> - schedule_delayed_work(&process_info->restore_userptr_work,
>> +        queue_delayed_work(system_freezable_wq,
>> +            &process_info->restore_userptr_work,
>> msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
>>       }
>>       mutex_unlock(&process_info->notifier_lock);
>> @@ -2793,7 +2795,8 @@ static void 
>> amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
>>         /* If validation failed, reschedule another attempt */
>>       if (evicted_bos) {
>> - schedule_delayed_work(&process_info->restore_userptr_work,
>> +        queue_delayed_work(system_freezable_wq,
>> +            &process_info->restore_userptr_work,
>> msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
>>             kfd_smi_event_queue_restore_rescheduled(mm);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 9cc32f577e38..cf017d027fee 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -919,6 +919,7 @@ struct kfd_process {
>>        * during restore
>>        */
>>       struct dma_fence *ef;
>> +    spinlock_t ef_lock;
>>         /* Work items for evicting and restoring BOs */
>>       struct delayed_work eviction_work;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index fbf053001af9..a07cba58ec5e 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -664,7 +664,8 @@ int kfd_process_create_wq(void)
>>       if (!kfd_process_wq)
>>           kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
>>       if (!kfd_restore_wq)
>> -        kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0);
>> +        kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq",
>> +                             WQ_FREEZABLE);
>>         if (!kfd_process_wq || !kfd_restore_wq) {
>>           kfd_process_destroy_wq();
>> @@ -1460,6 +1461,7 @@ static struct kfd_process *create_process(const 
>> struct task_struct *thread)
>>         kref_init(&process->ref);
>>       mutex_init(&process->mutex);
>> +    spin_lock_init(&process->ef_lock);
>>       process->mm = thread->mm;
>>       process->lead_thread = thread->group_leader;
>>       process->n_pdds = 0;
>> @@ -1904,6 +1906,19 @@ kfd_process_gpuid_from_node(struct kfd_process 
>> *p, struct kfd_node *node,
>>       return -EINVAL;
>>   }
>>   +static void signal_eviction_fence(struct kfd_process *p)
>> +{
>> +    spin_lock(&p->ef_lock);
>> +    if (!p->ef)
>> +        goto unlock_out;
>> +    dma_fence_signal(p->ef);
>
> This needs to grab the internal lock of the eviction fence, I'm not 
> sure that has correct ordering with the newly added ef_lock.

Hmm, the only way we could get a circular lock dependency would be if 
we took the p->ef_lock in a fence callback. I don't think that ever 
happens, because even the eviction work runs on a worker thread (exactly 
to avoid such lock dependency issues).

Anyway, I could try to move the fence_signal out of the critical 
section. The lock is only there to ensure that exactly one process 
signals and frees the fence.

	spin_lock(&p->ef_lock);
	ef = p->ef;
	WRITE_ONCE(p->ef, NULL);
	spin_unlock(&p->ef_lock);
	if (ef) {
		dma_fence_signal(ef);
		dma_fence_put(ef);
	}


>
>> +    dma_fence_put(p->ef);
>> +    WRITE_ONCE(p->ef, NULL);
>> +
>> +unlock_out:
>> +    spin_unlock(&p->ef_lock);
>> +}
>> +
>>   static void evict_process_worker(struct work_struct *work)
>>   {
>>       int ret;
>> @@ -1916,8 +1931,11 @@ static void evict_process_worker(struct 
>> work_struct *work)
>>        * lifetime of this thread, kfd_process p will be valid
>>        */
>>       p = container_of(dwork, struct kfd_process, eviction_work);
>> -    WARN_ONCE(p->last_eviction_seqno != p->ef->seqno,
>> -          "Eviction fence mismatch\n");
>> +    /* If the eviction fence is not valid, we're probably in a suspend
>> +     * or GPU reset cycle. There is nothing to do in this case.
>> +     */
>> +    if (!READ_ONCE(p->ef))
>> +        return;
>
> This evict process work is high-level; I don't think it should have any 
> dependency on the GPU reset.

Right. This is not here to avoid issues, just a short-cut to avoid 
unnecessary work.

Regards,
   Felix


>
> Regards,
> Christian.
>
>>         /* Narrow window of overlap between restore and evict work
>>        * item is possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos
>> @@ -1930,9 +1948,7 @@ static void evict_process_worker(struct 
>> work_struct *work)
>>       pr_debug("Started evicting pasid 0x%x\n", p->pasid);
>>       ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
>>       if (!ret) {
>> -        dma_fence_signal(p->ef);
>> -        dma_fence_put(p->ef);
>> -        p->ef = NULL;
>> +        signal_eviction_fence(p);
>>           queue_delayed_work(kfd_restore_wq, &p->restore_work,
>>                   msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
>>   @@ -1967,9 +1983,19 @@ static void restore_process_worker(struct 
>> work_struct *work)
>>         p->last_restore_timestamp = get_jiffies_64();
>>       /* VMs may not have been acquired yet during debugging. */
>> -    if (p->kgd_process_info)
>> +    if (p->kgd_process_info) {
>> +        struct dma_fence *ef = NULL;
>> +
>>           ret = 
>> amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
>> -                                 &p->ef);
>> +                                 &ef);
>> +        if (!ret) {
>> +            spin_lock(&p->ef_lock);
>> +            WARN_ONCE(p->ef, "Eviction fence is not NULL");
>> +            WRITE_ONCE(p->ef, ef);
>> +            spin_unlock(&p->ef_lock);
>> +        }
>> +    }
>> +
>>       if (ret) {
>>           pr_debug("Failed to restore BOs of pasid 0x%x, retry after 
>> %d ms\n",
>>                p->pasid, PROCESS_BACK_OFF_TIME_MS);
>> @@ -1994,14 +2020,9 @@ void kfd_suspend_all_processes(void)
>>         WARN(debug_evictions, "Evicting all processes");
>>       hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>> -        cancel_delayed_work_sync(&p->eviction_work);
>> -        flush_delayed_work(&p->restore_work);
>> -
>>           if (kfd_process_evict_queues(p, 
>> KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
>>               pr_err("Failed to suspend process 0x%x\n", p->pasid);
>> -        dma_fence_signal(p->ef);
>> -        dma_fence_put(p->ef);
>> -        p->ef = NULL;
>> +        signal_eviction_fence(p);
>>       }
>>       srcu_read_unlock(&kfd_processes_srcu, idx);
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> index fe937670c927..aa6c34127be9 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> @@ -1869,7 +1869,7 @@ static void svm_range_restore_work(struct 
>> work_struct *work)
>>       /* If validation failed, reschedule another attempt */
>>       if (evicted_ranges) {
>>           pr_debug("reschedule to restore svm range\n");
>> -        schedule_delayed_work(&svms->restore_work,
>> +        queue_delayed_work(system_freezable_wq, &svms->restore_work,
>> msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
>>             kfd_smi_event_queue_restore_rescheduled(mm);
>> @@ -1945,7 +1945,7 @@ svm_range_evict(struct svm_range *prange, 
>> struct mm_struct *mm,
>>               pr_debug("failed to quiesce KFD\n");
>>             pr_debug("schedule to restore svm %p ranges\n", svms);
>> -        schedule_delayed_work(&svms->restore_work,
>> +        queue_delayed_work(system_freezable_wq, &svms->restore_work,
>> msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
>>       } else {
>>           unsigned long s, l;
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20231030/be32dede/attachment-0001.htm>


More information about the amd-gfx mailing list