[PATCH 1/1] drm/amdkfd: Don't touch the hardware in pre_reset callback
Felix Kuehling
felix.kuehling at amd.com
Fri Dec 20 03:33:54 UTC 2019
On 2019-12-19 21:34, Liu, Shaoyun wrote:
> Would it look cleaner if we keep a pre_reset flag in the per-device
> structure and check it in the functions that talk to the hw?
I was briefly considering that when I saw how many functions would need
that pre_reset flag. But this could lead to race conditions with other
threads doing evictions concurrently (e.g. MMU notifiers). I think we
want concurrent evictions to hang until the GPU reset is done, not just
skip the actual eviction quietly.
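
Roughly what I mean, as a minimal user-space sketch (not kernel code, not
part of the patch; all names here are invented for illustration, with
pthreads standing in for kernel threads): a flag that the HW-touching
paths merely check leaves a window where a concurrent eviction has
already passed the check when the reset path sets the flag, and when the
check does hit, the eviction quietly "succeeds" without the queues
actually being preempted:

/* Illustrative sketch only; compile with -pthread. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static bool pre_reset;	/* suggested per-device flag, checked before HW access */

static void evict_queues(void)
{
	/*
	 * Check-then-act: the reset path may set pre_reset right after
	 * this check, so we still end up touching the hanging hardware.
	 * And if the check does see the flag, we return as if the
	 * queues had been preempted when they were not.
	 */
	if (pre_reset)
		return;
	printf("talking to HW to preempt queues\n");
}

static void *mmu_notifier_eviction(void *arg)
{
	(void)arg;
	evict_queues();		/* evictions can run concurrently at any time */
	return NULL;
}

int main(void)
{
	pthread_t evict_thread;

	pthread_create(&evict_thread, NULL, mmu_notifier_eviction, NULL);
	pre_reset = true;			/* reset path flips the flag ...  */
	pthread_join(evict_thread, NULL);	/* ... possibly too late          */
	return 0;
}

The patch below instead passes pre_reset explicitly down from
kgd2kfd_pre_reset, so only that one call chain skips the hardware access.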
Hmm, I need to think more about the interaction between evictions and
resets.
Regards,
Felix
>
> Regards
> Shaoyun.liu
> ------------------------------------------------------------------------
> *From:* Kuehling, Felix <Felix.Kuehling at amd.com>
> *Sent:* December 19, 2019 9:21:08 PM
> *To:* amd-gfx at lists.freedesktop.org <amd-gfx at lists.freedesktop.org>;
> Liu, Monk <Monk.Liu at amd.com>; Liu, Shaoyun <Shaoyun.Liu at amd.com>;
> Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>
> *Subject:* Re: [PATCH 1/1] drm/amdkfd: Don't touch the hardware in
> pre_reset callback
> [+Andrey]
>
> Hi Shaoyun, Monk, and Andrey,
>
> I tested this on my bare-metal Vega10 system. GPU reset (using BACO) is
> flaky on this system with and without this patch. The first reset seems
> to work OK, the second one fails in different ways. In theory this
> change should be an improvement, as it eliminates at least one failure
> mode (hanging before the actual reset). At least I don't think I'm
> making anything worse.
>
> My test is to run KFDTest and trigger a GPU reset while it's busy with
> the eviction test. This test runs multiple KFD processes and also
> performs graphics memory allocations and command submissions to force
> TTM to evict KFD processes. I trigger the reset by hanging the HWS
> through a debugfs hook:
>
> # cat /sys/class/kfd/kfd/topology/nodes/1/gpu_id > /sys/kernel/debug/kfd/hang_hws
>
> Please give this patch a try and reinstate the pre_reset call before the
> FLR for SRIOV.
>
> Regards,
> Felix
>
> On 2019-12-19 9:09 p.m., Felix Kuehling wrote:
> > The intention of the pre_reset callback is to update the driver
> > state to reflect that all user mode queues are preempted and the
> > HIQ is destroyed. However we should not actually preempt any queues
> > or otherwise touch the hardware because it is presumably hanging.
> > The impending reset will take care of actually stopping all queues.
> >
> > This should prevent KIQ register access hanging on SRIOV function
> > level reset (FLR). It should also speed up the reset by avoiding
> > unnecessary timeouts from a potentially hanging GPU scheduler.
> >
> > CC: shaoyunl <shaoyun.liu at amd.com>
> > CC: Liu Monk <Monk.Liu at amd.com>
> > Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> > ---
> > drivers/gpu/drm/amd/amdkfd/kfd_device.c | 24 ++++++++++-------
> > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 27 ++++++++++++-------
> > .../drm/amd/amdkfd/kfd_device_queue_manager.h | 5 ++--
> > drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 8 +++---
> > .../gpu/drm/amd/amdkfd/kfd_packet_manager.c | 4 +--
> > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 8 +++---
> > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 11 ++++----
> > .../amd/amdkfd/kfd_process_queue_manager.c | 2 +-
> > 8 files changed, 53 insertions(+), 36 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> > index c6b6901bbda3..796996a0d926 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> > @@ -486,6 +486,7 @@ static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
> > unsigned int chunk_size);
> > static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
> >
> > +static void kfd_suspend(struct kfd_dev *kfd, bool pre_reset);
> > static int kfd_resume(struct kfd_dev *kfd);
> >
> > struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
> > @@ -728,7 +729,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
> > {
> > if (!kfd->init_complete)
> > return 0;
> > - kgd2kfd_suspend(kfd);
> > + kfd_suspend(kfd, true);
> >
> > kfd_signal_reset_event(kfd);
> > return 0;
> > @@ -767,13 +768,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
> > if (!kfd->init_complete)
> > return;
> >
> > - /* For first KFD device suspend all the KFD processes */
> > - if (atomic_inc_return(&kfd_locked) == 1)
> > - kfd_suspend_all_processes();
> > -
> > - kfd->dqm->ops.stop(kfd->dqm);
> > -
> > - kfd_iommu_suspend(kfd);
> > + kfd_suspend(kfd, false);
> > }
> >
> > int kgd2kfd_resume(struct kfd_dev *kfd)
> > @@ -795,6 +790,17 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
> > return ret;
> > }
> >
> > +static void kfd_suspend(struct kfd_dev *kfd, bool pre_reset)
> > +{
> > + /* For first KFD device suspend all the KFD processes */
> > + if (atomic_inc_return(&kfd_locked) == 1)
> > + kfd_suspend_all_processes(pre_reset);
> > +
> > + kfd->dqm->ops.stop(kfd->dqm, pre_reset);
> > +
> > + kfd_iommu_suspend(kfd);
> > +}
> > +
> > static int kfd_resume(struct kfd_dev *kfd)
> > {
> > int err = 0;
> > @@ -877,7 +883,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
> > if (!p)
> > return -ESRCH;
> >
> > - r = kfd_process_evict_queues(p);
> > + r = kfd_process_evict_queues(p, false);
> >
> > kfd_unref_process(p);
> > return r;
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > index f7f6df40875e..3a49944164da 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > @@ -592,7 +592,8 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
> > }
> >
> > static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
> > - struct qcm_process_device *qpd)
> > + struct qcm_process_device *qpd,
> > + bool pre_reset)
> > {
> > struct queue *q;
> > struct mqd_manager *mqd_mgr;
> > @@ -622,6 +623,8 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
> >
> > if (WARN_ONCE(!dqm->sched_running, "Evict when stopped\n"))
> > continue;
> > + if (pre_reset)
> > + continue;
> >
> > retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
> > KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
> > @@ -639,7 +642,8 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
> > }
> >
> > static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
> > - struct qcm_process_device *qpd)
> > + struct qcm_process_device *qpd,
> > + bool pre_reset)
> > {
> > struct queue *q;
> > struct kfd_process_device *pdd;
> > @@ -664,6 +668,10 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
> > q->properties.is_active = false;
> > dqm->queue_count--;
> > }
> > +
> > + if (pre_reset)
> > + goto out;
> > +
> > retval = execute_queues_cpsch(dqm,
> > qpd->is_debug ?
> > KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
> > @@ -944,10 +952,10 @@ static int start_nocpsch(struct device_queue_manager *dqm)
> > return 0;
> > }
> >
> > -static int stop_nocpsch(struct device_queue_manager *dqm)
> > +static int stop_nocpsch(struct device_queue_manager *dqm, bool pre_reset)
> > {
> > if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
> > - pm_uninit(&dqm->packets);
> > + pm_uninit(&dqm->packets, pre_reset);
> > dqm->sched_running = false;
> >
> > return 0;
> > @@ -1107,20 +1115,21 @@ static int start_cpsch(struct device_queue_manager *dqm)
> > return 0;
> > fail_allocate_vidmem:
> > fail_set_sched_resources:
> > - pm_uninit(&dqm->packets);
> > + pm_uninit(&dqm->packets, false);
> > fail_packet_manager_init:
> > return retval;
> > }
> >
> > -static int stop_cpsch(struct device_queue_manager *dqm)
> > +static int stop_cpsch(struct device_queue_manager *dqm, bool pre_reset)
> > {
> > dqm_lock(dqm);
> > - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> > + if (!pre_reset)
> > + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> > dqm->sched_running = false;
> > dqm_unlock(dqm);
> >
> > kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> > - pm_uninit(&dqm->packets);
> > + pm_uninit(&dqm->packets, pre_reset);
> >
> > return 0;
> > }
> > @@ -1891,7 +1900,7 @@ int kfd_process_vm_fault(struct device_queue_manager *dqm,
> > return -EINVAL;
> > pdd = kfd_get_process_device_data(dqm->dev, p);
> > if (pdd)
> > - ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
> > + ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd, false);
> > kfd_unref_process(p);
> >
> > return ret;
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> > index a8c37e6da027..9f82f95f6a92 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> > @@ -103,7 +103,7 @@ struct device_queue_manager_ops {
> >
> > int (*initialize)(struct device_queue_manager *dqm);
> > int (*start)(struct device_queue_manager *dqm);
> > - int (*stop)(struct device_queue_manager *dqm);
> > + int (*stop)(struct device_queue_manager *dqm, bool pre_reset);
> > void (*uninitialize)(struct device_queue_manager *dqm);
> > int (*create_kernel_queue)(struct device_queue_manager *dqm,
> > struct kernel_queue *kq,
> > @@ -129,7 +129,8 @@ struct device_queue_manager_ops {
> > struct qcm_process_device *qpd);
> >
> > int (*evict_process_queues)(struct device_queue_manager *dqm,
> > - struct qcm_process_device *qpd);
> > + struct qcm_process_device *qpd,
> > + bool pre_reset);
> > int (*restore_process_queues)(struct device_queue_manager *dqm,
> > struct qcm_process_device *qpd);
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> > index 2d56dc534459..dbd2e8e9ae69 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> > @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_dev *dev,
> > }
> >
> > /* Uninitialize a kernel queue and free all its memory usages. */
> > -static void kq_uninitialize(struct kernel_queue *kq)
> > +static void kq_uninitialize(struct kernel_queue *kq, bool pre_reset)
> > {
> > - if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
> > + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !pre_reset)
> > kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
> > kq->queue->mqd,
> > KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
> > @@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
> > return NULL;
> > }
> >
> > -void kernel_queue_uninit(struct kernel_queue *kq)
> > +void kernel_queue_uninit(struct kernel_queue *kq, bool pre_reset)
> > {
> > - kq_uninitialize(kq);
> > + kq_uninitialize(kq, pre_reset);
> > kfree(kq);
> > }
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> > index 6cabed06ef5d..7732a3bbebd6 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> > @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
> > return 0;
> > }
> >
> > -void pm_uninit(struct packet_manager *pm)
> > +void pm_uninit(struct packet_manager *pm, bool pre_reset)
> > {
> > mutex_destroy(&pm->lock);
> > - kernel_queue_uninit(pm->priv_queue);
> > + kernel_queue_uninit(pm->priv_queue, pre_reset);
> > }
> >
> > int pm_send_set_resources(struct packet_manager *pm,
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index 087e96838997..9bc4ced4acba 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -762,9 +762,9 @@ struct kfd_process *kfd_get_process(const struct task_struct *);
> > struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);
> > struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
> > void kfd_unref_process(struct kfd_process *p);
> > -int kfd_process_evict_queues(struct kfd_process *p);
> > +int kfd_process_evict_queues(struct kfd_process *p, bool pre_reset);
> > int kfd_process_restore_queues(struct kfd_process *p);
> > -void kfd_suspend_all_processes(void);
> > +void kfd_suspend_all_processes(bool pre_reset);
> > int kfd_resume_all_processes(void);
> >
> > int kfd_process_device_init_vm(struct kfd_process_device *pdd,
> > @@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
> > void device_queue_manager_uninit(struct device_queue_manager *dqm);
> > struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
> > enum kfd_queue_type type);
> > -void kernel_queue_uninit(struct kernel_queue *kq);
> > +void kernel_queue_uninit(struct kernel_queue *kq, bool pre_reset);
> > int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
> >
> > /* Process Queue Manager */
> > @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs kfd_vi_pm_funcs;
> > extern const struct packet_manager_funcs kfd_v9_pm_funcs;
> >
> > int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
> > -void pm_uninit(struct packet_manager *pm);
> > +void pm_uninit(struct packet_manager *pm, bool pre_reset);
> > int pm_send_set_resources(struct packet_manager *pm,
> > struct scheduling_resources *res);
> > int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues);
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > index 536a153ac9a4..7bcadd3a1e3b 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > @@ -948,7 +948,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
> > * Eviction is reference-counted per process-device. This means multiple
> > * evictions from different sources can be nested safely.
> > */
> > -int kfd_process_evict_queues(struct kfd_process *p)
> > +int kfd_process_evict_queues(struct kfd_process *p, bool pre_reset)
> > {
> > struct kfd_process_device *pdd;
> > int r = 0;
> > @@ -956,7 +956,8 @@ int kfd_process_evict_queues(struct kfd_process *p)
> >
> > list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
> > r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
> > - &pdd->qpd);
> > + &pdd->qpd,
> > + pre_reset);
> > if (r) {
> > pr_err("Failed to evict process queues\n");
> > goto fail;
> > @@ -1026,7 +1027,7 @@ static void evict_process_worker(struct work_struct *work)
> > flush_delayed_work(&p->restore_work);
> >
> > pr_debug("Started evicting pasid 0x%x\n", p->pasid);
> > - ret = kfd_process_evict_queues(p);
> > + ret = kfd_process_evict_queues(p, false);
> > if (!ret) {
> > dma_fence_signal(p->ef);
> > dma_fence_put(p->ef);
> > @@ -1082,7 +1083,7 @@ static void restore_process_worker(struct work_struct *work)
> > pr_err("Failed to restore queues of pasid 0x%x\n",
> p->pasid);
> > }
> >
> > -void kfd_suspend_all_processes(void)
> > +void kfd_suspend_all_processes(bool pre_reset)
> > {
> > struct kfd_process *p;
> > unsigned int temp;
> > @@ -1092,7 +1093,7 @@ void kfd_suspend_all_processes(void)
> > cancel_delayed_work_sync(&p->eviction_work);
> > cancel_delayed_work_sync(&p->restore_work);
> >
> > - if (kfd_process_evict_queues(p))
> > + if (kfd_process_evict_queues(p, pre_reset))
> > pr_err("Failed to suspend process 0x%x\n",
> p->pasid);
> > dma_fence_signal(p->ef);
> > dma_fence_put(p->ef);
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > index d3eacf72e8db..8fa856e6a03f 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
> > /* destroy kernel queue (DIQ) */
> > dqm = pqn->kq->dev->dqm;
> > dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
> > - kernel_queue_uninit(pqn->kq);
> > + kernel_queue_uninit(pqn->kq, false);
> > }
> >
> > if (pqn->q) {