[PATCH 07/16] drm/amdkfd: Fix event destruction with pending waiters

Oded Gabbay oded.gabbay at gmail.com
Wed Oct 25 08:50:54 UTC 2017


On Sat, Oct 21, 2017 at 3:23 AM, Felix Kuehling <Felix.Kuehling at amd.com> wrote:
> When an event with pending waiters is destroyed, those waiters may
> end up sleeping forever unless they are notified and woken up.
> Implement the notification by clearing the waiter->event pointer,
> which becomes invalid anyway, when the event is freed, and waking
> up the waiting tasks.
>
> Waiters on an event that's destroyed return failure.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_events.c | 72 +++++++++++++++++++++------------
>  1 file changed, 46 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> index 5bb88b74..6050e88 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> @@ -345,18 +345,24 @@ void kfd_event_init_process(struct kfd_process *p)
>
>  static void destroy_event(struct kfd_process *p, struct kfd_event *ev)
>  {
> +       /* Wake up pending waiters. They will return failure */
> +       while (!list_empty(&ev->waiters)) {
> +               struct kfd_event_waiter *waiter =
> +                       list_first_entry(&ev->waiters, struct kfd_event_waiter,
> +                                        waiters);
> +
> +               waiter->event = NULL;
> +               /* _init because free_waiters will call list_del */
> +               list_del_init(&waiter->waiters);
> +               wake_up_process(waiter->sleeping_task);
> +       }
> +
>         if (ev->signal_page) {
>                 release_event_notification_slot(ev->signal_page,
>                                                 ev->signal_slot_index);
>                 p->signal_event_count--;
>         }
>
> -       /*
> -        * Abandon the list of waiters. Individual waiting threads will
> -        * clean up their own data.
> -        */
> -       list_del(&ev->waiters);
> -
>         hash_del(&ev->events);
>         kfree(ev);
>  }
> @@ -646,22 +652,36 @@ static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter)
>                 list_add(&waiter->waiters, &ev->waiters);
>  }
>
> -static bool test_event_condition(bool all, uint32_t num_events,
> +/* test_event_condition - Test condition of events being waited for
> + * @all:           Return completion only if all events have signaled
> + * @num_events:    Number of events to wait for
> + * @event_waiters: Array of event waiters, one per event
> + *
> + * Returns KFD_IOC_WAIT_RESULT_COMPLETE if all (or one) event(s) have
> + * signaled. Returns KFD_IOC_WAIT_RESULT_TIMEOUT if no (or not all)
> + * events have signaled. Returns KFD_IOC_WAIT_RESULT_FAIL if any of
> + * the events have been destroyed.
> + */
> +static uint32_t test_event_condition(bool all, uint32_t num_events,
>                                 struct kfd_event_waiter *event_waiters)
>  {
>         uint32_t i;
>         uint32_t activated_count = 0;
>
>         for (i = 0; i < num_events; i++) {
> +               if (!event_waiters[i].event)
> +                       return KFD_IOC_WAIT_RESULT_FAIL;
> +
>                 if (event_waiters[i].activated) {
>                         if (!all)
> -                               return true;
> +                               return KFD_IOC_WAIT_RESULT_COMPLETE;
>
>                         activated_count++;
>                 }
>         }
>
> -       return activated_count == num_events;
> +       return activated_count == num_events ?
> +               KFD_IOC_WAIT_RESULT_COMPLETE : KFD_IOC_WAIT_RESULT_TIMEOUT;
>  }
>
>  /*
> @@ -745,11 +765,6 @@ int kfd_wait_on_events(struct kfd_process *p,
>
>         mutex_lock(&p->event_mutex);
>
> -       /* Set to something unreasonable - this is really
> -        * just a bool for now.
> -        */
> -       *wait_result = KFD_IOC_WAIT_RESULT_TIMEOUT;
> -
>         for (i = 0; i < num_events; i++) {
>                 struct kfd_event_data event_data;
>
> @@ -766,17 +781,22 @@ int kfd_wait_on_events(struct kfd_process *p,
>         }
>
>         /* Check condition once. */
> -       if (test_event_condition(all, num_events, event_waiters)) {
> -               *wait_result = KFD_IOC_WAIT_RESULT_COMPLETE;
> +       *wait_result = test_event_condition(all, num_events, event_waiters);
> +       if (*wait_result == KFD_IOC_WAIT_RESULT_COMPLETE) {
>                 ret = copy_signaled_event_data(num_events,
>                                                event_waiters, events);
>                 goto out_unlock;
> -       } else {
> -               /* Add to wait lists if we need to wait. */
> -               for (i = 0; i < num_events; i++)
> -                       init_event_waiter_add_to_waitlist(&event_waiters[i]);
> +       } else if (WARN_ON(*wait_result == KFD_IOC_WAIT_RESULT_FAIL)) {
> +               /* This should not happen. Events shouldn't be
> +                * destroyed while we're holding the event_mutex
> +                */
> +               goto out_unlock;
>         }
>
> +       /* Add to wait lists if we need to wait. */
> +       for (i = 0; i < num_events; i++)
> +               init_event_waiter_add_to_waitlist(&event_waiters[i]);
> +
>         mutex_unlock(&p->event_mutex);
>
>         while (true) {
> @@ -809,15 +829,13 @@ int kfd_wait_on_events(struct kfd_process *p,
>                  */
>                 set_current_state(TASK_INTERRUPTIBLE);
>
> -               if (test_event_condition(all, num_events, event_waiters)) {
> -                       *wait_result = KFD_IOC_WAIT_RESULT_COMPLETE;
> +               *wait_result = test_event_condition(all, num_events,
> +                                                   event_waiters);
> +               if (*wait_result != KFD_IOC_WAIT_RESULT_TIMEOUT)
>                         break;
> -               }
>
> -               if (timeout <= 0) {
> -                       *wait_result = KFD_IOC_WAIT_RESULT_TIMEOUT;
> +               if (timeout <= 0)
>                         break;
> -               }
>
>                 timeout = schedule_timeout(timeout);
>         }
> @@ -837,6 +855,8 @@ int kfd_wait_on_events(struct kfd_process *p,
>  out:
>         if (ret)
>                 *wait_result = KFD_IOC_WAIT_RESULT_FAIL;
> +       else if (*wait_result == KFD_IOC_WAIT_RESULT_FAIL)
> +               ret = -EIO;
>
>         return ret;
>  }
> --
> 2.7.4
>

This patch is:
Acked-by: Oded Gabbay <oded.gabbay at gmail.com>


More information about the amd-gfx mailing list