[PATCH 07/16] drm/amdkfd: Fix event destruction with pending waiters
Oded Gabbay
oded.gabbay at gmail.com
Wed Oct 25 08:50:54 UTC 2017
On Sat, Oct 21, 2017 at 3:23 AM, Felix Kuehling <Felix.Kuehling at amd.com> wrote:
> When an event with pending waiters is destroyed, those waiters may
> end up sleeping forever unless they are notified and woken up.
> Implement the notification by clearing the waiter->event pointer
> (which would become invalid anyway once the event is freed) and by
> waking up the waiting tasks.
>
> Waiters on an event that's destroyed return failure.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_events.c | 72 +++++++++++++++++++++------------
> 1 file changed, 46 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> index 5bb88b74..6050e88 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> @@ -345,18 +345,24 @@ void kfd_event_init_process(struct kfd_process *p)
>
> static void destroy_event(struct kfd_process *p, struct kfd_event *ev)
> {
> + /* Wake up pending waiters. They will return failure */
> + while (!list_empty(&ev->waiters)) {
> + struct kfd_event_waiter *waiter =
> + list_first_entry(&ev->waiters, struct kfd_event_waiter,
> + waiters);
> +
> + waiter->event = NULL;
> + /* _init because free_waiters will call list_del */
> + list_del_init(&waiter->waiters);
> + wake_up_process(waiter->sleeping_task);
> + }
> +
> if (ev->signal_page) {
> release_event_notification_slot(ev->signal_page,
> ev->signal_slot_index);
> p->signal_event_count--;
> }
>
> - /*
> - * Abandon the list of waiters. Individual waiting threads will
> - * clean up their own data.
> - */
> - list_del(&ev->waiters);
> -
> hash_del(&ev->events);
> kfree(ev);
> }
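The destroy path above is the standard "drain the waiter list, mark each
waiter dead, wake the sleeper" idiom. A minimal sketch of that idiom with
simplified stand-in structures (the demo_* names are hypothetical, not the
actual kfd types):

    #include <linux/list.h>
    #include <linux/sched.h>

    /* Hypothetical, simplified stand-ins for the kfd structures. */
    struct demo_event {
            struct list_head waiters;               /* list of demo_waiter */
    };

    struct demo_waiter {
            struct task_struct *sleeping_task;      /* task to wake */
            struct demo_event *event;               /* NULL once event dies */
            struct list_head waiters;               /* link in event->waiters */
    };

    /* Caller holds the lock protecting the list. Clearing waiter->event
     * tells the sleeper the event is gone; list_del_init() keeps any
     * later list_del() on the waiter side harmless.
     */
    static void demo_destroy_event(struct demo_event *ev)
    {
            while (!list_empty(&ev->waiters)) {
                    struct demo_waiter *w =
                            list_first_entry(&ev->waiters, struct demo_waiter,
                                             waiters);

                    w->event = NULL;
                    list_del_init(&w->waiters);
                    wake_up_process(w->sleeping_task);
            }
    }
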
> @@ -646,22 +652,36 @@ static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter)
> list_add(&waiter->waiters, &ev->waiters);
> }
>
> -static bool test_event_condition(bool all, uint32_t num_events,
> +/* test_event_condition - Test condition of events being waited for
> + * @all: Return completion only if all events have signaled
> + * @num_events: Number of events to wait for
> + * @event_waiters: Array of event waiters, one per event
> + *
> + * Returns KFD_IOC_WAIT_RESULT_COMPLETE if all (or one) event(s) have
> + * signaled. Returns KFD_IOC_WAIT_RESULT_TIMEOUT if no (or not all)
> + * events have signaled. Returns KFD_IOC_WAIT_RESULT_FAIL if any of
> + * the events have been destroyed.
> + */
> +static uint32_t test_event_condition(bool all, uint32_t num_events,
> struct kfd_event_waiter *event_waiters)
> {
> uint32_t i;
> uint32_t activated_count = 0;
>
> for (i = 0; i < num_events; i++) {
> + if (!event_waiters[i].event)
> + return KFD_IOC_WAIT_RESULT_FAIL;
> +
> if (event_waiters[i].activated) {
> if (!all)
> - return true;
> + return KFD_IOC_WAIT_RESULT_COMPLETE;
>
> activated_count++;
> }
> }
>
> - return activated_count == num_events;
> + return activated_count == num_events ?
> + KFD_IOC_WAIT_RESULT_COMPLETE : KFD_IOC_WAIT_RESULT_TIMEOUT;
> }
>
> /*
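Small aside on the new tri-state return: it lets the caller distinguish
"keep waiting" from "an event went away". A hedged sketch of how a caller
could dispatch on it, as if it lived in kfd_events.c (demo_check_events()
is hypothetical; the -EIO mapping mirrors the end of kfd_wait_on_events()
further down):

    /* Hypothetical helper showing the dispatch on the tri-state result.
     * KFD_IOC_WAIT_RESULT_* are the kfd ioctl ABI values; -EAGAIN here is
     * only illustrative of "not done yet, keep waiting".
     */
    static int demo_check_events(bool all, uint32_t num_events,
                                 struct kfd_event_waiter *waiters,
                                 uint32_t *wait_result)
    {
            *wait_result = test_event_condition(all, num_events, waiters);

            switch (*wait_result) {
            case KFD_IOC_WAIT_RESULT_COMPLETE:
                    return 0;       /* condition met, copy event data */
            case KFD_IOC_WAIT_RESULT_TIMEOUT:
                    return -EAGAIN; /* not (fully) signaled yet */
            case KFD_IOC_WAIT_RESULT_FAIL:
            default:
                    return -EIO;    /* an event was destroyed under us */
            }
    }
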
> @@ -745,11 +765,6 @@ int kfd_wait_on_events(struct kfd_process *p,
>
> mutex_lock(&p->event_mutex);
>
> - /* Set to something unreasonable - this is really
> - * just a bool for now.
> - */
> - *wait_result = KFD_IOC_WAIT_RESULT_TIMEOUT;
> -
> for (i = 0; i < num_events; i++) {
> struct kfd_event_data event_data;
>
> @@ -766,17 +781,22 @@ int kfd_wait_on_events(struct kfd_process *p,
> }
>
> /* Check condition once. */
> - if (test_event_condition(all, num_events, event_waiters)) {
> - *wait_result = KFD_IOC_WAIT_RESULT_COMPLETE;
> + *wait_result = test_event_condition(all, num_events, event_waiters);
> + if (*wait_result == KFD_IOC_WAIT_RESULT_COMPLETE) {
> ret = copy_signaled_event_data(num_events,
> event_waiters, events);
> goto out_unlock;
> - } else {
> - /* Add to wait lists if we need to wait. */
> - for (i = 0; i < num_events; i++)
> - init_event_waiter_add_to_waitlist(&event_waiters[i]);
> + } else if (WARN_ON(*wait_result == KFD_IOC_WAIT_RESULT_FAIL)) {
> + /* This should not happen. Events shouldn't be
> + * destroyed while we're holding the event_mutex
> + */
> + goto out_unlock;
> }
>
> + /* Add to wait lists if we need to wait. */
> + for (i = 0; i < num_events; i++)
> + init_event_waiter_add_to_waitlist(&event_waiters[i]);
> +
> mutex_unlock(&p->event_mutex);
>
> while (true) {
> @@ -809,15 +829,13 @@ int kfd_wait_on_events(struct kfd_process *p,
> */
> set_current_state(TASK_INTERRUPTIBLE);
>
> - if (test_event_condition(all, num_events, event_waiters)) {
> - *wait_result = KFD_IOC_WAIT_RESULT_COMPLETE;
> + *wait_result = test_event_condition(all, num_events,
> + event_waiters);
> + if (*wait_result != KFD_IOC_WAIT_RESULT_TIMEOUT)
> break;
> - }
>
> - if (timeout <= 0) {
> - *wait_result = KFD_IOC_WAIT_RESULT_TIMEOUT;
> + if (timeout <= 0)
> break;
> - }
>
> timeout = schedule_timeout(timeout);
> }
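The loop this lands in is the classic open-coded wait: the task marks
itself TASK_INTERRUPTIBLE before re-testing the condition, so a wake-up
that races with the test either finds the task still runnable or is
caught on the next pass. A stripped-down sketch of the pattern
(condition() is a hypothetical callback; signal handling omitted for
brevity):

    #include <linux/sched.h>

    /* Generic sleep-until-condition loop in the style used here.
     * Ordering matters: setting TASK_INTERRUPTIBLE *before* testing the
     * condition means a concurrent wake_up_process() cannot be lost
     * between the test and schedule_timeout().
     */
    static long demo_wait_loop(bool (*condition)(void *), void *arg,
                               long timeout)
    {
            while (true) {
                    set_current_state(TASK_INTERRUPTIBLE);

                    if (condition(arg))
                            break;

                    if (timeout <= 0)
                            break;

                    timeout = schedule_timeout(timeout);
            }
            __set_current_state(TASK_RUNNING);

            return timeout;
    }
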
> @@ -837,6 +855,8 @@ int kfd_wait_on_events(struct kfd_process *p,
> out:
> if (ret)
> *wait_result = KFD_IOC_WAIT_RESULT_FAIL;
> + else if (*wait_result == KFD_IOC_WAIT_RESULT_FAIL)
> + ret = -EIO;
>
> return ret;
> }
> --
> 2.7.4
>
This patch is:
Acked-by: Oded Gabbay <oded.gabbay at gmail.com>