[PATCH] drm/amdgpu: fix amdgpu_amdkfd_remove_eviction_fence
Felix Kuehling
felix.kuehling at amd.com
Wed Aug 15 18:17:12 UTC 2018
On 2018-08-15 03:02 AM, Christian König wrote:
> Hi Felix,
>
> yeah, you pretty much nailed it.
>
> The problem is that the array itself is RCU protected. This means that
> you can only update it by copying the whole structure.
>
> The exception is reservation_object_add_shared() which only works
> because we replace either an already signaled fence or a fence from
> the same context with a later sequence number.
>
> This also explains why this is only a band-aid and the whole approach
> of removing fences doesn't work in general. At any time somebody could
> have taken an RCU reference to the old array, created a copy of it, and
> still be using that copy.
>
> The only 100% correct solution would be to mark the existing fence as
> signaled and replace it everywhere else.
That depends on what you're trying to achieve. I think the problem you
see is that some reader may still be using the old
reservation_object_list copy with the fence still in it. But the
remaining lifetime of that copy is limited. If we wanted to be sure no
more copies with the old fence exist, all we'd need to do is call
synchronize_rcu. Maybe we need to do that before releasing the fence
references, or release the fence references in an RCU callback to be safe.
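
Roughly, as an untested sketch on top of your patch (reusing the i/j
variables and the "new"/"old" lists from your version, and ignoring the
ef_list path), the release loop would become:

    /* Make sure no RCU reader can still see (a copy of) the old fence
     * list before the fence references are dropped.
     */
    synchronize_rcu();
    for (i = j; i < old->shared_count; ++i) {
        struct dma_fence *f;

        f = rcu_dereference_protected(new->shared[i],
                                      reservation_object_held(resv));
        dma_fence_put(f);
    }
    kfree_rcu(old, rcu);

synchronize_rcu can sleep, but we're holding the reservation object
here, which is a sleeping lock anyway.
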
Regards,
Felix
>
> Going to fix the copy&paste error I made with the sequence number and
> send it out again.
>
> Regards,
> Christian.
>
> Am 14.08.2018 um 22:17 schrieb Felix Kuehling:
>> [+Harish]
>>
>> I think this looks good for the most part. See one comment inline below.
>>
>> But bear with me while I'm trying to understand what was wrong with the
>> old code. Please correct me if I get this wrong or point out anything
>> I'm missing.
>>
>> The reservation_object_list looks to be protected by a combination of
>> three mechanisms:
>>
>> * Holding the reservation object
>> * RCU
>> * seqcount
>>
>> Updating the fence list requires holding the reservation object. But
>> there are some readers that can be called without holding that lock
>> (reservation_object_copy_fences, reservation_object_get_fences_rcu,
>> reservation_object_wait_timeout_rcu,
>> reservation_object_test_signaled_rcu). They rely on RCU to work on a
>> copy and seqcount to make sure they have the most up-to-date information.
>> So any function updating the fence lists needs to do RCU and seqcount
>> correctly to prevent breaking those readers.
>>
>> As I understand it, RCU with seqcount retry just means that readers will
>> spin retrying while there are writers, and writers are never blocked by
>> readers. Writers are blocked by each other, because they need to hold
>> the reservation.
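>>
>> For reference, the lockless read side follows roughly this pattern (a
>> paraphrased sketch, not the literal reservation.c code):
>>
>>     struct reservation_object_list *fobj;
>>     unsigned int seq;
>>
>>     rcu_read_lock();
>>     do {
>>         seq = read_seqcount_begin(&resv->seq);
>>         fobj = rcu_dereference(resv->fence);
>>         /* inspect fobj->shared_count and fobj->shared[i] here;
>>          * if a writer raced with us, retry with the new list
>>          */
>>     } while (read_seqcount_retry(&resv->seq, seq));
>>     rcu_read_unlock();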
>>
>> The code you added looks a lot like
>> reservation_object_add_shared_replace, which removes fences that have
>> signalled and atomically replaces obj->fence with a new
>> reservation_object_list. That atomicity is important because each pointer
>> in the obj->fence->shared array is separately protected by RCU, but not
>> the array as a whole or the shared_count.
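>>
>> I.e. the writer, while holding the reservation object, publishes a
>> complete new list with a single pointer update, roughly (a sketch of
>> the same pattern your patch uses below):
>>
>>     preempt_disable();
>>     write_seqcount_begin(&resv->seq);
>>     RCU_INIT_POINTER(resv->fence, new);  /* single atomic publish */
>>     write_seqcount_end(&resv->seq);
>>     preempt_enable();
>>     kfree_rcu(old, rcu);  /* old list may still have RCU readers */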
>>
>> See one comment inline.
>>
>> Regards,
>> Felix
>>
>> On 2018-08-14 03:39 AM, Christian König wrote:
>>> Fix quite a number of bugs here. Unfortunately only compile tested.
>>>
>>> Signed-off-by: Christian König <christian.koenig at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 103 ++++++++++-------------
>>> 1 file changed, 46 insertions(+), 57 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> index fa38a960ce00..dece31516dc4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> @@ -206,11 +206,9 @@ static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
>>> struct amdgpu_amdkfd_fence ***ef_list,
>>> unsigned int *ef_count)
>>> {
>>> - struct reservation_object_list *fobj;
>>> - struct reservation_object *resv;
>>> - unsigned int i = 0, j = 0, k = 0, shared_count;
>>> - unsigned int count = 0;
>>> - struct amdgpu_amdkfd_fence **fence_list;
>>> + struct reservation_object *resv = bo->tbo.resv;
>>> + struct reservation_object_list *old, *new;
>>> + unsigned int i, j, k;
>>> if (!ef && !ef_list)
>>> return -EINVAL;
>>> @@ -220,76 +218,67 @@ static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
>>> *ef_count = 0;
>>> }
>>> - resv = bo->tbo.resv;
>>> - fobj = reservation_object_get_list(resv);
>>> -
>>> - if (!fobj)
>>> + old = reservation_object_get_list(resv);
>>> + if (!old)
>>> return 0;
>>> - preempt_disable();
>>> - write_seqcount_begin(&resv->seq);
>>> + new = kmalloc(offsetof(typeof(*new), shared[old->shared_max]),
>>> + GFP_KERNEL);
>>> + if (!new)
>>> + return -ENOMEM;
>>> - /* Go through all the shared fences in the resevation object. If
>>> - * ef is specified and it exists in the list, remove it and reduce the
>>> - * count. If ef is not specified, then get the count of eviction fences
>>> - * present.
>>> + /* Go through all the shared fences in the reservation object and sort
>>> + * the interesting ones to the end of the list.
>>> */
>>> - shared_count = fobj->shared_count;
>>> - for (i = 0; i < shared_count; ++i) {
>>> + for (i = 0, j = old->shared_count, k = 0; i < old->shared_count; ++i) {
>>> struct dma_fence *f;
>>> - f = rcu_dereference_protected(fobj->shared[i],
>>> + f = rcu_dereference_protected(old->shared[i],
>>> reservation_object_held(resv));
>>> - if (ef) {
>>> - if (f->context == ef->base.context) {
>>> - dma_fence_put(f);
>>> - fobj->shared_count--;
>>> - } else {
>>> - RCU_INIT_POINTER(fobj->shared[j++], f);
>>> - }
>>> - } else if (to_amdgpu_amdkfd_fence(f))
>>> - count++;
>>> + if ((ef && f->context == ef->base.context) ||
>>> + (!ef && to_amdgpu_amdkfd_fence(f)))
>>> + RCU_INIT_POINTER(new->shared[--j], f);
>>> + else
>>> + RCU_INIT_POINTER(new->shared[k++], f);
>>> }
>>> - write_seqcount_end(&resv->seq);
>>> - preempt_enable();
>>> + new->shared_max = old->shared_max;
>>> + new->shared_count = k;
>>> - if (ef || !count)
>>> - return 0;
>>> + if (!ef) {
>>> + unsigned int count = old->shared_count - j;
>>> - /* Alloc memory for count number of eviction fence pointers. Fill the
>>> - * ef_list array and ef_count
>>> - */
>>> - fence_list = kcalloc(count, sizeof(struct amdgpu_amdkfd_fence *),
>>> - GFP_KERNEL);
>>> - if (!fence_list)
>>> - return -ENOMEM;
>>> + /* Alloc memory for count number of eviction fence pointers. Fill the
>>> + * ef_list array and ef_count
>>> + */
>>> + *ef_list = kcalloc(count, sizeof(**ef_list), GFP_KERNEL);
>>> + *ef_count = count;
>>> + if (!*ef_list) {
>>> + kfree(new);
>>> + return -ENOMEM;
>>> + }
>>> + }
>>> +
>>> + /* Install the new fence list, seqcount provides the barriers */
>>> + preempt_disable();
>>> + write_seqcount_begin(&resv->seq);
>>> + RCU_INIT_POINTER(resv->fence, new);
>>> preempt_disable();
>>> write_seqcount_begin(&resv->seq);
>> You're disabling preemption and calling write_seqcount_begin twice. I
>> think this is meant to be
>>
>> write_seqcount_end(&resv->seq);
>> preempt_enable();
>>
>>
>>> - j = 0;
>>> - for (i = 0; i < shared_count; ++i) {
>>> + /* Drop the references to the removed fences or move them to ef_list */
>>> + for (i = j, k = 0; i < old->shared_count; ++i) {
>>> struct dma_fence *f;
>>> - struct amdgpu_amdkfd_fence *efence;
>>> - f = rcu_dereference_protected(fobj->shared[i],
>>> - reservation_object_held(resv));
>>> -
>>> - efence = to_amdgpu_amdkfd_fence(f);
>>> - if (efence) {
>>> - fence_list[k++] = efence;
>>> - fobj->shared_count--;
>>> - } else {
>>> - RCU_INIT_POINTER(fobj->shared[j++], f);
>>> - }
>>> + f = rcu_dereference_protected(new->shared[i],
>>> + reservation_object_held(resv));
>>> + if (!ef)
>>> + (*ef_list)[k++] = to_amdgpu_amdkfd_fence(f);
>>> + else
>>> + dma_fence_put(f);
>>> }
>>> -
>>> - write_seqcount_end(&resv->seq);
>>> - preempt_enable();
>>> -
>>> - *ef_list = fence_list;
>>> - *ef_count = k;
>>> + kfree_rcu(old, rcu);
>>> return 0;
>>> }