[PATCH 06/25] drm/amdgpu: Add KFD eviction fence

Felix Kuehling felix.kuehling at amd.com
Mon Jan 29 19:39:45 UTC 2018


On 2018-01-29 08:43 AM, Christian König wrote:
> Hi Felix & Harish,
>
> maybe explain why I found that odd: dma_fence_add_callback() sets the
> DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT flag before adding the callback.
>
> So the flag should always be set when there are callbacks.
> Did I miss anything?

I don't think we add any callbacks to our eviction fences.

Regards,
  Felix

>
> Regards,
> Christian.
>
> Am 29.01.2018 um 00:55 schrieb Felix Kuehling:
>> [+Harish, forgot to acknowledge him in the commit description, will fix
>> that in v2]
>>
>> Harish, please see Christian's question below in amd_kfd_fence_signal.
>> Did I understand this correctly?
>>
>> Regards,
>>    Felix
>>
>> On 2018-01-28 06:42 PM, Felix Kuehling wrote:
>>> On 2018-01-27 04:16 AM, Christian König wrote:
>>>> Am 27.01.2018 um 02:09 schrieb Felix Kuehling:
>>> [snip]
>>>>> +struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
>>>>> +                               void *mm)
>>>>> +{
>>>>> +    struct amdgpu_amdkfd_fence *fence = NULL;
>>>>> +
>>>>> +    fence = kzalloc(sizeof(*fence), GFP_KERNEL);
>>>>> +    if (fence == NULL)
>>>>> +        return NULL;
>>>>> +
>>>>> +    /* mm_struct mm is used as void pointer to identify the parent
>>>>> +     * KFD process. Don't dereference it. Fence and any threads
>>>>> using
>>>>> +     * mm is guranteed to be released before process termination.
>>>>> +     */
>>>>> +    fence->mm = mm;
>>>> That won't work. Fences can live much longer than any process who
>>>> created them.
>>>>
>>>> I've already found a fence in a BO still living hours after the
>>>> process was killed and the pid long recycled.
>>>>
>>>> I suggest to make fence->mm a real mm_struct pointer with reference
>>>> counting and then set it to NULL and drop the reference in
>>>> enable_signaling.
>>> I agree. But enable_signaling may be too early to drop the reference.
>>> amd_kfd_fence_check_mm could still be called later from
>>> amdgpu_ttm_bo_eviction_valuable, as long as the fence hasn't
>>> signaled yet.
>>>
>>> The safe place is problably in amd_kfd_fence_release.
>>>
>>>>> +    get_task_comm(fence->timeline_name, current);
>>>>> +    spin_lock_init(&fence->lock);
>>>>> +
>>>>> +    dma_fence_init(&fence->base, &amd_kfd_fence_ops, &fence->lock,
>>>>> +           context, atomic_inc_return(&fence_seq));
>>>>> +
>>>>> +    return fence;
>>>>> +}
>>>>> +
>>>>> +struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct
>>>>> dma_fence *f)
>>>>> +{
>>>>> +    struct amdgpu_amdkfd_fence *fence;
>>>>> +
>>>>> +    if (!f)
>>>>> +        return NULL;
>>>>> +
>>>>> +    fence = container_of(f, struct amdgpu_amdkfd_fence, base);
>>>>> +    if (fence && f->ops == &amd_kfd_fence_ops)
>>>>> +        return fence;
>>>>> +
>>>>> +    return NULL;
>>>>> +}
>>>>> +
>>>>> +static const char *amd_kfd_fence_get_driver_name(struct dma_fence
>>>>> *f)
>>>>> +{
>>>>> +    return "amdgpu_amdkfd_fence";
>>>>> +}
>>>>> +
>>>>> +static const char *amd_kfd_fence_get_timeline_name(struct
>>>>> dma_fence *f)
>>>>> +{
>>>>> +    struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
>>>>> +
>>>>> +    return fence->timeline_name;
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * amd_kfd_fence_enable_signaling - This gets called when TTM wants
>>>>> to evict
>>>>> + *  a KFD BO and schedules a job to move the BO.
>>>>> + *  If fence is already signaled return true.
>>>>> + *  If fence is not signaled schedule a evict KFD process work item.
>>>>> + */
>>>>> +static bool amd_kfd_fence_enable_signaling(struct dma_fence *f)
>>>>> +{
>>>>> +    struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
>>>>> +
>>>>> +    if (!fence)
>>>>> +        return false;
>>>>> +
>>>>> +    if (dma_fence_is_signaled(f))
>>>>> +        return true;
>>>>> +
>>>>> +    if (!kgd2kfd->schedule_evict_and_restore_process(
>>>>> +                (struct mm_struct *)fence->mm, f))
>>>>> +        return true;
>>>>> +
>>>>> +    return false;
>>>>> +}
>>>>> +
>>>>> +static int amd_kfd_fence_signal(struct dma_fence *f)
>>>>> +{
>>>>> +    unsigned long flags;
>>>>> +    int ret;
>>>>> +
>>>>> +    spin_lock_irqsave(f->lock, flags);
>>>>> +    /* Set enabled bit so cb will called */
>>>>> +    set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &f->flags);
>>>> Mhm, why is that necessary?
>>> This only gets called from fence_release below. I think this is to
>>> avoid
>>> needlessly scheduling an eviction/restore cycle when an eviction fence
>>> gets destroyed that hasn't been triggered before, probably during
>>> process termination.
>>>
>>> Harish, do you remember any other reason for this?
>>>
>>>>> +    ret = dma_fence_signal_locked(f);
>>>>> +    spin_unlock_irqrestore(f->lock, flags);
>>>>> +
>>>>> +    return ret;
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * amd_kfd_fence_release - callback that fence can be freed
>>>>> + *
>>>>> + * @fence: fence
>>>>> + *
>>>>> + * This function is called when the reference count becomes zero.
>>>>> + * It just RCU schedules freeing up the fence.
>>>>> + */
>>>>> +static void amd_kfd_fence_release(struct dma_fence *f)
>>>>> +{
>>>>> +    struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
>>>>> +    /* Unconditionally signal the fence. The process is getting
>>>>> +     * terminated.
>>>>> +     */
>>>>> +    if (WARN_ON(!fence))
>>>>> +        return; /* Not an amdgpu_amdkfd_fence */
>>>>> +
>>>>> +    amd_kfd_fence_signal(f);
>>>>> +    kfree_rcu(f, rcu);
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * amd_kfd_fence_check_mm - Check if @mm is same as that of the
>>>>> fence @f
>>>>> + *  if same return TRUE else return FALSE.
>>>>> + *
>>>>> + * @f: [IN] fence
>>>>> + * @mm: [IN] mm that needs to be verified
>>>>> + */
>>>>> +bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm)
>>>>> +{
>>>>> +    struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
>>>>> +
>>>>> +    if (!fence)
>>>>> +        return false;
>>>>> +    else if (fence->mm == mm)
>>>>> +        return true;
>>>>> +
>>>>> +    return false;
>>>>> +}
>>>>> +
>>>>> +const struct dma_fence_ops amd_kfd_fence_ops = {
>>>>> +    .get_driver_name = amd_kfd_fence_get_driver_name,
>>>>> +    .get_timeline_name = amd_kfd_fence_get_timeline_name,
>>>>> +    .enable_signaling = amd_kfd_fence_enable_signaling,
>>>>> +    .signaled = NULL,
>>>>> +    .wait = dma_fence_default_wait,
>>>>> +    .release = amd_kfd_fence_release,
>>>>> +};
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> index 65d5a4e..ca00dd2 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>>> @@ -36,8 +36,9 @@
>>>>>    #define AMDGPU_MAX_UVD_ENC_RINGS    2
>>>>>      /* some special values for the owner field */
>>>>> -#define AMDGPU_FENCE_OWNER_UNDEFINED    ((void*)0ul)
>>>>> -#define AMDGPU_FENCE_OWNER_VM        ((void*)1ul)
>>>>> +#define AMDGPU_FENCE_OWNER_UNDEFINED    ((void *)0ul)
>>>>> +#define AMDGPU_FENCE_OWNER_VM        ((void *)1ul)
>>>>> +#define AMDGPU_FENCE_OWNER_KFD        ((void *)2ul)
>>>>>      #define AMDGPU_FENCE_FLAG_64BIT         (1 << 0)
>>>>>    #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>>>>> index df65c66..0cb31d9 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>>>>> @@ -31,6 +31,7 @@
>>>>>    #include <drm/drmP.h>
>>>>>    #include "amdgpu.h"
>>>>>    #include "amdgpu_trace.h"
>>>>> +#include "amdgpu_amdkfd.h"
>>>>>      struct amdgpu_sync_entry {
>>>>>        struct hlist_node    node;
>>>>> @@ -86,10 +87,18 @@ static bool amdgpu_sync_same_dev(struct
>>>>> amdgpu_device *adev,
>>>>>    static void *amdgpu_sync_get_owner(struct dma_fence *f)
>>>>>    {
>>>>>        struct drm_sched_fence *s_fence = to_drm_sched_fence(f);
>>>>> +    struct amdgpu_amdkfd_fence *kfd_fence;
>>>>> +
>>>>> +    if (!f)
>>>>> +        return AMDGPU_FENCE_OWNER_UNDEFINED;
>>>> When you add the extra NULL check here then please move the
>>>> to_drm_sched_fence() after it as well.
>>> Yeah, makes sense.
>>>
>>> Regards,
>>>    Felix
>>>
>>>> Christian.
>>>>
>>>>>          if (s_fence)
>>>>>            return s_fence->owner;
>>>>>    +    kfd_fence = to_amdgpu_amdkfd_fence(f);
>>>>> +    if (kfd_fence)
>>>>> +        return AMDGPU_FENCE_OWNER_KFD;
>>>>> +
>>>>>        return AMDGPU_FENCE_OWNER_UNDEFINED;
>>>>>    }
>>>>>    @@ -204,11 +213,18 @@ int amdgpu_sync_resv(struct amdgpu_device
>>>>> *adev,
>>>>>        for (i = 0; i < flist->shared_count; ++i) {
>>>>>            f = rcu_dereference_protected(flist->shared[i],
>>>>>                              reservation_object_held(resv));
>>>>> +        /* We only want to trigger KFD eviction fences on
>>>>> +         * evict or move jobs. Skip KFD fences otherwise.
>>>>> +         */
>>>>> +        fence_owner = amdgpu_sync_get_owner(f);
>>>>> +        if (fence_owner == AMDGPU_FENCE_OWNER_KFD &&
>>>>> +            owner != AMDGPU_FENCE_OWNER_UNDEFINED)
>>>>> +            continue;
>>>>> +
>>>>>            if (amdgpu_sync_same_dev(adev, f)) {
>>>>>                /* VM updates are only interesting
>>>>>                 * for other VM updates and moves.
>>>>>                 */
>>>>> -            fence_owner = amdgpu_sync_get_owner(f);
>>>>>                if ((owner != AMDGPU_FENCE_OWNER_UNDEFINED) &&
>>>>>                    (fence_owner != AMDGPU_FENCE_OWNER_UNDEFINED) &&
>>>>>                    ((owner == AMDGPU_FENCE_OWNER_VM) !=
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>>>> index e4bb435..c3f33d3 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>>>> @@ -46,6 +46,7 @@
>>>>>    #include "amdgpu.h"
>>>>>    #include "amdgpu_object.h"
>>>>>    #include "amdgpu_trace.h"
>>>>> +#include "amdgpu_amdkfd.h"
>>>>>    #include "bif/bif_4_1_d.h"
>>>>>      #define DRM_FILE_PAGE_OFFSET (0x100000000ULL >> PAGE_SHIFT)
>>>>> @@ -1170,6 +1171,23 @@ static bool
>>>>> amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
>>>>>    {
>>>>>        unsigned long num_pages = bo->mem.num_pages;
>>>>>        struct drm_mm_node *node = bo->mem.mm_node;
>>>>> +    struct reservation_object_list *flist;
>>>>> +    struct dma_fence *f;
>>>>> +    int i;
>>>>> +
>>>>> +    /* If bo is a KFD BO, check if the bo belongs to the current
>>>>> process.
>>>>> +     * If true, then return false as any KFD process needs all its
>>>>> BOs to
>>>>> +     * be resident to run successfully
>>>>> +     */
>>>>> +    flist = reservation_object_get_list(bo->resv);
>>>>> +    if (flist) {
>>>>> +        for (i = 0; i < flist->shared_count; ++i) {
>>>>> +            f = rcu_dereference_protected(flist->shared[i],
>>>>> +                reservation_object_held(bo->resv));
>>>>> +            if (amd_kfd_fence_check_mm(f, current->mm))
>>>>> +                return false;
>>>>> +        }
>>>>> +    }
>>>>>          switch (bo->mem.mem_type) {
>>>>>        case TTM_PL_TT:
>>>>> diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>>>>> b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>>>>> index 94eab54..9e35249 100644
>>>>> --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>>>>> +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>>>>> @@ -30,6 +30,7 @@
>>>>>      #include <linux/types.h>
>>>>>    #include <linux/bitmap.h>
>>>>> +#include <linux/dma-fence.h>
>>>>>      struct pci_dev;
>>>>>    @@ -286,6 +287,9 @@ struct kfd2kgd_calls {
>>>>>     *
>>>>>     * @resume: Notifies amdkfd about a resume action done to a kgd
>>>>> device
>>>>>     *
>>>>> + * @schedule_evict_and_restore_process: Schedules work queue that
>>>>> will prepare
>>>>> + * for safe eviction of KFD BOs that belong to the specified
>>>>> process.
>>>>> + *
>>>>>     * This structure contains function callback pointers so the kgd
>>>>> driver
>>>>>     * will notify to the amdkfd about certain status changes.
>>>>>     *
>>>>> @@ -300,6 +304,8 @@ struct kgd2kfd_calls {
>>>>>        void (*interrupt)(struct kfd_dev *kfd, const void
>>>>> *ih_ring_entry);
>>>>>        void (*suspend)(struct kfd_dev *kfd);
>>>>>        int (*resume)(struct kfd_dev *kfd);
>>>>> +    int (*schedule_evict_and_restore_process)(struct mm_struct *mm,
>>>>> +            struct dma_fence *fence);
>>>>>    };
>>>>>      int kgd2kfd_init(unsigned interface_version,
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>



More information about the amd-gfx mailing list