[PATCH 16/21] drm/amdgpu: implement lru amdgpu_queue_mgr policy for compute v3

Andres Rodriguez andresx7 at gmail.com
Fri Mar 3 17:15:09 UTC 2017



On 3/3/2017 10:20 AM, Alex Deucher wrote:
> On Fri, Mar 3, 2017 at 8:23 AM, Christian K├Ânig <deathsimple at vodafone.de> wrote:
>> Am 02.03.2017 um 22:44 schrieb Andres Rodriguez:
>>>
>>> Use an LRU policy to map usermode rings to HW compute queues.
>>>
>>> Most compute clients use one queue, and usually the first queue
>>> available. This results in poor pipe/queue work distribution when
>>> multiple compute apps are running. In most cases pipe 0 queue 0 is
>>> the only queue that gets used.
>>>
>>> In order to better distribute work across multiple HW queues, we adopt
>>> a policy to map the usermode ring ids to the LRU HW queue.
>>>
>>> This fixes a large majority of multi-app compute workloads sharing the
>>> same HW queue, even though 7 other queues are available.
>>>
>>> v2: use ring->funcs->type instead of ring->hw_ip
>>> v3: remove amdgpu_queue_mapper_funcs
>>>
>>> Signed-off-by: Andres Rodriguez <andresx7 at gmail.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  3 ++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  3 ++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_queue_mgr.c | 38 +++++++++++++++++-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c      | 57
>>> +++++++++++++++++++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |  4 ++
>>>   5 files changed, 104 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index d3f87f4..088aa4a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1501,20 +1501,23 @@ struct amdgpu_device {
>>>         struct kfd_dev          *kfd;
>>>         struct amdgpu_virt      virt;
>>>         /* link all shadow bo */
>>>         struct list_head                shadow_list;
>>>         struct mutex                    shadow_list_lock;
>>>         /* link all gtt */
>>>         spinlock_t                      gtt_list_lock;
>>>         struct list_head                gtt_list;
>>> +       /* keep an lru list of rings by HW IP */
>>> +       struct list_head                ring_lru_list;
>>> +       struct mutex                    ring_lru_list_lock;
>>
>>
>> No need for a mutex, a spinlock should do as well.
>>
>>
>>>         /* record hw reset is performed */
>>>         bool has_hw_reset;
>>>     };
>>>     static inline struct amdgpu_device *amdgpu_ttm_adev(struct
>>> ttm_bo_device *bdev)
>>>   {
>>>         return container_of(bdev, struct amdgpu_device, mman.bdev);
>>>   }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 6abb238..954e3b9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -1712,20 +1712,23 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>         spin_lock_init(&adev->gc_cac_idx_lock);
>>>         spin_lock_init(&adev->audio_endpt_idx_lock);
>>>         spin_lock_init(&adev->mm_stats.lock);
>>>         INIT_LIST_HEAD(&adev->shadow_list);
>>>         mutex_init(&adev->shadow_list_lock);
>>>         INIT_LIST_HEAD(&adev->gtt_list);
>>>         spin_lock_init(&adev->gtt_list_lock);
>>>   +     INIT_LIST_HEAD(&adev->ring_lru_list);
>>> +       mutex_init(&adev->ring_lru_list_lock);
>>> +
>>>         if (adev->asic_type >= CHIP_BONAIRE) {
>>>                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
>>>                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
>>>         } else {
>>>                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
>>>                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
>>>         }
>>>         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
>>>         if (adev->rmmio == NULL) {
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_queue_mgr.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_queue_mgr.c
>>> index cafe913..e6e4fba 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_queue_mgr.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_queue_mgr.c
>>> @@ -84,20 +84,54 @@ static int amdgpu_identity_map(struct amdgpu_device
>>> *adev,
>>>                 break;
>>>         default:
>>>                 *out_ring = NULL;
>>>                 DRM_ERROR("unknown HW IP type: %d\n", mapper->hw_ip);
>>>                 return -EINVAL;
>>>         }
>>>         return update_cached_map(mapper, ring, *out_ring);
>>>   }
>>>   +static enum amdgpu_ring_type amdgpu_hw_ip_to_ring_type(int hw_ip)
>>> +{
>>> +       switch (hw_ip) {
>>> +       case AMDGPU_HW_IP_GFX:
>>> +               return AMDGPU_RING_TYPE_GFX;
>>> +       case AMDGPU_HW_IP_COMPUTE:
>>> +               return AMDGPU_RING_TYPE_COMPUTE;
>>> +       case AMDGPU_HW_IP_DMA:
>>> +               return AMDGPU_RING_TYPE_SDMA;
>>> +       case AMDGPU_HW_IP_UVD:
>>> +               return AMDGPU_RING_TYPE_UVD;
>>> +       case AMDGPU_HW_IP_VCE:
>>> +               return AMDGPU_RING_TYPE_VCE;
>>> +       default:
>>> +               DRM_ERROR("Invalid HW IP specified %d\n", hw_ip);
>>> +               return -1;
>>> +       }
>>> +}
>>> +
>>> +static int amdgpu_lru_map(struct amdgpu_device *adev,
>>> +                         struct amdgpu_queue_mapper *mapper,
>>> +                         int user_ring,
>>> +                         struct amdgpu_ring **out_ring)
>>> +{
>>> +       int r;
>>> +       int ring_type = amdgpu_hw_ip_to_ring_type(mapper->hw_ip);
>>> +
>>> +       r = amdgpu_ring_lru_get(adev, ring_type, out_ring);
>>> +       if (r)
>>> +               return r;
>>> +
>>> +       return update_cached_map(mapper, user_ring, *out_ring);
>>> +}
>>> +
>>>   int amdgpu_queue_mgr_init(struct amdgpu_device *adev,
>>>                           struct amdgpu_queue_mgr *mgr)
>>>   {
>>>         int i;
>>>         if (!adev || !mgr)
>>>                 return -EINVAL;
>>>         memset(mgr, 0, sizeof(*mgr));
>>>   @@ -138,26 +172,28 @@ int amdgpu_queue_mgr_map(struct amdgpu_device
>>> *adev,
>>>         *out_ring = get_cached_map(mapper, ring);
>>>         if (*out_ring) {
>>>                 /* cache hit */
>>>                 r = 0;
>>>                 goto out_unlock;
>>>         }
>>>         switch (mapper->hw_ip) {
>>>         case AMDGPU_HW_IP_GFX:
>>> -       case AMDGPU_HW_IP_COMPUTE:
>>>         case AMDGPU_HW_IP_DMA:
>>>         case AMDGPU_HW_IP_UVD:
>>>         case AMDGPU_HW_IP_VCE:
>>>                 r = amdgpu_identity_map(adev, mapper, ring, out_ring);
>>>                 break;
>>> +       case AMDGPU_HW_IP_COMPUTE:
>>
>>
>> I'm pretty close to say add AMDGPU_HW_IP_DMA to that as well. What do you
>> think?
>
> Well, there are cases where you would want to utilize both upstream
> and downstream bandwidth in parallel which would require the UMD to be
> able to select two different SDMA rings.
>
> Alex
>

If we provide an explicit guarantee that the mappings will be 1:1, then 
we should be able to switch SDMA to lru as well. That would result in 
only two possible mappings:
     - 1->1, 2->2
     - 1->2, 2->1

This way whenever the UMD picks two separate rings they will always get 
separate rings.

I think there might be a corner case right now that we can hit that 
results in a many to one mapping. Let me double check that.

Once we have the 1:1 guarantee, I think there could be some improvements 
to having SDMA on LRU.

Andres

>>
>>
>>> +               r = amdgpu_lru_map(adev, mapper, ring, out_ring);
>>> +               break;
>>>         default:
>>>                 *out_ring = NULL;
>>>                 r = -EINVAL;
>>>                 DRM_ERROR("unknown HW IP type: %d\n", mapper->hw_ip);
>>>         }
>>>     out_unlock:
>>>         mutex_unlock(&mapper->lock);
>>>         return r;
>>>   }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> index 43cd539..31c6274 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> @@ -173,20 +173,22 @@ void amdgpu_ring_commit(struct amdgpu_ring *ring)
>>>         count = ring->funcs->align_mask + 1 -
>>>                 (ring->wptr & ring->funcs->align_mask);
>>>         count %= ring->funcs->align_mask + 1;
>>>         ring->funcs->insert_nop(ring, count);
>>>         mb();
>>>         amdgpu_ring_set_wptr(ring);
>>>         if (ring->funcs->end_use)
>>>                 ring->funcs->end_use(ring);
>>> +
>>> +       amdgpu_ring_lru_touch(ring->adev, ring);
>>>   }
>>>     /**
>>>    * amdgpu_ring_undo - reset the wptr
>>>    *
>>>    * @ring: amdgpu_ring structure holding ring information
>>>    *
>>>    * Reset the driver's copy of the wptr (all asics).
>>>    */
>>>   void amdgpu_ring_undo(struct amdgpu_ring *ring)
>>> @@ -272,20 +274,22 @@ int amdgpu_ring_init(struct amdgpu_device *adev,
>>> struct amdgpu_ring *ring,
>>>                                             &ring->gpu_addr,
>>>                                             (void **)&ring->ring);
>>>                 if (r) {
>>>                         dev_err(adev->dev, "(%d) ring create failed\n",
>>> r);
>>>                         return r;
>>>                 }
>>>                 memset((void *)ring->ring, 0, ring->ring_size);
>>>         }
>>>         ring->ptr_mask = (ring->ring_size / 4) - 1;
>>>         ring->max_dw = max_dw;
>>> +       INIT_LIST_HEAD(&ring->lru_list);
>>> +       amdgpu_ring_lru_touch(adev, ring);
>>>         if (amdgpu_debugfs_ring_init(adev, ring)) {
>>>                 DRM_ERROR("Failed to register debugfs file for rings
>>> !\n");
>>>         }
>>>         return 0;
>>>   }
>>>     /**
>>>    * amdgpu_ring_fini - tear down the driver ring struct.
>>>    *
>>> @@ -305,20 +309,73 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring)
>>>         amdgpu_bo_free_kernel(&ring->ring_obj,
>>>                               &ring->gpu_addr,
>>>                               (void **)&ring->ring);
>>>         amdgpu_debugfs_ring_fini(ring);
>>>         ring->adev->rings[ring->idx] = NULL;
>>>   }
>>>   +/**
>>> + * amdgpu_ring_lru_get - get the least recently used ring for a HW IP
>>> block
>>> + *
>>> + * @adev: amdgpu_device pointer
>>> + * @type: amdgpu_ring_type enum
>>> + * @ring: output ring
>>> + *
>>> + * Retrieve the amdgpu_ring structure for the least recently used ring of
>>> + * a specific IP block (all asics).
>>> + * Returns 0 on success, error on failure.
>>> + */
>>> +int amdgpu_ring_lru_get(struct amdgpu_device *adev, int type,
>>> +                       struct amdgpu_ring **ring)
>>> +{
>>> +       struct amdgpu_ring *entry;
>>> +
>>> +       /* List is sorted in LRU order, find first entry corresponding
>>> +        * to the desired HW IP */
>>> +       *ring = NULL;
>>> +       mutex_lock(&adev->ring_lru_list_lock);
>>> +       list_for_each_entry(entry, &adev->ring_lru_list, lru_list) {
>>> +               if (entry->funcs->type == type) {
>>> +                       *ring = entry;
>>> +                       break;
>>> +               }
>>> +       }
>>> +       mutex_unlock(&adev->ring_lru_list_lock);
>>> +
>>> +       if (!*ring) {
>>> +               DRM_ERROR("Ring LRU contains no entries for ring
>>> type:%d\n", type);
>>> +               return -EINVAL;
>>> +       }
>>> +
>>> +       amdgpu_ring_lru_touch(adev, entry);
>>
>>
>> That takes the LRU lock twice, but do this only once.
>>
>> Regards,
>> Christian.
>>
>>
>>> +       return 0;
>>> +}
>>> +
>>> +/**
>>> + * amdgpu_ring_lru_touch - mark a ring as recently being used
>>> + *
>>> + * @adev: amdgpu_device pointer
>>> + * @ring: ring to touch
>>> + *
>>> + * Move @ring to the tail of the lru list
>>> + */
>>> +void amdgpu_ring_lru_touch(struct amdgpu_device *adev, struct amdgpu_ring
>>> *ring)
>>> +{
>>> +       /* list_move_tail handles the case where ring isn't part of the
>>> list */
>>> +       mutex_lock(&adev->ring_lru_list_lock);
>>> +       list_move_tail(&ring->lru_list, &adev->ring_lru_list);
>>> +       mutex_unlock(&adev->ring_lru_list_lock);
>>> +}
>>> +
>>>   /*
>>>    * Debugfs info
>>>    */
>>>   #if defined(CONFIG_DEBUG_FS)
>>>     /* Layout of file is 12 bytes consisting of
>>>    * - rptr
>>>    * - wptr
>>>    * - driver's copy of wptr
>>>    *
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index 35da5c5..b51bdcd 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -137,20 +137,21 @@ struct amdgpu_ring_funcs {
>>>         void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>>         void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>>         void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t
>>> val);
>>>   };
>>>     struct amdgpu_ring {
>>>         struct amdgpu_device            *adev;
>>>         const struct amdgpu_ring_funcs  *funcs;
>>>         struct amdgpu_fence_driver      fence_drv;
>>>         struct amd_gpu_scheduler        sched;
>>> +       struct list_head                lru_list;
>>>         struct amdgpu_bo        *ring_obj;
>>>         volatile uint32_t       *ring;
>>>         unsigned                rptr_offs;
>>>         unsigned                wptr;
>>>         unsigned                wptr_old;
>>>         unsigned                ring_size;
>>>         unsigned                max_dw;
>>>         int                     count_dw;
>>>         uint64_t                gpu_addr;
>>> @@ -179,12 +180,15 @@ int amdgpu_ring_is_valid_index(struct amdgpu_device
>>> *adev,
>>>                                int hw_ip, int ring);
>>>   int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
>>>   void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
>>>   void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct
>>> amdgpu_ib *ib);
>>>   void amdgpu_ring_commit(struct amdgpu_ring *ring);
>>>   void amdgpu_ring_undo(struct amdgpu_ring *ring);
>>>   int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring
>>> *ring,
>>>                      unsigned ring_size, struct amdgpu_irq_src *irq_src,
>>>                      unsigned irq_type);
>>>   void amdgpu_ring_fini(struct amdgpu_ring *ring);
>>> +int amdgpu_ring_lru_get(struct amdgpu_device *adev, int hw_ip,
>>> +                       struct amdgpu_ring **ring);
>>> +void amdgpu_ring_lru_touch(struct amdgpu_device *adev, struct amdgpu_ring
>>> *ring);
>>>     #endif
>>
>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


More information about the amd-gfx mailing list