[PATCH 16/21] drm/amdgpu: implement lru amdgpu_queue_mgr policy for compute v3

Fri Mar 3 15:20:01 UTC 2017

On Fri, Mar 3, 2017 at 8:23 AM, Christian König <deathsimple at vodafone.de> wrote:
> Am 02.03.2017 um 22:44 schrieb Andres Rodriguez:
>>
>> Use an LRU policy to map usermode rings to HW compute queues.
>>
>> Most compute clients use one queue, and usually the first queue
>> available. This results in poor pipe/queue work distribution when
>> multiple compute apps are running. In most cases pipe 0 queue 0 is
>> the only queue that gets used.
>>
>> In order to better distribute work across multiple HW queues, we adopt
>> a policy to map the usermode ring ids to the LRU HW queue.
>>
>> This fixes a large majority of multi-app compute workloads sharing the
>> same HW queue, even though 7 other queues are available.
>>
>> v2: use ring->funcs->type instead of ring->hw_ip
>> v3: remove amdgpu_queue_mapper_funcs
>>
>> Signed-off-by: Andres Rodriguez <andresx7 at gmail.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  3 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  3 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_queue_mgr.c | 38 +++++++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c      | 57
>> +++++++++++++++++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h      |  4 ++
>>   5 files changed, 104 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index d3f87f4..088aa4a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1501,20 +1501,23 @@ struct amdgpu_device {
>>         struct kfd_dev          *kfd;
>>         struct amdgpu_virt      virt;
>>         /* link all shadow bo */
>>         struct list_head                shadow_list;
>>         struct mutex                    shadow_list_lock;
>>         /* link all gtt */
>>         spinlock_t                      gtt_list_lock;
>>         struct list_head                gtt_list;
>> +       /* keep an lru list of rings by HW IP */
>> +       struct list_head                ring_lru_list;
>> +       struct mutex                    ring_lru_list_lock;
>
>
> No need for a mutex, a spinlock should do as well.
>
>
>>         /* record hw reset is performed */
>>         bool has_hw_reset;
>>     };
>>     static inline struct amdgpu_device *amdgpu_ttm_adev(struct
>> ttm_bo_device *bdev)
>>   {
>>         return container_of(bdev, struct amdgpu_device, mman.bdev);
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 6abb238..954e3b9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -1712,20 +1712,23 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>         spin_lock_init(&adev->gc_cac_idx_lock);
>>         spin_lock_init(&adev->audio_endpt_idx_lock);
>>         spin_lock_init(&adev->mm_stats.lock);
>>         INIT_LIST_HEAD(&adev->shadow_list);
>>         mutex_init(&adev->shadow_list_lock);
>>         INIT_LIST_HEAD(&adev->gtt_list);
>>         spin_lock_init(&adev->gtt_list_lock);
>>   +     INIT_LIST_HEAD(&adev->ring_lru_list);
>> +       mutex_init(&adev->ring_lru_list_lock);
>> +
>>         if (adev->asic_type >= CHIP_BONAIRE) {
>>                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
>>                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
>>         } else {
>>                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
>>                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
>>         }
>>         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
>>         if (adev->rmmio == NULL) {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_queue_mgr.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_queue_mgr.c
>> index cafe913..e6e4fba 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_queue_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_queue_mgr.c
>> @@ -84,20 +84,54 @@ static int amdgpu_identity_map(struct amdgpu_device
>> *adev,
>>                 break;
>>         default:
>>                 *out_ring = NULL;
>>                 DRM_ERROR("unknown HW IP type: %d\n", mapper->hw_ip);
>>                 return -EINVAL;
>>         }
>>         return update_cached_map(mapper, ring, *out_ring);
>>   }
>>   +static enum amdgpu_ring_type amdgpu_hw_ip_to_ring_type(int hw_ip)
>> +{
>> +       switch (hw_ip) {
>> +       case AMDGPU_HW_IP_GFX:
>> +               return AMDGPU_RING_TYPE_GFX;
>> +       case AMDGPU_HW_IP_COMPUTE:
>> +               return AMDGPU_RING_TYPE_COMPUTE;
>> +       case AMDGPU_HW_IP_DMA:
>> +               return AMDGPU_RING_TYPE_SDMA;
>> +       case AMDGPU_HW_IP_UVD:
>> +               return AMDGPU_RING_TYPE_UVD;
>> +       case AMDGPU_HW_IP_VCE:
>> +               return AMDGPU_RING_TYPE_VCE;
>> +       default:
>> +               DRM_ERROR("Invalid HW IP specified %d\n", hw_ip);
>> +               return -1;
>> +       }
>> +}
>> +
>> +static int amdgpu_lru_map(struct amdgpu_device *adev,
>> +                         struct amdgpu_queue_mapper *mapper,
>> +                         int user_ring,
>> +                         struct amdgpu_ring **out_ring)
>> +{
>> +       int r;
>> +       int ring_type = amdgpu_hw_ip_to_ring_type(mapper->hw_ip);
>> +
>> +       r = amdgpu_ring_lru_get(adev, ring_type, out_ring);
>> +       if (r)
>> +               return r;
>> +
>> +       return update_cached_map(mapper, user_ring, *out_ring);
>> +}
>> +
>>   int amdgpu_queue_mgr_init(struct amdgpu_device *adev,
>>                           struct amdgpu_queue_mgr *mgr)
>>   {
>>         int i;
>>         if (!adev || !mgr)
>>                 return -EINVAL;
>>         memset(mgr, 0, sizeof(*mgr));
>>   @@ -138,26 +172,28 @@ int amdgpu_queue_mgr_map(struct amdgpu_device
>> *adev,
>>         *out_ring = get_cached_map(mapper, ring);
>>         if (*out_ring) {
>>                 /* cache hit */
>>                 r = 0;
>>                 goto out_unlock;
>>         }
>>         switch (mapper->hw_ip) {
>>         case AMDGPU_HW_IP_GFX:
>> -       case AMDGPU_HW_IP_COMPUTE:
>>         case AMDGPU_HW_IP_DMA:
>>         case AMDGPU_HW_IP_UVD:
>>         case AMDGPU_HW_IP_VCE:
>>                 r = amdgpu_identity_map(adev, mapper, ring, out_ring);
>>                 break;
>> +       case AMDGPU_HW_IP_COMPUTE:
>
>
> I'm pretty close to saying we should add AMDGPU_HW_IP_DMA to that as well.
> What do you think?

Well, there are cases where you would want to utilize both upstream
and downstream bandwidth in parallel, which would require the UMD to be
able to select two different SDMA rings.

Alex

>
>
>> +               r = amdgpu_lru_map(adev, mapper, ring, out_ring);
>> +               break;
>>         default:
>>                 *out_ring = NULL;
>>                 r = -EINVAL;
>>                 DRM_ERROR("unknown HW IP type: %d\n", mapper->hw_ip);
>>         }
>>     out_unlock:
>>         mutex_unlock(&mapper->lock);
>>         return r;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> index 43cd539..31c6274 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>> @@ -173,20 +173,22 @@ void amdgpu_ring_commit(struct amdgpu_ring *ring)
>>         count = ring->funcs->align_mask + 1 -
>>                 (ring->wptr & ring->funcs->align_mask);
>>         count %= ring->funcs->align_mask + 1;
>>         ring->funcs->insert_nop(ring, count);
>>         mb();
>>         amdgpu_ring_set_wptr(ring);
>>         if (ring->funcs->end_use)
>>                 ring->funcs->end_use(ring);
>> +
>> +       amdgpu_ring_lru_touch(ring->adev, ring);
>>   }
>>     /**
>>    * amdgpu_ring_undo - reset the wptr
>>    *
>>    * @ring: amdgpu_ring structure holding ring information
>>    *
>>    * Reset the driver's copy of the wptr (all asics).
>>    */
>>   void amdgpu_ring_undo(struct amdgpu_ring *ring)
>> @@ -272,20 +274,22 @@ int amdgpu_ring_init(struct amdgpu_device *adev,
>> struct amdgpu_ring *ring,
>>                                             &ring->gpu_addr,
>>                                             (void **)&ring->ring);
>>                 if (r) {
>>                         dev_err(adev->dev, "(%d) ring create failed\n",
>> r);
>>                         return r;
>>                 }
>>                 memset((void *)ring->ring, 0, ring->ring_size);
>>         }
>>         ring->ptr_mask = (ring->ring_size / 4) - 1;
>>         ring->max_dw = max_dw;
>> +       INIT_LIST_HEAD(&ring->lru_list);
>> +       amdgpu_ring_lru_touch(adev, ring);
>>         if (amdgpu_debugfs_ring_init(adev, ring)) {
>>                 DRM_ERROR("Failed to register debugfs file for rings
>> !\n");
>>         }
>>         return 0;
>>   }
>>     /**
>>    * amdgpu_ring_fini - tear down the driver ring struct.
>>    *
>> @@ -305,20 +309,73 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring)
>>         amdgpu_bo_free_kernel(&ring->ring_obj,
>>                               &ring->gpu_addr,
>>                               (void **)&ring->ring);
>>         amdgpu_debugfs_ring_fini(ring);
>>         ring->adev->rings[ring->idx] = NULL;
>>   }
>>   +/**
>> + * amdgpu_ring_lru_get - get the least recently used ring for a HW IP
>> block
>> + *
>> + * @adev: amdgpu_device pointer
>> + * @type: amdgpu_ring_type enum
>> + * @ring: output ring
>> + *
>> + * Retrieve the amdgpu_ring structure for the least recently used ring of
>> + * a specific IP block (all asics).
>> + * Returns 0 on success, error on failure.
>> + */
>> +int amdgpu_ring_lru_get(struct amdgpu_device *adev, int type,
>> +                       struct amdgpu_ring **ring)
>> +{
>> +       struct amdgpu_ring *entry;
>> +
>> +       /* List is sorted in LRU order, find first entry corresponding
>> +        * to the desired HW IP */
>> +       *ring = NULL;
>> +       mutex_lock(&adev->ring_lru_list_lock);
>> +       list_for_each_entry(entry, &adev->ring_lru_list, lru_list) {
>> +               if (entry->funcs->type == type) {
>> +                       *ring = entry;
>> +                       break;
>> +               }
>> +       }
>> +       mutex_unlock(&adev->ring_lru_list_lock);
>> +
>> +       if (!*ring) {
>> +               DRM_ERROR("Ring LRU contains no entries for ring
>> type:%d\n", type);
>> +               return -EINVAL;
>> +       }
>> +
>> +       amdgpu_ring_lru_touch(adev, entry);
>
>
> That takes the LRU lock twice; please take it only once.
>
> Regards,
> Christian.
>
>
>> +       return 0;
>> +}
>> +
>> +/**
>> + * amdgpu_ring_lru_touch - mark a ring as recently being used
>> + *
>> + * @adev: amdgpu_device pointer
>> + * @ring: ring to touch
>> + *
>> + * Move @ring to the tail of the lru list
>> + */
>> +void amdgpu_ring_lru_touch(struct amdgpu_device *adev, struct amdgpu_ring
>> *ring)
>> +{
>> +       /* list_move_tail handles the case where ring isn't part of the
>> list */
>> +       mutex_lock(&adev->ring_lru_list_lock);
>> +       list_move_tail(&ring->lru_list, &adev->ring_lru_list);
>> +       mutex_unlock(&adev->ring_lru_list_lock);
>> +}
>> +
>>   /*
>>    * Debugfs info
>>    */
>>   #if defined(CONFIG_DEBUG_FS)
>>     /* Layout of file is 12 bytes consisting of
>>    * - rptr
>>    * - wptr
>>    * - driver's copy of wptr
>>    *
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index 35da5c5..b51bdcd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -137,20 +137,21 @@ struct amdgpu_ring_funcs {
>>         void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>         void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>         void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t
>> val);
>>   };
>>     struct amdgpu_ring {
>>         struct amdgpu_device            *adev;
>>         const struct amdgpu_ring_funcs  *funcs;
>>         struct amdgpu_fence_driver      fence_drv;
>>         struct amd_gpu_scheduler        sched;
>> +       struct list_head                lru_list;
>>         struct amdgpu_bo        *ring_obj;
>>         volatile uint32_t       *ring;
>>         unsigned                rptr_offs;
>>         unsigned                wptr;
>>         unsigned                wptr_old;
>>         unsigned                ring_size;
>>         unsigned                max_dw;
>>         int                     count_dw;
>>         uint64_t                gpu_addr;
>> @@ -179,12 +180,15 @@ int amdgpu_ring_is_valid_index(struct amdgpu_device
>> *adev,
>>                                int hw_ip, int ring);
>>   int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
>>   void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
>>   void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct
>> amdgpu_ib *ib);
>>   void amdgpu_ring_commit(struct amdgpu_ring *ring);
>>   void amdgpu_ring_undo(struct amdgpu_ring *ring);
>>   int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring
>> *ring,
>>                      unsigned ring_size, struct amdgpu_irq_src *irq_src,
>>                      unsigned irq_type);
>>   void amdgpu_ring_fini(struct amdgpu_ring *ring);
>> +int amdgpu_ring_lru_get(struct amdgpu_device *adev, int hw_ip,
>> +                       struct amdgpu_ring **ring);
>> +void amdgpu_ring_lru_touch(struct amdgpu_device *adev, struct amdgpu_ring
>> *ring);
>>     #endif
>
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx