[PATCH 19/22] drm/amdgpu: add framework for HW specific priority settings

Andres Rodriguez andresx7 at gmail.com
Wed Mar 1 17:44:09 UTC 2017



On 2017-03-01 10:49 AM, Alex Deucher wrote:
> On Wed, Mar 1, 2017 at 2:27 AM, zhoucm1 <david1.zhou at amd.com> wrote:
>>
>> On 2017年03月01日 06:14, Andres Rodriguez wrote:
>>> Add an initial framework for changing the HW priorities of rings. The
>>> framework allows requesting priority changes for the lifetime of an
>>> amdgpu_job. After the job completes the priority will decay to the next
>>> lowest priority for which a request is still valid.
>>>
>>> A new ring function set_priority() can now be populated to take care of
>>> the HW specific programming sequence for priority changes.
>>>
>>> Signed-off-by: Andres Rodriguez <andresx7 at gmail.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  4 +-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   |  2 +-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  | 10 ++++-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 71
>>> ++++++++++++++++++++++++++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 11 +++++
>>>    5 files changed, 94 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 366f6d3..0676495 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -636,21 +636,21 @@ struct amdgpu_flip_work {
>>>    struct amdgpu_ib {
>>>          struct amdgpu_sa_bo             *sa_bo;
>>>          uint32_t                        length_dw;
>>>          uint64_t                        gpu_addr;
>>>          uint32_t                        *ptr;
>>>          uint32_t                        flags;
>>>    };
>>>      extern const struct amd_sched_backend_ops amdgpu_sched_ops;
>>>    -int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>> +int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, int
>>> priority,
>>>                       struct amdgpu_job **job, struct amdgpu_vm *vm);
>>>    int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
>>>                               struct amdgpu_job **job);
>>>      void amdgpu_job_free_resources(struct amdgpu_job *job);
>>>    void amdgpu_job_free(struct amdgpu_job *job);
>>>    int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring,
>>>                        struct amd_sched_entity *entity, void *owner,
>>>                        struct dma_fence **f);
>>>    @@ -990,20 +990,22 @@ struct amdgpu_cs_parser {
>>>    #define AMDGPU_VM_DOMAIN                    (1 << 3) /* bit set means in
>>> virtual memory context */
>>>      struct amdgpu_job {
>>>          struct amd_sched_job    base;
>>>          struct amdgpu_device    *adev;
>>>          struct amdgpu_vm        *vm;
>>>          struct amdgpu_ring      *ring;
>>>          struct amdgpu_sync      sync;
>>>          struct amdgpu_ib        *ibs;
>>>          struct dma_fence        *fence; /* the hw fence */
>>> +       struct dma_fence_cb     cb;
>>> +       int                     priority;
>>>          uint32_t                preamble_status;
>>>          uint32_t                num_ibs;
>>>          void                    *owner;
>>>          uint64_t                fence_ctx; /* the fence_context this job
>>> uses */
>>>          bool                    vm_needs_flush;
>>>          unsigned                vm_id;
>>>          uint64_t                vm_pd_addr;
>>>          uint32_t                gds_base, gds_size;
>>>          uint32_t                gws_base, gws_size;
>>>          uint32_t                oa_base, oa_size;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 605d40e..19ce202 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -179,21 +179,21 @@ int amdgpu_cs_parser_init(struct amdgpu_cs_parser
>>> *p, void *data)
>>>                  case AMDGPU_CHUNK_ID_DEPENDENCIES:
>>>                          break;
>>>                  default:
>>>                          ret = -EINVAL;
>>>                          goto free_partial_kdata;
>>>                  }
>>>          }
>>>    -     ret = amdgpu_job_alloc(p->adev, num_ibs, &p->job, vm);
>>> +       ret = amdgpu_job_alloc(p->adev, num_ibs, p->ctx->priority,
>>> &p->job, vm);
>>>          if (ret)
>>>                  goto free_all_kdata;
>>>          if (p->uf_entry.robj)
>>>                  p->job->uf_addr = uf_offset;
>>>          kfree(chunk_array);
>>>          return 0;
>>>      free_all_kdata:
>>>          i = p->nchunks - 1;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index 86a1242..45b3c90 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -32,50 +32,51 @@ static void amdgpu_job_timedout(struct amd_sched_job
>>> *s_job)
>>>    {
>>>          struct amdgpu_job *job = container_of(s_job, struct amdgpu_job,
>>> base);
>>>          DRM_ERROR("ring %s timeout, last signaled seq=%u, last emitted
>>> seq=%u\n",
>>>                    job->base.sched->name,
>>>                    atomic_read(&job->ring->fence_drv.last_seq),
>>>                    job->ring->fence_drv.sync_seq);
>>>          amdgpu_gpu_reset(job->adev);
>>>    }
>>>    -int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>> +int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, int
>>> priority,
>>>                       struct amdgpu_job **job, struct amdgpu_vm *vm)
>>>    {
>>>          size_t size = sizeof(struct amdgpu_job);
>>>          if (num_ibs == 0)
>>>                  return -EINVAL;
>>>          size += sizeof(struct amdgpu_ib) * num_ibs;
>>>          *job = kzalloc(size, GFP_KERNEL);
>>>          if (!*job)
>>>                  return -ENOMEM;
>>>          (*job)->adev = adev;
>>>          (*job)->vm = vm;
>>> +       (*job)->priority = priority;
>>>          (*job)->ibs = (void *)&(*job)[1];
>>>          (*job)->num_ibs = num_ibs;
>>>          amdgpu_sync_create(&(*job)->sync);
>>>          return 0;
>>>    }
>>>      int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned
>>> size,
>>>                               struct amdgpu_job **job)
>>>    {
>>>          int r;
>>>    -     r = amdgpu_job_alloc(adev, 1, job, NULL);
>>> +       r = amdgpu_job_alloc(adev, 1, AMDGPU_CTX_PRIORITY_NORMAL, job,
>>> NULL);
>>>          if (r)
>>>                  return r;
>>>          r = amdgpu_ib_get(adev, NULL, size, &(*job)->ibs[0]);
>>>          if (r)
>>>                  kfree(*job);
>>>          return r;
>>>    }
>>>    @@ -170,20 +171,25 @@ static struct dma_fence *amdgpu_job_run(struct
>>> amd_sched_job *sched_job)
>>>          BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
>>>          trace_amdgpu_sched_run_job(job);
>>>          r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job,
>>> &fence);
>>>          if (r)
>>>                  DRM_ERROR("Error scheduling IBs (%d)\n", r);
>>>          /* if gpu reset, hw fence will be replaced here */
>>>          dma_fence_put(job->fence);
>>>          job->fence = dma_fence_get(fence);
>>> +
>>> +       r = amdgpu_ring_elevate_priority(job->ring, job->priority, job);
>>> +       if (r)
>>> +               DRM_ERROR("Failed to set job priority (%d)\n", r);
>> elevate priority of ring is for this job, right?
>> you're setting priority by mmio method and restore priority by fence cb,
>> your method isn't precise. I have some comments:
>> 1. seems we should elevate_priority before amdgpu_ib_schedule, so that the
>> ring is proper priority for this submission.
Yeah thanks for the catch on that. This might result in the first 
packets being emitted with a new_wave_priority lower than desired and 
introduce extra delay.
>> 2. can we put setting of registers of SPI into ring buffer as a command
>> frame? evevate_priority setting is front of a command frame and
>> restore_priority is end of frame.
We can't do it on the ring because we need any work that is already 
committed to the ring the inherit the highest priority. Otherwise the 
high priority work would have to wait for the regular priority work (to 
complete at regular priority latency) before it can begin.

For more details see my reply to Christian's email regarding this topic.
> We should probably check if changing the queue priority requires that
> the engine be idle as well.  I vaguely recall there being a packet for
> some of this that was added for HSA.
+Jay

Hey Jay,

Do you have more details on this?

I wrote these patches under the assumption that changing the priority 
while the engine is busy was allowed, but that it would not have any 
effect for waves that have already been emitted.

Would changing the priorities while the engines are active cause any 
hangs or bad side effects?
>
> Alex
>
>> 3. if no priority exchanges, we can skip setting for priority for command
>> submission.
I originally had a check that skipped set_priority() if the priorities 
matched, but I decided to remove it.

The reason is that while no changes to the HW state may need to happen, 
the implementation may need to track extra metadata about who is 
executing high priority work on the ring. Therefore whether the 
operation is a noop or not should be decided by the specific implementation.

Having said that, I think gfx_v8_0_ring_set_priority_compute() can be 
improved to skip more work. So I'll apply your suggestion there.

>>
>> Regards,
>> David Zhou
>>
>>> +
>>>          amdgpu_job_free_resources(job);
>>>          return fence;
>>>    }
>>>      const struct amd_sched_backend_ops amdgpu_sched_ops = {
>>>          .dependency = amdgpu_job_dependency,
>>>          .run_job = amdgpu_job_run,
>>>          .timedout_job = amdgpu_job_timedout,
>>>          .free_job = amdgpu_job_free_cb
>>>    };
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> index 80cb051..12bc7a9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> @@ -192,20 +192,89 @@ void amdgpu_ring_commit(struct amdgpu_ring *ring)
>>>     * Reset the driver's copy of the wptr (all asics).
>>>     */
>>>    void amdgpu_ring_undo(struct amdgpu_ring *ring)
>>>    {
>>>          ring->wptr = ring->wptr_old;
>>>          if (ring->funcs->end_use)
>>>                  ring->funcs->end_use(ring);
>>>    }
>>>    +static void amdgpu_ring_restore_priority_cb(struct dma_fence *f,
>>> +                                           struct dma_fence_cb *cb)
>>> +{
>>> +       int i;
>>> +       struct amdgpu_job *cb_job =
>>> +               container_of(cb, struct amdgpu_job, cb);
>>> +       struct amdgpu_ring *ring = cb_job->ring;
>>> +
>>> +       spin_lock(&ring->priority_lock);
>>> +
>>> +       /* remove ourselves from the list if necessary */
>>> +       if (cb_job == ring->last_job[cb_job->priority])
>>> +               ring->last_job[cb_job->priority] = NULL;
>>> +
>>> +       /* something higher prio is executing, no need to decay */
>>> +       if (ring->priority > cb_job->priority)
>>> +               goto out_unlock;
>>> +
>>> +       /* decay priority to the next level with a job available */
>>> +       for (i = cb_job->priority; i >= 0; i--) {
>>> +               if (i == AMDGPU_CTX_PRIORITY_NORMAL || ring->last_job[i])
>>> {
>>> +                       ring->priority = i;
>>> +                       if (ring->funcs->set_priority)
>>> +                               ring->funcs->set_priority(ring, i);
>>> +
>>> +                       break;
>>> +               }
>>> +       }
>>> +
>>> +out_unlock:
>>> +       spin_unlock(&ring->priority_lock);
>>> +}
>>> +
>>> +/**
>>> + * amdgpu_ring_elevate_priority - change the ring's priority
>>> + *
>>> + * @ring: amdgpu_ring structure holding the information
>>> + * @priority: target priority
>>> + * @job: priority should remain elevated for the duration of this job
>>> + *
>>> + * Use HW specific mechanism's to elevate the ring's priority while @job
>>> + * is executing. Once @job finishes executing, the ring will reset back
>>> + * to normal priority.
>>> + * Returns 0 on success, error otherwise
>>> + */
>>> +int amdgpu_ring_elevate_priority(struct amdgpu_ring *ring, int priority,
>>> +                                struct amdgpu_job *job)
>>> +{
>>> +       if (priority < 0 || priority >= AMDGPU_CTX_PRIORITY_NUM)
>>> +               return -EINVAL;
>>> +
>>> +       spin_lock(&ring->priority_lock);
>>> +       ring->last_job[priority] = job;
>>> +
>>> +       if (priority <= ring->priority)
>>> +               goto out_unlock;
>>> +
>>> +       ring->priority = priority;
>>> +       if (ring->funcs->set_priority)
>>> +               ring->funcs->set_priority(ring, priority);
>>> +
>>> +       dma_fence_add_callback(job->fence, &job->cb,
>>> +                              amdgpu_ring_restore_priority_cb);
>>> +
>>> +out_unlock:
>>> +       spin_unlock(&ring->priority_lock);
>>> +       return 0;
>>> +}
>>> +
>>>    /**
>>>     * amdgpu_ring_init - init driver ring struct.
>>>     *
>>>     * @adev: amdgpu_device pointer
>>>     * @ring: amdgpu_ring structure holding ring information
>>>     * @max_ndw: maximum number of dw for ring alloc
>>>     * @nop: nop packet for this ring
>>>     *
>>>     * Initialize the driver information for the selected ring (all asics).
>>>     * Returns 0 on success, error on failure.
>>> @@ -275,20 +344,22 @@ int amdgpu_ring_init(struct amdgpu_device *adev,
>>> struct amdgpu_ring *ring,
>>>                                              (void **)&ring->ring);
>>>                  if (r) {
>>>                          dev_err(adev->dev, "(%d) ring create failed\n",
>>> r);
>>>                          return r;
>>>                  }
>>>                  memset((void *)ring->ring, 0, ring->ring_size);
>>>          }
>>>          ring->ptr_mask = (ring->ring_size / 4) - 1;
>>>          ring->max_dw = max_dw;
>>>          ring->hw_ip = hw_ip;
>>> +       ring->priority = AMDGPU_CTX_PRIORITY_NORMAL;
>>> +       spin_lock_init(&ring->priority_lock);
>>>          INIT_LIST_HEAD(&ring->lru_list);
>>>          amdgpu_ring_lru_touch(adev, ring);
>>>          if (amdgpu_debugfs_ring_init(adev, ring)) {
>>>                  DRM_ERROR("Failed to register debugfs file for rings
>>> !\n");
>>>          }
>>>          return 0;
>>>    }
>>>      /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index ecdd87c..befc29f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -17,20 +17,21 @@
>>>     * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES
>>> OR
>>>     * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>>>     * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>>>     * OTHER DEALINGS IN THE SOFTWARE.
>>>     *
>>>     * Authors: Christian König
>>>     */
>>>    #ifndef __AMDGPU_RING_H__
>>>    #define __AMDGPU_RING_H__
>>>    +#include <drm/amdgpu_drm.h>
>>>    #include "gpu_scheduler.h"
>>>      /* max number of rings */
>>>    #define AMDGPU_MAX_RINGS              16
>>>    #define AMDGPU_MAX_GFX_RINGS          1
>>>    #define AMDGPU_MAX_COMPUTE_RINGS      8
>>>    #define AMDGPU_MAX_VCE_RINGS          3
>>>      /* some special values for the owner field */
>>>    #define AMDGPU_FENCE_OWNER_UNDEFINED  ((void*)0ul)
>>> @@ -130,20 +131,22 @@ struct amdgpu_ring_funcs {
>>>          void (*pad_ib)(struct amdgpu_ring *ring, struct amdgpu_ib *ib);
>>>          unsigned (*init_cond_exec)(struct amdgpu_ring *ring);
>>>          void (*patch_cond_exec)(struct amdgpu_ring *ring, unsigned
>>> offset);
>>>          /* note usage for clock and power gating */
>>>          void (*begin_use)(struct amdgpu_ring *ring);
>>>          void (*end_use)(struct amdgpu_ring *ring);
>>>          void (*emit_switch_buffer) (struct amdgpu_ring *ring);
>>>          void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>>>          void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg);
>>>          void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t
>>> val);
>>> +       /* priority functions */
>>> +       void (*set_priority) (struct amdgpu_ring *ring, int priority);
>>>    };
>>>      struct amdgpu_ring {
>>>          struct amdgpu_device            *adev;
>>>          const struct amdgpu_ring_funcs  *funcs;
>>>          struct amdgpu_fence_driver      fence_drv;
>>>          struct amd_gpu_scheduler        sched;
>>>          struct list_head                lru_list;
>>>          struct amdgpu_bo        *ring_obj;
>>> @@ -165,31 +168,39 @@ struct amdgpu_ring {
>>>          struct amdgpu_bo        *mqd_obj;
>>>          u32                     doorbell_index;
>>>          bool                    use_doorbell;
>>>          unsigned                wptr_offs;
>>>          unsigned                fence_offs;
>>>          uint64_t                current_ctx;
>>>          char                    name[16];
>>>          unsigned                cond_exe_offs;
>>>          u64                     cond_exe_gpu_addr;
>>>          volatile u32            *cond_exe_cpu_addr;
>>> +
>>> +       spinlock_t              priority_lock;
>>> +       /* protected by priority_lock */
>>> +       struct amdgpu_job       *last_job[AMDGPU_CTX_PRIORITY_NUM];
>>> +       int                     priority;
>>> +
>>>    #if defined(CONFIG_DEBUG_FS)
>>>          struct dentry *ent;
>>>    #endif
>>>    };
>>>      int amdgpu_ring_is_valid_index(struct amdgpu_device *adev,
>>>                                 int hw_ip, int ring);
>>>    int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
>>>    void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count);
>>>    void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct
>>> amdgpu_ib *ib);
>>>    void amdgpu_ring_commit(struct amdgpu_ring *ring);
>>>    void amdgpu_ring_undo(struct amdgpu_ring *ring);
>>> +int amdgpu_ring_elevate_priority(struct amdgpu_ring *ring, int priority,
>>> +                                struct amdgpu_job *job);
>>>    int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring
>>> *ring,
>>>                       int hw_ip, unsigned ring_size,
>>>                       struct amdgpu_irq_src *irq_src, unsigned irq_type);
>>>    void amdgpu_ring_fini(struct amdgpu_ring *ring);
>>>    int amdgpu_ring_lru_get(struct amdgpu_device *adev, int hw_ip,
>>>                          struct amdgpu_ring **ring);
>>>    void amdgpu_ring_lru_touch(struct amdgpu_device *adev, struct
>>> amdgpu_ring *ring);
>>>      #endif
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx



More information about the amd-gfx mailing list