[PATCH 5/8] drm/amdgpu: new VM update backends

Mon Mar 25 20:53:31 UTC 2019

On 2019-03-25 7:38 a.m., Christian König wrote:
> Am 20.03.19 um 12:57 schrieb Kuehling, Felix:
>> As far as I can tell, the whole series is a small cleanup and big
>> refactor to enable CPU clearing of PTs without a lot of ugliness or code
>> duplication.
>
> It's a bit more than that. Key point is that I can now easily add a 
> parameter for direct submission during page fault handling :)

You mean for working around problems with CPU page table updates from 
page faults, to force all such updates through SDMA?

Regards,
   Felix

>
> Christian.
>
>> It looks good to me. I haven't reviewed all the moved SDMA
>> update code to make sure it all works correctly, but at least the
>> prepare and commit functions look sane to me.
>>
>> For this patch I have a suggestion (inline) to remove params->ib, which
>> seems redundant with params->job. That would also ripple through the
>> remaining patches.
>>
>> Other than that, the series is Reviewed-by: Felix Kuehling
>> <Felix.Kuehling at amd.com>
>>
>> [+Kent], Look out for this patch series in an upcoming merge to
>> amd-kfd-staging. I don't think it'll cause conflicts, but has a risk of
>> regressions (like all big amdgpu_vm changes IME).
>>
>> Regards,
>>     Felix
>>
>> On 3/19/2019 8:44 AM, Christian König wrote:
>>> Separate out all functions for SDMA and CPU based page table
>>> updates into separate backends.
>>>
>>> This way we can keep most of the complexity of those from the
>>> core VM code.
>>>
>>> Signed-off-by: Christian König <christian.koenig at amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/Makefile         |   3 +-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |   7 +-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h      |  30 ++-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c  | 116 +++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 248 
>>> ++++++++++++++++++++
>>>    5 files changed, 401 insertions(+), 3 deletions(-)
>>>    create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c
>>>    create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
>> [snip]
>>> +/**
>>> + * amdgpu_vm_sdma_prepare - prepare SDMA command submission
>>> + *
>>> + * @p: see amdgpu_vm_update_params definition
>>> + * @owner: owner we need to sync to
>>> + * @exclusive: exclusive move fence we need to sync to
>>> + *
>>> + * Returns:
>>> + * Negativ errno, 0 for success.
>>> + */
>>> +static int amdgpu_vm_sdma_prepare(struct amdgpu_vm_update_params *p,
>>> +                  void *owner, struct dma_fence *exclusive)
>>> +{
>>> +    struct amdgpu_bo *root = p->vm->root.base.bo;
>>> +    unsigned int ndw = AMDGPU_VM_SDMA_MIN_NUM_DW;
>>> +    int r;
>>> +
>>> +    r = amdgpu_job_alloc_with_ib(p->adev, ndw * 4, &p->job);
>>> +    if (r)
>>> +        return r;
>>> +
>>> +    r = amdgpu_sync_fence(p->adev, &p->job->sync, exclusive, false);
>>> +    if (r)
>>> +        return r;
>>> +
>>> +    r = amdgpu_sync_resv(p->adev, &p->job->sync, root->tbo.resv,
>>> +                 owner, false);
>>> +    if (r)
>>> +        return r;
>>> +
>>> +    p->num_dw_left = ndw;
>>> +    p->ib = &p->job->ibs[0];
>> With p->job added, do we still need p->ib? We could just use
>> &p->job->ibs[0] directly, which should perform the same or be more
>> efficient since it's just a constant offset from p->job.
>>
>>
>>> +    return 0;
>>> +}
>>> +
>>> +/**
>>> + * amdgpu_vm_sdma_commit - commit SDMA command submission
>>> + *
>>> + * @p: see amdgpu_vm_update_params definition
>>> + * @fence: resulting fence
>>> + *
>>> + * Returns:
>>> + * Negativ errno, 0 for success.
>>> + */
>>> +static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
>>> +                 struct dma_fence **fence)
>>> +{
>>> +    struct amdgpu_bo *root = p->vm->root.base.bo;
>>> +    struct amdgpu_ring *ring;
>>> +    struct dma_fence *f;
>>> +    int r;
>>> +
>>> +    ring = container_of(p->vm->entity.rq->sched, struct 
>>> amdgpu_ring, sched);
>>> +
>>> +    WARN_ON(p->ib->length_dw == 0);
>>> +    amdgpu_ring_pad_ib(ring, p->ib);
>>> +    WARN_ON(p->ib->length_dw > p->num_dw_left);
>>> +    r = amdgpu_job_submit(p->job, &p->vm->entity,
>>> +                  AMDGPU_FENCE_OWNER_VM, &f);
>>> +    if (r)
>>> +        goto error;
>>> +
>>> +    amdgpu_bo_fence(root, f, true);
>>> +    if (fence)
>>> +        swap(*fence, f);
>>> +    dma_fence_put(f);
>>> +    return 0;
>>> +
>>> +error:
>>> +    amdgpu_job_free(p->job);
>>> +    return r;
>>> +}
>>> +
>>> +
>>> +/**
>>> + * amdgpu_vm_sdma_copy_ptes - copy the PTEs from mapping
>>> + *
>>> + * @p: see amdgpu_vm_update_params definition
>>> + * @bo: PD/PT to update
>>> + * @pe: addr of the page entry
>>> + * @count: number of page entries to copy
>>> + *
>>> + * Traces the parameters and calls the DMA function to copy the PTEs.
>>> + */
>>> +static void amdgpu_vm_sdma_copy_ptes(struct amdgpu_vm_update_params 
>>> *p,
>>> +                     struct amdgpu_bo *bo, uint64_t pe,
>>> +                     unsigned count)
>>> +{
>>> +    uint64_t src = p->ib->gpu_addr;
>>> +
>>> +    src += p->num_dw_left * 4;
>>> +
>>> +    pe += amdgpu_bo_gpu_offset(bo);
>>> +    trace_amdgpu_vm_copy_ptes(pe, src, count);
>>> +
>>> +    amdgpu_vm_copy_pte(p->adev, p->ib, pe, src, count);
>>> +}
>>> +
>>> +/**
>>> + * amdgpu_vm_sdma_set_ptes - helper to call the right asic function
>>> + *
>>> + * @p: see amdgpu_vm_update_params definition
>>> + * @bo: PD/PT to update
>>> + * @pe: addr of the page entry
>>> + * @addr: dst addr to write into pe
>>> + * @count: number of page entries to update
>>> + * @incr: increase next addr by incr bytes
>>> + * @flags: hw access flags
>>> + *
>>> + * Traces the parameters and calls the right asic functions
>>> + * to setup the page table using the DMA.
>>> + */
>>> +static void amdgpu_vm_sdma_set_ptes(struct amdgpu_vm_update_params *p,
>>> +                    struct amdgpu_bo *bo, uint64_t pe,
>>> +                    uint64_t addr, unsigned count,
>>> +                    uint32_t incr, uint64_t flags)
>>> +{
>>> +    pe += amdgpu_bo_gpu_offset(bo);
>>> +    trace_amdgpu_vm_set_ptes(pe, addr, count, incr, flags);
>>> +    if (count < 3) {
>>> +        amdgpu_vm_write_pte(p->adev, p->ib, pe, addr | flags,
>>> +                    count, incr);
>>> +    } else {
>>> +        amdgpu_vm_set_pte_pde(p->adev, p->ib, pe, addr,
>>> +                      count, incr, flags);
>>> +    }
>>> +}
>>> +
>>> +/**
>>> + * amdgpu_vm_sdma_update - execute VM update
>>> + *
>>> + * @p: see amdgpu_vm_update_params definition
>>> + * @bo: PD/PT to update
>>> + * @pe: addr of the page entry
>>> + * @addr: dst addr to write into pe
>>> + * @count: number of page entries to update
>>> + * @incr: increase next addr by incr bytes
>>> + * @flags: hw access flags
>>> + *
>>> + * Reserve space in the IB, setup mapping buffer on demand and 
>>> write commands to
>>> + * the IB.
>>> + */
>>> +static int amdgpu_vm_sdma_update(struct amdgpu_vm_update_params *p,
>>> +                 struct amdgpu_bo *bo, uint64_t pe,
>>> +                 uint64_t addr, unsigned count, uint32_t incr,
>>> +                 uint64_t flags)
>>> +{
>>> +    unsigned int i, ndw, nptes;
>>> +    uint64_t *pte;
>>> +    int r;
>>> +
>>> +    do {
>>> +        ndw = p->num_dw_left;
>>> +        ndw -= p->ib->length_dw;
>>> +
>>> +        if (ndw < 32) {
>>> +            r = amdgpu_vm_sdma_commit(p, NULL);
>>> +            if (r)
>>> +                return r;
>>> +
>>> +            /* estimate how many dw we need */
>>> +            ndw = 32;
>>> +            if (p->pages_addr)
>>> +                ndw += count * 2;
>>> +            ndw = max(ndw, AMDGPU_VM_SDMA_MIN_NUM_DW);
>>> +            ndw = min(ndw, AMDGPU_VM_SDMA_MAX_NUM_DW);
>>> +
>>> +            r = amdgpu_job_alloc_with_ib(p->adev, ndw * 4, &p->job);
>>> +            if (r)
>>> +                return r;
>>> +
>>> +            p->num_dw_left = ndw;
>>> +            p->ib = &p->job->ibs[0];
>>> +        }
>>> +
>>> +        if (!p->pages_addr) {
>>> +            /* set page commands needed */
>>> +            if (bo->shadow)
>>> +                amdgpu_vm_sdma_set_ptes(p, bo->shadow, pe, addr,
>>> +                            count, incr, flags);
>>> +            amdgpu_vm_sdma_set_ptes(p, bo, pe, addr, count,
>>> +                        incr, flags);
>>> +            return 0;
>>> +        }
>>> +
>>> +        /* copy commands needed */
>>> +        ndw -= p->adev->vm_manager.vm_pte_funcs->copy_pte_num_dw *
>>> +            (bo->shadow ? 2 : 1);
>>> +
>>> +        /* for padding */
>>> +        ndw -= 7;
>>> +
>>> +        nptes = min(count, ndw / 2);
>>> +
>>> +        /* Put the PTEs at the end of the IB. */
>>> +        p->num_dw_left -= nptes * 2;
>>> +        pte = (uint64_t *)&(p->ib->ptr[p->num_dw_left]);
>>> +        for (i = 0; i < nptes; ++i, addr += incr) {
>>> +            pte[i] = amdgpu_vm_map_gart(p->pages_addr, addr);
>>> +            pte[i] |= flags;
>>> +        }
>>> +
>>> +        if (bo->shadow)
>>> +            amdgpu_vm_sdma_copy_ptes(p, bo->shadow, pe, nptes);
>>> +        amdgpu_vm_sdma_copy_ptes(p, bo, pe, nptes);
>>> +
>>> +        pe += nptes * 8;
>>> +        count -= nptes;
>>> +    } while (count);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +const struct amdgpu_vm_update_funcs amdgpu_vm_sdma_funcs = {
>>> +    .prepare = amdgpu_vm_sdma_prepare,
>>> +    .update = amdgpu_vm_sdma_update,
>>> +    .commit = amdgpu_vm_sdma_commit
>>> +};
>