[PATCH] drm/amdgpu: use sdma to optimize debugged non-visible device memory access

Felix Kuehling felix.kuehling at amd.com
Mon Dec 6 19:24:13 UTC 2021


Am 2021-12-03 um 2:18 a.m. schrieb Christian König:
> Well NAK.
>
> We already discussed this and decided to not use any hardware
> acceleration for the debug access.

Conclusions from our offline discussion for the record:

We need amdgpu_ttm_access_memory (e.g. gdb accessing VRAM) for
post-mortem debugging in some use cases. This requires not disturbing HW
state while the system is under investigation. Using SDMA and mapping
memory for SDMA to access would be problematic in those situations.

Post-mortem debugging requires module parameters that disable GPU reset
or set an infinite hang timeout. We can detect those options and enable
the SDMA path only when they are not set. This gives good performance
for application debugging while still satisfying the requirements for
post-mortem debugging when needed. Something like this should work
(Christian, please correct me if I got this wrong):

static inline bool allow_post_mortem_debug(struct amdgpu_device *adev)
{
	return amdgpu_gpu_recovery == 0 ||
		adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
		adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
		adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
		adev->video_timeout == MAX_SCHEDULE_TIMEOUT;
}
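
The SDMA fast path in amdgpu_ttm_access_memory would then be gated on
that helper, roughly like this (a sketch only, reusing the helper from
the patch below):

	if (!allow_post_mortem_debug(adev) && len == PAGE_SIZE &&
	    !amdgpu_ttm_access_memory_page_sdma(bo, offset, buf, write))
		return len;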

We should further guard SDMA access with drm_dev_enter/exit. Question:
is this just needed for SDMA access, or should it apply to MMIO access
as well?
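
For reference, the guard could look roughly like this (a sketch only,
wrapping the SDMA path with the device-unplug protection):

	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return -ENODEV;	/* device was unplugged */

	ret = amdgpu_ttm_access_memory_page_sdma(bo, offset, buf, write);
	drm_dev_exit(idx);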

To avoid updating page tables and flushing TLBs during VRAM access, the
recommendation is to use a GTT BO as a bounce buffer that is pre-mapped
in the GART table at driver initialization, and to implement the copy
with a manual SDMA command submission instead of going through
amdgpu_ttm_copy_mem_to_mem.
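
Very roughly, and glossing over the bounce-buffer management, a read
could then use amdgpu_copy_buffer with known GPU addresses, skipping the
window-mapping logic in amdgpu_ttm_copy_mem_to_mem (bounce_bo and
bounce_gpu_addr are hypothetical names for the pre-mapped GTT BO and its
GART address):

	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
	uint64_t vram_addr = amdgpu_bo_gpu_offset(abo) + offset;
	struct dma_fence *fence = NULL;
	int r;

	/* DMA one page from VRAM into the pre-mapped GART bounce buffer. */
	r = amdgpu_copy_buffer(ring, vram_addr, bounce_gpu_addr, PAGE_SIZE,
			       NULL, &fence, false, false, false);
	if (!r && fence) {
		dma_fence_wait(fence, false);
		dma_fence_put(fence);
	}
	/*
	 * Then memcpy from the CPU mapping of bounce_bo into the caller's
	 * buffer; a write would do the copies in the opposite order.
	 */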

Regards,
  Felix


>
> Apart from that, your implementation is absolutely horrible and won't
> work in all cases.
>
> Regards,
> Christian.
>
> Am 02.12.21 um 22:43 schrieb Jonathan Kim:
>> To support better memory access performance on non-Large BAR devices,
>> use SDMA copies instead of MM access.
>>
>> SDMA access is restricted to PAGE_SIZE'd accesses to account for the
>> ptraced process memory r/w use case.  Any other access size will use
>> MMIO.
>>
>> Failure to do an SDMA copy will result in a fallback to MM access.
>>
>> Note: This is an attempt to re-address the earlier patch request
>> 'drm/amdgpu: extend ttm memory access to do sdma copies'
>> with the addition of restrictions and fallbacks.
>>
>> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 97 +++++++++++++++++++++++++
>>   1 file changed, 97 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> index 23fc57506a20..1cb984252f58 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> @@ -1741,6 +1741,91 @@ static void amdgpu_ttm_vram_mm_access(struct amdgpu_device *adev, loff_t pos,
>>       }
>>   }
>>
>> +/**
>> + * amdgpu_ttm_access_memory_page_sdma - Read/write page of memory that backs a buffer object.
>> + *
>> + * @bo:  The buffer object to read/write
>> + * @offset:  Offset into buffer object
>> + * @buf:  Secondary buffer to write/read from
>> + * @write:  true if writing
>> + *
>> + * This is used to access a page of VRAM that backs a buffer object via
>> + * SDMA access for debugging purposes.
>> + */
>> +static int amdgpu_ttm_access_memory_page_sdma(struct ttm_buffer_object *bo,
>> +                    unsigned long offset, void *buf,
>> +                    int write)
>> +{
>> +    struct amdgpu_bo *dst_bo, *abo = ttm_to_amdgpu_bo(bo);
>> +    struct amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev);
>> +    struct ttm_operation_ctx ctx = {.interruptible = true};
>> +    struct amdgpu_copy_mem src, dst;
>> +    struct drm_gem_object *gobj;
>> +    struct dma_fence *fence;
>> +    struct page *dst_page;
>> +    struct ttm_tt *dst_ttm;
>> +    int ret;
>> +
>> +    /* Create an SG BO to dma map the target buffer for direct copy. */
>> +    ret = amdgpu_gem_object_create(adev, PAGE_SIZE, PAGE_SIZE,
>> +                AMDGPU_GEM_DOMAIN_CPU, 0, ttm_bo_type_sg, NULL, &gobj);
>> +    if (ret)
>> +        return ret;
>> +
>> +    dst_bo = gem_to_amdgpu_bo(gobj);
>> +    dst_ttm = dst_bo->tbo.ttm;
>> +    dst_ttm->sg = kmalloc(sizeof(*dst_ttm->sg), GFP_KERNEL);
>> +    if (unlikely(!dst_ttm->sg)) {
>> +        ret = -ENOMEM;
>> +        goto free_bo;
>> +    }
>> +
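>> +    /* Assumes buf is a page-aligned kernel buffer in the linear map. */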
>> +    dst_page = virt_to_page(buf);
>> +    ret = sg_alloc_table_from_pages(dst_ttm->sg, &dst_page, 1, 0,
>> +                    1 << PAGE_SHIFT, GFP_KERNEL);
>> +    if (unlikely(ret))
>> +        goto free_sg;
>> +
>> +    ret = dma_map_sgtable(adev->dev, dst_ttm->sg, DMA_BIDIRECTIONAL, 0);
>> +    if (unlikely(ret))
>> +        goto release_sg;
>> +
>> +    drm_prime_sg_to_dma_addr_array(dst_ttm->sg, dst_ttm->dma_address, 1);
>> +
>> +    amdgpu_bo_placement_from_domain(dst_bo, AMDGPU_GEM_DOMAIN_GTT);
>> +    ret = ttm_bo_validate(&dst_bo->tbo, &dst_bo->placement, &ctx);
>> +    if (ret)
>> +        goto unmap_sg;
>> +
>> +    src.bo = bo;
>> +    src.mem = bo->resource;
>> +    src.offset = offset;
>> +    dst.bo = &dst_bo->tbo;
>> +    dst.mem = dst.bo->resource;
>> +    dst.offset = 0;
>> +
>> +    /* Do the direct copy and wait for fence response. */
>> +    ret = amdgpu_ttm_copy_mem_to_mem(adev, write ? &dst : &src,
>> +                    write ? &src : &dst, 1 << PAGE_SHIFT,
>> +                    amdgpu_bo_encrypted(abo), bo->base.resv, &fence);
>> +    if (!ret && fence) {
>> +        long r = dma_fence_wait_timeout(fence, false, adev->sdma_timeout);
>> +
>> +        /* 0 means timeout, negative means error */
>> +        if (!r)
>> +            ret = -ETIMEDOUT;
>> +        else if (r < 0)
>> +            ret = r;
>> +
>> +        dma_fence_put(fence);
>> +    }
>> +
>> +unmap_sg:
>> +    dma_unmap_sgtable(adev->dev, dst_ttm->sg, DMA_BIDIRECTIONAL, 0);
>> +release_sg:
>> +    sg_free_table(dst_ttm->sg);
>> +free_sg:
>> +    kfree(dst_ttm->sg);
>> +    dst_ttm->sg = NULL;
>> +free_bo:
>> +    drm_gem_object_put(gobj);
>> +    return ret;
>> +}
>> +
>>   /**
>>    * amdgpu_ttm_access_memory - Read or Write memory that backs a buffer object.
>>    *
>> @@ -1765,7 +1850,19 @@ static int amdgpu_ttm_access_memory(struct ttm_buffer_object *bo,
>>       if (bo->resource->mem_type != TTM_PL_VRAM)
>>           return -EIO;
>>
>> +    /*
>> +     * Attempt SDMA access over non-visible VRAM first.
>> +     * On failure, fall back to MMIO access.
>> +     *
>> +     * Restrict this to PAGE_SIZE access for PTRACED memory operations.
>> +     * Any other access size should use MM access.
>> +     */
>>       amdgpu_res_first(bo->resource, offset, len, &cursor);
>> +    if (adev->gmc.visible_vram_size < cursor.start + len &&
>> +        len == PAGE_SIZE && !amdgpu_in_reset(adev) &&
>> +        !amdgpu_ttm_access_memory_page_sdma(bo, offset, buf, write))
>> +        return len;
>> +
>>       while (cursor.remaining) {
>>           size_t count, size = cursor.size;
>>           loff_t pos = cursor.start;
>


