[PATCH] drm/amdgpu: improve debug VRAM access performance using sdma
Kim, Jonathan
Jonathan.Kim at amd.com
Wed Jan 12 14:21:58 UTC 2022
[Public]
Thanks Christian. I've already merged based on Felix's review.
I'll send your suggested cleanup for review out soon.
Jon
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig at amd.com>
> Sent: January 12, 2022 2:33 AM
> To: Kim, Jonathan <Jonathan.Kim at amd.com>; amd-
> gfx at lists.freedesktop.org
> Cc: Kuehling, Felix <Felix.Kuehling at amd.com>
> Subject: Re: [PATCH] drm/amdgpu: improve debug VRAM access
> performance using sdma
>
> Am 04.01.22 um 20:12 schrieb Jonathan Kim:
> > For better performance during VRAM access for debugged processes, do
> > read/write copies over SDMA.
> >
> > In order to fulfill post mortem debugging on a broken device, fallback
> > to stable MMIO access when gpu recovery is disabled or when job
> > submission time outs are set to max. Failed SDMA access should
> > automatically fall back to MMIO access.
> >
> > Use a pre-allocated GTT bounce buffer pre-mapped into GART to avoid
> > page-table updates and TLB flushes on access.
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
> > ---
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 78
> +++++++++++++++++++++++++
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 5 +-
> > 2 files changed, 82 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > index 367abed1d6e6..512df4c09772 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > @@ -48,6 +48,7 @@
> > #include <drm/ttm/ttm_range_manager.h>
> >
> > #include <drm/amdgpu_drm.h>
> > +#include <drm/drm_drv.h>
> >
> > #include "amdgpu.h"
> > #include "amdgpu_object.h"
> > @@ -1429,6 +1430,70 @@ static void
> amdgpu_ttm_vram_mm_access(struct amdgpu_device *adev, loff_t pos,
> > }
> > }
> >
> > +static int amdgpu_ttm_access_memory_sdma(struct ttm_buffer_object
> *bo,
> > + unsigned long offset, void *buf, int
> len, int write) {
> > + struct amdgpu_bo *abo = ttm_to_amdgpu_bo(bo);
> > + struct amdgpu_device *adev = amdgpu_ttm_adev(abo->tbo.bdev);
> > + struct amdgpu_job *job;
> > + struct dma_fence *fence;
> > + uint64_t src_addr, dst_addr;
> > + unsigned int num_dw;
> > + int r, idx;
> > +
> > + if (len != PAGE_SIZE)
> > + return -EINVAL;
> > +
> > + if (!adev->mman.sdma_access_ptr)
> > + return -EACCES;
> > +
> > + r = drm_dev_enter(adev_to_drm(adev), &idx);
> > + if (r)
> > + return r;
> > +
> > + if (write)
> > + memcpy(adev->mman.sdma_access_ptr, buf, len);
> > +
> > + num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw, 8);
> > + r = amdgpu_job_alloc_with_ib(adev, num_dw * 4,
> AMDGPU_IB_POOL_DELAYED, &job);
> > + if (r)
> > + goto out;
> > +
> > + src_addr = write ? amdgpu_bo_gpu_offset(adev-
> >mman.sdma_access_bo) :
> > + amdgpu_bo_gpu_offset(abo);
> > + dst_addr = write ? amdgpu_bo_gpu_offset(abo) :
> > + amdgpu_bo_gpu_offset(adev-
> >mman.sdma_access_bo);
>
> I suggest to write this as
>
> src_addr = a;
> dst_addr = b;
> if (write)
> swap(src_addr, dst_addr);
>
> This way we are not duplicating getting the different offsets.
>
> > + amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, dst_addr,
> > +PAGE_SIZE, false);
> > +
> > + amdgpu_ring_pad_ib(adev->mman.buffer_funcs_ring, &job-
> >ibs[0]);
> > + WARN_ON(job->ibs[0].length_dw > num_dw);
> > +
> > + r = amdgpu_job_submit(job, &adev->mman.entity,
> AMDGPU_FENCE_OWNER_UNDEFINED, &fence);
> > + if (r) {
> > + amdgpu_job_free(job);
> > + goto out;
> > + }
> > +
> > + if (!dma_fence_wait_timeout(fence, false, adev->sdma_timeout))
> > + r = -ETIMEDOUT;
> > + dma_fence_put(fence);
> > +
> > + if (!(r || write))
> > + memcpy(buf, adev->mman.sdma_access_ptr, len);
> > +out:
> > + drm_dev_exit(idx);
> > + return r;
> > +}
> > +
> > +static inline bool amdgpu_ttm_allow_post_mortem_debug(struct
> > +amdgpu_device *adev) {
> > + return amdgpu_gpu_recovery == 0 ||
> > + adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
> > + adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
> > + adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
> > + adev->video_timeout == MAX_SCHEDULE_TIMEOUT; }
>
> This should probably be inside amdgpu_device.c
>
> > +
> > /**
> > * amdgpu_ttm_access_memory - Read or Write memory that backs a
> buffer object.
> > *
> > @@ -1453,6 +1518,10 @@ static int amdgpu_ttm_access_memory(struct
> ttm_buffer_object *bo,
> > if (bo->resource->mem_type != TTM_PL_VRAM)
> > return -EIO;
> >
> > + if (!amdgpu_ttm_allow_post_mortem_debug(adev) &&
> > + !amdgpu_ttm_access_memory_sdma(bo, offset,
> buf, len, write))
> > + return len;
> > +
> > amdgpu_res_first(bo->resource, offset, len, &cursor);
> > while (cursor.remaining) {
> > size_t count, size = cursor.size;
> > @@ -1793,6 +1862,12 @@ int amdgpu_ttm_init(struct amdgpu_device
> *adev)
> > return r;
> > }
> >
> > + if (amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE,
> > + AMDGPU_GEM_DOMAIN_GTT,
> > + &adev->mman.sdma_access_bo, NULL,
> > + adev->mman.sdma_access_ptr))
> > + DRM_WARN("Debug VRAM access will use slowpath MM
> access\n");
> > +
> > return 0;
> > }
> >
> > @@ -1823,6 +1898,9 @@ void amdgpu_ttm_fini(struct amdgpu_device
> *adev)
> > ttm_range_man_fini(&adev->mman.bdev, AMDGPU_PL_OA);
> > ttm_device_fini(&adev->mman.bdev);
> > adev->mman.initialized = false;
> > + if (adev->mman.sdma_access_ptr)
>
> You can drop that if. Free functions can usually take a NULL pointer.
>
> Apart from those nit picks looks good to me as well.
>
> Regards,
> Christian.
>
> > + amdgpu_bo_free_kernel(&adev->mman.sdma_access_bo,
> NULL,
> > + &adev->mman.sdma_access_ptr);
> > DRM_INFO("amdgpu: ttm finalized\n");
> > }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> > index 91a087f9dc7c..b0116c4a768f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> > @@ -98,6 +98,10 @@ struct amdgpu_mman {
> > u64 fw_vram_usage_size;
> > struct amdgpu_bo *fw_vram_usage_reserved_bo;
> > void *fw_vram_usage_va;
> > +
> > + /* PAGE_SIZE'd BO for process memory r/w over SDMA. */
> > + struct amdgpu_bo *sdma_access_bo;
> > + void *sdma_access_ptr;
> > };
> >
> > struct amdgpu_copy_mem {
> > @@ -193,5 +197,4 @@ uint64_t amdgpu_ttm_tt_pte_flags(struct
> amdgpu_device *adev, struct ttm_tt *ttm,
> > int amdgpu_ttm_evict_resources(struct amdgpu_device *adev, int
> > mem_type);
> >
> > void amdgpu_ttm_debugfs_init(struct amdgpu_device *adev);
> > -
> > #endif
More information about the amd-gfx
mailing list