[PATCH] drm/xe/lnl: Offload system clear page activity to GPU
Ghimiray, Himal Prasad
himal.prasad.ghimiray at intel.com
Fri Jun 21 12:08:47 UTC 2024
On 20-06-2024 19:16, Nirmoy Das wrote:
> On LNL, because of flat CCS, the driver will create a migrate job to
> clear CCS metadata. Extend that to also clear pages using the GPU, with
> a new ttm pool flag which allows offloading page clear activity to the GPU.
>
> This gives very nice improvement for large buffer:
> Without the patch:
> ~/igt-gpu-tools/build/tests/xe_exec_store --run basic-store-benchmark
> IGT-Version: 1.28-g2ed908c0b (x86_64) (Linux: 6.9.0-xe+ x86_64)
> Using IGT_SRANDOM=1718889799 for randomisation
> Opened device: /dev/dri/card0
> Starting subtest: basic-store-benchmark
> Starting dynamic subtest: WC
> Dynamic subtest WC: SUCCESS (0.000s)
> Time taken for size SZ_4K: 4882 us
> Time taken for size SZ_2M: 3679 us
> Time taken for size SZ_64M: 13367 us
> Time taken for size SZ_128M: 21034 us
> Time taken for size SZ_256M: 32940 us
> Time taken for size SZ_1G: 116261 us
> Starting dynamic subtest: WB
> Dynamic subtest WB: SUCCESS (0.000s)
> Time taken for size SZ_4K: 5417 us
> Time taken for size SZ_2M: 5711 us
> Time taken for size SZ_64M: 15718 us
> Time taken for size SZ_128M: 26170 us
> Time taken for size SZ_256M: 50529 us
> Time taken for size SZ_1G: 177933 us
> Subtest basic-store-benchmark: SUCCESS (0.504s)
>
> With the patch:
> sudo ~/igt-gpu-tools/build/tests/xe_exec_store --run basic-store-benchmark
> IGT-Version: 1.28-g2ed908c0b (x86_64) (Linux: 6.9.0-xe+ x86_64)
> Using IGT_SRANDOM=1718889593 for randomisation
> Opened device: /dev/dri/card0
> Starting subtest: basic-store-benchmark
> Starting dynamic subtest: WC
> Dynamic subtest WC: SUCCESS (0.000s)
> Time taken for size SZ_4K: 4479 us
> Time taken for size SZ_2M: 3291 us
> Time taken for size SZ_64M: 6595 us
> Time taken for size SZ_128M: 9069 us
> Time taken for size SZ_256M: 12681 us
> Time taken for size SZ_1G: 41806 us
> Starting dynamic subtest: WB
> Dynamic subtest WB: SUCCESS (0.000s)
> Time taken for size SZ_4K: 3317 us
> Time taken for size SZ_2M: 6458 us
> Time taken for size SZ_64M: 12802 us
> Time taken for size SZ_128M: 19579 us
> Time taken for size SZ_256M: 38768 us
> Time taken for size SZ_1G: 143250 us
> Subtest basic-store-benchmark: SUCCESS (0.328s)
>
> Cc: Christian Koenig <christian.koenig at amd.com>
> Cc: "Thomas Hellström" <thomas.hellstrom at linux.intel.com>
> Cc: Matthew Auld <matthew.auld at intel.com>
> Signed-off-by: Nirmoy Das <nirmoy.das at intel.com>
> ---
> drivers/gpu/drm/xe/xe_bo.c | 4 ++++
> drivers/gpu/drm/xe/xe_device.c | 36 +++++++++++++++++++++-------
> drivers/gpu/drm/xe/xe_device_types.h | 2 ++
> drivers/gpu/drm/xe/xe_migrate.c | 6 ++---
> 4 files changed, 37 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> index 65c696966e96..10ec02412dc4 100644
> --- a/drivers/gpu/drm/xe/xe_bo.c
> +++ b/drivers/gpu/drm/xe/xe_bo.c
> @@ -387,6 +387,10 @@ static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object *ttm_bo,
> caching = ttm_uncached;
> }
>
> + /* Clear TTM_TT_FLAG_ZERO_ALLOC when GPU is set to clear pages */
> + if (xe->mem.gpu_page_clear)
> + page_flags &= ~TTM_TT_FLAG_ZERO_ALLOC;
> +
> err = ttm_tt_init(&tt->ttm, &bo->ttm, page_flags, caching, extra_pages);
> if (err) {
> kfree(tt);
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index 75d4c8ae9234..8e8d54c59aae 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -240,8 +240,6 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
>
> if (xe->unordered_wq)
> destroy_workqueue(xe->unordered_wq);
> -
> - ttm_device_fini(&xe->ttm);
> }
>
> struct xe_device *xe_device_create(struct pci_dev *pdev,
> @@ -260,12 +258,6 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
> if (IS_ERR(xe))
> return xe;
>
> - err = ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev,
> - xe->drm.anon_inode->i_mapping,
> - xe->drm.vma_offset_manager, false, false);
> - if (WARN_ON(err))
> - goto err;
> -
> err = drmm_add_action_or_reset(&xe->drm, xe_device_destroy, NULL);
> if (err)
> goto err;
> @@ -543,6 +535,13 @@ static int xe_device_set_has_flat_ccs(struct xe_device *xe)
> return xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
> }
>
> +static void xe_device_destroy_ttm_device(struct drm_device *dev, void *dummy)
> +{
> + struct xe_device *xe = to_xe_device(dev);
> +
> + ttm_device_fini(&xe->ttm);
> +}
> +
> int xe_device_probe(struct xe_device *xe)
> {
> struct xe_tile *tile;
> @@ -550,6 +549,7 @@ int xe_device_probe(struct xe_device *xe)
> int err;
> u8 last_gt;
> u8 id;
> + unsigned int ttm_pool_flags = 0;
>
> xe_pat_init_early(xe);
>
> @@ -572,6 +572,26 @@ int xe_device_probe(struct xe_device *xe)
>
> xe_ttm_sys_mgr_init(xe);
>
> + /* On iGFX device with flat CCS we clear CCS metadata, let's extend that
> + * and use GPU to clear pages as well.
> + */
> + if (xe_device_has_flat_ccs(xe) && !IS_DGFX(xe)) {
> + ttm_pool_flags = TTM_POOL_FLAG_SKIP_CLEAR_ON_FREE;
> + xe->mem.gpu_page_clear = true;
> + }
> +
> + err = ttm_device_init_with_pool_flags(&xe->ttm, &xe_ttm_funcs,
> + xe->drm.dev,
> + xe->drm.anon_inode->i_mapping,
> + xe->drm.vma_offset_manager,
> + false, false, ttm_pool_flags);
> + if (WARN_ON(err))
> + return err;
> +
> + err = drmm_add_action_or_reset(&xe->drm, xe_device_destroy_ttm_device, NULL);
> + if (err)
> + return err;
> +
> for_each_gt(gt, xe, id) {
> err = xe_gt_init_early(gt);
> if (err)
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index c37be471d11c..ece68c6f3668 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -325,6 +325,8 @@ struct xe_device {
> struct xe_mem_region vram;
> /** @mem.sys_mgr: system TTM manager */
> struct ttm_resource_manager sys_mgr;
> + /** @gpu_page_clear: clear pages offloaded to GPU */
> + bool gpu_page_clear;
> } mem;
>
> /** @sriov: device level virtualization data */
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index 05f933787860..0023f32d147d 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -1003,6 +1003,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
> struct xe_gt *gt = m->tile->primary_gt;
> struct xe_device *xe = gt_to_xe(gt);
> bool clear_system_ccs = (xe_bo_needs_ccs_pages(bo) && !IS_DGFX(xe)) ? true : false;
> + bool clear_on_create = xe->mem.gpu_page_clear;
> struct dma_fence *fence = NULL;
> u64 size = bo->size;
> struct xe_res_cursor src_it;
> @@ -1022,7 +1023,6 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
> struct xe_sched_job *job;
> struct xe_bb *bb;
> u32 batch_size, update_idx;
> -
> bool usm = xe->info.has_usm;
> u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
>
> @@ -1032,7 +1032,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
> batch_size = 2 +
> pte_update_size(m, clear_vram, src, &src_it,
> &clear_L0, &clear_L0_ofs, &clear_L0_pt,
> - clear_system_ccs ? 0 : emit_clear_cmd_len(gt), 0,
> + !clear_on_create ? 0 : emit_clear_cmd_len(gt), 0,
> avail_pts);
>
> if (xe_device_has_flat_ccs(xe))
> @@ -1060,7 +1060,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
> bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
> update_idx = bb->len;
>
> - if (!clear_system_ccs)
> + if (clear_on_create)
This will break on dGFX: clear_on_create (xe->mem.gpu_page_clear) is only
set for iGFX with flat CCS, so on discrete GPUs this condition is always
false and the emit_clear() for VRAM is skipped entirely.
> emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram);
>
> if (xe_device_has_flat_ccs(xe)) {
More information about the Intel-xe
mailing list