[PATCH] drm/xe/lnl: Offload system clear page activity to GPU
Nirmoy Das
nirmoy.das at intel.com
Fri Jun 21 12:12:17 UTC 2024
On 6/21/2024 2:08 PM, Ghimiray, Himal Prasad wrote:
>
>
> On 20-06-2024 19:16, Nirmoy Das wrote:
>> On LNL, because of flat CCS, the driver creates a migrate job to clear
>> CCS metadata. Extend that to also clear pages using the GPU, with a new
>> TTM pool flag which allows offloading page clear activity to the GPU.
>>
>> This gives a very nice improvement for large buffers:
>> Without the patch:
>> ~/igt-gpu-tools/build/tests/xe_exec_store --run basic-store-benchmark
>> IGT-Version: 1.28-g2ed908c0b (x86_64) (Linux: 6.9.0-xe+ x86_64)
>> Using IGT_SRANDOM=1718889799 for randomisation
>> Opened device: /dev/dri/card0
>> Starting subtest: basic-store-benchmark
>> Starting dynamic subtest: WC
>> Dynamic subtest WC: SUCCESS (0.000s)
>> Time taken for size SZ_4K: 4882 us
>> Time taken for size SZ_2M: 3679 us
>> Time taken for size SZ_64M: 13367 us
>> Time taken for size SZ_128M: 21034 us
>> Time taken for size SZ_256M: 32940 us
>> Time taken for size SZ_1G: 116261 us
>> Starting dynamic subtest: WB
>> Dynamic subtest WB: SUCCESS (0.000s)
>> Time taken for size SZ_4K: 5417 us
>> Time taken for size SZ_2M: 5711 us
>> Time taken for size SZ_64M: 15718 us
>> Time taken for size SZ_128M: 26170 us
>> Time taken for size SZ_256M: 50529 us
>> Time taken for size SZ_1G: 177933 us
>> Subtest basic-store-benchmark: SUCCESS (0.504s)
>>
>> With the patch:
>> sudo ~/igt-gpu-tools/build/tests/xe_exec_store --run
>> basic-store-benchmark
>> IGT-Version: 1.28-g2ed908c0b (x86_64) (Linux: 6.9.0-xe+ x86_64)
>> Using IGT_SRANDOM=1718889593 for randomisation
>> Opened device: /dev/dri/card0
>> Starting subtest: basic-store-benchmark
>> Starting dynamic subtest: WC
>> Dynamic subtest WC: SUCCESS (0.000s)
>> Time taken for size SZ_4K: 4479 us
>> Time taken for size SZ_2M: 3291 us
>> Time taken for size SZ_64M: 6595 us
>> Time taken for size SZ_128M: 9069 us
>> Time taken for size SZ_256M: 12681 us
>> Time taken for size SZ_1G: 41806 us
>> Starting dynamic subtest: WB
>> Dynamic subtest WB: SUCCESS (0.000s)
>> Time taken for size SZ_4K: 3317 us
>> Time taken for size SZ_2M: 6458 us
>> Time taken for size SZ_64M: 12802 us
>> Time taken for size SZ_128M: 19579 us
>> Time taken for size SZ_256M: 38768 us
>> Time taken for size SZ_1G: 143250 us
>> Subtest basic-store-benchmark: SUCCESS (0.328s)
>>
>> Cc: Christian Koenig <christian.koenig at amd.com>
>> Cc: "Thomas Hellström" <thomas.hellstrom at linux.intel.com>
>> Cc: Matthew Auld <matthew.auld at intel.com>
>> Signed-off-by: Nirmoy Das <nirmoy.das at intel.com>
>> ---
>> drivers/gpu/drm/xe/xe_bo.c | 4 ++++
>> drivers/gpu/drm/xe/xe_device.c | 36 +++++++++++++++++++++-------
>> drivers/gpu/drm/xe/xe_device_types.h | 2 ++
>> drivers/gpu/drm/xe/xe_migrate.c | 6 ++---
>> 4 files changed, 37 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
>> index 65c696966e96..10ec02412dc4 100644
>> --- a/drivers/gpu/drm/xe/xe_bo.c
>> +++ b/drivers/gpu/drm/xe/xe_bo.c
>> @@ -387,6 +387,10 @@ static struct ttm_tt *xe_ttm_tt_create(struct
>> ttm_buffer_object *ttm_bo,
>> caching = ttm_uncached;
>> }
>> + /* Clear TTM_TT_FLAG_ZERO_ALLOC when GPU is set to clear pages */
>> + if (xe->mem.gpu_page_clear)
>> + page_flags &= ~TTM_TT_FLAG_ZERO_ALLOC;
>> +
>> err = ttm_tt_init(&tt->ttm, &bo->ttm, page_flags, caching,
>> extra_pages);
>> if (err) {
>> kfree(tt);
>> diff --git a/drivers/gpu/drm/xe/xe_device.c
>> b/drivers/gpu/drm/xe/xe_device.c
>> index 75d4c8ae9234..8e8d54c59aae 100644
>> --- a/drivers/gpu/drm/xe/xe_device.c
>> +++ b/drivers/gpu/drm/xe/xe_device.c
>> @@ -240,8 +240,6 @@ static void xe_device_destroy(struct drm_device
>> *dev, void *dummy)
>> if (xe->unordered_wq)
>> destroy_workqueue(xe->unordered_wq);
>> -
>> - ttm_device_fini(&xe->ttm);
>> }
>> struct xe_device *xe_device_create(struct pci_dev *pdev,
>> @@ -260,12 +258,6 @@ struct xe_device *xe_device_create(struct
>> pci_dev *pdev,
>> if (IS_ERR(xe))
>> return xe;
>> - err = ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev,
>> - xe->drm.anon_inode->i_mapping,
>> - xe->drm.vma_offset_manager, false, false);
>> - if (WARN_ON(err))
>> - goto err;
>> -
>> err = drmm_add_action_or_reset(&xe->drm, xe_device_destroy, NULL);
>> if (err)
>> goto err;
>> @@ -543,6 +535,13 @@ static int xe_device_set_has_flat_ccs(struct
>> xe_device *xe)
>> return xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
>> }
>> +static void xe_device_destroy_ttm_device(struct drm_device *dev,
>> void *dummy)
>> +{
>> + struct xe_device *xe = to_xe_device(dev);
>> +
>> + ttm_device_fini(&xe->ttm);
>> +}
>> +
>> int xe_device_probe(struct xe_device *xe)
>> {
>> struct xe_tile *tile;
>> @@ -550,6 +549,7 @@ int xe_device_probe(struct xe_device *xe)
>> int err;
>> u8 last_gt;
>> u8 id;
>> + unsigned int ttm_pool_flags = 0;
>> xe_pat_init_early(xe);
>> @@ -572,6 +572,26 @@ int xe_device_probe(struct xe_device *xe)
>> xe_ttm_sys_mgr_init(xe);
>> + /* On iGFX device with flat CCS we clear CCS metadata, let's
>> extend that
>> + * and use GPU to clear pages as well.
>> + */
>> + if (xe_device_has_flat_ccs(xe) && !IS_DGFX(xe)) {
>> + ttm_pool_flags = TTM_POOL_FLAG_SKIP_CLEAR_ON_FREE;
>> + xe->mem.gpu_page_clear = true;
>> + }
>> +
>> + err = ttm_device_init_with_pool_flags(&xe->ttm, &xe_ttm_funcs,
>> + xe->drm.dev,
>> + xe->drm.anon_inode->i_mapping,
>> + xe->drm.vma_offset_manager,
>> + false, false, ttm_pool_flags);
>> + if (WARN_ON(err))
>> + return err;
>> +
>> + err = drmm_add_action_or_reset(&xe->drm,
>> xe_device_destroy_ttm_device, NULL);
>> + if (err)
>> + return err;
>> +
>> for_each_gt(gt, xe, id) {
>> err = xe_gt_init_early(gt);
>> if (err)
>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h
>> b/drivers/gpu/drm/xe/xe_device_types.h
>> index c37be471d11c..ece68c6f3668 100644
>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>> @@ -325,6 +325,8 @@ struct xe_device {
>> struct xe_mem_region vram;
>> /** @mem.sys_mgr: system TTM manager */
>> struct ttm_resource_manager sys_mgr;
>> + /** @gpu_page_clear: clear pages offloaded to GPU */
>> + bool gpu_page_clear;
>> } mem;
>> /** @sriov: device level virtualization data */
>> diff --git a/drivers/gpu/drm/xe/xe_migrate.c
>> b/drivers/gpu/drm/xe/xe_migrate.c
>> index 05f933787860..0023f32d147d 100644
>> --- a/drivers/gpu/drm/xe/xe_migrate.c
>> +++ b/drivers/gpu/drm/xe/xe_migrate.c
>> @@ -1003,6 +1003,7 @@ struct dma_fence *xe_migrate_clear(struct
>> xe_migrate *m,
>> struct xe_gt *gt = m->tile->primary_gt;
>> struct xe_device *xe = gt_to_xe(gt);
>> bool clear_system_ccs = (xe_bo_needs_ccs_pages(bo) &&
>> !IS_DGFX(xe)) ? true : false;
>> + bool clear_on_create = xe->mem.gpu_page_clear;
>> struct dma_fence *fence = NULL;
>> u64 size = bo->size;
>> struct xe_res_cursor src_it;
>> @@ -1022,7 +1023,6 @@ struct dma_fence *xe_migrate_clear(struct
>> xe_migrate *m,
>> struct xe_sched_job *job;
>> struct xe_bb *bb;
>> u32 batch_size, update_idx;
>> -
>> bool usm = xe->info.has_usm;
>> u32 avail_pts = max_mem_transfer_per_pass(xe) /
>> LEVEL0_PAGE_TABLE_ENCODE_SIZE;
>> @@ -1032,7 +1032,7 @@ struct dma_fence *xe_migrate_clear(struct
>> xe_migrate *m,
>> batch_size = 2 +
>> pte_update_size(m, clear_vram, src, &src_it,
>> &clear_L0, &clear_L0_ofs, &clear_L0_pt,
>> - clear_system_ccs ? 0 : emit_clear_cmd_len(gt), 0,
>> + !clear_on_create ? 0 : emit_clear_cmd_len(gt), 0,
>> avail_pts);
>> if (xe_device_has_flat_ccs(xe))
>> @@ -1060,7 +1060,7 @@ struct dma_fence *xe_migrate_clear(struct
>> xe_migrate *m,
>> bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
>> update_idx = bb->len;
>> - if (!clear_system_ccs)
>> + if (clear_on_create)
>
> will break on dgfx
True, I had a different version but I didn't update this afterwards.
Let me resend.
Regards,
Nirmoy
>
>> emit_clear(gt, bb, clear_L0_ofs, clear_L0,
>> XE_PAGE_SIZE, clear_vram);
>> if (xe_device_has_flat_ccs(xe)) {
More information about the Intel-xe
mailing list