[Intel-xe] [PATCH] drm/xe/mtl: Increase prefetch size
Lucas De Marchi
lucas.demarchi at intel.com
Fri Jul 21 20:42:07 UTC 2023
On Fri, Jul 21, 2023 at 12:10:27PM -0700, Matthew Brost wrote:
>MTL requires a 2k prefetch for the RCS hw engine; account for that here.
>Also wire the hwe class into this calculation so we don't waste space in
>the common case of using a BCS hw engine.
>
>Bspec: 45718
>
>Cc: Matt Roper <matthew.d.roper at intel.com>
>Signed-off-by: Matthew Brost <matthew.brost at intel.com>
>---
> drivers/gpu/drm/xe/tests/xe_migrate.c | 3 ++-
> drivers/gpu/drm/xe/xe_bb.c | 21 ++++++++++++---------
> drivers/gpu/drm/xe/xe_bb.h | 4 +++-
> drivers/gpu/drm/xe/xe_gt.c | 4 ++--
> drivers/gpu/drm/xe/xe_migrate.c | 7 ++++---
> 5 files changed, 23 insertions(+), 16 deletions(-)
>
>diff --git a/drivers/gpu/drm/xe/tests/xe_migrate.c b/drivers/gpu/drm/xe/tests/xe_migrate.c
>index 30e5fdf6ca63..71a127efe99a 100644
>--- a/drivers/gpu/drm/xe/tests/xe_migrate.c
>+++ b/drivers/gpu/drm/xe/tests/xe_migrate.c
>@@ -286,7 +286,8 @@ static void xe_migrate_sanity_test(struct xe_migrate *m, struct kunit *test)
> goto free_pt;
> }
>
>- bb = xe_bb_new(tile->primary_gt, 32, xe->info.supports_usm);
>+ bb = xe_bb_new(tile->primary_gt, m->eng->hwe->class, 32,
>+ xe->info.supports_usm);
> if (IS_ERR(bb)) {
> KUNIT_FAIL(test, "Failed to create batchbuffer: %li\n",
> PTR_ERR(bb));
>diff --git a/drivers/gpu/drm/xe/xe_bb.c b/drivers/gpu/drm/xe/xe_bb.c
>index f9b6b7adf99f..15e616fcccef 100644
>--- a/drivers/gpu/drm/xe/xe_bb.c
>+++ b/drivers/gpu/drm/xe/xe_bb.c
>@@ -14,21 +14,23 @@
> #include "xe_sched_job.h"
> #include "xe_vm_types.h"
>
>-static int bb_prefetch(struct xe_gt *gt)
>+static int bb_prefetch(struct xe_gt *gt, enum xe_engine_class class)
> {
> struct xe_device *xe = gt_to_xe(gt);
>
>- if (GRAPHICS_VERx100(xe) >= 1250 && !xe_gt_is_media_type(gt))
>- /*
>- * RCS and CCS require 1K, although other engines would be
>- * okay with 512.
>- */
>+ if (xe->info.platform == XE_METEORLAKE &&
Why are we checking for the platform here rather than the IP version? I
think we can take the route of a >= version check here.
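Untested sketch of what I mean, assuming MTL's graphics IP reports as
12.70 so that GRAPHICS_VERx100(xe) == 1270 (please double check the bspec
for which IPs actually need the larger prefetch):

	/* Newer graphics IPs need a larger prefetch on the render engine */
	if (GRAPHICS_VERx100(xe) >= 1270 && class == XE_ENGINE_CLASS_RENDER)
		return SZ_2K;

That way the next platform on the same or a newer IP gets the right size
without adding another platform check.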
>+ class == XE_ENGINE_CLASS_RENDER)
>+ return SZ_2K;
>+ else if (GRAPHICS_VERx100(xe) >= 1250 &&
>+ (class == XE_ENGINE_CLASS_RENDER ||
>+ class == XE_ENGINE_CLASS_COMPUTE))
> return SZ_1K;
> else
> return SZ_512;
> }
>
>-struct xe_bb *xe_bb_new(struct xe_gt *gt, u32 dwords, bool usm)
>+struct xe_bb *xe_bb_new(struct xe_gt *gt, enum xe_engine_class class,
>+ u32 dwords, bool usm)
> {
> struct xe_tile *tile = gt_to_tile(gt);
> struct xe_bb *bb = kmalloc(sizeof(*bb), GFP_KERNEL);
>@@ -44,7 +46,7 @@ struct xe_bb *xe_bb_new(struct xe_gt *gt, u32 dwords, bool usm)
> * requirements.
> */
> bb->bo = xe_sa_bo_new(!usm ? tile->mem.kernel_bb_pool : gt->usm.bb_pool,
>- 4 * (dwords + 1) + bb_prefetch(gt));
>+ 4 * (dwords + 1) + bb_prefetch(gt, class));
I'm actually wondering why we need this bb prefetch padding at all.
Couldn't we just always map the scratch page right after the bo and call
it a day?
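Just to put numbers on it, using the 32-dword BB from the xe_migrate
kunit test above: with this patch an RCS submission on MTL allocates
4 * (32 + 1) + SZ_2K = 2180 bytes from the suballocator, of which only
132 bytes are actually written; the rest is prefetch padding. Mapping
scratch after the bo would let every BB allocate just what it emits.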
Lucas De Marchi
> if (IS_ERR(bb->bo)) {
> err = PTR_ERR(bb->bo);
> goto err;
>@@ -66,7 +68,8 @@ __xe_bb_create_job(struct xe_engine *kernel_eng, struct xe_bb *bb, u64 *addr)
>
> bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
>
>- WARN_ON(bb->len * 4 + bb_prefetch(kernel_eng->gt) > size);
>+ WARN_ON(bb->len * 4 + bb_prefetch(kernel_eng->gt,
>+ kernel_eng->hwe->class) > size);
>
> xe_sa_bo_flush_write(bb->bo);
>
>diff --git a/drivers/gpu/drm/xe/xe_bb.h b/drivers/gpu/drm/xe/xe_bb.h
>index 0cc9260c9634..ad6170b0b23a 100644
>--- a/drivers/gpu/drm/xe/xe_bb.h
>+++ b/drivers/gpu/drm/xe/xe_bb.h
>@@ -8,13 +8,15 @@
>
> #include "xe_bb_types.h"
>
>+enum xe_engine_class;
> struct dma_fence;
>
> struct xe_gt;
> struct xe_engine;
> struct xe_sched_job;
>
>-struct xe_bb *xe_bb_new(struct xe_gt *gt, u32 size, bool usm);
>+struct xe_bb *xe_bb_new(struct xe_gt *gt, enum xe_engine_class class, u32 size,
>+ bool usm);
> struct xe_sched_job *xe_bb_create_job(struct xe_engine *kernel_eng,
> struct xe_bb *bb);
> struct xe_sched_job *xe_bb_create_migration_job(struct xe_engine *kernel_eng,
>diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
>index 3e32d38aeeea..d928b161c480 100644
>--- a/drivers/gpu/drm/xe/xe_gt.c
>+++ b/drivers/gpu/drm/xe/xe_gt.c
>@@ -88,7 +88,7 @@ static int emit_nop_job(struct xe_gt *gt, struct xe_engine *e)
> u64 batch_ofs;
> long timeout;
>
>- bb = xe_bb_new(gt, 4, false);
>+ bb = xe_bb_new(gt, e->hwe->class, 4, false);
> if (IS_ERR(bb))
> return PTR_ERR(bb);
>
>@@ -126,7 +126,7 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_engine *e)
> long timeout;
> int count = 0;
>
>- bb = xe_bb_new(gt, SZ_4K, false); /* Just pick a large BB size */
>+ bb = xe_bb_new(gt, e->hwe->class, SZ_4K, false); /* Just pick a large BB size */
> if (IS_ERR(bb))
> return PTR_ERR(bb);
>
>diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
>index bc7dac4e2086..649ba38bca81 100644
>--- a/drivers/gpu/drm/xe/xe_migrate.c
>+++ b/drivers/gpu/drm/xe/xe_migrate.c
>@@ -685,7 +685,7 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
> batch_size += EMIT_COPY_DW +
> (xe_device_has_flat_ccs(xe) ? EMIT_COPY_CCS_DW : 0);
>
>- bb = xe_bb_new(gt, batch_size, usm);
>+ bb = xe_bb_new(gt, m->eng->hwe->class, batch_size, usm);
> if (IS_ERR(bb)) {
> err = PTR_ERR(bb);
> goto err_sync;
>@@ -914,7 +914,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
> if (WARN_ON_ONCE(!clear_L0))
> break;
>
>- bb = xe_bb_new(gt, batch_size, usm);
>+ bb = xe_bb_new(gt, m->eng->hwe->class, batch_size, usm);
> if (IS_ERR(bb)) {
> err = PTR_ERR(bb);
> goto err_sync;
>@@ -1196,7 +1196,8 @@ xe_migrate_update_pgtables(struct xe_migrate *m,
> */
> XE_BUG_ON(batch_size >= SZ_128K);
>
>- bb = xe_bb_new(gt, batch_size, !eng && xe->info.supports_usm);
>+ bb = xe_bb_new(gt, eng_override->hwe->class, batch_size,
>+ !eng && xe->info.supports_usm);
> if (IS_ERR(bb))
> return ERR_CAST(bb);
>
>--
>2.34.1
>