[Intel-xe] [PATCH v2] drm/xe/migrate: Some cleanups and fixes
Niranjana Vishwanathapura
niranjana.vishwanathapura at intel.com
Thu Mar 2 09:34:18 UTC 2023
On Fri, Feb 24, 2023 at 12:04:16PM +0100, Maarten Lankhorst wrote:
> Hey,
>
> Some quick notes.
>
> On 2023-02-23 22:56, Niranjana Vishwanathapura wrote:
>
> Some minor fixes and cleanups in migration code.
>
> 1) Remove unused 'struct xe_bo *bo' from emit_pte()
> 2) Use 'false' for is_vram of emit_pte() where it is obvious
>
> This is a matter of personal taste. I originally wrote those calls that
> way to make it clear what exactly is false, instead of having to infer
> it from the context.
Thanks Maarten for the review.
Ok, I will drop change 2).
>
> 3) Pass source bo as 'bo' in xe_migrate_copy() calls
> 4) Kernel-doc fix for function xe_migrate_clear()
> 5) In write_pgtable(), consider 'ofs' while calculating chunk size
>
> Since you're trying to do a few unrelated things, better to split this up.
>
Ok
> 6) Update xe_migrate_doc.h with 32 page table structs (not 48)
>
> Docbook updates could be a single patch.
>
Ok
>
> v2: Rebased
>
> Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura at intel.com>
> ---
> drivers/gpu/drm/xe/tests/xe_migrate.c | 4 ++--
> drivers/gpu/drm/xe/xe_migrate.c | 17 +++++++----------
> drivers/gpu/drm/xe/xe_migrate_doc.h | 2 +-
> 3 files changed, 10 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/tests/xe_migrate.c b/drivers/gpu/drm/xe/tests/xe_migrate.c
> index 03a60d5b42f1..12e8f79e65cb 100644
> --- a/drivers/gpu/drm/xe/tests/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/tests/xe_migrate.c
> @@ -148,7 +148,7 @@ static void test_copy(struct xe_migrate *m, struct xe_bo *bo,
> xe_map_memset(xe, &sysmem->vmap, 0, 0xd0, sysmem->size);
> xe_map_memset(xe, &bo->vmap, 0, 0xc0, bo->size);
>
> - fence = xe_migrate_copy(m, sysmem, bo->ttm.resource,
> + fence = xe_migrate_copy(m, bo, bo->ttm.resource,
> sysmem->ttm.resource);
> if (!sanity_fence_failed(xe, fence, big ? "Copying big bo vram -> sysmem" :
> "Copying small bo vram -> sysmem", test)) {
> @@ -274,7 +274,7 @@ static void xe_migrate_sanity_test(struct xe_migrate *m, struct kunit *test)
> expected |= GEN12_PTE_PS64;
> xe_res_first(pt->ttm.resource, 0, pt->size, &src_it);
> emit_pte(m, bb, NUM_KERNEL_PDE - 1, xe_bo_is_vram(pt),
> - &src_it, GEN8_PAGE_SIZE, pt);
> + &src_it, GEN8_PAGE_SIZE);
> run_sanity_job(m, xe, bb, bb->len, "Writing PTE for our fake PT", test);
>
> retval = xe_map_rd(xe, &bo->vmap, GEN8_PAGE_SIZE * (NUM_KERNEL_PDE - 1),
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index cbcc355cd391..cf6ad1b1ccd5 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -423,7 +423,7 @@ static void emit_pte(struct xe_migrate *m,
> struct xe_bb *bb, u32 at_pt,
> bool is_vram,
> struct xe_res_cursor *cur,
> - u32 size, struct xe_bo *bo)
> + u32 size)
> {
> u32 ptes;
> u64 ofs = at_pt * GEN8_PAGE_SIZE;
> @@ -672,19 +672,17 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
> emit_arb_clear(bb);
>
> if (!src_is_vram)
> - emit_pte(m, bb, src_L0_pt, src_is_vram, &src_it, src_L0,
> - bo);
> + emit_pte(m, bb, src_L0_pt, false, &src_it, src_L0);
> else
> xe_res_next(&src_it, src_L0);
>
> if (!dst_is_vram)
> - emit_pte(m, bb, dst_L0_pt, dst_is_vram, &dst_it, src_L0,
> - bo);
> + emit_pte(m, bb, dst_L0_pt, false, &dst_it, src_L0);
> else
> xe_res_next(&dst_it, src_L0);
>
> if (copy_system_ccs)
> - emit_pte(m, bb, ccs_pt, false, &ccs_it, ccs_size, bo);
> + emit_pte(m, bb, ccs_pt, false, &ccs_it, ccs_size);
>
> bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
> update_idx = bb->len;
> @@ -783,7 +781,7 @@ static int emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
> }
>
> /**
> - * xe_migrate_clear() - Copy content of TTM resources.
> + * xe_migrate_clear() - Clear content of TTM resources.
> * @m: The migration context.
> * @bo: The buffer object @dst is currently bound to.
> * @dst: The dst TTM resource to be cleared.
> @@ -856,8 +854,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
> /* Preemption is enabled again by the ring ops. */
> if (!clear_vram) {
> emit_arb_clear(bb);
> - emit_pte(m, bb, clear_L0_pt, clear_vram, &src_it, clear_L0,
> - bo);
> + emit_pte(m, bb, clear_L0_pt, false, &src_it, clear_L0);
> } else {
> xe_res_next(&src_it, clear_L0);
> }
> @@ -941,7 +938,7 @@ static void write_pgtable(struct xe_gt *gt, struct xe_bb *bb, u64 ppgtt_ofs,
>
> do {
> u64 addr = ppgtt_ofs + ofs * 8;
> - chunk = min(update->qwords, 0x1ffU);
> + chunk = min(update->qwords, 0x1ffU - (ofs & 0x1ff));
>
> There is a bug here which should be a separate patch, though we will never hit it:
> a page table can hold 512 entries, but the moment we did more than 511 updates,
> we would spill over and update an entry of the PDE above it.
Hmm, isn't it up to the caller of xe_migrate_update_pgtables() how to update the
page table? It is possible that the caller updates all 512 entries because the
physical addresses are not contiguous, right?
>
> The correct fix is chunk = min(size, 0x1ffU);
>
Yes, I agree we should use 'size' here instead of 'update->qwords'. But I still
think we need '0x1ffU - (ofs & 0x1ff)'. I believe MI_STORE_DATA_IMM can write
across a page boundary, so strictly speaking the '(ofs & 0x1ff)' term is not
required. By the same logic, though, we would not need to cap writes at a
512-entry chunk size either. So adjusting by '(ofs & 0x1ff)' seemed appropriate
to me. Thoughts?
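
To make the arithmetic concrete, here is a minimal standalone sketch, not the
driver code: QWORDS_PER_PT, MAX_CHUNK, next_chunk() and the values in main()
are my own hypothetical names and numbers. It shows the two chunking variants
under discussion, capping each write at 511 qwords and additionally trimming so
a chunk never crosses a 512-entry page-table boundary. The patch's
'0x1ffU - (ofs & 0x1ff)' folds both into one expression; the sketch keeps them
separate for clarity:

#include <stdio.h>

#define QWORDS_PER_PT 512u	/* a 4K page table holds 512 8-byte entries */
#define MAX_CHUNK     0x1ffu	/* 511-qword cap per write, as in the driver */

/* Return how many qwords the next write may cover, starting at entry 'ofs'. */
static unsigned int next_chunk(unsigned int size, unsigned int ofs,
			       int stop_at_pt_boundary)
{
	unsigned int chunk = size < MAX_CHUNK ? size : MAX_CHUNK;

	if (stop_at_pt_boundary) {
		/* qwords left before the current 512-entry table ends */
		unsigned int left = QWORDS_PER_PT - (ofs % QWORDS_PER_PT);

		if (chunk > left)
			chunk = left;
	}
	return chunk;
}

int main(void)
{
	unsigned int ofs = 510, size = 1024;	/* hypothetical update */

	while (size) {
		unsigned int chunk = next_chunk(size, ofs, 1);

		printf("write %u qwords at entry %u\n", chunk, ofs);
		ofs += chunk;
		size -= chunk;
	}
	return 0;
}

With the boundary trim enabled, this example splits the 1024-qword update into
four writes (2, 511, 1 and 510 qwords), none of which crosses a table boundary;
with the cap alone it would be split 511/511/2.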
Niranjana
>
> /* Ensure populatefn can do memset64 by aligning bb->cs */
> if (!(bb->len & 1))
> diff --git a/drivers/gpu/drm/xe/xe_migrate_doc.h b/drivers/gpu/drm/xe/xe_migrate_doc.h
> index 6a68fdff08dc..584972c4d3be 100644
> --- a/drivers/gpu/drm/xe/xe_migrate_doc.h
> +++ b/drivers/gpu/drm/xe/xe_migrate_doc.h
> @@ -21,7 +21,7 @@
> * table BOs for updates, and identity map the entire device's VRAM with 1 GB
> * pages.
> *
> - * Currently the page structure consists of 48 phyiscal pages with 16 being
> + * Currently the page structure consists of 32 physical pages with 16 being
> * reserved for BO mapping during copies and clear, 1 reserved for kernel binds,
> * several pages are needed to setup the identity mappings (exact number based
> * on how many bits of address space the device has), and the rest are reserved