[PATCH v4] drm/xe/migrate: Fix CCS copy for small VRAM copy chunks

Thomas Hellström thomas.hellstrom at linux.intel.com
Tue Dec 19 08:41:05 UTC 2023


On 12/18/23 20:37, Matthew Auld wrote:
> On Mon, 18 Dec 2023 at 14:29, Thomas Hellström
> <thomas.hellstrom at linux.intel.com> wrote:
>> Since the migrate code is using the identity map for addressing VRAM,
>> copy chunks may become as small as 64K if the VRAM resource is fragmented.
>>
>> However, a chunk size smaller than 1MiB may lead to the *next* chunk's
>> offset into the CCS metadata backup memory not being page-aligned. The
>> XY_CTRL_SURF_COPY_BLT command can't handle that, and even if it could,
>> the current code doesn't handle the offset calculation correctly.
>>
>> To fix this, make sure we align the size of VRAM copy chunks to 1MiB. If
>> the remaining data to copy is smaller than that, that's not a problem,
>> so use the remaining size. If the VRAM copy chunk becomes fragmented due
>> to the size alignment restriction, don't use the identity map, but instead
>> emit PTEs into the page-table like we do for system memory.
>>
>> v2:
>> - Rebase
>> v3:
>> - Future proof somewhat by taking into account the real data size to
>>    flat CCS metadata size ratio. (Matt Roper)
>> - Invert a couple of if-statements for better readability.
>> - Fix support for 4K-granularity VRAM sizes. (Tested on DG1).
>> v4:
>> - Fix up code comments
>> - Fix debug printout format typo.
>>
>> Cc: Matt Roper <matthew.d.roper at intel.com>
>> Cc: Matthew Auld <matthew.william.auld at gmail.com>
>> Cc: Matthew Brost <matthew.brost at intel.com>
>> Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
>> ---
>>   drivers/gpu/drm/xe/tests/xe_migrate.c |   2 +-
>>   drivers/gpu/drm/xe/xe_migrate.c       | 128 ++++++++++++++++----------
>>   2 files changed, 80 insertions(+), 50 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/tests/xe_migrate.c b/drivers/gpu/drm/xe/tests/xe_migrate.c
>> index 7a32faa2f688..a6523df0f1d3 100644
>> --- a/drivers/gpu/drm/xe/tests/xe_migrate.c
>> +++ b/drivers/gpu/drm/xe/tests/xe_migrate.c
>> @@ -331,7 +331,7 @@ static void xe_migrate_sanity_test(struct xe_migrate *m, struct kunit *test)
>>                  xe_res_first_sg(xe_bo_sg(pt), 0, pt->size, &src_it);
>>
>>          emit_pte(m, bb, NUM_KERNEL_PDE - 1, xe_bo_is_vram(pt), false,
>> -                &src_it, XE_PAGE_SIZE, pt);
>> +                &src_it, XE_PAGE_SIZE, pt->ttm.resource);
>>
>>          run_sanity_job(m, xe, bb, bb->len, "Writing PTE for our fake PT", test);
>>
>> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
>> index adf1dab5eba2..d8fff0403f74 100644
>> --- a/drivers/gpu/drm/xe/xe_migrate.c
>> +++ b/drivers/gpu/drm/xe/xe_migrate.c
>> @@ -62,6 +62,8 @@ struct xe_migrate {
>>           * out of the pt_bo.
>>           */
>>          struct drm_suballoc_manager vm_update_sa;
>> +       /** @min_chunk_size: For dgfx, Minimum chunk size */
>> +       u64 min_chunk_size;
>>   };
>>
>>   #define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
>> @@ -364,6 +366,19 @@ struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
>>          if (err)
>>                  return ERR_PTR(err);
>>
>> +       if (IS_DGFX(xe)) {
>> +               if (xe_device_has_flat_ccs(xe))
>> +                       /* min chunk size corresponds to 4K of CCS Metadata */
>> +                       m->min_chunk_size = SZ_4K * SZ_64K /
>> +                               xe_device_ccs_bytes(xe, SZ_64K);
>> +               else
>> +                       /* Somewhat arbitrary to avoid a huge amount of blits */
>> +                       m->min_chunk_size = SZ_64K;
>> +               m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size);
>> +               drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n",
>> +                       (unsigned long long)m->min_chunk_size);
>> +       }
>> +
>>          return m;
>>   }
>>
>> @@ -375,16 +390,35 @@ static u64 max_mem_transfer_per_pass(struct xe_device *xe)
>>          return MAX_PREEMPTDISABLE_TRANSFER;
>>   }
>>
>> -static u64 xe_migrate_res_sizes(struct xe_device *xe, struct xe_res_cursor *cur)
>> +static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur)
>>   {
>> -       /*
>> -        * For VRAM we use identity mapped pages so we are limited to current
>> -        * cursor size. For system we program the pages ourselves so we have no
>> -        * such limitation.
>> -        */
>> -       return min_t(u64, max_mem_transfer_per_pass(xe),
>> -                    mem_type_is_vram(cur->mem_type) ? cur->size :
>> -                    cur->remaining);
>> +       struct xe_device *xe = tile_to_xe(m->tile);
>> +       u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining);
>> +
>> +       if (mem_type_is_vram(cur->mem_type)) {
>> +               /*
>> +                * VRAM we want to blit in chunks with sizes aligned to
>> +                * min_chunk_size in order for the offset to CCS metadata to be
>> +                * page-aligned. If it's the last chunk it may be smaller.
>> +                *
>> +                * Another constraint is that we need to limit the blit to
>> +                * the VRAM block size, unless size is smaller than
>> +                * min_chunk_size.
>> +                */
>> +               u64 chunk = max_t(u64, cur->size, m->min_chunk_size);
>> +
>> +               size = min_t(u64, size, chunk);
>> +               if (size > m->min_chunk_size)
>> +                       size = round_down(size, m->min_chunk_size);
>> +       }
>> +
>> +       return size;
> Patch makes sense. Just one question here.
>
> Say you have a 1G object, but it is ultra fragmented into 64K-only
> pages. IIUC cur->remaining would be 1G, and cur->size would be 64K.
> But with that chunk=1M and the returned size=1M? Shouldn't the
> returned size be 64K in this case?

No, in that case we actually set up a 1M blit, but the fragmentation is
handled in xe_migrate_allow_identity(): instead of using the identity
mapping for VRAM, we emit PTEs for that 1M chunk.
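
For illustration, here is a minimal user-space sketch of that chunk-size
selection and the identity-map check for the scenario above. The struct,
constants and helper names are simplified stand-ins, not the real
xe_res_cursor / xe_migrate API:

/*
 * Sketch of the xe_migrate_res_sizes()/xe_migrate_allow_identity()
 * behaviour for a 1G object fragmented into 64K blocks, with
 * min_chunk_size = 1M. Simplified stand-ins only; not the driver code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_64K (64ULL << 10)
#define SZ_1M  (1ULL << 20)
#define SZ_8M  (8ULL << 20)	/* stand-in for MAX_PREEMPTDISABLE_TRANSFER */

struct cursor {			/* stand-in for struct xe_res_cursor */
	uint64_t size;		/* size of the current contiguous block */
	uint64_t remaining;	/* total bytes left to copy */
};

static uint64_t res_size(const struct cursor *cur, uint64_t min_chunk)
{
	uint64_t size = cur->remaining < SZ_8M ? cur->remaining : SZ_8M;
	uint64_t chunk = cur->size > min_chunk ? cur->size : min_chunk;

	if (size > chunk)
		size = chunk;
	if (size > min_chunk)		/* round_down(); min_chunk is a power of two */
		size &= ~(min_chunk - 1);

	return size;
}

static bool allow_identity(uint64_t size, const struct cursor *cur)
{
	/* Identity map only if the chunk is not fragmented. */
	return cur->size >= size;
}

int main(void)
{
	struct cursor cur = { .size = SZ_64K, .remaining = 1024 * SZ_1M };
	uint64_t sz = res_size(&cur, SZ_1M);

	printf("blit size %llu KiB, identity %s\n",
	       (unsigned long long)(sz >> 10),
	       allow_identity(sz, &cur) ? "yes" : "no");
	/* prints: blit size 1024 KiB, identity no -> PTEs are emitted */
	return 0;
}

So the 1M chunk is kept, but because cur->size (64K) is smaller than the
chunk, the identity map is skipped and emit_pte() fills the page table
for that chunk.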

/Thomas


>
>> +}
>> +
>> +static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur)
>> +{
>> +       /* If the chunk is not fragmented, allow identity map. */
>> +       return cur->size >= size;
>>   }
>>
>>   static u32 pte_update_size(struct xe_migrate *m,
>> @@ -397,7 +431,12 @@ static u32 pte_update_size(struct xe_migrate *m,
>>          u32 cmds = 0;
>>
>>          *L0_pt = pt_ofs;
>> -       if (!is_vram) {
>> +       if (is_vram && xe_migrate_allow_identity(*L0, cur)) {
>> +               /* Offset into identity map. */
>> +               *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
>> +                                             cur->start + vram_region_gpu_offset(res));
>> +               cmds += cmd_size;
>> +       } else {
>>                  /* Clip L0 to available size */
>>                  u64 size = min(*L0, (u64)avail_pts * SZ_2M);
>>                  u64 num_4k_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
>> @@ -413,11 +452,6 @@ static u32 pte_update_size(struct xe_migrate *m,
>>
>>                  /* Each chunk has a single blit command */
>>                  cmds += cmd_size;
>> -       } else {
>> -               /* Offset into identity map. */
>> -               *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
>> -                                             cur->start + vram_region_gpu_offset(res));
>> -               cmds += cmd_size;
>>          }
>>
>>          return cmds;
>> @@ -427,10 +461,10 @@ static void emit_pte(struct xe_migrate *m,
>>                       struct xe_bb *bb, u32 at_pt,
>>                       bool is_vram, bool is_comp_pte,
>>                       struct xe_res_cursor *cur,
>> -                    u32 size, struct xe_bo *bo)
>> +                    u32 size, struct ttm_resource *res)
>>   {
>>          struct xe_device *xe = tile_to_xe(m->tile);
>> -
>> +       struct xe_vm *vm = m->q->vm;
>>          u16 pat_index;
>>          u32 ptes;
>>          u64 ofs = at_pt * XE_PAGE_SIZE;
>> @@ -443,13 +477,6 @@ static void emit_pte(struct xe_migrate *m,
>>          else
>>                  pat_index = xe->pat.idx[XE_CACHE_WB];
>>
>> -       /*
>> -        * FIXME: Emitting VRAM PTEs to L0 PTs is forbidden. Currently
>> -        * we're only emitting VRAM PTEs during sanity tests, so when
>> -        * that's moved to a Kunit test, we should condition VRAM PTEs
>> -        * on running tests.
>> -        */
>> -
>>          ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
>>
>>          while (ptes) {
>> @@ -469,20 +496,22 @@ static void emit_pte(struct xe_migrate *m,
>>
>>                          addr = xe_res_dma(cur) & PAGE_MASK;
>>                          if (is_vram) {
>> -                               /* Is this a 64K PTE entry? */
>> -                               if ((m->q->vm->flags & XE_VM_FLAG_64K) &&
>> -                                   !(cur_ofs & (16 * 8 - 1))) {
>> -                                       xe_tile_assert(m->tile, IS_ALIGNED(addr, SZ_64K));
>> +                               if (vm->flags & XE_VM_FLAG_64K) {
>> +                                       u64 va = cur_ofs * XE_PAGE_SIZE / 8;
>> +
>> +                                       xe_assert(xe, (va & (SZ_64K - 1)) ==
>> +                                                 (addr & (SZ_64K - 1)));
>> +
>>                                          flags |= XE_PTE_PS64;
>>                                  }
>>
>> -                               addr += vram_region_gpu_offset(bo->ttm.resource);
>> +                               addr += vram_region_gpu_offset(res);
>>                                  devmem = true;
>>                          }
>>
>> -                       addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe,
>> -                                                                addr, pat_index,
>> -                                                                0, devmem, flags);
>> +                       addr = vm->pt_ops->pte_encode_addr(m->tile->xe,
>> +                                                          addr, pat_index,
>> +                                                          0, devmem, flags);
>>                          bb->cs[bb->len++] = lower_32_bits(addr);
>>                          bb->cs[bb->len++] = upper_32_bits(addr);
>>
>> @@ -694,8 +723,8 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
>>                  bool usm = xe->info.has_usm;
>>                  u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
>>
>> -               src_L0 = xe_migrate_res_sizes(xe, &src_it);
>> -               dst_L0 = xe_migrate_res_sizes(xe, &dst_it);
>> +               src_L0 = xe_migrate_res_sizes(m, &src_it);
>> +               dst_L0 = xe_migrate_res_sizes(m, &dst_it);
>>
>>                  drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
>>                          pass++, src_L0, dst_L0);
>> @@ -716,6 +745,7 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
>>                                                        &ccs_ofs, &ccs_pt, 0,
>>                                                        2 * avail_pts,
>>                                                        avail_pts);
>> +                       xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
>>                  }
>>
>>                  /* Add copy commands size here */
>> @@ -728,20 +758,20 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
>>                          goto err_sync;
>>                  }
>>
>> -               if (!src_is_vram)
>> -                       emit_pte(m, bb, src_L0_pt, src_is_vram, true, &src_it, src_L0,
>> -                                src_bo);
>> -               else
>> +               if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it))
>>                          xe_res_next(&src_it, src_L0);
>> -
>> -               if (!dst_is_vram)
>> -                       emit_pte(m, bb, dst_L0_pt, dst_is_vram, true, &dst_it, src_L0,
>> -                                dst_bo);
>>                  else
>> +                       emit_pte(m, bb, src_L0_pt, src_is_vram, true, &src_it, src_L0,
>> +                                src);
>> +
>> +               if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it))
>>                          xe_res_next(&dst_it, src_L0);
>> +               else
>> +                       emit_pte(m, bb, dst_L0_pt, dst_is_vram, true, &dst_it, src_L0,
>> +                                dst);
>>
>>                  if (copy_system_ccs)
>> -                       emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src_bo);
>> +                       emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
>>
>>                  bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
>>                  update_idx = bb->len;
>> @@ -950,7 +980,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
>>                  bool usm = xe->info.has_usm;
>>                  u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
>>
>> -               clear_L0 = xe_migrate_res_sizes(xe, &src_it);
>> +               clear_L0 = xe_migrate_res_sizes(m, &src_it);
>>
>>                  drm_dbg(&xe->drm, "Pass %u, size: %llu\n", pass++, clear_L0);
>>
>> @@ -977,12 +1007,12 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
>>
>>                  size -= clear_L0;
>>                  /* Preemption is enabled again by the ring ops. */
>> -               if (!clear_vram) {
>> -                       emit_pte(m, bb, clear_L0_pt, clear_vram, true, &src_it, clear_L0,
>> -                                bo);
>> -               } else {
>> +               if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it))
>>                          xe_res_next(&src_it, clear_L0);
>> -               }
>> +               else
>> +                       emit_pte(m, bb, clear_L0_pt, clear_vram, true, &src_it, clear_L0,
>> +                                dst);
>> +
>>                  bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
>>                  update_idx = bb->len;
>>
>> --
>> 2.42.0
>>

