[PATCH] drm/xe: Enable 2M pages in xe_migrate_vram
Summers, Stuart
stuart.summers at intel.com
Mon Aug 11 19:02:15 UTC 2025
On Mon, 2025-08-11 at 11:50 -0700, Matthew Brost wrote:
> On Mon, Aug 11, 2025 at 12:33:57PM -0600, Summers, Stuart wrote:
> > On Fri, 2025-08-08 at 11:59 -0700, Matthew Brost wrote:
> > > Using 2M pages in xe_migrate_vram has two benefits: we issue fewer
> > > instructions per 2M copy (1 vs. 512), and the cache hit rate should
> > > be higher. This results in increased copy engine bandwidth, as shown
> > > by benchmark IGTs.
> > >
> > > Enable 2M pages by reserving PDEs in the migrate VM and using 2M
> > > pages in xe_migrate_vram if the DMA address order matches 2M.
> > >
> > > Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> >
> > Generally this makes sense to me. I have a couple of minor comments
> > below, but I agree doing this at a 2M granularity seems preferable if
> > the user is using larger page sizes. We could even add a 1G copy, but
>
> Our copy chunk size is less than 1G (it’s ~8M), so there’s that. It’s
> possible the target we’re copying is a ≥1G buffer with a ≥1G DMA map.
> However, the ROI is much smaller in that case given our chunk size:
> we’d issue ~4× the instructions/cache entries vs. 1×, compared to the
> 4K → 2M case where it’s ~512× vs. 1×.
>
> > this covers the broadest set of use cases across platforms. I don't
> > think we'll get that much more benefit out of the larger size.
> >
>
> Yeah, I think with 2M we will see (and did see) real copy bandwidth
> benefits, and I'm not sure 1G would have a measurable impact.
Yeah, I agree. I was just thinking over the possibility given you're
already making the change here. It doesn't make sense to do this for
1GB.
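
Just to spell out the arithmetic I'm agreeing with, using the ~8M copy
chunk size you mention (treating that figure as approximate):

	8M chunk / 4K pages = 2048 PTE writes per chunk
	8M chunk / 2M pages =    4 PDE writes per chunk
	8M chunk / 1G pages =    1 PDE write  per chunk

So the 4K → 2M step removes ~99.8% of the page table writes per chunk,
while 2M → 1G only saves three more writes.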
>
> > > ---
> > >  drivers/gpu/drm/xe/xe_migrate.c | 82 +++++++++++++++++++++++++++++++--
> > >  1 file changed, 78 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> > > index ac9f4d4988d2..9021aa3ac1e0 100644
> > > --- a/drivers/gpu/drm/xe/xe_migrate.c
> > > +++ b/drivers/gpu/drm/xe/xe_migrate.c
> > > @@ -56,6 +56,13 @@ struct xe_migrate {
> > >  	u64 usm_batch_base_ofs;
> > >  	/** @cleared_mem_ofs: VM offset of @cleared_bo. */
> > >  	u64 cleared_mem_ofs;
> > > +	/** @large_page_copy_ofs: VM offset of 2M pages used for large copies */
> > > +	u64 large_page_copy_ofs;
> > > +	/**
> > > +	 * @large_page_copy_pdes: BO offset to writeout 2M pages (PDEs) used for
> > > +	 * large copies
> > > +	 */
> > > +	u64 large_page_copy_pdes;
> > >  	/**
> > >  	 * @fence: dma-fence representing the last migration job batch.
> > >  	 * Protected by @job_mutex.
> > > @@ -289,6 +296,12 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
> > >  			  (i + 1) * 8, u64, entry);
> > >  	}
> > >
> > > +	/* Reserve 2M PDEs */
> > > +	level = 1;
> > > +	m->large_page_copy_ofs = NUM_PT_SLOTS << xe_pt_shift(level);
> >
> > Can't we just re-use xe_migrate_vm_addr() and allow the range above
> > NUM_PT_SLOTS?
> >
>
> I decided to precalculate and save large_page_copy_ofs as its value
> depends on what we assign to large_page_copy_pdes (i.e., the math of
> the two needs to match).
Ok makes sense.
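
For anyone following along, the invariant as I understand it (assuming
xe_pt_shift(1) is the 2M shift and 8-byte PDEs; the variable names below
are only for illustration):

	/* illustrative: BO offset of PDE index NUM_PT_SLOTS in the level-1 PT */
	u64 pde_bo_ofs = map_ofs + XE_PAGE_SIZE * 1 + NUM_PT_SLOTS * 8;

	/* GPU VA covered by that PDE: entry index * 2M */
	u64 gpu_va = (u64)NUM_PT_SLOTS << xe_pt_shift(1);

If either of those moves, the other has to move with it, which is why
precalculating both together makes sense.
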
Thanks,
Stuart
>
> > > +	m->large_page_copy_pdes = map_ofs + XE_PAGE_SIZE * level +
> > > +				  NUM_PT_SLOTS * 8;
> > > +
> > >  	/* Set up a 1GiB NULL mapping at 255GiB offset. */
> > >  	level = 2;
> > >  	xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255 * 8, u64,
> > > @@ -1740,6 +1753,40 @@ static u32 pte_update_cmd_size(u64 size)
> > >  	return num_dword;
> > >  }
> > >
> > > +static void build_large_pt_update_batch_sram(struct xe_migrate *m,
> > > +					     struct xe_bb *bb,
> > > +					     struct drm_pagemap_addr *sram_addr,
> > > +					     u32 size)
> > > +{
> > > +	u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB];
> > > +	u32 ptes, i = 0, large_size = (0x1 << xe_pt_shift(1));
> > > +
> > > +	ptes = DIV_ROUND_UP(size, large_size);
> > > +
> > > +	bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(ptes);
> > > +	bb->cs[bb->len++] = m->large_page_copy_pdes;
> >
> > IMO we could combine this with the PTE version. I realize there are
> > some slight differences, but if we, for instance, needed to change
> > the PAT index for compression similar to what we're doing in some of
> > the other batches in this file, we would then only have to do that in
> > one place.
>
> Let me see what that looks like. It seems doable without too many if
> statements.
>
> Matt
>
> > Not a blocker though.
> >
> > Thanks,
> > Stuart
> >
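
To sketch what I was picturing (completely untested, and it glosses over
the MI_SDI chunking and mixed-order handling the existing 4K path has to
deal with, plus the asserts), folding the two helpers into one keyed on
the PT level could look roughly like:

static void build_pt_update_batch_sram(struct xe_migrate *m,
				       struct xe_bb *bb, u32 pt_offset,
				       struct drm_pagemap_addr *sram_addr,
				       u32 size, unsigned int level)
{
	u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB];
	/* 4K entries at level 0, 2M entries at level 1 */
	u32 pt_size = 0x1 << xe_pt_shift(level);
	u32 ptes = DIV_ROUND_UP(size, pt_size);
	u32 i = 0;

	bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(ptes);
	/* level 1 targets the reserved PDEs, level 0 the caller's PT slot */
	bb->cs[bb->len++] = level ? m->large_page_copy_pdes : pt_offset;
	bb->cs[bb->len++] = 0;

	while (ptes--) {
		u64 addr = sram_addr[i].addr & ~((u64)pt_size - 1);

		/* one place to tweak pat_index for compression later on */
		addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe, addr,
							 pat_index, level,
							 false, 0);
		bb->cs[bb->len++] = lower_32_bits(addr);
		bb->cs[bb->len++] = upper_32_bits(addr);

		i += pt_size / PAGE_SIZE;
	}
}

If the order handling makes that too branchy in practice, keeping the
two helpers separate is fine by me; as I said, not a blocker.
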
> > > +	bb->cs[bb->len++] = 0;
> > > +
> > > +	while (ptes--) {
> > > +		u64 addr = sram_addr[i].addr & ~(((u64)large_size) - 1);
> > > +
> > > +		xe_tile_assert(m->tile, sram_addr[i].proto ==
> > > +			       DRM_INTERCONNECT_SYSTEM);
> > > +		xe_tile_assert(m->tile, PAGE_SIZE << sram_addr[i].order ==
> > > +			       large_size);
> > > +		xe_tile_assert(m->tile, addr);
> > > +
> > > +		addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe,
> > > +							 addr, pat_index,
> > > +							 1, false, 0);
> > > +
> > > +		bb->cs[bb->len++] = lower_32_bits(addr);
> > > +		bb->cs[bb->len++] = upper_32_bits(addr);
> > > +
> > > +		i += large_size / PAGE_SIZE;
> > > +	}
> > > +}
> > > +
> > >  static void build_pt_update_batch_sram(struct xe_migrate *m,
> > >  				       struct xe_bb *bb, u32 pt_offset,
> > >  				       struct drm_pagemap_addr *sram_addr,
> > > @@ -1777,6 +1824,22 @@ static void build_pt_update_batch_sram(struct xe_migrate *m,
> > >  	}
> > >  }
> > >
> > > +static bool xe_migrate_vram_use_pde(struct drm_pagemap_addr *sram_addr,
> > > +				    unsigned long size)
> > > +{
> > > +	u32 large_size = (0x1 << xe_pt_shift(1));
> > > +	unsigned long i;
> > > +
> > > +	for (i = 0; i < DIV_ROUND_UP(size, PAGE_SIZE); ++i) {
> > > +		if (PAGE_SIZE << sram_addr[i].order != large_size)
> > > +			return false;
> > > +
> > > +		i += large_size / PAGE_SIZE;
> > > +	}
> > > +
> > > +	return true;
> > > +}
> > > +
> > >  enum xe_migrate_copy_dir {
> > >  	XE_MIGRATE_COPY_TO_VRAM,
> > >  	XE_MIGRATE_COPY_TO_SRAM,
> > > @@ -1806,6 +1869,7 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
> > >  					  PAGE_SIZE : 4;
> > >  	int err;
> > >  	unsigned long i, j;
> > > +	bool use_pde = xe_migrate_vram_use_pde(sram_addr, len + sram_offset);
> > >
> > >  	if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) ||
> > >  			(sram_offset | vram_addr) & XE_CACHELINE_MASK))
> > > @@ -1840,16 +1904,26 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
> > >  		i += NR_PAGES(order);
> > >  	}
> > >
> > > -	build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
> > > -				   sram_addr, len + sram_offset);
> > > +	if (use_pde)
> > > +		build_large_pt_update_batch_sram(m, bb, sram_addr,
> > > +						 len + sram_offset);
> > > +	else
> > > +		build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
> > > +					   sram_addr, len + sram_offset);
> > >
> > >  	if (dir == XE_MIGRATE_COPY_TO_VRAM) {
> > > -		src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
> > > +		if (use_pde)
> > > +			src_L0_ofs = m->large_page_copy_ofs + sram_offset;
> > > +		else
> > > +			src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
> > >  		dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
> > >
> > >  	} else {
> > >  		src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
> > > -		dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
> > > +		if (use_pde)
> > > +			dst_L0_ofs = m->large_page_copy_ofs + sram_offset;
> > > +		else
> > > +			dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
> > >  	}
> > >
> > >  	bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
> >