[PATCH 3/6] drm/xe/xe2: Introduce identity map for compressed pat for vram

Fri Jul 12 04:24:47 UTC 2024

On 7/11/2024 5:32 AM, Matthew Auld wrote:
> On 11/07/2024 10:19, Akshata Jahagirdar wrote:
>> Xe2+ has unified compression (exactly one compression mode/format),
>> where compression is now controlled via PAT at PTE level.
>> This simplifies KMD operations, as it can now decompress freely
>> without concern for the buffer's original compression format—unlike DG2,
>> which had multiple compression formats and thus required copying the
>> raw CCS state during VRAM eviction. In addition mixed VRAM and system
>> memory buffers were not supported with compression enabled.
>>
>> On Xe2 dGPU compression is still only supported with VRAM, however we
>> can now support compression with VRAM and system memory buffers,
>> with GPU access being seamless underneath. So long as when doing
>> VRAM -> system memory the KMD uses compressed -> uncompressed,
>> to decompress it. This also allows CPU access to such buffers,
>> assuming that userspace first decompress the corresponding
>> pages being accessed.
>> If the pages are already in system memory then KMD would have already
>> decompressed them. When restoring such buffers with sysmem -> VRAM
>> the KMD can't easily know which pages were originally compressed,
>> so we always use uncompressed -> uncompressed here.
>> With this it also means we can drop all the raw CCS handling on such
>> platforms (including needing to allocate extra CCS storage).
>>
>> In order to support this we now need to have two different identity
>> mappings for compressed and uncompressed VRAM.
>> In this patch, we set up the additional identity map for the VRAM with
>> compressed pat_index. We then select the appropriate mapping during
>> migration/clear.During eviction (vram->sysmem), we use the mapping 
>> from compressed -> uncompressed.
>> During restore (sysmem->vram), we need the mapping from uncompressed 
>> -> uncompressed.
>> Therefore, we need to have two different mappings for compressed and 
>> uncompressed
>> vram. We set up an additional identity map for the vram with compressed
>> pat_index. We then select the appropriate mapping during 
>> migration/clear.
>
> Nit: Formatting looks off.
Will fix this.
>
>>
>> Signed-off-by: Akshata Jahagirdar <akshata.jahagirdar at intel.com>
>
> Should this not be earlier in the series. I would have expected it to 
> be the first patch, since both the new clearing and copy logic are 
> built on top of this AFAICT.
>
So, clearing logic is not dependent on identity map changes.
To clear the BO, we modify using an uncompressed pat-index that 
indirectly updates the
compression status as uncompressed i.e zeroed CCS.
So, kept the current patch order as
  1. Handle clear ccs logic for xe2 dgfx
2. selftest to check clearing of bo and ccs
3. introduce identity map patch - needed for migration changes.
4. migration logic implementation
5. migration selftest
>> ---
>>   drivers/gpu/drm/xe/xe_migrate.c | 55 +++++++++++++++++++++++++--------
>>   1 file changed, 42 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_migrate.c 
>> b/drivers/gpu/drm/xe/xe_migrate.c
>> index 2fc2cf375b1e..a3d6d3113ac2 100644
>> --- a/drivers/gpu/drm/xe/xe_migrate.c
>> +++ b/drivers/gpu/drm/xe/xe_migrate.c
>> @@ -120,14 +120,20 @@ static u64 xe_migrate_vm_addr(u64 slot, u32 level)
>>       return (slot + 1ULL) << xe_pt_shift(level + 1);
>>   }
>>   -static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr)
>> +static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr, bool 
>> is_comp_pte)
>>   {
>>       /*
>>        * Remove the DPA to get a correct offset into identity table 
>> for the
>>        * migrate offset
>>        */
>> +    u64 identity_offset = 256ULL;
>> +
>> +    if (GRAPHICS_VER(xe) >= 20 && is_comp_pte)
>> +        identity_offset = 256ULL +
>> + DIV_ROUND_UP_ULL(xe->mem.vram.actual_physical_size, SZ_1G);
>> +
>>       addr -= xe->mem.vram.dpa_base;
>> -    return addr + (256ULL << xe_pt_shift(2));
>> +    return addr + (identity_offset << xe_pt_shift(2));
>>   }
>>     static int xe_migrate_prepare_vm(struct xe_tile *tile, struct 
>> xe_migrate *m,
>> @@ -214,12 +220,12 @@ static int xe_migrate_prepare_vm(struct xe_tile 
>> *tile, struct xe_migrate *m,
>>       } else {
>>           u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
>>   -        m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);
>> +        m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false);
>>             if (xe->info.has_usm) {
>>               batch = tile->primary_gt->usm.bb_pool->bo;
>>               batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
>> -            m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, 
>> batch_addr);
>> +            m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, 
>> batch_addr, false);
>>           }
>>       }
>>   @@ -251,7 +257,7 @@ static int xe_migrate_prepare_vm(struct xe_tile 
>> *tile, struct xe_migrate *m,
>>             | XE_PTE_NULL);
>>       m->cleared_mem_ofs = (255ULL << xe_pt_shift(level));
>>   -    /* Identity map the entire vram at 256GiB offset */
>> +    /* Identity map the entire vram for uncompressed pat_index at 
>> 256GiB offset */
>>       if (IS_DGFX(xe)) {
>>           u64 pos, ofs, flags;
>>           /* XXX: Unclear if this should be usable_size? */
>> @@ -294,6 +300,30 @@ static int xe_migrate_prepare_vm(struct xe_tile 
>> *tile, struct xe_migrate *m,
>>           }
>>             xe_assert(xe, pos == vram_limit);
>> +
>> +        /*
>> +         * Identity map the entire vram for compressed pat_index for 
>> xe2+
>> +         * if flat ccs is enabled.
>> +         */
>> +        if (GRAPHICS_VER(xe) >= 20 && xe_device_has_flat_ccs(xe)) {
>> +            u16 comp_pat_index = 
>> xe->pat.idx[XE_CACHE_NONE_COMPRESSION];
>> +            u64 vram_offset = 256 +
>> + DIV_ROUND_UP_ULL(xe->mem.vram.actual_physical_size, SZ_1G);
>> +
>> +            level = 2;
>> +            ofs = map_ofs + XE_PAGE_SIZE * level + vram_offset * 8;
>> +            flags = vm->pt_ops->pte_encode_addr(xe, 0, 
>> comp_pat_index, level,
>> +                                true, 0);
>> +
>> +            /*
>> +             * Use 1GB pages, it shouldn't matter the physical 
>> amount of
>> +             * vram is less, when we don't access it.
>> +             */
>> +            for (pos = xe->mem.vram.dpa_base;
>> +                 pos < xe->mem.vram.actual_physical_size + 
>> xe->mem.vram.dpa_base;
>> +                 pos += SZ_1G, ofs += 8)
>
> Nit: Formatting looks off?  There are some other formatting issues 
> reported by checkpatch in the ci results.
>
> Also it looks like there were some recent changes in 
> xe_migrate_prepare_vm, with how the identity map is constructed. I 
> think this will need to be updated to match? See: 
> 6d3581edffea0b3a64b0d3094d3f09222e0024f7.
>
I see, will update according to these changes. Thank you!
>> +                xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
>> +        }
>>       }
>>         /*
>> @@ -475,7 +505,7 @@ static bool xe_migrate_allow_identity(u64 size, 
>> const struct xe_res_cursor *cur)
>>   }
>>     static u32 pte_update_size(struct xe_migrate *m,
>> -               bool is_vram,
>> +               bool is_vram, bool is_comp_pte,
>>                  struct ttm_resource *res,
>>                  struct xe_res_cursor *cur,
>>                  u64 *L0, u64 *L0_ofs, u32 *L0_pt,
>> @@ -487,7 +517,7 @@ static u32 pte_update_size(struct xe_migrate *m,
>>       if (is_vram && xe_migrate_allow_identity(*L0, cur)) {
>>           /* Offset into identity map. */
>>           *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
>> -                          cur->start + vram_region_gpu_offset(res));
>> +                          cur->start + vram_region_gpu_offset(res), 
>> is_comp_pte);
>>           cmds += cmd_size;
>>       } else {
>>           /* Clip L0 to available size */
>> @@ -778,17 +808,17 @@ struct dma_fence *xe_migrate_copy(struct 
>> xe_migrate *m,
>>             src_L0 = min(src_L0, dst_L0);
>>   -        batch_size += pte_update_size(m, src_is_vram, src, 
>> &src_it, &src_L0,
>> +        batch_size += pte_update_size(m, src_is_vram, false, src, 
>> &src_it, &src_L0,
>>                             &src_L0_ofs, &src_L0_pt, 0, 0,
>>                             avail_pts);
>>   -        batch_size += pte_update_size(m, dst_is_vram, dst, 
>> &dst_it, &src_L0,
>> +        batch_size += pte_update_size(m, dst_is_vram, false, dst, 
>> &dst_it, &src_L0,
>>                             &dst_L0_ofs, &dst_L0_pt, 0,
>>                             avail_pts, avail_pts);
>>             if (copy_system_ccs) {
>>               ccs_size = xe_device_ccs_bytes(xe, src_L0);
>> -            batch_size += pte_update_size(m, false, NULL, &ccs_it, 
>> &ccs_size,
>> +            batch_size += pte_update_size(m, false, false, NULL, 
>> &ccs_it, &ccs_size,
>>                                 &ccs_ofs, &ccs_pt, 0,
>>                                 2 * avail_pts,
>>                                 avail_pts);
>> @@ -1029,14 +1059,13 @@ struct dma_fence *xe_migrate_clear(struct 
>> xe_migrate *m,
>>             /* Calculate final sizes and batch size.. */
>>           batch_size = 2 +
>> -            pte_update_size(m, clear_vram, src, &src_it,
>> +            pte_update_size(m, clear_vram, false, src, &src_it,
>>                       &clear_L0, &clear_L0_ofs, &clear_L0_pt,
>>                       clear_system_ccs ? 0 : emit_clear_cmd_len(gt), 0,
>>                       avail_pts);
>>             if (xe_device_needs_ccs_emit(xe))
>>               batch_size += EMIT_COPY_CCS_DW;
>> -
>>           /* Clear commands */
>>             if (WARN_ON_ONCE(!clear_L0))
>> @@ -1146,7 +1175,7 @@ static void write_pgtable(struct xe_tile *tile, 
>> struct xe_bb *bb, u64 ppgtt_ofs,
>>       if (!ppgtt_ofs)
>>           ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile),
>>                           xe_bo_addr(update->pt_bo, 0,
>> -                               XE_PAGE_SIZE));
>> +                               XE_PAGE_SIZE), false);
>>         do {
>>           u64 addr = ppgtt_ofs + ofs * 8;