[Intel-xe] [PATCH] drm/xe/ppgtt: Add support for correct PAT encoding in Page Table Entry (PTE) format

Vodapalli, Ravi Kumar ravi.kumar.vodapalli at intel.com
Fri Jul 21 16:55:26 UTC 2023



On 7/21/2023 10:21 PM, Vodapalli, Ravi Kumar wrote:
>
>
> On 7/21/2023 9:43 PM, Lucas De Marchi wrote:
>> On Thu, Jul 20, 2023 at 07:48:33PM +0530, Vodapalli, Ravi Kumar wrote:
>>>
>>>
>>> On 7/17/2023 6:53 PM, Lucas De Marchi wrote:
>>>> On Wed, Jul 12, 2023 at 02:52:37PM +0000, Matthew Brost wrote:
>>>>> On Wed, Jul 12, 2023 at 06:26:40PM +0530, Ravi Kumar Vodapalli wrote:
>>>>>> Different platforms have different PAT encodings in the PTE format;
>>>>>> add the correct PAT encoding for pre-Xe2 platforms (XELP, XEHPC, XELPG).
>>>>>>
>>>>>> Bspec: 45101, 71582
>>>>>> Signed-off-by: Ravi Kumar Vodapalli <ravi.kumar.vodapalli at intel.com>
>>>>>> ---
>>>>>>  drivers/gpu/drm/xe/xe_pt.c       | 110 +++++++++++++++++++++++++++----
>>>>>>  drivers/gpu/drm/xe/xe_pte.h      |  31 +++++++++
>>>>>>  drivers/gpu/drm/xe/xe_vm_types.h |   1 +
>>>>>>  3 files changed, 129 insertions(+), 13 deletions(-)
>>>>>>  create mode 100644 drivers/gpu/drm/xe/xe_pte.h
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
>>>>>> index 00855681c0d5..e7f8bd4dd518 100644
>>>>>> --- a/drivers/gpu/drm/xe/xe_pt.c
>>>>>> +++ b/drivers/gpu/drm/xe/xe_pt.c
>>>>>> @@ -16,6 +16,7 @@
>>>>>>  #include "xe_trace.h"
>>>>>>  #include "xe_ttm_stolen_mgr.h"
>>>>>>  #include "xe_vm.h"
>>>>>> +#include "xe_pte.h"
>>>>>>
>>>>>>  struct xe_pt_dir {
>>>>>>      struct xe_pt pt;
>>>>>> @@ -111,19 +112,7 @@ static u64 __pte_encode(u64 pte, enum xe_cache_level cache,
>>>>>>      if (unlikely(vma && xe_vma_is_null(vma)))
>>>>>>          pte |= XE_PTE_NULL;
>>>>>>
>>>>>> -    /* FIXME: I don't think the PPAT handling is correct for MTL */
>>>>>> -
>>>>>> -    switch (cache) {
>>>>>> -    case XE_CACHE_NONE:
>>>>>> -        pte |= PPAT_UNCACHED;
>>>>>> -        break;
>>>>>> -    case XE_CACHE_WT:
>>>>>> -        pte |= PPAT_DISPLAY_ELLC;
>>>>>> -        break;
>>>>>> -    default:
>>>>>> -        pte |= PPAT_CACHED;
>>>>>> -        break;
>>>>>> -    }
>>>>>> +    pte = xe_vma_vm(vma)->pte_encode(pte, cache);
>>>>>>
>>>>>>      if (pt_level == 1)
>>>>>>          pte |= XE_PDE_PS_2M;
>>>>>> @@ -187,6 +176,94 @@ static u64 __xe_pt_empty_pte(struct xe_tile *tile, struct xe_vm *vm,
>>>>>>      }
>>>>>>  }
>>>>>>
>>>>>> +static u64 xelp_pte_encode_pat(u8 pat_index)
>>>>>> +{
>>>>>> +    u64 pte_pat = 0;
>>>>>> +
>>>>>> +    if (pat_index & BIT(0))
>>>>>> +        pte_pat |= BIT(3);
>>>>>> +
>>>>>> +    if (pat_index & BIT(1))
>>>>>> +        pte_pat |= BIT(4);
>>>>>> +
>>>>>> +    if (pat_index & BIT(2))
>>>>>> +        pte_pat |= BIT(7);
>>>>>> +
>>>>>> +    return pte_pat;
>>>>>> +}
>>>>>> +
>>>>>> +static u64 __xelp_pte_encode(u64 pte, enum xe_cache_level cache)
>>>>>> +{
>>>>>> +    pte &= ~XELP_PAT_MASK;
>>>>>> +
>>>>>> +    switch (cache) {
>>>>>> +    case XE_CACHE_NONE:
>>>>>> +        pte |= xelp_pte_encode_pat(XELP_PAT_UNCACHE);
>>>>>> +        break;
>>>>>> +    case XE_CACHE_WT:
>>>>>> +        pte |= xelp_pte_encode_pat(XELP_PAT_WT_CACHE);
>>>>>> +        break;
>>>>>> +    case XE_CACHE_WB:
>>>>>> +        pte |= xelp_pte_encode_pat(XELP_PAT_WB_CACHE);
>>>>>> +        break;
>>>>>> +    default:
>>>>>> +        /* pte |= PPAT_CACHED; */
>>>>>> +        break;
>>>>>> +    }
>>>>>> +
>>>>>> +    return pte;
>>>>>> +}
>>>>>> +
>>>>>> +static u64 __xehpc_pte_encode(u64 pte, enum xe_cache_level cache)
>>>>>> +{
>>>>>> +    pte &= ~XELP_PAT_MASK;
>>>>>> +
>>>>>> +    /* Initially assumed as CLOS0.
>>>>>> +     * To Do: CLOS1/CLOS2 have to be set depending on requirement.
>>>>>> +     */
>>>>>> +    switch (cache) {
>>>>>> +    case XE_CACHE_NONE:
>>>>>> +        pte |= xelp_pte_encode_pat(XEHPC_PAT_CLOS0_UNCACHE);
>>>>>> +        break;
>>>>>> +    case XE_CACHE_WT:
>>>>>> +        pte |= xelp_pte_encode_pat(XEHPC_PAT_CLOS0_WT_CACHE);
>>>>>> +        break;
>>>>>> +    case XE_CACHE_WB:
>>>>>> +        pte |= xelp_pte_encode_pat(XEHPC_PAT_CLOS0_WB_CACHE);
>>>>>> +        break;
>>>>>> +    default:
>>>>>> +        /* pte |= PPAT_CACHED; */
>>>>>> +        break;
>>>>>> +    }
>>>>>> +
>>>>>> +    return pte;
>>>>>> +}
>>>>>> +
>>>>>> +static u64 __xelpg_pte_encode(u64 pte, enum xe_cache_level cache)
>>>>>> +{
>>>>>> +    pte &= ~XELP_PAT_MASK;
>>>>>> +
>>>>>> +    /* Initially assumed as non-coherent mode; the correct values
>>>>>> +     * have to be set depending on requirement.
>>>>>> +     */
>>>>>> +    switch (cache) {
>>>>>> +    case XE_CACHE_NONE:
>>>>>> +        pte |= xelp_pte_encode_pat(XELPG_PAT_UNCACHE);
>>>>>> +        break;
>>>>>> +    case XE_CACHE_WT:
>>>>>> +        pte |= xelp_pte_encode_pat(XELPG_PAT_WT_CACHE);
>>>>>> +        break;
>>>>>> +    case XE_CACHE_WB:
>>>>>> +        pte |= xelp_pte_encode_pat(XELPG_PAT_WB_CACHE);
>>>>>> +        break;
>>>>>> +    default:
>>>>>> +        /* pte |= PPAT_CACHED; */
>>>>>> +        break;
>>>>>> +    }
>>>>>> +
>>>>>> +    return pte;
>>>>>> +}
>>>>>> +
>>>>>>  /**
>>>>>>   * xe_pt_create() - Create a page-table.
>>>>>>   * @vm: The vm to create for.
>>>>>> @@ -216,6 +293,13 @@ struct xe_pt *xe_pt_create(struct xe_vm *vm, struct xe_tile *tile,
>>>>>>      if (!pt)
>>>>>>          return ERR_PTR(-ENOMEM);
>>>>>>
>>>>>> +    if (GRAPHICS_VERx100(vm->xe) >= 1270)
>>>>>> +        vm->pte_encode = __xelpg_pte_encode;
>>>>>> +    else if (GRAPHICS_VERx100(vm->xe) >= 1260)
>>>>>> +        vm->pte_encode = __xehpc_pte_encode;
>>>>>> +    else
>>>>>> +        vm->pte_encode = __xelp_pte_encode;
>>>>>> +
>>>>>>      bo = xe_bo_create_pin_map(vm->xe, tile, vm, SZ_4K,
>>>>>>                    ttm_bo_type_kernel,
>>>>>>                    XE_BO_CREATE_VRAM_IF_DGFX(tile) |
>>>>>> diff --git a/drivers/gpu/drm/xe/xe_pte.h b/drivers/gpu/drm/xe/xe_pte.h
>>>>>> new file mode 100644
>>>>>> index 000000000000..8b3b5b471a99
>>>>>> --- /dev/null
>>>>>> +++ b/drivers/gpu/drm/xe/xe_pte.h
>>>>>
>>>>> s/xe_pte.h/xe_pte_pat.h
>>>>>
>>>>> Just my opinion, I prefer this.
>>>>
>>>> dunno, but since we have different pte formats for ggtt and ppgtt, IMO
>>>> it would add to the confusion.
>>>>
>>>> Maybe this belongs in xe_pat.h since it's xe_pat.c that programs the
>>>> tables giving the meaning spelled out in the macros here for each 
>>>> index.
>>>> The rest of the code then needs to set the bits in the PTE to reference
>>>> those indexes, according to the PTE format.
>>>>
>>>>>
>>>>>> @@ -0,0 +1,31 @@
>>>>>> +/* SPDX-License-Identifier: MIT */
>>>>>> +/*
>>>>>> + * Copyright © 2021 Intel Corporation
>>>>>> + */
>>>>>> +
>>>>>> +#ifndef _XE_PTE_H_
>>>>>> +#define _XE_PTE_H_
>>>>>> +
>>>>>> +#define XELP_PAT_MASK                  (BIT_ULL(7) | BIT_ULL(4) | BIT_ULL(3))
>>>>>> +#define XELP_PAT_WB_CACHE              0
>>>>>> +#define XELP_PAT_WC_CACHE              1
>>>>>> +#define XELP_PAT_WT_CACHE              2
>>>>>> +#define XELP_PAT_UNCACHE               3
>>>>
>>>> example... this duplicates what we have in xe_pat.c (comment added 
>>>> here
>>>> for clarity)
>>>>
>>>> static const u32 tgl_pat_table[] = {
>>>>         [0] = TGL_PAT_WB,
>>>>         [1] = TGL_PAT_WC,
>>>>         [2] = TGL_PAT_WT,
>>>>         [3] = TGL_PAT_UC,
>>>>     // .... those are programmed, but not expected to be used
>>>>         [4] = TGL_PAT_WB,
>>>>         [5] = TGL_PAT_WB,
>>>>         [6] = TGL_PAT_WB,
>>>>         [7] = TGL_PAT_WB,
>>>> };
>>>>
>>>
>>> TGL_PAT_WB maps to the PAT table register bit encoding; we cannot
>>> directly use tgl_pat_table to look up a PAT index. We would need to
>>> implement code to retrieve the index, like below, whereas we can
>>> just use the macro directly.
>>>
>>> xelp_pte_encode_pat(u8 cache_type) /* example: pass TGL_PAT_WB */
>>> {
>>>     u8 i, index;
>>>     for (i = 0; i < 32; i++)
>>>         if (tgl_pat_table[i] == cache_type)
>>
>> no, we don't want the caller to pass the platform-specific value.
>> It should pass a platform-agnostic value and xe_pat_get_index()
>> translates that to the platform-specific index. There are just 2
>> entries really used by the kernel: uc and wb. Then xe_pat.c returns
>> the entry to be used in those cases.
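>>
>> As a rough sketch (the name follows the xe_pat_get_index() idea above,
>> and the indexes are just the tgl_pat_table ones from the example; a
>> real version would be per-platform):
>>
>> u8 xe_pat_get_index(struct xe_device *xe, enum xe_cache_level cache)
>> {
>>         /* xe would be used to pick the per-platform PAT table that
>>          * xe_pat.c programmed; the TGL mapping is shown here.
>>          */
>>         switch (cache) {
>>         case XE_CACHE_NONE:
>>                 return 3;       /* tgl_pat_table[3] = TGL_PAT_UC */
>>         case XE_CACHE_WT:
>>                 return 2;       /* tgl_pat_table[2] = TGL_PAT_WT */
>>         default:
>>                 return 0;       /* tgl_pat_table[0] = TGL_PAT_WB */
>>         }
>> }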
>>
>> Lucas De Marchi
>>
>
> I have discussed encoding the index into the PAT PTE bits with
> Matt Roper.
> He said that instead of writing a common function for both ppgtt and
> ggtt, the best approach would be to separate them as below, so that
> adding more support in the future will be easier:
>
> static u64 xelp_pte_encode_pat(u8 pat_index)
> {
>     u64 pte_pat = 0;
>
>     if (pat_index & BIT(0))
>         pte_pat |= BIT(3);
>
>     if (pat_index & BIT(1))
>         pte_pat |= BIT(4);
>
>     if (pat_index & BIT(2))
>         pte_pat |= BIT(7);
>
>     return pte_pat;
> }
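>
> Equivalently, the same bit mapping could be written a bit more
> compactly; just a sketch, assuming FIELD_PREP()/GENMASK_ULL() from
> <linux/bitfield.h> and <linux/bits.h>:
>
> static u64 xelp_pte_encode_pat(u8 pat_index)
> {
>         /* PAT index bits [1:0] go to PTE bits [4:3], bit [2] to PTE bit 7 */
>         return FIELD_PREP(GENMASK_ULL(4, 3), pat_index & 0x3) |
>                FIELD_PREP(BIT_ULL(7), (pat_index >> 2) & 0x1);
> }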
>
> Thanks,
> Ravi Kumar V
>
>>>             index = i;
>>> }
>>>
>>>
>>>>
>>>> IMO the abstraction could be something like
>>>>
>>>>     // this is valid for both ppgtt and ggtt
>>>>     pat = xe_pat_get_index(xe, cache);
>>>>
>>>>     // this changes between ggtt / ppgtt
>>>>     pte |= vm->pte_encode_pat(xe, pat);
>>>>
>>>
>>> I have one approach, shown in the example below, where we can use
>>> the same function for both ggtt and ppgtt:
>>>
>>> static u64 pte_encode_pat(enum gtt_type type, u8 pat_index)
>>> {
>>>     u64 pte_pat = 0;
>>>
>>>     if (type == PPGTT) {
>>>         if (pat_index & BIT(0))
>>>             pte_pat |= BIT(3);
>>>
>>>         if (pat_index & BIT(1))
>>>             pte_pat |= BIT(4);
>>>
>>>         if (pat_index & BIT(2))
>>>             pte_pat |= BIT(7);
>>>     } else if (type == GGTT) {
>>>         /* GGTT PAT PTE encoding code here */
>>>     }
>>>
>>>     return pte_pat;
>>> }
>>>
>>>
>>> /* vm->pte_encode_pat would then point to something like: */
>>> static u64 pte_encode(u64 pte, enum gtt_type type, enum xe_cache_level cache)
>>> {
>>>     switch (cache) {
>>>     case XE_CACHE_NONE:
>>>         pte |= pte_encode_pat(type, XELP_PAT_UNCACHE);
>>>         break;
>>>     /* ... other cache levels ... */
>>>     }
>>>
>>>     return pte;
>>> }
>>>
>>> What's your opinion?
>>>
>>> Thanks,
>>> Ravi Kumar V
>>>>
>>>> so the PAT indexes are handled by xe_pat.[ch]. The pte *encoding*
>>>> is then handled differently by ggtt and ppgtt. Thoughts?
>>>>
>>>> Lucas De Marchi
>>>>
>>>>
>>>>>> +
>>>>>> +#define XEHPC_PAT_CLOS0_UNCACHE        0
>>>>>> +#define XEHPC_PAT_CLOS0_WC_CACHE       1
>>>>>> +#define XEHPC_PAT_CLOS0_WT_CACHE       2
>>>>>> +#define XEHPC_PAT_CLOS0_WB_CACHE       3
>>>>>> +#define XEHPC_PAT_CLOS1_WT_CACHE       4
>>>>>> +#define XEHPC_PAT_CLOS1_WB_CACHE       5
>>>>>> +#define XEHPC_PAT_CLOS2_WT_CACHE       6
>>>>>> +#define XEHPC_PAT_CLOS2_WB_CACHE       7
>>>>>> +
>>>>>> +#define XELPG_PAT_WB_CACHE             0
>>>>>> +#define XELPG_PAT_WT_CACHE             1
>>>>>> +#define XELPG_PAT_UNCACHE              2
>>>>>> +#define XELPG_PAT_1_WAY_WB_CACHE       3
>>>>>> +#define XELPG_PAT_2_WAY_WB_CACHE       4
>>>>>> +
>>>>>> +#endif
>>>>>> +
>>>>>> diff --git a/drivers/gpu/drm/xe/xe_vm_types.h 
>>>>>> b/drivers/gpu/drm/xe/xe_vm_types.h
>>>>>> index edb3c99a9c81..9e1b33c17d05 100644
>>>>>> --- a/drivers/gpu/drm/xe/xe_vm_types.h
>>>>>> +++ b/drivers/gpu/drm/xe/xe_vm_types.h
>>>>>> @@ -312,6 +312,7 @@ struct xe_vm {
>>>>>>
>>>>>>      /** @batch_invalidate_tlb: Always invalidate TLB before 
>>>>>> batch start */
>>>>>>      bool batch_invalidate_tlb;
>>>>>> +    u64 (*pte_encode)(u64 pte, enum xe_cache_level cache);
>>>>>
>>>>> Add kernel doc.
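>>>>> e.g. something along these lines (just a sketch):
>>>>>
>>>>>     /** @pte_encode: encode the platform-specific PAT bits into a PPGTT PTE */
>>>>>     u64 (*pte_encode)(u64 pte, enum xe_cache_level cache);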
>>>>>
>>>>> s/pte_encode/pte_pat_encode
>>>>>
>>>>> Again, just my opinion. Let's see if anyone else has an opinion on
>>>>> the naming.
>>>>>
>>>>> Other than these nits it LGTM. I didn't read the bspec to check the
>>>>> actual bit settings, but I can do that ahead of the next rev.
>>>>> Speaking of that, do you have a bspec link so I don't have to search
>>>>> through it?
>>>>>
>>>>> Matt
>>>>>
>>>>>>  };
>>>>>>
>>>>>>  /** struct xe_vma_op_map - VMA map operation */
>>>>>> -- 
>>>>>> 2.25.1
>>>>>>
>>>
>


