[PATCH v3 1/2] drm/xe/guc: Add support for w/a KLVs

John Harrison john.c.harrison at intel.com
Wed Mar 27 20:47:31 UTC 2024


On 3/26/2024 04:06, Michal Wajdeczko wrote:
> On 25.03.2024 19:55, John Harrison wrote:
>> On 3/25/2024 08:19, Michal Wajdeczko wrote:
>>> On 25.03.2024 16:04, Badal Nilawar wrote:
>>>> To prevent running out of bits, new w/a enable flags are being added
>>> nit: shouldn't we spell out "workaround" or use "W/A" as acronym ?
>> On the i915 side, we always used 'w/a'. It is more an abbreviation than
>> an acronym; certainly it is not an acronym for a proper noun. But yes, the
>> first instance in any description or comment should be written out in
>> full and only after that should abbreviations be used.
>>
>>>> via a KLV system instead of a 32 bit flags word.
>>>>
>>>> v2: GuC version check > 70.10 is not needed as xe will not be supporting
>>>>       anything below 70.19 (John Harrison)
>>>> v3: Use 64 bit ggtt address for future
>>>>       compatibility (John Harrison/Daniele)
>>>>
>>>> Cc: John Harrison <John.C.Harrison at intel.com>
>>>> Signed-off-by: Badal Nilawar <badal.nilawar at intel.com>
>>>> ---
>>>>    drivers/gpu/drm/xe/xe_guc_ads.c       | 62 ++++++++++++++++++++++++++-
>>>>    drivers/gpu/drm/xe/xe_guc_ads_types.h |  2 +
>>>>    drivers/gpu/drm/xe/xe_guc_fwif.h      |  5 ++-
>>>>    3 files changed, 66 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
>>>> index df2bffb7e220..a98344a0ff4b 100644
>>>> --- a/drivers/gpu/drm/xe/xe_guc_ads.c
>>>> +++ b/drivers/gpu/drm/xe/xe_guc_ads.c
>>>> @@ -80,6 +80,10 @@ ads_to_map(struct xe_guc_ads *ads)
>>>>     *      +---------------------------------------+
>>>>     *      | padding                               |
>>>>     *      +---------------------------------------+ <== 4K aligned
>>>> + *      | w/a KLVs                              |
>>>> + *      +---------------------------------------+
>>>> + *      | padding                               |
>>>> + *      +---------------------------------------+ <== 4K aligned
>>>>     *      | capture lists                         |
>>>>     *      +---------------------------------------+
>>>>     *      | padding                               |
>>>> @@ -131,6 +135,11 @@ static size_t guc_ads_golden_lrc_size(struct xe_guc_ads *ads)
>>>>        return PAGE_ALIGN(ads->golden_lrc_size);
>>>>    }
>>>>    +static u32 guc_ads_waklv_size(struct xe_guc_ads *ads)
>>>> +{
>>>> +    return PAGE_ALIGN(ads->ads_waklv_size);
>>> btw, shouldn't we start using ALIGN(xx, SZ_4K)
>>>
>>>> +}
>>>> +
>>>>    static size_t guc_ads_capture_size(struct xe_guc_ads *ads)
>>>>    {
>>>>        /* FIXME: Allocate a proper capture list */
>>>> @@ -167,12 +176,22 @@ static size_t guc_ads_golden_lrc_offset(struct xe_guc_ads *ads)
>>>>        return PAGE_ALIGN(offset);
>>>>    }
>>>>    +static size_t guc_ads_waklv_offset(struct xe_guc_ads *ads)
>>>> +{
>>>> +    u32 offset;
>>>> +
>>>> +    offset = guc_ads_golden_lrc_offset(ads) +
>>>> +         guc_ads_golden_lrc_size(ads);
>>>> +
>>>> +    return PAGE_ALIGN(offset);
>>>> +}
>>>> +
>>>>    static size_t guc_ads_capture_offset(struct xe_guc_ads *ads)
>>>>    {
>>>>        size_t offset;
>>>>    -    offset = guc_ads_golden_lrc_offset(ads) +
>>>> -        guc_ads_golden_lrc_size(ads);
>>>> +    offset = guc_ads_waklv_offset(ads) +
>>>> +         guc_ads_waklv_size(ads);
>>>>          return PAGE_ALIGN(offset);
>>>>    }
>>>> @@ -260,6 +279,42 @@ static size_t calculate_golden_lrc_size(struct xe_guc_ads *ads)
>>>>        return total_size;
>>>>    }
>>>>    +static void guc_waklv_init(struct xe_guc_ads *ads)
>>>> +{
>>>> +    u64 addr_ggtt;
>>>> +    u32 offset, remain, size;
>>>> +
>>>> +    offset = guc_ads_waklv_offset(ads);
>>>> +    remain = guc_ads_waklv_size(ads);
>>>> +
>>>> +    /*
>>>> +     * Add workarounds here:
>>>> +     *
>>>> +     * if (want_wa_<name>) {
>>>> +     *      size = guc_waklv_<name>(guc, offset, remain);
>>>> +     *      offset += size;
>>>> +     *      remain -= size;
>>> maybe just asserting the used size will work ?
>>>
>>>          used += guc_waklv_NAME(guc, offset + used);
>>>          xe_gt_assert(gt, used <= guc_ads_waklv_size(ads));
>>>
>>>> +     * }
>>>> +     */
>>>> +
>>>> +    size = guc_ads_waklv_size(ads) - remain;
>>>> +    if (!size)
>>>> +        return;
>> Hmm. The purpose of this test would be much more obvious if it were
>> simply "if(!used)". Not sure if the rest of the code is overall simpler
>> or more complex, though?
>>
>> It would be nice if the "used += ...; xe_assert(used < size);" could be
>> done in a loop that just takes an array of all the KLV ids. Not sure how
>> to achieve that with the w/a enable test, though. At least not without
>> the code just getting overly complicated. I guess it's not that much of
>> an issue to replicate the assert line for each w/a entry.
>>
> this array of KLVs may look like this:
>
> static const struct {
> 	bool (*need_wa)(struct xe_gt*);
> 	u32 (*add_klv)(struct xe_gt*, u32 offset);
> }
Like I said, it seems overly complex: two mostly replicated lines versus 
a new structure with new function wrappers around the XE_WA test. It 
doesn't really seem to be a net win.

The simpler option would be to pass "offset, used" instead of "offset + 
used". The helper then does the addition internally together with the 
assert. It still returns 'size', so the external call just becomes "used 
+= simple(guc, KLV_ID, offset, used);".

>
> but maybe leave it until we have more than one W/A to handle
We have all of these waiting for this patch to land:
+       if (XE_WA(gt, 14019882105))
+               guc_waklv_enable_simple(ads,
+                                       GUC_WORKAROUND_KLV_BLOCK_INTERRUPTS_WHEN_MGSR_BLOCKED,
+                                       &offset, &remain);
+       if (XE_WA(gt, 14019991076))
+               guc_waklv_enable_simple(ads,
+                                       GUC_WORKAROUND_KLV_ID_GAMCTRL_TLB_BIND_ON_CTX_TEARDOWN,
+                                       &offset, &remain);
+       if (XE_WA(gt, 14020001231))
+               guc_waklv_enable_simple(ads,
+                                       GUC_WORKAROUND_KLV_ID_DISABLE_PSMI_INTERRUPTS_AT_C6_ENTRY_RESTORE_AT_EXIT,
+                                       &offset, &remain);
+       if (XE_WA(gt, 18024947630))
+               guc_waklv_enable_simple(ads,
+                                       GUC_WORKAROUND_KLV_ID_GAM_PFQ_SHADOW_TAIL_POLLING,
+                                       &offset, &remain);
+       if (XE_WA(gt, 16022287689))
+               guc_waklv_enable_simple(ads,
+                                       GUC_WORKAROUND_KLV_ID_DISABLE_MTP_DURING_ASYNC_COMPUTE,
+                                       &offset, &remain);

Plus 13011645652 but that one actually takes a data value.

So we definitely have more KLVs waiting to be sent.

>
>>>> +
>>>> +    offset = guc_ads_waklv_offset(ads);
>>>> +    addr_ggtt = xe_bo_ggtt_addr(ads->bo) + offset;
>>>> +
>>>> +    ads_blob_write(ads, ads.wa_klv_addr_lo, lower_32_bits(addr_ggtt));
>>>> +    ads_blob_write(ads, ads.wa_klv_addr_hi, upper_32_bits(addr_ggtt));
>>>> +    ads_blob_write(ads, ads.wa_klv_size, size);
>>>> +}
>>>> +
>>>> +static int calculate_waklv_size(struct xe_guc_ads *ads)
>>>> +{
>>>> +    /* Fudge something chunky for now: */
>>>> +    return PAGE_SIZE;
>>> maybe SZ_4K ?
>> Technically, it is specifically a page, not an arbitrary size that
>> happens to be a page. However, it is really the GuC page size rather than
>> the host page size. At least, there is a requirement for the address
>> given to GuC to be page aligned as the lower bits are discarded. Whether
>> there is also any requirement on the host side for allocations to be
>> page aligned because they are going to be rounded up anyway by the
>> internal allocation mechanism is another matter. I believe that was the
>> case on i915 but not sure about Xe? So maybe it is both GT page size and
>> host page size?
>>
>> Is there any host architecture which has a page size that is not a
>> multiple of 4KB? If so, then maybe we need some fancy calculation that
>> is the lowest common multiple of host page size and GT page size. But that
>> seems unlikely. In which case the host page size seems the simplest valid
>> definition. If you want to be really paranoid, you could maybe add a
>> global compile time assert somewhere that PAGE_SIZE is a multiple of 4K?
> still, if GuC always expects a 4K "page size", why do we insist on using
> the host PAGE_SIZE, which could be a different size ? even if it is a
> multiple of 4K and will eventually work on the GuC side, why do we want
> to waste GGTT space on unnecessary padding ?
My point is that the host allocator also has a minimum granularity. If 
it always rounds up to a host page (which it certainly did on i915, not 
sure if the rules are the same on Xe), then there is no point in 
allocating less than that because the difference will just be wasted space.

> anyway, just an idea, not a blocker
Likewise. I'm not sure there is a perfect answer here.

I guess wasting GPU address space is worse than wasting system memory. 
So maybe just go with SZ_4K given that we don't (so far as I know) have 
a specific define for the GPU page size?

John.

>
>>> and is it really a 'calculate' helper ?
>>>
>>> if so then maybe add template comment how this will be calculated using
>>> want_wa_<name> and guc_waklv_<name> tuples
>> I suspect the calculation will be more like "if(IPVER >= 100) size =
>> PAGE * 10; else if(IPVER >= 50) size = PAGE * 2; else size = PAGE;". So
>> yes it is a calculation, but for the foreseeable future the calculation
>> is trivial. A single page is both the minimum size possible and is
>> sufficiently large enough for all current platforms. It may be worth
>> using that last sentence as the comment instead?
> this still sounds more like an 'estimate' than 'calculate' but let it be
>
>>>> +}
>>>> +
>>>>    #define MAX_GOLDEN_LRC_SIZE    (SZ_4K * 64)
>>>>      int xe_guc_ads_init(struct xe_guc_ads *ads)
>>>> @@ -271,6 +326,7 @@ int xe_guc_ads_init(struct xe_guc_ads *ads)
>>>>          ads->golden_lrc_size = calculate_golden_lrc_size(ads);
>>>>        ads->regset_size = calculate_regset_size(gt);
>>>> +    ads->ads_waklv_size = calculate_waklv_size(ads);
>>>>        bo = xe_managed_bo_create_pin_map(xe, tile, guc_ads_size(ads) + MAX_GOLDEN_LRC_SIZE,
>>>>                          XE_BO_CREATE_SYSTEM_BIT |
>>>> @@ -598,6 +654,8 @@ void xe_guc_ads_populate(struct xe_guc_ads *ads)
>>>>        guc_mapping_table_init(gt, &info_map);
>>>>        guc_capture_list_init(ads);
>>>>        guc_doorbell_init(ads);
>>>> +    /* Workaround KLV list */
>>> drop useless comment ...
>>>
>>>> +    guc_waklv_init(ads);
>>>>          if (xe->info.has_usm) {
>>>>            guc_um_init_params(ads);
>>>> diff --git a/drivers/gpu/drm/xe/xe_guc_ads_types.h b/drivers/gpu/drm/xe/xe_guc_ads_types.h
>>>> index 4afe44bece4b..62235b2a6fe3 100644
>>>> --- a/drivers/gpu/drm/xe/xe_guc_ads_types.h
>>>> +++ b/drivers/gpu/drm/xe/xe_guc_ads_types.h
>>>> @@ -20,6 +20,8 @@ struct xe_guc_ads {
>>>>        size_t golden_lrc_size;
>>>>        /** @regset_size: size of register set passed to GuC for save/restore */
>>>>        u32 regset_size;
>>>> +    /** @ads_waklv_size: waklv size */
>>> ... instead improve comment here
>>>
>>>> +    u32 ads_waklv_size;
>>>>    };
>>>>      #endif
>>>> diff --git a/drivers/gpu/drm/xe/xe_guc_fwif.h b/drivers/gpu/drm/xe/xe_guc_fwif.h
>>>> index c281fdbfd2d6..52503719d2aa 100644
>>>> --- a/drivers/gpu/drm/xe/xe_guc_fwif.h
>>>> +++ b/drivers/gpu/drm/xe/xe_guc_fwif.h
>>>> @@ -207,7 +207,10 @@ struct guc_ads {
>>>>        u32 capture_instance[GUC_CAPTURE_LIST_INDEX_MAX][GUC_MAX_ENGINE_CLASSES];
>>>>        u32 capture_class[GUC_CAPTURE_LIST_INDEX_MAX][GUC_MAX_ENGINE_CLASSES];
>>>>        u32 capture_global[GUC_CAPTURE_LIST_INDEX_MAX];
>>>> -    u32 reserved[14];
>>>> +    u32 wa_klv_addr_lo;
>>>> +    u32 wa_klv_addr_hi;
>>>> +    u32 wa_klv_size;
>>> maybe it's worth to add a comment from which GuC version these new
>>> fields are redefined
>> Not necessary. Xe does not support any GuC version which does not
>> support a w/a KLV. Therefore this is simply baseline support same as all
>> the other fields here.
> ok, I missed the series that sets the baseline version for xe, but maybe
> it's still worth adding a note to the commit message that this is already
> supported by all our baseline GuC firmwares ?
>
>> John.
>>
>>>> +    u32 reserved[11];
>>>>    } __packed;
>>>>      /* Engine usage stats */


