[PATCH] drm/xe/guc: Stop reading masks from device memory when populating the ADS
Daniele Ceraolo Spurio
daniele.ceraolospurio at intel.com
Fri Feb 14 16:42:32 UTC 2025
On 2/13/2025 2:50 PM, John Harrison wrote:
> On 2/13/2025 13:40, Daniele Ceraolo Spurio wrote:
>> If the device memory is corrupted during the suspend/resume flow, the
>> masks might end up being random values and using them might lead us to
>> trying to set values for engines that do not exist, which in turn might
>> lead to invalid memory accesses.
> Which is all bad and should be fixed, but that is not the only
> problem we would get from a corrupted ADS blob. E.g. GuC itself is
> going to have exactly the same problem given that this is where it
> gets its engine masks from.
The aim here is not to make the driver load successfully, it is to avoid
the kernel doing an invalid memory access. The GuC load will still fail
afterwards, but we'll handle it gracefully.
>
> Is it worth adding some kind of check on the memory being valid? Add a
> magic word somewhere unused and check that it is still correct? And if
> not, fail the re-init with a meaningful error message.
I think I worded the commit message incorrectly. We do re-write the ADS
on resume, including the masks. If we find an invalid value (as has
been reported in
https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4037) it likely
implies that the local memory is not functioning correctly. If we wanted
to catch this, the best approach would be a memory health test on resume
(which is probably a good idea, but unrelated to this patch).
>
>>
>> Given that the driver does know which engines are available, we can just
>> calculate the masks instead of reading them out of memory.
>>
>> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
>> Cc: John Harrison <John.C.Harrison at Intel.com>
>> ---
>> drivers/gpu/drm/xe/xe_guc_ads.c | 36 ++++++++++++---------------------
>> 1 file changed, 13 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c
>> b/drivers/gpu/drm/xe/xe_guc_ads.c
>> index fab259adc380..10e2ab5791b7 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_ads.c
>> +++ b/drivers/gpu/drm/xe/xe_guc_ads.c
>> @@ -129,9 +129,6 @@ struct __guc_ads_blob {
>> #define info_map_write(xe_, map_, field_, val_) \
>> xe_map_wr_field(xe_, map_, 0, struct guc_gt_system_info,
>> field_, val_)
>> -#define info_map_read(xe_, map_, field_) \
>> - xe_map_rd_field(xe_, map_, 0, struct guc_gt_system_info, field_)
>> -
>> static size_t guc_ads_regset_size(struct xe_guc_ads *ads)
>> {
>> struct xe_device *xe = ads_to_xe(ads);
>> @@ -493,13 +490,12 @@ static void fill_engine_enable_masks(struct
>> xe_gt *gt,
>> static void guc_prep_golden_lrc_null(struct xe_guc_ads *ads)
>> {
>> struct xe_device *xe = ads_to_xe(ads);
>> - struct iosys_map info_map = IOSYS_MAP_INIT_OFFSET(ads_to_map(ads),
>> - offsetof(struct __guc_ads_blob, system_info));
>> - u8 guc_class;
>> + u8 class;
>> - for (guc_class = 0; guc_class <= GUC_MAX_ENGINE_CLASSES;
>> ++guc_class) {
>> - if (!info_map_read(xe, &info_map,
>> - engine_enabled_masks[guc_class]))
>> + for (class = 0; class < XE_ENGINE_CLASS_MAX; ++class) {
>> + u8 guc_class = xe_engine_class_to_guc_class(class);
>> +
>> + if (!engine_enable_mask(ads_to_gt(ads), class))
>> continue;
>> ads_blob_write(ads, ads.eng_state_size[guc_class],
>> @@ -546,25 +542,24 @@ static void guc_mapping_table_init(struct xe_gt
>> *gt,
>> static u32 guc_get_capture_engine_mask(struct xe_gt *gt, struct
>> iosys_map *info_map,
>> enum guc_capture_list_class_type capture_class)
>> {
>> - struct xe_device *xe = gt_to_xe(gt);
>> u32 mask;
>> switch (capture_class) {
>> case GUC_CAPTURE_LIST_CLASS_RENDER_COMPUTE:
>> - mask = info_map_read(xe, info_map,
>> engine_enabled_masks[GUC_RENDER_CLASS]);
>> - mask |= info_map_read(xe, info_map,
>> engine_enabled_masks[GUC_COMPUTE_CLASS]);
>> + mask = engine_enable_mask(gt, XE_ENGINE_CLASS_RENDER);
>> + mask |= engine_enable_mask(gt, XE_ENGINE_CLASS_COMPUTE);
> Seems odd to have '; mask |=' rather than just '|'.
>
> Not a blocker, but given you are changing this anyway, might as well
> clean it up?
Sure, I'll update.
Daniele
>
> John.
>
>> break;
>> case GUC_CAPTURE_LIST_CLASS_VIDEO:
>> - mask = info_map_read(xe, info_map,
>> engine_enabled_masks[GUC_VIDEO_CLASS]);
>> + mask = engine_enable_mask(gt, XE_ENGINE_CLASS_VIDEO_DECODE);
>> break;
>> case GUC_CAPTURE_LIST_CLASS_VIDEOENHANCE:
>> - mask = info_map_read(xe, info_map,
>> engine_enabled_masks[GUC_VIDEOENHANCE_CLASS]);
>> + mask = engine_enable_mask(gt, XE_ENGINE_CLASS_VIDEO_ENHANCE);
>> break;
>> case GUC_CAPTURE_LIST_CLASS_BLITTER:
>> - mask = info_map_read(xe, info_map,
>> engine_enabled_masks[GUC_BLITTER_CLASS]);
>> + mask = engine_enable_mask(gt, XE_ENGINE_CLASS_COPY);
>> break;
>> case GUC_CAPTURE_LIST_CLASS_GSC_OTHER:
>> - mask = info_map_read(xe, info_map,
>> engine_enabled_masks[GUC_GSC_OTHER_CLASS]);
>> + mask = engine_enable_mask(gt, XE_ENGINE_CLASS_OTHER);
>> break;
>> default:
>> mask = 0;
>> @@ -907,8 +902,6 @@ static void guc_populate_golden_lrc(struct
>> xe_guc_ads *ads)
>> {
>> struct xe_device *xe = ads_to_xe(ads);
>> struct xe_gt *gt = ads_to_gt(ads);
>> - struct iosys_map info_map = IOSYS_MAP_INIT_OFFSET(ads_to_map(ads),
>> - offsetof(struct __guc_ads_blob, system_info));
>> size_t total_size = 0, alloc_size, real_size;
>> u32 addr_ggtt, offset;
>> int class;
>> @@ -917,12 +910,9 @@ static void guc_populate_golden_lrc(struct
>> xe_guc_ads *ads)
>> addr_ggtt = xe_bo_ggtt_addr(ads->bo) + offset;
>> for (class = 0; class < XE_ENGINE_CLASS_MAX; ++class) {
>> - u8 guc_class;
>> + u8 guc_class = xe_engine_class_to_guc_class(class);
>> - guc_class = xe_engine_class_to_guc_class(class);
>> -
>> - if (!info_map_read(xe, &info_map,
>> - engine_enabled_masks[guc_class]))
>> + if (!engine_enable_mask(gt, class))
>> continue;
>> xe_gt_assert(gt, gt->default_lrc[class]);
>
More information about the Intel-xe
mailing list