[PATCH] drm/xe/guc: Stop reading masks from device memory when populating the ADS
K V P, Satyanarayana
satyanarayana.k.v.p at intel.com
Fri Feb 14 17:02:18 UTC 2025
> -----Original Message-----
> From: Ceraolo Spurio, Daniele <daniele.ceraolospurio at intel.com>
> Sent: Friday, February 14, 2025 10:22 PM
> To: K V P, Satyanarayana <satyanarayana.k.v.p at intel.com>; intel-
> xe at lists.freedesktop.org
> Cc: Harrison, John C <john.c.harrison at intel.com>
> Subject: Re: [PATCH] drm/xe/guc: Stop reading masks from device memory
> when populating the ADS
>
>
>
> On 2/14/2025 1:54 AM, K V P, Satyanarayana wrote:
> > Hi.
> >> -----Original Message-----
> >> From: Intel-xe <intel-xe-bounces at lists.freedesktop.org> On Behalf Of
> Daniele
> >> Ceraolo Spurio
> >> Sent: Friday, February 14, 2025 3:11 AM
> >> To: intel-xe at lists.freedesktop.org
> >> Cc: Ceraolo Spurio, Daniele <daniele.ceraolospurio at intel.com>; Harrison,
> John
> >> C <john.c.harrison at intel.com>
> >> Subject: [PATCH] drm/xe/guc: Stop reading masks from device memory
> when
> >> populating the ADS
> >>
> >> If the device memory is corrupted during the suspend/resume flow, the
> >> masks might end up being random values and using them might lead us to
> >> trying to set values for engines that do not exist, which in turn might
> >> lead to invalid memory accesses.
> >>
> >> Given that the driver does know which engines are available, we can just
> >> calculate the masks instead of reading them out of memory.
> >>
> > Let us log an error if there is a mismatch between what is read from memory
> and the info available to the driver.
> > This information will be useful to debug issues (if any) with corrupted device
> memory.
>
> I disagree. There are a ton of places where we touch memory and could
> perform such a check, so we should either do it everywhere or nowhere;
> we shouldn't special case the GuC just because there happens to be a
> patch in flight for it.
> Also note that this issue is likely due to the memory being
> non-functional, because the value is re-written after resume before
> being re-read, so we're not trying to read a pre-suspend value (I should
> have been clearer on that in the commit message). If we want to test that
> memory works we should add a dedicated local memory health test in the
> resume flow and not do it as part of GuC init.
>
> Daniele
>
> >
> > - Satya.
> >
LGTM.
Reviewed-by: Satyanarayana K V P <satyanarayana.k.v.p at intel.com>
> >> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> >> Cc: John Harrison <John.C.Harrison at Intel.com>
> >> ---
> >> drivers/gpu/drm/xe/xe_guc_ads.c | 36 ++++++++++++---------------------
> >> 1 file changed, 13 insertions(+), 23 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c
> >> b/drivers/gpu/drm/xe/xe_guc_ads.c
> >> index fab259adc380..10e2ab5791b7 100644
> >> --- a/drivers/gpu/drm/xe/xe_guc_ads.c
> >> +++ b/drivers/gpu/drm/xe/xe_guc_ads.c
> >> @@ -129,9 +129,6 @@ struct __guc_ads_blob {
> >> #define info_map_write(xe_, map_, field_, val_) \
> >> xe_map_wr_field(xe_, map_, 0, struct guc_gt_system_info, field_,
> >> val_)
> >>
> >> -#define info_map_read(xe_, map_, field_) \
> >> - xe_map_rd_field(xe_, map_, 0, struct guc_gt_system_info, field_)
> >> -
> >> static size_t guc_ads_regset_size(struct xe_guc_ads *ads)
> >> {
> >> struct xe_device *xe = ads_to_xe(ads);
> >> @@ -493,13 +490,12 @@ static void fill_engine_enable_masks(struct
> xe_gt
> >> *gt,
> >> static void guc_prep_golden_lrc_null(struct xe_guc_ads *ads)
> >> {
> >> struct xe_device *xe = ads_to_xe(ads);
> >> - struct iosys_map info_map =
> >> IOSYS_MAP_INIT_OFFSET(ads_to_map(ads),
> >> - offsetof(struct __guc_ads_blob, system_info));
> >> - u8 guc_class;
> >> + u8 class;
> >>
> >> - for (guc_class = 0; guc_class <= GUC_MAX_ENGINE_CLASSES;
> >> ++guc_class) {
> >> - if (!info_map_read(xe, &info_map,
> >> - engine_enabled_masks[guc_class]))
> >> + for (class = 0; class < XE_ENGINE_CLASS_MAX; ++class) {
> >> + u8 guc_class = xe_engine_class_to_guc_class(class);
> >> +
> >> + if (!engine_enable_mask(ads_to_gt(ads), class))
> >> continue;
> >>
> >> ads_blob_write(ads, ads.eng_state_size[guc_class],
> >> @@ -546,25 +542,24 @@ static void guc_mapping_table_init(struct xe_gt
> >> *gt,
> >> static u32 guc_get_capture_engine_mask(struct xe_gt *gt, struct
> iosys_map
> >> *info_map,
> >> enum guc_capture_list_class_type
> >> capture_class)
> >> {
> >> - struct xe_device *xe = gt_to_xe(gt);
> >> u32 mask;
> >>
> >> switch (capture_class) {
> >> case GUC_CAPTURE_LIST_CLASS_RENDER_COMPUTE:
> >> - mask = info_map_read(xe, info_map,
> >> engine_enabled_masks[GUC_RENDER_CLASS]);
> >> - mask |= info_map_read(xe, info_map,
> >> engine_enabled_masks[GUC_COMPUTE_CLASS]);
> >> + mask = engine_enable_mask(gt, XE_ENGINE_CLASS_RENDER);
> >> + mask |= engine_enable_mask(gt,
> >> XE_ENGINE_CLASS_COMPUTE);
> >> break;
> >> case GUC_CAPTURE_LIST_CLASS_VIDEO:
> >> - mask = info_map_read(xe, info_map,
> >> engine_enabled_masks[GUC_VIDEO_CLASS]);
> >> + mask = engine_enable_mask(gt,
> >> XE_ENGINE_CLASS_VIDEO_DECODE);
> >> break;
> >> case GUC_CAPTURE_LIST_CLASS_VIDEOENHANCE:
> >> - mask = info_map_read(xe, info_map,
> >> engine_enabled_masks[GUC_VIDEOENHANCE_CLASS]);
> >> + mask = engine_enable_mask(gt,
> >> XE_ENGINE_CLASS_VIDEO_ENHANCE);
> >> break;
> >> case GUC_CAPTURE_LIST_CLASS_BLITTER:
> >> - mask = info_map_read(xe, info_map,
> >> engine_enabled_masks[GUC_BLITTER_CLASS]);
> >> + mask = engine_enable_mask(gt, XE_ENGINE_CLASS_COPY);
> >> break;
> >> case GUC_CAPTURE_LIST_CLASS_GSC_OTHER:
> >> - mask = info_map_read(xe, info_map,
> >> engine_enabled_masks[GUC_GSC_OTHER_CLASS]);
> >> + mask = engine_enable_mask(gt, XE_ENGINE_CLASS_OTHER);
> >> break;
> >> default:
> >> mask = 0;
> >> @@ -907,8 +902,6 @@ static void guc_populate_golden_lrc(struct
> >> xe_guc_ads *ads)
> >> {
> >> struct xe_device *xe = ads_to_xe(ads);
> >> struct xe_gt *gt = ads_to_gt(ads);
> >> - struct iosys_map info_map =
> >> IOSYS_MAP_INIT_OFFSET(ads_to_map(ads),
> >> - offsetof(struct __guc_ads_blob, system_info));
> >> size_t total_size = 0, alloc_size, real_size;
> >> u32 addr_ggtt, offset;
> >> int class;
> >> @@ -917,12 +910,9 @@ static void guc_populate_golden_lrc(struct
> >> xe_guc_ads *ads)
> >> addr_ggtt = xe_bo_ggtt_addr(ads->bo) + offset;
> >>
> >> for (class = 0; class < XE_ENGINE_CLASS_MAX; ++class) {
> >> - u8 guc_class;
> >> + u8 guc_class = xe_engine_class_to_guc_class(class);
> >>
> >> - guc_class = xe_engine_class_to_guc_class(class);
> >> -
> >> - if (!info_map_read(xe, &info_map,
> >> - engine_enabled_masks[guc_class]))
> >> + if (!engine_enable_mask(gt, class))
> >> continue;
> >>
> >> xe_gt_assert(gt, gt->default_lrc[class]);
> >> --
> >> 2.43.0
More information about the Intel-xe
mailing list