[CI 1/2] drm/xe/guc: Enable extended CAT error reporting

Daniele Ceraolo Spurio daniele.ceraolospurio at intel.com
Thu Jun 26 00:05:59 UTC 2025



On 6/25/2025 4:54 PM, Matthew Brost wrote:
> On Wed, Jun 25, 2025 at 01:54:06PM -0700, Daniele Ceraolo Spurio wrote:
>> On newer HW (Xe2 onwards + PVC) it is possible to get extra information
>> when a CAT error occurs, specifically a dword reporting the error type.
>> To enable this extra reporting, we need to opt-in with the GuC, which is
>> done via a specific per-VF feature opt-in H2G.
>>
>> On platforms where the HW does not support the extra reporting, the GuC
>> will set the type to 0xdeadbeef, so we can keep the code simple and
>> opt-in to the feature on every platform and then just discard the data
>> if it is invalid.
>>
>> Note that on native/PF we're guaranteed that the opt-in is available
>> because we don't support any GuC old enough to not have it, but if we're
>> a VF we might be running on a non-XE PF with an older GuC, so we need to
>> handle that case. We can re-use the invalid type above to handle this
>> scenario the same way as if the feature was not supported in HW.
>>
>> Given that this patch is the first user of the guc_buf_cache on native
>> and VF, it also extends that feature to non-PF use-cases.
>>
>> v2: simpler print for the error type (John), rebase
>> v3: use guc_buf_cache instead of new alloc, simpler doc (Michal)
>>
>> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
>> Cc: Nirmoy Das <nirmoy.das at intel.com>
>> Cc: John Harrison <John.C.Harrison at Intel.com>
>> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
>> Reviewed-by: Nirmoy Das <nirmoy.das at intel.com> #v1
>> Reviewed-by: Michal Wajdeczko <michal.wajdeczko at intel.com>
>> Reviewed-by: John Harrison <John.C.Harrison at Intel.com>
>> ---
>>   drivers/gpu/drm/xe/abi/guc_actions_abi.h |  4 ++
>>   drivers/gpu/drm/xe/abi/guc_klvs_abi.h    | 15 +++++++
>>   drivers/gpu/drm/xe/xe_guc.c              | 56 ++++++++++++++++++++++++
>>   drivers/gpu/drm/xe/xe_guc.h              |  1 +
>>   drivers/gpu/drm/xe/xe_guc_submit.c       | 16 +++++--
>>   drivers/gpu/drm/xe/xe_uc.c               |  4 ++
>>   6 files changed, 93 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/abi/guc_actions_abi.h b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
>> index ff4f412c28d8..81eb046aeebf 100644
>> --- a/drivers/gpu/drm/xe/abi/guc_actions_abi.h
>> +++ b/drivers/gpu/drm/xe/abi/guc_actions_abi.h
>> @@ -142,6 +142,7 @@ enum xe_guc_action {
>>   	XE_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
>>   	XE_GUC_ACTION_SET_DEVICE_ENGINE_ACTIVITY_BUFFER = 0x550C,
>>   	XE_GUC_ACTION_SET_FUNCTION_ENGINE_ACTIVITY_BUFFER = 0x550D,
>> +	XE_GUC_ACTION_OPT_IN_FEATURE_KLV = 0x550E,
>>   	XE_GUC_ACTION_NOTIFY_MEMORY_CAT_ERROR = 0x6000,
>>   	XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC = 0x6002,
>>   	XE_GUC_ACTION_PAGE_FAULT_RES_DESC = 0x6003,
>> @@ -271,4 +272,7 @@ enum xe_guc_g2g_type {
>>   #define XE_G2G_DEREGISTER_TILE	REG_GENMASK(15, 12)
>>   #define XE_G2G_DEREGISTER_TYPE	REG_GENMASK(11, 8)
>>   
>> +/* invalid type for XE_GUC_ACTION_NOTIFY_MEMORY_CAT_ERROR */
>> +#define XE_GUC_CAT_ERR_TYPE_INVALID 0xdeadbeef
>> +
>>   #endif
>> diff --git a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h
>> index 7de8f827281f..5b2502bec2dc 100644
>> --- a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h
>> +++ b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h
>> @@ -16,6 +16,7 @@
>>    *  +===+=======+==============================================================+
>>    *  | 0 | 31:16 | **KEY** - KLV key identifier                                 |
>>    *  |   |       |   - `GuC Self Config KLVs`_                                  |
>> + *  |   |       |   - `GuC Opt In Feature KLVs`_                               |
>>    *  |   |       |   - `GuC VGT Policy KLVs`_                                   |
>>    *  |   |       |   - `GuC VF Configuration KLVs`_                             |
>>    *  |   |       |                                                              |
>> @@ -124,6 +125,20 @@ enum  {
>>   	GUC_CONTEXT_POLICIES_KLV_NUM_IDS = 5,
>>   };
>>   
>> +/**
>> + * DOC: GuC Opt In Feature KLVs
>> + *
>> + * `GuC KLV`_ keys available for use with OPT_IN_FEATURE_KLV
>> + *
>> + *  _`GUC_KLV_OPT_IN_FEATURE_EXT_CAT_ERR_TYPE` : 0x4001
>> + *      Adds an extra dword to the XE_GUC_ACTION_NOTIFY_MEMORY_CAT_ERROR G2H
>> + *      containing the type of the CAT error. On HW that does not support
>> + *      reporting the CAT error type, the extra dword is set to 0xdeadbeef.
>> + */
>> +
>> +#define GUC_KLV_OPT_IN_FEATURE_EXT_CAT_ERR_TYPE_KEY 0x4001
>> +#define GUC_KLV_OPT_IN_FEATURE_EXT_CAT_ERR_TYPE_LEN 0u
>> +
>>   /**
>>    * DOC: GuC VGT Policy KLVs
>>    *
>> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>> index 209e5d53c290..4a7c467ad669 100644
>> --- a/drivers/gpu/drm/xe/xe_guc.c
>> +++ b/drivers/gpu/drm/xe/xe_guc.c
>> @@ -29,6 +29,7 @@
>>   #include "xe_guc_db_mgr.h"
>>   #include "xe_guc_engine_activity.h"
>>   #include "xe_guc_hwconfig.h"
>> +#include "xe_guc_klv_helpers.h"
>>   #include "xe_guc_log.h"
>>   #include "xe_guc_pc.h"
>>   #include "xe_guc_relay.h"
>> @@ -570,6 +571,57 @@ static int guc_g2g_start(struct xe_guc *guc)
>>   	return err;
>>   }
>>   
>> +static int __guc_opt_in_features_enable(struct xe_guc *guc, u64 addr, u32 num_dwords)
>> +{
>> +	u32 action[] = {
>> +		XE_GUC_ACTION_OPT_IN_FEATURE_KLV,
>> +		lower_32_bits(addr),
>> +		upper_32_bits(addr),
>> +		num_dwords
>> +	};
>> +
>> +	return xe_guc_ct_send_block(&guc->ct, action, ARRAY_SIZE(action));
>> +}
>> +
>> +#define OPT_IN_MAX_DWORDS 16
>> +int xe_guc_opt_in_features_enable(struct xe_guc *guc)
>> +{
>> +	struct xe_device *xe = guc_to_xe(guc);
>> +	CLASS(xe_guc_buf, buf)(&guc->buf, OPT_IN_MAX_DWORDS);
>> +	u32 count = 0;
>> +	u32 *klvs;
>> +	int ret;
>> +
>> +	if (!xe_guc_buf_is_valid(buf))
>> +		return -ENOBUFS;
>> +
>> +	klvs = xe_guc_buf_cpu_ptr(buf);
>> +
>> +	/*
>> +	 * The extra CAT error type opt-in was added in GuC v70.17.0, which maps
>> +	 * to compatibility version v1.7.0.
>> +	 * Note that the GuC allows enabling this KLV even on platforms that do
>> +	 * not support the extra type; in such case the returned type variable
>> +	 * will be set to a known invalid value which we can check against.
>> +	 */
>> +	if (GUC_SUBMIT_VER(guc) >= MAKE_GUC_VER(1, 7, 0))
>> +		klvs[count++] = PREP_GUC_KLV_TAG(OPT_IN_FEATURE_EXT_CAT_ERR_TYPE);
>> +
>> +	if (count) {
>> +		xe_assert(xe, count <= OPT_IN_MAX_DWORDS);
>> +
>> +		ret = __guc_opt_in_features_enable(guc, xe_guc_buf_flush(buf), count);
>> +		if (ret < 0) {
>> +			xe_gt_err(guc_to_gt(guc),
>> +				  "failed to enable GuC opt-in features: %pe\n",
>> +				  ERR_PTR(ret));
>> +			return ret;
>> +		}
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>   static void guc_fini_hw(void *arg)
>>   {
>>   	struct xe_guc *guc = arg;
>> @@ -767,6 +819,10 @@ int xe_guc_post_load_init(struct xe_guc *guc)
>>   
>>   	xe_guc_ads_populate_post_load(&guc->ads);
>>   
>> +	ret = xe_guc_opt_in_features_enable(guc);
>> +	if (ret)
>> +		return ret;
>> +
>>   	if (xe_guc_g2g_wanted(guc_to_xe(guc))) {
>>   		ret = guc_g2g_start(guc);
>>   		if (ret)
>> diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
>> index 58338be44558..4a66575f017d 100644
>> --- a/drivers/gpu/drm/xe/xe_guc.h
>> +++ b/drivers/gpu/drm/xe/xe_guc.h
>> @@ -33,6 +33,7 @@ int xe_guc_reset(struct xe_guc *guc);
>>   int xe_guc_upload(struct xe_guc *guc);
>>   int xe_guc_min_load_for_hwconfig(struct xe_guc *guc);
>>   int xe_guc_enable_communication(struct xe_guc *guc);
>> +int xe_guc_opt_in_features_enable(struct xe_guc *guc);
>>   int xe_guc_suspend(struct xe_guc *guc);
>>   void xe_guc_notify(struct xe_guc *guc);
>>   int xe_guc_auth_huc(struct xe_guc *guc, u32 rsa_addr);
>> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
>> index 26c6c71dc91a..32548c931615 100644
>> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
>> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
>> @@ -2103,12 +2103,16 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
>>   	struct xe_gt *gt = guc_to_gt(guc);
>>   	struct xe_exec_queue *q;
>>   	u32 guc_id;
>> +	u32 type = XE_GUC_CAT_ERR_TYPE_INVALID;
>>   
>> -	if (unlikely(len < 1))
>> +	if (unlikely(!len || len > 2))
>>   		return -EPROTO;
>>   
>>   	guc_id = msg[0];
>>   
>> +	if (len == 2)
>> +		type = msg[1];
>> +
>>   	if (guc_id == GUC_ID_UNKNOWN) {
>>   		/*
>>   		 * GuC uses GUC_ID_UNKNOWN if it can not map the CAT fault to any PF/VF
>> @@ -2122,8 +2126,14 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
>>   	if (unlikely(!q))
>>   		return -EPROTO;
>>   
>> -	xe_gt_dbg(gt, "Engine memory cat error: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
>> -		  xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
>> +	if (type != XE_GUC_CAT_ERR_TYPE_INVALID)
>> +		xe_gt_dbg(gt,
>> +			  "Engine memory CAT error [%u]: class=%s, logical_mask: 0x%x, guc_id=%d",
>> +			  type, xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
> Do we define the type anywhere? I only see XE_GUC_CAT_ERR_TYPE_INVALID.
>
> It would be useful if we had this defined somewhere in KMD headers or
> even more useful if type was accompanied by a string description.

The type is HW-defined; the GuC just forwards it. AFAICT the values are
not guaranteed to be the same across platforms (the Xe and Xe2 lists are
different, see bspec 54047 and 72187), so I don't think we want to
maintain a list in the driver.

Daniele

>
> Matt
>   
>> +	else
>> +		xe_gt_dbg(gt,
>> +			  "Engine memory CAT error: class=%s, logical_mask: 0x%x, guc_id=%d",
>> +			  xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
>>   
>>   	trace_xe_exec_queue_memory_cat_error(q);
>>   
>> diff --git a/drivers/gpu/drm/xe/xe_uc.c b/drivers/gpu/drm/xe/xe_uc.c
>> index 3a8751a8b92d..5c45b0f072a4 100644
>> --- a/drivers/gpu/drm/xe/xe_uc.c
>> +++ b/drivers/gpu/drm/xe/xe_uc.c
>> @@ -165,6 +165,10 @@ static int vf_uc_init_hw(struct xe_uc *uc)
>>   
>>   	uc->guc.submission_state.enabled = true;
>>   
>> +	err = xe_guc_opt_in_features_enable(&uc->guc);
>> +	if (err)
>> +		return err;
>> +
>>   	err = xe_gt_record_default_lrcs(uc_to_gt(uc));
>>   	if (err)
>>   		return err;
>> -- 
>> 2.43.0
>>



More information about the Intel-xe mailing list