[Intel-xe] [PATCH V6 1/2] drm/xe: Introduce low level driver error counting APIs

Michal Wajdeczko michal.wajdeczko at intel.com
Fri Oct 13 14:26:19 UTC 2023



On 13.10.2023 14:16, Tejas Upadhyay wrote:
> Low level driver error that might have power or performance
> impact on the system, we are adding a new error counter to GT
> and tile and increment on each occurrance. Lets introduce APIs

typo

> to define and increment each error type counter.
> 
> V5:
>   - Fix MAX enum issue
>   - Use kernel doc for enum - Niranjana
> V4:
>   - Move API declaration under tile.h - Jani
>   - Typos
> V3:
>   - correct #define max value
> V2:
>   - Move some code to its related patch - Michal
>   - Renaming if API and enum - Michal
>   - GUC errors are moved per GT - Michal
>   - Some nits - Michal
> 
> Signed-off-by: Tejas Upadhyay <tejas.upadhyay at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_device_types.h | 17 +++++++++++++++++
>  drivers/gpu/drm/xe/xe_gt.c           | 18 ++++++++++++++++++
>  drivers/gpu/drm/xe/xe_gt.h           |  3 +++
>  drivers/gpu/drm/xe/xe_gt_types.h     | 19 +++++++++++++++++++
>  drivers/gpu/drm/xe/xe_tile.c         | 18 ++++++++++++++++++
>  drivers/gpu/drm/xe/xe_tile.h         |  2 ++
>  6 files changed, 77 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index bc375ddda5a7..aac6d66622d9 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -59,6 +59,20 @@ struct xe_pat_ops;
>  		 const struct xe_tile * : (const struct xe_device *)((tile__)->xe),	\
>  		 struct xe_tile * : (tile__)->xe)
>  
> +/** @enum xe_tile_drv_err_type
> + *  enum for different types of tile level errors in driver
> + */
> +enum xe_tile_drv_err_type {
> +	/* Error type for all PPGTT and GTT errors */
> +	XE_TILE_DRV_ERR_GTT,

nit: any particular reason why do you want to track them together?
AFAIK there could be quite different problems in GGTT vs PPGTT management

> +	/* Interrupt errors */
> +	XE_TILE_DRV_ERR_INTR,
> +	/* Max defined error types, keep this last */
> +	__XE_TILE_DRV_ERR_MAX
> +};
> +
> +#define XE_TILE_DRV_ERR_MAX	__XE_TILE_DRV_ERR_MAX

why do you redefine this MAX?

this different naming of MAX value (using double underscore) was on
purpose to avoid mixing that with 'regular' enum values.
this define kills that

> +
>  /**
>   * struct xe_mem_region - memory region structure
>   * This is used to describe a memory region in xe
> @@ -188,6 +202,9 @@ struct xe_tile {
>  
>  	/** @sysfs: sysfs' kobj used by xe_tile_sysfs */
>  	struct kobject *sysfs;
> +
> +	/** @drv_err_cnt: driver error counter for this tile */
> +	u32 drv_err_cnt[XE_TILE_DRV_ERR_MAX];

__XE_TILE_DRV_ERR_MAX should work here too

>  };
>  
>  /**
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index c63e2e4750b1..b81de3984626 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -47,6 +47,24 @@
>  #include "xe_wa.h"
>  #include "xe_wopcm.h"
>  
> +/**
> + * xe_gt_report_driver_error - Count driver error for gt
> + * @gt: GT to count error for
> + * @err: enum error type
> + *
> + * Increment the driver error counter in respective error
> + * category for this GT.
> + *
> + * Returns void.
> + */
> +void xe_gt_report_driver_error(struct xe_gt *gt,
> +			       const enum xe_gt_drv_err_type err)
> +{
> +	xe_gt_assert(gt, err >= ARRAY_SIZE(gt->drv_err_cnt));

asserts use different logic than WARN_ON so this should rather be:

	xe_gt_assert(gt, err >= 0);
	xe_gt_assert(gt, err < __XE_TILE_DRV_ERR_MAX);

> +	WRITE_ONCE(gt->drv_err_cnt[err],
> +		   READ_ONCE(gt->drv_err_cnt[err]) + 1);
> +}
> +
>  struct xe_gt *xe_gt_alloc(struct xe_tile *tile)
>  {
>  	struct xe_gt *gt;
> diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
> index caded203a8a0..9442d615042f 100644
> --- a/drivers/gpu/drm/xe/xe_gt.h
> +++ b/drivers/gpu/drm/xe/xe_gt.h
> @@ -67,4 +67,7 @@ static inline bool xe_gt_is_usm_hwe(struct xe_gt *gt, struct xe_hw_engine *hwe)
>  		hwe->instance == gt->usm.reserved_bcs_instance;
>  }
>  
> +void xe_gt_report_driver_error(struct xe_gt *gt,
> +			       const enum xe_gt_drv_err_type err);
> +
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
> index d4310be3e1e7..422f54c45ac9 100644
> --- a/drivers/gpu/drm/xe/xe_gt_types.h
> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> @@ -24,6 +24,22 @@ enum xe_gt_type {
>  	XE_GT_TYPE_MEDIA,
>  };
>  
> +/** @enum xe_gt_drv_err_type
> + *  enum for different types of gt level errors in driver
> + */
> +enum xe_gt_drv_err_type {
> +	/* Driver guc communication errors */
> +	XE_GT_DRV_ERR_GUC_COMM,

do we care only about communication with GuC (lost CTB/MMIO, error
replies) or about all problems related to GuC (corrupted FW, FW crash)?

> +	/* Engine execution errors */
> +	XE_GT_DRV_ERR_ENGINE,
> +	/* Other errors like error during save/restore registers */
> +	XE_GT_DRV_ERR_OTHERS,
> +	/* Max defined error types, keep this last */
> +	__XE_GT_DRV_ERR_MAX
> +};
> +
> +#define XE_GT_DRV_ERR_MAX	__XE_GT_DRV_ERR_MAX

ditto

> +
>  #define XE_MAX_DSS_FUSE_REGS	3
>  #define XE_MAX_EU_FUSE_REGS	1
>  
> @@ -347,6 +363,9 @@ struct xe_gt {
>  		/** @oob: bitmap with active OOB workaroudns */
>  		unsigned long *oob;
>  	} wa_active;
> +
> +	/** @drv_err_cnt: driver error counter for this GT */
> +	u32 drv_err_cnt[XE_GT_DRV_ERR_MAX];
>  };
>  
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
> index 131752a57f65..708dd385f2b1 100644
> --- a/drivers/gpu/drm/xe/xe_tile.c
> +++ b/drivers/gpu/drm/xe/xe_tile.c
> @@ -71,6 +71,24 @@
>   *  - MOCS and PAT programming
>   */
>  
> +/**
> + * xe_tile_report_driver_error - Count driver error for tile
> + * @tile: Tile to count error for
> + * @err: enum error type
> + *
> + * Increment the driver error counter in respective error
> + * category for this tile.
> + *
> + * Returns void.
> + */
> +void xe_tile_report_driver_error(struct xe_tile *tile,
> +				 const enum xe_tile_drv_err_type err)
> +{
> +	xe_assert(tile_to_xe(tile), err >= ARRAY_SIZE(tile->drv_err_cnt));

wrong condition

> +	WRITE_ONCE(tile->drv_err_cnt[err],
> +		   READ_ONCE(tile->drv_err_cnt[err]) + 1);
> +}
> +
>  /**
>   * xe_tile_alloc - Perform per-tile memory allocation
>   * @tile: Tile to perform allocations for
> diff --git a/drivers/gpu/drm/xe/xe_tile.h b/drivers/gpu/drm/xe/xe_tile.h
> index 782c47f8bd45..092a6b17a97e 100644
> --- a/drivers/gpu/drm/xe/xe_tile.h
> +++ b/drivers/gpu/drm/xe/xe_tile.h
> @@ -14,5 +14,7 @@ int xe_tile_alloc(struct xe_tile *tile);
>  int xe_tile_init_noalloc(struct xe_tile *tile);
>  
>  void xe_tile_migrate_wait(struct xe_tile *tile);
> +void xe_tile_report_driver_error(struct xe_tile *tile,
> +				 const enum xe_tile_drv_err_type err);
>  
>  #endif


More information about the Intel-xe mailing list