[PATCH v1] drm/xe: Rework throttle ABI

Matt Roper matthew.d.roper at intel.com
Fri Oct 25 17:03:41 UTC 2024


On Fri, Oct 25, 2024 at 02:52:38PM +0530, Raag Jadav wrote:
> Current implementation adds multiple sysfs entries for getting GT
> throttle status and its reasons, which forces the user to read multiple
> entries for evaluating the result. Since output of each entry is based
> on the same underlying hardware register and considering the access type
> of this register is RO/v, the value of this register can change at any
> given point, even between subsequent sysfs reads. This makes current
> implementation fundamentally flawed which can produce inconsistent results.
> 
> Rework throttle ABI and introduce throttle_status attribute which will
> provide throttle status through a oneshot register read, making it
> relatively less error prone. The new ABI will provide throttle reasons
> in a string based list based on the respective bits which are set in the
> hardware. Empty output means no bits are set and hence no throttling.

But the old ABI is already released, and we have platforms with
force_probe lifted already.  Presumably userspace software is already
using the existing interface (otherwise it never would have been allowed
to land upstream), so that means we can't change it in incompatible ways
anymore; we're locked into supporting the current interface forever on
these platforms.

We can change the ABI for _future_ platforms if it makes sense, but it's
too late to make compatibility-breaking changes for LNL and BMG.

> 
> $ cat /sys/devices/.../tile0/gt0/freq0/throttle_status
> prochot
> thermal
> ratl
> thermalert
> tdc
> pl4
> pl1
> pl2
> 
> Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/2810

I'm not sure what this ticket is about, but it was already closed
several weeks ago due to no longer reproducing.  The ABI change here
doesn't seem to be directly related to this ticket.


Matt

> Signed-off-by: Raag Jadav <raag.jadav at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_gt_throttle.c | 219 ++++------------------------
>  1 file changed, 31 insertions(+), 188 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_gt_throttle.c b/drivers/gpu/drm/xe/xe_gt_throttle.c
> index 03b225364101..ba8bba7721ef 100644
> --- a/drivers/gpu/drm/xe/xe_gt_throttle.c
> +++ b/drivers/gpu/drm/xe/xe_gt_throttle.c
> @@ -16,24 +16,23 @@
>  /**
>   * DOC: Xe GT Throttle
>   *
> - * Provides sysfs entries and other helpers for frequency throttle reasons in GT
> + * Provides sysfs entry for frequency throttle status and its reasons in GT
>   *
> - * device/gt#/freq0/throttle/status - Overall status
> - * device/gt#/freq0/throttle/reason_pl1 - Frequency throttle due to PL1
> - * device/gt#/freq0/throttle/reason_pl2 - Frequency throttle due to PL2
> - * device/gt#/freq0/throttle/reason_pl4 - Frequency throttle due to PL4, Iccmax etc.
> - * device/gt#/freq0/throttle/reason_thermal - Frequency throttle due to thermal
> - * device/gt#/freq0/throttle/reason_prochot - Frequency throttle due to prochot
> - * device/gt#/freq0/throttle/reason_ratl - Frequency throttle due to RATL
> - * device/gt#/freq0/throttle/reason_vr_thermalert - Frequency throttle due to VR THERMALERT
> - * device/gt#/freq0/throttle/reason_vr_tdc -  Frequency throttle due to VR TDC
> + * device/tile#/gt#/freq0/throttle_status - Throttle status and its reasons
>   */
>  
> -static struct xe_gt *
> -dev_to_gt(struct device *dev)
> -{
> -	return kobj_to_gt(dev->kobj.parent);
> -}
> +/* Available GT throttle reasons */
> +static const char *const throttle_reasons[] = {
> +	[ffs(PROCHOT_MASK) - 1]		= "prochot",
> +	[ffs(THERMAL_LIMIT_MASK) - 1]	= "thermal",
> +	[ffs(RATL_MASK) - 1]		= "ratl",
> +	[ffs(VR_THERMALERT_MASK) - 1]	= "thermalert",
> +	[ffs(VR_TDC_MASK) - 1]		= "tdc",
> +	[ffs(POWER_LIMIT_4_MASK) - 1]	= "pl4",
> +	[ffs(POWER_LIMIT_1_MASK) - 1]	= "pl1",
> +	[ffs(POWER_LIMIT_2_MASK) - 1]	= "pl2",
> +};
> +static_assert(ARRAY_SIZE(throttle_reasons) == ffs(POWER_LIMIT_2_MASK));
>  
>  u32 xe_gt_throttle_get_limit_reasons(struct xe_gt *gt)
>  {
> @@ -49,191 +48,35 @@ u32 xe_gt_throttle_get_limit_reasons(struct xe_gt *gt)
>  	return reg;
>  }
>  
> -static u32 read_status(struct xe_gt *gt)
> +static ssize_t throttle_status_show(struct device *dev, struct device_attribute *attr, char *buf)
>  {
> -	u32 status = xe_gt_throttle_get_limit_reasons(gt) & GT0_PERF_LIMIT_REASONS_MASK;
> +	struct xe_gt *gt = kobj_to_gt(dev->kobj.parent);
> +	unsigned long status;
> +	const char *reason;
> +	u32 index, count = 0;
>  
> -	return status;
> -}
> -
> -static u32 read_reason_pl1(struct xe_gt *gt)
> -{
> -	u32 pl1 = xe_gt_throttle_get_limit_reasons(gt) & POWER_LIMIT_1_MASK;
> -
> -	return pl1;
> -}
> -
> -static u32 read_reason_pl2(struct xe_gt *gt)
> -{
> -	u32 pl2 = xe_gt_throttle_get_limit_reasons(gt) & POWER_LIMIT_2_MASK;
> -
> -	return pl2;
> -}
> -
> -static u32 read_reason_pl4(struct xe_gt *gt)
> -{
> -	u32 pl4 = xe_gt_throttle_get_limit_reasons(gt) & POWER_LIMIT_4_MASK;
> -
> -	return pl4;
> -}
> -
> -static u32 read_reason_thermal(struct xe_gt *gt)
> -{
> -	u32 thermal = xe_gt_throttle_get_limit_reasons(gt) & THERMAL_LIMIT_MASK;
> -
> -	return thermal;
> -}
> -
> -static u32 read_reason_prochot(struct xe_gt *gt)
> -{
> -	u32 prochot = xe_gt_throttle_get_limit_reasons(gt) & PROCHOT_MASK;
> -
> -	return prochot;
> -}
> -
> -static u32 read_reason_ratl(struct xe_gt *gt)
> -{
> -	u32 ratl = xe_gt_throttle_get_limit_reasons(gt) & RATL_MASK;
> -
> -	return ratl;
> -}
> -
> -static u32 read_reason_vr_thermalert(struct xe_gt *gt)
> -{
> -	u32 thermalert = xe_gt_throttle_get_limit_reasons(gt) & VR_THERMALERT_MASK;
> -
> -	return thermalert;
> -}
> -
> -static u32 read_reason_vr_tdc(struct xe_gt *gt)
> -{
> -	u32 tdc = xe_gt_throttle_get_limit_reasons(gt) & VR_TDC_MASK;
> -
> -	return tdc;
> -}
> +	status = xe_gt_throttle_get_limit_reasons(gt) & GT0_PERF_LIMIT_REASONS_MASK;
>  
> -static ssize_t status_show(struct device *dev,
> -			   struct device_attribute *attr,
> -			   char *buff)
> -{
> -	struct xe_gt *gt = dev_to_gt(dev);
> -	bool status = !!read_status(gt);
> -
> -	return sysfs_emit(buff, "%u\n", status);
> -}
> -static DEVICE_ATTR_RO(status);
> -
> -static ssize_t reason_pl1_show(struct device *dev,
> -			       struct device_attribute *attr,
> -			       char *buff)
> -{
> -	struct xe_gt *gt = dev_to_gt(dev);
> -	bool pl1 = !!read_reason_pl1(gt);
> -
> -	return sysfs_emit(buff, "%u\n", pl1);
> -}
> -static DEVICE_ATTR_RO(reason_pl1);
> -
> -static ssize_t reason_pl2_show(struct device *dev,
> -			       struct device_attribute *attr,
> -			       char *buff)
> -{
> -	struct xe_gt *gt = dev_to_gt(dev);
> -	bool pl2 = !!read_reason_pl2(gt);
> -
> -	return sysfs_emit(buff, "%u\n", pl2);
> -}
> -static DEVICE_ATTR_RO(reason_pl2);
> -
> -static ssize_t reason_pl4_show(struct device *dev,
> -			       struct device_attribute *attr,
> -			       char *buff)
> -{
> -	struct xe_gt *gt = dev_to_gt(dev);
> -	bool pl4 = !!read_reason_pl4(gt);
> -
> -	return sysfs_emit(buff, "%u\n", pl4);
> -}
> -static DEVICE_ATTR_RO(reason_pl4);
> -
> -static ssize_t reason_thermal_show(struct device *dev,
> -				   struct device_attribute *attr,
> -				   char *buff)
> -{
> -	struct xe_gt *gt = dev_to_gt(dev);
> -	bool thermal = !!read_reason_thermal(gt);
> -
> -	return sysfs_emit(buff, "%u\n", thermal);
> -}
> -static DEVICE_ATTR_RO(reason_thermal);
> -
> -static ssize_t reason_prochot_show(struct device *dev,
> -				   struct device_attribute *attr,
> -				   char *buff)
> -{
> -	struct xe_gt *gt = dev_to_gt(dev);
> -	bool prochot = !!read_reason_prochot(gt);
> -
> -	return sysfs_emit(buff, "%u\n", prochot);
> -}
> -static DEVICE_ATTR_RO(reason_prochot);
> -
> -static ssize_t reason_ratl_show(struct device *dev,
> -				struct device_attribute *attr,
> -				char *buff)
> -{
> -	struct xe_gt *gt = dev_to_gt(dev);
> -	bool ratl = !!read_reason_ratl(gt);
> -
> -	return sysfs_emit(buff, "%u\n", ratl);
> -}
> -static DEVICE_ATTR_RO(reason_ratl);
> -
> -static ssize_t reason_vr_thermalert_show(struct device *dev,
> -					 struct device_attribute *attr,
> -					 char *buff)
> -{
> -	struct xe_gt *gt = dev_to_gt(dev);
> -	bool thermalert = !!read_reason_vr_thermalert(gt);
> -
> -	return sysfs_emit(buff, "%u\n", thermalert);
> -}
> -static DEVICE_ATTR_RO(reason_vr_thermalert);
> -
> -static ssize_t reason_vr_tdc_show(struct device *dev,
> -				  struct device_attribute *attr,
> -				  char *buff)
> -{
> -	struct xe_gt *gt = dev_to_gt(dev);
> -	bool tdc = !!read_reason_vr_tdc(gt);
> +	for_each_set_bit(index, &status, ffs(POWER_LIMIT_2_MASK)) {
> +		reason = throttle_reasons[index];
> +		if (reason)
> +			count += sysfs_emit_at(buf, count, "%s\n", reason);
> +	}
>  
> -	return sysfs_emit(buff, "%u\n", tdc);
> +	return count;
>  }
> -static DEVICE_ATTR_RO(reason_vr_tdc);
> +static DEVICE_ATTR_RO(throttle_status);
>  
> -static struct attribute *throttle_attrs[] = {
> -	&dev_attr_status.attr,
> -	&dev_attr_reason_pl1.attr,
> -	&dev_attr_reason_pl2.attr,
> -	&dev_attr_reason_pl4.attr,
> -	&dev_attr_reason_thermal.attr,
> -	&dev_attr_reason_prochot.attr,
> -	&dev_attr_reason_ratl.attr,
> -	&dev_attr_reason_vr_thermalert.attr,
> -	&dev_attr_reason_vr_tdc.attr,
> +static const struct attribute *throttle_attrs[] = {
> +	&dev_attr_throttle_status.attr,
>  	NULL
>  };
>  
> -static const struct attribute_group throttle_group_attrs = {
> -	.name = "throttle",
> -	.attrs = throttle_attrs,
> -};
> -
>  static void gt_throttle_sysfs_fini(void *arg)
>  {
>  	struct xe_gt *gt = arg;
>  
> -	sysfs_remove_group(gt->freq, &throttle_group_attrs);
> +	sysfs_remove_files(gt->freq, throttle_attrs);
>  }
>  
>  int xe_gt_throttle_init(struct xe_gt *gt)
> @@ -241,7 +84,7 @@ int xe_gt_throttle_init(struct xe_gt *gt)
>  	struct xe_device *xe = gt_to_xe(gt);
>  	int err;
>  
> -	err = sysfs_create_group(gt->freq, &throttle_group_attrs);
> +	err = sysfs_create_files(gt->freq, throttle_attrs);
>  	if (err)
>  		return err;
>  
> -- 
> 2.34.1
> 

-- 
Matt Roper
Graphics Software Engineer
Linux GPU Platform Enablement
Intel Corporation


More information about the Intel-xe mailing list