[PATCH v1 1/2] drm/xe/debugfs: Expose PCIe Gen5 update telemetry
Nilawar, Badal
badal.nilawar at intel.com
Wed Apr 2 18:24:26 UTC 2025
On 31-03-2025 19:53, Raag Jadav wrote:
> Expose debugfs telemetry required for PCIe Gen5 firmware update for
> discrete GPUs.
>
> Signed-off-by: Raag Jadav <raag.jadav at intel.com>
> ---
> drivers/gpu/drm/xe/xe_debugfs.c | 93 +++++++++++++++++++++++++++++++
> drivers/gpu/drm/xe/xe_pcode_api.h | 4 ++
> 2 files changed, 97 insertions(+)
>
> diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
> index d0503959a8ed..67c941abf4fe 100644
> --- a/drivers/gpu/drm/xe/xe_debugfs.c
> +++ b/drivers/gpu/drm/xe/xe_debugfs.c
> @@ -17,6 +17,9 @@
> #include "xe_gt_debugfs.h"
> #include "xe_gt_printk.h"
> #include "xe_guc_ads.h"
> +#include "xe_mmio.h"
> +#include "xe_pcode_api.h"
> +#include "xe_pcode.h"
> #include "xe_pm.h"
> #include "xe_pxp_debugfs.h"
> #include "xe_sriov.h"
> @@ -191,6 +194,89 @@ static const struct file_operations wedged_mode_fops = {
> .write = wedged_mode_set,
> };
>
> +/**
> + * DOC: PCIe Gen5 Update Limitations
> + *
> + * Default link speed of discrete GPUs is determined by FIT parameters stored
> + * in their flash memory, which are subject to override through user initiated
> + * firmware updates. It has been observed that devices configured with PCIe
> + * Gen5 as their default speed can come across link quality issues due to host
> + * or motherboard limitations and may have to auto-downspeed to PCIe Gen4 when
> + * faced with unstable link at Gen5. The users are required to ensure that the
> + * device is capable of auto-downspeeding to PCIe Gen4 before pushing the image
> + * with Gen5 as default configuration. This can be done by reading
> + * ``pcie_gen4_downspeed_capable`` debugfs entry, which will denote PCIe Gen4
> + * auto-downspeed capability of the device with boolean output value of ``0``
> + * or ``1``, meaning `incapable` or `capable` respectively.
> + *
> + * .. code-block:: shell
> + *
> + * $ cat /sys/kernel/debug/dri/<N>/pcie_gen4_downspeed_capable
Why not on sysfs?
How about simply using "downgrade" instead of "downspeed" throughout
the code?
> + *
> + * Pushing PCIe Gen5 update on an auto-downspeed incapable device and facing
> + * link instability due to host or motherboard limitations can result in driver
> + * not being able to successfully bind to the device, making further firmware
> + * updates impossible with RMA being the last resort.
> + *
> + * Link downspeed status of auto-downspeed capable devices is available through
> + * ``pcie_gen4_downspeed_status`` debugfs entry with boolean output value of
> + * ``0`` or ``1``, with ``0`` meaning no downspeeding was required during link
> + * training (which is the optimal scenario) and ``1`` meaning the device has
> + * downsped to PCIe Gen4 due to unstable Gen5 link.
> + *
> + * .. code-block:: shell
> + *
> + * $ cat /sys/kernel/debug/dri/<N>/pcie_gen4_downspeed_status
> + */
> +
> +static ssize_t pcie_gen4_downspeed_capable_show(struct file *f, char __user *ubuf,
> + size_t size, loff_t *pos)
> +{
> + struct xe_device *xe = file_inode(f)->i_private;
> + struct xe_mmio *mmio = xe_root_tile_mmio(xe);
> + char buf[16];
> + u32 len, val;
> +
> + xe_pm_runtime_get(xe);
> + val = xe_mmio_read32(mmio, PCODE_SCRATCH(16));
> + xe_pm_runtime_put(xe);
> +
> + len = scnprintf(buf, sizeof(buf), "%u\n",
> + REG_FIELD_GET(PCIE_GEN4_DOWNGRADE, val) == DOWNGRADE_CAPABLE ? 1 : 0);
> +
> + return simple_read_from_buffer(ubuf, size, pos, buf, len);
> +}
> +
> +static const struct file_operations pcie_gen4_downspeed_capable_fops = {
> + .owner = THIS_MODULE,
> + .read = pcie_gen4_downspeed_capable_show,
> +};
> +
> +static ssize_t pcie_gen4_downspeed_status_show(struct file *f, char __user *ubuf,
> + size_t size, loff_t *pos)
> +{
> + struct xe_device *xe = file_inode(f)->i_private;
> + struct xe_tile *root_tile = xe_device_get_root_tile(xe);
> + char buf[16];
> + u32 len, val;
> + int ret;
> +
> + xe_pm_runtime_get(xe);
> + ret = xe_pcode_read(root_tile, PCODE_MBOX(DGFX_PCODE_STATUS,
> + DGFX_GET_INIT_STATUS, 0), &val, NULL);
It's better to restrict this to BMG, as on other DGFX platforms bit 31
might be a reserved bit.
Thanks,
Badal
> + xe_pm_runtime_put(xe);
> + if (ret)
> + return ret;
> +
> + len = scnprintf(buf, sizeof(buf), "%u\n", REG_FIELD_GET(REG_BIT(31), val));
> + return simple_read_from_buffer(ubuf, size, pos, buf, len);
> +}
> +
> +static const struct file_operations pcie_gen4_downspeed_status_fops = {
> + .owner = THIS_MODULE,
> + .read = pcie_gen4_downspeed_status_show,
> +};
> +
> void xe_debugfs_register(struct xe_device *xe)
> {
> struct ttm_device *bdev = &xe->ttm;
> @@ -211,6 +297,13 @@ void xe_debugfs_register(struct xe_device *xe)
> debugfs_create_file("wedged_mode", 0600, root, xe,
> &wedged_mode_fops);
>
> + if (IS_DGFX(xe)) {
> + debugfs_create_file("pcie_gen4_downspeed_capable", 0400, root, xe,
> + &pcie_gen4_downspeed_capable_fops);
> + debugfs_create_file("pcie_gen4_downspeed_status", 0400, root, xe,
> + &pcie_gen4_downspeed_status_fops);
> + }
> +
> for (mem_type = XE_PL_VRAM0; mem_type <= XE_PL_VRAM1; ++mem_type) {
> man = ttm_manager_type(bdev, mem_type);
>
> diff --git a/drivers/gpu/drm/xe/xe_pcode_api.h b/drivers/gpu/drm/xe/xe_pcode_api.h
> index e622ae17f08d..1f802d9793ad 100644
> --- a/drivers/gpu/drm/xe/xe_pcode_api.h
> +++ b/drivers/gpu/drm/xe/xe_pcode_api.h
> @@ -66,6 +66,10 @@
> /* Auxiliary info bits */
> #define AUXINFO_HISTORY_OFFSET REG_GENMASK(31, 29)
>
> +/* PCIe Gen4 downgrade capability bits */
> +#define PCIE_GEN4_DOWNGRADE REG_GENMASK(1, 0)
> +#define DOWNGRADE_CAPABLE 2
> +
> struct pcode_err_decode {
> int errno;
> const char *str;
More information about the Intel-xe
mailing list