[PATCH v3] drm/xe: Add device CXL capabilities identification

Mon May 27 16:43:59 UTC 2024

On 27.05.2024 10:40, Farah Kassabri wrote:
> As future Intel GPUs will use CXL interface with the host
> servers, this patch will add check if the xe device has CXL
> capabilities or not, by reading the PCIe standard DVSEC register
> and identify the CXL vendor id.
> 
> Signed-off-by: Farah Kassabri <fkassabri at habana.ai>
> ---
> Changes in v3:
> fixes following review.
> 
>  drivers/gpu/drm/xe/Makefile          |  1 +
>  drivers/gpu/drm/xe/xe_cxl.c          | 51 ++++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_cxl.h          | 13 +++++++
>  drivers/gpu/drm/xe/xe_device.c       |  5 +++
>  drivers/gpu/drm/xe/xe_device_types.h | 10 ++++++
>  5 files changed, 80 insertions(+)
>  create mode 100644 drivers/gpu/drm/xe/xe_cxl.c
>  create mode 100644 drivers/gpu/drm/xe/xe_cxl.h
> 
> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> index a5959bb7b1fb..65c0a0317ee7 100644
> --- a/drivers/gpu/drm/xe/Makefile
> +++ b/drivers/gpu/drm/xe/Makefile
> @@ -65,6 +65,7 @@ $(uses_generated_oob): $(generated_oob)
>  xe-y += xe_bb.o \
>  	xe_bo.o \
>  	xe_bo_evict.o \
> +	xe_cxl.o \
>  	xe_debugfs.o \
>  	xe_devcoredump.o \
>  	xe_device.o \
> diff --git a/drivers/gpu/drm/xe/xe_cxl.c b/drivers/gpu/drm/xe/xe_cxl.c
> new file mode 100644
> index 000000000000..78e4179e965a
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/xe_cxl.c
> @@ -0,0 +1,51 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2024 Intel Corporation
> + */
> +
> +#include "xe_cxl.h"
> +#include "xe_device.h"

btw, likely you only need "xe_device_types.h"

> +#include "../drivers/cxl/cxlpci.h"

and this still doesn't look good
can you double check with maintainers ?

> +
> +enum {
> +	CXL_DEV_TYPE_ONE = 1,
> +	CXL_DEV_TYPE_TWO,
> +	CXL_DEV_TYPE_THREE
> +};
> +
> +/**
> + * xe_cxl_init_capabilities - set CXL device capabilities
> + * @xe: xe device structure
> + *
> + * Set CXL capabilities and information of the xe device.
> + *
> + * Return: 0 on success, negative value on failure.

usually on failure we return "negative error code (errno)"
but ...

> + */
> +int xe_cxl_init_capabilities(struct xe_device *xe)
> +{
> +	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
> +	u16 ctrl, cxl_dvsec;
> +	int rc;
> +
> +	cxl_dvsec = pci_find_dvsec_capability(pdev, PCI_DVSEC_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE);

   (btw, this line doesn't compile on KUnit)

> +	if (!cxl_dvsec)
> +		return -1;

... here it's a plain number (while likely should be an -errno value)
and ...

> +
> +	rc = pci_read_config_word(pdev, cxl_dvsec + CXL_DVSEC_CTRL_OFFSET, &ctrl);
> +	if (rc < 0) {

   (btw, pci_read_config_word() will return > 0 on error)

> +		drm_err(&xe->drm, "Failed to read DVSEC_CTRL register(%d)\n", rc);
> +		return rc;

... here it will be non-negative PCI error code

> +	}
> +
> +	xe->info.has_cxl_cap = true;
> +	xe->cxl.dvsec = cxl_dvsec;
> +
> +	if (ctrl & CXL_DVSEC_MEM_ENABLE)
> +		xe->cxl.type = (ctrl & BIT(0)) ? CXL_DEV_TYPE_TWO : CXL_DEV_TYPE_THREE;
> +	else
> +		xe->cxl.type = CXL_DEV_TYPE_ONE;
> +
> +	drm_dbg(&xe->drm, "The device has CXL capability, type %u\n", xe->cxl.type);
> +
> +	return 0;
> +}
> diff --git a/drivers/gpu/drm/xe/xe_cxl.h b/drivers/gpu/drm/xe/xe_cxl.h
> new file mode 100644
> index 000000000000..82e36285c383
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/xe_cxl.h
> @@ -0,0 +1,13 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2024 Intel Corporation
> + */
> +
> +#ifndef _XE_CXL_H_
> +#define _XE_CXL_H_
> +
> +struct xe_device;
> +
> +int xe_cxl_init_capabilities(struct xe_device *xe);
> +
> +#endif
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index 5acf2c92789f..04677c9ff10f 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -50,6 +50,7 @@
>  #include "xe_ttm_sys_mgr.h"
>  #include "xe_vm.h"
>  #include "xe_wait_user_fence.h"
> +#include "xe_cxl.h"

includes shall also be sorted

>  
>  static int xe_file_open(struct drm_device *dev, struct drm_file *file)
>  {
> @@ -312,6 +313,10 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
>  	if (WARN_ON(err))
>  		goto err;
>  
> +	err = xe_cxl_init_capabilities(xe);
> +	if (err)
> +		goto err;

are you 100% sure that we should abort driver load on init failure?

and now it will fail on every platform without cxl caps

was this patch tested ?

> +
>  	return xe;
>  
>  err:
> diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> index 8e7048ff3ee5..323c3d57fdb8 100644
> --- a/drivers/gpu/drm/xe/xe_device_types.h
> +++ b/drivers/gpu/drm/xe/xe_device_types.h
> @@ -285,6 +285,8 @@ struct xe_device {
>  		u8 has_atomic_enable_pte_bit:1;
>  		/** @info.has_device_atomics_on_smem: Supports device atomics on SMEM */
>  		u8 has_device_atomics_on_smem:1;
> +		/** @info.has_cxl_cap: device has CXL capabilities */
> +		u8 has_cxl_cap:1;
>  
>  #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
>  		struct {
> @@ -321,6 +323,14 @@ struct xe_device {
>  		struct ttm_resource_manager sys_mgr;
>  	} mem;
>  
> +	/** @cxl: CXL info for device */
> +	struct {
> +		/** @cxl.dvsec: offset to device DVSEC */
> +		u16 dvsec;
> +		/** @cxl.type: CXL device type */
> +		u16 type;
> +	} cxl;
> +
>  	/** @sriov: device level virtualization data */
>  	struct {
>  		/** @sriov.__mode: SR-IOV mode (Don't access directly!) */