[PATCH v2 07/11] drm/xe: Add xe_guc_tlb_invalidation layer
Michal Wajdeczko
michal.wajdeczko at intel.com
Tue Jul 9 21:31:59 UTC 2024
Hi Matt,
a few nits (as usual)
On 08.07.2024 06:03, Matthew Brost wrote:
> Move GuC specific TLB invalidation into dedicated layer.
>
> v2:
> - Remove XXX comment
>
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> ---
> drivers/gpu/drm/xe/Makefile | 1 +
> drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 121 +--------------
> drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h | 4 +-
> .../gpu/drm/xe/xe_gt_tlb_invalidation_types.h | 13 ++
> drivers/gpu/drm/xe/xe_gt_types.h | 2 +
> drivers/gpu/drm/xe/xe_guc_ct.c | 2 +-
> drivers/gpu/drm/xe/xe_guc_tlb_invalidation.c | 143 ++++++++++++++++++
> drivers/gpu/drm/xe/xe_guc_tlb_invalidation.h | 18 +++
> 8 files changed, 186 insertions(+), 118 deletions(-)
> create mode 100644 drivers/gpu/drm/xe/xe_guc_tlb_invalidation.c
> create mode 100644 drivers/gpu/drm/xe/xe_guc_tlb_invalidation.h
>
> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> index 628c245c4822..21f4ddf1a946 100644
> --- a/drivers/gpu/drm/xe/Makefile
> +++ b/drivers/gpu/drm/xe/Makefile
> @@ -83,6 +83,7 @@ xe-y += xe_bb.o \
> xe_guc_log.o \
> xe_guc_pc.o \
> xe_guc_submit.o \
> + xe_guc_tlb_invalidation.o \
> xe_heci_gsc.o \
> xe_hw_engine.o \
> xe_hw_engine_class_sysfs.o \
> diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> index 79d1ed138db5..4b7b5f8205f9 100644
> --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
> @@ -7,13 +7,12 @@
>
> #include "xe_gt_tlb_invalidation.h"
>
> -#include "abi/guc_actions_abi.h"
> #include "xe_device.h"
> #include "xe_force_wake.h"
> #include "xe_gt.h"
> #include "xe_gt_printk.h"
> -#include "xe_guc.h"
> #include "xe_guc_ct.h"
> +#include "xe_guc_tlb_invalidation.h"
> #include "xe_mmio.h"
> #include "xe_sriov.h"
> #include "xe_trace.h"
> @@ -85,6 +84,7 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt)
> spin_lock_init(&gt->tlb_invalidation.fence_lock);
> INIT_DELAYED_WORK(&gt->tlb_invalidation.fence_tdr,
> xe_gt_tlb_fence_timeout);
> + gt->tlb_invalidation.ops = xe_guc_tlb_invalidation_get_ops(&gt->uc.guc);
what about force_execlist=true mode ?
shouldn't we have nop_ops (or direct_ops) by default and let the GuC
code replace the ops when the GuC is selected/in use ?
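a rough sketch of what such default nop_ops could look like (all names
hypothetical, and completing the seqno immediately is just one option):

	static int nop_tlb_invalidation_ggtt(struct xe_gt *gt, int seqno)
	{
		/* no GuC to invalidate through; report seqno done right away */
		xe_gt_tlb_invalidation_done_handler(gt, seqno);
		return 0;
	}

	static int nop_tlb_invalidation_ppgtt(struct xe_gt *gt, u64 start, u64 end,
					      u32 asid, int seqno)
	{
		xe_gt_tlb_invalidation_done_handler(gt, seqno);
		return 0;
	}

	static const struct xe_gt_tlb_invalidation_ops nop_tlb_invalidation_ops = {
		.tlb_invalidation_ggtt = nop_tlb_invalidation_ggtt,
		.tlb_invalidation_ppgtt = nop_tlb_invalidation_ppgtt,
	};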
>
> return drmm_mutex_init(&gt_to_xe(gt)->drm,
> &gt->tlb_invalidation.seqno_lock);
> @@ -157,97 +157,16 @@ static bool tlb_invalidation_seqno_past(struct xe_gt *gt, int seqno)
> return seqno_recv >= seqno;
> }
>
> -static int send_tlb_invalidation(struct xe_guc *guc, u32 *action, int len)
> -{
> - struct xe_gt *gt = guc_to_gt(guc);
> -
> - xe_gt_assert(gt, action[1]); /* Seqno */
> - lockdep_assert_held(&gt->tlb_invalidation.seqno_lock);
> -
> - /*
> - * XXX: The seqno algorithm relies on TLB invalidation being processed
> - * in order which they currently are, if that changes the algorithm will
> - * need to be updated.
> - */
> -
> - return xe_guc_ct_send(&guc->ct, action, len,
> - G2H_LEN_DW_TLB_INVALIDATE, 1);
> -}
> -
> -#define MAKE_INVAL_OP(type) ((type << XE_GUC_TLB_INVAL_TYPE_SHIFT) | \
> - XE_GUC_TLB_INVAL_MODE_HEAVY << XE_GUC_TLB_INVAL_MODE_SHIFT | \
> - XE_GUC_TLB_INVAL_FLUSH_CACHE)
> -
> static int send_tlb_invalidation_ggtt(struct xe_gt *gt, int seqno)
> {
> - u32 action[] = {
> - XE_GUC_ACTION_TLB_INVALIDATION,
> - seqno,
> - MAKE_INVAL_OP(XE_GUC_TLB_INVAL_GUC),
> - };
> -
> - return send_tlb_invalidation(&gt->uc.guc, action, ARRAY_SIZE(action));
> + return gt->tlb_invalidation.ops->tlb_invalidation_ggtt(gt, seqno);
> }
>
> static int send_tlb_invalidation_ppgtt(struct xe_gt *gt, u64 start, u64 end,
> u32 asid, int seqno)
> {
> -#define MAX_TLB_INVALIDATION_LEN 7
> - u32 action[MAX_TLB_INVALIDATION_LEN];
> - int len = 0;
> -
> - action[len++] = XE_GUC_ACTION_TLB_INVALIDATION;
> - action[len++] = seqno;
> - if (!gt_to_xe(gt)->info.has_range_tlb_invalidation) {
> - action[len++] = MAKE_INVAL_OP(XE_GUC_TLB_INVAL_FULL);
> - } else {
> - u64 orig_start = start;
> - u64 length = end - start;
> - u64 align;
> -
> - if (length < SZ_4K)
> - length = SZ_4K;
> -
> - /*
> - * We need to invalidate a higher granularity if start address
> - * is not aligned to length. When start is not aligned with
> - * length we need to find the length large enough to create an
> - * address mask covering the required range.
> - */
> - align = roundup_pow_of_two(length);
> - start = ALIGN_DOWN(start, align);
> - end = ALIGN(end, align);
> - length = align;
> - while (start + length < end) {
> - length <<= 1;
> - start = ALIGN_DOWN(orig_start, length);
> - }
> -
> - /*
> - * Minimum invalidation size for a 2MB page that the hardware
> - * expects is 16MB
> - */
> - if (length >= SZ_2M) {
> - length = max_t(u64, SZ_16M, length);
> - start = ALIGN_DOWN(orig_start, length);
> - }
> -
> - xe_gt_assert(gt, length >= SZ_4K);
> - xe_gt_assert(gt, is_power_of_2(length));
> - xe_gt_assert(gt, !(length & GENMASK(ilog2(SZ_16M) - 1,
> - ilog2(SZ_2M) + 1)));
> - xe_gt_assert(gt, IS_ALIGNED(start, length));
> -
> - action[len++] = MAKE_INVAL_OP(XE_GUC_TLB_INVAL_PAGE_SELECTIVE);
> - action[len++] = asid;
> - action[len++] = lower_32_bits(start);
> - action[len++] = upper_32_bits(start);
> - action[len++] = ilog2(length) - ilog2(SZ_4K);
> - }
> -
> - xe_gt_assert(gt, len <= MAX_TLB_INVALIDATION_LEN);
> -
> - return send_tlb_invalidation(&gt->uc.guc, action, len);
> + return gt->tlb_invalidation.ops->tlb_invalidation_ppgtt(gt, start, end,
> + asid, seqno);
> }
>
> static void xe_gt_tlb_invalidation_fence_prep(struct xe_gt *gt,
> @@ -278,10 +197,6 @@ static void xe_gt_tlb_invalidation_fence_prep(struct xe_gt *gt,
> gt->tlb_invalidation.seqno = 1;
> }
>
> -#define MAKE_INVAL_OP(type) ((type << XE_GUC_TLB_INVAL_TYPE_SHIFT) | \
> - XE_GUC_TLB_INVAL_MODE_HEAVY << XE_GUC_TLB_INVAL_MODE_SHIFT | \
> - XE_GUC_TLB_INVAL_FLUSH_CACHE)
> -
> static int __xe_gt_tlb_invalidation_ggtt(struct xe_gt *gt,
> struct xe_gt_tlb_invalidation_fence *fence)
> {
> @@ -420,7 +335,7 @@ int xe_gt_tlb_invalidation_vma(struct xe_gt *gt,
> *
> * Update recv seqno, signal any GT TLB invalidation fences, and restart TDR
> */
> -static void xe_gt_tlb_invalidation_done_handler(struct xe_gt *gt, int seqno)
> +void xe_gt_tlb_invalidation_done_handler(struct xe_gt *gt, int seqno)
> {
> struct xe_device *xe = gt_to_xe(gt);
> struct xe_gt_tlb_invalidation_fence *fence, *next;
> @@ -469,30 +384,6 @@ static void xe_gt_tlb_invalidation_done_handler(struct xe_gt *gt, int seqno)
> spin_unlock_irqrestore(&gt->tlb_invalidation.pending_lock, flags);
> }
>
> -/**
> - * xe_guc_tlb_invalidation_done_handler - TLB invalidation done handler
> - * @guc: guc
> - * @msg: message indicating TLB invalidation done
> - * @len: length of message
> - *
> - * Parse seqno of TLB invalidation, wake any waiters for seqno, and signal any
> - * invalidation fences for seqno. Algorithm for this depends on seqno being
> - * received in-order and asserts this assumption.
> - *
> - * Return: 0 on success, -EPROTO for malformed messages.
> - */
> -int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
> -{
> - struct xe_gt *gt = guc_to_gt(guc);
> -
> - if (unlikely(len != 1))
> - return -EPROTO;
> -
> - xe_gt_tlb_invalidation_done_handler(gt, msg[0]);
> -
> - return 0;
> -}
> -
> static const char *
> invalidation_fence_get_driver_name(struct dma_fence *dma_fence)
> {
> diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
> index cbf49b3d0265..ee532ad64aac 100644
> --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
> +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h
> @@ -11,7 +11,6 @@
> #include "xe_gt_tlb_invalidation_types.h"
>
> struct xe_gt;
> -struct xe_guc;
> struct xe_vma;
>
> int xe_gt_tlb_invalidation_init(struct xe_gt *gt);
> @@ -23,11 +22,12 @@ int xe_gt_tlb_invalidation_vma(struct xe_gt *gt,
> int xe_gt_tlb_invalidation_range(struct xe_gt *gt,
> struct xe_gt_tlb_invalidation_fence *fence,
> u64 start, u64 end, u32 asid);
> -int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len);
>
> void xe_gt_tlb_invalidation_fence_init(struct xe_gt *gt,
> struct xe_gt_tlb_invalidation_fence *fence);
>
> +void xe_gt_tlb_invalidation_done_handler(struct xe_gt *gt, int seqno);
> +
> static inline void
> xe_gt_tlb_invalidation_fence_wait(struct xe_gt_tlb_invalidation_fence *fence)
> {
> diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation_types.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation_types.h
> index 934c828efe31..1abb8692d14b 100644
> --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation_types.h
> +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation_types.h
> @@ -8,6 +8,8 @@
>
> #include <linux/dma-fence.h>
>
> +struct xe_gt;
> +
> /**
> * struct xe_gt_tlb_invalidation_fence - XE GT TLB invalidation fence
> *
> @@ -25,4 +27,15 @@ struct xe_gt_tlb_invalidation_fence {
> ktime_t invalidation_time;
> };
>
> +/**
> + * struct xe_gt_tlb_invalidation_ops - Xe GT TLB invalidation operations
> + */
> +struct xe_gt_tlb_invalidation_ops {
> + /** @tlb_invalidation_ggtt: TLB invalidation GGTT */
> + int (*tlb_invalidation_ggtt)(struct xe_gt *gt, int seqno);
> + /** @tlb_invalidation_ppgtt: TLB invalidation PPGTT */
> + int (*tlb_invalidation_ppgtt)(struct xe_gt *gt, u64 start, u64 end,
> + u32 asid, int seqno);
do we need the full "tlb_invalidation" prefix here ?
maybe just:

	int (*ggtt)(struct xe_gt *gt, int seqno);
	int (*ppgtt)(struct xe_gt *gt, u64 start, u64 end,
		     u32 asid, int seqno);

as these will be used like:

	return gt->tlb_invalidation.ops->ggtt(gt, seqno);
or
	return gt->tlb_invalidation.ops->ppgtt(gt, start, end, ...);
> +};
> +
> #endif
> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
> index fb280f058cdc..4b9740a68457 100644
> --- a/drivers/gpu/drm/xe/xe_gt_types.h
> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
> @@ -169,6 +169,8 @@ struct xe_gt {
>
> /** @tlb_invalidation: TLB invalidation state */
> struct {
> + /** @tlb_invalidation.ops: TLB invalidation ops */
> + const struct xe_gt_tlb_invalidation_ops *ops;
> /** @tlb_invalidation.seqno_lock: TLB invalidation seqno lock */
> struct mutex seqno_lock;
> /**
> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> index 7d2e937da1d8..b09423590f62 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> @@ -23,10 +23,10 @@
> #include "xe_gt_printk.h"
> #include "xe_gt_sriov_pf_control.h"
> #include "xe_gt_sriov_pf_monitor.h"
> -#include "xe_gt_tlb_invalidation.h"
> #include "xe_guc.h"
> #include "xe_guc_relay.h"
> #include "xe_guc_submit.h"
> +#include "xe_guc_tlb_invalidation.h"
> #include "xe_map.h"
> #include "xe_pm.h"
> #include "xe_trace_guc.h"
> diff --git a/drivers/gpu/drm/xe/xe_guc_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_guc_tlb_invalidation.c
> new file mode 100644
> index 000000000000..b6fd61bb77ba
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/xe_guc_tlb_invalidation.c
> @@ -0,0 +1,143 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2024 Intel Corporation
> + */
> +
> +#include "abi/guc_actions_abi.h"
> +
> +#include "xe_assert.h"
> +#include "xe_device.h"
> +#include "xe_gt.h"
> +#include "xe_gt_printk.h"
> +#include "xe_gt_tlb_invalidation.h"
> +#include "xe_guc.h"
> +#include "xe_guc_ct.h"
> +#include "xe_guc_tlb_invalidation.h"
> +
> +static int send_tlb_invalidation(struct xe_guc *guc, u32 *action, int len)
const u32 *action
and len should be unsigned
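i.e., matching what xe_guc_ct_send() already takes (a sketch):

	static int send_tlb_invalidation(struct xe_guc *guc, const u32 *action,
					 u32 len)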
> +{
> + struct xe_gt *gt = guc_to_gt(guc);
> +
> + xe_gt_assert(gt, action[1]); /* Seqno */
it would be nice to have that "1" as part of the GuC ABI,
maybe at least as:

	#define H2G_TLB_INVALIDATE_MSG_1_SEQNO GUC_HXG_REQUEST_MSG_n_DATAn

	xe_gt_assert(gt, FIELD_GET(H2G_TLB_INVALIDATE_MSG_1_SEQNO, action[1]));
> + lockdep_assert_held(&gt->tlb_invalidation.seqno_lock);
> +
> + return xe_guc_ct_send(&guc->ct, action, len,
> + G2H_LEN_DW_TLB_INVALIDATE, 1);
> +}
> +
> +#define MAKE_INVAL_OP(type) ((type << XE_GUC_TLB_INVAL_TYPE_SHIFT) | \
> + XE_GUC_TLB_INVAL_MODE_HEAVY << XE_GUC_TLB_INVAL_MODE_SHIFT | \
> + XE_GUC_TLB_INVAL_FLUSH_CACHE)
maybe it's time to convert SHIFT-based GuC ABI definitions into
mask-based and use FIELD_PREP as we do elsewhere
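for example (the mask widths below are hypothetical, just mirroring the
current SHIFT defines):

	#include <linux/bitfield.h>

	#define XE_GUC_TLB_INVAL_TYPE_MASK	GENMASK(7, 0)	/* hypothetical */
	#define XE_GUC_TLB_INVAL_MODE_MASK	GENMASK(11, 8)	/* hypothetical */

	#define MAKE_INVAL_OP(type) \
		(FIELD_PREP(XE_GUC_TLB_INVAL_TYPE_MASK, type) | \
		 FIELD_PREP(XE_GUC_TLB_INVAL_MODE_MASK, XE_GUC_TLB_INVAL_MODE_HEAVY) | \
		 XE_GUC_TLB_INVAL_FLUSH_CACHE)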
> +
> +static int send_tlb_invalidation_ggtt(struct xe_gt *gt, int seqno)
shouldn't we take "guc" instead of "gt" ? see below
> +{
> + u32 action[] = {
> + XE_GUC_ACTION_TLB_INVALIDATION,
> + seqno,
> + MAKE_INVAL_OP(XE_GUC_TLB_INVAL_GUC),
> + };
> +
> + return send_tlb_invalidation(&gt->uc.guc, action, ARRAY_SIZE(action));
> +}
> +
> +static int send_tlb_invalidation_ppgtt(struct xe_gt *gt, u64 start, u64 end,
> + u32 asid, int seqno)
> +{
> +#define MAX_TLB_INVALIDATION_LEN 7
this should be defined as part of the GuC ABI
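e.g. (hypothetical name and location):

	/* in abi/guc_actions_abi.h, next to XE_GUC_ACTION_TLB_INVALIDATION */
	#define XE_GUC_TLB_INVALIDATION_MSG_MAX_LEN	7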
> + u32 action[MAX_TLB_INVALIDATION_LEN];
> + int len = 0;
> +
> + action[len++] = XE_GUC_ACTION_TLB_INVALIDATION;
> + action[len++] = seqno;
> + if (!gt_to_xe(gt)->info.has_range_tlb_invalidation) {
> + action[len++] = MAKE_INVAL_OP(XE_GUC_TLB_INVAL_FULL);
> + } else {
> + u64 orig_start = start;
> + u64 length = end - start;
> + u64 align;
> +
> + if (length < SZ_4K)
> + length = SZ_4K;
> +
> + /*
> + * We need to invalidate a higher granularity if start address
> + * is not aligned to length. When start is not aligned with
> + * length we need to find the length large enough to create an
> + * address mask covering the required range.
> + */
> + align = roundup_pow_of_two(length);
> + start = ALIGN_DOWN(start, align);
> + end = ALIGN(end, align);
> + length = align;
> + while (start + length < end) {
> + length <<= 1;
> + start = ALIGN_DOWN(orig_start, length);
> + }
> +
> + /*
> + * Minimum invalidation size for a 2MB page that the hardware
> + * expects is 16MB
> + */
> + if (length >= SZ_2M) {
> + length = max_t(u64, SZ_16M, length);
> + start = ALIGN_DOWN(orig_start, length);
> + }
> +
> + xe_gt_assert(gt, length >= SZ_4K);
> + xe_gt_assert(gt, is_power_of_2(length));
> + xe_gt_assert(gt, !(length & GENMASK(ilog2(SZ_16M) - 1,
> + ilog2(SZ_2M) + 1)));
> + xe_gt_assert(gt, IS_ALIGNED(start, length));
> +
> + action[len++] = MAKE_INVAL_OP(XE_GUC_TLB_INVAL_PAGE_SELECTIVE);
> + action[len++] = asid;
> + action[len++] = lower_32_bits(start);
> + action[len++] = upper_32_bits(start);
> + action[len++] = ilog2(length) - ilog2(SZ_4K);
> + }
> +
> + xe_gt_assert(gt, len <= MAX_TLB_INVALIDATION_LEN);
> +
> + return send_tlb_invalidation(&gt->uc.guc, action, len);
> +}
> +
> +static const struct xe_gt_tlb_invalidation_ops guc_tlb_invalidation_ops = {
> + .tlb_invalidation_ppgtt = send_tlb_invalidation_ppgtt,
> + .tlb_invalidation_ggtt = send_tlb_invalidation_ggtt,
> +};
maybe it's overkill, but what about using the following code layout:

	// pure GuC code
	static int guc_send_tlb_invalidation(guc, action, len)
	{
	}

	// pure GuC code
	static int guc_send_tlb_invalidation_ggtt(guc, seqno)
	{
		return guc_send_tlb_invalidation(guc, ...);
	}

	// ops glue code
	static int tlb_invalidation_ggtt_by_guc(gt, seqno)
	{
		return guc_send_tlb_invalidation_ggtt(&gt->uc.guc, seqno);
	}

	const struct xe_gt_tlb_invalidation_ops guc_tlb_invalidation_ops = {
		.ggtt = tlb_invalidation_ggtt_by_guc,
	};
> +
> +/**
> + * xe_guc_tlb_invalidation_get_ops - Get Guc TLB invalidation ops
> + * @guc: guc
> + *
> + * Return: xe_gt_tlb_invalidation_ops for GuC TLB invalidation
> + */
> +const struct xe_gt_tlb_invalidation_ops*
> +xe_guc_tlb_invalidation_get_ops(struct xe_guc *guc)
since we don't use "guc" here and don't have any logic behind this
function, maybe it would be easier/cleaner to just expose the ops
directly in the header as:

	extern const struct xe_gt_tlb_invalidation_ops guc_tlb_invalidation_ops;

or, since we may want to postpone GuC ops registration until we know we
will use the GuC, expose a setter in xe_gt_tlb_invalidation.h:

	void xe_gt_tlb_invalidation_set_ops(struct xe_gt *gt,
					    const struct xe_gt_tlb_invalidation_ops *ops);
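a minimal sketch of such a setter (assuming it only needs to store the
pointer):

	void xe_gt_tlb_invalidation_set_ops(struct xe_gt *gt,
					    const struct xe_gt_tlb_invalidation_ops *ops)
	{
		gt->tlb_invalidation.ops = ops;
	}

then the GuC init path would call
xe_gt_tlb_invalidation_set_ops(gt, &guc_tlb_invalidation_ops) only once
we know the GuC will actually be used.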
> +{
> + return &guc_tlb_invalidation_ops;
> +}
> +
> +/**
> + * xe_guc_tlb_invalidation_done_handler - GuC TLB invalidation done handler
> + * @guc: guc
> + * @msg: message indicating TLB invalidation done
> + * @len: length of message
> + *
> + * Parse seqno of TLB invalidation,
only this part seems to be valid; everything below belongs more to the
description of xe_gt_tlb_invalidation_done_handler(), which maybe we
should explicitly refer to here?
> wake any waiters for seqno, and signal any
> + * invalidation fences for seqno. Algorithm for this depends on seqno being
> + * received in-order and asserts this assumption.
> + *
> + * Return: 0 on success, -EPROTO for malformed messages.
> + */
> +int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
> +{
> + struct xe_gt *gt = guc_to_gt(guc);
> +
> + if (unlikely(len != 1))
this magic "1" should be part of the GuC ABI
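e.g. (hypothetical name, next to the other G2H length defines):

	#define G2H_TLB_INVALIDATION_DONE_MSG_LEN	1u

	if (unlikely(len != G2H_TLB_INVALIDATION_DONE_MSG_LEN))
		return -EPROTO;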
> + return -EPROTO;
> +
> + xe_gt_tlb_invalidation_done_handler(gt, msg[0]);
> +
> + return 0;
> +}
> diff --git a/drivers/gpu/drm/xe/xe_guc_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_guc_tlb_invalidation.h
> new file mode 100644
> index 000000000000..44408aae6955
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/xe_guc_tlb_invalidation.h
> @@ -0,0 +1,18 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2024 Intel Corporation
> + */
> +
> +#ifndef _XE_GUC_TLB_INVALIDATION_H_
> +#define _XE_GUC_TLB_INVALIDATION_H_
> +
> +struct xe_guc;
forward decls should be _after_ any includes
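i.e.:

	#include <linux/types.h>

	struct xe_guc;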
> +
> +#include <linux/types.h>
> +
> +const struct xe_gt_tlb_invalidation_ops*
> +xe_guc_tlb_invalidation_get_ops(struct xe_guc *guc);
> +
> +int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len);
> +
> +#endif /* _XE_GUC_TLB_INVALIDATION_H_ */
IIRC in Xe we don't use comment decorations on the closing #endif of
the include guard