[PATCH v4 3/4] drm/xe: Check 64-bit CTX timestamp for TDR when available

Fri May 9 16:15:44 UTC 2025

On Wed, May 07, 2025 at 11:10:13PM -0700, Matthew Brost wrote:
>On Wed, May 07, 2025 at 04:17:43PM -0700, Umesh Nerlige Ramappa wrote:
>> On Wed, May 07, 2025 at 02:22:13PM -0700, Matthew Brost wrote:
>> > On Wed, May 07, 2025 at 04:14:22PM -0500, Lucas De Marchi wrote:
>> > > On Tue, May 06, 2025 at 06:30:42PM -0700, Umesh Nerlige Ramappa wrote:
>> > > > Where available, add support for 64-bit CTX timestamp when calculating
>> > > > job run time.
>> > > >
>> > > > v2: (Matt)
>> > > > - Use xe->info.has_64bit_timestamp instead of version check
>> > > > - Allow timeout > 100sec on platforms with 64-bit timestampt
>> > > >
>> > > > Suggested-by: Lucas De Marchi <lucas.demarchi at intel.com>
>> > > > Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>> > > > ---
>> > > > drivers/gpu/drm/xe/regs/xe_engine_regs.h |  1 +
>> > > > drivers/gpu/drm/xe/regs/xe_lrc_layout.h  |  1 +
>> > > > drivers/gpu/drm/xe/xe_device_types.h     |  2 +
>> > > > drivers/gpu/drm/xe/xe_guc_submit.c       | 18 ++++---
>> > > > drivers/gpu/drm/xe/xe_lrc.c              | 64 +++++++++++++++++++++---
>> > > > drivers/gpu/drm/xe/xe_lrc.h              | 10 ++--
>> > > > drivers/gpu/drm/xe/xe_pci.c              |  2 +
>> > > > drivers/gpu/drm/xe/xe_pci_types.h        |  1 +
>> > > > drivers/gpu/drm/xe/xe_ring_ops.c         |  8 +++
>> > > > 9 files changed, 90 insertions(+), 17 deletions(-)
>> > > >
>> > > > diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
>> > > > index da713634d6a0..52f4c96c01dc 100644
>> > > > --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
>> > > > +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
>> > > > @@ -154,6 +154,7 @@
>> > > > #define   STOP_RING				REG_BIT(8)
>> > > >
>> > > > #define RING_CTX_TIMESTAMP(base)		XE_REG((base) + 0x3a8)
>> > > > +#define RING_CTX_TIMESTAMP_UDW(base)		XE_REG((base) + 0x3ac)
>> > > > #define CSBE_DEBUG_STATUS(base)			XE_REG((base) + 0x3fc)
>> > > >
>> > > > #define RING_FORCE_TO_NONPRIV(base, i)		XE_REG(((base) + 0x4d0) + (i) * 4)
>> > > > diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>> > > > index 57944f90bbf6..8619244b7c7b 100644
>> > > > --- a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>> > > > +++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>> > > > @@ -12,6 +12,7 @@
>> > > > #define CTX_RING_START			(0x08 + 1)
>> > > > #define CTX_RING_CTL			(0x0a + 1)
>> > > > #define CTX_TIMESTAMP			(0x22 + 1)
>> > > > +#define CTX_TIMESTAMP_UDW		(0x24 + 1)
>> > > > #define CTX_INDIRECT_RING_STATE		(0x26 + 1)
>> > > > #define CTX_PDP0_UDW			(0x30 + 1)
>> > > > #define CTX_PDP0_LDW			(0x32 + 1)
>> > > > diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>> > > > index 495bc00ebed4..9446a739948c 100644
>> > > > --- a/drivers/gpu/drm/xe/xe_device_types.h
>> > > > +++ b/drivers/gpu/drm/xe/xe_device_types.h
>> > > > @@ -334,6 +334,8 @@ struct xe_device {
>> > > > 		u8 has_sriov:1;
>> > > > 		/** @info.has_usm: Device has unified shared memory support */
>> > > > 		u8 has_usm:1;
>> > > > +		/** @info.has_64bit_timestamp: Device supports 64-bit timestamps */
>> > > > +		u8 has_64bit_timestamp:1;
>> > > > 		/** @info.is_dgfx: is discrete device */
>> > > > 		u8 is_dgfx:1;
>> > > > 		/** @info.needs_scratch: needs scratch page for oob prefetch to work */
>> > > > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
>> > > > index 813c3c0bb250..bef32f2b0081 100644
>> > > > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
>> > > > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
>> > > > @@ -936,10 +936,12 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
>> > > >
>> > > > static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
>> > > > {
>> > > > -	struct xe_gt *gt = guc_to_gt(exec_queue_to_guc(q));
>> > > > -	u32 ctx_timestamp, ctx_job_timestamp;
>> > > > -	u32 timeout_ms = q->sched_props.job_timeout_ms;
>> > > > -	u32 diff;
>> > > > +	struct xe_guc *guc = exec_queue_to_guc(q);
>> > > > +	struct xe_device *xe = guc_to_xe(guc);
>> > > > +	struct xe_gt *gt = guc_to_gt(guc);
>> > > > +	u64 ctx_timestamp, ctx_job_timestamp;
>> > > > +	u64 timeout_ms = q->sched_props.job_timeout_ms;
>> > > > +	u64 diff, max = U32_MAX;
>> > > > 	u64 running_time_ms;
>> > > >
>> > > > 	if (!xe_sched_job_started(job)) {
>> > > > @@ -952,15 +954,17 @@ static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
>> > > >
>> > > > 	ctx_timestamp = xe_lrc_ctx_timestamp(q->lrc[0]);
>> > > > 	ctx_job_timestamp = xe_lrc_ctx_job_timestamp(q->lrc[0]);
>> > > > +	if (xe->info.has_64bit_timestamp)
>> > > > +		max = U64_MAX;
>> > > >
>> > > > 	/*
>> > > > 	 * Counter wraps at ~223s at the usual 19.2MHz, be paranoid catch
>> > > > 	 * possible overflows with a high timeout.
>> > >
>> > > this would not be true anymore with a 64b counter. Anyway, why would we
>> > > need to be soooo paranoid to increase this and consider a diff >=
>> > > 223sec? For the timeout check we can leave with the counter wrapping at
>> > > 32b afaics and not change anything here.
>> > >
>> >
>> > Why? If the counter is 64 bits I don't see why you'd wouldn't use all the bits.
>> >
>> > > > 	 */
>> > > > -	xe_gt_assert(gt, timeout_ms < 100 * MSEC_PER_SEC);
>> > > > +	xe_gt_assert(gt, xe->info.has_64bit_timestamp || timeout_ms < 100 * MSEC_PER_SEC);
>> > >
>> > > the assert is on the value of q->sched_props.job_timeout_ms.
>> > > Why are you removing it? Or even why is the assert in this function
>> > > rather than on init?
>> > >
>> >
>> > Admittedly a bit of an odd choice to have an assert here rather than on
>> > init. Also in practice the highest value I think timeout can be is 10s
>> > without a special build of the kernel to allow crazy high values.
>> >
>> > >
>> > > >
>> > > > 	if (ctx_timestamp < ctx_job_timestamp)
>> > > > -		diff = ctx_timestamp + U32_MAX - ctx_job_timestamp;
>> > > > +		diff = ctx_timestamp + max - ctx_job_timestamp;
>> > > > 	else
>> > > > 		diff = ctx_timestamp - ctx_job_timestamp;
>> > > >
>> > > > @@ -971,7 +975,7 @@ static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
>> > > > 		ADJUST_FIVE_PERCENT(xe_gt_clock_interval_to_ms(gt, diff));
>> > > >
>> > > > 	xe_gt_dbg(gt,
>> > > > -		  "Check job timeout: seqno=%u, lrc_seqno=%u, guc_id=%d, running_time_ms=%llu, timeout_ms=%u, diff=0x%08x",
>> > > > +		  "Check job timeout: seqno=%u, lrc_seqno=%u, guc_id=%d, running_time_ms=%llu, timeout_ms=%llu, diff=0x%llx",
>> > > > 		  xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
>> > > > 		  q->guc->id, running_time_ms, timeout_ms, diff);
>> > > >
>> > > > diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
>> > > > index 9d9be9383207..b5f5547c7c4f 100644
>> > > > --- a/drivers/gpu/drm/xe/xe_lrc.c
>> > > > +++ b/drivers/gpu/drm/xe/xe_lrc.c
>> > > > @@ -653,6 +653,7 @@ u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
>> > > > #define LRC_SEQNO_PPHWSP_OFFSET 512
>> > > > #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
>> > > > #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
>> > > > +#define LRC_CTX_JOB_TIMESTAMP_UDW_OFFSET (LRC_CTX_JOB_TIMESTAMP_OFFSET + 4)
>> > > > #define LRC_PARALLEL_PPHWSP_OFFSET 2048
>> > > >
>> > > > u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
>> > > > @@ -691,6 +692,12 @@ static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
>> > > > 	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
>> > > > }
>> > > >
>> > > > +static u32 __xe_lrc_ctx_job_timestamp_udw_offset(struct xe_lrc *lrc)
>> > > > +{
>> > > > +	/* This is stored in the driver-defined portion of PPHWSP */
>> > > > +	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_UDW_OFFSET;
>> > > > +}
>> > > > +
>> > > > static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
>> > > > {
>> > > > 	/* The parallel is stored in the driver-defined portion of PPHWSP */
>> > > > @@ -702,6 +709,11 @@ static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
>> > > > 	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
>> > > > }
>> > > >
>> > > > +static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
>> > > > +{
>> > > > +	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
>> > > > +}
>> > > > +
>> > > > static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
>> > > > {
>> > > > 	/* Indirect ring state page is at the very end of LRC */
>> > > > @@ -728,7 +740,9 @@ DECL_MAP_ADDR_HELPERS(seqno)
>> > > > DECL_MAP_ADDR_HELPERS(regs)
>> > > > DECL_MAP_ADDR_HELPERS(start_seqno)
>> > > > DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
>> > > > +DECL_MAP_ADDR_HELPERS(ctx_job_timestamp_udw)
>> > > > DECL_MAP_ADDR_HELPERS(ctx_timestamp)
>> > > > +DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
>> > > > DECL_MAP_ADDR_HELPERS(parallel)
>> > > > DECL_MAP_ADDR_HELPERS(indirect_ring)
>> > > >
>> > > > @@ -745,19 +759,38 @@ u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
>> > > > 	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
>> > > > }
>> > > >
>> > > > +/**
>> > > > + * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
>> > > > + * @lrc: Pointer to the lrc.
>> > > > + *
>> > > > + * Returns: ctx timestamp udw GGTT address
>> > > > + */
>> > > > +u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
>> > > > +{
>> > > > +	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
>> > > > +}
>> > > > +
>> > > > /**
>> > > >  * xe_lrc_ctx_timestamp() - Read ctx timestamp value
>> > > >  * @lrc: Pointer to the lrc.
>> > > >  *
>> > > >  * Returns: ctx timestamp value
>> > > >  */
>> > > > -u32 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
>> > > > +u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
>> > > > {
>> > > > 	struct xe_device *xe = lrc_to_xe(lrc);
>> > > > 	struct iosys_map map;
>> > > > +	u32 ldw, udw = 0;
>> > > >
>> > > > 	map = __xe_lrc_ctx_timestamp_map(lrc);
>> > > > -	return xe_map_read32(xe, &map);
>> > > > +	ldw = xe_map_read32(xe, &map);
>> > > > +
>> > > > +	if (xe->info.has_64bit_timestamp) {
>> > > > +		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
>> > > > +		udw = xe_map_read32(xe, &map);
>> > > > +	}
>> > > > +
>> > > > +	return (u64)udw << 32 | ldw;
>> > >
>> > > I understand doing this for the next patch, but for this timeout I think
>> > > we could just discard the upper part and let the 32b wrap.
>> > >
>> >
>> > Again why? Seems odd to not use all 64 bits if available.
>>
>> I would use all the 64 bits. The only problem with 32 bits is it appears to
>> be small, maybe if you run a job for something like 500s, then the wrap may
>> have happened twice and we won't know.
>>
>> Is the LR (long running mode) not subject to drm job timeouts? If yes, and
>
>No - LR are not subject to drm job timeouts.
>
>> if there is no practical need for 64 bits for regular mode, I can revert
>> those parts.
>>
>
>IMO we should still use all this bits available to us, but agree there
>is no practical use as long timeouts violate the dma fencing rules. Do
>whatever you / Lucas seems best. Series LGTM.
>

I posted a new revision with just the 32 bit timestamp for TDR to get a 
head start on CI. Will wait for Lucas to confirm which one to use 
eventually.

Thanks,
Umesh
>Matt
>
>> Please advise.
>>
>> Thanks,
>> Umesh
>> >
>> > Matt
>> >
>> > > Lucas De Marchi