[PATCH v4 3/4] drm/xe: Check 64-bit CTX timestamp for TDR when available
Umesh Nerlige Ramappa
umesh.nerlige.ramappa at intel.com
Wed May 7 23:17:43 UTC 2025
On Wed, May 07, 2025 at 02:22:13PM -0700, Matthew Brost wrote:
>On Wed, May 07, 2025 at 04:14:22PM -0500, Lucas De Marchi wrote:
>> On Tue, May 06, 2025 at 06:30:42PM -0700, Umesh Nerlige Ramappa wrote:
>> > Where available, add support for 64-bit CTX timestamp when calculating
>> > job run time.
>> >
>> > v2: (Matt)
>> > - Use xe->info.has_64bit_timestamp instead of version check
>> > - Allow timeout > 100sec on platforms with 64-bit timestamp
>> >
>> > Suggested-by: Lucas De Marchi <lucas.demarchi at intel.com>
>> > Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>> > ---
>> > drivers/gpu/drm/xe/regs/xe_engine_regs.h | 1 +
>> > drivers/gpu/drm/xe/regs/xe_lrc_layout.h | 1 +
>> > drivers/gpu/drm/xe/xe_device_types.h | 2 +
>> > drivers/gpu/drm/xe/xe_guc_submit.c | 18 ++++---
>> > drivers/gpu/drm/xe/xe_lrc.c | 64 +++++++++++++++++++++---
>> > drivers/gpu/drm/xe/xe_lrc.h | 10 ++--
>> > drivers/gpu/drm/xe/xe_pci.c | 2 +
>> > drivers/gpu/drm/xe/xe_pci_types.h | 1 +
>> > drivers/gpu/drm/xe/xe_ring_ops.c | 8 +++
>> > 9 files changed, 90 insertions(+), 17 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
>> > index da713634d6a0..52f4c96c01dc 100644
>> > --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
>> > +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
>> > @@ -154,6 +154,7 @@
>> > #define STOP_RING REG_BIT(8)
>> >
>> > #define RING_CTX_TIMESTAMP(base) XE_REG((base) + 0x3a8)
>> > +#define RING_CTX_TIMESTAMP_UDW(base) XE_REG((base) + 0x3ac)
>> > #define CSBE_DEBUG_STATUS(base) XE_REG((base) + 0x3fc)
>> >
>> > #define RING_FORCE_TO_NONPRIV(base, i) XE_REG(((base) + 0x4d0) + (i) * 4)
>> > diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>> > index 57944f90bbf6..8619244b7c7b 100644
>> > --- a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>> > +++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>> > @@ -12,6 +12,7 @@
>> > #define CTX_RING_START (0x08 + 1)
>> > #define CTX_RING_CTL (0x0a + 1)
>> > #define CTX_TIMESTAMP (0x22 + 1)
>> > +#define CTX_TIMESTAMP_UDW (0x24 + 1)
>> > #define CTX_INDIRECT_RING_STATE (0x26 + 1)
>> > #define CTX_PDP0_UDW (0x30 + 1)
>> > #define CTX_PDP0_LDW (0x32 + 1)
>> > diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>> > index 495bc00ebed4..9446a739948c 100644
>> > --- a/drivers/gpu/drm/xe/xe_device_types.h
>> > +++ b/drivers/gpu/drm/xe/xe_device_types.h
>> > @@ -334,6 +334,8 @@ struct xe_device {
>> > u8 has_sriov:1;
>> > /** @info.has_usm: Device has unified shared memory support */
>> > u8 has_usm:1;
>> > + /** @info.has_64bit_timestamp: Device supports 64-bit timestamps */
>> > + u8 has_64bit_timestamp:1;
>> > /** @info.is_dgfx: is discrete device */
>> > u8 is_dgfx:1;
>> > /** @info.needs_scratch: needs scratch page for oob prefetch to work */
>> > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
>> > index 813c3c0bb250..bef32f2b0081 100644
>> > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
>> > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
>> > @@ -936,10 +936,12 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
>> >
>> > static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
>> > {
>> > - struct xe_gt *gt = guc_to_gt(exec_queue_to_guc(q));
>> > - u32 ctx_timestamp, ctx_job_timestamp;
>> > - u32 timeout_ms = q->sched_props.job_timeout_ms;
>> > - u32 diff;
>> > + struct xe_guc *guc = exec_queue_to_guc(q);
>> > + struct xe_device *xe = guc_to_xe(guc);
>> > + struct xe_gt *gt = guc_to_gt(guc);
>> > + u64 ctx_timestamp, ctx_job_timestamp;
>> > + u64 timeout_ms = q->sched_props.job_timeout_ms;
>> > + u64 diff, max = U32_MAX;
>> > u64 running_time_ms;
>> >
>> > if (!xe_sched_job_started(job)) {
>> > @@ -952,15 +954,17 @@ static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
>> >
>> > ctx_timestamp = xe_lrc_ctx_timestamp(q->lrc[0]);
>> > ctx_job_timestamp = xe_lrc_ctx_job_timestamp(q->lrc[0]);
>> > + if (xe->info.has_64bit_timestamp)
>> > + max = U64_MAX;
>> >
>> > /*
>> > * Counter wraps at ~223s at the usual 19.2MHz, be paranoid catch
>> > * possible overflows with a high timeout.
>>
>> this would not be true anymore with a 64b counter. Anyway, why would we
>> need to be soooo paranoid to increase this and consider a diff >=
>> 223sec? For the timeout check we can live with the counter wrapping at
>> 32b afaics and not change anything here.
>>
>
>Why? If the counter is 64 bits I don't see why you wouldn't use all the bits.
>
>> > */
>> > - xe_gt_assert(gt, timeout_ms < 100 * MSEC_PER_SEC);
>> > + xe_gt_assert(gt, xe->info.has_64bit_timestamp || timeout_ms < 100 * MSEC_PER_SEC);
>>
>> the assert is on the value of q->sched_props.job_timeout_ms.
>> Why are you removing it? Or even why is the assert in this function
>> rather than on init?
>>
>
>Admittedly a bit of an odd choice to have an assert here rather than on
>init. Also in practice the highest value I think timeout can be is 10s
>without a special build of the kernel to allow crazy high values.
>
>>
>> >
>> > if (ctx_timestamp < ctx_job_timestamp)
>> > - diff = ctx_timestamp + U32_MAX - ctx_job_timestamp;
>> > + diff = ctx_timestamp + max - ctx_job_timestamp;
>> > else
>> > diff = ctx_timestamp - ctx_job_timestamp;
>> >
>> > @@ -971,7 +975,7 @@ static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
>> > ADJUST_FIVE_PERCENT(xe_gt_clock_interval_to_ms(gt, diff));
>> >
>> > xe_gt_dbg(gt,
>> > - "Check job timeout: seqno=%u, lrc_seqno=%u, guc_id=%d, running_time_ms=%llu, timeout_ms=%u, diff=0x%08x",
>> > + "Check job timeout: seqno=%u, lrc_seqno=%u, guc_id=%d, running_time_ms=%llu, timeout_ms=%llu, diff=0x%llx",
>> > xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
>> > q->guc->id, running_time_ms, timeout_ms, diff);
>> >
>> > diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
>> > index 9d9be9383207..b5f5547c7c4f 100644
>> > --- a/drivers/gpu/drm/xe/xe_lrc.c
>> > +++ b/drivers/gpu/drm/xe/xe_lrc.c
>> > @@ -653,6 +653,7 @@ u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
>> > #define LRC_SEQNO_PPHWSP_OFFSET 512
>> > #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
>> > #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
>> > +#define LRC_CTX_JOB_TIMESTAMP_UDW_OFFSET (LRC_CTX_JOB_TIMESTAMP_OFFSET + 4)
>> > #define LRC_PARALLEL_PPHWSP_OFFSET 2048
>> >
>> > u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
>> > @@ -691,6 +692,12 @@ static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
>> > return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
>> > }
>> >
>> > +static u32 __xe_lrc_ctx_job_timestamp_udw_offset(struct xe_lrc *lrc)
>> > +{
>> > + /* This is stored in the driver-defined portion of PPHWSP */
>> > + return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_UDW_OFFSET;
>> > +}
>> > +
>> > static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
>> > {
>> > /* The parallel is stored in the driver-defined portion of PPHWSP */
>> > @@ -702,6 +709,11 @@ static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
>> > return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
>> > }
>> >
>> > +static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
>> > +{
>> > + return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
>> > +}
>> > +
>> > static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
>> > {
>> > /* Indirect ring state page is at the very end of LRC */
>> > @@ -728,7 +740,9 @@ DECL_MAP_ADDR_HELPERS(seqno)
>> > DECL_MAP_ADDR_HELPERS(regs)
>> > DECL_MAP_ADDR_HELPERS(start_seqno)
>> > DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
>> > +DECL_MAP_ADDR_HELPERS(ctx_job_timestamp_udw)
>> > DECL_MAP_ADDR_HELPERS(ctx_timestamp)
>> > +DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
>> > DECL_MAP_ADDR_HELPERS(parallel)
>> > DECL_MAP_ADDR_HELPERS(indirect_ring)
>> >
>> > @@ -745,19 +759,38 @@ u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
>> > return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
>> > }
>> >
>> > +/**
>> > + * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
>> > + * @lrc: Pointer to the lrc.
>> > + *
>> > + * Returns: ctx timestamp udw GGTT address
>> > + */
>> > +u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
>> > +{
>> > + return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
>> > +}
>> > +
>> > /**
>> > * xe_lrc_ctx_timestamp() - Read ctx timestamp value
>> > * @lrc: Pointer to the lrc.
>> > *
>> > * Returns: ctx timestamp value
>> > */
>> > -u32 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
>> > +u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
>> > {
>> > struct xe_device *xe = lrc_to_xe(lrc);
>> > struct iosys_map map;
>> > + u32 ldw, udw = 0;
>> >
>> > map = __xe_lrc_ctx_timestamp_map(lrc);
>> > - return xe_map_read32(xe, &map);
>> > + ldw = xe_map_read32(xe, &map);
>> > +
>> > + if (xe->info.has_64bit_timestamp) {
>> > + map = __xe_lrc_ctx_timestamp_udw_map(lrc);
>> > + udw = xe_map_read32(xe, &map);
>> > + }
>> > +
>> > + return (u64)udw << 32 | ldw;
>>
>> I understand doing this for the next patch, but for this timeout I think
>> we could just discard the upper part and let the 32b wrap.
>>
>
>Again why? Seems odd to not use all 64 bits if available.
I would use all the 64 bits. The only problem with 32 bits is it appears
to be small, maybe if you run a job for something like 500s, then the
wrap may have happened twice and we won't know.
Is the LR (long running mode) not subject to drm job timeouts? If yes,
and if there is no practical need for 64 bits for regular mode, I can
revert those parts.
Please advise.
Thanks,
Umesh
>
>Matt
>
>> Lucas De Marchi
More information about the Intel-xe
mailing list