[PATCH v4 3/4] drm/xe: Check 64-bit CTX timestamp for TDR when available
Lucas De Marchi
lucas.demarchi at intel.com
Wed May 7 21:14:22 UTC 2025
On Tue, May 06, 2025 at 06:30:42PM -0700, Umesh Nerlige Ramappa wrote:
>Where available, add support for 64-bit CTX timestamp when calculating
>job run time.
>
>v2: (Matt)
>- Use xe->info.has_64bit_timestamp instead of version check
>- Allow timeout > 100sec on platforms with 64-bit timestampt
>
>Suggested-by: Lucas De Marchi <lucas.demarchi at intel.com>
>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>---
> drivers/gpu/drm/xe/regs/xe_engine_regs.h | 1 +
> drivers/gpu/drm/xe/regs/xe_lrc_layout.h | 1 +
> drivers/gpu/drm/xe/xe_device_types.h | 2 +
> drivers/gpu/drm/xe/xe_guc_submit.c | 18 ++++---
> drivers/gpu/drm/xe/xe_lrc.c | 64 +++++++++++++++++++++---
> drivers/gpu/drm/xe/xe_lrc.h | 10 ++--
> drivers/gpu/drm/xe/xe_pci.c | 2 +
> drivers/gpu/drm/xe/xe_pci_types.h | 1 +
> drivers/gpu/drm/xe/xe_ring_ops.c | 8 +++
> 9 files changed, 90 insertions(+), 17 deletions(-)
>
>diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
>index da713634d6a0..52f4c96c01dc 100644
>--- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
>+++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
>@@ -154,6 +154,7 @@
> #define STOP_RING REG_BIT(8)
>
> #define RING_CTX_TIMESTAMP(base) XE_REG((base) + 0x3a8)
>+#define RING_CTX_TIMESTAMP_UDW(base) XE_REG((base) + 0x3ac)
> #define CSBE_DEBUG_STATUS(base) XE_REG((base) + 0x3fc)
>
> #define RING_FORCE_TO_NONPRIV(base, i) XE_REG(((base) + 0x4d0) + (i) * 4)
>diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>index 57944f90bbf6..8619244b7c7b 100644
>--- a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>+++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
>@@ -12,6 +12,7 @@
> #define CTX_RING_START (0x08 + 1)
> #define CTX_RING_CTL (0x0a + 1)
> #define CTX_TIMESTAMP (0x22 + 1)
>+#define CTX_TIMESTAMP_UDW (0x24 + 1)
> #define CTX_INDIRECT_RING_STATE (0x26 + 1)
> #define CTX_PDP0_UDW (0x30 + 1)
> #define CTX_PDP0_LDW (0x32 + 1)
>diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>index 495bc00ebed4..9446a739948c 100644
>--- a/drivers/gpu/drm/xe/xe_device_types.h
>+++ b/drivers/gpu/drm/xe/xe_device_types.h
>@@ -334,6 +334,8 @@ struct xe_device {
> u8 has_sriov:1;
> /** @info.has_usm: Device has unified shared memory support */
> u8 has_usm:1;
>+ /** @info.has_64bit_timestamp: Device supports 64-bit timestamps */
>+ u8 has_64bit_timestamp:1;
> /** @info.is_dgfx: is discrete device */
> u8 is_dgfx:1;
> /** @info.needs_scratch: needs scratch page for oob prefetch to work */
>diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
>index 813c3c0bb250..bef32f2b0081 100644
>--- a/drivers/gpu/drm/xe/xe_guc_submit.c
>+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
>@@ -936,10 +936,12 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
>
> static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
> {
>- struct xe_gt *gt = guc_to_gt(exec_queue_to_guc(q));
>- u32 ctx_timestamp, ctx_job_timestamp;
>- u32 timeout_ms = q->sched_props.job_timeout_ms;
>- u32 diff;
>+ struct xe_guc *guc = exec_queue_to_guc(q);
>+ struct xe_device *xe = guc_to_xe(guc);
>+ struct xe_gt *gt = guc_to_gt(guc);
>+ u64 ctx_timestamp, ctx_job_timestamp;
>+ u64 timeout_ms = q->sched_props.job_timeout_ms;
>+ u64 diff, max = U32_MAX;
> u64 running_time_ms;
>
> if (!xe_sched_job_started(job)) {
>@@ -952,15 +954,17 @@ static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
>
> ctx_timestamp = xe_lrc_ctx_timestamp(q->lrc[0]);
> ctx_job_timestamp = xe_lrc_ctx_job_timestamp(q->lrc[0]);
>+ if (xe->info.has_64bit_timestamp)
>+ max = U64_MAX;
>
> /*
> * Counter wraps at ~223s at the usual 19.2MHz, be paranoid catch
> * possible overflows with a high timeout.
this would not be true anymore with a 64b counter. Anyway, why would we
need to be soooo paranoid to increase this and consider a diff >=
223sec? For the timeout check we can leave with the counter wrapping at
32b afaics and not change anything here.
> */
>- xe_gt_assert(gt, timeout_ms < 100 * MSEC_PER_SEC);
>+ xe_gt_assert(gt, xe->info.has_64bit_timestamp || timeout_ms < 100 * MSEC_PER_SEC);
the assert is on the value of q->sched_props.job_timeout_ms.
Why are you removing it? Or even why is the assert in this function
rather than on init?
>
> if (ctx_timestamp < ctx_job_timestamp)
>- diff = ctx_timestamp + U32_MAX - ctx_job_timestamp;
>+ diff = ctx_timestamp + max - ctx_job_timestamp;
> else
> diff = ctx_timestamp - ctx_job_timestamp;
>
>@@ -971,7 +975,7 @@ static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
> ADJUST_FIVE_PERCENT(xe_gt_clock_interval_to_ms(gt, diff));
>
> xe_gt_dbg(gt,
>- "Check job timeout: seqno=%u, lrc_seqno=%u, guc_id=%d, running_time_ms=%llu, timeout_ms=%u, diff=0x%08x",
>+ "Check job timeout: seqno=%u, lrc_seqno=%u, guc_id=%d, running_time_ms=%llu, timeout_ms=%llu, diff=0x%llx",
> xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
> q->guc->id, running_time_ms, timeout_ms, diff);
>
>diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
>index 9d9be9383207..b5f5547c7c4f 100644
>--- a/drivers/gpu/drm/xe/xe_lrc.c
>+++ b/drivers/gpu/drm/xe/xe_lrc.c
>@@ -653,6 +653,7 @@ u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
> #define LRC_SEQNO_PPHWSP_OFFSET 512
> #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
> #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
>+#define LRC_CTX_JOB_TIMESTAMP_UDW_OFFSET (LRC_CTX_JOB_TIMESTAMP_OFFSET + 4)
> #define LRC_PARALLEL_PPHWSP_OFFSET 2048
>
> u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
>@@ -691,6 +692,12 @@ static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
> return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
> }
>
>+static u32 __xe_lrc_ctx_job_timestamp_udw_offset(struct xe_lrc *lrc)
>+{
>+ /* This is stored in the driver-defined portion of PPHWSP */
>+ return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_UDW_OFFSET;
>+}
>+
> static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
> {
> /* The parallel is stored in the driver-defined portion of PPHWSP */
>@@ -702,6 +709,11 @@ static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
> return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
> }
>
>+static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
>+{
>+ return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
>+}
>+
> static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
> {
> /* Indirect ring state page is at the very end of LRC */
>@@ -728,7 +740,9 @@ DECL_MAP_ADDR_HELPERS(seqno)
> DECL_MAP_ADDR_HELPERS(regs)
> DECL_MAP_ADDR_HELPERS(start_seqno)
> DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
>+DECL_MAP_ADDR_HELPERS(ctx_job_timestamp_udw)
> DECL_MAP_ADDR_HELPERS(ctx_timestamp)
>+DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
> DECL_MAP_ADDR_HELPERS(parallel)
> DECL_MAP_ADDR_HELPERS(indirect_ring)
>
>@@ -745,19 +759,38 @@ u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
> return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
> }
>
>+/**
>+ * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
>+ * @lrc: Pointer to the lrc.
>+ *
>+ * Returns: ctx timestamp udw GGTT address
>+ */
>+u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
>+{
>+ return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
>+}
>+
> /**
> * xe_lrc_ctx_timestamp() - Read ctx timestamp value
> * @lrc: Pointer to the lrc.
> *
> * Returns: ctx timestamp value
> */
>-u32 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
>+u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
> {
> struct xe_device *xe = lrc_to_xe(lrc);
> struct iosys_map map;
>+ u32 ldw, udw = 0;
>
> map = __xe_lrc_ctx_timestamp_map(lrc);
>- return xe_map_read32(xe, &map);
>+ ldw = xe_map_read32(xe, &map);
>+
>+ if (xe->info.has_64bit_timestamp) {
>+ map = __xe_lrc_ctx_timestamp_udw_map(lrc);
>+ udw = xe_map_read32(xe, &map);
>+ }
>+
>+ return (u64)udw << 32 | ldw;
I understand doing this for the next patch, but for this timeout I think
we could just discard the upper part and let the 32b wrap.
Lucas De Marchi
More information about the Intel-xe
mailing list