[Intel-gfx] [PATCH] dma-buf: Enhance dma-fence tracing

Mon Jan 21 23:20:40 UTC 2019

Rather than every backend and GPU driver reinventing the same wheel for
user level debugging of HW execution, the common dma-fence framework
should include the tracing infrastructure required for most client API
level flow visualisation.

With these common dma-fence level tracepoints, the userspace tools can
establish a detailed view of the client <-> HW flow across different
kernels. There is a strong ask to have this available, so that the
userspace developer can effectively assess if they're doing a good job
about feeding the beast of a GPU hardware.

In the case of needing to look into more fine-grained details of how
kernel internals work towards the goal of feeding the beast, the tools
may optionally amend the dma-fence tracing information with the driver
implementation specific. But for such cases, the tools should have a
graceful degradation in case the expected extra tracepoints have
changed or their format differs from the expected, as the kernel
implementation internals are not expected to stay the same.

It is important to distinguish between tracing for the purpose of client
flow visualisation and tracing for the purpose of low-level kernel
debugging. The latter is highly implementation specific, tied to
a particular HW and driver, whereas the former addresses a common goal
of user level tracing and likely a common set of userspace tools.
Having made the distinction that these tracepoints will be consumed for
client API tooling, we raise the spectre of tracepoint ABI stability. It
is hoped that by defining a common set of dma-fence tracepoints, we avoid
the pitfall of exposing low level details and so restrict ourselves only
to the high level flow that is applicable to all drivers and hardware.
Thus the reserved guarantee that this set of tracepoints will be stable
(with the emphasis on depicting client <-> HW flow as opposed to
driver <-> HW).

In terms of specific changes to the dma-fence tracing, we remove the
emission of the strings for every tracepoint (reserving them for
dma_fence_init for cases where they have unique dma_fence_ops, and
preferring to have descriptors for the whole fence context). strings do
not pack as well into the ftrace ringbuffer and we would prefer to
reduce the amount of indirect callbacks required for frequent tracepoint
emission.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Cc: Alex Deucher <alexdeucher at gmail.com>
Cc: "Christian König" <christian.koenig at amd.com>
Cc: Eric Anholt <eric at anholt.net>
Cc: Pierre-Loup Griffais <pgriffais at valvesoftware.com>
Cc: Michael Sartain <mikesart at fastmail.com>
Cc: Steven Rostedt <rostedt at goodmis.org>
---
 drivers/dma-buf/dma-fence.c                 |   9 +-
 drivers/gpu/drm/i915/i915_gem_clflush.c     |   5 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c  |   1 -
 drivers/gpu/drm/i915/i915_request.c         |  16 +-
 drivers/gpu/drm/i915/i915_timeline.c        |   5 +
 drivers/gpu/drm/i915/i915_trace.h           | 134 ---------------
 drivers/gpu/drm/i915/intel_guc_submission.c |  10 ++
 drivers/gpu/drm/i915/intel_lrc.c            |   6 +
 drivers/gpu/drm/i915/intel_ringbuffer.h     |   2 +
 include/trace/events/dma_fence.h            | 177 +++++++++++++++++++-
 10 files changed, 214 insertions(+), 151 deletions(-)

diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
index 3aa8733f832a..5c93ed34b1ff 100644
--- a/drivers/dma-buf/dma-fence.c
+++ b/drivers/dma-buf/dma-fence.c
@@ -27,8 +27,15 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/dma_fence.h>
 
+EXPORT_TRACEPOINT_SYMBOL(dma_fence_context_create);
+EXPORT_TRACEPOINT_SYMBOL(dma_fence_context_destroy);
+
+EXPORT_TRACEPOINT_SYMBOL(dma_fence_await);
 EXPORT_TRACEPOINT_SYMBOL(dma_fence_emit);
-EXPORT_TRACEPOINT_SYMBOL(dma_fence_enable_signal);
+EXPORT_TRACEPOINT_SYMBOL(dma_fence_execute_start);
+EXPORT_TRACEPOINT_SYMBOL(dma_fence_execute_end);
+EXPORT_TRACEPOINT_SYMBOL(dma_fence_wait_start);
+EXPORT_TRACEPOINT_SYMBOL(dma_fence_wait_end);
 
 static DEFINE_SPINLOCK(dma_fence_stub_lock);
 static struct dma_fence dma_fence_stub;
diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
index 8e74c23cbd91..435c1303ecc8 100644
--- a/drivers/gpu/drm/i915/i915_gem_clflush.c
+++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
@@ -22,6 +22,8 @@
  *
  */
 
+#include <trace/events/dma_fence.h>
+
 #include "i915_drv.h"
 #include "intel_frontbuffer.h"
 #include "i915_gem_clflush.h"
@@ -73,6 +75,7 @@ static void i915_clflush_work(struct work_struct *work)
 	struct clflush *clflush = container_of(work, typeof(*clflush), work);
 	struct drm_i915_gem_object *obj = clflush->obj;
 
+	trace_dma_fence_execute_start(&clflush->dma, smp_processor_id());
 	if (i915_gem_object_pin_pages(obj)) {
 		DRM_ERROR("Failed to acquire obj->pages for clflushing\n");
 		goto out;
@@ -83,6 +86,7 @@ static void i915_clflush_work(struct work_struct *work)
 	i915_gem_object_unpin_pages(obj);
 
 out:
+	trace_dma_fence_execute_end(&clflush->dma, smp_processor_id());
 	i915_gem_object_put(obj);
 
 	dma_fence_signal(&clflush->dma);
@@ -97,6 +101,7 @@ i915_clflush_notify(struct i915_sw_fence *fence,
 
 	switch (state) {
 	case FENCE_COMPLETE:
+		trace_dma_fence_emit(&clflush->dma);
 		schedule_work(&clflush->work);
 		break;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index f250109e1f66..f10a9e675e2c 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -2397,7 +2397,6 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	 */
 	eb.request->batch = eb.batch;
 
-	trace_i915_request_queue(eb.request, eb.batch_flags);
 	err = eb_submit(&eb);
 err_request:
 	i915_request_add(eb.request);
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 5e178f5ac18b..d5bbc8cb8306 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -28,6 +28,8 @@
 #include <linux/sched/clock.h>
 #include <linux/sched/signal.h>
 
+#include <trace/events/dma_fence.h>
+
 #include "i915_drv.h"
 #include "i915_reset.h"
 
@@ -254,8 +256,6 @@ static void i915_request_retire(struct i915_request *request)
 	GEM_BUG_ON(!i915_sw_fence_signaled(&request->submit));
 	GEM_BUG_ON(!i915_request_completed(request));
 
-	trace_i915_request_retire(request);
-
 	advance_ring(request);
 	free_capture_list(request);
 
@@ -383,8 +383,6 @@ void __i915_request_submit(struct i915_request *request)
 	/* Transfer from per-context onto the global per-engine timeline */
 	move_to_timeline(request, &engine->timeline);
 
-	trace_i915_request_execute(request);
-
 	wake_up_all(&request->execute);
 }
 
@@ -463,7 +461,8 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 
 	switch (state) {
 	case FENCE_COMPLETE:
-		trace_i915_request_submit(request);
+		trace_dma_fence_emit(&request->fence);
+
 		/*
 		 * We need to serialize use of the submit_request() callback
 		 * with its hotplugging performed during an emergency
@@ -770,6 +769,8 @@ i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
 		if (ret < 0)
 			return ret;
 
+		trace_dma_fence_await(&rq->fence, fence);
+
 		/* Record the latest fence used against each timeline */
 		if (fence->context != rq->i915->mm.unordered_timeline)
 			i915_timeline_sync_set(rq->timeline, fence);
@@ -878,7 +879,6 @@ void i915_request_add(struct i915_request *request)
 		  engine->name, request->fence.context, request->fence.seqno);
 
 	lockdep_assert_held(&request->i915->drm.struct_mutex);
-	trace_i915_request_add(request);
 
 	/*
 	 * Make sure that no request gazumped us - if it was allocated after
@@ -1139,7 +1139,7 @@ long i915_request_wait(struct i915_request *rq,
 	if (!timeout)
 		return -ETIME;
 
-	trace_i915_request_wait_begin(rq, flags);
+	trace_dma_fence_wait_start(&rq->fence);
 
 	add_wait_queue(&rq->execute, &exec);
 	if (flags & I915_WAIT_LOCKED)
@@ -1245,7 +1245,7 @@ long i915_request_wait(struct i915_request *rq,
 	if (flags & I915_WAIT_LOCKED)
 		remove_wait_queue(errq, &reset);
 	remove_wait_queue(&rq->execute, &exec);
-	trace_i915_request_wait_end(rq);
+	trace_dma_fence_wait_end(&rq->fence);
 
 	return timeout;
 }
diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
index 4667cc08c416..a83ae0b59b1f 100644
--- a/drivers/gpu/drm/i915/i915_timeline.c
+++ b/drivers/gpu/drm/i915/i915_timeline.c
@@ -4,6 +4,8 @@
  * Copyright © 2016-2018 Intel Corporation
  */
 
+#include <trace/events/dma_fence.h>
+
 #include "i915_drv.h"
 
 #include "i915_timeline.h"
@@ -30,6 +32,7 @@ void i915_timeline_init(struct drm_i915_private *i915,
 	/* Called during early_init before we know how many engines there are */
 
 	timeline->fence_context = dma_fence_context_alloc(1);
+	trace_dma_fence_context_create(timeline->fence_context, "i915", name);
 
 	spin_lock_init(&timeline->lock);
 
@@ -70,6 +73,8 @@ void i915_timeline_fini(struct i915_timeline *timeline)
 {
 	GEM_BUG_ON(!list_empty(&timeline->requests));
 
+	trace_dma_fence_context_destroy(timeline->fence_context);
+
 	i915_syncmap_free(&timeline->sync);
 
 	list_del(&timeline->link);
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index 33d90eca9cdd..538b6efb5a56 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -584,82 +584,7 @@ TRACE_EVENT(i915_gem_evict_vm,
 	    TP_printk("dev=%d, vm=%p", __entry->dev, __entry->vm)
 );
 
-TRACE_EVENT(i915_request_queue,
-	    TP_PROTO(struct i915_request *rq, u32 flags),
-	    TP_ARGS(rq, flags),
-
-	    TP_STRUCT__entry(
-			     __field(u32, dev)
-			     __field(u32, hw_id)
-			     __field(u64, ctx)
-			     __field(u16, class)
-			     __field(u16, instance)
-			     __field(u32, seqno)
-			     __field(u32, flags)
-			     ),
-
-	    TP_fast_assign(
-			   __entry->dev = rq->i915->drm.primary->index;
-			   __entry->hw_id = rq->gem_context->hw_id;
-			   __entry->class = rq->engine->uabi_class;
-			   __entry->instance = rq->engine->instance;
-			   __entry->ctx = rq->fence.context;
-			   __entry->seqno = rq->fence.seqno;
-			   __entry->flags = flags;
-			   ),
-
-	    TP_printk("dev=%u, engine=%u:%u, hw_id=%u, ctx=%llu, seqno=%u, flags=0x%x",
-		      __entry->dev, __entry->class, __entry->instance,
-		      __entry->hw_id, __entry->ctx, __entry->seqno,
-		      __entry->flags)
-);
-
-DECLARE_EVENT_CLASS(i915_request,
-	    TP_PROTO(struct i915_request *rq),
-	    TP_ARGS(rq),
-
-	    TP_STRUCT__entry(
-			     __field(u32, dev)
-			     __field(u32, hw_id)
-			     __field(u64, ctx)
-			     __field(u16, class)
-			     __field(u16, instance)
-			     __field(u32, seqno)
-			     __field(u32, global)
-			     ),
-
-	    TP_fast_assign(
-			   __entry->dev = rq->i915->drm.primary->index;
-			   __entry->hw_id = rq->gem_context->hw_id;
-			   __entry->class = rq->engine->uabi_class;
-			   __entry->instance = rq->engine->instance;
-			   __entry->ctx = rq->fence.context;
-			   __entry->seqno = rq->fence.seqno;
-			   __entry->global = rq->global_seqno;
-			   ),
-
-	    TP_printk("dev=%u, engine=%u:%u, hw_id=%u, ctx=%llu, seqno=%u, global=%u",
-		      __entry->dev, __entry->class, __entry->instance,
-		      __entry->hw_id, __entry->ctx, __entry->seqno,
-		      __entry->global)
-);
-
-DEFINE_EVENT(i915_request, i915_request_add,
-	    TP_PROTO(struct i915_request *rq),
-	    TP_ARGS(rq)
-);
-
 #if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS)
-DEFINE_EVENT(i915_request, i915_request_submit,
-	     TP_PROTO(struct i915_request *rq),
-	     TP_ARGS(rq)
-);
-
-DEFINE_EVENT(i915_request, i915_request_execute,
-	     TP_PROTO(struct i915_request *rq),
-	     TP_ARGS(rq)
-);
-
 TRACE_EVENT(i915_request_in,
 	    TP_PROTO(struct i915_request *rq, unsigned int port),
 	    TP_ARGS(rq, port),
@@ -728,16 +653,6 @@ TRACE_EVENT(i915_request_out,
 
 #else
 #if !defined(TRACE_HEADER_MULTI_READ)
-static inline void
-trace_i915_request_submit(struct i915_request *rq)
-{
-}
-
-static inline void
-trace_i915_request_execute(struct i915_request *rq)
-{
-}
-
 static inline void
 trace_i915_request_in(struct i915_request *rq, unsigned int port)
 {
@@ -775,55 +690,6 @@ TRACE_EVENT(intel_engine_notify,
 		      __entry->seqno, __entry->waiters)
 );
 
-DEFINE_EVENT(i915_request, i915_request_retire,
-	    TP_PROTO(struct i915_request *rq),
-	    TP_ARGS(rq)
-);
-
-TRACE_EVENT(i915_request_wait_begin,
-	    TP_PROTO(struct i915_request *rq, unsigned int flags),
-	    TP_ARGS(rq, flags),
-
-	    TP_STRUCT__entry(
-			     __field(u32, dev)
-			     __field(u32, hw_id)
-			     __field(u64, ctx)
-			     __field(u16, class)
-			     __field(u16, instance)
-			     __field(u32, seqno)
-			     __field(u32, global)
-			     __field(unsigned int, flags)
-			     ),
-
-	    /* NB: the blocking information is racy since mutex_is_locked
-	     * doesn't check that the current thread holds the lock. The only
-	     * other option would be to pass the boolean information of whether
-	     * or not the class was blocking down through the stack which is
-	     * less desirable.
-	     */
-	    TP_fast_assign(
-			   __entry->dev = rq->i915->drm.primary->index;
-			   __entry->hw_id = rq->gem_context->hw_id;
-			   __entry->class = rq->engine->uabi_class;
-			   __entry->instance = rq->engine->instance;
-			   __entry->ctx = rq->fence.context;
-			   __entry->seqno = rq->fence.seqno;
-			   __entry->global = rq->global_seqno;
-			   __entry->flags = flags;
-			   ),
-
-	    TP_printk("dev=%u, engine=%u:%u, hw_id=%u, ctx=%llu, seqno=%u, global=%u, blocking=%u, flags=0x%x",
-		      __entry->dev, __entry->class, __entry->instance,
-		      __entry->hw_id, __entry->ctx, __entry->seqno,
-		      __entry->global, !!(__entry->flags & I915_WAIT_LOCKED),
-		      __entry->flags)
-);
-
-DEFINE_EVENT(i915_request, i915_request_wait_end,
-	    TP_PROTO(struct i915_request *rq),
-	    TP_ARGS(rq)
-);
-
 TRACE_EVENT_CONDITION(i915_reg_rw,
 	TP_PROTO(bool write, i915_reg_t reg, u64 val, int len, bool trace),
 
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index ab1c49b106f2..dda20ddf0418 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -535,6 +535,14 @@ static void guc_add_request(struct intel_guc *guc, struct i915_request *rq)
 	client->submissions[engine->id] += 1;
 
 	spin_unlock(&client->wq_lock);
+
+	/*
+	 * XXX move to HW synthesis:
+	 * - emit SRM(RCS_TIMESTAMP) to per-context journal
+	 * - poll (maybe interrupt) journal from kthread, generating tracepoint
+	 * - notify callback for dma_fence tracepoint register
+	 */
+	trace_dma_fence_execute_start(&rq->fence, i915_trace_hwid(rq->engine));
 }
 
 /*
@@ -810,6 +818,8 @@ static void guc_submission_tasklet(unsigned long data)
 
 	rq = port_request(port);
 	while (rq && i915_request_completed(rq)) {
+		trace_dma_fence_execute_end(&rq->fence,
+					    i915_trace_hwid(rq->engine));
 		trace_i915_request_out(rq);
 		i915_request_put(rq);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index c0a42afaf177..3fc485925ab1 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -133,6 +133,8 @@
  */
 #include <linux/interrupt.h>
 
+#include <trace/events/dma_fence.h>
+
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
 #include "i915_gem_render_state.h"
@@ -351,11 +353,15 @@ execlists_context_schedule_in(struct i915_request *rq)
 	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
 	intel_engine_context_in(rq->engine);
 	rq->hw_context->active = rq->engine;
+
+	trace_dma_fence_execute_start(&rq->fence, i915_trace_hwid(rq->engine));
 }
 
 static inline void
 execlists_context_schedule_out(struct i915_request *rq, unsigned long status)
 {
+	trace_dma_fence_execute_end(&rq->fence, i915_trace_hwid(rq->engine));
+
 	rq->hw_context->active = NULL;
 	intel_engine_context_out(rq->engine);
 	execlists_context_status_change(rq, status);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index c3ef0f9bf321..0fd7fb7fd2cb 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -579,6 +579,8 @@ struct intel_engine_cs {
 	} stats;
 };
 
+#define i915_trace_hwid(e) ((e)->uabi_class << 16 | (e)->instance)
+
 static inline bool
 intel_engine_needs_cmd_parser(const struct intel_engine_cs *engine)
 {
diff --git a/include/trace/events/dma_fence.h b/include/trace/events/dma_fence.h
index 2212adda8f77..75d0de3ccc4a 100644
--- a/include/trace/events/dma_fence.h
+++ b/include/trace/events/dma_fence.h
@@ -9,12 +9,119 @@
 
 struct dma_fence;
 
+/*
+ * dma-fence tracing *or* How to debug fences
+ *
+ * The dma-fence tracing provides insight into the user <-> HW execution
+ * flow, although dma-fence is not tied to HW and may be used to coordinate
+ * any execution flow. A dma-fence represents a job along a timeline (the fence
+ * context), and when the job is complete it is signaled. However, for the
+ * purposes of visualisation of HW execution, we need a little more information
+ * than that as we need to not only know when the job became ready for execution
+ * (was passed into the HW queue) but ideally when and on which HW engine that
+ * job was eventually scheduled. For CPU bound execution flows, similarly
+ * knowing on which CPU the job was scheduled can be vital information for
+ * debugging.
+ *
+ * The typical flow of events from user to HW for a dma-fence would be:
+ *
+ *   1. dma_fence_init
+ *   2. dma_fence_await (optional)
+ *      - records the dependencies between fences that must be signaled
+ *        before this fence is ready for execution; an asynchronous wait
+ *   3. dma_fence_emit
+ *      - the fence is ready for execution and passed to the execution queue
+ *        (the user to HW/FW/backend transition)
+ *   4. dma_fence_execute_start (optional)
+ *      - records the start of execution on the backend (HW) and includes
+ *        a tag to uniquely identify the backend engine so that concurrent
+ *        execution can be traced
+ *      - may only be emitted for the first fence in a context to begin
+ *        execution
+ *   5. dma_fence_execute_end (optional)
+ *     - records the corresponding completion point of backend execution
+ *     - may only be emitted for the last fence in a context to finish
+ *       execution
+ *   6. dma_fence_signaled
+ *     - records when the fence was marked as completed and the result
+ *       propagated to the various waiters
+ *   7. dma_fence_destroy
+ *
+ * Note that not all fences are used in conjunction with HW engines, and
+ * so may exclude the execution tracing. Nor do they all correspond with
+ * client API, although many may be used as background tasks required
+ * before HW execution.
+ *
+ * The flow of events from HW to user would be:
+ *
+ *   1. dma_fence_wait_begin
+ *   2. dma_fence_enable_signaling (optional)
+ *   3. dma_fence_signaled
+ *   4. dma_fence_wait_end
+ *
+ * For improved visualisation, dma_fence_context_create and
+ * dma_fence_context_destroy are used to couple the context id to a string.
+ */
+
+TRACE_EVENT(dma_fence_context_create,
+	    TP_PROTO(u64 context, const char *driver, const char *timeline),
+	    TP_ARGS(context, driver, timeline),
+
+	    TP_STRUCT__entry(
+			     __field(u64, context)
+			     __string(driver, driver)
+			     __string(timeline, timeline)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->context = context;
+			   __assign_str(driver, driver)
+			   __assign_str(timeline, timeline)
+			   ),
+
+	    TP_printk("context=%llu, driver=%s, timeline=%s",
+		      __entry->context, __get_str(driver), __get_str(timeline))
+);
+
+TRACE_EVENT(dma_fence_context_destroy,
+	    TP_PROTO(u64 context),
+	    TP_ARGS(context),
+
+	    TP_STRUCT__entry(
+			     __field(u64, context)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->context = context;
+			   ),
+
+	    TP_printk("context=%llu", __entry->context)
+);
+
 DECLARE_EVENT_CLASS(dma_fence,
 
 	TP_PROTO(struct dma_fence *fence),
 
 	TP_ARGS(fence),
 
+	TP_STRUCT__entry(
+		__field(u64, context)
+		__field(u64, seqno)
+	),
+
+	TP_fast_assign(
+		__entry->context = fence->context;
+		__entry->seqno = fence->seqno;
+	),
+
+	TP_printk("context=%llu, seqno=%llu", __entry->context, __entry->seqno)
+);
+
+TRACE_EVENT(dma_fence_init,
+	TP_PROTO(struct dma_fence *fence),
+
+	TP_ARGS(fence),
+
 	TP_STRUCT__entry(
 		__string(driver, fence->ops->get_driver_name(fence))
 		__string(timeline, fence->ops->get_timeline_name(fence))
@@ -41,13 +148,6 @@ DEFINE_EVENT(dma_fence, dma_fence_emit,
 	TP_ARGS(fence)
 );
 
-DEFINE_EVENT(dma_fence, dma_fence_init,
-
-	TP_PROTO(struct dma_fence *fence),
-
-	TP_ARGS(fence)
-);
-
 DEFINE_EVENT(dma_fence, dma_fence_destroy,
 
 	TP_PROTO(struct dma_fence *fence),
@@ -83,6 +183,69 @@ DEFINE_EVENT(dma_fence, dma_fence_wait_end,
 	TP_ARGS(fence)
 );
 
+TRACE_EVENT(dma_fence_await,
+	    TP_PROTO(struct dma_fence *wait, struct dma_fence *signal),
+	    TP_ARGS(wait, signal),
+
+	    TP_STRUCT__entry(
+			     __field(u64, wait_context)
+			     __field(u64, wait_seqno)
+			     __field(u64, signal_context)
+			     __field(u64, signal_seqno)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->wait_context = wait->context;
+			   __entry->wait_seqno = wait->seqno;
+			   __entry->signal_context = signal->context;
+			   __entry->signal_seqno = signal->seqno;
+			   ),
+
+	    TP_printk("wait_context=%llu, wait_seqno=%llu, signal_context=%llu, signal_seqno=%llu",
+		      __entry->wait_context, __entry->wait_seqno,
+		      __entry->signal_context, __entry->signal_seqno)
+);
+
+TRACE_EVENT(dma_fence_execute_start,
+	    TP_PROTO(struct dma_fence *fence, u64 hwid),
+	    TP_ARGS(fence, hwid),
+
+	    TP_STRUCT__entry(
+			     __field(u64, context)
+			     __field(u64, seqno)
+			     __field(u64, hwid)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->context = fence->context;
+			   __entry->seqno = fence->seqno;
+			   __entry->hwid = hwid;
+			   ),
+
+	    TP_printk("context=%llu, seqno=%llu, hwid=%llu",
+		      __entry->context, __entry->seqno, __entry->hwid)
+);
+
+TRACE_EVENT(dma_fence_execute_end,
+	    TP_PROTO(struct dma_fence *fence, u64 hwid),
+	    TP_ARGS(fence, hwid),
+
+	    TP_STRUCT__entry(
+			     __field(u64, context)
+			     __field(u64, seqno)
+			     __field(u64, hwid)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->context = fence->context;
+			   __entry->seqno = fence->seqno;
+			   __entry->hwid = hwid;
+			   ),
+
+	    TP_printk("context=%llu, seqno=%llu, hwid=%llu",
+		      __entry->context, __entry->seqno, __entry->hwid)
+);
+
 #endif /*  _TRACE_DMA_FENCE_H */
 
 /* This part must be outside protection */
-- 
2.20.1