[PATCH v2 27/27] drm/dp_mst: Add topology ref history tracking for debugging

Sean Paul sean at poorly.run
Fri Sep 27 14:51:07 UTC 2019


On Tue, Sep 03, 2019 at 04:46:05PM -0400, Lyude Paul wrote:
> For very subtle mistakes with topology refs, it can be rather difficult
> to trace them down with the debugging info that we already have. I had
> one such issue recently while trying to implement suspend/resume
> reprobing for MST, and ended up coming up with this.
> 
> Inspired by Chris Wilson's wakeref tracking for i915, this adds a very
> similar feature to the DP MST helpers, which allows for partial tracking
> of topology refs for both ports and branch devices. This is a lot less
> advanced then wakeref tracking: we merely keep a count of all of the
> spots where a topology ref has been grabbed or dropped, then dump out
> that history in chronological order when a port or branch device's
> topology refcount reaches 0. So far, I've found this incredibly useful
> for debugging topology refcount errors.
> 
> Since this has the potential to be somewhat slow and loud, we add an
> expert kernel config option to enable or disable this feature,
> CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS.
> 

Looks very useful indeed! 

My only nit is that we could probably grow the list a little more aggressively
(or start it off at some size > 1) and avoid a bunch of reallocs. That said,
I'm not sure how often it's reallocated so it might not be an issue. Either
way, 

Reviewed-by: Sean Paul <sean at poorly.run>


> Changes since v1:
> * Don't forget to destroy topology_ref_history_lock
> 
> Cc: Juston Li <juston.li at intel.com>
> Cc: Imre Deak <imre.deak at intel.com>
> Cc: Ville Syrjälä <ville.syrjala at linux.intel.com>
> Cc: Harry Wentland <hwentlan at amd.com>
> Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
> Signed-off-by: Lyude Paul <lyude at redhat.com>
> ---
>  drivers/gpu/drm/Kconfig               |  14 ++
>  drivers/gpu/drm/drm_dp_mst_topology.c | 233 +++++++++++++++++++++++++-
>  include/drm/drm_dp_mst_helper.h       |  45 +++++
>  3 files changed, 288 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
> index e67c194c2aca..44fc2c2a6e2c 100644
> --- a/drivers/gpu/drm/Kconfig
> +++ b/drivers/gpu/drm/Kconfig
> @@ -93,6 +93,20 @@ config DRM_KMS_FB_HELPER
>  	help
>  	  FBDEV helpers for KMS drivers.
>  
> +config DRM_DEBUG_DP_MST_TOPOLOGY_REFS
> +        bool "Enable refcount backtrace history in the DP MST helpers"
> +        select STACKDEPOT
> +        depends on DRM_KMS_HELPER
> +        depends on DEBUG_KERNEL
> +        depends on EXPERT
> +        help
> +          Enables debug tracing for topology refs in DRM's DP MST helpers. A
> +          history of each topology reference/dereference will be printed to the
> +          kernel log once a port or branch device's topology refcount reaches 0.
> +
> +          This has the potential to use a lot of memory and print some very
> +          large kernel messages. If in doubt, say "N".
> +
>  config DRM_FBDEV_EMULATION
>  	bool "Enable legacy fbdev support for your modesetting driver"
>  	depends on DRM
> diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c b/drivers/gpu/drm/drm_dp_mst_topology.c
> index 5b5c0b3b3c0e..18f9a02927d9 100644
> --- a/drivers/gpu/drm/drm_dp_mst_topology.c
> +++ b/drivers/gpu/drm/drm_dp_mst_topology.c
> @@ -28,6 +28,13 @@
>  #include <linux/sched.h>
>  #include <linux/seq_file.h>
>  
> +#if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
> +#include <linux/stackdepot.h>
> +#include <linux/sort.h>
> +#include <linux/timekeeping.h>
> +#include <linux/math64.h>
> +#endif
> +
>  #include <drm/drm_atomic.h>
>  #include <drm/drm_atomic_helper.h>
>  #include <drm/drm_dp_mst_helper.h>
> @@ -1405,12 +1412,189 @@ drm_dp_mst_put_port_malloc(struct drm_dp_mst_port *port)
>  }
>  EXPORT_SYMBOL(drm_dp_mst_put_port_malloc);
>  
> +#if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
> +
> +#define STACK_DEPTH 8
> +
> +static noinline void
> +__topology_ref_save(struct drm_dp_mst_topology_mgr *mgr,
> +		    struct drm_dp_mst_topology_ref_history *history,
> +		    enum drm_dp_mst_topology_ref_type type)
> +{
> +	struct drm_dp_mst_topology_ref_entry *entry = NULL;
> +	depot_stack_handle_t backtrace;
> +	ulong stack_entries[STACK_DEPTH];
> +	uint n;
> +	int i;
> +
> +	n = stack_trace_save(stack_entries, ARRAY_SIZE(stack_entries), 1);
> +	backtrace = stack_depot_save(stack_entries, n, GFP_KERNEL);
> +	if (!backtrace)
> +		goto fail_alloc;
> +
> +	/* Try to find an existing entry for this backtrace */
> +	for (i = 0; i < history->len; i++) {
> +		if (history->entries[i].backtrace == backtrace) {
> +			entry = &history->entries[i];
> +			break;
> +		}
> +	}
> +
> +	/* Otherwise add one */
> +	if (!entry) {
> +		struct drm_dp_mst_topology_ref_entry *new;
> +		int new_len = history->len + 1;
> +
> +		new = krealloc(history->entries, sizeof(*new) * new_len,
> +			       GFP_KERNEL);
> +		if (!new)
> +			goto fail_alloc;
> +
> +		entry = &new[history->len];
> +		history->len = new_len;
> +		history->entries = new;
> +
> +		entry->backtrace = backtrace;
> +		entry->type = type;
> +		entry->count = 0;
> +	}
> +	entry->count++;
> +	entry->ts_nsec = ktime_get_ns();
> +
> +	return;
> +fail_alloc:
> +	DRM_WARN_ONCE("Failed to allocate memory for topology refcount backtrace\n");
> +}
> +
> +static int
> +topology_ref_history_cmp(const void *a, const void *b)
> +{
> +	const struct drm_dp_mst_topology_ref_entry *entry_a = a, *entry_b = b;
> +
> +	if (entry_a->ts_nsec > entry_b->ts_nsec)
> +		return 1;
> +	else if (entry_a->ts_nsec < entry_b->ts_nsec)
> +		return -1;
> +	else
> +		return 0;
> +}
> +
> +static inline const char *
> +topology_ref_type_to_str(enum drm_dp_mst_topology_ref_type type)
> +{
> +	if (type == DRM_DP_MST_TOPOLOGY_REF_GET)
> +		return "get";
> +	else
> +		return "put";
> +}
> +
> +static void
> +__dump_topology_ref_history(struct drm_dp_mst_topology_ref_history *history,
> +			    void *ptr, const char *type_str)
> +{
> +	struct drm_printer p = drm_debug_printer(DBG_PREFIX);
> +	char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
> +	int i;
> +
> +	if (!buf)
> +		return;
> +
> +	if (!history->len)
> +		goto out;
> +
> +	/* First, sort the list so that it goes from oldest to newest
> +	 * reference entry
> +	 */
> +	sort(history->entries, history->len, sizeof(*history->entries),
> +	     topology_ref_history_cmp, NULL);
> +
> +	drm_printf(&p,
> +		   "%s (%p/%px) topology count reached 0, dumping history:\n",
> +		   type_str, ptr, ptr);
> +
> +	for (i = 0; i < history->len; i++) {
> +		const struct drm_dp_mst_topology_ref_entry *entry =
> +			&history->entries[i];
> +		ulong *entries;
> +		uint nr_entries;
> +		u64 ts_nsec = entry->ts_nsec;
> +		u64 rem_nsec = do_div(ts_nsec, 1000000000);
> +
> +		nr_entries = stack_depot_fetch(entry->backtrace, &entries);
> +		stack_trace_snprint(buf, PAGE_SIZE, entries, nr_entries, 4);
> +
> +		drm_printf(&p, "  %d %ss (last at %5llu.%06llu):\n%s",
> +			   entry->count,
> +			   topology_ref_type_to_str(entry->type),
> +			   ts_nsec, rem_nsec / 1000, buf);
> +	}
> +
> +	/* Now free the history, since this is the only time we expose it */
> +	kfree(history->entries);
> +out:
> +	kfree(buf);
> +}
> +
> +static __always_inline void
> +drm_dp_mst_dump_mstb_topology_history(struct drm_dp_mst_branch *mstb)
> +{
> +	__dump_topology_ref_history(&mstb->topology_ref_history, mstb,
> +				    "MSTB");
> +}
> +
> +static __always_inline void
> +drm_dp_mst_dump_port_topology_history(struct drm_dp_mst_port *port)
> +{
> +	__dump_topology_ref_history(&port->topology_ref_history, port,
> +				    "Port");
> +}
> +
> +static __always_inline void
> +save_mstb_topology_ref(struct drm_dp_mst_branch *mstb,
> +		       enum drm_dp_mst_topology_ref_type type)
> +{
> +	__topology_ref_save(mstb->mgr, &mstb->topology_ref_history, type);
> +}
> +
> +static __always_inline void
> +save_port_topology_ref(struct drm_dp_mst_port *port,
> +		       enum drm_dp_mst_topology_ref_type type)
> +{
> +	__topology_ref_save(port->mgr, &port->topology_ref_history, type);
> +}
> +
> +static inline void
> +topology_ref_history_lock(struct drm_dp_mst_topology_mgr *mgr)
> +{
> +	mutex_lock(&mgr->topology_ref_history_lock);
> +}
> +
> +static inline void
> +topology_ref_history_unlock(struct drm_dp_mst_topology_mgr *mgr)
> +{
> +	mutex_unlock(&mgr->topology_ref_history_lock);
> +}
> +#else
> +static inline void
> +topology_ref_history_lock(struct drm_dp_mst_topology_mgr *mgr) {}
> +static inline void
> +topology_ref_history_unlock(struct drm_dp_mst_topology_mgr *mgr) {}
> +static inline void
> +drm_dp_mst_dump_mstb_topology_history(struct drm_dp_mst_branch *mstb) {}
> +static inline void
> +drm_dp_mst_dump_port_topology_history(struct drm_dp_mst_port *port) {}
> +#define save_mstb_topology_ref(mstb, type)
> +#define save_port_topology_ref(port, type)
> +#endif
> +
>  static void drm_dp_destroy_mst_branch_device(struct kref *kref)
>  {
>  	struct drm_dp_mst_branch *mstb =
>  		container_of(kref, struct drm_dp_mst_branch, topology_kref);
>  	struct drm_dp_mst_topology_mgr *mgr = mstb->mgr;
>  
> +	drm_dp_mst_dump_mstb_topology_history(mstb);
> +
>  	INIT_LIST_HEAD(&mstb->destroy_next);
>  
>  	/*
> @@ -1448,11 +1632,18 @@ static void drm_dp_destroy_mst_branch_device(struct kref *kref)
>  static int __must_check
>  drm_dp_mst_topology_try_get_mstb(struct drm_dp_mst_branch *mstb)
>  {
> -	int ret = kref_get_unless_zero(&mstb->topology_kref);
> +	int ret;
>  
> -	if (ret)
> +	topology_ref_history_lock(mstb->mgr);
> +	ret = kref_get_unless_zero(&mstb->topology_kref);
> +
> +	if (ret) {
>  		DRM_DEBUG("mstb %p/%px (%d)\n",
>  			  mstb, mstb, kref_read(&mstb->topology_kref));
> +		save_mstb_topology_ref(mstb, DRM_DP_MST_TOPOLOGY_REF_GET);
> +	}
> +
> +	topology_ref_history_unlock(mstb->mgr);
>  
>  	return ret;
>  }
> @@ -1473,10 +1664,15 @@ drm_dp_mst_topology_try_get_mstb(struct drm_dp_mst_branch *mstb)
>   */
>  static void drm_dp_mst_topology_get_mstb(struct drm_dp_mst_branch *mstb)
>  {
> +	topology_ref_history_lock(mstb->mgr);
> +
> +	save_mstb_topology_ref(mstb, DRM_DP_MST_TOPOLOGY_REF_GET);
>  	WARN_ON(kref_read(&mstb->topology_kref) == 0);
>  	kref_get(&mstb->topology_kref);
>  	DRM_DEBUG("mstb %p/%px (%d)\n",
>  		  mstb, mstb, kref_read(&mstb->topology_kref));
> +
> +	topology_ref_history_unlock(mstb->mgr);
>  }
>  
>  /**
> @@ -1494,9 +1690,14 @@ static void drm_dp_mst_topology_get_mstb(struct drm_dp_mst_branch *mstb)
>  static void
>  drm_dp_mst_topology_put_mstb(struct drm_dp_mst_branch *mstb)
>  {
> +	topology_ref_history_lock(mstb->mgr);
> +
>  	DRM_DEBUG("mstb %p/%px (%d)\n",
>  		  mstb, mstb, kref_read(&mstb->topology_kref) - 1);
> +	save_mstb_topology_ref(mstb, DRM_DP_MST_TOPOLOGY_REF_PUT);
>  	kref_put(&mstb->topology_kref, drm_dp_destroy_mst_branch_device);
> +
> +	topology_ref_history_unlock(mstb->mgr);
>  }
>  
>  static void drm_dp_destroy_port(struct kref *kref)
> @@ -1505,6 +1706,8 @@ static void drm_dp_destroy_port(struct kref *kref)
>  		container_of(kref, struct drm_dp_mst_port, topology_kref);
>  	struct drm_dp_mst_topology_mgr *mgr = port->mgr;
>  
> +	drm_dp_mst_dump_port_topology_history(port);
> +
>  	/* There's nothing that needs locking to destroy an input port yet */
>  	if (port->input) {
>  		drm_dp_mst_put_port_malloc(port);
> @@ -1548,12 +1751,18 @@ static void drm_dp_destroy_port(struct kref *kref)
>  static int __must_check
>  drm_dp_mst_topology_try_get_port(struct drm_dp_mst_port *port)
>  {
> -	int ret = kref_get_unless_zero(&port->topology_kref);
> +	int ret;
> +
> +	topology_ref_history_lock(port->mgr);
> +	ret = kref_get_unless_zero(&port->topology_kref);
>  
> -	if (ret)
> +	if (ret) {
>  		DRM_DEBUG("port %p/%px (%d)\n",
>  			  port, port, kref_read(&port->topology_kref));
> +		save_port_topology_ref(port, DRM_DP_MST_TOPOLOGY_REF_GET);
> +	}
>  
> +	topology_ref_history_unlock(port->mgr);
>  	return ret;
>  }
>  
> @@ -1572,10 +1781,15 @@ drm_dp_mst_topology_try_get_port(struct drm_dp_mst_port *port)
>   */
>  static void drm_dp_mst_topology_get_port(struct drm_dp_mst_port *port)
>  {
> +	topology_ref_history_lock(port->mgr);
> +
>  	WARN_ON(kref_read(&port->topology_kref) == 0);
>  	kref_get(&port->topology_kref);
>  	DRM_DEBUG("port %p/%px (%d)\n",
>  		  port, port, kref_read(&port->topology_kref));
> +	save_port_topology_ref(port, DRM_DP_MST_TOPOLOGY_REF_GET);
> +
> +	topology_ref_history_unlock(port->mgr);
>  }
>  
>  /**
> @@ -1591,9 +1805,14 @@ static void drm_dp_mst_topology_get_port(struct drm_dp_mst_port *port)
>   */
>  static void drm_dp_mst_topology_put_port(struct drm_dp_mst_port *port)
>  {
> +	topology_ref_history_lock(port->mgr);
> +
>  	DRM_DEBUG("port %p/%px (%d)\n",
>  		  port, port, kref_read(&port->topology_kref) - 1);
> +	save_port_topology_ref(port, DRM_DP_MST_TOPOLOGY_REF_PUT);
>  	kref_put(&port->topology_kref, drm_dp_destroy_port);
> +
> +	topology_ref_history_unlock(port->mgr);
>  }
>  
>  static struct drm_dp_mst_branch *
> @@ -4548,6 +4767,9 @@ int drm_dp_mst_topology_mgr_init(struct drm_dp_mst_topology_mgr *mgr,
>  	mutex_init(&mgr->payload_lock);
>  	mutex_init(&mgr->delayed_destroy_lock);
>  	mutex_init(&mgr->up_req_lock);
> +#if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
> +	mutex_init(&mgr->topology_ref_history_lock);
> +#endif
>  	INIT_LIST_HEAD(&mgr->tx_msg_downq);
>  	INIT_LIST_HEAD(&mgr->destroy_port_list);
>  	INIT_LIST_HEAD(&mgr->destroy_branch_device_list);
> @@ -4613,6 +4835,9 @@ void drm_dp_mst_topology_mgr_destroy(struct drm_dp_mst_topology_mgr *mgr)
>  	mutex_destroy(&mgr->qlock);
>  	mutex_destroy(&mgr->lock);
>  	mutex_destroy(&mgr->up_req_lock);
> +#if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
> +	mutex_destroy(&mgr->topology_ref_history_lock);
> +#endif
>  }
>  EXPORT_SYMBOL(drm_dp_mst_topology_mgr_destroy);
>  
> diff --git a/include/drm/drm_dp_mst_helper.h b/include/drm/drm_dp_mst_helper.h
> index 1bdee5ee6dcd..75b8fba6f399 100644
> --- a/include/drm/drm_dp_mst_helper.h
> +++ b/include/drm/drm_dp_mst_helper.h
> @@ -26,6 +26,26 @@
>  #include <drm/drm_dp_helper.h>
>  #include <drm/drm_atomic.h>
>  
> +#if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
> +#include <linux/stackdepot.h>
> +#include <linux/timekeeping.h>
> +
> +enum drm_dp_mst_topology_ref_type {
> +	DRM_DP_MST_TOPOLOGY_REF_GET,
> +	DRM_DP_MST_TOPOLOGY_REF_PUT,
> +};
> +
> +struct drm_dp_mst_topology_ref_history {
> +	struct drm_dp_mst_topology_ref_entry {
> +		enum drm_dp_mst_topology_ref_type type;
> +		int count;
> +		ktime_t ts_nsec;
> +		depot_stack_handle_t backtrace;
> +	} *entries;
> +	int len;
> +};
> +#endif /* IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS) */
> +
>  struct drm_dp_mst_branch;
>  
>  /**
> @@ -92,6 +112,14 @@ struct drm_dp_mst_port {
>  	 */
>  	struct kref malloc_kref;
>  
> +#if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
> +	/**
> +	 * @topology_ref_history: A history of each topology
> +	 * reference/dereference. See CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS.
> +	 */
> +	struct drm_dp_mst_topology_ref_history topology_ref_history;
> +#endif
> +
>  	u8 port_num;
>  	bool input;
>  	bool mcs;
> @@ -162,6 +190,14 @@ struct drm_dp_mst_branch {
>  	 */
>  	struct kref malloc_kref;
>  
> +#if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
> +	/**
> +	 * @topology_ref_history: A history of each topology
> +	 * reference/dereference. See CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS.
> +	 */
> +	struct drm_dp_mst_topology_ref_history topology_ref_history;
> +#endif
> +
>  	/**
>  	 * @destroy_next: linked-list entry used by
>  	 * drm_dp_delayed_destroy_work()
> @@ -630,6 +666,15 @@ struct drm_dp_mst_topology_mgr {
>  	 * transmissions.
>  	 */
>  	struct work_struct up_req_work;
> +
> +#if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
> +	/**
> +	 * @topology_ref_history_lock: protects
> +	 * &drm_dp_mst_port.topology_ref_history and
> +	 * &drm_dp_mst_branch.topology_ref_history.
> +	 */
> +	struct mutex topology_ref_history_lock;
> +#endif
>  };
>  
>  int drm_dp_mst_topology_mgr_init(struct drm_dp_mst_topology_mgr *mgr,
> -- 
> 2.21.0
> 

-- 
Sean Paul, Software Engineer, Google / Chromium OS


More information about the amd-gfx mailing list