[PATCH 2/2] drm/xe/guc: Allow CTB G2H processing without G2H IRQ

Matthew Brost matthew.brost at intel.com
Wed Jun 5 21:44:02 UTC 2024


On Wed, Jun 05, 2024 at 10:19:47PM +0200, Michal Wajdeczko wrote:
> During early initialization, in the xe_guc_min_load_for_hwconfig()
> function, we are successfully enabling CTB communication, but it
> will only allow us to send non-blocking H2G messages, as due to
> not yet enabled IRQs, including G2H IRQs, we will not notice any
> new G2H message sent by the GuC, including replies to our blocking
> H2G request messages. And those successful replies are mandatory
> for the VF drivers to continue normal operations.
> 
> As attempt to workaround this driver initialization ordering issue,
> introduce special safe-mode CTB worker, that will periodically
> trigger G2H processing, like original IRQ handler, in case no
> MSI/MSIX IRQs were enabled on the driver yet. Once we detect that
> IRQ were enabled, we will stop this worker.
> 
> Signed-off-by: Michal Wajdeczko <michal.wajdeczko at intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_guc_ct.c       | 42 ++++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_guc_ct_types.h |  2 ++
>  2 files changed, 44 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> index 17f0e5019bb8..b8155edb7f07 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> @@ -127,6 +127,7 @@ static void guc_ct_fini(struct drm_device *drm, void *arg)
>  }
>  
>  static void g2h_worker_func(struct work_struct *w);
> +static void safe_mode_worker_func(struct work_struct *w);
>  
>  static void primelockdep(struct xe_guc_ct *ct)
>  {
> @@ -155,6 +156,7 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
>  	spin_lock_init(&ct->fast_lock);
>  	xa_init(&ct->fence_lookup);
>  	INIT_WORK(&ct->g2h_worker, g2h_worker_func);
> +	INIT_DELAYED_WORK(&ct->safe_mode_worker,  safe_mode_worker_func);
>  	init_waitqueue_head(&ct->wq);
>  	init_waitqueue_head(&ct->g2h_fence_wq);
>  
> @@ -321,6 +323,42 @@ static void xe_guc_ct_set_state(struct xe_guc_ct *ct,
>  	mutex_unlock(&ct->lock);
>  }
>  
> +static bool ct_needs_safe_mode(struct xe_guc_ct *ct)
> +{
> +	return !pci_dev_msi_enabled(to_pci_dev(ct_to_xe(ct)->drm.dev));
> +}
> +
> +static bool ct_restart_safe_mode_worker(struct xe_guc_ct *ct)
> +{
> +	if (!ct_needs_safe_mode(ct))
> +		return false;
> +
> +	queue_delayed_work(ct->g2h_wq, &ct->safe_mode_worker, HZ / 10);
> +	return true;
> +}
> +
> +static void safe_mode_worker_func(struct work_struct *w)
> +{
> +	struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, safe_mode_worker.work);
> +
> +	xe_guc_ct_irq_handler(ct);

To build on my comment in patch 1. Rather than calling
xe_guc_ct_irq_handler here, can the we decouple the implementation of
g2h_worker_func into a new function and call that directly here? This
way the fast path isn't executed and this worker won't just kick another
worker? Maybe this has some implictions are suspend / resume / reset
(this worker might be pausable / flushed?) but it is likely worth while
to do this.

Matt

> +
> +	if (!ct_restart_safe_mode_worker(ct))
> +		xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode canceled\n");
> +}
> +
> +static void ct_enter_safe_mode(struct xe_guc_ct *ct)
> +{
> +	if (ct_restart_safe_mode_worker(ct))
> +		xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode enabled\n");
> +}
> +
> +static void ct_exit_safe_mode(struct xe_guc_ct *ct)
> +{
> +	if (cancel_delayed_work_sync(&ct->safe_mode_worker))
> +		xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode disabled\n");
> +}
> +
>  int xe_guc_ct_enable(struct xe_guc_ct *ct)
>  {
>  	struct xe_device *xe = ct_to_xe(ct);
> @@ -350,6 +388,9 @@ int xe_guc_ct_enable(struct xe_guc_ct *ct)
>  	wake_up_all(&ct->wq);
>  	xe_gt_dbg(gt, "GuC CT communication channel enabled\n");
>  
> +	if (ct_needs_safe_mode(ct))
> +		ct_enter_safe_mode(ct);
> +
>  	return 0;
>  
>  err_out:
> @@ -373,6 +414,7 @@ static void stop_g2h_handler(struct xe_guc_ct *ct)
>  void xe_guc_ct_disable(struct xe_guc_ct *ct)
>  {
>  	xe_guc_ct_set_state(ct, XE_GUC_CT_STATE_DISABLED);
> +	ct_exit_safe_mode(ct);
>  	stop_g2h_handler(ct);
>  }
>  
> diff --git a/drivers/gpu/drm/xe/xe_guc_ct_types.h b/drivers/gpu/drm/xe/xe_guc_ct_types.h
> index fede4c6e93cb..761cb9031298 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ct_types.h
> +++ b/drivers/gpu/drm/xe/xe_guc_ct_types.h
> @@ -110,6 +110,8 @@ struct xe_guc_ct {
>  	u32 g2h_outstanding;
>  	/** @g2h_worker: worker to process G2H messages */
>  	struct work_struct g2h_worker;
> +	/** @safe_mode_worker: worker to check G2H messages with IRQ disabled */
> +	struct delayed_work safe_mode_worker;
>  	/** @state: CT state */
>  	enum xe_guc_ct_state state;
>  	/** @fence_seqno: G2H fence seqno - 16 bits used by CT */
> -- 
> 2.43.0
> 


More information about the Intel-xe mailing list