[Intel-gfx] [PATCH 34/47] drm/i915/guc: Suspend/resume implementation for new interface

Mon Jul 12 22:56:19 UTC 2021

On 6/24/2021 00:05, Matthew Brost wrote:
> The new GuC interface introduces an MMIO H2G command,
> INTEL_GUC_ACTION_RESET_CLIENT, which is used to implement suspend. This
> MMIO tears down any active contexts generating a context reset G2H CTB
> for each. Once that step completes the GuC tears down the CTB
> channels. It is safe to suspend once this MMIO H2G command completes
> and all G2H CTBs have been processed. In practice the i915 will likely
> never receive a G2H as suspend should only be called after the GPU is
> idle.
>
> Resume is implemented in the same manner as before - simply reload the
> GuC firmware and reinitialize everything (e.g. CTB channels, contexts,
> etc..).
>
> Cc: John Harrison <john.c.harrison at intel.com>
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> Signed-off-by: Michal Wajdeczko <michal.wajdeczko at intel.com>
Reviewed-by: John Harrison <John.C.Harrison at Intel.com?

> ---
>   .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |  1 +
>   drivers/gpu/drm/i915/gt/uc/intel_guc.c        | 64 ++++++++-----------
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 14 ++--
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.h |  5 ++
>   drivers/gpu/drm/i915/gt/uc/intel_uc.c         | 20 ++++--
>   5 files changed, 53 insertions(+), 51 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
> index 57e18babdf4b..596cf4b818e5 100644
> --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
> +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
> @@ -142,6 +142,7 @@ enum intel_guc_action {
>   	INTEL_GUC_ACTION_REGISTER_COMMAND_TRANSPORT_BUFFER = 0x4505,
>   	INTEL_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506,
>   	INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
> +	INTEL_GUC_ACTION_RESET_CLIENT = 0x5B01,
>   	INTEL_GUC_ACTION_LIMIT
>   };
>   
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> index 9b09395b998f..68266cbffd1f 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> @@ -524,51 +524,34 @@ int intel_guc_auth_huc(struct intel_guc *guc, u32 rsa_offset)
>    */
>   int intel_guc_suspend(struct intel_guc *guc)
>   {
> -	struct intel_uncore *uncore = guc_to_gt(guc)->uncore;
>   	int ret;
> -	u32 status;
>   	u32 action[] = {
> -		INTEL_GUC_ACTION_ENTER_S_STATE,
> -		GUC_POWER_D1, /* any value greater than GUC_POWER_D0 */
> +		INTEL_GUC_ACTION_RESET_CLIENT,
>   	};
>   
> -	/*
> -	 * If GuC communication is enabled but submission is not supported,
> -	 * we do not need to suspend the GuC.
> -	 */
> -	if (!intel_guc_submission_is_used(guc) || !intel_guc_is_ready(guc))
> +	if (!intel_guc_is_ready(guc))
>   		return 0;
>   
> -	/*
> -	 * The ENTER_S_STATE action queues the save/restore operation in GuC FW
> -	 * and then returns, so waiting on the H2G is not enough to guarantee
> -	 * GuC is done. When all the processing is done, GuC writes
> -	 * INTEL_GUC_SLEEP_STATE_SUCCESS to scratch register 14, so we can poll
> -	 * on that. Note that GuC does not ensure that the value in the register
> -	 * is different from INTEL_GUC_SLEEP_STATE_SUCCESS while the action is
> -	 * in progress so we need to take care of that ourselves as well.
> -	 */
> -
> -	intel_uncore_write(uncore, SOFT_SCRATCH(14),
> -			   INTEL_GUC_SLEEP_STATE_INVALID_MASK);
> -
> -	ret = intel_guc_send(guc, action, ARRAY_SIZE(action));
> -	if (ret)
> -		return ret;
> -
> -	ret = __intel_wait_for_register(uncore, SOFT_SCRATCH(14),
> -					INTEL_GUC_SLEEP_STATE_INVALID_MASK,
> -					0, 0, 10, &status);
> -	if (ret)
> -		return ret;
> -
> -	if (status != INTEL_GUC_SLEEP_STATE_SUCCESS) {
> -		DRM_ERROR("GuC failed to change sleep state. "
> -			  "action=0x%x, err=%u\n",
> -			  action[0], status);
> -		return -EIO;
> +	if (intel_guc_submission_is_used(guc)) {
> +		/*
> +		 * This H2G MMIO command tears down the GuC in two steps. First it will
> +		 * generate a G2H CTB for every active context indicating a reset. In
> +		 * practice the i915 shouldn't ever get a G2H as suspend should only be
> +		 * called when the GPU is idle. Next, it tears down the CTBs and this
> +		 * H2G MMIO command completes.
> +		 *
> +		 * Don't abort on a failure code from the GuC. Keep going and do the
> +		 * clean up in santize() and re-initialisation on resume and hopefully
> +		 * the error here won't be problematic.
> +		 */
> +		ret = intel_guc_send_mmio(guc, action, ARRAY_SIZE(action), NULL, 0);
> +		if (ret)
> +			DRM_ERROR("GuC suspend: RESET_CLIENT action failed with error %d!\n", ret);
>   	}
>   
> +	/* Signal that the GuC isn't running. */
> +	intel_guc_sanitize(guc);
> +
>   	return 0;
>   }
>   
> @@ -578,7 +561,12 @@ int intel_guc_suspend(struct intel_guc *guc)
>    */
>   int intel_guc_resume(struct intel_guc *guc)
>   {
> -	/* XXX: to be implemented with submission interface rework */
> +	/*
> +	 * NB: This function can still be called even if GuC submission is
> +	 * disabled, e.g. if GuC is enabled for HuC authentication only. Thus,
> +	 * if any code is later added here, it must be support doing nothing
> +	 * if submission is disabled (as per intel_guc_suspend).
> +	 */
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 59fca9748c15..16b61fe71b07 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -304,10 +304,10 @@ static int guc_submission_busy_loop(struct intel_guc* guc,
>   	return err;
>   }
>   
> -static int guc_wait_for_pending_msg(struct intel_guc *guc,
> -				    atomic_t *wait_var,
> -				    bool interruptible,
> -				    long timeout)
> +int intel_guc_wait_for_pending_msg(struct intel_guc *guc,
> +				   atomic_t *wait_var,
> +				   bool interruptible,
> +				   long timeout)
>   {
>   	const int state = interruptible ?
>   		TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
> @@ -352,8 +352,8 @@ int intel_guc_wait_for_idle(struct intel_guc *guc, long timeout)
>   	if (unlikely(timeout < 0))
>   		timeout = -timeout, interruptible = false;
>   
> -	return guc_wait_for_pending_msg(guc, &guc->outstanding_submission_g2h,
> -					interruptible, timeout);
> +	return intel_guc_wait_for_pending_msg(guc, &guc->outstanding_submission_g2h,
> +					      interruptible, timeout);
>   }
>   
>   static int guc_lrc_desc_pin(struct intel_context *ce, bool loop);
> @@ -625,7 +625,7 @@ void intel_guc_submission_reset_prepare(struct intel_guc *guc)
>   	for (i = 0; i < 4 && atomic_read(&guc->outstanding_submission_g2h); ++i) {
>   		intel_guc_to_host_event_handler(guc);
>   #define wait_for_reset(guc, wait_var) \
> -		guc_wait_for_pending_msg(guc, wait_var, false, (HZ / 20))
> +		intel_guc_wait_for_pending_msg(guc, wait_var, false, (HZ / 20))
>   		do {
>   			wait_for_reset(guc, &guc->outstanding_submission_g2h);
>   		} while (!list_empty(&guc->ct.requests.incoming));
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> index 95df5ab06031..b9b9f0f60f91 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> @@ -27,6 +27,11 @@ void intel_guc_log_context_info(struct intel_guc *guc, struct drm_printer *p);
>   
>   bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
>   
> +int intel_guc_wait_for_pending_msg(struct intel_guc *guc,
> +				   atomic_t *wait_var,
> +				   bool interruptible,
> +				   long timeout);
> +
>   static inline bool intel_guc_submission_is_supported(struct intel_guc *guc)
>   {
>   	/* XXX: GuC submission is unavailable for now */
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> index ab11fe731ee7..b523a8521351 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> @@ -596,14 +596,18 @@ void intel_uc_cancel_requests(struct intel_uc *uc)
>   void intel_uc_runtime_suspend(struct intel_uc *uc)
>   {
>   	struct intel_guc *guc = &uc->guc;
> -	int err;
>   
>   	if (!intel_guc_is_ready(guc))
>   		return;
>   
> -	err = intel_guc_suspend(guc);
> -	if (err)
> -		DRM_DEBUG_DRIVER("Failed to suspend GuC, err=%d", err);
> +	/*
> +	 * Wait for any outstanding CTB before tearing down communication /w the
> +	 * GuC.
> +	 */
> +#define OUTSTANDING_CTB_TIMEOUT_PERIOD	(HZ / 5)
> +	intel_guc_wait_for_pending_msg(guc, &guc->outstanding_submission_g2h,
> +				       false, OUTSTANDING_CTB_TIMEOUT_PERIOD);
> +	GEM_WARN_ON(atomic_read(&guc->outstanding_submission_g2h));
>   
>   	guc_disable_communication(guc);
>   }
> @@ -612,12 +616,16 @@ void intel_uc_suspend(struct intel_uc *uc)
>   {
>   	struct intel_guc *guc = &uc->guc;
>   	intel_wakeref_t wakeref;
> +	int err;
>   
>   	if (!intel_guc_is_ready(guc))
>   		return;
>   
> -	with_intel_runtime_pm(uc_to_gt(uc)->uncore->rpm, wakeref)
> -		intel_uc_runtime_suspend(uc);
> +	with_intel_runtime_pm(&uc_to_gt(uc)->i915->runtime_pm, wakeref) {
> +		err = intel_guc_suspend(guc);
> +		if (err)
> +			DRM_DEBUG_DRIVER("Failed to suspend GuC, err=%d", err);
> +	}
>   }
>   
>   static int __uc_resume(struct intel_uc *uc, bool enable_communication)