[Intel-gfx] [PATCH v2 2/2] drm/i915/guc: Fix missing return code checks in submission init

Ceraolo Spurio, Daniele daniele.ceraolospurio at intel.com
Mon Mar 6 22:22:04 UTC 2023



On 2/17/2023 2:33 PM, John.C.Harrison at Intel.com wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
>
> The CI results for the 'fast request' patch set (enables error return
> codes for fire-and-forget H2G messages) hit an issue with the KMD
> sending context submission requests on an invalid context. That was
> caused by a fault injection probe failing the context creation of a
> kernel context. However, there was no return code checking on any of
> the kernel context registration paths. So the driver kept going and
> tried to use the kernel context for the record defaults process.
>
> This would not cause any actual problems. The invalid requests would
> be rejected by GuC and ultimately the start up sequence would
> correctly wedge due to the context creation failure. But fixing the
> issue correctly rather ignoring it means we won't get CI complaining
> when the fast request patch lands and enables the extra error checking.
>
> So fix it by checking for errors and aborting as appropriate when
> creating kernel contexts. While at it, clean up some other submission
> init related failure cleanup paths. Also, rename guc_init_lrc_mapping
> to guc_init_submission as the former name hasn't been valid in a long
> time.
>
> v2: Add another wrapper to keep the flow balanced (Daniele)
>
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>

Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>

Daniele

> ---
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 97 ++++++++++++++-----
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.h |  2 +-
>   drivers/gpu/drm/i915/gt/uc/intel_uc.c         |  7 +-
>   3 files changed, 80 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index a04d7049a2c2f..88e881b100cf0 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -1441,7 +1441,7 @@ static int guc_action_enable_usage_stats(struct intel_guc *guc)
>   	return intel_guc_send(guc, action, ARRAY_SIZE(action));
>   }
>   
> -static void guc_init_engine_stats(struct intel_guc *guc)
> +static int guc_init_engine_stats(struct intel_guc *guc)
>   {
>   	struct intel_gt *gt = guc_to_gt(guc);
>   	intel_wakeref_t wakeref;
> @@ -1454,6 +1454,13 @@ static void guc_init_engine_stats(struct intel_guc *guc)
>   		guc_err(guc, "Failed to enable usage stats: %pe\n", ERR_PTR(ret));
>   	else
>   		guc_enable_busyness_worker(guc);
> +
> +	return ret;
> +}
> +
> +static void guc_fini_engine_stats(struct intel_guc *guc)
> +{
> +	guc_cancel_busyness_worker(guc);
>   }
>   
>   void intel_guc_busyness_park(struct intel_gt *gt)
> @@ -4109,9 +4116,11 @@ static void guc_set_default_submission(struct intel_engine_cs *engine)
>   	engine->submit_request = guc_submit_request;
>   }
>   
> -static inline void guc_kernel_context_pin(struct intel_guc *guc,
> -					  struct intel_context *ce)
> +static inline int guc_kernel_context_pin(struct intel_guc *guc,
> +					 struct intel_context *ce)
>   {
> +	int ret;
> +
>   	/*
>   	 * Note: we purposefully do not check the returns below because
>   	 * the registration can only fail if a reset is just starting.
> @@ -4119,16 +4128,24 @@ static inline void guc_kernel_context_pin(struct intel_guc *guc,
>   	 * isn't happening and even it did this code would be run again.
>   	 */
>   
> -	if (context_guc_id_invalid(ce))
> -		pin_guc_id(guc, ce);
> +	if (context_guc_id_invalid(ce)) {
> +		ret = pin_guc_id(guc, ce);
> +
> +		if (ret < 0)
> +			return ret;
> +	}
>   
>   	if (!test_bit(CONTEXT_GUC_INIT, &ce->flags))
>   		guc_context_init(ce);
>   
> -	try_context_registration(ce, true);
> +	ret = try_context_registration(ce, true);
> +	if (ret)
> +		unpin_guc_id(guc, ce);
> +
> +	return ret;
>   }
>   
> -static inline void guc_init_lrc_mapping(struct intel_guc *guc)
> +static inline int guc_init_submission(struct intel_guc *guc)
>   {
>   	struct intel_gt *gt = guc_to_gt(guc);
>   	struct intel_engine_cs *engine;
> @@ -4155,9 +4172,17 @@ static inline void guc_init_lrc_mapping(struct intel_guc *guc)
>   		struct intel_context *ce;
>   
>   		list_for_each_entry(ce, &engine->pinned_contexts_list,
> -				    pinned_contexts_link)
> -			guc_kernel_context_pin(guc, ce);
> +				    pinned_contexts_link) {
> +			int ret = guc_kernel_context_pin(guc, ce);
> +
> +			if (ret) {
> +				/* No point in trying to clean up as i915 will wedge on failure */
> +				return ret;
> +			}
> +		}
>   	}
> +
> +	return 0;
>   }
>   
>   static void guc_release(struct intel_engine_cs *engine)
> @@ -4400,31 +4425,57 @@ static int guc_init_global_schedule_policy(struct intel_guc *guc)
>   	return ret;
>   }
>   
> -void intel_guc_submission_enable(struct intel_guc *guc)
> +static void guc_route_semaphores(struct intel_guc *guc, bool to_guc)
>   {
>   	struct intel_gt *gt = guc_to_gt(guc);
> +	u32 val;
>   
> -	/* Enable and route to GuC */
> -	if (GRAPHICS_VER(gt->i915) >= 12)
> -		intel_uncore_write(gt->uncore, GEN12_GUC_SEM_INTR_ENABLES,
> -				   GUC_SEM_INTR_ROUTE_TO_GUC |
> -				   GUC_SEM_INTR_ENABLE_ALL);
> +	if (GRAPHICS_VER(gt->i915) < 12)
> +		return;
> +
> +	if (to_guc)
> +		val = GUC_SEM_INTR_ROUTE_TO_GUC | GUC_SEM_INTR_ENABLE_ALL;
> +	else
> +		val = 0;
> +
> +	intel_uncore_write(gt->uncore, GEN12_GUC_SEM_INTR_ENABLES, val);
> +}
> +
> +int intel_guc_submission_enable(struct intel_guc *guc)
> +{
> +	int ret;
> +
> +	/* Semaphore interrupt enable and route to GuC */
> +	guc_route_semaphores(guc, true);
> +
> +	ret = guc_init_submission(guc);
> +	if (ret)
> +		goto fail_sem;
> +
> +	ret = guc_init_engine_stats(guc);
> +	if (ret)
> +		goto fail_sem;
> +
> +	ret = guc_init_global_schedule_policy(guc);
> +	if (ret)
> +		goto fail_stats;
>   
> -	guc_init_lrc_mapping(guc);
> -	guc_init_engine_stats(guc);
> -	guc_init_global_schedule_policy(guc);
> +	return 0;
> +
> +fail_stats:
> +	guc_fini_engine_stats(guc);
> +fail_sem:
> +	guc_route_semaphores(guc, false);
> +	return ret;
>   }
>   
>   /* Note: By the time we're here, GuC may have already been reset */
>   void intel_guc_submission_disable(struct intel_guc *guc)
>   {
> -	struct intel_gt *gt = guc_to_gt(guc);
> -
>   	guc_cancel_busyness_worker(guc);
>   
> -	/* Disable and route to host */
> -	if (GRAPHICS_VER(gt->i915) >= 12)
> -		intel_uncore_write(gt->uncore, GEN12_GUC_SEM_INTR_ENABLES, 0x0);
> +	/* Semaphore interrupt disable and route to host */
> +	guc_route_semaphores(guc, false);
>   }
>   
>   static bool __guc_submission_supported(struct intel_guc *guc)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> index 5a95a9f0a8e31..c57b29cdb1a64 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
> @@ -15,7 +15,7 @@ struct intel_engine_cs;
>   
>   void intel_guc_submission_init_early(struct intel_guc *guc);
>   int intel_guc_submission_init(struct intel_guc *guc);
> -void intel_guc_submission_enable(struct intel_guc *guc);
> +int intel_guc_submission_enable(struct intel_guc *guc);
>   void intel_guc_submission_disable(struct intel_guc *guc);
>   void intel_guc_submission_fini(struct intel_guc *guc);
>   int intel_guc_preempt_work_create(struct intel_guc *guc);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> index 6648691bd6450..d42671dde53b3 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> @@ -531,8 +531,11 @@ static int __uc_init_hw(struct intel_uc *uc)
>   	else
>   		intel_huc_auth(huc);
>   
> -	if (intel_uc_uses_guc_submission(uc))
> -		intel_guc_submission_enable(guc);
> +	if (intel_uc_uses_guc_submission(uc)) {
> +		ret = intel_guc_submission_enable(guc);
> +		if (ret)
> +			goto err_log_capture;
> +	}
>   
>   	if (intel_uc_uses_guc_slpc(uc)) {
>   		ret = intel_guc_slpc_enable(&guc->slpc);



More information about the dri-devel mailing list