[PATCH 05/12] accel/habanalabs: print max timeout value on CS stuck

Ofir Bitton obitton at habana.ai
Wed May 17 18:01:31 UTC 2023


On 16/05/2023 12:30, Oded Gabbay wrote:
> If a workload gets stuck, we print an error about it to the kernel log.
> Add the configured max timeout value to that print, because the value
> is not fixed between ASICs and can also be configured using a kernel
> module parameter.
>
> Signed-off-by: Oded Gabbay <ogabbay at kernel.org>
> ---
>   .../habanalabs/common/command_submission.c    | 26 +++++++++++--------
>   1 file changed, 15 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c
> index ccf68f482948..4ec28af3ed78 100644
> --- a/drivers/accel/habanalabs/common/command_submission.c
> +++ b/drivers/accel/habanalabs/common/command_submission.c
> @@ -804,12 +804,14 @@ static void cs_do_release(struct kref *ref)
>   
>   static void cs_timedout(struct work_struct *work)
>   {
> +	struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work);
> +	bool skip_reset_on_timeout, device_reset = false;
>   	struct hl_device *hdev;
>   	u64 event_mask = 0x0;
> +	uint timeout_sec;
>   	int rc;
> -	struct hl_cs *cs = container_of(work, struct hl_cs,
> -						 work_tdr.work);
> -	bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false;
> +
> +	skip_reset_on_timeout = cs->skip_reset_on_timeout;
>   
>   	rc = cs_get_unless_zero(cs);
>   	if (!rc)
> @@ -840,29 +842,31 @@ static void cs_timedout(struct work_struct *work)
>   		event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
>   	}
>   
> +	timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000;
> +
>   	switch (cs->type) {
>   	case CS_TYPE_SIGNAL:
>   		dev_err(hdev->dev,
> -			"Signal command submission %llu has not finished in time!\n",
> -			cs->sequence);
> +			"Signal command submission %llu has not finished in %u seconds!\n",
> +			cs->sequence, timeout_sec);
>   		break;
>   
>   	case CS_TYPE_WAIT:
>   		dev_err(hdev->dev,
> -			"Wait command submission %llu has not finished in time!\n",
> -			cs->sequence);
> +			"Wait command submission %llu has not finished in %u seconds!\n",
> +			cs->sequence, timeout_sec);
>   		break;
>   
>   	case CS_TYPE_COLLECTIVE_WAIT:
>   		dev_err(hdev->dev,
> -			"Collective Wait command submission %llu has not finished in time!\n",
> -			cs->sequence);
> +			"Collective Wait command submission %llu has not finished in %u seconds!\n",
> +			cs->sequence, timeout_sec);
>   		break;
>   
>   	default:
>   		dev_err(hdev->dev,
> -			"Command submission %llu has not finished in time!\n",
> -			cs->sequence);
> +			"Command submission %llu has not finished in %u seconds!\n",
> +			cs->sequence, timeout_sec);
>   		break;
>   	}
>   

Reviewed-by: Ofir Bitton <obitton at habana.ai>



More information about the dri-devel mailing list