[PATCH 05/12] accel/habanalabs: print max timeout value on CS stuck
Ofir Bitton
obitton at habana.ai
Wed May 17 18:01:31 UTC 2023
On 16/05/2023 12:30, Oded Gabbay wrote:
> If a workload gets stuck, we print an error about it to the kernel log.
> Add the configured max timeout value to that print, because the value
> differs between ASICs and can also be configured using a kernel module
> parameter.
>
> Signed-off-by: Oded Gabbay <ogabbay at kernel.org>
> ---
> .../habanalabs/common/command_submission.c | 26 +++++++++++--------
> 1 file changed, 15 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c
> index ccf68f482948..4ec28af3ed78 100644
> --- a/drivers/accel/habanalabs/common/command_submission.c
> +++ b/drivers/accel/habanalabs/common/command_submission.c
> @@ -804,12 +804,14 @@ static void cs_do_release(struct kref *ref)
>
> static void cs_timedout(struct work_struct *work)
> {
> + struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work);
> + bool skip_reset_on_timeout, device_reset = false;
> struct hl_device *hdev;
> u64 event_mask = 0x0;
> + uint timeout_sec;
> int rc;
> - struct hl_cs *cs = container_of(work, struct hl_cs,
> - work_tdr.work);
> - bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false;
> +
> + skip_reset_on_timeout = cs->skip_reset_on_timeout;
>
> rc = cs_get_unless_zero(cs);
> if (!rc)
> @@ -840,29 +842,31 @@ static void cs_timedout(struct work_struct *work)
> event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
> }
>
> + timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000;
> +
> switch (cs->type) {
> case CS_TYPE_SIGNAL:
> dev_err(hdev->dev,
> - "Signal command submission %llu has not finished in time!\n",
> - cs->sequence);
> + "Signal command submission %llu has not finished in %u seconds!\n",
> + cs->sequence, timeout_sec);
> break;
>
> case CS_TYPE_WAIT:
> dev_err(hdev->dev,
> - "Wait command submission %llu has not finished in time!\n",
> - cs->sequence);
> + "Wait command submission %llu has not finished in %u seconds!\n",
> + cs->sequence, timeout_sec);
> break;
>
> case CS_TYPE_COLLECTIVE_WAIT:
> dev_err(hdev->dev,
> - "Collective Wait command submission %llu has not finished in time!\n",
> - cs->sequence);
> + "Collective Wait command submission %llu has not finished in %u seconds!\n",
> + cs->sequence, timeout_sec);
> break;
>
> default:
> dev_err(hdev->dev,
> - "Command submission %llu has not finished in time!\n",
> - cs->sequence);
> + "Command submission %llu has not finished in %u seconds!\n",
> + cs->sequence, timeout_sec);
> break;
> }
>
Reviewed-by: Ofir Bitton <obitton at habana.ai>
More information about the dri-devel mailing list