[systemd-devel] [PATCHv3] core: send sigabrt on watchdog timeout to get the stacktrace
Lennart Poettering
lennart at poettering.net
Tue Oct 28 10:11:48 PDT 2014
On Tue, 28.10.14 16:35, Umut Tezduyar Lindskog (umut.tezduyar at axis.com) wrote:
Applied! Thanks!
> if sigabrt doesn't do the job, follow regular shutdown
> routine, sigterm > sigkill.
> ---
> TODO | 2 --
> man/systemd.service.xml | 5 +++--
> src/core/busname.c | 2 +-
> src/core/mount.c | 3 ++-
> src/core/scope.c | 2 +-
> src/core/service.c | 37 ++++++++++++++++++++++++++-----------
> src/core/service.h | 1 +
> src/core/socket.c | 3 ++-
> src/core/swap.c | 3 ++-
> src/core/unit.c | 24 ++++++++++++++++++------
> src/core/unit.h | 8 +++++++-
> 11 files changed, 63 insertions(+), 27 deletions(-)
>
> diff --git a/TODO b/TODO
> index acac4e3..3e85eee 100644
> --- a/TODO
> +++ b/TODO
> @@ -54,8 +54,6 @@ Features:
>
> * consider showing the unit names during boot up in the status output, not just the unit descriptions
>
> -* send SIGABRT when a service watchdog is triggered, by default, so that we acquire a backtrace of the hang.
> -
> * dhcp: do we allow configuring dhcp routes on interfaces that are not the one we got the dhcp info from?
>
> * maybe allow timer units with an empty Units= setting, so that they
> diff --git a/man/systemd.service.xml b/man/systemd.service.xml
> index 115d169..e563b19 100644
> --- a/man/systemd.service.xml
> +++ b/man/systemd.service.xml
> @@ -593,8 +593,9 @@
> (i.e. the "keep-alive ping"). If the time
> between two such calls is larger than
> the configured time, then the service
> - is placed in a failed state. By
> - setting <varname>Restart=</varname> to
> + is placed in a failed state and it will
> + be terminated with <varname>SIGABRT</varname>.
> + By setting <varname>Restart=</varname> to
> <option>on-failure</option> or
> <option>always</option>, the service
> will be automatically restarted. The
> diff --git a/src/core/busname.c b/src/core/busname.c
> index 22d2a6d..68cb6ca 100644
> --- a/src/core/busname.c
> +++ b/src/core/busname.c
> @@ -446,7 +446,7 @@ static void busname_enter_signal(BusName *n, BusNameState state, BusNameResult f
>
> r = unit_kill_context(UNIT(n),
> &kill_context,
> - state != BUSNAME_SIGTERM,
> + state != BUSNAME_SIGTERM ? KILL_KILL : KILL_TERMINATE,
> -1,
> n->control_pid,
> false);
> diff --git a/src/core/mount.c b/src/core/mount.c
> index e284357..01243c3 100644
> --- a/src/core/mount.c
> +++ b/src/core/mount.c
> @@ -775,7 +775,8 @@ static void mount_enter_signal(Mount *m, MountState state, MountResult f) {
> r = unit_kill_context(
> UNIT(m),
> &m->kill_context,
> - state != MOUNT_MOUNTING_SIGTERM && state != MOUNT_UNMOUNTING_SIGTERM && state != MOUNT_REMOUNTING_SIGTERM,
> + (state != MOUNT_MOUNTING_SIGTERM && state != MOUNT_UNMOUNTING_SIGTERM && state != MOUNT_REMOUNTING_SIGTERM) ?
> + KILL_KILL : KILL_TERMINATE,
> -1,
> m->control_pid,
> false);
> diff --git a/src/core/scope.c b/src/core/scope.c
> index e8f9e8d..0f7c1f9 100644
> --- a/src/core/scope.c
> +++ b/src/core/scope.c
> @@ -243,7 +243,7 @@ static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) {
> r = unit_kill_context(
> UNIT(s),
> &s->kill_context,
> - state != SCOPE_STOP_SIGTERM,
> + state != SCOPE_STOP_SIGTERM ? KILL_KILL : KILL_TERMINATE,
> -1, -1, false);
> if (r < 0)
> goto fail;
> diff --git a/src/core/service.c b/src/core/service.c
> index d160c4e..2b16778 100644
> --- a/src/core/service.c
> +++ b/src/core/service.c
> @@ -56,6 +56,7 @@ static const UnitActiveState state_translation_table[_SERVICE_STATE_MAX] = {
> [SERVICE_EXITED] = UNIT_ACTIVE,
> [SERVICE_RELOAD] = UNIT_RELOADING,
> [SERVICE_STOP] = UNIT_DEACTIVATING,
> + [SERVICE_STOP_SIGABRT] = UNIT_DEACTIVATING,
> [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
> [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
> [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
> @@ -76,6 +77,7 @@ static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] =
> [SERVICE_EXITED] = UNIT_ACTIVE,
> [SERVICE_RELOAD] = UNIT_RELOADING,
> [SERVICE_STOP] = UNIT_DEACTIVATING,
> + [SERVICE_STOP_SIGABRT] = UNIT_DEACTIVATING,
> [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
> [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
> [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
> @@ -663,7 +665,7 @@ static void service_set_state(Service *s, ServiceState state) {
> SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
> SERVICE_RELOAD,
> SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
> - SERVICE_STOP_POST,
> + SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
> SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
> SERVICE_AUTO_RESTART))
> s->timer_event_source = sd_event_source_unref(s->timer_event_source);
> @@ -672,7 +674,7 @@ static void service_set_state(Service *s, ServiceState state) {
> SERVICE_START, SERVICE_START_POST,
> SERVICE_RUNNING, SERVICE_RELOAD,
> SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
> - SERVICE_STOP_POST,
> + SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
> SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
> service_unwatch_main_pid(s);
> s->main_command = NULL;
> @@ -682,7 +684,7 @@ static void service_set_state(Service *s, ServiceState state) {
> SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
> SERVICE_RELOAD,
> SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
> - SERVICE_STOP_POST,
> + SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
> SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
> service_unwatch_control_pid(s);
> s->control_command = NULL;
> @@ -696,7 +698,7 @@ static void service_set_state(Service *s, ServiceState state) {
> SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
> SERVICE_RUNNING, SERVICE_RELOAD,
> SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
> - SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL) &&
> + SERVICE_STOP_SIGABRT, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL) &&
> !(state == SERVICE_DEAD && UNIT(s)->job)) {
> service_close_socket_fd(s);
> service_connection_unref(s);
> @@ -750,7 +752,7 @@ static int service_coldplug(Unit *u) {
> SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
> SERVICE_RELOAD,
> SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
> - SERVICE_STOP_POST,
> + SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
> SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
>
> usec_t k;
> @@ -779,7 +781,7 @@ static int service_coldplug(Unit *u) {
> SERVICE_START, SERVICE_START_POST,
> SERVICE_RUNNING, SERVICE_RELOAD,
> SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
> - SERVICE_STOP_POST,
> + SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
> SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))) {
> r = unit_watch_pid(UNIT(s), s->main_pid);
> if (r < 0)
> @@ -791,7 +793,7 @@ static int service_coldplug(Unit *u) {
> SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
> SERVICE_RELOAD,
> SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
> - SERVICE_STOP_POST,
> + SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
> SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
> r = unit_watch_pid(UNIT(s), s->control_pid);
> if (r < 0)
> @@ -1181,7 +1183,8 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
> r = unit_kill_context(
> UNIT(s),
> &s->kill_context,
> - state != SERVICE_STOP_SIGTERM && state != SERVICE_FINAL_SIGTERM,
> + (state != SERVICE_STOP_SIGTERM && state != SERVICE_FINAL_SIGTERM && state != SERVICE_STOP_SIGABRT) ?
> + KILL_KILL : (state == SERVICE_STOP_SIGABRT ? KILL_ABORT : KILL_TERMINATE),
> s->main_pid,
> s->control_pid,
> s->main_pid_alien);
> @@ -1197,7 +1200,7 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
> }
>
> service_set_state(s, state);
> - } else if (state == SERVICE_STOP_SIGTERM)
> + } else if (state == SERVICE_STOP_SIGTERM || state == SERVICE_STOP_SIGABRT)
> service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_SUCCESS);
> else if (state == SERVICE_STOP_SIGKILL)
> service_enter_stop_post(s, SERVICE_SUCCESS);
> @@ -1211,7 +1214,8 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
> fail:
> log_warning_unit(UNIT(s)->id, "%s failed to kill processes: %s", UNIT(s)->id, strerror(-r));
>
> - if (state == SERVICE_STOP_SIGTERM || state == SERVICE_STOP_SIGKILL)
> + if (state == SERVICE_STOP_SIGTERM || state == SERVICE_STOP_SIGKILL ||
> + state == SERVICE_STOP_SIGABRT)
> service_enter_stop_post(s, SERVICE_FAILURE_RESOURCES);
> else
> service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true);
> @@ -1637,6 +1641,7 @@ static int service_start(Unit *u) {
> /* We cannot fulfill this request right now, try again later
> * please! */
> if (s->state == SERVICE_STOP ||
> + s->state == SERVICE_STOP_SIGABRT ||
> s->state == SERVICE_STOP_SIGTERM ||
> s->state == SERVICE_STOP_SIGKILL ||
> s->state == SERVICE_STOP_POST ||
> @@ -1695,6 +1700,7 @@ static int service_stop(Unit *u) {
>
> /* Already on it */
> if (s->state == SERVICE_STOP ||
> + s->state == SERVICE_STOP_SIGABRT ||
> s->state == SERVICE_STOP_SIGTERM ||
> s->state == SERVICE_STOP_SIGKILL ||
> s->state == SERVICE_STOP_POST ||
> @@ -2126,6 +2132,7 @@ static void service_notify_cgroup_empty_event(Unit *u) {
> service_enter_running(s, SERVICE_SUCCESS);
> break;
>
> + case SERVICE_STOP_SIGABRT:
> case SERVICE_STOP_SIGTERM:
> case SERVICE_STOP_SIGKILL:
>
> @@ -2252,6 +2259,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
> service_enter_running(s, f);
> break;
>
> + case SERVICE_STOP_SIGABRT:
> case SERVICE_STOP_SIGTERM:
> case SERVICE_STOP_SIGKILL:
>
> @@ -2392,6 +2400,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
> service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
> break;
>
> + case SERVICE_STOP_SIGABRT:
> case SERVICE_STOP_SIGTERM:
> case SERVICE_STOP_SIGKILL:
> if (main_pid_good(s) <= 0)
> @@ -2461,6 +2470,12 @@ static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *us
> service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT);
> break;
>
> + case SERVICE_STOP_SIGABRT:
> + log_warning_unit(UNIT(s)->id,
> + "%s stop-sigabrt timed out. Terminating.", UNIT(s)->id);
> + service_enter_signal(s, SERVICE_STOP_SIGTERM, s->result);
> + break;
> +
> case SERVICE_STOP_SIGTERM:
> if (s->kill_context.send_sigkill) {
> log_warning_unit(UNIT(s)->id, "%s stop-sigterm timed out. Killing.", UNIT(s)->id);
> @@ -2528,7 +2543,7 @@ static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void
> log_error_unit(UNIT(s)->id, "%s watchdog timeout (limit %s)!", UNIT(s)->id,
> format_timespan(t, sizeof(t), s->watchdog_usec, 1));
>
> - service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_WATCHDOG);
> + service_enter_signal(s, SERVICE_STOP_SIGABRT, SERVICE_FAILURE_WATCHDOG);
>
> return 0;
> }
> diff --git a/src/core/service.h b/src/core/service.h
> index 0db0c4d..54fbe46 100644
> --- a/src/core/service.h
> +++ b/src/core/service.h
> @@ -39,6 +39,7 @@ typedef enum ServiceState {
> SERVICE_EXITED, /* Nothing is running anymore, but RemainAfterExit is true hence this is OK */
> SERVICE_RELOAD,
> SERVICE_STOP, /* No STOP_PRE state, instead just register multiple STOP executables */
> + SERVICE_STOP_SIGABRT, /* Watchdog timeout */
> SERVICE_STOP_SIGTERM,
> SERVICE_STOP_SIGKILL,
> SERVICE_STOP_POST,
> diff --git a/src/core/socket.c b/src/core/socket.c
> index 9004cb4..6ba8338 100644
> --- a/src/core/socket.c
> +++ b/src/core/socket.c
> @@ -1578,7 +1578,8 @@ static void socket_enter_signal(Socket *s, SocketState state, SocketResult f) {
> r = unit_kill_context(
> UNIT(s),
> &s->kill_context,
> - state != SOCKET_STOP_PRE_SIGTERM && state != SOCKET_FINAL_SIGTERM,
> + (state != SOCKET_STOP_PRE_SIGTERM && state != SOCKET_FINAL_SIGTERM) ?
> + KILL_KILL : KILL_TERMINATE,
> -1,
> s->control_pid,
> false);
> diff --git a/src/core/swap.c b/src/core/swap.c
> index b2ca048..001bc2e 100644
> --- a/src/core/swap.c
> +++ b/src/core/swap.c
> @@ -710,7 +710,8 @@ static void swap_enter_signal(Swap *s, SwapState state, SwapResult f) {
> r = unit_kill_context(
> UNIT(s),
> &s->kill_context,
> - state != SWAP_ACTIVATING_SIGTERM && state != SWAP_DEACTIVATING_SIGTERM,
> + (state != SWAP_ACTIVATING_SIGTERM && state != SWAP_DEACTIVATING_SIGTERM) ?
> + KILL_KILL : KILL_TERMINATE,
> -1,
> s->control_pid,
> false);
> diff --git a/src/core/unit.c b/src/core/unit.c
> index 489ea1e..84f210a 100644
> --- a/src/core/unit.c
> +++ b/src/core/unit.c
> @@ -3313,7 +3313,7 @@ int unit_make_transient(Unit *u) {
> int unit_kill_context(
> Unit *u,
> KillContext *c,
> - bool sigkill,
> + KillOperation k,
> pid_t main_pid,
> pid_t control_pid,
> bool main_pid_alien) {
> @@ -3326,7 +3326,19 @@ int unit_kill_context(
> if (c->kill_mode == KILL_NONE)
> return 0;
>
> - sig = sigkill ? SIGKILL : c->kill_signal;
> + switch (k) {
> + case KILL_KILL:
> + sig = SIGKILL;
> + break;
> + case KILL_ABORT:
> + sig = SIGABRT;
> + break;
> + case KILL_TERMINATE:
> + sig = c->kill_signal;
> + break;
> + default:
> + assert_not_reached("KillOperation unknown");
> + }
>
> if (main_pid > 0) {
> r = kill_and_sigcont(main_pid, sig);
> @@ -3340,7 +3352,7 @@ int unit_kill_context(
> if (!main_pid_alien)
> wait_for_exit = true;
>
> - if (c->send_sighup && !sigkill)
> + if (c->send_sighup && k != KILL_KILL)
> kill(main_pid, SIGHUP);
> }
> }
> @@ -3356,12 +3368,12 @@ int unit_kill_context(
> } else {
> wait_for_exit = true;
>
> - if (c->send_sighup && !sigkill)
> + if (c->send_sighup && k != KILL_KILL)
> kill(control_pid, SIGHUP);
> }
> }
>
> - if ((c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && sigkill)) && u->cgroup_path) {
> + if ((c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && k == KILL_KILL)) && u->cgroup_path) {
> _cleanup_set_free_ Set *pid_set = NULL;
>
> /* Exclude the main/control pids from being killed via the cgroup */
> @@ -3385,7 +3397,7 @@ int unit_kill_context(
>
> /* wait_for_exit = true; */
>
> - if (c->send_sighup && !sigkill) {
> + if (c->send_sighup && k != KILL_KILL) {
> set_free(pid_set);
>
> pid_set = unit_pid_set(main_pid, control_pid);
> diff --git a/src/core/unit.h b/src/core/unit.h
> index bbad546..081ab18 100644
> --- a/src/core/unit.h
> +++ b/src/core/unit.h
> @@ -54,6 +54,12 @@ enum UnitActiveState {
> _UNIT_ACTIVE_STATE_INVALID = -1
> };
>
> +typedef enum KillOperation {
> + KILL_TERMINATE,
> + KILL_KILL,
> + KILL_ABORT,
> +} KillOperation;
> +
> static inline bool UNIT_IS_ACTIVE_OR_RELOADING(UnitActiveState t) {
> return t == UNIT_ACTIVE || t == UNIT_RELOADING;
> }
> @@ -576,7 +582,7 @@ int unit_write_drop_in_private_format(Unit *u, UnitSetPropertiesMode mode, const
>
> int unit_remove_drop_in(Unit *u, UnitSetPropertiesMode mode, const char *name);
>
> -int unit_kill_context(Unit *u, KillContext *c, bool sigkill, pid_t main_pid, pid_t control_pid, bool main_pid_alien);
> +int unit_kill_context(Unit *u, KillContext *c, KillOperation k, pid_t main_pid, pid_t control_pid, bool main_pid_alien);
>
> int unit_make_transient(Unit *u);
>
> --
> 2.1.1
>
> _______________________________________________
> systemd-devel mailing list
> systemd-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/systemd-devel
Lennart
--
Lennart Poettering, Red Hat
More information about the systemd-devel
mailing list