[systemd-devel] [PATCHv3] core: send sigabrt on watchdog timeout to get the stacktrace

Lennart Poettering lennart at poettering.net
Tue Oct 28 10:11:48 PDT 2014


On Tue, 28.10.14 16:35, Umut Tezduyar Lindskog (umut.tezduyar at axis.com) wrote:

Applied! Thanks!

> if sigabrt doesn't do the job, follow regular shutdown
> routine, sigterm > sigkill.
> ---
>  TODO                    |  2 --
>  man/systemd.service.xml |  5 +++--
>  src/core/busname.c      |  2 +-
>  src/core/mount.c        |  3 ++-
>  src/core/scope.c        |  2 +-
>  src/core/service.c      | 37 ++++++++++++++++++++++++++-----------
>  src/core/service.h      |  1 +
>  src/core/socket.c       |  3 ++-
>  src/core/swap.c         |  3 ++-
>  src/core/unit.c         | 24 ++++++++++++++++++------
>  src/core/unit.h         |  8 +++++++-
>  11 files changed, 63 insertions(+), 27 deletions(-)
> 
> diff --git a/TODO b/TODO
> index acac4e3..3e85eee 100644
> --- a/TODO
> +++ b/TODO
> @@ -54,8 +54,6 @@ Features:
>  
>  * consider showing the unit names during boot up in the status output, not just the unit descriptions
>  
> -* send SIGABRT when a service watchdog is triggered, by default, so that we acquire a backtrace of the hang.
> -
>  * dhcp: do we allow configuring dhcp routes on interfaces that are not the one we got the dhcp info from?
>  
>  * maybe allow timer units with an empty Units= setting, so that they
> diff --git a/man/systemd.service.xml b/man/systemd.service.xml
> index 115d169..e563b19 100644
> --- a/man/systemd.service.xml
> +++ b/man/systemd.service.xml
> @@ -593,8 +593,9 @@
>                                  (i.e. the "keep-alive ping"). If the time
>                                  between two such calls is larger than
>                                  the configured time, then the service
> -                                is placed in a failed state. By
> -                                setting <varname>Restart=</varname> to
> +                                is placed in a failed state and it will
> +                                be terminated with <varname>SIGABRT</varname>.
> +                                By setting <varname>Restart=</varname> to
>                                  <option>on-failure</option> or
>                                  <option>always</option>, the service
>                                  will be automatically restarted. The
> diff --git a/src/core/busname.c b/src/core/busname.c
> index 22d2a6d..68cb6ca 100644
> --- a/src/core/busname.c
> +++ b/src/core/busname.c
> @@ -446,7 +446,7 @@ static void busname_enter_signal(BusName *n, BusNameState state, BusNameResult f
>  
>          r = unit_kill_context(UNIT(n),
>                                &kill_context,
> -                              state != BUSNAME_SIGTERM,
> +                              state != BUSNAME_SIGTERM ? KILL_KILL : KILL_TERMINATE,
>                                -1,
>                                n->control_pid,
>                                false);
> diff --git a/src/core/mount.c b/src/core/mount.c
> index e284357..01243c3 100644
> --- a/src/core/mount.c
> +++ b/src/core/mount.c
> @@ -775,7 +775,8 @@ static void mount_enter_signal(Mount *m, MountState state, MountResult f) {
>          r = unit_kill_context(
>                          UNIT(m),
>                          &m->kill_context,
> -                        state != MOUNT_MOUNTING_SIGTERM && state != MOUNT_UNMOUNTING_SIGTERM && state != MOUNT_REMOUNTING_SIGTERM,
> +                        (state != MOUNT_MOUNTING_SIGTERM && state != MOUNT_UNMOUNTING_SIGTERM && state != MOUNT_REMOUNTING_SIGTERM) ?
> +                        KILL_KILL : KILL_TERMINATE,
>                          -1,
>                          m->control_pid,
>                          false);
> diff --git a/src/core/scope.c b/src/core/scope.c
> index e8f9e8d..0f7c1f9 100644
> --- a/src/core/scope.c
> +++ b/src/core/scope.c
> @@ -243,7 +243,7 @@ static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) {
>                  r = unit_kill_context(
>                                  UNIT(s),
>                                  &s->kill_context,
> -                                state != SCOPE_STOP_SIGTERM,
> +                                state != SCOPE_STOP_SIGTERM ? KILL_KILL : KILL_TERMINATE,
>                                  -1, -1, false);
>                  if (r < 0)
>                          goto fail;
> diff --git a/src/core/service.c b/src/core/service.c
> index d160c4e..2b16778 100644
> --- a/src/core/service.c
> +++ b/src/core/service.c
> @@ -56,6 +56,7 @@ static const UnitActiveState state_translation_table[_SERVICE_STATE_MAX] = {
>          [SERVICE_EXITED] = UNIT_ACTIVE,
>          [SERVICE_RELOAD] = UNIT_RELOADING,
>          [SERVICE_STOP] = UNIT_DEACTIVATING,
> +        [SERVICE_STOP_SIGABRT] = UNIT_DEACTIVATING,
>          [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
>          [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
>          [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
> @@ -76,6 +77,7 @@ static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] =
>          [SERVICE_EXITED] = UNIT_ACTIVE,
>          [SERVICE_RELOAD] = UNIT_RELOADING,
>          [SERVICE_STOP] = UNIT_DEACTIVATING,
> +        [SERVICE_STOP_SIGABRT] = UNIT_DEACTIVATING,
>          [SERVICE_STOP_SIGTERM] = UNIT_DEACTIVATING,
>          [SERVICE_STOP_SIGKILL] = UNIT_DEACTIVATING,
>          [SERVICE_STOP_POST] = UNIT_DEACTIVATING,
> @@ -663,7 +665,7 @@ static void service_set_state(Service *s, ServiceState state) {
>                      SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
>                      SERVICE_RELOAD,
>                      SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
> -                    SERVICE_STOP_POST,
> +                    SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
>                      SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL,
>                      SERVICE_AUTO_RESTART))
>                  s->timer_event_source = sd_event_source_unref(s->timer_event_source);
> @@ -672,7 +674,7 @@ static void service_set_state(Service *s, ServiceState state) {
>                      SERVICE_START, SERVICE_START_POST,
>                      SERVICE_RUNNING, SERVICE_RELOAD,
>                      SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
> -                    SERVICE_STOP_POST,
> +                    SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
>                      SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
>                  service_unwatch_main_pid(s);
>                  s->main_command = NULL;
> @@ -682,7 +684,7 @@ static void service_set_state(Service *s, ServiceState state) {
>                      SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
>                      SERVICE_RELOAD,
>                      SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
> -                    SERVICE_STOP_POST,
> +                    SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
>                      SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
>                  service_unwatch_control_pid(s);
>                  s->control_command = NULL;
> @@ -696,7 +698,7 @@ static void service_set_state(Service *s, ServiceState state) {
>                      SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
>                      SERVICE_RUNNING, SERVICE_RELOAD,
>                      SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL, SERVICE_STOP_POST,
> -                    SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL) &&
> +                    SERVICE_STOP_SIGABRT, SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL) &&
>              !(state == SERVICE_DEAD && UNIT(s)->job)) {
>                  service_close_socket_fd(s);
>                  service_connection_unref(s);
> @@ -750,7 +752,7 @@ static int service_coldplug(Unit *u) {
>                             SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
>                             SERVICE_RELOAD,
>                             SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
> -                           SERVICE_STOP_POST,
> +                           SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
>                             SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
>  
>                          usec_t k;
> @@ -779,7 +781,7 @@ static int service_coldplug(Unit *u) {
>                              SERVICE_START, SERVICE_START_POST,
>                              SERVICE_RUNNING, SERVICE_RELOAD,
>                              SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
> -                            SERVICE_STOP_POST,
> +                            SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
>                              SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL))) {
>                          r = unit_watch_pid(UNIT(s), s->main_pid);
>                          if (r < 0)
> @@ -791,7 +793,7 @@ static int service_coldplug(Unit *u) {
>                             SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
>                             SERVICE_RELOAD,
>                             SERVICE_STOP, SERVICE_STOP_SIGTERM, SERVICE_STOP_SIGKILL,
> -                           SERVICE_STOP_POST,
> +                           SERVICE_STOP_SIGABRT, SERVICE_STOP_POST,
>                             SERVICE_FINAL_SIGTERM, SERVICE_FINAL_SIGKILL)) {
>                          r = unit_watch_pid(UNIT(s), s->control_pid);
>                          if (r < 0)
> @@ -1181,7 +1183,8 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
>          r = unit_kill_context(
>                          UNIT(s),
>                          &s->kill_context,
> -                        state != SERVICE_STOP_SIGTERM && state != SERVICE_FINAL_SIGTERM,
> +                        (state != SERVICE_STOP_SIGTERM && state != SERVICE_FINAL_SIGTERM && state != SERVICE_STOP_SIGABRT) ?
> +                        KILL_KILL : (state == SERVICE_STOP_SIGABRT ? KILL_ABORT : KILL_TERMINATE),
>                          s->main_pid,
>                          s->control_pid,
>                          s->main_pid_alien);
> @@ -1197,7 +1200,7 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
>                  }
>  
>                  service_set_state(s, state);
> -        } else if (state == SERVICE_STOP_SIGTERM)
> +        } else if (state == SERVICE_STOP_SIGTERM || state == SERVICE_STOP_SIGABRT)
>                  service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_SUCCESS);
>          else if (state == SERVICE_STOP_SIGKILL)
>                  service_enter_stop_post(s, SERVICE_SUCCESS);
> @@ -1211,7 +1214,8 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
>  fail:
>          log_warning_unit(UNIT(s)->id, "%s failed to kill processes: %s", UNIT(s)->id, strerror(-r));
>  
> -        if (state == SERVICE_STOP_SIGTERM || state == SERVICE_STOP_SIGKILL)
> +        if (state == SERVICE_STOP_SIGTERM || state == SERVICE_STOP_SIGKILL ||
> +            state == SERVICE_STOP_SIGABRT)
>                  service_enter_stop_post(s, SERVICE_FAILURE_RESOURCES);
>          else
>                  service_enter_dead(s, SERVICE_FAILURE_RESOURCES, true);
> @@ -1637,6 +1641,7 @@ static int service_start(Unit *u) {
>          /* We cannot fulfill this request right now, try again later
>           * please! */
>          if (s->state == SERVICE_STOP ||
> +            s->state == SERVICE_STOP_SIGABRT ||
>              s->state == SERVICE_STOP_SIGTERM ||
>              s->state == SERVICE_STOP_SIGKILL ||
>              s->state == SERVICE_STOP_POST ||
> @@ -1695,6 +1700,7 @@ static int service_stop(Unit *u) {
>  
>          /* Already on it */
>          if (s->state == SERVICE_STOP ||
> +            s->state == SERVICE_STOP_SIGABRT ||
>              s->state == SERVICE_STOP_SIGTERM ||
>              s->state == SERVICE_STOP_SIGKILL ||
>              s->state == SERVICE_STOP_POST ||
> @@ -2126,6 +2132,7 @@ static void service_notify_cgroup_empty_event(Unit *u) {
>                  service_enter_running(s, SERVICE_SUCCESS);
>                  break;
>  
> +        case SERVICE_STOP_SIGABRT:
>          case SERVICE_STOP_SIGTERM:
>          case SERVICE_STOP_SIGKILL:
>  
> @@ -2252,6 +2259,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
>                                  service_enter_running(s, f);
>                                  break;
>  
> +                        case SERVICE_STOP_SIGABRT:
>                          case SERVICE_STOP_SIGTERM:
>                          case SERVICE_STOP_SIGKILL:
>  
> @@ -2392,6 +2400,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
>                                  service_enter_signal(s, SERVICE_STOP_SIGTERM, f);
>                                  break;
>  
> +                        case SERVICE_STOP_SIGABRT:
>                          case SERVICE_STOP_SIGTERM:
>                          case SERVICE_STOP_SIGKILL:
>                                  if (main_pid_good(s) <= 0)
> @@ -2461,6 +2470,12 @@ static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *us
>                  service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_TIMEOUT);
>                  break;
>  
> +        case SERVICE_STOP_SIGABRT:
> +                log_warning_unit(UNIT(s)->id,
> +                                 "%s stop-sigabrt timed out. Terminating.", UNIT(s)->id);
> +                service_enter_signal(s, SERVICE_STOP_SIGTERM, s->result);
> +                break;
> +
>          case SERVICE_STOP_SIGTERM:
>                  if (s->kill_context.send_sigkill) {
>                          log_warning_unit(UNIT(s)->id, "%s stop-sigterm timed out. Killing.", UNIT(s)->id);
> @@ -2528,7 +2543,7 @@ static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void
>          log_error_unit(UNIT(s)->id, "%s watchdog timeout (limit %s)!", UNIT(s)->id,
>                         format_timespan(t, sizeof(t), s->watchdog_usec, 1));
>  
> -        service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_WATCHDOG);
> +        service_enter_signal(s, SERVICE_STOP_SIGABRT, SERVICE_FAILURE_WATCHDOG);
>  
>          return 0;
>  }
> diff --git a/src/core/service.h b/src/core/service.h
> index 0db0c4d..54fbe46 100644
> --- a/src/core/service.h
> +++ b/src/core/service.h
> @@ -39,6 +39,7 @@ typedef enum ServiceState {
>          SERVICE_EXITED,            /* Nothing is running anymore, but RemainAfterExit is true hence this is OK */
>          SERVICE_RELOAD,
>          SERVICE_STOP,              /* No STOP_PRE state, instead just register multiple STOP executables */
> +        SERVICE_STOP_SIGABRT,      /* Watchdog timeout */
>          SERVICE_STOP_SIGTERM,
>          SERVICE_STOP_SIGKILL,
>          SERVICE_STOP_POST,
> diff --git a/src/core/socket.c b/src/core/socket.c
> index 9004cb4..6ba8338 100644
> --- a/src/core/socket.c
> +++ b/src/core/socket.c
> @@ -1578,7 +1578,8 @@ static void socket_enter_signal(Socket *s, SocketState state, SocketResult f) {
>          r = unit_kill_context(
>                          UNIT(s),
>                          &s->kill_context,
> -                        state != SOCKET_STOP_PRE_SIGTERM && state != SOCKET_FINAL_SIGTERM,
> +                        (state != SOCKET_STOP_PRE_SIGTERM && state != SOCKET_FINAL_SIGTERM) ?
> +                        KILL_KILL : KILL_TERMINATE,
>                          -1,
>                          s->control_pid,
>                          false);
> diff --git a/src/core/swap.c b/src/core/swap.c
> index b2ca048..001bc2e 100644
> --- a/src/core/swap.c
> +++ b/src/core/swap.c
> @@ -710,7 +710,8 @@ static void swap_enter_signal(Swap *s, SwapState state, SwapResult f) {
>          r = unit_kill_context(
>                          UNIT(s),
>                          &s->kill_context,
> -                        state != SWAP_ACTIVATING_SIGTERM && state != SWAP_DEACTIVATING_SIGTERM,
> +                        (state != SWAP_ACTIVATING_SIGTERM && state != SWAP_DEACTIVATING_SIGTERM) ?
> +                        KILL_KILL : KILL_TERMINATE,
>                          -1,
>                          s->control_pid,
>                          false);
> diff --git a/src/core/unit.c b/src/core/unit.c
> index 489ea1e..84f210a 100644
> --- a/src/core/unit.c
> +++ b/src/core/unit.c
> @@ -3313,7 +3313,7 @@ int unit_make_transient(Unit *u) {
>  int unit_kill_context(
>                  Unit *u,
>                  KillContext *c,
> -                bool sigkill,
> +                KillOperation k,
>                  pid_t main_pid,
>                  pid_t control_pid,
>                  bool main_pid_alien) {
> @@ -3326,7 +3326,19 @@ int unit_kill_context(
>          if (c->kill_mode == KILL_NONE)
>                  return 0;
>  
> -        sig = sigkill ? SIGKILL : c->kill_signal;
> +        switch (k) {
> +        case KILL_KILL:
> +                sig = SIGKILL;
> +                break;
> +        case KILL_ABORT:
> +                sig = SIGABRT;
> +                break;
> +        case KILL_TERMINATE:
> +                sig = c->kill_signal;
> +                break;
> +        default:
> +                assert_not_reached("KillOperation unknown");
> +        }
>  
>          if (main_pid > 0) {
>                  r = kill_and_sigcont(main_pid, sig);
> @@ -3340,7 +3352,7 @@ int unit_kill_context(
>                          if (!main_pid_alien)
>                                  wait_for_exit = true;
>  
> -                        if (c->send_sighup && !sigkill)
> +                        if (c->send_sighup && k != KILL_KILL)
>                                  kill(main_pid, SIGHUP);
>                  }
>          }
> @@ -3356,12 +3368,12 @@ int unit_kill_context(
>                  } else {
>                          wait_for_exit = true;
>  
> -                        if (c->send_sighup && !sigkill)
> +                        if (c->send_sighup && k != KILL_KILL)
>                                  kill(control_pid, SIGHUP);
>                  }
>          }
>  
> -        if ((c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && sigkill)) && u->cgroup_path) {
> +        if ((c->kill_mode == KILL_CONTROL_GROUP || (c->kill_mode == KILL_MIXED && k == KILL_KILL)) && u->cgroup_path) {
>                  _cleanup_set_free_ Set *pid_set = NULL;
>  
>                  /* Exclude the main/control pids from being killed via the cgroup */
> @@ -3385,7 +3397,7 @@ int unit_kill_context(
>  
>                          /* wait_for_exit = true; */
>  
> -                        if (c->send_sighup && !sigkill) {
> +                        if (c->send_sighup && k != KILL_KILL) {
>                                  set_free(pid_set);
>  
>                                  pid_set = unit_pid_set(main_pid, control_pid);
> diff --git a/src/core/unit.h b/src/core/unit.h
> index bbad546..081ab18 100644
> --- a/src/core/unit.h
> +++ b/src/core/unit.h
> @@ -54,6 +54,12 @@ enum UnitActiveState {
>          _UNIT_ACTIVE_STATE_INVALID = -1
>  };
>  
> +typedef enum KillOperation {
> +        KILL_TERMINATE,
> +        KILL_KILL,
> +        KILL_ABORT,
> +} KillOperation;
> +
>  static inline bool UNIT_IS_ACTIVE_OR_RELOADING(UnitActiveState t) {
>          return t == UNIT_ACTIVE || t == UNIT_RELOADING;
>  }
> @@ -576,7 +582,7 @@ int unit_write_drop_in_private_format(Unit *u, UnitSetPropertiesMode mode, const
>  
>  int unit_remove_drop_in(Unit *u, UnitSetPropertiesMode mode, const char *name);
>  
> -int unit_kill_context(Unit *u, KillContext *c, bool sigkill, pid_t main_pid, pid_t control_pid, bool main_pid_alien);
> +int unit_kill_context(Unit *u, KillContext *c, KillOperation k, pid_t main_pid, pid_t control_pid, bool main_pid_alien);
>  
>  int unit_make_transient(Unit *u);
>  
> -- 
> 2.1.1
> 
> _______________________________________________
> systemd-devel mailing list
> systemd-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/systemd-devel


Lennart

-- 
Lennart Poettering, Red Hat


More information about the systemd-devel mailing list