[systemd-devel] [PATCH v2 1/3] introduce WatchdogSec and hook up the watchdog with the existing failure logic

Michael Olbrich m.olbrich at pengutronix.de
Wed Feb 8 01:10:34 PST 2012


---
changes in v2:
 - the watchdog is trigger for the first time when the service starts.
   Is service_enter_start_post() the right place for this?
 - Call it SERVICE_FAILURE_WATCHDOG and define corresponding string
 - Watchdog timer handling reworked to catch timeouts during
   daemon-reexec/daemon-reload

 man/systemd.service.xml          |   18 ++++++++++++++
 src/dbus-service.c               |    2 +
 src/load-fragment-gperf.gperf.m4 |    1 +
 src/service.c                    |   46 ++++++++++++++++++++++++++++++++++++-
 src/service.h                    |    3 ++
 5 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/man/systemd.service.xml b/man/systemd.service.xml
index 0baddd1..0b5edb8 100644
--- a/man/systemd.service.xml
+++ b/man/systemd.service.xml
@@ -460,6 +460,24 @@
                         </varlistentry>
 
                         <varlistentry>
+                                <term><varname>WatchdogSec=</varname></term>
+                                <listitem><para>Configures the watchdog
+                                timeout for a service. This is activated
+                                when the start-up is completed. The service
+                                must call
+                                <citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>
+                                regularly with "WATCHDOG=1". If the time
+                                between two such calls is larger than
+                                the configured time then the service
+                                enters a failure state. By setting
+                                <term><varname>Restart=</varname></term>
+                                to <option>on-failure</option> or
+                                <option>always</option> the service can
+                                be restarted. Defaults to 0s, which
+                                disables this feature.</para></listitem>
+                        </varlistentry>
+
+                        <varlistentry>
                                 <term><varname>Restart=</varname></term>
                                 <listitem><para>Configures whether the
                                 main service process shall be
diff --git a/src/dbus-service.c b/src/dbus-service.c
index 738dc7b..fedfc1d 100644
--- a/src/dbus-service.c
+++ b/src/dbus-service.c
@@ -43,6 +43,7 @@
         "  <property name=\"NotifyAccess\" type=\"s\" access=\"read\"/>\n" \
         "  <property name=\"RestartUSec\" type=\"t\" access=\"read\"/>\n" \
         "  <property name=\"TimeoutUSec\" type=\"t\" access=\"read\"/>\n" \
+        "  <property name=\"WatchdogUSec\" type=\"t\" access=\"read\"/>\n" \
         "  <property name=\"WatchdogTimestamp\" type=\"t\" access=\"read\"/>\n" \
         "  <property name=\"WatchdogTimestampMonotonic\" type=\"t\" access=\"read\"/>\n" \
         BUS_EXEC_COMMAND_INTERFACE("ExecStartPre")                      \
@@ -119,6 +120,7 @@ static const BusProperty bus_service_properties[] = {
         { "NotifyAccess",           bus_service_append_notify_access, "s", offsetof(Service, notify_access)                },
         { "RestartUSec",            bus_property_append_usec,         "t", offsetof(Service, restart_usec)                 },
         { "TimeoutUSec",            bus_property_append_usec,         "t", offsetof(Service, timeout_usec)                 },
+        { "WatchdogUSec",           bus_property_append_usec,         "t", offsetof(Service, watchdog_usec)                },
         { "WatchdogTimestamp",      bus_property_append_usec,         "t", offsetof(Service, watchdog_timestamp.realtime)  },
         { "WatchdogTimestampMonotonic",bus_property_append_usec,      "t", offsetof(Service, watchdog_timestamp.monotonic) },
         BUS_EXEC_COMMAND_PROPERTY("ExecStartPre",  offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]),  true ),
diff --git a/src/load-fragment-gperf.gperf.m4 b/src/load-fragment-gperf.gperf.m4
index 14c0606..9191f90 100644
--- a/src/load-fragment-gperf.gperf.m4
+++ b/src/load-fragment-gperf.gperf.m4
@@ -134,6 +134,7 @@ Service.ExecStop,                config_parse_exec,                  SERVICE_EXE
 Service.ExecStopPost,            config_parse_exec,                  SERVICE_EXEC_STOP_POST,        offsetof(Service, exec_command)
 Service.RestartSec,              config_parse_usec,                  0,                             offsetof(Service, restart_usec)
 Service.TimeoutSec,              config_parse_usec,                  0,                             offsetof(Service, timeout_usec)
+Service.WatchdogSec,             config_parse_usec,                  0,                             offsetof(Service, watchdog_usec)
 Service.Type,                    config_parse_service_type,          0,                             offsetof(Service, type)
 Service.Restart,                 config_parse_service_restart,       0,                             offsetof(Service, restart)
 Service.PermissionsStartOnly,    config_parse_bool,                  0,                             offsetof(Service, permissions_start_only)
diff --git a/src/service.c b/src/service.c
index b6bbfab..1631595 100644
--- a/src/service.c
+++ b/src/service.c
@@ -112,6 +112,9 @@ static void service_init(Unit *u) {
 
         s->timeout_usec = DEFAULT_TIMEOUT_USEC;
         s->restart_usec = DEFAULT_RESTART_USEC;
+
+        s->watchdog_watch.type = WATCH_INVALID;
+
         s->timer_watch.type = WATCH_INVALID;
 #ifdef HAVE_SYSV_COMPAT
         s->sysv_start_priority = -1;
@@ -208,14 +211,39 @@ static void service_connection_unref(Service *s) {
 static void service_stop_watchdog(Service *s) {
         assert(s);
 
+        unit_unwatch_timer(UNIT(s), &s->watchdog_watch);
         s->watchdog_timestamp.realtime = 0;
         s->watchdog_timestamp.monotonic = 0;
 }
 
+static void service_enter_dead(Service *s, ServiceResult f, bool allow_restart);
+
+static void service_handle_watchdog(Service *s) {
+        usec_t offset;
+        int r;
+
+        assert(s);
+
+        if (s->watchdog_usec == 0)
+                return;
+
+        offset = now(CLOCK_MONOTONIC) - s->watchdog_timestamp.monotonic;
+        if (offset >= s->watchdog_usec) {
+                log_error("%s watchdog timeout!", UNIT(s)->id);
+                service_enter_dead(s, SERVICE_FAILURE_WATCHDOG, true);
+                return;
+        }
+
+        r = unit_watch_timer(UNIT(s), s->watchdog_usec - offset, &s->watchdog_watch);
+        if (r < 0)
+                log_warning("%s failed to install watchdog timer: %s", UNIT(s)->id, strerror(-r));
+}
+
 static void service_reset_watchdog(Service *s) {
         assert(s);
 
         dual_timestamp_get(&s->watchdog_timestamp);
+        service_handle_watchdog(s);
 }
 
 static void service_done(Unit *u) {
@@ -259,6 +287,8 @@ static void service_done(Unit *u) {
 
         unit_ref_unset(&s->accept_socket);
 
+        service_stop_watchdog(s);
+
         unit_unwatch_timer(u, &s->timer_watch);
 }
 
@@ -1568,9 +1598,12 @@ static int service_coldplug(Unit *u) {
                                 if ((r = unit_watch_pid(UNIT(s), s->control_pid)) < 0)
                                         return r;
 
+                if (s->deserialized_state == SERVICE_START_POST ||
+                    s->deserialized_state == SERVICE_RUNNING)
+                        service_handle_watchdog(s);
+
                 service_set_state(s, s->deserialized_state);
         }
-
         return 0;
 }
 
@@ -2002,6 +2035,9 @@ static void service_enter_start_post(Service *s) {
 
         service_unwatch_control_pid(s);
 
+        if (s->watchdog_usec > 0)
+                service_reset_watchdog(s);
+
         if ((s->control_command = s->exec_command[SERVICE_EXEC_START_POST])) {
                 s->control_command_id = SERVICE_EXEC_START_POST;
 
@@ -2922,6 +2958,11 @@ static void service_timer_event(Unit *u, uint64_t elapsed, Watch* w) {
         assert(s);
         assert(elapsed == 1);
 
+        if (w == &s->watchdog_watch) {
+                service_handle_watchdog(s);
+                return;
+        }
+
         assert(w == &s->timer_watch);
 
         switch (s->state) {
@@ -3611,7 +3652,8 @@ static const char* const service_result_table[_SERVICE_RESULT_MAX] = {
         [SERVICE_FAILURE_TIMEOUT] = "timeout",
         [SERVICE_FAILURE_EXIT_CODE] = "exit-code",
         [SERVICE_FAILURE_SIGNAL] = "signal",
-        [SERVICE_FAILURE_CORE_DUMP] = "core-dump"
+        [SERVICE_FAILURE_CORE_DUMP] = "core-dump",
+        [SERVICE_FAILURE_WATCHDOG] = "watchdog"
 };
 
 DEFINE_STRING_TABLE_LOOKUP(service_result, ServiceResult);
diff --git a/src/service.h b/src/service.h
index b1e8b90..02726ef 100644
--- a/src/service.h
+++ b/src/service.h
@@ -95,6 +95,7 @@ typedef enum ServiceResult {
         SERVICE_FAILURE_EXIT_CODE,
         SERVICE_FAILURE_SIGNAL,
         SERVICE_FAILURE_CORE_DUMP,
+        SERVICE_FAILURE_WATCHDOG,
         _SERVICE_RESULT_MAX,
         _SERVICE_RESULT_INVALID = -1
 } ServiceResult;
@@ -112,6 +113,8 @@ struct Service {
         usec_t timeout_usec;
 
         dual_timestamp watchdog_timestamp;
+        usec_t watchdog_usec;
+        Watch watchdog_watch;
 
         ExecCommand* exec_command[_SERVICE_EXEC_COMMAND_MAX];
         ExecContext exec_context;
-- 
1.7.7.3



More information about the systemd-devel mailing list