[systemd-devel] [PATCH v2] log: be more verbose if dbus job fails

Michal Sekletar msekleta at redhat.com
Thu Apr 9 08:30:44 PDT 2015


On Thu, Apr 09, 2015 at 03:24:54PM +0000, Zbigniew Jędrzejewski-Szmek wrote:
> On Thu, Apr 09, 2015 at 05:20:43PM +0200, Michal Sekletar wrote:
> > On Thu, Apr 09, 2015 at 02:44:38PM +0000, Zbigniew Jędrzejewski-Szmek wrote:
> > > On Thu, Apr 09, 2015 at 04:35:53PM +0200, Michal Sekletar wrote:
> > > > On Thu, Apr 09, 2015 at 02:10:14PM +0000, Zbigniew Jędrzejewski-Szmek wrote:
> > > > > On Thu, Apr 09, 2015 at 03:20:02PM +0200, Michal Sekletar wrote:
> > > > > > Users might have hard time figuring out why exactly their systemctl request
> > > > > > failed. If dbus job fails try to figure out more details about failure by
> > > > > > examining Result property of the service.
> > > > > > 
> > > > > > https://bugzilla.redhat.com/show_bug.cgi?id=1016680
> > > > > > ---
> > > > > >  src/libsystemd/sd-bus/bus-util.c | 41 +++++++++++++++++++---
> > > > > >  src/shared/log.c                 | 76 ++++++++++++++++++++++++++++++++++++++++
> > > > > >  src/shared/log.h                 |  2 ++
> > > > > >  3 files changed, 114 insertions(+), 5 deletions(-)
> > > > > > 
> > > > > > diff --git a/src/libsystemd/sd-bus/bus-util.c b/src/libsystemd/sd-bus/bus-util.c
> > > > > > index 6498769..bb4c993 100644
> > > > > > --- a/src/libsystemd/sd-bus/bus-util.c
> > > > > > +++ b/src/libsystemd/sd-bus/bus-util.c
> > > > > > @@ -30,6 +30,7 @@
> > > > > >  #include "path-util.h"
> > > > > >  #include "missing.h"
> > > > > >  #include "set.h"
> > > > > > +#include "unit-name.h"
> > > > > >  
> > > > > >  #include "sd-bus.h"
> > > > > >  #include "bus-error.h"
> > > > > > @@ -1716,6 +1717,35 @@ static int bus_process_wait(sd_bus *bus) {
> > > > > >          }
> > > > > >  }
> > > > > >  
> > > > > > +static int bus_job_get_service_result(BusWaitForJobs *d, char **result) {
> > > > > > +        int r = 0;
> > > > > > +        _cleanup_free_ char *dbus_path = NULL, *p = NULL;
> > > > > > +
> > > > > > +        assert(d);
> > > > > > +        assert(d->result);
> > > > > > +        assert(d->name);
> > > > > > +        assert(result);
> > > > > > +
> > > > > > +        dbus_path = unit_dbus_path_from_name(d->name);
> > > > > > +        if (!dbus_path)
> > > > > > +                return -ENOMEM;
> > > > > > +
> > > > > > +        r = sd_bus_get_property_string(d->bus,
> > > > > > +                                       "org.freedesktop.systemd1",
> > > > > > +                                       dbus_path,
> > > > > > +                                       "org.freedesktop.systemd1.Service",
> > > > > > +                                       "Result",
> > > > > > +                                       NULL,
> > > > > > +                                       &p);
> > > > > > +        if (r < 0)
> > > > > > +                return r;
> > > > > > +
> > > > > > +        *result = p;
> > > > > > +        p = NULL;
> > > > > > +
> > > > > > +        return 0;
> > > > > > +}
> > > > > > +
> > > > > >  static int check_wait_response(BusWaitForJobs *d, bool quiet) {
> > > > > >          int r = 0;
> > > > > >  
> > > > > > @@ -1736,13 +1766,14 @@ static int check_wait_response(BusWaitForJobs *d, bool quiet) {
> > > > > >                          log_error("Operation on or unit type of %s not supported on this system.", strna(d->name));
> > > > > >                  else if (!streq(d->result, "done") && !streq(d->result, "skipped")) {
> > > > > >                          if (d->name) {
> > > > > > -                                bool quotes;
> > > > > > +                                int q;
> > > > > > +                                _cleanup_free_ char *result = NULL;
> > > > > >  
> > > > > > -                                quotes = chars_intersect(d->name, SHELL_NEED_QUOTES);
> > > > > > +                                q = bus_job_get_service_result(d, &result);
> > > > > > +                                if (q < 0)
> > > > > > +                                        log_debug_errno(q, "Failed to get Result property of service %s: %m", d->name);
> > > > > >  
> > > > > > -                                log_error("Job for %s failed. See \"systemctl status %s%s%s\" and \"journalctl -xe\" for details.",
> > > > > > -                                          d->name,
> > > > > > -                                          quotes ? "'" : "", d->name, quotes ? "'" : "");
> > > > > > +                                log_job_error_with_service_result(d->name, result);
> > > > > >                          } else
> > > > > >                                  log_error("Job failed. See \"journalctl -xe\" for details.");
> > > > > >                  }
> > > > > > diff --git a/src/shared/log.c b/src/shared/log.c
> > > > > > index 646a1d6..3219756 100644
> > > > > > --- a/src/shared/log.c
> > > > > > +++ b/src/shared/log.c
> > > > > > @@ -1061,3 +1061,79 @@ void log_received_signal(int level, const struct signalfd_siginfo *si) {
> > > > > >  void log_set_upgrade_syslog_to_journal(bool b) {
> > > > > >          upgrade_syslog_to_journal = b;
> > > > > >  }
> > > > > > +
> > > > > > +typedef enum ServiceResult {
> > > > > > +        SERVICE_SUCCESS,
> > > > > > +        SERVICE_FAILURE_RESOURCES,
> > > > > > +        SERVICE_FAILURE_TIMEOUT,
> > > > > > +        SERVICE_FAILURE_EXIT_CODE,
> > > > > > +        SERVICE_FAILURE_SIGNAL,
> > > > > > +        SERVICE_FAILURE_CORE_DUMP,
> > > > > > +        SERVICE_FAILURE_WATCHDOG,
> > > > > > +        SERVICE_FAILURE_START_LIMIT,
> > > > > > +        _SERVICE_RESULT_MAX,
> > > > > > +        _SERVICE_RESULT_INVALID = -1
> > > > > > +} ServiceResult;
> > > > > 
> > > > > dbus property returns the result as a string. So we need to map this string
> > > > > to an explanation. Going through the intermediate ServiceResult int does
> > > > > not really gain anything... I think things would be simpler if you added a
> > > > > table mapping result to explanation:
> > > > > 
> > > > > static const struct {
> > > > >   const char *result, *explanation;
> > > > > } explanations[] = {
> > > > >   {"timeout", "start timeout was exceeded"},
> > > > >   ...
> > > > >   {}
> > > > > };
> > > > > 
> > > > > and then simply looped over this table. This way it won't be necessary to
> > > > > have three different lists.
> > > > 
> > > > I guess that what you are suggesting will remove a bit of boilerplate code, will
> > > > do that.
> > > > 
> > > > > 
> > > > > This could live simply in systemctl.c. Those are only parts of
> > > > > sentences, nothing generally useful.
> > > > 
> > > > I'd say that having general function for logging about failed dbus job might be
> > > > useful.
> > > > 
> > > > > 
> > > > > > +static const char* const service_result_table[_SERVICE_RESULT_MAX] = {
> > > > > > +        [SERVICE_SUCCESS] = "success",
> > > > > > +        [SERVICE_FAILURE_RESOURCES] = "resources",
> > > > > > +        [SERVICE_FAILURE_TIMEOUT] = "timeout",
> > > > > > +        [SERVICE_FAILURE_EXIT_CODE] = "exit-code",
> > > > > > +        [SERVICE_FAILURE_SIGNAL] = "signal",
> > > > > > +        [SERVICE_FAILURE_CORE_DUMP] = "core-dump",
> > > > > > +        [SERVICE_FAILURE_WATCHDOG] = "watchdog",
> > > > > > +        [SERVICE_FAILURE_START_LIMIT] = "start-limit"
> > > > > > +};
> > > > > > +
> > > > > > +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(service_result, ServiceResult);
> > > > > > +
> > > > > > +static const char* const service_result_error_message_table[_SERVICE_RESULT_MAX] = {
> > > > > > +        [SERVICE_FAILURE_RESOURCES] = "configured resource limit was exceeded",
> > > > > > +        [SERVICE_FAILURE_TIMEOUT] = ,
> > > > > > +        [SERVICE_FAILURE_EXIT_CODE] = "ExecStart process exited with error code",
> > > > > > +        [SERVICE_FAILURE_SIGNAL] = "fatal signal was delivered to ExecStart process",
> > > > > > +        [SERVICE_FAILURE_CORE_DUMP] = "fatal signal was delivered to ExecStart process. Core dumped",
> > > > > 
> > > > > "fatal signal was delivered to ExecStart process and it dumped core"?
> > > > > 
> > > > > > +        [SERVICE_FAILURE_WATCHDOG] = "service failed to sent watchdog ping",
> > > > > "...failed to send..."
> > > > > 
> > > > > > +        [SERVICE_FAILURE_START_LIMIT] = "start of the service was attempted too often too quickly",
> > > > > > +};
> > > > > > +
> > > > > > +void log_job_error_with_service_result(const char* service, const char *result) {
> > > > > > +        bool quotes;
> > > > > > +        _cleanup_free_ char *stream_buffer = NULL;
> > > > > > +        _cleanup_fclose_ FILE *stream = NULL;
> > > > > > +        char *service_maybe_quoted = NULL;
> > > > > > +        size_t size;
> > > > > > +        ServiceResult r;
> > > > > > +
> > > > > > +        assert(service);
> > > > > > +        assert(result);
> > > > > > +
> > > > > > +        quotes = chars_intersect(service, SHELL_NEED_QUOTES);
> > > > > > +
> > > > > > +        service_maybe_quoted = alloca(strlen(service) + 3);
> > > > > > +        sprintf(service_maybe_quoted, "%s%s%s", quotes ? "'" : "", service, quotes ? "'" : "");
> > > > > > +
> > > > > > +        stream = open_memstream(&stream_buffer, &size);
> > > > > > +        if (!stream)
> > > > > > +                return;
> > > > > > +
> > > > > > +        r = service_result_from_string(result);
> > > > > > +
> > > > > > +        assert(r > _SERVICE_RESULT_INVALID && r < _SERVICE_RESULT_MAX);
> > > > > > +
> > > > > > +        fprintf(stream, "Job for %s failed because %s. See \"systemctl status %s\" and \"journalctl -xe\" for details.\n",
> > > > > > +                service,
> > > > > > +                service_result_error_message_table[r],
> > > > > > +                service_maybe_quoted);
> > > > > > +
> > > > > > +        /* For some results maybe additional explanation is required */
> > > > > > +        if (streq_ptr(result, "start-limit"))
> > > > > > +                fprintf(stream, "To force a start please invoke \"systemctl reset-failed %s\" followed by \"systemctl start %s\" again.",
> > > > > > +                        service_maybe_quoted,
> > > > > > +                        service_maybe_quoted);
> > > > > > +
> > > > > > +        fflush(stream);
> > > > > > +
> > > > > > +        log_error("%s", stream_buffer);
> > > > > 
> > > > > I don't understand why memstream is needed. Why not just do log_error(),
> > > > > and optionally follow that by log_info() with the additional hint?
> > > > 
> > > > I used memstream because I did not want log twice in case that function is
> > > > called from context where log target is not tty, i.e. not from systemctl.
> > > I don't think this matters. Logging in two steps vs one matters when
> > > doing structured logging, e.g. to the journal. But when writing to a file,
> > > no matter if a tty or something else, doing it with one or two calls is pretty
> > > much the same thing.
> > 
> > That is exactly my point. I wanted to allow people to call the function from
> > whatever context they want. I didn't assume *anything* about log target and
> > try to do the right thing regardless what it is.
> 
> But this is not a library function that can be called from upredictable contexts.
> It is our internal utility, so we can tailor it to our current needs. If it ever
> needs to be made more generic, we can do it when the need arises.

Hmm...But it is in src/shared/log.c which kind of makes it library function (of
our internal library though), doesn't it?

Michal

> 
> Zbyszek


More information about the systemd-devel mailing list