[patch] app/xdm server crash detect XDM_BROKEN_INTERVAL and ctrl-alt-backspace

vdb at picaros.org vdb at picaros.org
Mon Sep 13 03:58:44 PDT 2010


Once upon a time xdm was created.  This program manages X terminals by 
forking 1 or 2 processes per display:

  1. a local server 'X :0' if required, and 
  2. the session manager greeter '-:0'.

The dm.c WaitForChild() loop interpretes the session exit code and 
uses a ++startTries>=startAttempts test to detect an unresponsive 
server, OPENFAILED_DISPLAY, or a SIGTERM'ed session.  Other exit codes 
reset startTries = 0.  

All worked well until the ps2 mouse appeared.  Such input device is 
initialized last and after the X listening socket.  Thus the session 
manager finds the X server, starts the greeter, and waits for its 
return.  Alas, if the mouse now fails initialization then X quits and 
triggers session.c IOErrorHandler() { exit(RESERVER_DISPLAY); }.  
The main loop would then call RestartDisplay(d, TRUE) causing a 
persistent restart-quit cycle which inhibits any user login.  

Somewhere before 2007 this was solved by the crash detect code in dm.c 
at the case RESERVER_DISPLAY.  A manager exit is considered a crash if 
a previous exit occured within the XDM_BROKEN_INTERVAL of 120 second.  
RemoveDisplay() is then called to remove the display from the list.  

This isn't a perfect solution since crash detection may also be 
triggered by the ctrl-alt-backspace server termination.  Although a 
bit an unfriendly way of quitting X this is the only option available 
in case of a garbled display.  

The proposed patch http://bugs.freedesktop.org/show_bug.cgi?id=20546
adds a ++reservTries>=reservAttempts test to allow for a few successive 
crash-type manager exits.  The resource reservAttempts is the number 
of times a fatal IO error is allowed in session.c.  

The patch also substitutes the RestartDisplay() RemoveDisplay() 
sequence by StopDisplay().  The original sequence works since 
RestartDisplay(d, TRUE) kills the serverPid.  However RemoveDisplay() 
directly removes the display from the list which may thus give an 
"Unknown child termination" error.  StopDisplay() on the other hand 
implements the synchronous kill and wait for child exit.  
RemoveDisplay() is only to be called if neither server nor session 
manager is running.  

The patch is included below.  I was wondering if someone here would be 
willing to inspect and then perhaps to support the patch for inclusion 
into app/xdm.  If there are questions just ask and I will further 
describe the underlying logic.  

Signed-off-by: Servaas Vandenberghe

diff --git a/dm.c b/dm.c
index da18800..c0b0031 100644
--- a/dm.c
+++ b/dm.c
@@ -492,6 +492,7 @@ #endif
 		break;
 	    case OBEYSESS_DISPLAY:
 		d->startTries = 0;
+		d->reservTries = 0;
 		Debug ("Display exited with OBEYSESS_DISPLAY\n");
 		if (d->displayType.lifetime != Permanent ||
 		    d->status == zombie)
@@ -528,24 +529,44 @@ #endif
 		Debug ("Display exited with RESERVER_DISPLAY\n");
 		if (d->displayType.origin == FromXDMCP || d->status == zombie)
 		    StopDisplay(d);
-		else
-		    RestartDisplay (d, TRUE);
-		{
-		  Time_t Time;
-		  time(&Time);
-		  Debug("time %i %i\n",Time,d->lastCrash);
-		  if (d->lastCrash &&
-		      ((Time - d->lastCrash) < XDM_BROKEN_INTERVAL)) {
-		    Debug("Server crash frequency too high:"
-			  " removing display %s\n",d->name);
-		    LogError("Server crash rate too high:"
-			     " removing display %s\n",d->name);
+		else {
+		  Time_t now;
+		  int crash;
+
+		  time(&now);
+		  Debug("time %i %i try %i of %i\n", now, d->lastReserv,
+			d->reservTries, d->reservAttempts);
+		  crash = d->lastReserv &&
+		    ((now - d->lastReserv) < XDM_BROKEN_INTERVAL);
+
+		  if (!crash)
+		    d->reservTries = 0;
+
+		  if (crash && ++d->reservTries >= d->reservAttempts) {
+		    char msg[] = "Server crash frequency too high:"
+		      " stopping display";
+		    Debug("%s %s\n", msg, d->name);
+		    LogError("%s %s\n", msg, d->name);
 #if !defined(ARC4_RANDOM) && !defined(DEV_RANDOM)
 		    AddTimerEntropy();
 #endif
-		    RemoveDisplay (d);
+		    /* For a local X server either:
+		     * 1. The server exit was returned by waitpid().  So
+		     *    serverPid==-1 => StopDisplay() calls RemoveDisplay()
+		     *
+		     * 2. The server is a zombie or still running.  So
+		     *    serverPid>1 => StopDisplay()
+		     *                   a. sets status=zombie,
+		     *                   b. kills the server.
+		     *    The next waitpid() returns this zombie server pid
+		     *    and the 'case zombie:' below then calls 
+		     *    RemoveDisplay().
+		     */
+		    StopDisplay(d);
 		  } else
-		    d->lastCrash = Time;
+		    RestartDisplay(d, TRUE);
+
+		  d->lastReserv = now;
 		}
 		break;
 	    case waitCompose (SIGTERM,0,0):
diff --git a/dm.h b/dm.h
index af50328..ccf0bd8 100644
--- a/dm.h
+++ b/dm.h
@@ -177,7 +177,8 @@ struct display {
 	pid_t		serverPid;	/* process id of server (-1 if none) */
 	FileState	state;		/* state during HUP processing */
 	int		startTries;	/* current start try */
-        Time_t          lastCrash;      /* time of last crash */
+	Time_t          lastReserv;     /* time of last reserver crash */
+	int		reservTries;	/* current reserver try */
 # ifdef XDMCP
 	/* XDMCP state */
 	CARD32		sessionID;	/* ID of active session */
@@ -197,6 +198,7 @@ # endif
 	int		openRepeat;	/* open attempts to make */
 	int		openTimeout;	/* abort open attempt timeout */
 	int		startAttempts;	/* number of attempts at starting */
+	int		reservAttempts;	/* allowed fatal errors after start */
 	int		pingInterval;	/* interval between XSync */
 	int		pingTimeout;	/* timeout for XSync */
 	int		terminateServer;/* restart for each session */
diff --git a/dpylist.c b/dpylist.c
index 6da882f..dccd679 100644
--- a/dpylist.c
+++ b/dpylist.c
@@ -241,7 +241,9 @@ NewDisplay (char *name, char *class)
     d->openTimeout = 0;
     d->startAttempts = 0;
     d->startTries = 0;
-    d->lastCrash = 0;
+    d->lastReserv = 0;
+    d->reservAttempts = 0;
+    d->reservTries = 0;
     d->terminateServer = 0;
     d->grabTimeout = 0;
 #ifdef XDMCP
diff --git a/resource.c b/resource.c
index 5c02da7..a06d742 100644
--- a/resource.c
+++ b/resource.c
@@ -222,6 +222,8 @@ struct displayResource serverResources[]
 				"120" },
 { "startAttempts","StartAttempts",DM_INT,	boffset(startAttempts),
 				"4" },
+{ "reservAttempts","ReservAttempts",DM_INT,	boffset(reservAttempts),
+				"2" },
 { "pingInterval","PingInterval",DM_INT,		boffset(pingInterval),
 				"5" },
 { "pingTimeout","PingTimeout",	DM_INT,		boffset(pingTimeout),
diff --git a/xdm.man.cpp b/xdm.man.cpp
index 220580b..d393598 100644
--- a/xdm.man.cpp
+++ b/xdm.man.cpp
@@ -445,6 +445,7 @@ See the section
 .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.openRepeat\fP"
 .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.openTimeout\fP"
 .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.startAttempts\fP"
+.IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.reservAttempts\fP"
 These numeric resources control the behavior of
 .I xdm
 when attempting to open intransigent servers.  \fBopenDelay\fP is
@@ -463,9 +464,10 @@ This
 process is repeated \fBstartAttempts\fP times, at which point the display is
 declared dead and disabled.  Although
 this behavior may seem arbitrary, it has been empirically developed and
-works quite well on most systems.  The default values are
-5 for \fBopenDelay\fP, 5 for \fBopenRepeat\fP, 30 for \fBopenTimeout\fP and
-4 for \fBstartAttempts\fP.
+works quite well on most systems.  \fBreservAttempts\fP is the number of 
+times a fatal error is allowed after connecting.  The default values are
+\fBopenDelay\fP: 15, \fBopenRepeat\fP: 5, \fBopenTimeout\fP: 120, 
+\fBstartAttempts\fP: 4 and \fBreservAttempts\fP: 2.
 .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.pingInterval\fP"
 .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.pingTimeout\fP"
 To discover when remote displays disappear,


More information about the xorg-devel mailing list