[patch] xdm 1.1.8 server crash detect

vdb128 at picaros.org vdb128 at picaros.org
Mon Apr 27 07:37:28 PDT 2009


The patch below proposes two changes in the code for detecting an 
X server crash.  These changes make my not so reliable 
multi-server-seat setup a bit more workable.  

I posted a summary at 
http://bugs.freedesktop.org/show_bug.cgi?id=20546
but the text is actually longer than the patch itself.  

Since the code is self contained I was wondering if someone here 
would be willing to inspect and then perhaps to support the patch 
for inclusion into the X app/xdm tree.  If there are questions 
just ask and I will describe the underlying logic.  

Changes:
1. RemoveDisplay() is changed into StopDisplay() thus enabling 
   the wait for child exit synchronism.  
2. The resource reservAttempts is added as to allow for a few 
   successive crash-type manager exits.  

diff --git a/dm.c b/dm.c
index 52b49eb..daec863 100644
--- a/dm.c
+++ b/dm.c
@@ -476,6 +476,7 @@ #endif
 		break;
 	    case OBEYSESS_DISPLAY:
 		d->startTries = 0;
+		d->reservTries = 0;
 		Debug ("Display exited with OBEYSESS_DISPLAY\n");
 		if (d->displayType.lifetime != Permanent ||
 		    d->status == zombie)
@@ -512,24 +513,42 @@ #endif
 		Debug ("Display exited with RESERVER_DISPLAY\n");
 		if (d->displayType.origin == FromXDMCP || d->status == zombie)
 		    StopDisplay(d);
-		else
-		    RestartDisplay (d, TRUE);
-		{
-		  Time_t Time;
-		  time(&Time);
-		  Debug("time %i %i\n",Time,d->lastCrash);
-		  if (d->lastCrash && 
-		      ((Time - d->lastCrash) < XDM_BROKEN_INTERVAL)) {
-		    Debug("Server crash frequency too high:"
-			  " removing display %s\n",d->name);
-		    LogError("Server crash rate too high:"
-			     " removing display %s\n",d->name);
+		else {
+		  Time_t now;
+		  int crash;
+
+		  time(&now);
+		  Debug("time %i %i try %i of %i\n", now, d->lastReserv,
+			d->reservTries, d->reservAttempts);
+		  crash = d->lastReserv && 
+		    ((now - d->lastReserv) < XDM_BROKEN_INTERVAL);
+
+		  if (!crash) d->reservTries = 0;
+
+		  if (crash && ++d->reservTries >= d->reservAttempts) {
+		    char msg[]="Server crash frequency too high:"
+		      " stopping display";
+		    Debug("%s %s\n",msg,d->name);
+		    LogError("%s %s\n",msg,d->name);
 #if !defined(ARC4_RANDOM) && !defined(DEV_RANDOM)
 		    AddTimerEntropy();
 #endif
-		    RemoveDisplay (d);
+		    /* For a local X server either:
+		     * 1. The server exit was returned by waitpid().  So
+		     *    serverPid==-1 => StopDisplay() calls RemoveDisplay()
+		     *
+		     * 2. The server is a zombie or still running.  So
+		     *    serverPid>1 => StopDisplay() 
+		     *                   - sets status=zombie, 
+		     *                   - kills the server.  
+		     *    The next waitpid() returns this zombie server pid 
+		     *    and the code below then calls RemoveDisplay().
+		     */
+		    StopDisplay(d);
 		  } else 
-		    d->lastCrash = Time;
+		    RestartDisplay (d, TRUE);
+
+		  d->lastReserv = now;
 		}
 		break;
 	    case waitCompose (SIGTERM,0,0):
diff --git a/dm.h b/dm.h
index 5fa13e4..d472479 100644
--- a/dm.h
+++ b/dm.h
@@ -179,7 +179,8 @@ struct display {
 	pid_t		serverPid;	/* process id of server (-1 if none) */
 	FileState	state;		/* state during HUP processing */
 	int		startTries;	/* current start try */
-        Time_t          lastCrash;      /* time of last crash */
+        Time_t          lastReserv;     /* time of last reserver crash */
+	int		reservTries;	/* current reserver try */
 #ifdef XDMCP
 	/* XDMCP state */
 	CARD32		sessionID;	/* ID of active session */
@@ -199,6 +200,7 @@ #endif
 	int		openRepeat;	/* open attempts to make */
 	int		openTimeout;	/* abort open attempt timeout */
 	int		startAttempts;	/* number of attempts at starting */
+	int		reservAttempts;	/* allowed fatal errors after start */
 	int		pingInterval;	/* interval between XSync */
 	int		pingTimeout;	/* timeout for XSync */
 	int		terminateServer;/* restart for each session */
diff --git a/dpylist.c b/dpylist.c
index 6aca2ee..9c92294 100644
--- a/dpylist.c
+++ b/dpylist.c
@@ -245,7 +245,9 @@ NewDisplay (char *name, char *class)
     d->openTimeout = 0;
     d->startAttempts = 0;
     d->startTries = 0;
-    d->lastCrash = 0;
+    d->lastReserv = 0;
+    d->reservAttempts = 0;
+    d->reservTries = 0;
     d->terminateServer = 0;
     d->grabTimeout = 0;
 #ifdef XDMCP
diff --git a/resource.c b/resource.c
index d38fe07..aef7624 100644
--- a/resource.c
+++ b/resource.c
@@ -284,6 +284,8 @@ struct displayResource serverResources[]
 				"120" },
 { "startAttempts","StartAttempts",DM_INT,	boffset(startAttempts),
 				"4" },
+{ "reservAttempts","ReservAttempts",DM_INT,	boffset(reservAttempts),
+				"2" },
 { "pingInterval","PingInterval",DM_INT,		boffset(pingInterval),
 				"5" },
 { "pingTimeout","PingTimeout",	DM_INT,		boffset(pingTimeout),
diff --git a/xdm.man.cpp b/xdm.man.cpp
index ab511b7..18649b6 100644
--- a/xdm.man.cpp
+++ b/xdm.man.cpp
@@ -448,6 +448,7 @@ See the section
 .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.openRepeat\fP"
 .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.openTimeout\fP"
 .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.startAttempts\fP"
+.IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.reservAttempts\fP"
 These numeric resources control the behavior of
 .I xdm
 when attempting to open intransigent servers.  \fBopenDelay\fP is
@@ -466,9 +467,10 @@ This
 process is repeated \fBstartAttempts\fP times, at which point the display is
 declared dead and disabled.  Although
 this behavior may seem arbitrary, it has been empirically developed and
-works quite well on most systems.  The default values are
-5 for \fBopenDelay\fP, 5 for \fBopenRepeat\fP, 30 for \fBopenTimeout\fP and
-4 for \fBstartAttempts\fP.
+works quite well on most systems.  \fBreservAttempts\fP is the number of 
+times a fatal error is allowed after connecting.  The default values are
+\fBopenDelay\fP: 15, \fBopenRepeat\fP: 5, \fBopenTimeout\fP: 120, 
+\fBstartAttempts\fP: 4 and \fBreservAttempts\fP: 2.
 .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.pingInterval\fP"
 .IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.pingTimeout\fP"
 To discover when remote displays disappear,


More information about the xorg-devel mailing list