[patch] xdm 1.1.8 server crash detect
vdb128 at picaros.org
vdb128 at picaros.org
Mon Apr 27 07:37:28 PDT 2009
The patch below proposes two changes in the code for detecting an
X server crash. These changes make my not so reliable
multi-server-seat setup a bit more workable.
I posted a summary at
http://bugs.freedesktop.org/show_bug.cgi?id=20546
but the text is actually longer than the patch itself.
Since the code is self contained I was wondering if someone here
would be willing to inspect and then perhaps to support the patch
for inclusion into the X app/xdm tree. If there are questions
just ask and I will describe the underlying logic.
Changes:
1. RemoveDisplay() is changed into StopDisplay() thus enabling
the wait for child exit synchronism.
2. The resource reservAttempts is added as to allow for a few
successive crash-type manager exits.
diff --git a/dm.c b/dm.c
index 52b49eb..daec863 100644
--- a/dm.c
+++ b/dm.c
@@ -476,6 +476,7 @@ #endif
break;
case OBEYSESS_DISPLAY:
d->startTries = 0;
+ d->reservTries = 0;
Debug ("Display exited with OBEYSESS_DISPLAY\n");
if (d->displayType.lifetime != Permanent ||
d->status == zombie)
@@ -512,24 +513,42 @@ #endif
Debug ("Display exited with RESERVER_DISPLAY\n");
if (d->displayType.origin == FromXDMCP || d->status == zombie)
StopDisplay(d);
- else
- RestartDisplay (d, TRUE);
- {
- Time_t Time;
- time(&Time);
- Debug("time %i %i\n",Time,d->lastCrash);
- if (d->lastCrash &&
- ((Time - d->lastCrash) < XDM_BROKEN_INTERVAL)) {
- Debug("Server crash frequency too high:"
- " removing display %s\n",d->name);
- LogError("Server crash rate too high:"
- " removing display %s\n",d->name);
+ else {
+ Time_t now;
+ int crash;
+
+ time(&now);
+ Debug("time %i %i try %i of %i\n", now, d->lastReserv,
+ d->reservTries, d->reservAttempts);
+ crash = d->lastReserv &&
+ ((now - d->lastReserv) < XDM_BROKEN_INTERVAL);
+
+ if (!crash) d->reservTries = 0;
+
+ if (crash && ++d->reservTries >= d->reservAttempts) {
+ char msg[]="Server crash frequency too high:"
+ " stopping display";
+ Debug("%s %s\n",msg,d->name);
+ LogError("%s %s\n",msg,d->name);
#if !defined(ARC4_RANDOM) && !defined(DEV_RANDOM)
AddTimerEntropy();
#endif
- RemoveDisplay (d);
+ /* For a local X server either:
+ * 1. The server exit was returned by waitpid(). So
+ * serverPid==-1 => StopDisplay() calls RemoveDisplay()
+ *
+ * 2. The server is a zombie or still running. So
+ * serverPid>1 => StopDisplay()
+ * - sets status=zombie,
+ * - kills the server.
+ * The next waitpid() returns this zombie server pid
+ * and the code below then calls RemoveDisplay().
+ */
+ StopDisplay(d);
} else
- d->lastCrash = Time;
+ RestartDisplay (d, TRUE);
+
+ d->lastReserv = now;
}
break;
case waitCompose (SIGTERM,0,0):
diff --git a/dm.h b/dm.h
index 5fa13e4..d472479 100644
--- a/dm.h
+++ b/dm.h
@@ -179,7 +179,8 @@ struct display {
pid_t serverPid; /* process id of server (-1 if none) */
FileState state; /* state during HUP processing */
int startTries; /* current start try */
- Time_t lastCrash; /* time of last crash */
+ Time_t lastReserv; /* time of last reserver crash */
+ int reservTries; /* current reserver try */
#ifdef XDMCP
/* XDMCP state */
CARD32 sessionID; /* ID of active session */
@@ -199,6 +200,7 @@ #endif
int openRepeat; /* open attempts to make */
int openTimeout; /* abort open attempt timeout */
int startAttempts; /* number of attempts at starting */
+ int reservAttempts; /* allowed fatal errors after start */
int pingInterval; /* interval between XSync */
int pingTimeout; /* timeout for XSync */
int terminateServer;/* restart for each session */
diff --git a/dpylist.c b/dpylist.c
index 6aca2ee..9c92294 100644
--- a/dpylist.c
+++ b/dpylist.c
@@ -245,7 +245,9 @@ NewDisplay (char *name, char *class)
d->openTimeout = 0;
d->startAttempts = 0;
d->startTries = 0;
- d->lastCrash = 0;
+ d->lastReserv = 0;
+ d->reservAttempts = 0;
+ d->reservTries = 0;
d->terminateServer = 0;
d->grabTimeout = 0;
#ifdef XDMCP
diff --git a/resource.c b/resource.c
index d38fe07..aef7624 100644
--- a/resource.c
+++ b/resource.c
@@ -284,6 +284,8 @@ struct displayResource serverResources[]
"120" },
{ "startAttempts","StartAttempts",DM_INT, boffset(startAttempts),
"4" },
+{ "reservAttempts","ReservAttempts",DM_INT, boffset(reservAttempts),
+ "2" },
{ "pingInterval","PingInterval",DM_INT, boffset(pingInterval),
"5" },
{ "pingTimeout","PingTimeout", DM_INT, boffset(pingTimeout),
diff --git a/xdm.man.cpp b/xdm.man.cpp
index ab511b7..18649b6 100644
--- a/xdm.man.cpp
+++ b/xdm.man.cpp
@@ -448,6 +448,7 @@ See the section
.IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.openRepeat\fP"
.IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.openTimeout\fP"
.IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.startAttempts\fP"
+.IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.reservAttempts\fP"
These numeric resources control the behavior of
.I xdm
when attempting to open intransigent servers. \fBopenDelay\fP is
@@ -466,9 +467,10 @@ This
process is repeated \fBstartAttempts\fP times, at which point the display is
declared dead and disabled. Although
this behavior may seem arbitrary, it has been empirically developed and
-works quite well on most systems. The default values are
-5 for \fBopenDelay\fP, 5 for \fBopenRepeat\fP, 30 for \fBopenTimeout\fP and
-4 for \fBstartAttempts\fP.
+works quite well on most systems. \fBreservAttempts\fP is the number of
+times a fatal error is allowed after connecting. The default values are
+\fBopenDelay\fP: 15, \fBopenRepeat\fP: 5, \fBopenTimeout\fP: 120,
+\fBstartAttempts\fP: 4 and \fBreservAttempts\fP: 2.
.IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.pingInterval\fP"
.IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.pingTimeout\fP"
To discover when remote displays disappear,
More information about the xorg-devel
mailing list