[patch] app/xdm server crash detect XDM_BROKEN_INTERVAL and ctrl-alt-backspace
vdb at picaros.org
vdb at picaros.org
Mon Sep 13 03:58:44 PDT 2010
Once upon a time xdm was created. This program manages X terminals by
forking 1 or 2 processes per display:
1. a local server 'X :0' if required, and
2. the session manager greeter '-:0'.
The dm.c WaitForChild() loop interpretes the session exit code and
uses a ++startTries>=startAttempts test to detect an unresponsive
server, OPENFAILED_DISPLAY, or a SIGTERM'ed session. Other exit codes
reset startTries = 0.
All worked well until the ps2 mouse appeared. Such input device is
initialized last and after the X listening socket. Thus the session
manager finds the X server, starts the greeter, and waits for its
return. Alas, if the mouse now fails initialization then X quits and
triggers session.c IOErrorHandler() { exit(RESERVER_DISPLAY); }.
The main loop would then call RestartDisplay(d, TRUE) causing a
persistent restart-quit cycle which inhibits any user login.
Somewhere before 2007 this was solved by the crash detect code in dm.c
at the case RESERVER_DISPLAY. A manager exit is considered a crash if
a previous exit occured within the XDM_BROKEN_INTERVAL of 120 second.
RemoveDisplay() is then called to remove the display from the list.
This isn't a perfect solution since crash detection may also be
triggered by the ctrl-alt-backspace server termination. Although a
bit an unfriendly way of quitting X this is the only option available
in case of a garbled display.
The proposed patch http://bugs.freedesktop.org/show_bug.cgi?id=20546
adds a ++reservTries>=reservAttempts test to allow for a few successive
crash-type manager exits. The resource reservAttempts is the number
of times a fatal IO error is allowed in session.c.
The patch also substitutes the RestartDisplay() RemoveDisplay()
sequence by StopDisplay(). The original sequence works since
RestartDisplay(d, TRUE) kills the serverPid. However RemoveDisplay()
directly removes the display from the list which may thus give an
"Unknown child termination" error. StopDisplay() on the other hand
implements the synchronous kill and wait for child exit.
RemoveDisplay() is only to be called if neither server nor session
manager is running.
The patch is included below. I was wondering if someone here would be
willing to inspect and then perhaps to support the patch for inclusion
into app/xdm. If there are questions just ask and I will further
describe the underlying logic.
Signed-off-by: Servaas Vandenberghe
diff --git a/dm.c b/dm.c
index da18800..c0b0031 100644
--- a/dm.c
+++ b/dm.c
@@ -492,6 +492,7 @@ #endif
break;
case OBEYSESS_DISPLAY:
d->startTries = 0;
+ d->reservTries = 0;
Debug ("Display exited with OBEYSESS_DISPLAY\n");
if (d->displayType.lifetime != Permanent ||
d->status == zombie)
@@ -528,24 +529,44 @@ #endif
Debug ("Display exited with RESERVER_DISPLAY\n");
if (d->displayType.origin == FromXDMCP || d->status == zombie)
StopDisplay(d);
- else
- RestartDisplay (d, TRUE);
- {
- Time_t Time;
- time(&Time);
- Debug("time %i %i\n",Time,d->lastCrash);
- if (d->lastCrash &&
- ((Time - d->lastCrash) < XDM_BROKEN_INTERVAL)) {
- Debug("Server crash frequency too high:"
- " removing display %s\n",d->name);
- LogError("Server crash rate too high:"
- " removing display %s\n",d->name);
+ else {
+ Time_t now;
+ int crash;
+
+ time(&now);
+ Debug("time %i %i try %i of %i\n", now, d->lastReserv,
+ d->reservTries, d->reservAttempts);
+ crash = d->lastReserv &&
+ ((now - d->lastReserv) < XDM_BROKEN_INTERVAL);
+
+ if (!crash)
+ d->reservTries = 0;
+
+ if (crash && ++d->reservTries >= d->reservAttempts) {
+ char msg[] = "Server crash frequency too high:"
+ " stopping display";
+ Debug("%s %s\n", msg, d->name);
+ LogError("%s %s\n", msg, d->name);
#if !defined(ARC4_RANDOM) && !defined(DEV_RANDOM)
AddTimerEntropy();
#endif
- RemoveDisplay (d);
+ /* For a local X server either:
+ * 1. The server exit was returned by waitpid(). So
+ * serverPid==-1 => StopDisplay() calls RemoveDisplay()
+ *
+ * 2. The server is a zombie or still running. So
+ * serverPid>1 => StopDisplay()
+ * a. sets status=zombie,
+ * b. kills the server.
+ * The next waitpid() returns this zombie server pid
+ * and the 'case zombie:' below then calls
+ * RemoveDisplay().
+ */
+ StopDisplay(d);
} else
- d->lastCrash = Time;
+ RestartDisplay(d, TRUE);
+
+ d->lastReserv = now;
}
break;
case waitCompose (SIGTERM,0,0):
diff --git a/dm.h b/dm.h
index af50328..ccf0bd8 100644
--- a/dm.h
+++ b/dm.h
@@ -177,7 +177,8 @@ struct display {
pid_t serverPid; /* process id of server (-1 if none) */
FileState state; /* state during HUP processing */
int startTries; /* current start try */
- Time_t lastCrash; /* time of last crash */
+ Time_t lastReserv; /* time of last reserver crash */
+ int reservTries; /* current reserver try */
# ifdef XDMCP
/* XDMCP state */
CARD32 sessionID; /* ID of active session */
@@ -197,6 +198,7 @@ # endif
int openRepeat; /* open attempts to make */
int openTimeout; /* abort open attempt timeout */
int startAttempts; /* number of attempts at starting */
+ int reservAttempts; /* allowed fatal errors after start */
int pingInterval; /* interval between XSync */
int pingTimeout; /* timeout for XSync */
int terminateServer;/* restart for each session */
diff --git a/dpylist.c b/dpylist.c
index 6da882f..dccd679 100644
--- a/dpylist.c
+++ b/dpylist.c
@@ -241,7 +241,9 @@ NewDisplay (char *name, char *class)
d->openTimeout = 0;
d->startAttempts = 0;
d->startTries = 0;
- d->lastCrash = 0;
+ d->lastReserv = 0;
+ d->reservAttempts = 0;
+ d->reservTries = 0;
d->terminateServer = 0;
d->grabTimeout = 0;
#ifdef XDMCP
diff --git a/resource.c b/resource.c
index 5c02da7..a06d742 100644
--- a/resource.c
+++ b/resource.c
@@ -222,6 +222,8 @@ struct displayResource serverResources[]
"120" },
{ "startAttempts","StartAttempts",DM_INT, boffset(startAttempts),
"4" },
+{ "reservAttempts","ReservAttempts",DM_INT, boffset(reservAttempts),
+ "2" },
{ "pingInterval","PingInterval",DM_INT, boffset(pingInterval),
"5" },
{ "pingTimeout","PingTimeout", DM_INT, boffset(pingTimeout),
diff --git a/xdm.man.cpp b/xdm.man.cpp
index 220580b..d393598 100644
--- a/xdm.man.cpp
+++ b/xdm.man.cpp
@@ -445,6 +445,7 @@ See the section
.IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.openRepeat\fP"
.IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.openTimeout\fP"
.IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.startAttempts\fP"
+.IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.reservAttempts\fP"
These numeric resources control the behavior of
.I xdm
when attempting to open intransigent servers. \fBopenDelay\fP is
@@ -463,9 +464,10 @@ This
process is repeated \fBstartAttempts\fP times, at which point the display is
declared dead and disabled. Although
this behavior may seem arbitrary, it has been empirically developed and
-works quite well on most systems. The default values are
-5 for \fBopenDelay\fP, 5 for \fBopenRepeat\fP, 30 for \fBopenTimeout\fP and
-4 for \fBstartAttempts\fP.
+works quite well on most systems. \fBreservAttempts\fP is the number of
+times a fatal error is allowed after connecting. The default values are
+\fBopenDelay\fP: 15, \fBopenRepeat\fP: 5, \fBopenTimeout\fP: 120,
+\fBstartAttempts\fP: 4 and \fBreservAttempts\fP: 2.
.IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.pingInterval\fP"
.IP "\fBDisplayManager.\fP\fIDISPLAY\fP\fB.pingTimeout\fP"
To discover when remote displays disappear,
More information about the xorg-devel
mailing list