[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [Xen-devel] [PATCH] detect and report qemu-dm failure
Keir Fraser writes ("Re: [Xen-devel] [PATCH] detect and report qemu-dm failure"): > This patch doesn't apply to tip. Updated (and given a quick retest). Ian. Signed-off-by: Ian Jackson <ian.jackson@xxxxxxxxxxxxx> diff -r 4ac0898df538 tools/python/xen/xend/XendDomain.py --- a/tools/python/xen/xend/XendDomain.py Thu Jun 12 16:18:31 2008 +0100 +++ b/tools/python/xen/xend/XendDomain.py Thu Jun 12 16:46:56 2008 +0100 @@ -34,7 +34,7 @@ import xen.lowlevel.xc from xen.xend import XendOptions, XendCheckpoint, XendDomainInfo from xen.xend.PrettyPrint import prettyprint -from xen.xend import XendConfig +from xen.xend import XendConfig, image from xen.xend.XendError import XendError, XendInvalidDomain, VmError from xen.xend.XendError import VMBadState from xen.xend.XendLogging import log @@ -178,6 +178,8 @@ class XendDomain: except Exception: log.exception("Failed to create reference to running " "domain id: %d" % dom['domid']) + + image.cleanup_stale_sentinel_fifos() # add all managed domains as dormant domains. for dom in managed: diff -r 4ac0898df538 tools/python/xen/xend/XendLogging.py --- a/tools/python/xen/xend/XendLogging.py Thu Jun 12 16:18:31 2008 +0100 +++ b/tools/python/xen/xend/XendLogging.py Thu Jun 12 16:46:56 2008 +0100 @@ -25,10 +25,10 @@ import types import types import logging import logging.handlers -import fcntl from xen.util import mkdir from xen.xend.server import params +from xen.util import oshelp __all__ = [ 'log', 'init', 'getLogFilename' ] @@ -103,9 +103,7 @@ class XendRotatingFileHandler(logging.ha # entire FileHandler, StreamHandler & RotatingFileHandler classes which # is even worse def setCloseOnExec(self): - flags = fcntl.fcntl(self.stream.fileno(), fcntl.F_GETFD) - flags |= fcntl.FD_CLOEXEC - fcntl.fcntl(self.stream.fileno(), fcntl.F_SETFD, flags) + oshelp.fcntl_setfd_cloexec(self.stream, True) def init(filename, level): diff -r 4ac0898df538 tools/python/xen/xend/image.py --- a/tools/python/xen/xend/image.py Thu Jun 12 16:18:31 2008 +0100 +++ b/tools/python/xen/xend/image.py Thu Jun 12 16:47:49 2008 +0100 @@ -22,6 +22,12 @@ import math import math import time import signal +import thread +import fcntl +import sys +import errno +import glob +import traceback import xen.lowlevel.xc from xen.xend.XendConstants import * @@ -32,11 +38,23 @@ from xen.xend.xenstore.xswatch import xs from xen.xend.xenstore.xswatch import xswatch from xen.xend import arch from xen.xend import XendOptions +from xen.util import oshelp +from xen.util import utils xc = xen.lowlevel.xc.xc() MAX_GUEST_CMDLINE = 1024 +sentinel_path_prefix = '/var/run/xend/dm-' +sentinel_fifos_inuse = { } + +def cleanup_stale_sentinel_fifos(): + for path in glob.glob(sentinel_path_prefix + '*.fifo'): + if path in sentinel_fifos_inuse: continue + try: os.unlink(path) + except OSError, e: + log.warning('could not delete stale fifo %s: %s', + path, utils.exception_string(e)) def create(vm, vmConfig): """Create an image handler for a vm. @@ -324,6 +342,13 @@ class ImageHandler: args = args + self.dmargs return args + def _openSentinel(self, sentinel_path_fifo): + self.sentinel_fifo = file(sentinel_path_fifo, 'r') + self.sentinel_lock = thread.allocate_lock() + oshelp.fcntl_setfd_cloexec(self.sentinel_fifo, True) + sentinel_fifos_inuse[sentinel_path_fifo] = 1 + self.sentinel_path_fifo = sentinel_path_fifo + def createDeviceModel(self, restore = False): if self.device_model is None: return @@ -339,21 +364,29 @@ class ImageHandler: env['XAUTHORITY'] = self.xauthority if self.vncconsole: args = args + ([ "-vncviewer" ]) + unique_id = "%i-%i" % (self.vm.getDomid(), time.time()) + sentinel_path = sentinel_path_prefix + unique_id + sentinel_path_fifo = sentinel_path + '.fifo' + os.mkfifo(sentinel_path_fifo, 0600) + sentinel_write = file(sentinel_path_fifo, 'r+') + self._openSentinel(sentinel_path_fifo) + self.vm.storeDom("image/device-model-fifo", sentinel_path_fifo) xstransact.Mkdir("/local/domain/0/device-model/%i" % self.vm.getDomid()) xstransact.SetPermissions("/local/domain/0/device-model/%i" % self.vm.getDomid(), { 'dom': self.vm.getDomid(), 'read': True, 'write': True }) log.info("spawning device models: %s %s", self.device_model, args) # keep track of pid and spawned options to kill it later - logfile = "/var/log/xen/qemu-dm-%s.log" % str(self.vm.info['name_label']) - if os.path.exists(logfile): - if os.path.exists(logfile + ".1"): - os.unlink(logfile + ".1") - os.rename(logfile, logfile + ".1") + self.logfile = "/var/log/xen/qemu-dm-%s.log" % str(self.vm.info['name_label']) + if os.path.exists(self.logfile): + if os.path.exists(self.logfile + ".1"): + os.unlink(self.logfile + ".1") + os.rename(self.logfile, self.logfile + ".1") null = os.open("/dev/null", os.O_RDONLY) - logfd = os.open(logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC) + logfd = os.open(self.logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC|os.O_APPEND) + sys.stderr.flush() pid = os.fork() if pid == 0: #child try: @@ -362,18 +395,26 @@ class ImageHandler: os.dup2(logfd, 2) os.close(null) os.close(logfd) + self.sentinel_fifo.close() try: os.execve(self.device_model, args, env) - except: - os._exit(127) + except Exception, e: + print >>sys.stderr, ( + 'failed to set up fds or execute dm %s: %s' % + (self.device_model, utils.exception_string(e))) + os._exit(126) except: os._exit(127) else: self.pid = pid os.close(null) os.close(logfd) + sentinel_write.close() self.vm.storeDom("image/device-model-pid", self.pid) log.info("device model pid: %d", self.pid) + # we would very much prefer not to have a thread here and instead + # have a callback but sadly we don't have Twisted in xend + self.sentinel_thread = thread.start_new_thread(self._sentinel_watch,()) def signalDeviceModel(self, cmd, ret, par = None): if self.device_model is None: @@ -419,44 +460,116 @@ class ImageHandler: xstransact.Store("/local/domain/0/device-model/%i" % self.vm.getDomid(), ('command', 'continue')) + def _dmfailed(self, message): + log.warning("domain %s: %s", self.vm.getName(), message) + # ideally we would like to forcibly crash the domain with + # something like + # xc.domain_shutdown(self.vm.getDomid(), DOMAIN_CRASH) + # but this can easily lead to very rapid restart loops against + # which we currently have no protection + def recreate(self): if self.device_model is None: return - self.pid = self.vm.gatherDom(('image/device-model-pid', int)) + name = self.vm.getName() + sentinel_path_fifo = self.vm.readDom('image/device-model-fifo') + fifo_fd = -1 + log.debug("rediscovering %s", sentinel_path_fifo) + if sentinel_path_fifo is None: + log.debug("%s device model no sentinel, cannot rediscover", name) + else: + try: + # We open it O_WRONLY because that fails ENXIO if no-one + # has it open for reading (see SuSv3). The dm process got + # a read/write descriptor from our earlier invocation. + fifo_fd = os.open(sentinel_path_fifo, os.O_WRONLY|os.O_NONBLOCK) + except OSError, e: + if e.errno == errno.ENXIO: + self._dmfailed("%s device model no longer running"%name) + elif e.errno == errno.ENOENT: + log.debug("%s device model sentinel %s absent!", + name, sentinel_path_fifo) + else: + raise + if fifo_fd >= 0: + self._openSentinel(sentinel_path_fifo) + os.close(fifo_fd) + self.pid = self.vm.gatherDom(('image/device-model-pid', int)) + log.debug("%s device model rediscovered, pid %s sentinel fifo %s", + name, self.pid, sentinel_path_fifo) + self.sentinel_thread = thread.start_new_thread(self._sentinel_watch,()) + + def _sentinel_watch(self): + log.info("waiting for sentinel_fifo") + try: self.sentinel_fifo.read(1) + except OSError, e: pass + self.sentinel_lock.acquire() + try: + if self.pid: + (p,st) = os.waitpid(self.pid, os.WNOHANG) + if p == self.pid: + message = oshelp.waitstatus_description(st) + else: + # obviously it is malfunctioning, kill it now + try: + os.kill(self.pid, signal.SIGKILL) + message = "malfunctioning (closed sentinel), killed" + except: + message = "malfunctioning or died ?" + message = "pid %d: %s" % (self.pid, message) + else: + message = "no longer running" + except Exception, e: + message = "waitpid failed: %s" % utils.exception_string(e) + message = "device model failure: %s" % message + try: message += "; see %s " % self.logfile + except: pass + self._dmfailed(message) + self.pid = None + self.sentinel_lock.release() def destroyDeviceModel(self): if self.device_model is None: return if self.pid: + self.sentinel_lock.acquire() try: - os.kill(self.pid, signal.SIGHUP) - except OSError, exn: - log.exception(exn) - try: - # Try to reap the child every 100ms for 10s. Then SIGKILL it. - for i in xrange(100): - (p, rv) = os.waitpid(self.pid, os.WNOHANG) - if p == self.pid: - break - time.sleep(0.1) - else: - log.warning("DeviceModel %d took more than 10s " - "to terminate: sending SIGKILL" % self.pid) + try: + os.kill(self.pid, signal.SIGHUP) + except OSError, exn: + log.exception(exn) + try: + # Try to reap the child every 100ms for 10s. Then SIGKILL it. + for i in xrange(100): + (p, rv) = os.waitpid(self.pid, os.WNOHANG) + if p == self.pid: + break + time.sleep(0.1) + else: + log.warning("DeviceModel %d took more than 10s " + "to terminate: sending SIGKILL" % self.pid) + os.kill(self.pid, signal.SIGKILL) + os.waitpid(self.pid, 0) + except OSError, exn: + # This is expected if Xend has been restarted within the + # life of this domain. In this case, we can kill the process, + # but we can't wait for it because it's not our child. + # We just make really sure it's going away (SIGKILL) first. os.kill(self.pid, signal.SIGKILL) - os.waitpid(self.pid, 0) - except OSError, exn: - # This is expected if Xend has been restarted within the - # life of this domain. In this case, we can kill the process, - # but we can't wait for it because it's not our child. - # We just make really sure it's going away (SIGKILL) first. - os.kill(self.pid, signal.SIGKILL) - self.pid = None - state = xstransact.Remove("/local/domain/0/device-model/%i" - % self.vm.getDomid()) + state = xstransact.Remove("/local/domain/0/device-model/%i" + % self.vm.getDomid()) + finally: + self.pid = None + self.sentinel_lock.release() try: os.unlink('/var/run/tap/qemu-read-%d' % self.vm.getDomid()) os.unlink('/var/run/tap/qemu-write-%d' % self.vm.getDomid()) + except: + pass + try: + del sentinel_fifos_inuse[self.sentinel_path_fifo] + os.unlink(self.sentinel_path_fifo) except: pass diff -r 4ac0898df538 tools/python/xen/util/oshelp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/python/xen/util/oshelp.py Thu Jun 12 16:46:56 2008 +0100 @@ -0,0 +1,20 @@ +import fcntl +import os + +def fcntl_setfd_cloexec(file, bool): + f = fcntl.fcntl(file, fcntl.F_GETFD) + if bool: f |= fcntl.FD_CLOEXEC + else: f &= ~fcntl.FD_CLOEXEC + fcntl.fcntl(file, fcntl.F_SETFD) + +def waitstatus_description(st): + if os.WIFEXITED(st): + es = os.WEXITSTATUS(st) + if es: return "exited with nonzero status %i" % es + else: return "exited" + elif os.WIFSIGNALED(st): + s = "died due to signal %i" % os.WTERMSIG(st) + if os.WCOREDUMP(st): s += " (core dumped)" + return s + else: + return "failed with unexpected wait status %i" % st diff -r 4ac0898df538 tools/python/xen/util/utils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/python/xen/util/utils.py Thu Jun 12 16:46:56 2008 +0100 @@ -0,0 +1,6 @@ +import traceback +import sys + +def exception_string(e): + (ty,v,tb) = sys.exc_info() + return traceback.format_exception_only(ty,v) _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |