[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-changelog] Added a basic integrity checker, and some basic ability to recover from store corruption
# HG changeset patch # User emellor@xxxxxxxxxxxxxxxxxxxxxx # Node ID afd7d4ba113bf96bca099111fa62335165bd2288 # Parent 2142adf67d6a62fbcb70388bbbcb591d438cfa17 Added a basic integrity checker, and some basic ability to recover from store corruption, rather than just spewing error messages and exiting. Added a xenstore-control executable, which sends commands to xenstored. Currently, the only command is 'check', which triggers an integrity check. (The integrity check is also triggered whenever a corrupted store is detected). Signed-off-by: Ewan Mellor <ewan@xxxxxxxxxxxxx> diff -r 2142adf67d6a -r afd7d4ba113b .hgignore --- a/.hgignore Wed Mar 8 22:56:55 2006 +++ b/.hgignore Wed Mar 8 22:57:02 2006 @@ -162,6 +162,7 @@ ^tools/xenstore/xenstore-read$ ^tools/xenstore/xenstore-rm$ ^tools/xenstore/xenstore-write$ +^tools/xenstore/xenstore-control$ ^tools/xenstore/xenstore-ls$ ^tools/xenstore/xenstored$ ^tools/xenstore/xenstored_test$ diff -r 2142adf67d6a -r afd7d4ba113b tools/xenstore/Makefile --- a/tools/xenstore/Makefile Wed Mar 8 22:56:55 2006 +++ b/tools/xenstore/Makefile Wed Mar 8 22:57:02 2006 @@ -27,7 +27,10 @@ CLIENTS += xenstore-write CLIENTS_OBJS := $(patsubst xenstore-%,xenstore_%.o,$(CLIENTS)) -all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump xenstore-ls +all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump xenstore-control xenstore-ls + +test_interleaved_transactions: test_interleaved_transactions.o + $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@ testcode: xs_test xenstored_test xs_random @@ -35,13 +38,16 @@ $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -lxenctrl -o $@ $(CLIENTS): xenstore-%: xenstore_%.o libxenstore.so - $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -lxenctrl -L. -lxenstore -o $@ + $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@ $(CLIENTS_OBJS): xenstore_%.o: xenstore_client.c $(COMPILE.c) -DCLIENT_$(*F) -o $@ $< +xenstore-control: xenstore_control.o libxenstore.so + $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. 
-lxenstore -o $@ + xenstore-ls: xsls.o libxenstore.so - $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -lxenctrl -L. -lxenstore -o $@ + $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@ xenstored_test: xenstored_core_test.o xenstored_watch_test.o xenstored_domain_test.o xenstored_transaction_test.o xs_lib.o talloc_test.o fake_libxc.o utils.o tdb.o $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -o $@ @@ -77,7 +83,8 @@ clean: testsuite-clean rm -f *.o *.opic *.so rm -f xenstored xs_random xs_stress xs_crashme - rm -f xs_test xenstored_test xs_tdb_dump xenstore-ls $(CLIENTS) + rm -f xs_test xenstored_test xs_tdb_dump xenstore-control xenstore-ls + rm -f $(CLIENTS) $(RM) $(PROG_DEP) print-dir: @@ -129,7 +136,7 @@ tarball: clean cd .. && tar -c -j -v -h -f xenstore.tar.bz2 xenstore/ -install: libxenstore.so xenstored xenstore-ls $(CLIENTS) +install: all $(INSTALL_DIR) -p $(DESTDIR)/var/run/xenstored $(INSTALL_DIR) -p $(DESTDIR)/var/lib/xenstored $(INSTALL_DIR) -p $(DESTDIR)/usr/bin @@ -137,6 +144,7 @@ $(INSTALL_DIR) -p $(DESTDIR)/usr/include $(INSTALL_PROG) xenstored $(DESTDIR)/usr/sbin $(INSTALL_PROG) $(CLIENTS) $(DESTDIR)/usr/bin + $(INSTALL_PROG) xenstore-control $(DESTDIR)/usr/bin $(INSTALL_PROG) xenstore-ls $(DESTDIR)/usr/bin $(INSTALL_DIR) -p $(DESTDIR)/usr/$(LIBDIR) $(INSTALL_DATA) libxenstore.so $(DESTDIR)/usr/$(LIBDIR) diff -r 2142adf67d6a -r afd7d4ba113b tools/xenstore/xenstored_core.c --- a/tools/xenstore/xenstored_core.c Wed Mar 8 22:56:55 2006 +++ b/tools/xenstore/xenstored_core.c Wed Mar 8 22:57:02 2006 @@ -60,6 +60,18 @@ static char *tracefile = NULL; static TDB_CONTEXT *tdb_ctx; +static void corrupt(struct connection *conn, const char *fmt, ...); +static void check_store(); + +#define log(...) 
\ + do { \ + char *s = talloc_asprintf(NULL, __VA_ARGS__); \ + trace("%s\n", s); \ + syslog(LOG_ERR, "%s", s); \ + talloc_free(s); \ + } while (0) + + #ifdef TESTING static bool failtest = false; @@ -103,33 +115,6 @@ #endif /* TESTING */ #include "xenstored_test.h" - -/* FIXME: Ideally, this should never be called. Some can be eliminated. */ -/* Something is horribly wrong: shutdown immediately. */ -void __attribute__((noreturn)) corrupt(struct connection *conn, - const char *fmt, ...) -{ - va_list arglist; - char *str; - int saved_errno = errno; - - va_start(arglist, fmt); - str = talloc_vasprintf(NULL, fmt, arglist); - va_end(arglist); - - trace("xenstored corruption: connection id %i: err %s: %s", - conn ? (int)conn->id : -1, strerror(saved_errno), str); - eprintf("xenstored corruption: connection id %i: err %s: %s", - conn ? (int)conn->id : -1, strerror(saved_errno), str); -#ifdef TESTING - /* Allow them to attach debugger. */ - sleep(30); -#endif - syslog(LOG_DAEMON, - "xenstored corruption: connection id %i: err %s: %s", - conn ? (int)conn->id : -1, strerror(saved_errno), str); - _exit(2); -} TDB_CONTEXT *tdb_context(struct connection *conn) { @@ -216,7 +201,8 @@ now = time(NULL); tm = localtime(&now); - trace("%s %p %02d:%02d:%02d %s (", prefix, conn, + trace("%s %p %p %04d%02d%02d %02d:%02d:%02d %s (", prefix, conn, + conn->transaction, tm->year + 1900, tm->mon + 1, tm->mday, tm->tm_hour, tm->tm_min, tm->tm_sec, sockmsg_string(data->hdr.msg.type)); @@ -838,8 +824,6 @@ return 0; } -/* Be careful: create heirarchy, put entry in existing parent *last*. - * This helps fsck if we die during this. */ static struct node *create_node(struct connection *conn, const char *name, void *data, unsigned int datalen) @@ -940,8 +924,9 @@ { unsigned int i; - /* Delete self, then delete children. If something goes wrong, - * consistency check will clean up this way. */ + /* Delete self, then delete children. 
If we crash, then the worst + that can happen is the children will continue to take up space, but + will otherwise be unreachable. */ delete_node_single(conn, node); /* Delete children, too. */ @@ -951,9 +936,14 @@ child = read_node(conn, talloc_asprintf(node, "%s/%s", node->name, node->children + i)); - if (!child) - corrupt(conn, "No child '%s' found", child); - delete_node(conn, child); + if (child) { + delete_node(conn, child); + } + else { + trace("delete_node: No child '%s/%s' found!\n", + node->name, node->children + i); + /* Skip it, we've already deleted the parent. */ + } } } @@ -977,12 +967,15 @@ } } corrupt(conn, "Can't find child '%s' in %s", childname, node->name); + return false; } static int _rm(struct connection *conn, struct node *node, const char *name) { - /* Delete from parent first, then if something explodes fsck cleans. */ + /* Delete from parent first, then if we crash, the worst that can + happen is the child will continue to take up space, but will + otherwise be unreachable. */ struct node *parent = read_node(conn, get_parent(name)); if (!parent) { send_error(conn, EINVAL); @@ -1001,10 +994,11 @@ static void internal_rm(const char *name) { - char *tname = talloc_strdup(talloc_autofree_context(), name); + char *tname = talloc_strdup(NULL, name); struct node *node = read_node(NULL, tname); if (node) _rm(NULL, node, tname); + talloc_free(tname); } @@ -1150,18 +1144,19 @@ case XS_DEBUG: if (streq(in->buffer, "print")) xprintf("debug: %s", in->buffer + get_string(in, 0)); + if (streq(in->buffer, "check")) + check_store(); #ifdef TESTING /* For testing, we allow them to set id. 
*/ if (streq(in->buffer, "setid")) { conn->id = atoi(in->buffer + get_string(in, 0)); - send_ack(conn, XS_DEBUG); } else if (streq(in->buffer, "failtest")) { if (get_string(in, 0) < in->used) srandom(atoi(in->buffer + get_string(in, 0))); - send_ack(conn, XS_DEBUG); failtest = true; } #endif /* TESTING */ + send_ack(conn, XS_DEBUG); break; case XS_WATCH: @@ -1259,7 +1254,7 @@ if (in->hdr.msg.len > PATH_MAX) { #ifndef TESTING - syslog(LOG_DAEMON, "Client tried to feed us %i", + syslog(LOG_ERR, "Client tried to feed us %i", in->hdr.msg.len); #endif goto bad_client; @@ -1430,10 +1425,16 @@ balloon driver will pick up stale entries. In the case of the balloon driver, this can be fatal. */ - char *tlocal = talloc_strdup(talloc_autofree_context(), - "/local"); + char *tlocal = talloc_strdup(NULL, "/local"); + + check_store(); + internal_rm("/local"); create_node(NULL, tlocal, NULL, 0); + + talloc_free(tlocal); + + check_store(); } else { tdb_ctx = tdb_open(tdbname, 7919, TDB_FLAGS, O_RDWR|O_CREAT, @@ -1444,10 +1445,92 @@ manual_node("/", "tool"); manual_node("/tool", "xenstored"); manual_node("/tool/xenstored", NULL); - } - - /* FIXME: Fsck */ -} + + check_store(); + } +} + +static char *child_name(const char *s1, const char *s2) +{ + if (strcmp(s1, "/")) { + return talloc_asprintf(NULL, "%s/%s", s1, s2); + } + else { + return talloc_asprintf(NULL, "/%s", s2); + } +} + +static void check_store_(const char *name) +{ + struct node *node = read_node(NULL, name); + + if (node) { + size_t i = 0; + + while (i < node->childlen) { + size_t childlen = strlen(node->children + i); + char * childname = child_name(node->name, + node->children + i); + struct node *childnode = read_node(NULL, childname); + + if (childnode) { + check_store_(childname); + i += childlen + 1; + } + else { + log("check_store: No child '%s' found!\n", + childname); + + memdel(node->children, i, childlen + 1, + node->childlen); + node->childlen -= childlen + 1; + write_node(NULL, node); + } + + 
talloc_free(childname); + } + } + else { + /* Impossible, because no database should ever be without the + root, and otherwise, we've just checked in our caller + (which made a recursive call to get here). */ + + log("check_store: No child '%s' found: impossible!", name); + } +} + + +static void check_store() +{ + char * root = talloc_strdup(NULL, "/"); + log("Checking store ..."); + check_store_(root); + log("Checking store complete."); + talloc_free(root); +} + + +/* Something is horribly wrong: check the store. */ +static void corrupt(struct connection *conn, const char *fmt, ...) +{ + va_list arglist; + char *str; + int saved_errno = errno; + + va_start(arglist, fmt); + str = talloc_vasprintf(NULL, fmt, arglist); + va_end(arglist); + + log("corruption detected by connection %i: err %s: %s", + conn ? (int)conn->id : -1, strerror(saved_errno), str); + +#ifdef TESTING + /* Allow them to attach debugger. */ + sleep(30); +#endif + check_store(); +} + static void write_pidfile(const char *pidfile) { diff -r 2142adf67d6a -r afd7d4ba113b tools/xenstore/xenstored_core.h --- a/tools/xenstore/xenstored_core.h Wed Mar 8 22:56:55 2006 +++ b/tools/xenstore/xenstored_core.h Wed Mar 8 22:57:02 2006 @@ -148,10 +148,6 @@ /* Replace the tdb: required for transaction code */ bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb); -/* Fail due to excessive corruption, capitalist pigdogs! 
*/ -void __attribute__((noreturn)) corrupt(struct connection *conn, - const char *fmt, ...); - struct connection *new_connection(connwritefn_t *write, connreadfn_t *read); diff -r 2142adf67d6a -r afd7d4ba113b tools/xenstore/xenstore_control.c --- /dev/null Wed Mar 8 22:56:55 2006 +++ b/tools/xenstore/xenstore_control.c Wed Mar 8 22:57:02 2006 @@ -0,0 +1,28 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "xs.h" + + +int main(int argc, char **argv) +{ + if (argc < 2 || + strcmp(argv[1], "check")) + { + fprintf(stderr, + "Usage:\n" + "\n" + " %s check\n" + "\n", argv[0]); + return 2; + } + + struct xs_handle * xsh = xs_daemon_open(); + + xs_debug_command(xsh, argv[1], NULL, 0); + + xs_daemon_close(xsh); + + return 0; +} _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |