[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] Added a basic integrity checker, and some basic ability to recover from store



# HG changeset patch
# User emellor@xxxxxxxxxxxxxxxxxxxxxx
# Node ID afd7d4ba113bf96bca099111fa62335165bd2288
# Parent  2142adf67d6a62fbcb70388bbbcb591d438cfa17
Added a basic integrity checker, and some basic ability to recover from store
corruption, rather than just spewing error messages and exiting.

Added a xenstore-control executable, which sends commands to xenstored.
Currently, the only command is 'check', which triggers an integrity check.
(The integrity check is also triggered whenever a corrupted store is detected).

Signed-off-by: Ewan Mellor <ewan@xxxxxxxxxxxxx>

diff -r 2142adf67d6a -r afd7d4ba113b .hgignore
--- a/.hgignore Wed Mar  8 22:56:55 2006
+++ b/.hgignore Wed Mar  8 22:57:02 2006
@@ -162,6 +162,7 @@
 ^tools/xenstore/xenstore-read$
 ^tools/xenstore/xenstore-rm$
 ^tools/xenstore/xenstore-write$
+^tools/xenstore/xenstore-control$
 ^tools/xenstore/xenstore-ls$
 ^tools/xenstore/xenstored$
 ^tools/xenstore/xenstored_test$
diff -r 2142adf67d6a -r afd7d4ba113b tools/xenstore/Makefile
--- a/tools/xenstore/Makefile   Wed Mar  8 22:56:55 2006
+++ b/tools/xenstore/Makefile   Wed Mar  8 22:57:02 2006
@@ -27,7 +27,10 @@
 CLIENTS += xenstore-write
 CLIENTS_OBJS := $(patsubst xenstore-%,xenstore_%.o,$(CLIENTS))
 
-all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump xenstore-ls
+all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump xenstore-control xenstore-ls
+
+test_interleaved_transactions: test_interleaved_transactions.o
+       $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
 
 testcode: xs_test xenstored_test xs_random
 
@@ -35,13 +38,16 @@
        $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -lxenctrl -o $@
 
 $(CLIENTS): xenstore-%: xenstore_%.o libxenstore.so
-       $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -lxenctrl -L. -lxenstore -o $@
+       $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
 
 $(CLIENTS_OBJS): xenstore_%.o: xenstore_client.c
        $(COMPILE.c) -DCLIENT_$(*F) -o $@ $<
 
+xenstore-control: xenstore_control.o libxenstore.so
+       $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
+
 xenstore-ls: xsls.o libxenstore.so
-       $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -lxenctrl -L. -lxenstore -o $@
+       $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
 
 xenstored_test: xenstored_core_test.o xenstored_watch_test.o xenstored_domain_test.o xenstored_transaction_test.o xs_lib.o talloc_test.o fake_libxc.o utils.o tdb.o
        $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -o $@
@@ -77,7 +83,8 @@
 clean: testsuite-clean
        rm -f *.o *.opic *.so
        rm -f xenstored xs_random xs_stress xs_crashme
-       rm -f xs_test xenstored_test xs_tdb_dump xenstore-ls $(CLIENTS)
+       rm -f xs_test xenstored_test xs_tdb_dump xenstore-control xenstore-ls
+       rm -f $(CLIENTS)
        $(RM) $(PROG_DEP)
 
 print-dir:
@@ -129,7 +136,7 @@
 tarball: clean
        cd .. && tar -c -j -v -h -f xenstore.tar.bz2 xenstore/
 
-install: libxenstore.so xenstored xenstore-ls $(CLIENTS)
+install: all
        $(INSTALL_DIR) -p $(DESTDIR)/var/run/xenstored
        $(INSTALL_DIR) -p $(DESTDIR)/var/lib/xenstored
        $(INSTALL_DIR) -p $(DESTDIR)/usr/bin
@@ -137,6 +144,7 @@
        $(INSTALL_DIR) -p $(DESTDIR)/usr/include
        $(INSTALL_PROG) xenstored $(DESTDIR)/usr/sbin
        $(INSTALL_PROG) $(CLIENTS) $(DESTDIR)/usr/bin
+       $(INSTALL_PROG) xenstore-control $(DESTDIR)/usr/bin
        $(INSTALL_PROG) xenstore-ls $(DESTDIR)/usr/bin
        $(INSTALL_DIR) -p $(DESTDIR)/usr/$(LIBDIR)
        $(INSTALL_DATA) libxenstore.so $(DESTDIR)/usr/$(LIBDIR)
diff -r 2142adf67d6a -r afd7d4ba113b tools/xenstore/xenstored_core.c
--- a/tools/xenstore/xenstored_core.c   Wed Mar  8 22:56:55 2006
+++ b/tools/xenstore/xenstored_core.c   Wed Mar  8 22:57:02 2006
@@ -60,6 +60,18 @@
 static char *tracefile = NULL;
 static TDB_CONTEXT *tdb_ctx;
 
+static void corrupt(struct connection *conn, const char *fmt, ...);
+static void check_store();
+
+/* Log to trace() and syslog(); log_msg_ must not clash with caller names. */
+#define log(...)                                                       \
+       do {                                                            \
+               char *log_msg_ = talloc_asprintf(NULL, __VA_ARGS__);    \
+               trace("%s\n", log_msg_);                                \
+               syslog(LOG_ERR, "%s", log_msg_);                        \
+               talloc_free(log_msg_);                                  \
+       } while (0)
+
 #ifdef TESTING
 static bool failtest = false;
 
@@ -103,33 +115,6 @@
 #endif /* TESTING */
 
 #include "xenstored_test.h"
-
-/* FIXME: Ideally, this should never be called.  Some can be eliminated. */
-/* Something is horribly wrong: shutdown immediately. */
-void __attribute__((noreturn)) corrupt(struct connection *conn,
-                                      const char *fmt, ...)
-{
-       va_list arglist;
-       char *str;
-       int saved_errno = errno;
-
-       va_start(arglist, fmt);
-       str = talloc_vasprintf(NULL, fmt, arglist);
-       va_end(arglist);
-
-       trace("xenstored corruption: connection id %i: err %s: %s",
-               conn ? (int)conn->id : -1, strerror(saved_errno), str);
-       eprintf("xenstored corruption: connection id %i: err %s: %s",
-               conn ? (int)conn->id : -1, strerror(saved_errno), str);
-#ifdef TESTING
-       /* Allow them to attach debugger. */
-       sleep(30);
-#endif
-       syslog(LOG_DAEMON,
-              "xenstored corruption: connection id %i: err %s: %s",
-              conn ? (int)conn->id : -1, strerror(saved_errno), str);
-       _exit(2);
-}
 
 TDB_CONTEXT *tdb_context(struct connection *conn)
 {
@@ -216,7 +201,8 @@
        now = time(NULL);
        tm = localtime(&now);
 
-       trace("%s %p %02d:%02d:%02d %s (", prefix, conn,
+       trace("%s %p %p %04d%02d%02d %02d:%02d:%02d %s (", prefix, conn,
+             conn->transaction, tm->year + 1900, tm->mon + 1, tm->mday,
              tm->tm_hour, tm->tm_min, tm->tm_sec,
              sockmsg_string(data->hdr.msg.type));
        
@@ -838,8 +824,6 @@
        return 0;
 }
 
-/* Be careful: create heirarchy, put entry in existing parent *last*.
- * This helps fsck if we die during this. */
 static struct node *create_node(struct connection *conn, 
                                const char *name,
                                void *data, unsigned int datalen)
@@ -940,8 +924,9 @@
 {
        unsigned int i;
 
-       /* Delete self, then delete children.  If something goes wrong,
-        * consistency check will clean up this way. */
+       /* Delete self, then delete children.  If we crash, then the worst
+          that can happen is the children will continue to take up space, but
+          will otherwise be unreachable. */
        delete_node_single(conn, node);
 
        /* Delete children, too. */
@@ -951,9 +936,14 @@
                child = read_node(conn, 
                                  talloc_asprintf(node, "%s/%s", node->name,
                                                  node->children + i));
-               if (!child)
-                       corrupt(conn, "No child '%s' found", child);
-               delete_node(conn, child);
+               if (child) {
+                       delete_node(conn, child);
+               }
+               else {
+                       trace("delete_node: No child '%s/%s' found!\n",
+                             node->name, node->children + i);
+                       /* Skip it, we've already deleted the parent. */
+               }
        }
 }
 
@@ -977,12 +967,15 @@
                }
        }
        corrupt(conn, "Can't find child '%s' in %s", childname, node->name);
+       return false;
 }
 
 
 static int _rm(struct connection *conn, struct node *node, const char *name)
 {
-       /* Delete from parent first, then if something explodes fsck cleans. */
+       /* Delete from parent first, then if we crash, the worst that can
+          happen is the child will continue to take up space, but will
+          otherwise be unreachable. */
        struct node *parent = read_node(conn, get_parent(name));
        if (!parent) {
                send_error(conn, EINVAL);
@@ -1001,10 +994,11 @@
 
 static void internal_rm(const char *name)
 {
-       char *tname = talloc_strdup(talloc_autofree_context(), name);
+       char *tname = talloc_strdup(NULL, name);
        struct node *node = read_node(NULL, tname);
        if (node)
                _rm(NULL, node, tname);
+       talloc_free(tname);
 }
 
 
@@ -1150,18 +1144,19 @@
        case XS_DEBUG:
                if (streq(in->buffer, "print"))
                        xprintf("debug: %s", in->buffer + get_string(in, 0));
+               if (streq(in->buffer, "check"))
+                       check_store();
 #ifdef TESTING
                /* For testing, we allow them to set id. */
                if (streq(in->buffer, "setid")) {
                        conn->id = atoi(in->buffer + get_string(in, 0));
-                       send_ack(conn, XS_DEBUG);
                } else if (streq(in->buffer, "failtest")) {
                        if (get_string(in, 0) < in->used)
                                srandom(atoi(in->buffer + get_string(in, 0)));
-                       send_ack(conn, XS_DEBUG);
                        failtest = true;
                }
 #endif /* TESTING */
+               send_ack(conn, XS_DEBUG);
                break;
 
        case XS_WATCH:
@@ -1259,7 +1254,7 @@
 
                if (in->hdr.msg.len > PATH_MAX) {
 #ifndef TESTING
-                       syslog(LOG_DAEMON, "Client tried to feed us %i",
+                       syslog(LOG_ERR, "Client tried to feed us %i",
                               in->hdr.msg.len);
 #endif
                        goto bad_client;
@@ -1430,10 +1425,16 @@
                   balloon driver will pick up stale entries.  In the case of
                   the balloon driver, this can be fatal.
                */
-               char *tlocal = talloc_strdup(talloc_autofree_context(),
-                                            "/local");
+               char *tlocal = talloc_strdup(NULL, "/local");
+
+               check_store();
+
                internal_rm("/local");
                create_node(NULL, tlocal, NULL, 0);
+
+               talloc_free(tlocal);
+
+               check_store();
        }
        else {
                tdb_ctx = tdb_open(tdbname, 7919, TDB_FLAGS, O_RDWR|O_CREAT,
@@ -1444,10 +1445,92 @@
                manual_node("/", "tool");
                manual_node("/tool", "xenstored");
                manual_node("/tool/xenstored", NULL);
-       }
-
-       /* FIXME: Fsck */
-}
+
+               check_store();
+       }
+}
+
+/* Join parent path s1 and child name s2; "/" as parent gives "/<s2>".
+ * The result is a talloc string with a NULL context; the caller frees it. */
+static char *child_name(const char *s1, const char *s2)
+{
+       if (strcmp(s1, "/") == 0)
+               return talloc_asprintf(NULL, "/%s", s2);
+
+       return talloc_asprintf(NULL, "%s/%s", s1, s2);
+}
+
+/* Recursively check the node called name and everything below it.  A child
+ * that is listed in a node's children but cannot be read is logged and
+ * removed from that list, and the repaired node is written back. */
+static void check_store_(const char *name)
+{
+       struct node *node = read_node(NULL, name);
+       size_t i = 0;
+
+       if (!node) {
+               /* Impossible, because no database should ever be without the
+                  root, and otherwise, we've just checked in our caller
+                  (which made a recursive call to get here). */
+               log("check_store: No child '%s' found: impossible!", name);
+               return;
+       }
+
+       /* children holds childlen bytes of consecutive NUL-terminated names. */
+       while (i < node->childlen) {
+               size_t childlen = strlen(node->children + i);
+               char *childname = child_name(node->name, node->children + i);
+
+               if (read_node(NULL, childname)) {
+                       check_store_(childname);
+                       i += childlen + 1;
+               } else {
+                       log("check_store: No child '%s' found!", childname);
+
+                       /* Cut the dangling name out of the list.  i is left
+                          alone because the next name now starts at i. */
+                       memdel(node->children, i, childlen + 1,
+                              node->childlen);
+                       node->childlen -= childlen + 1;
+                       write_node(NULL, node);
+               }
+
+               talloc_free(childname);
+       }
+}
+
+
+/* Check the whole store, starting from the root node "/", and log progress. */
+static void check_store(void)
+{
+       char *root = talloc_strdup(NULL, "/");
+       log("Checking store ...");
+       check_store_(root);
+       log("Checking store complete.");
+       talloc_free(root);
+}
+
+/* Something is horribly wrong: log the printf-style message (with conn's id,
+ * or -1 if conn is NULL, and the errno seen on entry), then check the store. */
+static void corrupt(struct connection *conn, const char *fmt, ...)
+{
+       va_list arglist;
+       char *str;
+       int saved_errno = errno;
+
+       va_start(arglist, fmt);
+       str = talloc_vasprintf(NULL, fmt, arglist);
+       va_end(arglist);
+       log("corruption detected by connection %i: err %s: %s",
+           conn ? (int)conn->id : -1, strerror(saved_errno), str);
+       talloc_free(str);       /* We return to the caller: don't leak it. */
+#ifdef TESTING
+       /* Allow them to attach debugger. */
+       sleep(30);
+#endif
+       check_store();
+}
+
 
 static void write_pidfile(const char *pidfile)
 {
diff -r 2142adf67d6a -r afd7d4ba113b tools/xenstore/xenstored_core.h
--- a/tools/xenstore/xenstored_core.h   Wed Mar  8 22:56:55 2006
+++ b/tools/xenstore/xenstored_core.h   Wed Mar  8 22:57:02 2006
@@ -148,10 +148,6 @@
 /* Replace the tdb: required for transaction code */
 bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb);
 
-/* Fail due to excessive corruption, capitalist pigdogs! */
-void __attribute__((noreturn)) corrupt(struct connection *conn,
-                                      const char *fmt, ...);
-
 struct connection *new_connection(connwritefn_t *write, connreadfn_t *read);
 
 
diff -r 2142adf67d6a -r afd7d4ba113b tools/xenstore/xenstore_control.c
--- /dev/null   Wed Mar  8 22:56:55 2006
+++ b/tools/xenstore/xenstore_control.c Wed Mar  8 22:57:02 2006
@@ -0,0 +1,28 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "xs.h"
+
+
+/* Send a debug command to xenstored; "check" is the only command accepted.
+ * Returns 2 on bad usage, 1 if no daemon handle was obtained, 0 otherwise. */
+int main(int argc, char **argv)
+{
+  if (argc < 2 || strcmp(argv[1], "check") != 0)
+  {
+    fprintf(stderr, "Usage:\n\n       %s check\n\n", argv[0]);
+    return 2;
+  }
+
+  struct xs_handle *xsh = xs_daemon_open();
+  if (xsh == NULL)
+  {
+    fprintf(stderr, "%s: cannot connect to xenstored\n", argv[0]);
+    return 1;
+  }
+
+  xs_debug_command(xsh, argv[1], NULL, 0);
+  xs_daemon_close(xsh);
+  return 0;
+}

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.