[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] Manual merge.



# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID f8acd354e1295226fbda14aaf8bd164e07b93742
# Parent  80d5dd14711eccf379e475000f3b156df286d279

# Parent  09067ce923038c4ba6dcb9630fb848cce0d1c5fa
Manual merge.

diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/Makefile
--- a/tools/blktap/Makefile     Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/Makefile     Sun Jul  3 22:36:48 2005
@@ -2,43 +2,24 @@
 MINOR    = 0
 SONAME   = libblktap.so.$(MAJOR)
 
-CC       = gcc
-
 XEN_ROOT = ../..
 include $(XEN_ROOT)/tools/Rules.mk
 
-BLKTAP_INSTALL_DIR     = /usr/sbin
+SUBDIRS :=
+SUBDIRS += parallax
 
-INSTALL         = install
-INSTALL_PROG    = $(INSTALL) -m0755
-INSTALL_DIR     = $(INSTALL) -d -m0755
+BLKTAP_INSTALL_DIR = /usr/sbin
 
-INCLUDES += 
+INSTALL            = install
+INSTALL_PROG       = $(INSTALL) -m0755
+INSTALL_DIR        = $(INSTALL) -d -m0755
+
+INCLUDES += -I. -I $(XEN_LIBXC)
 
 LIBS     := -lpthread -lz
 
 SRCS     :=
 SRCS     += blktaplib.c
-
-PLX_SRCS := 
-PLX_SRCS += vdi.c 
-PLX_SRCS += radix.c 
-PLX_SRCS += snaplog.c
-PLX_SRCS += blockstore.c 
-PLX_SRCS += block-async.c
-PLX_SRCS += requests-async.c
-VDI_SRCS := $(PLX_SRCS)
-PLX_SRCS += parallax.c
-
-VDI_TOOLS :=
-VDI_TOOLS += vdi_create
-VDI_TOOLS += vdi_list
-VDI_TOOLS += vdi_snap
-VDI_TOOLS += vdi_snap_list
-VDI_TOOLS += vdi_snap_delete
-VDI_TOOLS += vdi_fill
-VDI_TOOLS += vdi_tree
-VDI_TOOLS += vdi_validate
 
 CFLAGS   += -Wall
 CFLAGS   += -Werror
@@ -46,20 +27,21 @@
 #CFLAGS   += -O3
 CFLAGS   += -g3
 CFLAGS   += -fno-strict-aliasing
-CFLAGS   += -I $(XEN_LIBXC)
-CFLAGS   += $(INCLUDES) -I.
 CFLAGS   += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
 # Get gcc to generate the dependencies for us.
 CFLAGS   += -Wp,-MD,.$(@F).d
+CFLAGS   += $(INCLUDES) 
 DEPS     = .*.d
 
 OBJS     = $(patsubst %.c,%.o,$(SRCS))
-IBINS    = blkdump parallax $(VDI_TOOLS)
+IBINS    = blkdump
 
 LIB      = libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR)
 
-all: mk-symlinks blkdump $(VDI_TOOLS) parallax blockstored
-       $(MAKE) $(LIB)
+all: mk-symlinks libblktap.so blkdump
+       @set -e; for subdir in $(SUBDIRS); do \
+               $(MAKE) -C $$subdir $@;       \
+       done
 
 LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse)
 mk-symlinks:
@@ -77,10 +59,16 @@
        $(INSTALL_DIR) -p $(DESTDIR)/usr/include
        $(INSTALL_PROG) $(LIB) $(DESTDIR)/usr/$(LIBDIR)
        $(INSTALL_PROG) blktaplib.h $(DESTDIR)/usr/include
-       $(INSTALL_PROG) $(IBINS) $(DESTDIR)/$(BLKTAP_INSTALL_DIR)
+       $(INSTALL_PROG) $(IBINS) $(DESTDIR)$(BLKTAP_INSTALL_DIR)
+       @set -e; for subdir in $(SUBDIRS); do \
+               $(MAKE) -C $$subdir $@;       \
+       done
 
 clean:
-       rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump $(VDI_TOOLS) parallax vdi_unittest
+       rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump
+       @set -e; for subdir in $(SUBDIRS); do \
+               $(MAKE) -C $$subdir $@;       \
+       done
 
 rpm: all
        rm -rf staging
@@ -91,52 +79,17 @@
        mv staging/i386/*.rpm .
        rm -rf staging
 
-libblktap.so:
+libblktap.so: $(OBJS)
+       $(CC) $(CFLAGS) -Wl,-soname -Wl,$(SONAME) -shared -o      \
+             libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS)
+       ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR)
        ln -sf libblktap.so.$(MAJOR) $@
-libblktap.so.$(MAJOR):
-       ln -sf libblktap.so.$(MAJOR).$(MINOR) $@
-libblktap.so.$(MAJOR).$(MINOR): $(OBJS)
-       $(CC) -Wl,-soname -Wl,$(SONAME) -shared -o $@ $^ $(LIBS)
 
-blkdump: $(LIB)
+blkdump: libblktap.so
        $(CC) $(CFLAGS) -o blkdump -L$(XEN_LIBXC) -L. -l blktap blkdump.c
 
-parallax: $(LIB) $(PLX_SRCS)
-       $(CC) $(CFLAGS) -o parallax -L$(XEN_LIBXC) -L. -lblktap $(LIBS) $(PLX_SRCS)
+.PHONY: all TAGS clean install mk-symlinks rpm  # command targets, not files
 
-vdi_list: $(LIB) vdi_list.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_list vdi_list.c $(LIBS) $(VDI_SRCS)
-
-vdi_create: $(LIB) vdi_create.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_create vdi_create.c $(LIBS) $(VDI_SRCS)
-
-vdi_snap: $(LIB) vdi_snap.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_snap vdi_snap.c $(LIBS) $(VDI_SRCS)
-
-vdi_snap_list: $(LIB) vdi_snap_list.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_snap_list vdi_snap_list.c $(LIBS) $(VDI_SRCS)
-
-vdi_snap_delete: $(LIB) vdi_snap_delete.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_snap_delete vdi_snap_delete.c $(LIBS) $(VDI_SRCS)
-
-vdi_tree: $(LIB) vdi_tree.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_tree vdi_tree.c $(LIBS) $(VDI_SRCS)
-
-vdi_fill: $(LIB) vdi_fill.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_fill vdi_fill.c $(LIBS) $(VDI_SRCS)
-
-vdi_validate: $(LIB) vdi_validate.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_validate vdi_validate.c $(LIBS) $(VDI_SRCS)
-
-vdi_unittest: $(LIB) vdi_unittest.c $(VDI_SRCS)
-       $(CC) $(CFLAGS) -g3 -o vdi_unittest vdi_unittest.c $(LIBS) $(VDI_SRCS)
-
-blockstored: blockstored.c
-       $(CC) $(CFLAGS) -g3 -o blockstored $(LIBS) blockstored.c
-bstest: bstest.c blockstore.c
-       $(CC) $(CFLAGS) -g3 -o bstest bstest.c $(LIBS) blockstore.c
-
-.PHONY: TAGS clean install mk-symlinks rpm
 TAGS:
        etags -t $(SRCS) *.h
 
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_tree.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_tree.c  Sun Jul  3 22:36:48 2005
@@ -0,0 +1,132 @@
+/**************************************************************************
+ * 
+ * vdi_tree.c
+ *
+ * Output current vdi tree to dot and postscript.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+#define GRAPH_DOT_FILE "vdi.dot"
+#define GRAPH_PS_FILE  "vdi.ps"
+
+typedef struct sh_st {
+    snap_id_t     id;       /* snapshot id stored in this entry */
+    struct sh_st *next;     /* next entry in the same hash bucket */
+} sh_t;
+
+#define SNAP_HASHSZ 1024    /* number of buckets in node_hash */
+sh_t *node_hash[SNAP_HASHSZ];   /* chained hash set used by sh_check_and_add */
+#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ)
+
+#define SNAPID_EQUAL(_a,_b) \
+    (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
+/* Record id in node_hash: returns 1 if it was already present, 0 if newly
+ * added.  Exits the process if memory for a new entry cannot be allocated. */
+int sh_check_and_add(snap_id_t *id)
+{
+    sh_t **s = &node_hash[SNAP_HASH(id)];
+    /* Walk the chain by advancing the link pointer itself; assigning through
+     * *s here would overwrite the bucket head and drop every entry passed. */
+    for ( ; *s != NULL; s = &(*s)->next)
+        if (SNAPID_EQUAL(&((*s)->id), id))
+            return 1;
+    if ((*s = (sh_t *)malloc(sizeof(sh_t))) == NULL)
+        exit(-1);
+    (*s)->id = *id;
+    (*s)->next = NULL;
+    return 0;
+}
+
+/* Dump every VDI and its chain of fork points to a Graphviz file (vdi.dot)
+ * and run dot(1) to render PostScript into argv[1] (default vdi.ps). */
+int main(int argc, char *argv[])
+{
+    FILE *f;
+    char dot_file[255] = GRAPH_DOT_FILE;
+    char  ps_file[255] = GRAPH_PS_FILE;
+    char cmd[sizeof(dot_file) + sizeof(ps_file) + 32]; /* fits both names */
+    vdi_registry_t *reg;
+    vdi_t *vdi;
+    int i;
+
+    __init_blockstore();
+    __init_vdi();
+    reg = get_vdi_registry();
+    if ( reg == NULL ) {
+        printf("couldn't get VDI registry.\n");
+        exit(-1);
+    }
+
+    if ( argc > 1 ) {
+        /* Leave room for the terminator; index 255 is past the array. */
+        strncpy(ps_file, argv[1], sizeof(ps_file) - 1);
+        ps_file[sizeof(ps_file) - 1] = '\0';
+    }
+
+    /* now dump it out to a dot file. */
+    printf("[o] Dumping state to a dot graph. (%d VDIs)\n", (int)reg->nr_vdis);
+    f = fopen(dot_file, "w");
+    if ( f == NULL ) {
+        perror(dot_file);
+        exit(-1);
+    }
+
+    /* write graph preamble */
+    fprintf(f, "digraph G {\n");
+    fprintf(f, "   rankdir=LR\n");
+
+    for (i=0; i<reg->nr_vdis; i++) {
+        char oldnode[255];
+        snap_block_t *blk;
+        snap_id_t id;
+        int nr_snaps, done=0;
+
+        vdi = vdi_get(i);
+        if ( vdi == NULL ) continue;    /* unreadable VDI: nothing to draw */
+        id = vdi->snap;
+        /* add a node for the id */
+        printf("vdi: %d\n", i);
+        fprintf(f, "   n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n",
+                id.block, id.index, vdi->name,
+                id.block, id.index);
+        sprintf(oldnode, "n%Ld%d", id.block, id.index);
+
+        while (id.block != 0) {
+            blk = snap_get_block(id.block);
+            if ( blk == NULL ) break;   /* unreadable log block ends the chain */
+            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index);
+            id = blk->hdr.fork_block;
+            done = sh_check_and_add(&id);
+
+            /* add a node for the fork_id */
+            if (!done) {
+                fprintf(f, "   n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n",
+                    id.block, id.index,
+                    id.block, id.index);
+            }
+
+            /* add an edge between them */
+            fprintf(f, "   n%Ld%d -> %s [label=\"%u snapshots\"]\n",
+                    id.block, id.index, oldnode, nr_snaps);
+            sprintf(oldnode, "n%Ld%d", id.block, id.index);
+            freeblock(blk);
+            if (done) break;
+        }
+    }
+
+    /* write graph postamble */
+    fprintf(f, "}\n");
+    fclose(f);
+    printf("[o] Generating postscript graph. (%s)\n", ps_file);
+    snprintf(cmd, sizeof(cmd), "dot %s -Tps -o %s", dot_file, ps_file);
+    system(cmd);
+    return 0;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/snaplog.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/snaplog.c   Sun Jul  3 22:36:48 2005
@@ -0,0 +1,238 @@
+/**************************************************************************
+ * 
+ * snaplog.c
+ *
+ * Snapshot log on-disk data structure.
+ *
+ */
+ 
+ /* VDI histories are made from chains of snapshot logs.  These logs record 
+  * the (radix) root and timestamp of individual snapshots.
+  *
+  * creation of a new VDI involves 'forking' a snapshot log, by creating a 
+  * new, empty log (in a new VDI) and parenting it off of a record in an 
+  * existing snapshot log.
+  *
+  * snapshot log blocks have at most one writer.
+  */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "snaplog.h"
+
+
+
+/* Read a block from the store and hand it back only if it carries the
+ * snapshot-log magic; otherwise return NULL.  Callers free it with freeblock(). */
+snap_block_t *snap_get_block(u64 block)
+{
+    snap_block_t *candidate = (snap_block_t *)readblock(block);
+
+    if ( candidate != NULL && candidate->hdr.magic != SNAP_MAGIC ) {
+        /* Readable, but not a snapshot-log block: release and reject. */
+        freeblock(candidate);
+        candidate = NULL;
+    }
+    return candidate;
+}
+    
+/* Copy the snapshot record named by id into *target.
+ * Returns 0 on success; -1 if id is NULL, its block cannot be loaded as a
+ * snapshot-log block, or id->index is greater than the block's nr_entries.
+ * NOTE(review): index == nr_entries is accepted by the check below; confirm
+ * that is intended rather than an off-by-one. */
+int snap_get_id(snap_id_t *id, snap_rec_t *target)
+{
+    snap_block_t *log;
+    int           status = -1;
+    if ( id == NULL )
+        return status;
+    log = snap_get_block(id->block);
+    if ( log == NULL )
+        return status;
+    if ( !(id->index > log->hdr.nr_entries) ) {
+        *target = log->snaps[id->index];
+        status  = 0;
+    }
+    freeblock(log);
+    return status;
+}
+
+/* Allocate and store a new, empty snapshot-log block, returning its id
+ * (index 0) through new_id.  parent_id is the previous block in the chain
+ * (NULL for a root log); fork_id is recorded as the fork point and must be
+ * non-NULL whenever parent_id is.  log_entries is inherited from the parent
+ * block when that block is non-zero and differs from the fork block.
+ * Returns 0 on success, -1 on failure. */
+int __snap_block_create(snap_id_t *parent_id, snap_id_t *fork_id,
+                                  snap_id_t *new_id)
+{
+    snap_block_t *blk, *pblk;
+
+    if ( (parent_id != NULL) && (fork_id == NULL) )
+        return -1;
+    blk = (snap_block_t *)newblock();
+    if ( blk == NULL )
+        return -1;
+    blk->hdr.magic  = SNAP_MAGIC;
+    blk->hdr.nr_entries  = 0;
+    blk->hdr.log_entries = 0;
+    blk->hdr.immutable   = 0;
+    if (   (parent_id  != NULL)
+        && (parent_id->block != fork_id->block)
+        && (parent_id->block != 0)) {
+        pblk = snap_get_block(parent_id->block);
+        if ( pblk == NULL ) {   /* parent unreadable or not a snap block */
+            freeblock(blk);
+            return -1;
+        }
+        blk->hdr.log_entries = pblk->hdr.log_entries;
+        freeblock(pblk);
+    }
+    if (parent_id != NULL) {
+        blk->hdr.parent_block = *parent_id;
+        blk->hdr.fork_block   = *fork_id;
+    } else {
+        blk->hdr.parent_block = null_snap_id;
+        blk->hdr.fork_block   = null_snap_id;
+    }
+    new_id->index = 0;
+    new_id->block = allocblock(blk);
+    freeblock(blk);
+    return (new_id->block == 0) ? -1 : 0;
+}
+
+int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id)
+{   /* Create a log block whose fork point is parent_id itself. */
+    return __snap_block_create(parent_id, parent_id, new_id);
+}
+
+/* Append rec to the snapshot log block named by old_id, first chaining on a
+ * new block (and marking the old one immutable) if that block is full.
+ * On success returns 0 with new_id = (block written, its new entry count).
+ * Returns -1 on a deleted rec, an immutable block, or a block/alloc failure. */
+int snap_append(snap_id_t *old_id, snap_rec_t *rec, snap_id_t *new_id)
+{
+    snap_id_t id = *old_id;
+    snap_block_t *blk = snap_get_block(id.block);
+
+    if ( blk == NULL )
+        return -1;
+    if ( rec->deleted == 1 ) {
+        printf("Attempt to append a deleted snapshot!\n");
+        freeblock(blk);     /* previously leaked on this path */
+        return -1;
+    }
+    if ( blk->hdr.immutable != 0 ) {
+        printf("Attempt to snap an immutable snap block!\n");
+        freeblock(blk);     /* previously leaked on this path */
+        return -1;
+    }
+    new_id->block = id.block;
+    if (blk->hdr.nr_entries == SNAPS_PER_BLOCK) {
+        id.index--; /* make id point to the last full record */
+        if ( __snap_block_create(&id, &blk->hdr.fork_block, new_id) != 0 ) {
+            freeblock(blk);
+            return -1;
+        }
+        blk->hdr.immutable = 1;
+        writeblock(id.block, blk);
+        freeblock(blk);
+        blk = snap_get_block(new_id->block);
+        if ( blk == NULL )
+            return -1;
+        id = *new_id;
+    }
+    blk->snaps[blk->hdr.nr_entries] = *rec;
+    blk->hdr.nr_entries++;
+    blk->hdr.log_entries++;
+    new_id->index = blk->hdr.nr_entries;
+    writeblock(id.block, blk);
+    freeblock(blk);
+    return 0;
+}
+
+/* Mark snapshot record p_id deleted and call collapse() on the radix roots
+ * of p_id (parent) and c_id (child).  Only consecutive records in the same
+ * log block are supported.  height is passed through to collapse().
+ * Returns collapse()'s result, or -1 if a block cannot be read or the
+ * snapshots are not consecutive. */
+int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id)
+{
+    snap_block_t *p_blk, *c_blk;
+    snap_rec_t   *p_rec, *c_rec;
+    int ret = -1;
+
+    p_blk = snap_get_block(p_id->block);
+    if (p_blk == NULL) return(-1);
+    if (c_id->block == p_id->block)
+    {
+        c_blk = p_blk;
+    } else {
+        c_blk = snap_get_block(c_id->block);
+    }
+
+    /* The child block is the one that may have failed to load here; the
+     * original re-tested p_blk, so a NULL c_blk was dereferenced below. */
+    if (c_blk == NULL) {
+        freeblock(p_blk);
+        return(-1);
+    }
+
+    /* parent and child must not be deleted. */
+    p_rec = &p_blk->snaps[p_id->index];
+    c_rec = &c_blk->snaps[c_id->index];
+    /*
+    if ( (p_rec->deleted == 1) || (c_rec->deleted == 1) ) {
+        printf("One of those snaps is already deleted.\n");
+        goto done;
+    }
+    */
+    /* first non-deleted thing in the log before child must be parent. */
+    /* XXX todo: test the range here for delete (and eventually fork) bits) */
+    /* for now, snaps must be consecutive, on the same log page: */
+    if ((p_id->block != c_id->block) || (p_id->index != c_id->index-1))
+    {
+        printf("Deleting non-consecutive snaps is not done yet.\n");
+        goto done;
+    }
+
+    /* mark parent as deleted XXX: may need to lock parent block here.*/
+    p_rec->deleted = 1;
+    writeblock(p_id->block, p_blk);
+
+    /* delete the parent; collapse() reports the number of blocks reclaimed. */
+    printf("collapse(%Ld, %Ld)\n", p_rec->radix_root, c_rec->radix_root);
+    ret = collapse(height, p_rec->radix_root, c_rec->radix_root);
+
+done:
+    if (c_blk != p_blk) freeblock(c_blk);
+    freeblock(p_blk);
+    return(ret);
+}
+
+/* Print the records at and before snap_id in its log block, then those of
+ * each block reached via hdr.parent_block until the chain ends. */
+void snap_print_history(snap_id_t *snap_id)
+{
+    snap_id_t id = *snap_id;
+    unsigned int idx;
+    snap_block_t *new_blk, *blk = snap_get_block(id.block);
+
+    while ( blk ) {
+        printf("[Snap block %Ld]:\n", id.block);
+        idx = id.index;     /* restart from this block's own position */
+        do {
+            printf("   %03u: root: %Ld ts: %ld.%ld\n", idx,
+                    blk->snaps[idx].radix_root,
+                    blk->snaps[idx].timestamp.tv_sec,
+                    blk->snaps[idx].timestamp.tv_usec);
+        } while (idx-- != 0);
+        id = blk->hdr.parent_block;
+        /* NULL ends the loop; previously new_blk was left uninitialized. */
+        new_blk = (id.block != 0) ? snap_get_block(id.block) : NULL;
+        freeblock(blk);
+        blk = new_blk;
+    }
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/snaplog.h
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/snaplog.h   Sun Jul  3 22:36:48 2005
@@ -0,0 +1,61 @@
+/**************************************************************************
+ * 
+ * snaplog.h
+ *
+ * Snapshot log on-disk data structure.
+ *
+ */
+ 
+#include "radix.h"
+#include "blockstore.h"    /* for BLOCK_SIZE */
+ 
+#ifndef __SNAPLOG_H__
+#define __SNAPLOG_H__
+
+typedef struct snap_id {
+    u64            block;   /* id of a snapshot-log block (see snap_get_block) */
+    unsigned int   index;   /* position relative to that block's snaps[] */
+} snap_id_t;
+
+typedef struct snap_rec {
+    u64            radix_root;  /* radix root recorded for this snapshot */
+    struct timeval timestamp;   /* timestamp recorded for this snapshot */
+    /* flags: */
+    unsigned       deleted:1;   /* set by snap_collapse() on the parent record */
+} snap_rec_t;
+
+
+int  snap_block_create(snap_id_t *parent_id, snap_id_t *new_id);
+int  snap_append(snap_id_t *id, snap_rec_t *rec, snap_id_t *new_id);
+int  snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id);
+void snap_print_history(snap_id_t *snap_id);
+int  snap_get_id(snap_id_t *id, snap_rec_t *target);
+
+
+/* exported for vdi debugging */
+#define SNAP_MAGIC 0xff00ff0aa0ff00ffLL
+
+static const snap_id_t null_snap_id = { 0, 0 }; 
+
+typedef struct snap_block_hdr {
+    u64            magic;        /* SNAP_MAGIC; verified by snap_get_block() */
+    snap_id_t      parent_block; /* parent block within this chain */
+    snap_id_t      fork_block;   /* where this log was forked */
+    unsigned       log_entries;  /* total entries since forking */
+    unsigned short nr_entries;   /* entries in snaps[] */
+    unsigned short immutable;    /* has this snap page become immutable? */
+} snap_block_hdr_t;
+
+/* Number of snap_rec_t slots that fit in one block after the header. */
+#define SNAPS_PER_BLOCK \
+    ((BLOCK_SIZE - sizeof(snap_block_hdr_t)) / sizeof(snap_rec_t))
+
+typedef struct snap_block {
+    snap_block_hdr_t hdr;                     /* block header */
+    snap_rec_t       snaps[SNAPS_PER_BLOCK];  /* records; hdr.nr_entries in use */
+} snap_block_t;
+    
+
+snap_block_t *snap_get_block(u64 block);
+
+#endif /* __SNAPLOG_H__ */
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/README
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/README      Sun Jul  3 22:36:48 2005
@@ -0,0 +1,177 @@
+Parallax Quick Overview
+March 3, 2005
+
+This is intended to provide a quick set of instructions to let you
+guys play with the current parallax source.  In its current form, the
+code will let you run an arbitrary number of VMs off of a single disk
+image, doing copy-on-write as they make updates.  Each domain is
+assigned a virtual disk image (VDI), which may be based on a snapshot
+of an existing image.  All of the VDI and snapshot management should
+currently work.
+
+The current implementation uses a single file as a blockstore for
+_everything_; this will soon be replaced by the fancier backend code
+and the local cache.  As it stands, Parallax will create
+"blockstore.dat" in the directory that you run it from, and use
+largefile support to make this grow to unfathomable girth.  So, you
+probably want to run the daemon off of a local disk, with a lot of
+free space.
+
+Here's how to get going:
+
+0. Setup:
+---------
+
+Pick a local directory on a disk with lots of room.  You should be
+running from a privileged domain (e.g. dom0) with the blocktap
+configured in and the block backend NOT configured in.
+
+For convenience (for the moment) copy all of the vdi tools (vdi_*) and
+the parallax daemon from tools/blktap into this directory.
+
+1. Populate the blockstore:
+---------------------------
+
+First you need to put at least one image into the blockstore.  You
+will need a disk image, either as a file or local partition.  My
+general approach has been to
+
+(a) make a really big sparse file with 
+
+        dd if=/dev/zero of=./image bs=4K count=1 seek=[big value]
+
+(b) put a filesystem into it
+
+        mkfs.ext3 ./image
+
+(c) mount it using loopback
+
+        mkdir ./mnt
+        mount -o loop ./image ./mnt
+
+(d) cd into it and untar one of the image files from srg-roots.
+
+        cd mnt
+        tar ...
+
+NOTE: Beware if your system is FC3.  mkfs is not compatible with old
+versions of fedora, and so you don't have much choice but to install
+further fc3 images if you have used the fc3 version of mkfs.
+
+(e) unmount the image
+
+        cd ..
+        umount mnt
+
+(f) now, create a new VDI to hold the image 
+
+        ./vdi_create "My new FC3 VDI"
+
+(g) get the id of the new VDI.
+
+        ./vdi_list
+
+        |      0                     My new FC3 VDI
+
+(0 is the VDI id... create a few more if you want.)
+
+(h) hoover your image into the new VDI.
+
+        ./vdi_fill 0 ./image
+
+This will pull the entire image into the blockstore and set up a
+mapping tree for it for VDI 0.  Passing a device (i.e. /dev/sda3)
+should also work, but vdi_fill has NO notion of sparseness yet, so you
+are going to pump a block into the store for each block you read.
+
+vdi_fill will count up until it is done, and you should be ready to
+go.  If you want to be anal, you can use vdi_validate to test the VDI
+against the original image.
+
+2. Create some extra VDIs
+-------------------------
+
+VDIs are actually a list of snapshots, and each snapshot is a full
+image of mappings.  So, to preserve an immutable copy of a current
+VDI, do this:
+
+(a) Snapshot your new VDI.
+
+        ./vdi_snap 0
+
+Snapshotting writes the current radix root to the VDI's snapshot log,
+and assigns it a new writable root.
+
+(b) look at the VDI's snapshot log.
+
+        ./vdi_snap_list 0
+
+        | 16   0      Thu Mar  3 19:27:48 2005 565111           31
+
+The first two columns constitute a snapshot id and represent the
+(block, offset) of the snapshot record.  The Date tells you when the
+snapshot was made, and 31 is the radix root node of the snapshot.
+
+(c) Create a new VDI, based on that snapshot, and look at the list.
+
+        ./vdi_create "FC3 - Copy 1" 16 0
+        ./vdi_list
+
+        |      0                     My new FC3 VDI
+        |      1                       FC3 - Copy 1
+
+NOTE: If you have Graphviz installed on your system, you can use
+vdi_tree to generate a postscript of your current set of VDIs and
+snapshots.
+
+
+Create as many VDIs as you need for the VMs that you want to run.
+
+3. Boot some VMs:
+-----------------
+
+Parallax currently uses a hack in xend to pass the VDI id; you need to
+modify the disk line of the VM config that is going to mount it.
+
+(a) set up your vm config, by using the following disk line:
+
+        disk = ['parallax:1,sda1,w,0' ]
+
+This example uses VDI 1 (from vdi_list above), presents it as sda1
+(writable), and uses dom 0 as the backend.  If you were running the
+daemon (and tap driver) in some domain other than 0, you would change
+this last parameter.
+
+NOTE: You'll need to have reinstalled xend/tools prior to booting the vm, so that it knows what to do with "parallax:".
+
+(b) Run parallax in the backend domain.
+
+        ./parallax
+
+(c) create your new domain.
+
+        xm create ...
+
+---
+
+That's pretty much all there is to it at the moment.  Hope this is
+clear enough to get you going.  Now, a few serious caveats that will
+be sorted out in the almost immediate future:
+
+WARNINGS:
+---------
+
+1. There is NO locking in the VDI tools at the moment, so I'd avoid
+running them in parallel, or more importantly, running them while the
+daemon is running.
+
+2. I doubt that xend will be very happy about restarting if you have
+parallax-using domains.  So if it dies while there are active parallax
+doms, you may need to reboot.
+
+3. I've turned off write-in-place.  So at the moment, EVERY block
+write is a log append on the blockstore.  I've been having some probs
+with the radix tree's marking of writable blocks after snapshots and
+will sort this out very soon.
+
+
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/bstest.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/bstest.c    Sun Jul  3 22:36:48 2005
@@ -0,0 +1,191 @@
+/**************************************************************************
+ * 
+ * bstest.c
+ *
+ * Block store daemon test program.
+ *
+ * usage: bstest <host>|X {r|w|a} ID 
+ *
+ */
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <errno.h>
+#include "blockstore.h"
+
+/* Send one blockstore request (op, id; len bytes) to host over UDP, wait for
+ * a single reply and print whichever fields the reply is long enough to
+ * hold.  Exits the process on any lookup/socket error; otherwise returns 0. */
+int direct(char *host, u32 op, u64 id, int len) {
+    struct sockaddr_in sn, peer;
+    int sock;
+    bsmsg_t msgbuf;
+    int rc;
+    socklen_t slen;     /* recvfrom() wants a socklen_t *, not an int * */
+    struct hostent *addr;
+
+    addr = gethostbyname(host);
+    /* Only IPv4 results fit the sockaddr_in / 4-byte handling below. */
+    if (!addr || addr->h_addrtype != AF_INET) {
+        fprintf(stderr, "bad hostname: %s\n", host);
+        exit(1);
+    }
+    memset(&peer, 0, sizeof(peer));     /* also clears the sin_zero padding */
+    peer.sin_family = addr->h_addrtype;
+    peer.sin_port = htons(BLOCKSTORED_PORT);
+    peer.sin_addr.s_addr =  ((struct in_addr *)(addr->h_addr))->s_addr;
+    fprintf(stderr, "Sending to: %u.%u.%u.%u\n",
+            (unsigned int)(unsigned char)addr->h_addr[0],
+            (unsigned int)(unsigned char)addr->h_addr[1],
+            (unsigned int)(unsigned char)addr->h_addr[2],
+            (unsigned int)(unsigned char)addr->h_addr[3]);
+
+    sock = socket(AF_INET, SOCK_DGRAM, 0);
+    if (sock < 0) {
+        perror("Bad socket");
+        exit(1);
+    }
+    memset(&sn, 0, sizeof(sn));
+    sn.sin_family = AF_INET;
+    sn.sin_port = htons(BLOCKSTORED_PORT);
+    sn.sin_addr.s_addr = htonl(INADDR_ANY);
+    if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
+        perror("bind");
+        close(sock);
+        exit(1);
+    }
+    memset((void *)&msgbuf, 0, sizeof(msgbuf));
+    msgbuf.operation = op;
+    msgbuf.id = id;
+    rc = sendto(sock, (void *)&msgbuf, len, 0,
+                (struct sockaddr *)&peer, sizeof(peer));
+    if (rc < 0) {
+        perror("sendto");
+        exit(1);
+    }
+
+    slen = sizeof(peer);
+    len = recvfrom(sock, (void *)&msgbuf, sizeof(msgbuf), 0,
+                   (struct sockaddr *)&peer, &slen);
+    if (len < 0) {
+        perror("recvfrom");
+        exit(1);
+    }
+    printf("Reply %d bytes:\n", len);
+    if (len >= MSGBUFSIZE_OP)
+        printf("  operation: %u\n", msgbuf.operation);
+    if (len >= MSGBUFSIZE_FLAGS)
+        printf("  flags: 0x%x\n", msgbuf.flags);
+    if (len >= MSGBUFSIZE_ID)
+        printf("  id: %llu\n", msgbuf.id);
+    if (len >= (MSGBUFSIZE_ID + 4))
+        printf("  data: %02x %02x %02x %02x...\n",
+               (unsigned int)msgbuf.block[0],
+               (unsigned int)msgbuf.block[1],
+               (unsigned int)msgbuf.block[2],
+               (unsigned int)msgbuf.block[3]);
+    close(sock);    /* sock is known valid here (0 is a legal descriptor) */
+    return 0;
+}
+
+int main (int argc, char **argv) {
+    /* argv: <host>|X  {r|w|a}  [ID]; "X" uses the local blockstore calls. */
+    u32 op = 0;
+    u64 id = 0;
+    int len = 0, rc;
+    void *block;
+
+    if (argc < 3) {
+        fprintf(stderr, "usage: bstest <host>|X {r|w|a} ID\n");
+        return 1;
+    }
+
+    switch (argv[2][0]) {   /* first letter of the action selects op and len */
+    case 'r':
+    case 'R':
+        op = BSOP_READBLOCK;
+        len = MSGBUFSIZE_ID;
+        break;
+    case 'w':
+    case 'W':
+        op = BSOP_WRITEBLOCK;
+        len = MSGBUFSIZE_BLOCK;
+        break;
+    case 'a':
+    case 'A':
+        op = BSOP_ALLOCBLOCK;
+        len = MSGBUFSIZE_BLOCK;
+        break;
+    default:
+        fprintf(stderr, "Unknown action '%s'.\n", argv[2]);
+        return 1;
+    }
+
+    if (argc >= 4)          /* ID is optional; it stays 0 when omitted */
+        id = atoll(argv[3]);
+
+    if (strcmp(argv[1], "X") == 0) {
+        rc = __init_blockstore();
+        if (rc < 0) {
+            fprintf(stderr, "blockstore init failed.\n");
+            return 1;
+        }
+        switch(op) {
+        case BSOP_READBLOCK:
+            block = readblock(id);
+            if (block) {    /* nothing is printed when the read returns NULL */
+                printf("data: %02x %02x %02x %02x...\n",
+                       (unsigned int)((unsigned char*)block)[0],
+                       (unsigned int)((unsigned char*)block)[1],
+                       (unsigned int)((unsigned char*)block)[2],
+                       (unsigned int)((unsigned char*)block)[3]);
+            }
+            break;
+        case BSOP_WRITEBLOCK:
+            block = malloc(BLOCK_SIZE);
+            if (!block) {
+                perror("bstest malloc");
+                return 1;
+            }
+            memset(block, 0, BLOCK_SIZE);   /* writes an all-zero block */
+            rc = writeblock(id, block);
+            if (rc != 0) {
+                printf("error\n");
+            }
+            else {
+                printf("OK\n");
+            }
+            break;
+        case BSOP_ALLOCBLOCK:
+            block = malloc(BLOCK_SIZE);
+            if (!block) {
+                perror("bstest malloc");
+                return 1;
+            }
+            memset(block, 0, BLOCK_SIZE);   /* allocates an all-zero block */
+            id = allocblock_hint(block, id);    /* ID argument is the hint */
+            if (id == 0) {
+                printf("error\n");
+            }
+            else {
+                printf("ID: %llu\n", id);
+            }
+            break;
+        }
+    }
+    else {
+        direct(argv[1], op, id, len);   /* return value is not checked */
+    }
+
+
+    return 0;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_snap_delete.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_snap_delete.c   Sun Jul  3 22:36:48 2005
@@ -0,0 +1,48 @@
+/**************************************************************************
+ * 
+ * vdi_snap_delete.c
+ *
+ * Delete a snapshot.
+ *
+ * This is not finished:  right now it takes a snap n and calls 
+ * snap_collapse(n,n+1).
+ *
+ * TODO: support for non-consecutive, non-same-block snaps
+ *       Avoid forking probs.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "snaplog.h"
+#include "radix.h"
+#include "vdi.h"
+
+/* usage: vdi_snap_delete <snap block> <snap idx>
+ * Calls snap_collapse() on snapshot (block, idx) and the record after it. */
+int main(int argc, char *argv[])
+{
+    snap_id_t    id, c_id;
+    int ret;
+
+    /* Check arguments before touching the blockstore or VDI state. */
+    if ( argc != 3 ) {
+        printf("usage: %s <snap block> <snap idx>\n", argv[0]);
+        exit(-1);
+    }
+
+    __init_blockstore();
+    __init_vdi();
+
+    id.block   = (u64)          atoll(argv[1]);
+    id.index   = (unsigned int) atol (argv[2]);
+
+    c_id = id;
+    c_id.index++;
+    ret = snap_collapse(VDI_HEIGHT, &id, &c_id);
+    printf("Freed %d blocks.\n", ret);
+    return 0;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/block-async.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/block-async.c       Sun Jul  3 22:36:48 2005
@@ -0,0 +1,393 @@
+/* block-async.c
+ * 
+ * Asynchronous block wrappers for parallax.
+ */
+ 
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include "block-async.h"
+#include "blockstore.h"
+#include "vdi.h"
+
+
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* We have a queue of outstanding I/O requests implemented as a 
+ * circular producer-consumer ring with free-running buffers.
+ * to allow reordering, this ring indirects to indexes in an 
+ * ring of io_structs.
+ * 
+ * the block_* calls may either add an entry to this ring and return, 
+ * or satisfy the request immediately and call the callback directly.
+ * None of the io calls in parallax should be nested enough to worry 
+ * about stack problems with this approach.
+ */
+
+/* Arguments of a queued read: the block address to read. */
+struct read_args {
+    u64 addr;
+};
+
+/* Arguments of a queued write: target address and the data to write. */
+struct write_args {
+    u64   addr;
+    char *block;
+};
+
+/* Arguments of a queued allocation: the data for the new block. */
+struct alloc_args {
+    char *block;
+};
+ 
+/* One entry of the pending I/O ring.  op selects which member of u is
+ * valid (none for the two wake operations); cb is invoked with the
+ * result and param once an I/O thread has serviced the request.
+ */
+struct pending_io_req {
+    enum {IO_READ, IO_WRITE, IO_ALLOC, IO_RWAKE, IO_WWAKE} op;
+    union {
+        struct read_args  r;
+        struct write_args w;
+        struct alloc_args a;
+    } u;
+    io_cb_t cb;
+    void *param;
+};
+
+/* Reset a radix lock: initialise its mutex and put each of the 1024 rows
+ * into the unlocked state (count 0, no waiters, state ANY).
+ */
+void radix_lock_init(struct radix_lock *r)
+{
+    int row = 0;
+
+    pthread_mutex_init(&r->lock, NULL);
+
+    while (row < 1024) {
+        r->state[row]   = ANY;
+        r->waiters[row] = NULL;
+        r->lines[row]   = 0;
+        row++;
+    }
+}
+
+/* maximum outstanding I/O requests issued asynchronously */
+/* must be a power of 2.*/
+#define MAX_PENDING_IO 1024
+
+/* how many threads to concurrently issue I/O to the disk. */
+#define IO_POOL_SIZE   10
+
+/* Request storage, and the ring of indexes into it (see PENDING_IO_ENT). */
+static struct pending_io_req pending_io_reqs[MAX_PENDING_IO];
+static int pending_io_list[MAX_PENDING_IO];
+/* Free-running counters: io_prod is bumped when a request is queued,
+ * io_cons when an I/O thread takes one, io_free when a slot is released.
+ */
+static unsigned long io_prod = 0, io_cons = 0, io_free = 0;
+/* Reduce a free-running counter to a ring position. */
+#define PENDING_IO_MASK(_x) ((_x) & (MAX_PENDING_IO - 1))
+/* Index of request _x within pending_io_reqs[]. */
+#define PENDING_IO_IDX(_x) ((_x) - pending_io_reqs)
+/* Request that ring position _x currently indirects to. */
+#define PENDING_IO_ENT(_x) \
+       (&pending_io_reqs[pending_io_list[PENDING_IO_MASK(_x)]])
+/* False once MAX_PENDING_IO slots are claimed and not yet released. */
+#define CAN_PRODUCE_PENDING_IO ((io_free + MAX_PENDING_IO) != io_prod)
+/* True while queued requests remain that no I/O thread has taken. */
+#define CAN_CONSUME_PENDING_IO (io_cons != io_prod)
+/* pending_io_lock guards the ring state above; pending_io_cond is
+ * signalled when a request is queued.
+ */
+static pthread_mutex_t pending_io_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t  pending_io_cond = PTHREAD_COND_INITIALIZER;
+
+/* Load the indirection ring with the identity mapping, so that ring
+ * position n initially refers to pending_io_reqs[n].
+ */
+static void init_pending_io(void)
+{
+    int slot = MAX_PENDING_IO;
+
+    while (slot-- > 0)
+        pending_io_list[slot] = slot;
+}
+
+/* block_read: queue an asynchronous read of the block at addr.
+ *   @addr:  block address to read
+ *   @cb:    completion callback, run by an I/O thread
+ *   @param: opaque argument handed back to cb
+ *
+ * Asserts that the pending ring has a free slot.
+ */
+void block_read(u64 addr, io_cb_t cb, void *param)
+{
+    struct pending_io_req *slot;
+    unsigned long          pos;
+
+    pthread_mutex_lock(&pending_io_lock);
+    assert(CAN_PRODUCE_PENDING_IO);
+
+    /* Claim the next producer position and describe the read in it. */
+    pos     = io_prod;
+    io_prod = pos + 1;
+    slot    = PENDING_IO_ENT(pos);
+    DPRINTF("Produce (R) %lu (%p)\n", pos, slot);
+    slot->param    = param;
+    slot->cb       = cb;
+    slot->u.r.addr = addr;
+    slot->op       = IO_READ;
+
+    /* Wake one I/O thread to service it. */
+    pthread_cond_signal(&pending_io_cond);
+    pthread_mutex_unlock(&pending_io_lock);
+}
+
+
+/* block_write: queue an asynchronous write of block to addr.
+ *   @addr:  block address to write
+ *   @block: data to write; only the pointer is stored here
+ *   @cb:    completion callback, run by an I/O thread
+ *   @param: opaque argument handed back to cb
+ *
+ * Asserts that the pending ring has a free slot.
+ */
+void block_write(u64 addr, char *block, io_cb_t cb, void *param)
+{
+    struct pending_io_req *slot;
+    unsigned long          pos;
+
+    pthread_mutex_lock(&pending_io_lock);
+    assert(CAN_PRODUCE_PENDING_IO);
+
+    /* Claim the next producer position and describe the write in it. */
+    pos     = io_prod;
+    io_prod = pos + 1;
+    slot    = PENDING_IO_ENT(pos);
+    DPRINTF("Produce (W) %lu (%p)\n", pos, slot);
+    slot->param     = param;
+    slot->cb        = cb;
+    slot->u.w.block = block;
+    slot->u.w.addr  = addr;
+    slot->op        = IO_WRITE;
+
+    /* Wake one I/O thread to service it. */
+    pthread_cond_signal(&pending_io_cond);
+    pthread_mutex_unlock(&pending_io_lock);
+}
+
+
+/* block_alloc: queue an asynchronous allocation of a new block.
+ *   @block: data for the new block; only the pointer is stored here
+ *   @cb:    completion callback, run by an I/O thread
+ *   @param: opaque argument handed back to cb
+ *
+ * Asserts that the pending ring has a free slot.
+ */
+void block_alloc(char *block, io_cb_t cb, void *param)
+{
+    struct pending_io_req *slot;
+
+    pthread_mutex_lock(&pending_io_lock);
+    assert(CAN_PRODUCE_PENDING_IO);
+
+    /* Claim the next producer position and describe the request in it. */
+    slot = PENDING_IO_ENT(io_prod);
+    io_prod += 1;
+    slot->param     = param;
+    slot->cb        = cb;
+    slot->u.a.block = block;
+    slot->op        = IO_ALLOC;
+
+    /* Wake one I/O thread to service it. */
+    pthread_cond_signal(&pending_io_cond);
+    pthread_mutex_unlock(&pending_io_lock);
+}
+
+/* block_rlock: take a shared (read) lock on one row of a radix lock.
+ *   @r:     radix lock
+ *   @row:   row to lock
+ *   @cb:    called once the lock has been granted
+ *   @param: opaque argument handed back to cb
+ *
+ * If the row is not write-locked and its state is not STOP, the lock is
+ * granted at once and cb runs in the caller's context with an IO_INT_T
+ * result of 0.  Otherwise the request is appended to the row's waiter
+ * list, to be granted later by wake_waiters().
+ */
+void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
+{
+    struct io_ret ret;
+    pthread_mutex_lock(&r->lock);
+
+    if (( r->lines[row] >= 0 ) && (r->state[row] != STOP)) {
+        r->lines[row]++;
+        r->state[row] = READ;
+        DPRINTF("RLOCK  : %3d (row: %d)\n", r->lines[row], row);
+        pthread_mutex_unlock(&r->lock);
+        ret.type = IO_INT_T;
+        ret.u.i = 0;
+        cb(ret, param);
+    } else {
+        struct radix_wait **rwc;
+        struct radix_wait *rw =
+            (struct radix_wait *) malloc (sizeof(struct radix_wait));
+        if (rw == NULL) {
+            /* The waiter cannot be recorded, and granting the lock here
+             * would be unsafe.  Fail loudly rather than dereference NULL. */
+            perror("block_rlock malloc");
+            abort();
+        }
+        DPRINTF("RLOCK  : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
+        rw->type  = RLOCK;
+        rw->param = param;
+        rw->cb    = cb;
+        rw->next  = NULL;
+        /* append to waiters list. */
+        rwc = &r->waiters[row];
+        while (*rwc != NULL) rwc = &(*rwc)->next;
+        *rwc = rw;
+        pthread_mutex_unlock(&r->lock);
+        return;
+    }
+}
+
+
+/* block_wlock: take an exclusive (write) lock on one row of a radix lock.
+ *   @r:     radix lock
+ *   @row:   row to lock
+ *   @cb:    called once the lock has been granted
+ *   @param: opaque argument handed back to cb
+ *
+ * If the row is idle (state ANY, count 0) the lock is granted at once:
+ * the row is marked STOP with a count of -1 and cb runs in the caller's
+ * context with an IO_INT_T result of 0.  Otherwise the request is
+ * appended to the row's waiter list, to be granted later by
+ * wake_waiters().
+ */
+void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
+{
+    struct io_ret ret;
+    pthread_mutex_lock(&r->lock);
+
+    /* the second check here is redundant -- just here for debugging now. */
+    if ((r->state[row] == ANY) && ( r->lines[row] == 0 )) {
+        r->state[row] = STOP;
+        r->lines[row] = -1;
+        DPRINTF("WLOCK  : %3d (row: %d)\n", r->lines[row], row);
+        pthread_mutex_unlock(&r->lock);
+        ret.type = IO_INT_T;
+        ret.u.i = 0;
+        cb(ret, param);
+    } else {
+        struct radix_wait **rwc;
+        struct radix_wait *rw =
+            (struct radix_wait *) malloc (sizeof(struct radix_wait));
+        if (rw == NULL) {
+            /* The waiter cannot be recorded, and granting the lock here
+             * would be unsafe.  Fail loudly rather than dereference NULL. */
+            perror("block_wlock malloc");
+            abort();
+        }
+        DPRINTF("WLOCK  : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
+        rw->type  = WLOCK;
+        rw->param = param;
+        rw->cb    = cb;
+        rw->next  = NULL;
+        /* append to waiters list. */
+        rwc = &r->waiters[row];
+        while (*rwc != NULL) rwc = &(*rwc)->next;
+        *rwc = rw;
+        pthread_mutex_unlock(&r->lock);
+        return;
+    }
+
+}
+
+/* called with radix_lock locked and lock count of zero. */
+/* Grants the row to the waiter(s) at the head of its list: either the
+ * single writer at the head, or every reader up to the first writer.
+ * Each grant is delivered by queueing an IO_WWAKE/IO_RWAKE request, so
+ * the waiter's callback runs on an I/O thread rather than here.
+ * Takes pending_io_lock while r->lock is held.
+ */
+static void wake_waiters(struct radix_lock *r, int row)
+{
+    struct pending_io_req *req;
+    struct radix_wait *rw;
+    
+    /* Nothing to do if the row is still held or nobody is waiting. */
+    if (r->lines[row] != 0) return;
+    if (r->waiters[row] == NULL) return; 
+    
+    if (r->waiters[row]->type == WLOCK) {
+
+        /* Head waiter is a writer: hand it the row exclusively. */
+        rw = r->waiters[row];
+        pthread_mutex_lock(&pending_io_lock);
+        assert(CAN_PRODUCE_PENDING_IO);
+        
+        req = PENDING_IO_ENT(io_prod++);
+        req->op    = IO_WWAKE;
+        req->cb    = rw->cb;
+        req->param = rw->param;
+        r->lines[row] = -1; /* write lock the row. */
+        r->state[row] = STOP;
+        r->waiters[row] = rw->next;
+        free(rw);
+        pthread_mutex_unlock(&pending_io_lock);
+    
+    } else /* RLOCK */ {
+
+        /* Admit the whole leading run of readers, one wake request each. */
+        while ((r->waiters[row] != NULL) && (r->waiters[row]->type == RLOCK)) {
+            rw = r->waiters[row];
+            pthread_mutex_lock(&pending_io_lock);
+            assert(CAN_PRODUCE_PENDING_IO);
+            
+            req = PENDING_IO_ENT(io_prod++);
+            req->op    = IO_RWAKE;
+            req->cb    = rw->cb;
+            req->param = rw->param;
+            r->lines[row]++; /* read lock the row. */
+            r->state[row] = READ; 
+            r->waiters[row] = rw->next;
+            free(rw);
+            pthread_mutex_unlock(&pending_io_lock);
+        }
+
+        /* STOP makes block_rlock() defer new readers behind the writer. */
+        if (r->waiters[row] != NULL) /* There is a write queued still */
+            r->state[row] = STOP;
+    }  
+    
+    /* Signal once for the wake request(s) queued above. */
+    pthread_mutex_lock(&pending_io_lock);
+    pthread_cond_signal(&pending_io_cond);
+    pthread_mutex_unlock(&pending_io_lock);
+}
+
+/* block_runlock: drop a read lock on one row of a radix lock.
+ *   @r:     radix lock
+ *   @row:   row to unlock
+ *   @cb:    called once the lock has been released
+ *   @param: opaque argument handed back to cb
+ *
+ * When the last reader leaves, the row returns to state ANY and any
+ * waiters are woken.  cb always runs in the caller's context, after
+ * r->lock has been released, with an IO_INT_T result of 0.
+ */
+void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
+{
+    struct io_ret ret;
+
+    pthread_mutex_lock(&r->lock);
+    assert(r->lines[row] > 0); /* try to catch misuse. */
+    r->lines[row]--;
+    if (r->lines[row] == 0) {
+        r->state[row] = ANY;
+        wake_waiters(r, row);
+    }
+    pthread_mutex_unlock(&r->lock);
+
+    /* Give the callback a defined result, as the lock calls do, instead
+     * of passing an uninitialised struct by value. */
+    ret.type = IO_INT_T;
+    ret.u.i  = 0;
+    cb(ret, param);
+}
+
+/* block_wunlock: drop the write lock on one row of a radix lock.
+ *   @r:     radix lock
+ *   @row:   row to unlock
+ *   @cb:    called once the lock has been released
+ *   @param: opaque argument handed back to cb
+ *
+ * The row returns to state ANY with a count of 0 and any waiters are
+ * woken.  cb always runs in the caller's context, after r->lock has been
+ * released, with an IO_INT_T result of 0.
+ */
+void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
+{
+    struct io_ret ret;
+
+    pthread_mutex_lock(&r->lock);
+    assert(r->lines[row] == -1); /* try to catch misuse. */
+    r->lines[row] = 0;
+    r->state[row] = ANY;
+    wake_waiters(r, row);
+    pthread_mutex_unlock(&r->lock);
+
+    /* Give the callback a defined result, as the lock calls do, instead
+     * of passing an uninitialised struct by value. */
+    ret.type = IO_INT_T;
+    ret.u.i  = 0;
+    cb(ret, param);
+}
+
+/* consumer calls */
+/* do_next_io_req: service one request taken off the pending ring.
+ *   @req: the request; its slot is released back to the ring here
+ *
+ * Performs the operation (or, for the wake operations, just builds a
+ * zero result), releases the slot, and then invokes the request's
+ * callback with the result.  The callback runs on the calling I/O
+ * thread with no locks held.
+ */
+static void do_next_io_req(struct pending_io_req *req)
+{
+    struct io_ret ret;
+    io_cb_t       cb;
+    void         *param;
+    int           known = 1;
+
+    ret.type = IO_INT_T;
+    ret.u.i  = 0;
+
+    switch (req->op) {
+    case IO_READ:
+        ret.type = IO_BLOCK_T;
+        ret.u.b  = readblock(req->u.r.addr);
+        break;
+    case IO_WRITE:
+        ret.type = IO_INT_T;
+        ret.u.i  = writeblock(req->u.w.addr, req->u.w.block);
+        DPRINTF("wrote %d at %Lu\n", *(int *)(req->u.w.block), req->u.w.addr);
+        break;
+    case IO_ALLOC:
+        ret.type = IO_ADDR_T;
+        ret.u.a  = allocblock(req->u.a.block);
+        break;
+    case IO_RWAKE:
+        DPRINTF("WAKE DEFERRED RLOCK!\n");
+        ret.type = IO_INT_T;
+        ret.u.i  = 0;
+        break;
+    case IO_WWAKE:
+        DPRINTF("WAKE DEFERRED WLOCK!\n");
+        ret.type = IO_INT_T;
+        ret.u.i  = 0;
+        break;
+    default:
+        DPRINTF("Unknown IO operation on pending list!\n");
+        known = 0;
+        break;
+    }
+
+    /* Copy everything still needed out of the slot BEFORE releasing it:
+     * once io_free advances, a producer may reuse and overwrite *req. */
+    cb    = req->cb;
+    param = req->param;
+
+    /* Release the slot even for an unknown op, so it is not lost. */
+    pthread_mutex_lock(&pending_io_lock);
+    pending_io_list[PENDING_IO_MASK(io_free++)] = PENDING_IO_IDX(req);
+    pthread_mutex_unlock(&pending_io_lock);
+
+    if (!known)
+        return;
+
+    assert(cb != NULL);
+    cb(ret, param);
+}
+
+/* io_thread: body of each I/O pool thread.
+ *   @param: heap-allocated int holding this thread's tid; freed here
+ *
+ * Loops forever: waits until the pending ring holds an unconsumed
+ * request, claims it under pending_io_lock, and services it with the
+ * lock released.  Never returns.
+ */
+void *io_thread(void *param)
+{
+    int tid;
+    struct pending_io_req *req;
+
+    /* Set this thread's tid. */
+    tid = *(int *)param;
+    free(param);
+    (void)tid; /* not consumed yet; avoids a set-but-unused warning */
+
+    for (;;) {
+        pthread_mutex_lock(&pending_io_lock);
+        /* The predicate loop already absorbs spurious wakeups, so no
+         * second emptiness check is needed after it. */
+        while (io_prod == io_cons)
+            pthread_cond_wait(&pending_io_cond, &pending_io_lock);
+
+        req = PENDING_IO_ENT(io_cons++);
+        pthread_mutex_unlock(&pending_io_lock);
+
+        do_next_io_req(req);
+    }
+}
+
+static pthread_t io_pool[IO_POOL_SIZE];
+/* start_io_threads: launch the IO_POOL_SIZE worker threads.
+ *
+ * Each thread receives its tid in a small heap cell which it frees
+ * itself.  A failure to start a thread is reported on stdout and the
+ * remaining threads are still attempted.
+ */
+void start_io_threads(void)
+{
+    int i, tid = 0;
+
+    for (i = 0; i < IO_POOL_SIZE; i++) {
+        int ret, *t;
+
+        t = (int *)malloc(sizeof(int));
+        if (t == NULL) {
+            printf("Error starting thread %d\n", i);
+            continue;
+        }
+        *t = tid++;
+        ret = pthread_create(&io_pool[i], NULL, io_thread, t);
+        if (ret != 0) {
+            printf("Error starting thread %d\n", i);
+            free(t); /* no thread exists to take ownership of it */
+        }
+    }
+
+}
+
+/* Set up the asynchronous block layer: initialise the pending I/O ring,
+ * then start the pool of I/O threads that consume it.
+ */
+void init_block_async(void)
+{
+    init_pending_io();
+    start_io_threads();
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_snap_list.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_snap_list.c     Sun Jul  3 22:36:48 2005
@@ -0,0 +1,82 @@
+/**************************************************************************
+ * 
+ * vdi_snap_list.c
+ *
+ * Print a list of snapshots for the specified vdi.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/* List the snapshots of one VDI, newest first, following the chain of
+ * snap blocks through each block's parent link.
+ *
+ * usage: vdi_snap_list <VDI id> [max snaps]
+ *   max snaps limits how many entries are printed (default: no limit).
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t        *vdi;
+    u64           id;
+    int           i, max_snaps = -1;
+    snap_block_t *blk;
+    snap_id_t     sid;
+    char         *t;
+
+    __init_blockstore();
+    __init_vdi();
+
+    if ( argc == 1 ) {
+        printf("usage: %s <VDI id> [max snaps]\n", argv[0]);
+        exit(-1);
+    }
+
+    id = (u64) atoll(argv[1]);
+
+    if ( argc > 2 ) {
+        max_snaps = atoi(argv[2]);
+    }
+
+    vdi = vdi_get(id);
+
+    if ( vdi == NULL ) {
+        /* nothing was returned, so there is nothing to release. */
+        printf("couldn't find the requested VDI.\n");
+        exit(-1);
+    }
+
+    /* Start from the entry just before the VDI's current snap id. */
+    sid = vdi->snap;
+    sid.index--;
+
+    printf("%8s%4s%37s %12s %1s\n", "Block", "idx", "timestamp",
+            "radix root", "d");
+
+    while (sid.block != 0) {
+        blk = snap_get_block(sid.block);
+        for (i = sid.index; i >= 0; i--) {
+            if ( max_snaps == 0  ) {
+                freeblock(blk);
+                goto done;
+            }
+            t = ctime(&blk->snaps[i].timestamp.tv_sec);
+            if ( t == NULL )
+                t = "";                   /* time could not be converted */
+            else if ( t[0] != '\0' )
+                t[strlen(t)-1] = '\0';    /* strip the trailing newline */
+            printf("%8Ld%4u%30s %06lu %12Ld %1s\n",
+                    sid.block, i,
+                    t,
+                    blk->snaps[i].timestamp.tv_usec,
+                    blk->snaps[i].radix_root,
+                    blk->snaps[i].deleted ? "*" : " ");
+            if ( max_snaps != -1 )
+                max_snaps--;
+        }
+        sid = blk->hdr.parent_block;
+        freeblock(blk);
+    }
+done:
+    freeblock(vdi);
+    return 0;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_list.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_list.c  Sun Jul  3 22:36:48 2005
@@ -0,0 +1,47 @@
+/**************************************************************************
+ * 
+ * vdi_list.c
+ *
+ * Print a list of VDIs on the block store.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/* Print the id and name of every VDI that can be fetched, trying each
+ * id from 0 up to the registry's nr_vdis.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_registry_t *registry;
+    vdi_t          *entry;
+    int             n;
+
+    __init_blockstore();
+    __init_vdi();
+
+    registry = get_vdi_registry();
+    if ( registry == NULL ) {
+        printf("couldn't get VDI registry.\n");
+        exit(-1);
+    }
+
+    n = 0;
+    while (n < registry->nr_vdis) {
+        entry = vdi_get(n);
+        n++;
+
+        /* ids for which vdi_get() returns nothing are skipped. */
+        if ( entry == NULL )
+            continue;
+
+        printf("%10Ld %60s\n", entry->id, entry->name);
+        freeblock(entry);
+    }
+
+    freeblock(registry);
+
+    return 0;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/blockstore.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/blockstore.c        Sun Jul  3 22:36:48 2005
@@ -0,0 +1,1350 @@
+/**************************************************************************
+ * 
+ * blockstore.c
+ *
+ * Simple block store interface
+ *
+ */
+ 
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <stdarg.h>
+#include "blockstore.h"
+#include <pthread.h>
+
+//#define BLOCKSTORE_REMOTE
+//#define BSDEBUG
+
+#define RETRY_TIMEOUT 1000000 /* microseconds */
+
+/*****************************************************************************
+ * Debugging
+ */
+#ifdef BSDEBUG
+/* printf-style debug trace to stderr, prefixed with the value the
+ * calling thread has stored under tid_key.  Without BSDEBUG, DB()
+ * expands to a no-op and its arguments are not evaluated.
+ */
+void DB(char *format, ...)
+{
+    va_list args;
+    fprintf(stderr, "[%05u] ", (int)pthread_getspecific(tid_key));
+    va_start(args, format);
+    vfprintf(stderr, format, args);
+    va_end(args);
+}
+#else
+#define DB(format, ...) (void)0
+#endif
+
+#ifdef BLOCKSTORE_REMOTE
+
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <netinet/in.h>
+#include <netdb.h>
+
+/*****************************************************************************
+ * Network state                                                             *
+ *****************************************************************************/
+
+/* The individual disk servers we talk to. These will be referenced by
+ * an integer index into bsservers[].
+ */
+bsserver_t bsservers[MAX_SERVERS];
+
+/* The cluster map. This is indexed by an integer cluster number.
+ */
+bscluster_t bsclusters[MAX_CLUSTERS];
+
+/* Local socket.
+ */
+struct sockaddr_in sin_local;
+int bssock = 0;
+
+/*****************************************************************************
+ * Notification                                                              *
+ *****************************************************************************/
+
+/* Per-thread notification state, indexed by thread id: a mutex and
+ * condition variable plus a flag recording that a notification arrived.
+ */
+typedef struct pool_thread_t_struct {
+    pthread_mutex_t ptmutex;
+    pthread_cond_t ptcv;
+    int newdata;
+} pool_thread_t;
+
+pool_thread_t pool_thread[READ_POOL_SIZE+1];
+
+/* Tell thread tid that a reply has arrived: set its newdata flag and
+ * signal its condition variable, both under the thread's mutex.
+ */
+#define RECV_NOTIFY(tid) { \
+    pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
+    pool_thread[tid].newdata = 1; \
+    DB("CV Waking %u", tid); \
+    pthread_cond_signal(&(pool_thread[tid].ptcv)); \
+    pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
+/* Block thread tid until its newdata flag is set, then consume the flag.
+ * The wait sits in a predicate loop so a spurious wakeup cannot return
+ * early, and the flag is always cleared on the way out so a notification
+ * is never left behind to satisfy a later, unrelated wait.
+ */
+#define RECV_AWAIT(tid) { \
+    pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
+    while (!pool_thread[tid].newdata) { \
+        DB("CV Waiting %u", tid); \
+        pthread_cond_wait(&(pool_thread[tid].ptcv), \
+                          &(pool_thread[tid].ptmutex)); \
+    } \
+    pool_thread[tid].newdata = 0; \
+    DB("CV Woken %u", tid); \
+    pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
+
+/*****************************************************************************
+ * Message queue management                                                  *
+ *****************************************************************************/
+
+/* Protects the queue manipulation critical regions.
+ */
+pthread_mutex_t ptmutex_queue;
+#define ENTER_QUEUE_CR pthread_mutex_lock(&ptmutex_queue)
+#define LEAVE_QUEUE_CR pthread_mutex_unlock(&ptmutex_queue)
+
+/* Receive-side lock.  NOTE(review): presumably meant to serialise calls
+ * to recv_any(); confirm whether it still has a live user.
+ */
+pthread_mutex_t ptmutex_recv;
+#define ENTER_RECV_CR pthread_mutex_lock(&ptmutex_recv)
+#define LEAVE_RECV_CR pthread_mutex_unlock(&ptmutex_recv)
+
+/* A message queue entry. We allocate one of these for every request we send.
+ * Asynchronous reply reception also uses one of these.
+ *
+ *   prev/next: links in the queue of outstanding requests
+ *   status:    BSQ_STATUS_* bits
+ *   server:    index into bsservers[]
+ *   length:    message length; replaced by the reply's length on a match
+ *   msghdr/iov: scatter/gather state handed to sendmsg()/recvmsg()
+ *   tid:       id of the thread that sent the request
+ *   tv_sent:   time of the most recent (re)transmission
+ *   message:   the message header itself
+ *   block:     block payload, or NULL when there is none
+ */
+typedef struct bsq_t_struct {
+    struct bsq_t_struct *prev;
+    struct bsq_t_struct *next;
+    int status;
+    int server;
+    int length;
+    struct msghdr msghdr;
+    struct iovec iov[2];
+    int tid;
+    struct timeval tv_sent;
+    bshdr_t message;
+    void *block;
+} bsq_t;
+
+/* Set in status once a reply has been matched to the request. */
+#define BSQ_STATUS_MATCHED 1
+
+/* Protects the luid counter. */
+pthread_mutex_t ptmutex_luid;
+#define ENTER_LUID_CR pthread_mutex_lock(&ptmutex_luid)
+#define LEAVE_LUID_CR pthread_mutex_unlock(&ptmutex_luid)
+
+static u64 luid_cnt = 0x1000ULL;
+/* Hand out the next value of the luid counter; the counter is read and
+ * advanced under the luid lock.
+ */
+u64 new_luid(void) {
+    u64 fresh;
+
+    ENTER_LUID_CR;
+    fresh    = luid_cnt;
+    luid_cnt = fresh + 1;
+    LEAVE_LUID_CR;
+
+    return fresh;
+}
+
+/* Queue of outstanding requests.
+ */
+bsq_t *bs_head = NULL;
+bsq_t *bs_tail = NULL;
+int bs_qlen = 0;
+
+/* Dump the request queue to stderr: a header line carrying msg and the
+ * queue length, then one line per entry.  Takes the queue lock itself.
+ */
+void queuedebug(char *msg) {
+    bsq_t *cur;
+
+    ENTER_QUEUE_CR;
+    fprintf(stderr, "Q: %s len=%u\n", msg, bs_qlen);
+    cur = bs_head;
+    while (cur != NULL) {
+        fprintf(stderr, "  luid=%016llx server=%u\n",
+                cur->message.luid, cur->server);
+        cur = cur->next;
+    }
+    LEAVE_QUEUE_CR;
+}
+
+/* Append qe to the tail of the request queue.  Always returns 0. */
+int enqueue(bsq_t *qe) {
+    ENTER_QUEUE_CR;
+    qe->prev = bs_tail;
+    qe->next = NULL;
+    if (bs_head != NULL)
+        bs_tail->next = qe;
+    else
+        bs_head = qe;    /* queue was empty */
+    bs_tail = qe;
+    bs_qlen += 1;
+    LEAVE_QUEUE_CR;
+#ifdef BSDEBUG
+    queuedebug("enqueue");
+#endif
+    return 0;
+}
+
+/* dequeue: unlink qe from the request queue if it is on it.
+ *   @qe: entry to remove
+ *
+ *   @return: 1 if qe was found and removed, 0 if it was not queued
+ */
+int dequeue(bsq_t *qe) {
+    bsq_t *q;
+    int found = 0;
+
+    ENTER_QUEUE_CR;
+    for (q = bs_head; q; q = q->next) {
+        if (q != qe)
+            continue;
+
+        if (q->prev)
+            q->prev->next = q->next;
+        else
+            bs_head = q->next;
+        if (q->next)
+            q->next->prev = q->prev;
+        else
+            bs_tail = q->prev;
+        /* leave no stale links behind, as queuesearch() does. */
+        q->next = NULL;
+        q->prev = NULL;
+        bs_qlen--;
+        found = 1;
+        break;
+    }
+    LEAVE_QUEUE_CR;
+
+#ifdef BSDEBUG
+    /* the success path used to log "dequeue not found" as well. */
+    queuedebug(found ? "dequeue found" : "dequeue not found");
+#endif
+    return found;
+}
+
+/* queuesearch: match a received reply against the outstanding requests.
+ *   @qe: the received message
+ *
+ * A request matches when server, operation and luid all agree.  The
+ * match receives the reply's length, flags and id, is marked
+ * BSQ_STATUS_MATCHED and is unlinked from the queue.  For a READBLOCK
+ * reply whose request does not carry the error flag, qe's data buffer
+ * is handed to the request and qe->block is cleared.
+ *
+ *   @return: the matched request, or NULL if nothing matched
+ */
+bsq_t *queuesearch(bsq_t *qe) {
+    bsq_t *q;
+    ENTER_QUEUE_CR;
+    for (q = bs_head; q; q = q->next) {
+        if ((qe->server == q->server) &&
+            (qe->message.operation == q->message.operation) &&
+            (qe->message.luid == q->message.luid)) {
+
+            /* transfer ownership of the received data to the request. */
+            if ((q->message.operation == BSOP_READBLOCK) &&
+                ((q->message.flags & BSOP_FLAG_ERROR) == 0)) {
+                q->block = qe->block;
+                qe->block = NULL;
+            }
+            q->length = qe->length;
+            q->message.flags = qe->message.flags;
+            q->message.id = qe->message.id;
+            q->status |= BSQ_STATUS_MATCHED;
+
+            /* unlink the request from the queue. */
+            if (q->prev)
+                q->prev->next = q->next;
+            else 
+                bs_head = q->next;
+            if (q->next)
+                q->next->prev = q->prev;
+            else
+                bs_tail = q->prev;
+            q->next = NULL;
+            q->prev = NULL;
+            bs_qlen--;
+            goto found;
+        }
+    }
+
+    LEAVE_QUEUE_CR;
+#ifdef BSDEBUG
+    queuedebug("queuesearch not found");
+#endif
+    return NULL;
+
+    found:
+    LEAVE_QUEUE_CR;
+#ifdef BSDEBUG
+    queuedebug("queuesearch found");
+#endif
+    return q;
+}
+
+/*****************************************************************************
+ * Network communication                                                     *
+ *****************************************************************************/
+
+/* send_message: queue a request and transmit it to its server.
+ *   @qe: request with server, message and (optionally) block filled in
+ *
+ * Builds the msghdr/iovec, gives the message a fresh luid, puts the
+ * request on the outstanding queue and sends it without blocking.
+ * If the send fails the request is taken off the queue again, so the
+ * caller may free it without leaving a dangling queue entry.
+ *
+ *   @return: result of sendmsg() (bytes sent), negative on failure
+ */
+int send_message(bsq_t *qe) {
+    int rc;
+
+    qe->msghdr.msg_name = (void *)&(bsservers[qe->server].sin);
+    qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
+    qe->msghdr.msg_iov = qe->iov;
+    if (qe->block)
+        qe->msghdr.msg_iovlen = 2;
+    else
+        qe->msghdr.msg_iovlen = 1;
+    qe->msghdr.msg_control = NULL;
+    qe->msghdr.msg_controllen = 0;
+    qe->msghdr.msg_flags = 0;
+
+    qe->iov[0].iov_base = (void *)&(qe->message);
+    qe->iov[0].iov_len = MSGBUFSIZE_ID;
+
+    if (qe->block) {
+        qe->iov[1].iov_base = qe->block;
+        qe->iov[1].iov_len = BLOCK_SIZE;
+    }
+
+    qe->message.luid = new_luid();
+
+    qe->status = 0;
+    qe->tid = (int)pthread_getspecific(tid_key);
+    /* queue before sending so a fast reply always finds its request. */
+    if (enqueue(qe) < 0) {
+        fprintf(stderr, "Error enqueuing request.\n");
+        return -1;
+    }
+
+    gettimeofday(&(qe->tv_sent), NULL);
+    DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid);
+    rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
+    if (rc < 0) {
+        /* callers free qe on failure; it must not stay on the queue. */
+        dequeue(qe);
+    }
+
+    return rc;
+}
+
+/* Receive one message from the socket straight into qe: the header goes
+ * into qe->message and, when qe->block is set, the payload into that
+ * buffer.  Returns whatever recvmsg() returns.
+ */
+int recv_message(bsq_t *qe) {
+    struct sockaddr_in from;
+    struct msghdr *mh = &(qe->msghdr);
+
+    mh->msg_name       = &from;
+    mh->msg_namelen    = sizeof(struct sockaddr_in);
+    mh->msg_iov        = qe->iov;
+    mh->msg_iovlen     = qe->block ? 2 : 1;
+    mh->msg_control    = NULL;
+    mh->msg_controllen = 0;
+    mh->msg_flags      = 0;
+
+    qe->iov[0].iov_base = (void *)&(qe->message);
+    qe->iov[0].iov_len  = MSGBUFSIZE_ID;
+    if (qe->block) {
+        qe->iov[1].iov_base = qe->block;
+        qe->iov[1].iov_len  = BLOCK_SIZE;
+    }
+
+    return recvmsg(bssock, mh, 0);
+}
+
+/* get_server_number: map a socket address to an index in bsservers[].
+ *   @sin: address to look up
+ *
+ * Only slots with a hostname set are considered; family, port and
+ * address must all match.
+ *
+ *   @return: index of the matching server, -1 if there is none
+ */
+int get_server_number(struct sockaddr_in *sin) {
+    int i;
+
+#ifdef BSDEBUG2
+    fprintf(stderr,
+            "get_server_number(%u.%u.%u.%u/%u)\n",
+            (unsigned int)sin->sin_addr.s_addr & 0xff,
+            ((unsigned int)sin->sin_addr.s_addr >> 8) & 0xff,
+            ((unsigned int)sin->sin_addr.s_addr >> 16) & 0xff,
+            ((unsigned int)sin->sin_addr.s_addr >> 24) & 0xff,
+            (unsigned int)sin->sin_port);
+#endif
+
+    for (i = 0; i < MAX_SERVERS; i++) {
+        if (bsservers[i].hostname) {
+#ifdef BSDEBUG2
+            fprintf(stderr,
+                    "get_server_number check %u.%u.%u.%u/%u\n",
+                    (unsigned int)bsservers[i].sin.sin_addr.s_addr&0xff,
+                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 8)&0xff,
+                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 16)&0xff,
+                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 24)&0xff,
+                    (unsigned int)bsservers[i].sin.sin_port);
+#endif
+            if ((sin->sin_family == bsservers[i].sin.sin_family) &&
+                (sin->sin_port == bsservers[i].sin.sin_port) &&
+                (memcmp((void *)&(sin->sin_addr),
+                        (void *)&(bsservers[i].sin.sin_addr),
+                        sizeof(struct in_addr)) == 0)) {
+                return i;
+            }
+        }
+    }
+
+    return -1;
+}
+
+/* Spare receive buffer, and the single entry every reply is received into. */
+void *rx_buffer = NULL;
+bsq_t rx_qe;
+/* recv_any: block until any message arrives and receive it into rx_qe.
+ *
+ * The payload lands in a BLOCK_SIZE buffer taken from rx_buffer, or
+ * freshly allocated when no spare is available.  On success rx_qe.length
+ * and rx_qe.server are filled in; server is -1 for an unknown sender.
+ * If the receive fails the buffer is kept as the spare for the next
+ * call, so repeated errors do not leak one buffer each.
+ *
+ *   @return: &rx_qe on success, NULL on error
+ */
+bsq_t *recv_any(void) {
+    struct sockaddr_in from;
+    int rc;
+
+    DB("ENTER recv_any\n");
+
+    rx_qe.msghdr.msg_name = &from;
+    rx_qe.msghdr.msg_namelen = sizeof(struct sockaddr_in);
+    rx_qe.msghdr.msg_iov = rx_qe.iov;
+    if (!rx_buffer) {
+        rx_buffer = malloc(BLOCK_SIZE);
+        if (!rx_buffer) {
+            perror("recv_any malloc");
+            return NULL;
+        }
+    }
+    rx_qe.block = rx_buffer;
+    rx_buffer = NULL;
+    rx_qe.msghdr.msg_iovlen = 2;
+    rx_qe.msghdr.msg_control = NULL;
+    rx_qe.msghdr.msg_controllen = 0;
+    rx_qe.msghdr.msg_flags = 0;
+
+    rx_qe.iov[0].iov_base = (void *)&(rx_qe.message);
+    rx_qe.iov[0].iov_len = MSGBUFSIZE_ID;
+    rx_qe.iov[1].iov_base = rx_qe.block;
+    rx_qe.iov[1].iov_len = BLOCK_SIZE;
+
+    rc = recvmsg(bssock, &(rx_qe.msghdr), 0);
+    if (rc < 0) {
+        perror("recv_any");
+        /* hand the unused buffer back instead of dropping it. */
+        rx_buffer = rx_qe.block;
+        rx_qe.block = NULL;
+        return NULL;
+    }
+
+    rx_qe.length = rc;
+    rx_qe.server = get_server_number(&from);
+
+    DB("recv_any from %d luid=%016llx len=%u\n",
+       rx_qe.server, rx_qe.message.luid, rx_qe.length);
+
+    return &rx_qe;
+}
+
+/* If q still owns a data buffer, move it into rx_buffer so the next
+ * receive can reuse it; otherwise do nothing.
+ */
+void recv_recycle_buffer(bsq_t *q) {
+    void *spare = q->block;
+
+    if (spare == NULL)
+        return;
+
+    q->block  = NULL;
+    rx_buffer = spare;
+}
+
+/* wait_recv: block until every request in reqs[] has been matched.
+ *   @reqs:    requests to wait for
+ *   @numreqs: number of entries in reqs
+ *
+ * The calling thread sleeps on its own notification slot between checks
+ * of the requests' BSQ_STATUS_MATCHED bits.
+ *
+ *   @return: numreqs
+ */
+int wait_recv(bsq_t **reqs, int numreqs) {
+    int i;
+    int tid = (int)pthread_getspecific(tid_key);
+
+    DB("ENTER wait_recv %u\n", numreqs);
+
+    for (;;) {
+        /* AND the status words together: the MATCHED bit survives only
+         * if every request has it set. */
+        unsigned int x = 0xffffffff;
+
+        for (i = 0; i < numreqs; i++) {
+            x &= reqs[i]->status;
+        }
+        if ((x & BSQ_STATUS_MATCHED)) {
+            DB("LEAVE wait_recv\n");
+            return numreqs;
+        }
+
+        RECV_AWAIT(tid);
+    }
+}
+
+/* retry: retransmit a request that is already on the queue.
+ * Restamps tv_sent, bumps retry_count and resends the stored msghdr
+ * without blocking.  Returns 0 on success, or sendmsg()'s negative
+ * result on failure.
+ */
+static int retry_count = 0;
+int retry(bsq_t *qe)
+{
+    int sent;
+
+    gettimeofday(&(qe->tv_sent), NULL);
+    DB("retry to %d luid=%016llx\n", qe->server, qe->message.luid);
+    retry_count += 1;
+
+    sent = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
+    return (sent < 0) ? sent : 0;
+}
+
+/* queue runner
+ *
+ * Thread body: once a second, walk the outstanding-request queue under
+ * the queue lock and retransmit every request whose last send is more
+ * than RETRY_TIMEOUT microseconds old.  Never returns.
+ */
+void *queue_runner(void *arg)
+{
+    for (;;) {
+        struct timeval now;
+        long long nowus, sus;
+        bsq_t *q;
+        int r;
+
+        sleep(1);
+
+        gettimeofday(&now, NULL);
+        /* widen before multiplying: tv_sec * 1000000 overflows a
+         * 32-bit long. */
+        nowus = (long long)now.tv_sec * 1000000LL + now.tv_usec;
+        ENTER_QUEUE_CR;
+        r = retry_count;
+        for (q = bs_head; q; q = q->next) {
+            sus = (long long)q->tv_sent.tv_sec * 1000000LL
+                  + q->tv_sent.tv_usec;
+            if ((nowus - sus) > RETRY_TIMEOUT) {
+                if (retry(q) < 0) {
+                    fprintf(stderr, "Error on sendmsg retry.\n");
+                }
+            }
+        }
+        /* report how many retries this pass made, and the running total. */
+        if (r != retry_count) {
+            fprintf(stderr, "RETRIES: %u %u\n", retry_count - r, retry_count);
+        }
+        LEAVE_QUEUE_CR;
+    }
+}
+
+/* receive loop
+ *
+ * Thread body: receive replies forever, pair each with its outstanding
+ * request, and notify the thread that sent the request.
+ */
+void *receive_loop(void *arg)
+{
+    for (;;) {
+        bsq_t *rx, *hit;
+
+        rx = recv_any();
+        if (rx == NULL) {
+            fprintf(stderr, "recv_any error\n");
+            continue;
+        }
+
+        hit = queuesearch(rx);
+        recv_recycle_buffer(rx);
+        if (hit == NULL) {
+            fprintf(stderr, "Unmatched RX\n");
+            continue;
+        }
+
+        DB("RX MATCH");
+        RECV_NOTIFY(hit->tid);
+    }
+}
+pthread_t pthread_recv;
+
+/*****************************************************************************
+ * Reading                                                                   *
+ *****************************************************************************/
+
+/* readblock_indiv: read one block from one specific server.
+ *   @server: index into bsservers[]
+ *   @id:     block id as known to that server
+ *
+ * Sends a READBLOCK request and waits for the reply.  The request
+ * carries no buffer of its own: the reply's data buffer is attached to
+ * it when the reply is matched, and is returned to the caller.
+ *
+ *   @return: pointer to the block data, NULL on error
+ */
+void *readblock_indiv(int server, u64 id) {
+    void *block;
+    bsq_t *qe;
+    int rc;
+
+    qe = (bsq_t *)malloc(sizeof(bsq_t));
+    if (!qe) {
+        perror("readblock qe malloc");
+        return NULL;
+    }
+    qe->block = NULL;
+
+    qe->server = server;
+
+    qe->message.operation = BSOP_READBLOCK;
+    qe->message.flags = 0;
+    qe->message.id = id;
+    qe->length = MSGBUFSIZE_ID;
+
+    if (send_message(qe) < 0) {
+        perror("readblock sendto");
+        goto err;
+    }
+
+    rc = wait_recv(&qe, 1);
+    if (rc < 0) {
+        perror("readblock recv");
+        goto err;
+    }
+
+    if ((qe->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "readblock server error\n");
+        goto err;
+    }
+    if (qe->length < MSGBUFSIZE_BLOCK) {
+        /* report the length actually received (this used to print an
+         * uninitialised local). */
+        fprintf(stderr, "readblock recv short (%u)\n", qe->length);
+        goto err;
+    }
+    block = qe->block;
+
+    free((void *)qe);
+    return block;
+
+    err:
+    if (qe->block)
+        free(qe->block);
+    free((void *)qe);
+    return NULL;
+}
+
+/**
+ * readblock: read a block from disk
+ *   @id: block id to read
+ *
+ *   Reads rotate round-robin over the cluster's replicas, except for
+ *   ids below 6, which always come from the first replica.
+ *
+ *   @return: pointer to block, NULL on error
+ */
+void *readblock(u64 id) {
+    int map = (int)BSID_MAP(id);
+    u64 xid;
+    static int i = CLUSTER_MAX_REPLICAS - 1;
+    int rep;
+    void *block = NULL;
+
+    /* special case for the "superblock" just use the first block on the
+     * first replica. (extend to blocks < 6 for vdi bug)
+     */
+    if (id < 6) {
+        block = readblock_indiv(bsclusters[map].servers[0], id);
+        goto out;
+    }
+
+    /* Advance the shared round-robin counter, then work from a private
+     * copy: several I/O threads call this concurrently, and re-reading
+     * the static could pair one replica's id with another's server.
+     */
+    rep = i + 1;
+    if (rep >= CLUSTER_MAX_REPLICAS || rep < 0)
+        rep = 0;
+    i = rep;
+
+    switch (rep) {
+    case 1:
+        xid = BSID_REPLICA1(id);
+        break;
+    case 2:
+        xid = BSID_REPLICA2(id);
+        break;
+    case 0:
+    default:
+        /* anything unexpected falls back to the first replica, so xid
+         * is never left unset. */
+        rep = 0;
+        xid = BSID_REPLICA0(id);
+        break;
+    }
+
+    block = readblock_indiv(bsclusters[map].servers[rep], xid);
+
+    out:
+#ifdef BSDEBUG
+    if (block)
+        fprintf(stderr, "READ:  %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
+                id,
+                (unsigned int)((unsigned char *)block)[0],
+                (unsigned int)((unsigned char *)block)[1],
+                (unsigned int)((unsigned char *)block)[2],
+                (unsigned int)((unsigned char *)block)[3],
+                (unsigned int)((unsigned char *)block)[4],
+                (unsigned int)((unsigned char *)block)[5],
+                (unsigned int)((unsigned char *)block)[6],
+                (unsigned int)((unsigned char *)block)[7]);
+    else
+        fprintf(stderr, "READ:  %016llx NULL\n", id);
+#endif
+    return block;
+}
+
+/*****************************************************************************
+ * Writing                                                                   *
+ *****************************************************************************/
+
+/**
+ * writeblock_indiv: build a WRITEBLOCK request and hand it to send_message()
+ *   @server: server number recorded in the request
+ *   @id: block id placed in the message
+ *   @block: block data; only the pointer is stored, the data is not copied
+ *
+ *   @return: the request on success (callers in this file free it once the
+ *            reply has been examined), NULL on error
+ */
+bsq_t *writeblock_indiv(int server, u64 id, void *block) {
+    bsq_t *qe;
+
+    qe = (bsq_t *)malloc(sizeof(bsq_t));
+    if (!qe) {
+        perror("writeblock qe malloc");
+        return NULL;
+    }
+    qe->server = server;
+
+    qe->message.operation = BSOP_WRITEBLOCK;
+    qe->message.flags = 0;
+    qe->message.id = id;
+    /* the payload is referenced, not copied into the message */
+    qe->block = block;
+    qe->length = MSGBUFSIZE_BLOCK;
+
+    if (send_message(qe) < 0) {
+        perror("writeblock sendto");
+        free((void *)qe);
+        return NULL;
+    }
+
+    return qe;
+}
+    
+
+/**
+ * writeblock: write an existing block to disk
+ *   @id: block id
+ *   @block: pointer to block
+ *
+ * The block is sent to all three replica servers of the cluster selected by
+ * the map field of @id; the write succeeds only if every reply arrives
+ * without BSOP_FLAG_ERROR set.  Ids below 6 are written to the first
+ * replica only, with the id passed through unchanged.
+ *
+ *   @return: zero on success, -1 on failure
+ */
+int writeblock(u64 id, void *block) {
+    
+    int map = (int)BSID_MAP(id);
+    int rep0 = bsclusters[map].servers[0];
+    int rep1 = bsclusters[map].servers[1];
+    int rep2 = bsclusters[map].servers[2];
+    bsq_t *reqs[3];
+    int rc;
+
+    reqs[0] = reqs[1] = reqs[2] = NULL;
+
+#ifdef BSDEBUG
+    fprintf(stderr,
+            "WRITE: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
+            id,
+            (unsigned int)((unsigned char *)block)[0],
+            (unsigned int)((unsigned char *)block)[1],
+            (unsigned int)((unsigned char *)block)[2],
+            (unsigned int)((unsigned char *)block)[3],
+            (unsigned int)((unsigned char *)block)[4],
+            (unsigned int)((unsigned char *)block)[5],
+            (unsigned int)((unsigned char *)block)[6],
+            (unsigned int)((unsigned char *)block)[7]);
+#endif
+
+    /* special case for the "superblock" just use the first block on the
+     * first replica. (extend to blocks < 6 for vdi bug)
+     */
+    if (id < 6) {
+        reqs[0] = writeblock_indiv(rep0, id, block);
+        if (!reqs[0])
+            return -1;
+        rc = wait_recv(reqs, 1);
+        if (rc < 0) {
+            perror("writeblock recv");
+            goto err;
+        }
+        if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
+            fprintf(stderr, "writeblock server0 error\n");
+            goto err;
+        }
+        /* release the request here; this path used to return without
+         * freeing it */
+        free((void *)reqs[0]);
+        return 0;
+    }
+
+    reqs[0] = writeblock_indiv(rep0, BSID_REPLICA0(id), block);
+    if (!reqs[0])
+        goto err;
+    reqs[1] = writeblock_indiv(rep1, BSID_REPLICA1(id), block);
+    if (!reqs[1])
+        goto err;
+    reqs[2] = writeblock_indiv(rep2, BSID_REPLICA2(id), block);
+    if (!reqs[2])
+        goto err;
+
+    rc = wait_recv(reqs, 3);
+    if (rc < 0) {
+        perror("writeblock recv");
+        goto err;
+    }
+    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "writeblock server0 error\n");
+        goto err;
+    }
+    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "writeblock server1 error\n");
+        goto err;
+    }
+    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "writeblock server2 error\n");
+        goto err;
+    }
+
+    free((void *)reqs[0]);
+    free((void *)reqs[1]);
+    free((void *)reqs[2]);
+    return 0;
+
+    err:
+    /* unlink every request that was issued before freeing it */
+    if (reqs[0]) {
+        dequeue(reqs[0]);
+        free((void *)reqs[0]);
+    }
+    if (reqs[1]) {
+        dequeue(reqs[1]);
+        free((void *)reqs[1]);
+    }
+    if (reqs[2]) {
+        dequeue(reqs[2]);
+        free((void *)reqs[2]);
+    }
+    return -1;
+}
+
+/*****************************************************************************
+ * Allocation                                                                *
+ *****************************************************************************/
+
+/**
+ * allocblock: write a new block to disk
+ *   @block: pointer to block
+ *
+ * Thin wrapper: forwards to allocblock_hint() with a hint of 0.
+ *
+ *   @return: new id of block on disk
+ */
+u64 allocblock(void *block) {
+    return allocblock_hint(block, 0);
+}
+
+/**
+ * allocblock_hint_indiv: build an ALLOCBLOCK request and hand it to
+ * send_message()
+ *   @server: server number recorded in the request
+ *   @block: block data; only the pointer is stored, the data is not copied
+ *   @hint: allocation hint, sent in the id field of the message
+ *
+ *   @return: the request on success (callers in this file free it once the
+ *            reply has been examined), NULL on error
+ */
+bsq_t *allocblock_hint_indiv(int server, void *block, u64 hint) {
+    bsq_t *qe;
+
+    qe = (bsq_t *)malloc(sizeof(bsq_t));
+    if (!qe) {
+        perror("allocblock_hint qe malloc");
+        return NULL;
+    }
+    qe->server = server;
+
+    qe->message.operation = BSOP_ALLOCBLOCK;
+    qe->message.flags = 0;
+    qe->message.id = hint;
+    /* the payload is referenced, not copied into the message */
+    qe->block = block;
+    qe->length = MSGBUFSIZE_BLOCK;
+
+    if (send_message(qe) < 0) {
+        perror("allocblock_hint sendto");
+        free((void *)qe);
+        return NULL;
+    }
+
+    return qe;
+}
+
+/**
+ * allocblock_hint: write a new block to disk
+ *   @block: pointer to block
+ *   @hint: allocation hint; its value is used directly as the cluster map
+ *          index and is also sent to each server in the request id field
+ *
+ * The block is sent to all three replica servers of the selected cluster
+ * and the three ids they return are packed into one composite id with
+ * BSID().
+ *
+ *   @return: new id of block on disk, 0 on failure
+ */
+u64 allocblock_hint(void *block, u64 hint) {
+    /* NOTE(review): hint is truncated to int and used as an array index
+     * with no range check -- confirm callers only pass valid map values. */
+    int map = (int)hint;
+    int rep0 = bsclusters[map].servers[0];
+    int rep1 = bsclusters[map].servers[1];
+    int rep2 = bsclusters[map].servers[2];
+    bsq_t *reqs[3];
+    int rc;
+    u64 id0, id1, id2;
+
+    /* NULL entries let the error path tell which requests were issued */
+    reqs[0] = reqs[1] = reqs[2] = NULL;
+
+    DB("ENTER allocblock\n");
+
+    /* issue one allocation request per replica */
+    reqs[0] = allocblock_hint_indiv(rep0, block, hint);
+    if (!reqs[0])
+        goto err;
+    reqs[1] = allocblock_hint_indiv(rep1, block, hint);
+    if (!reqs[1])
+        goto err;
+    reqs[2] = allocblock_hint_indiv(rep2, block, hint);
+    if (!reqs[2])
+        goto err;
+
+    /* wait on all three requests, then reject any reply flagged as error */
+    rc = wait_recv(reqs, 3);
+    if (rc < 0) {
+        perror("allocblock recv");
+        goto err;
+    }
+    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "allocblock server0 error\n");
+        goto err;
+    }
+    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "allocblock server1 error\n");
+        goto err;
+    }
+    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
+        fprintf(stderr, "allocblock server2 error\n");
+        goto err;
+    }
+
+    /* each reply carries the id that server assigned to the block */
+    id0 = reqs[0]->message.id;
+    id1 = reqs[1]->message.id;
+    id2 = reqs[2]->message.id;
+
+#ifdef BSDEBUG
+    fprintf(stderr, "ALLOC: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
+            BSID(map, id0, id1, id2),
+            (unsigned int)((unsigned char *)block)[0],
+            (unsigned int)((unsigned char *)block)[1],
+            (unsigned int)((unsigned char *)block)[2],
+            (unsigned int)((unsigned char *)block)[3],
+            (unsigned int)((unsigned char *)block)[4],
+            (unsigned int)((unsigned char *)block)[5],
+            (unsigned int)((unsigned char *)block)[6],
+            (unsigned int)((unsigned char *)block)[7]);
+#endif
+    
+    free((void *)reqs[0]);
+    free((void *)reqs[1]);
+    free((void *)reqs[2]);
+    return BSID(map, id0, id1, id2);
+
+    err:
+    /* dequeue and free whichever requests were issued before the failure */
+    if (reqs[0]) {
+        dequeue(reqs[0]);
+        free((void *)reqs[0]);
+    }
+    if (reqs[1]) {
+        dequeue(reqs[1]);
+        free((void *)reqs[1]);
+    }
+    if (reqs[2]) {
+        dequeue(reqs[2]);
+        free((void *)reqs[2]);
+    }
+    return 0;
+}
+
+#else /* /BLOCKSTORE_REMOTE */
+
+/*****************************************************************************
+ * Local storage version                                                     *
+ *****************************************************************************/
+ 
+/**
+ * readblock: load one block from the local "blockstore.dat" file
+ *   @id: block id to read; ids are 1-based, so the block lives at byte
+ *        offset (@id - 1) * BLOCK_SIZE
+ *
+ *   @return: newly malloc'd BLOCK_SIZE buffer holding the block,
+ *            NULL on error
+ */
+
+void *readblock(u64 id) {
+    void *buf = NULL;
+    int fd;
+
+    fd = open("blockstore.dat", O_RDONLY | O_CREAT | O_LARGEFILE, 0644);
+    if (fd < 0) {
+        perror("open");
+        return NULL;
+    }
+
+    /* seek, allocate, read: the first step that fails ends the chain */
+    if (lseek64(fd, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
+        printf ("%Ld ", id);
+        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
+        perror("readblock lseek");
+    } else if ((buf = malloc(BLOCK_SIZE)) == NULL) {
+        perror("readblock malloc");
+    } else if (read(fd, buf, BLOCK_SIZE) != BLOCK_SIZE) {
+        perror("readblock read");
+        free(buf);
+        buf = NULL;
+    }
+
+    close(fd);
+    return buf;
+}
+
+/**
+ * writeblock: write an existing block to disk
+ *   @id: block id; ids are 1-based, so the block is written at byte offset
+ *        (@id - 1) * BLOCK_SIZE of "blockstore.dat"
+ *   @block: pointer to BLOCK_SIZE bytes of data
+ *
+ * A write that transfers fewer than BLOCK_SIZE bytes is reported as a
+ * failure, matching the check allocblock() applies to its own write.
+ *
+ *   @return: zero on success, -1 on failure
+ */
+int writeblock(u64 id, void *block) {
+    
+    int block_fp;
+    int rc = -1;
+    ssize_t written;
+    
+    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
+
+    if (block_fp < 0) {
+        perror("open");
+        return -1;
+    }
+
+    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
+        perror("writeblock lseek");
+        goto out;
+    }
+    written = write(block_fp, block, BLOCK_SIZE);
+    if (written < 0) {
+        perror("writeblock write");
+        goto out;
+    }
+    if (written != BLOCK_SIZE) {
+        /* errno is not meaningful for a short write, so no perror here */
+        fprintf(stderr, "writeblock short write (%ld of %d bytes)\n",
+                (long)written, BLOCK_SIZE);
+        goto out;
+    }
+    rc = 0;
+
+out:
+    close(block_fp);
+    return rc;
+}
+
+/**
+ * allocblock: append a new block to the local "blockstore.dat" file
+ *   @block: pointer to BLOCK_SIZE bytes of data
+ *
+ * The file length must be a whole number of blocks; the new block goes at
+ * the end and its id is its 1-based position in the file.
+ *
+ *   @return: new id of block on disk, 0 on failure
+ */
+
+u64 allocblock(void *block) {
+    u64 new_id = 0;
+    off64_t end;
+    int fd;
+
+    fd = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
+    if (fd < 0) {
+        perror("open");
+        return 0;
+    }
+
+    /* find the end, sanity-check it, append: first failure ends the chain */
+    end = lseek64(fd, 0, SEEK_END);
+    if (end == (off64_t)-1) {
+        perror("allocblock lseek");
+    } else if (end % BLOCK_SIZE != 0) {
+        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
+    } else if (write(fd, block, BLOCK_SIZE) != BLOCK_SIZE) {
+        perror("allocblock write");
+    } else {
+        new_id = end / BLOCK_SIZE + 1;
+    }
+
+    close(fd);
+    return new_id;
+}
+
+/**
+ * allocblock_hint: write a new block to disk
+ *   @block: pointer to block
+ *   @hint: allocation hint (ignored by this local-file implementation)
+ *
+ * Thin wrapper: forwards to allocblock().
+ *
+ *   @return: new id of block on disk
+ */
+u64 allocblock_hint(void *block, u64 hint) {
+    return allocblock(block);
+}
+
+#endif /* BLOCKSTORE_REMOTE */
+
+/*****************************************************************************
+ * Memory management                                                         *
+ *****************************************************************************/
+
+/**
+ * newblock: allocate a zero-filled in-memory block of BLOCK_SIZE bytes
+ *
+ *   @return: pointer to new block, NULL on error (reported via perror)
+ */
+void *newblock() {
+    void *zeroed = calloc(1, BLOCK_SIZE);
+
+    if (zeroed == NULL)
+        perror("newblock");
+    return zeroed;
+}
+
+
+/**
+ * freeblock: release an in-memory block
+ *   @block: block to be freed; NULL is accepted and ignored
+ */
+void freeblock(void *block) {
+    /* free() is already a no-op for NULL */
+    free(block);
+}
+
+/* Allocate an in-memory free-list block: magic set, no successor, no
+ * entries.  Returns NULL if the block cannot be allocated. */
+static freeblock_t *new_freeblock(void)
+{
+    freeblock_t *node = newblock();
+
+    if (node != NULL) {
+        node->magic = FREEBLOCK_MAGIC;
+        node->next  = 0ULL;
+        node->count = 0ULL;
+        memset(node->list, 0, sizeof node->list);
+    }
+
+    return node;
+}
+
+/**
+ * releaseblock: add a block id to the on-disk free list
+ *   @id: block id to record as free
+ *
+ * The superblock names a "current" free-list block that is still filling
+ * up and a chain of "full" ones.  The id is appended to the current block;
+ * when that block is full it is pushed onto the full chain and a fresh
+ * current block is allocated first.
+ *
+ * If a block cannot be read or allocated the id is not recorded and the
+ * function returns after printing a diagnostic.
+ */
+void releaseblock(u64 id)
+{
+    blockstore_super_t *bs_super;
+    freeblock_t *fl_current = NULL;
+    
+    /* get superblock */
+    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
+    if (bs_super == NULL) {
+        fprintf(stderr, "releaseblock: cannot read superblock\n");
+        return;
+    }
+    
+    /* get freeblock_current */
+    if (bs_super->freelist_current == 0ULL) 
+    {
+        fl_current = new_freeblock();
+        if (fl_current == NULL)
+            goto out;
+        bs_super->freelist_current = allocblock(fl_current);
+        if (bs_super->freelist_current == 0ULL) {
+            /* allocblock() returns 0 on failure; do not persist that */
+            fprintf(stderr, "releaseblock: cannot allocate freelist block\n");
+            goto out;
+        }
+        writeblock(BLOCKSTORE_SUPER, bs_super);
+    } else {
+        fl_current = readblock(bs_super->freelist_current);
+        if (fl_current == NULL) {
+            fprintf(stderr, "releaseblock: cannot read freelist block\n");
+            goto out;
+        }
+    }
+    
+    /* if full, chain to superblock and allocate new current.  ">=" rather
+     * than "==" so a corrupt count can never index past list[]. */
+    if (fl_current->count >= FREEBLOCK_SIZE) {
+        fl_current->next = bs_super->freelist_full;
+        writeblock(bs_super->freelist_current, fl_current);
+        bs_super->freelist_full = bs_super->freelist_current;
+        freeblock(fl_current);
+        fl_current = new_freeblock();
+        if (fl_current == NULL)
+            goto out;
+        bs_super->freelist_current = allocblock(fl_current);
+        if (bs_super->freelist_current == 0ULL) {
+            fprintf(stderr, "releaseblock: cannot allocate freelist block\n");
+            goto out;
+        }
+        writeblock(BLOCKSTORE_SUPER, bs_super);
+    }
+    
+    /* append id to current */
+    fl_current->list[fl_current->count++] = id;
+    writeblock(bs_super->freelist_current, fl_current);
+    
+out:
+    /* freeblock() accepts NULL, so this is safe on every path */
+    freeblock(fl_current);
+    freeblock(bs_super);
+}
+
+/* freelist debug functions: */
+
+/**
+ * freelist_count: print how many ids are on the free list
+ *   @print_each: when 1, also print every id
+ *
+ * Walks the current free-list block and then the chain of full blocks
+ * hanging off the superblock.  The grand total is only printed when the
+ * full chain is non-empty and could be read completely.
+ */
+void freelist_count(int print_each)
+{
+    blockstore_super_t *bs_super;
+    freeblock_t *fb;
+    u64 total = 0, next, i;
+    
+    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
+    if (bs_super == NULL) {
+        fprintf(stderr, "freelist_count: cannot read superblock\n");
+        return;
+    }
+    
+    if (bs_super->freelist_current == 0ULL) {
+        printf("freelist is empty!\n");
+        goto out;
+    }
+    
+    fb = readblock(bs_super->freelist_current);
+    if (fb == NULL) {
+        fprintf(stderr, "freelist_count: cannot read freelist block\n");
+        goto out;
+    }
+    printf("%Ld entires on current.\n", fb->count);
+    total += fb->count;
+    if (print_each == 1)
+    {
+        for (i=0; i< fb->count; i++)
+            printf("  %Ld\n", fb->list[i]);
+    }
+    
+    freeblock(fb);
+    
+    if (bs_super->freelist_full == 0ULL) {
+        printf("freelist_full is empty!\n");
+        goto out;
+    }
+    
+    /* follow the chain of full blocks until a zero next pointer */
+    next = bs_super->freelist_full;
+    for (;;) {
+        fb = readblock(next);
+        if (fb == NULL) {
+            fprintf(stderr, "freelist_count: cannot read freelist block\n");
+            goto out;
+        }
+        total += fb->count;
+        if (print_each == 1)
+        {
+            for (i=0; i< fb->count; i++)
+                printf("  %Ld\n", fb->list[i]);
+        }
+        next = fb->next;
+        freeblock(fb);
+        if (next == 0ULL) break;
+    }
+    printf("Total of %Ld ids on freelist.\n", total);
+
+out:
+    /* the superblock copy used to be leaked on every return path */
+    freeblock(bs_super);
+}
+
+/*****************************************************************************
+ * Initialisation                                                            *
+ *****************************************************************************/
+
+/**
+ * __init_blockstore: set up the block store
+ *
+ * With BLOCKSTORE_REMOTE: initialises the mutexes and per-thread pool
+ * state, resolves the server hostnames, fills in the cluster map, binds a
+ * UDP socket on BLOCKSTORED_PORT and starts the receive_loop and
+ * queue_runner threads.
+ *
+ * Without it: opens (creating if needed) "blockstore.dat"; an empty file
+ * gets a fresh superblock, otherwise the existing superblock's magic is
+ * verified and the process exits if it does not match.
+ *
+ *   @return: 0 on success, -1 on failure
+ */
+int __init_blockstore(void)
+{
+#ifdef BLOCKSTORE_REMOTE
+    int i;
+    struct hostent *addr;
+
+    pthread_mutex_init(&ptmutex_queue, NULL);
+    pthread_mutex_init(&ptmutex_luid, NULL);
+    pthread_mutex_init(&ptmutex_recv, NULL);
+    /*pthread_mutex_init(&ptmutex_notify, NULL);*/
+    /* NOTE(review): the bound is inclusive, so READ_POOL_SIZE + 1 entries
+     * are touched -- confirm pool_thread[] is declared that large. */
+    for (i = 0; i <= READ_POOL_SIZE; i++) {
+        pool_thread[i].newdata = 0;
+        pthread_mutex_init(&(pool_thread[i].ptmutex), NULL);
+        pthread_cond_init(&(pool_thread[i].ptcv), NULL);
+    }
+
+    bsservers[0].hostname = "firebug.cl.cam.ac.uk";
+    bsservers[1].hostname = "planb.cl.cam.ac.uk";
+    bsservers[2].hostname = "simcity.cl.cam.ac.uk";
+    bsservers[3].hostname = NULL/*"gunfighter.cl.cam.ac.uk"*/;
+    bsservers[4].hostname = NULL/*"galaxian.cl.cam.ac.uk"*/;
+    bsservers[5].hostname = NULL/*"firetrack.cl.cam.ac.uk"*/;
+    bsservers[6].hostname = NULL/*"funfair.cl.cam.ac.uk"*/;
+    bsservers[7].hostname = NULL/*"felix.cl.cam.ac.uk"*/;
+    bsservers[8].hostname = NULL;
+    bsservers[9].hostname = NULL;
+    bsservers[10].hostname = NULL;
+    bsservers[11].hostname = NULL;
+    bsservers[12].hostname = NULL;
+    bsservers[13].hostname = NULL;
+    bsservers[14].hostname = NULL;
+    bsservers[15].hostname = NULL;
+
+    /* resolve every configured server; slots without a hostname are
+     * skipped */
+    for (i = 0; i < MAX_SERVERS; i++) {
+        if (!bsservers[i].hostname)
+            continue;
+        addr = gethostbyname(bsservers[i].hostname);
+        if (!addr) {
+            perror("bad hostname");
+            return -1;
+        }
+        bsservers[i].sin.sin_family = addr->h_addrtype;
+        bsservers[i].sin.sin_port = htons(BLOCKSTORED_PORT);
+        bsservers[i].sin.sin_addr.s_addr = 
+            ((struct in_addr *)(addr->h_addr))->s_addr;
+    }
+
+    /* Cluster map
+     */
+    bsclusters[0].servers[0] = 0;
+    bsclusters[0].servers[1] = 1;
+    bsclusters[0].servers[2] = 2;
+    bsclusters[1].servers[0] = 1;
+    bsclusters[1].servers[1] = 2;
+    bsclusters[1].servers[2] = 3;
+    bsclusters[2].servers[0] = 2;
+    bsclusters[2].servers[1] = 3;
+    bsclusters[2].servers[2] = 4;
+    bsclusters[3].servers[0] = 3;
+    bsclusters[3].servers[1] = 4;
+    bsclusters[3].servers[2] = 5;
+    bsclusters[4].servers[0] = 4;
+    bsclusters[4].servers[1] = 5;
+    bsclusters[4].servers[2] = 6;
+    bsclusters[5].servers[0] = 5;
+    bsclusters[5].servers[1] = 6;
+    bsclusters[5].servers[2] = 7;
+    bsclusters[6].servers[0] = 6;
+    bsclusters[6].servers[1] = 7;
+    bsclusters[6].servers[2] = 0;
+    bsclusters[7].servers[0] = 7;
+    bsclusters[7].servers[1] = 0;
+    bsclusters[7].servers[2] = 1;
+
+    /* Local socket set up
+     */
+    bssock = socket(AF_INET, SOCK_DGRAM, 0);
+    if (bssock < 0) {
+        perror("Bad socket");
+        return -1;
+    }
+    memset(&sin_local, 0, sizeof(sin_local));
+    sin_local.sin_family = AF_INET;
+    sin_local.sin_port = htons(BLOCKSTORED_PORT);
+    sin_local.sin_addr.s_addr = htonl(INADDR_ANY);
+    if (bind(bssock, (struct sockaddr *)&sin_local, sizeof(sin_local)) < 0) {
+        perror("bind");
+        close(bssock);
+        return -1;
+    }
+
+    /* NOTE(review): both threads are stored in the same pthread_recv
+     * handle, so the first thread's id is overwritten -- confirm intended. */
+    pthread_create(&pthread_recv, NULL, receive_loop, NULL);
+    pthread_create(&pthread_recv, NULL, queue_runner, NULL);
+
+#else /* /BLOCKSTORE_REMOTE */
+    blockstore_super_t *bs_super;
+    u64 ret;
+    int block_fp;
+
+    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
+
+    if (block_fp < 0) {
+        perror("open");
+        return -1;
+    }
+    
+    if (lseek(block_fp, 0, SEEK_END) == 0) {
+        /* empty file: the first block allocated gets id 1, which is
+         * BLOCKSTORE_SUPER */
+        bs_super = newblock();
+        if (bs_super == NULL) {
+            close(block_fp);
+            return -1;
+        }
+        bs_super->magic            = BLOCKSTORE_MAGIC;
+        bs_super->freelist_full    = 0LL;
+        bs_super->freelist_current = 0LL;
+        
+        ret = allocblock(bs_super);
+        
+        freeblock(bs_super);
+
+        if (ret == 0) {
+            /* allocblock() returns 0 on failure */
+            fprintf(stderr, "__init_blockstore: cannot write superblock\n");
+            close(block_fp);
+            return -1;
+        }
+    } else {
+        bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
+        if (bs_super == NULL) {
+            fprintf(stderr, "__init_blockstore: cannot read superblock\n");
+            close(block_fp);
+            return -1;
+        }
+        if (bs_super->magic != BLOCKSTORE_MAGIC)
+        {
+            printf("BLOCKSTORE IS CORRUPT! (no magic in superblock!)\n");
+            exit(-1);
+        }
+        freeblock(bs_super);
+    }
+        
+    close(block_fp);
+        
+#endif /*  BLOCKSTORE_REMOTE */   
+    return 0;
+}
+
+/**
+ * __exit_blockstore: tear down the synchronisation objects created by
+ * __init_blockstore().  Does nothing unless BLOCKSTORE_REMOTE is defined.
+ */
+void __exit_blockstore(void)
+{
+#ifdef BLOCKSTORE_REMOTE
+    /* declared inside the #ifdef so the local build has no unused variable */
+    int i;
+
+    pthread_mutex_destroy(&ptmutex_recv);
+    pthread_mutex_destroy(&ptmutex_luid);
+    pthread_mutex_destroy(&ptmutex_queue);
+    /*pthread_mutex_destroy(&ptmutex_notify);
+      pthread_cond_destroy(&ptcv_notify);*/
+    /* same inclusive bound as the initialisation loop */
+    for (i = 0; i <= READ_POOL_SIZE; i++) {
+        pthread_mutex_destroy(&(pool_thread[i].ptmutex));
+        pthread_cond_destroy(&(pool_thread[i].ptcv));
+    }
+#endif
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/blockstore.h
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/blockstore.h        Sun Jul  3 22:36:48 2005
@@ -0,0 +1,134 @@
+/**************************************************************************
+ * 
+ * blockstore.h
+ *
+ * Simple block store interface
+ *
+ */
+ 
+#ifndef __BLOCKSTORE_H__
+#define __BLOCKSTORE_H__
+
+#include <netinet/in.h>
+#include <xc.h>
+
+/* Size of one store block in bytes, its log2, and a mask that clears the
+ * offset-within-block bits of a byte address. */
+#define BLOCK_SIZE  4096
+#define BLOCK_SHIFT   12
+#define BLOCK_MASK  0xfffffffffffff000LL
+
+/* XXX SMH: where is the below supposed to be defined???? */
+#ifndef SECTOR_SHIFT 
+#define SECTOR_SHIFT   9 
+#endif
+
+/* Number of ids held in one free-list block.  The whole expansion is
+ * parenthesised so the macro is safe inside any larger expression.
+ * NOTE(review): the subtrahend is a byte count (3 * sizeof(u64)), not an
+ * entry count; with an 8-byte u64 this yields 512 - 24 = 488 entries.
+ * Presumably 3 was intended, but changing it would change
+ * sizeof(freeblock_t), so the value is deliberately left alone. */
+#define FREEBLOCK_SIZE  ((BLOCK_SIZE / sizeof(u64)) - (3 * sizeof(u64)))
+#define FREEBLOCK_MAGIC 0x0fee0fee0fee0feeULL
+
+/* One block of the free list: a header followed by released block ids. */
+typedef struct {
+    u64 magic;                  /* FREEBLOCK_MAGIC */
+    u64 next;                   /* id of the next free-list block, 0 = none */
+    u64 count;                  /* number of valid entries in list[] */
+    u64 list[FREEBLOCK_SIZE];   /* released block ids */
+} freeblock_t; 
+
+/* Magic value stored in the superblock, and the block id the superblock
+ * lives at. */
+#define BLOCKSTORE_MAGIC 0xaaaaaaa00aaaaaaaULL
+#define BLOCKSTORE_SUPER 1ULL
+
+/* Superblock contents: the magic plus the heads of the two free lists. */
+typedef struct {
+    u64 magic;              /* BLOCKSTORE_MAGIC */
+    u64 freelist_full;      /* id of first full free-list block, 0 = none */
+    u64 freelist_current;   /* id of the free-list block being filled,
+                             * 0 = none */
+} blockstore_super_t;
+
+/* Allocate a zeroed in-memory block; NULL on error. */
+extern void *newblock();
+/* Read block @id; returns a pointer to the block or NULL on error. */
+extern void *readblock(u64 id);
+/* Write @block as a new block and return its id. */
+extern u64 allocblock(void *block);
+/* As allocblock(), with an allocation hint. */
+extern u64 allocblock_hint(void *block, u64 hint);
+/* Overwrite existing block @id; zero on success, -1 on failure. */
+extern int writeblock(u64 id, void *block);
+
+/* Add this blockid to a freelist, to be recycled by the allocator. */
+extern void releaseblock(u64 id);
+
+/* this is a memory free() operation for block-sized allocations */
+extern void freeblock(void *block);
+/* One-time set-up of the block store; 0 on success, -1 on failure. */
+extern int __init_blockstore(void);
+
+/* debug for freelist. */
+void freelist_count(int print_each);
+/* NOTE(review): the allocblock() implementations in blockstore.c report
+ * failure by returning 0, not this value -- confirm where ALLOCFAIL is
+ * actually used. */
+#define ALLOCFAIL (((u64)(-1)))
+
+/* Distribution
+ */
+/* UDP port used by the block store protocol. */
+#define BLOCKSTORED_PORT 9346
+
+/* Fixed header carried by every message; packed so the layout has no
+ * padding. */
+struct bshdr_t_struct {
+    u32            operation;   /* one of the BSOP_* opcodes */
+    u32            flags;       /* BSOP_FLAG_* bits */
+    u64            id;          /* block id (or allocation hint) */
+    u64            luid;
+} __attribute__ ((packed));
+typedef struct bshdr_t_struct bshdr_t;
+
+/* A full message: header followed by one block of payload. */
+struct bsmsg_t_struct {
+    bshdr_t        hdr;
+    unsigned char  block[BLOCK_SIZE];
+} __attribute__ ((packed));
+
+typedef struct bsmsg_t_struct bsmsg_t;
+
+/* Message lengths up to and including each header field, and the length
+ * of a complete message with payload.  MSGBUFSIZE_ID is defined on a
+ * single line: in the mailed patch it had been wrapped onto a second line
+ * that carried neither a '+' marker nor a line continuation. */
+#define MSGBUFSIZE_OP    sizeof(u32)
+#define MSGBUFSIZE_FLAGS (sizeof(u32) + sizeof(u32))
+#define MSGBUFSIZE_ID    (sizeof(u32) + sizeof(u32) + sizeof(u64) + sizeof(u64))
+#define MSGBUFSIZE_BLOCK sizeof(bsmsg_t)
+
+/* Operation codes carried in the message header's operation field. */
+#define BSOP_READBLOCK  0x01
+#define BSOP_WRITEBLOCK 0x02
+#define BSOP_ALLOCBLOCK 0x03
+#define BSOP_FREEBLOCK  0x04
+
+/* Set in the header's flags field of a reply to report failure. */
+#define BSOP_FLAG_ERROR 0x01
+
+/* NOTE(review): not referenced anywhere in this chunk; presumably tuning
+ * knobs for the server-side allocator -- confirm. */
+#define BS_ALLOC_SKIP 10
+#define BS_ALLOC_HACK
+
+/* Remote hosts and cluster map - XXX need to generalise
+ */
+
+/*
+
+  Interim ID format is
+
+  63 60 59                40 39                20 19                 0
+  +----+--------------------+--------------------+--------------------+
+  |map | replica 2          | replica 1          | replica 0          |
+  +----+--------------------+--------------------+--------------------+
+
+  The map is an index into a table detailing which machines form the
+  cluster.
+
+ */
+
+/* Extract one field of a composite id: three 20-bit replica ids and the
+ * 4-bit map. */
+#define BSID_REPLICA0(_id) ((_id)&0xfffffULL)
+#define BSID_REPLICA1(_id) (((_id)>>20)&0xfffffULL)
+#define BSID_REPLICA2(_id) (((_id)>>40)&0xfffffULL)
+#define BSID_MAP(_id)      (((_id)>>60)&0xfULL)
+
+/* Build a composite id from its fields.  The inputs are shifted but not
+ * masked, so a value wider than its field overlaps the neighbouring one. */
+#define BSID(_map, _rep0, _rep1, _rep2) ((((u64)(_map))<<60) | \
+                                         (((u64)(_rep2))<<40) | \
+                                         (((u64)(_rep1))<<20) | ((u64)(_rep0)))
+
+/* One block store server: its hostname and the socket address it
+ * resolves to. */
+typedef struct bsserver_t_struct {
+    char              *hostname;
+    struct sockaddr_in sin;
+} bsserver_t;
+
+/* Upper bound on the number of server slots. */
+#define MAX_SERVERS 16
+
+/* A cluster is a fixed-size set of server numbers, one per replica. */
+#define CLUSTER_MAX_REPLICAS 3
+typedef struct bscluster_t_struct {
+    int servers[CLUSTER_MAX_REPLICAS];
+} bscluster_t;
+
+/* Upper bound on the number of clusters; matches the 16 values the 4-bit
+ * map field of a block id can take. */
+#define MAX_CLUSTERS 16
+
+#endif /* __BLOCKSTORE_H__ */
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/parallax.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/parallax.c  Sun Jul  3 22:36:48 2005
@@ -0,0 +1,611 @@
+/**************************************************************************
+ * 
+ * parallax.c
+ *
+ * The Parallax Storage Server
+ *
+ */
+ 
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include "blktaplib.h"
+#include "blockstore.h"
+#include "vdi.h"
+#include "block-async.h"
+#include "requests-async.h"
+
+/* NOTE(review): PARALLAX_DEV is not referenced in this chunk; presumably
+ * the virtual device number parallax serves -- confirm. */
+#define PARALLAX_DEV     61440
+/* Multiplier used when computing the capacity reported for a VDI. */
+#define SECTS_PER_NODE   8
+
+
+/* Debug tracing: change the "#if 0" to "#if 1" to route DPRINTF to
+ * printf; otherwise it expands to a no-op and its arguments are not
+ * evaluated. */
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* ------[ session records ]----------------------------------------------- */
+
+/* Interface records are hashed on (domain id XOR handle); both table
+ * sizes are powers of two so the mask selects a bucket. */
+#define BLKIF_HASHSZ 1024
+#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
+
+/* Per-interface table of attached VDIs, hashed on the virtual device
+ * number. */
+#define VDI_HASHSZ 16
+#define VDI_HASH(_vd) ((((_vd)>>8)^(_vd))&(VDI_HASHSZ-1))
+
+/* One block interface, identified by (domid, handle). */
+typedef struct blkif {
+    domid_t       domid;
+    unsigned int  handle;
+    enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
+    vdi_t        *vdi_hash[VDI_HASHSZ];   /* VDIs attached to this interface */
+    struct blkif *hash_next;              /* next record in the same bucket */
+} blkif_t;
+
+/* All known interfaces, bucketed by BLKIF_HASH(). */
+static blkif_t      *blkif_hash[BLKIF_HASHSZ];
+
+/* Look up the interface record for (domid, handle) in blkif_hash.
+ * Returns NULL if no such interface exists.  A non-zero handle is reported
+ * on stdout, but the lookup still goes ahead. */
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
+{
+    blkif_t *cur;
+
+    if ( handle != 0 )
+        printf("blktap/parallax don't currently support non-0 dev handles!\n");
+
+    for ( cur = blkif_hash[BLKIF_HASH(domid, handle)];
+          cur != NULL;
+          cur = cur->hash_next )
+    {
+        if ( (cur->domid == domid) && (cur->handle == handle) )
+            return cur;
+    }
+
+    return NULL;
+}
+
+/* Find the VDI attached to this interface as virtual device `device`.
+ * Returns NULL if none is attached under that number. */
+vdi_t *blkif_get_vdi(blkif_t *blkif, blkif_vdev_t device)
+{
+    vdi_t *cur;
+
+    for ( cur = blkif->vdi_hash[VDI_HASH(device)];
+          cur != NULL;
+          cur = cur->next )
+    {
+        if ( cur->vdevice == device )
+            return cur;
+    }
+
+    return NULL;
+}
+
+/* ------[ control message handling ]-------------------------------------- */
+
+/* Handle a CREATE control message: register a new, disconnected interface
+ * for (domid, blkif_handle).  The outcome is reported in create->status:
+ * OKAY, OUT_OF_MEMORY, or INTERFACE_EXISTS when the pair is already
+ * registered. */
+void blkif_create(blkif_be_create_t *create)
+{
+    const domid_t      dom = create->domid;
+    const unsigned int hnd = create->blkif_handle;
+    blkif_t           *fresh;
+    blkif_t          **link;
+
+    DPRINTF("parallax (blkif_create): create is %p\n", create); 
+
+    /* the record is allocated (zeroed) before the duplicate check */
+    fresh = (blkif_t *)calloc(1, sizeof(blkif_t));
+    if ( fresh == NULL )
+    {
+        DPRINTF("Could not create blkif: out of memory\n");
+        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+
+    fresh->domid  = dom;
+    fresh->handle = hnd;
+    fresh->status = DISCONNECTED;
+
+    /* walk to the tail of the bucket, bailing out on a duplicate */
+    for ( link = &blkif_hash[BLKIF_HASH(dom, hnd)];
+          *link != NULL;
+          link = &(*link)->hash_next )
+    {
+        if ( ((*link)->domid != dom) || ((*link)->handle != hnd) )
+            continue;
+
+        DPRINTF("Could not create blkif: already exists (%d,%d)\n",
+            dom, hnd);
+        create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
+        free(fresh);
+        return;
+    }
+
+    /* link now addresses the bucket's terminating NULL pointer */
+    fresh->hash_next = *link;
+    *link = fresh;
+
+    DPRINTF("Successfully created blkif\n");
+    create->status = BLKIF_BE_STATUS_OKAY;
+}
+
+/* Handle a DESTROY control message: unlink and free the interface for
+ * (domid, blkif_handle).  destroy->status reports OKAY,
+ * INTERFACE_NOT_FOUND, or INTERFACE_CONNECTED when the interface is not in
+ * the DISCONNECTED state (it is then left in place). */
+void blkif_destroy(blkif_be_destroy_t *destroy)
+{
+    const domid_t      dom = destroy->domid;
+    const unsigned int hnd = destroy->blkif_handle;
+    blkif_t          **link;
+    blkif_t           *victim;
+
+    DPRINTF("parallax (blkif_destroy): destroy is %p\n", destroy); 
+
+    for ( link = &blkif_hash[BLKIF_HASH(dom, hnd)];
+          (victim = *link) != NULL;
+          link = &victim->hash_next )
+    {
+        if ( (victim->domid != dom) || (victim->handle != hnd) )
+            continue;
+
+        if ( victim->status != DISCONNECTED )
+        {
+            destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
+            return;
+        }
+
+        /* splice the record out of its bucket and release it */
+        *link = victim->hash_next;
+        free(victim);
+        destroy->status = BLKIF_BE_STATUS_OKAY;
+        return;
+    }
+
+    destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+}
+
+/* Handle a VBD_CREATE control message: look up the VDI named by
+ * create->dev_handle and attach it to the interface (domid, blkif_handle)
+ * as virtual device create->vdevice.  create->status reports OKAY,
+ * INTERFACE_NOT_FOUND or VBD_NOT_FOUND. */
+void vbd_create(blkif_be_vbd_create_t *create)
+{
+    blkif_t            *blkif;
+    vdi_t              *vdi, **vdip;
+    blkif_vdev_t        vdevice = create->vdevice;
+
+    DPRINTF("parallax (vbd_create): create=%p\n", create); 
+    
+    blkif = blkif_find_by_handle(create->domid, create->blkif_handle);
+    if ( blkif == NULL )
+    {
+        DPRINTF("vbd_create attempted for non-existent blkif (%u,%u)\n", 
+                create->domid, create->blkif_handle); 
+        create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    /* VDI identifier is in create->dev_handle */
+    DPRINTF("vbd_create: create->dev_handle (id) is %lx\n", 
+            (unsigned long)create->dev_handle);
+
+    vdi = vdi_get(create->dev_handle);
+    if (vdi == NULL)
+    {
+        printf("parallax (vbd_create): VDI %lx not found.\n",
+               (unsigned long)create->dev_handle);
+        create->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+        return;
+    }
+    
+    /* append the VDI to the tail of its hash bucket */
+    vdi->next = NULL;
+    vdi->vdevice = vdevice;
+    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
+    while (*vdip != NULL)
+        vdip = &(*vdip)->next;
+    *vdip = vdi;
+    
+    /* this message used to name blkif_create by mistake */
+    DPRINTF("vbd_create succeeded\n"); 
+    create->status = BLKIF_BE_STATUS_OKAY;
+}
+
+/* Handle a VBD_DESTROY control message: detach virtual device
+ * destroy->vdevice from the interface (domid, blkif_handle) and drop the
+ * VDI with vdi_put().  destroy->status reports OKAY, INTERFACE_NOT_FOUND,
+ * or VBD_NOT_FOUND when no such device is attached. */
+void vbd_destroy(blkif_be_vbd_destroy_t *destroy)
+{
+    blkif_t            *blkif;
+    vdi_t              *vdi, **vdip;
+    blkif_vdev_t        vdevice = destroy->vdevice;
+    
+    blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle);
+    if ( blkif == NULL )
+    {
+        DPRINTF("vbd_destroy attempted for non-existent blkif (%u,%u)\n", 
+                destroy->domid, destroy->blkif_handle); 
+        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
+    while ((*vdip != NULL) && ((*vdip)->vdevice != vdevice))
+        vdip = &(*vdip)->next;
+
+    if (*vdip == NULL)
+    {
+        /* the status used to be left unset on this path */
+        destroy->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+        return;
+    }
+
+    vdi = *vdip;
+    *vdip = vdi->next;
+    vdi_put(vdi);
+
+    /* ...and on this one, unlike the other three control handlers */
+    destroy->status = BLKIF_BE_STATUS_OKAY;
+}
+
+/* Dispatch one control message to its handler.  Only CMSG_BLKIF_BE
+ * messages are accepted, and each subtype's payload length must match its
+ * structure exactly; anything else is reported on stdout.  Connect and
+ * disconnect are accepted but ignored.  Always returns 0. */
+int parallax_control(control_msg_t *msg)
+{
+    DPRINTF("parallax_control: msg is %p\n", msg); 
+    
+    if (msg->type != CMSG_BLKIF_BE) 
+    {
+        printf("Unexpected control message (%d)\n", msg->type);
+        return 0;
+    }
+    
+    switch(msg->subtype)
+    {
+    case CMSG_BLKIF_BE_CREATE:
+        if ( msg->length != sizeof(blkif_be_create_t) )
+            goto parse_error;
+        blkif_create((blkif_be_create_t *)msg->msg);
+        break;   
+        
+    case CMSG_BLKIF_BE_DESTROY:
+        if ( msg->length != sizeof(blkif_be_destroy_t) )
+            goto parse_error;
+        blkif_destroy((blkif_be_destroy_t *)msg->msg);
+        break;  
+        
+    case CMSG_BLKIF_BE_VBD_CREATE:
+        if ( msg->length != sizeof(blkif_be_vbd_create_t) )
+            goto parse_error;
+        vbd_create((blkif_be_vbd_create_t *)msg->msg);
+        break;
+        
+    case CMSG_BLKIF_BE_VBD_DESTROY:
+        if ( msg->length != sizeof(blkif_be_vbd_destroy_t) )
+            goto parse_error;
+        vbd_destroy((blkif_be_vbd_destroy_t *)msg->msg);
+        break;
+
+    case CMSG_BLKIF_BE_CONNECT:
+    case CMSG_BLKIF_BE_DISCONNECT:
+        /* we don't manage the device channel, the tap does. */
+        break;
+
+    default:
+        goto parse_error;
+    }
+    return 0;
+
+parse_error:
+    printf("Bad control message!\n");
+    return 0;
+}    
+
+/* Answer a PROBE request: fill the request's single buffer with one
+ * vdisk_t record per VDI attached to the interface and turn the request
+ * into a response whose status is the number of records written.  A
+ * request that does not carry exactly one buffer covering sectors 0..7 is
+ * answered with BLKIF_RSP_ERROR.  Always returns BLKTAP_RESPOND. */
+int parallax_probe(blkif_request_t *req, blkif_t *blkif)
+{
+    blkif_response_t *rsp;
+    vdisk_t *img_info;
+    vdi_t *vdi;
+    int i, nr_vdis = 0; 
+    /* The buffer is exactly sectors 0..7 of one frame (checked below), so
+     * this is how many vdisk_t records fit in it. */
+    const int max_vdis = (int)((8UL << SECTOR_SHIFT) / sizeof(vdisk_t));
+
+    DPRINTF("parallax_probe: req=%p, blkif=%p\n", req, blkif); 
+
+    /* We expect one buffer only. */
+    if ( req->nr_segments != 1 )
+      goto err;
+
+    /* Make sure the buffer is page-sized. */
+    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
+       (blkif_last_sect (req->frame_and_sects[0]) != 7) )
+      goto err;
+
+    /* the reply buffer does not change while the list is built */
+    img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0);
+
+    /* fill the list of devices, never writing past the end of the buffer */
+    for (i=0; (i<VDI_HASHSZ) && (nr_vdis < max_vdis); i++) {
+        for (vdi = blkif->vdi_hash[i];
+             (vdi != NULL) && (nr_vdis < max_vdis);
+             vdi = vdi->next) {
+            img_info[nr_vdis].device   = vdi->vdevice;
+            img_info[nr_vdis].info     = 0;
+            /* The -1 here accounts for the LSB in the radix tree */
+            img_info[nr_vdis].capacity = 
+                    ((1LL << (VDI_HEIGHT-1)) * SECTS_PER_NODE);
+            nr_vdis++;
+        }
+    }
+
+    /* the request structure is reused in place as the response */
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = BLKIF_OP_PROBE;
+    rsp->status = nr_vdis; /* number of disks */
+
+    DPRINTF("parallax_probe: send positive response (nr_vdis=%d)\n", nr_vdis);
+    return  BLKTAP_RESPOND;
+err:
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = BLKIF_OP_PROBE;
+    rsp->status = BLKIF_RSP_ERROR;
+    
+    DPRINTF("parallax_probe: send error response\n"); 
+    return BLKTAP_RESPOND;  
+}
+
+/* Bookkeeping for one in-flight request whose segments complete
+ * independently. */
+typedef struct {
+    blkif_request_t *req;    /* the request being serviced */
+    int              count;  /* segments still outstanding */
+    int              error;  /* number of segments that failed */
+    pthread_mutex_t  mutex;  /* held while count is updated */
+} pending_t;
+
+/* One pending record per request slot, indexed by ID_TO_IDX(req->id). */
+#define MAX_REQUESTS 64
+pending_t pending_list[MAX_REQUESTS];
+
+/* Context handed to a per-segment completion callback. */
+struct cb_param {
+    pending_t *pent;         /* record of the request this segment is in */
+    int       segment;       /* index of the segment within the request */
+    u64       sector; 
+    u64       vblock; /* for debug printing -- can be removed. */
+};
+
+/* Completion callback for one segment of a read request.
+ *   r:        result of the block read; IO_BLOCK(r) is the block data, or
+ *             NULL if the read failed
+ *   in_param: the struct cb_param for this segment; freed here
+ *
+ * On success the wanted sectors are copied from the block into the
+ * request's data page and the block is released.  The pending record is
+ * then updated, and when the last segment completes the response is sent.
+ * The error counter is updated under the same mutex as the outstanding
+ * count, so the two are always changed together. */
+static void read_cb(struct io_ret r, void *in_param)
+{
+    struct cb_param *param = (struct cb_param *)in_param;
+    pending_t *p = param->pent;
+    int segment = param->segment;
+    blkif_request_t *req = p->req;
+    unsigned long size, offset, start;
+    char *dpage, *spage;
+    int failed = 0;
+
+    spage  = IO_BLOCK(r);
+    if (spage == NULL) {
+        failed = 1;
+    } else {
+        dpage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), segment);
+
+        /* Calculate read size and offset within the read block. */
+
+        offset = (param->sector << SECTOR_SHIFT) % BLOCK_SIZE;
+        size = ( blkif_last_sect (req->frame_and_sects[segment]) -
+                 blkif_first_sect(req->frame_and_sects[segment]) + 1
+            ) << SECTOR_SHIFT;
+        start = blkif_first_sect(req->frame_and_sects[segment]) 
+            << SECTOR_SHIFT;
+
+        DPRINTF("ParallaxRead: sect: %lld (%ld,%ld),  "
+                "vblock %llx, "
+                "size %lx\n", 
+                param->sector,
+                blkif_first_sect(p->req->frame_and_sects[segment]),
+                blkif_last_sect (p->req->frame_and_sects[segment]),
+                param->vblock, size); 
+
+        memcpy(dpage + start, spage + offset, size);
+        freeblock(spage);
+    }
+
+    /* Done the read.  Now update the pending record. */
+    pthread_mutex_lock(&p->mutex);
+    if (failed)
+        p->error++;
+    p->count--;
+
+    if (p->count == 0) {
+        blkif_response_t *rsp;
+
+        /* the request structure is reused in place as the response */
+        rsp = (blkif_response_t *)req;
+        rsp->id = req->id;
+        rsp->operation = BLKIF_OP_READ;
+        if (p->error == 0) {
+            rsp->status = BLKIF_RSP_OKAY;
+        } else {
+            rsp->status = BLKIF_RSP_ERROR;
+        }
+        blktap_inject_response(rsp);
+    }
+
+    pthread_mutex_unlock(&p->mutex);
+
+    free(param); /* TODO: replace with cached alloc/dealloc */
+}      
+
+/*
+ * parallax_read: service a BLKIF_OP_READ request.
+ *
+ * Issues one asynchronous vdi_read() per segment; read_cb copies the data
+ * and injects the response when the last segment completes.
+ *
+ * Returns BLKTAP_STOLEN when completion is left to the callbacks, or
+ * BLKTAP_RESPOND with req rewritten as an error response when the device is
+ * unknown, the request has no segments, or the final segment could not be
+ * issued after all earlier ones had already completed.
+ */
+int parallax_read(blkif_request_t *req, blkif_t *blkif)
+{
+    blkif_response_t *rsp;
+    u64 vblock;
+    vdi_t *vdi;
+    u64 sector, first_sector;
+    int i, nr_segs;
+    int finished = 0;
+    pending_t *pent;
+
+    vdi = blkif_get_vdi(blkif, req->device);
+
+    if ( vdi == NULL )
+        goto err;
+
+    /* Snapshot the request fields used by the loop: read_cb builds the   */
+    /* response in place over req once the last segment completes, so req */
+    /* must not be re-read after the final vdi_read() has been issued.    */
+    nr_segs      = req->nr_segments;
+    first_sector = req->sector_number;
+
+    /* With no segments no callback would ever run to complete the request. */
+    if ( nr_segs <= 0 )
+        goto err;
+
+    pent = &pending_list[ID_TO_IDX(req->id)];
+    pent->count = nr_segs;
+    pent->error = 0;   /* slots are reused; drop any stale error count */
+    pent->req = req;
+    pthread_mutex_init(&pent->mutex, NULL);
+
+    for (i = 0; i < nr_segs; i++) {
+        struct cb_param *p;
+
+        /* Round the requested segment to a block address. */
+        sector = first_sector + (8*i);
+        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
+
+        /* TODO: Replace this call to malloc with a cached allocation */
+        p = (struct cb_param *)malloc(sizeof(struct cb_param));
+        if (p != NULL) {
+            p->pent = pent;
+            p->sector = sector;
+            p->segment = i;
+            p->vblock = vblock; /* dbg */
+
+            /* Get that block from the store. */
+            if (vdi_read(vdi, vblock, read_cb, (void *)p) == 0)
+                continue;
+
+            /* vdi_read() refused the request, so read_cb will not run. */
+            free(p);
+        }
+
+        /* This segment will never reach read_cb: account for it here so */
+        /* the request can still complete (with an error status).        */
+        pthread_mutex_lock(&pent->mutex);
+        pent->error++;
+        finished = (--pent->count == 0);
+        pthread_mutex_unlock(&pent->mutex);
+    }
+
+    /* count can only reach zero here on the last segment, and only when  */
+    /* every callback has already run without sending a response.         */
+    if (finished)
+        goto err;
+
+    return BLKTAP_STOLEN;
+
+err:
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = BLKIF_OP_READ;
+    rsp->status = BLKIF_RSP_ERROR;
+
+    return BLKTAP_RESPOND;
+}
+
+/*
+ * write_cb: completion callback for one segment of a parallax_write request.
+ *
+ *   r        - result of the block write; IO_INT(r) < 0 signals failure.
+ *   in_param - the struct cb_param allocated by parallax_write; freed here.
+ *
+ * Records the outcome in the shared pending record and injects the response
+ * once the last outstanding segment has completed.
+ */
+static void write_cb(struct io_ret r, void *in_param)
+{
+    struct cb_param *param = (struct cb_param *)in_param;
+    pending_t *p = param->pent;
+    blkif_request_t *req = p->req;
+    /* catch errors from the block code. */
+    int failed = (IO_INT(r) < 0);
+
+    pthread_mutex_lock(&p->mutex);
+
+    /* The error count is shared with the other segments' callbacks, so */
+    /* it is only modified with the mutex held, like count.             */
+    if (failed)
+        p->error++;
+    p->count--;
+
+    if (p->count == 0) {
+        blkif_response_t *rsp;
+
+        /* Last segment: build the response in place over the request. */
+        rsp = (blkif_response_t *)req;
+        rsp->id = req->id;
+        rsp->operation = BLKIF_OP_WRITE;
+        if (p->error == 0) {
+            rsp->status = BLKIF_RSP_OKAY;
+        } else {
+            rsp->status = BLKIF_RSP_ERROR;
+        }
+        blktap_inject_response(rsp);
+    }
+
+    pthread_mutex_unlock(&p->mutex);
+
+    free(param); /* TODO: replace with cached alloc/dealloc */
+}
+
+/*
+ * parallax_write: service a BLKIF_OP_WRITE request.
+ *
+ * Every segment must be a whole, block-aligned block.  All segments are
+ * validated before any write is issued, so a malformed request is rejected
+ * without leaving earlier segments in flight.  Valid segments are then
+ * written with vdi_write(); write_cb injects the response when the last one
+ * completes.
+ *
+ * Returns BLKTAP_STOLEN when completion is left to the callbacks, or
+ * BLKTAP_RESPOND with req rewritten as an error response.
+ */
+int parallax_write(blkif_request_t *req, blkif_t *blkif)
+{
+    blkif_response_t *rsp;
+    u64 sector, first_sector;
+    int i, nr_segs;
+    int finished = 0;
+    u64 vblock;
+    char *spage;
+    unsigned long size, offset, start;
+    vdi_t *vdi;
+    pending_t *pent;
+
+    vdi = blkif_get_vdi(blkif, req->device);
+
+    if ( vdi == NULL )
+        goto err;
+
+    /* Snapshot the request fields used by the loops: write_cb builds the */
+    /* response in place over req once the last segment completes, so req */
+    /* must not be re-read after the final vdi_write() has been issued.   */
+    nr_segs      = req->nr_segments;
+    first_sector = req->sector_number;
+
+    /* With no segments no callback would ever run to complete the request. */
+    if ( nr_segs <= 0 )
+        goto err;
+
+    /* Pass 1: validate every segment before anything is written. */
+    for (i = 0; i < nr_segs; i++) {
+        /* Round the requested segment to a block address. */
+        sector = first_sector + (8*i);
+        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
+
+        /* Calculate write size and offset within the block. */
+        offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE;
+        size = ( blkif_last_sect (req->frame_and_sects[i]) -
+                 blkif_first_sect(req->frame_and_sects[i]) + 1
+            ) << SECTOR_SHIFT;
+        start = blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
+
+        DPRINTF("ParallaxWrite: sect: %lld (%ld,%ld),  "
+                "vblock %llx, "
+                "size %lx\n",
+                sector, blkif_first_sect(req->frame_and_sects[i]),
+                blkif_last_sect (req->frame_and_sects[i]),
+                vblock, size);
+
+        /* XXX: For now we just freak out if they try to write a   */
+        /* non block-sized, block-aligned page.                    */
+        if ((offset != 0) || (size != BLOCK_SIZE) || (start != 0)) {
+            printf("]\n] STRANGE WRITE!\n]\n");
+            goto err;
+        }
+    }
+
+    pent = &pending_list[ID_TO_IDX(req->id)];
+    pent->count = nr_segs;
+    pent->error = 0;   /* slots are reused; drop any stale error count */
+    pent->req = req;
+    pthread_mutex_init(&pent->mutex, NULL);
+
+    /* Pass 2: issue the writes. */
+    for (i = 0; i < nr_segs; i++) {
+        struct cb_param *p;
+
+        spage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i);
+        sector = first_sector + (8*i);
+        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
+
+        /* TODO: Replace this call to malloc with a cached allocation */
+        p = (struct cb_param *)malloc(sizeof(struct cb_param));
+        if (p != NULL) {
+            p->pent = pent;
+            p->sector = sector;
+            p->segment = i;
+            p->vblock = vblock; /* dbg */
+
+            /* Issue the write to the store. */
+            if (vdi_write(vdi, vblock, spage, write_cb, (void *)p) == 0)
+                continue;
+
+            /* vdi_write() refused the request, so write_cb will not run. */
+            free(p);
+        }
+
+        /* This segment will never reach write_cb: account for it here so */
+        /* the request can still complete (with an error status).         */
+        pthread_mutex_lock(&pent->mutex);
+        pent->error++;
+        finished = (--pent->count == 0);
+        pthread_mutex_unlock(&pent->mutex);
+    }
+
+    /* count can only reach zero here on the last segment, and only when  */
+    /* every callback has already run without sending a response.         */
+    if (finished)
+        goto err;
+
+    return BLKTAP_STOLEN;
+
+err:
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = BLKIF_OP_WRITE;
+    rsp->status = BLKIF_RSP_ERROR;
+
+    return BLKTAP_RESPOND;
+}
+
+/*
+ * parallax_request: request hook.  Looks up the blkif for the requesting
+ * domain and dispatches on the operation; anything that cannot be
+ * dispatched is answered in place with BLKIF_RSP_ERROR.
+ */
+int parallax_request(blkif_request_t *req)
+{
+    blkif_response_t *rsp;
+    domid_t  dom   = ID_TO_DOM(req->id);
+    blkif_t *blkif = blkif_find_by_handle(dom, 0);
+
+    if (blkif != NULL) {
+        switch (req->operation) {
+        case BLKIF_OP_PROBE:
+            return parallax_probe(req, blkif);
+        case BLKIF_OP_READ:
+            return parallax_read(req, blkif);
+        case BLKIF_OP_WRITE:
+            return parallax_write(req, blkif);
+        default:
+            /* Unknown operation */
+            printf("Unknown request message type!\n");
+            break;
+        }
+    }
+
+    /* Error response, written over the request itself.  The operation is */
+    /* copied before the id, exactly as the fields are laid out to allow. */
+    rsp = (blkif_response_t *)req;
+    rsp->operation = req->operation;
+    rsp->id = req->id;
+    rsp->status = BLKIF_RSP_ERROR;
+    return BLKTAP_RESPOND;
+}
+
+/* Start-of-day initialisation for parallax: empties the blkif hash table. */
+void __init_parallax(void)
+{
+    memset(blkif_hash, 0, sizeof blkif_hash);
+}
+
+
+
+/*
+ * Entry point of the parallax daemon: initialises the block store, the
+ * async block layer, the VDI registry and local state, registers the
+ * control and request hooks, then hands control to blktap_listen().
+ *
+ * Returns 1 if the VDI registry cannot be initialised, 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+    DPRINTF("parallax: starting.\n");
+    __init_blockstore();
+    DPRINTF("parallax: initialized blockstore...\n");
+    init_block_async();
+    DPRINTF("parallax: initialized async blocks...\n");
+    /* __init_vdi() reports failure to get/create the registry with a */
+    /* non-zero return; serving requests without it makes no sense.   */
+    if (__init_vdi() != 0) {
+        printf("parallax: couldn't initialize the vdi registry, exiting.\n");
+        return 1;
+    }
+    DPRINTF("parallax: initialized vdi registry etc...\n");
+    __init_parallax();
+    DPRINTF("parallax: initialized local stuff..\n");
+
+    blktap_register_ctrl_hook("parallax_control", parallax_control);
+    blktap_register_request_hook("parallax_request", parallax_request);
+    DPRINTF("parallax: added ctrl + request hooks, starting listen...\n");
+    blktap_listen();
+
+    return 0;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi.c       Sun Jul  3 22:36:48 2005
@@ -0,0 +1,367 @@
+/**************************************************************************
+ * 
+ * vdi.c
+ *
+ * Virtual Disk Image (VDI) Interfaces
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/time.h>
+#include <pthread.h>
+#include "blockstore.h"
+#include "block-async.h"
+#include "requests-async.h"
+#include "radix.h"
+#include "vdi.h"
+                    
+#define VDI_REG_BLOCK   2LL
+#define VDI_RADIX_ROOT  writable(3)
+                                                            
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* I haven't decided about this registry stuff, so this is just a really
+ * quick lash-up so that there is some way to track VDIs.
+ *
+ * (Most vdi access should be with a direct handle to the block, so this
+ *  registry is just for start-of-day lookup and other control operations.)
+ */
+
+/*
+ * create_vdi_registry: write a fresh, empty VDI registry (and the VDI radix
+ * root) to the block store.  Returns the in-memory registry block, or NULL
+ * if no block could be obtained.
+ */
+vdi_registry_t *create_vdi_registry(void)
+{
+    vdi_registry_t *registry;
+
+    registry = (vdi_registry_t *)newblock();
+    if (registry != NULL) {
+        /* The block is still empty at this point, so write it out as the */
+        /* zero-filled vdi radix root before turning it into the registry. */
+        writeblock(VDI_RADIX_ROOT, (void *)registry);
+
+        DPRINTF("[vdi.c] Creating VDI registry!\n");
+        registry->magic   = VDI_REG_MAGIC;
+        registry->nr_vdis = 0;
+
+        writeblock(VDI_REG_BLOCK, (void *)registry);
+    }
+
+    return registry;
+}
+    
+/*
+ * get_vdi_registry: read the VDI registry block, creating it first if it
+ * cannot be read.
+ *
+ * Returns a block the caller must release with freeblock(), or NULL when
+ * the registry can neither be read nor created, or its magic is wrong.
+ */
+vdi_registry_t *get_vdi_registry(void)
+{
+    vdi_registry_t *vdi_reg = (vdi_registry_t *)readblock(VDI_REG_BLOCK);
+
+    if ( vdi_reg == NULL )
+        vdi_reg = create_vdi_registry();
+
+    /* create_vdi_registry() returns NULL when it cannot get a block. */
+    if ( vdi_reg == NULL )
+        return NULL;
+
+    if ( vdi_reg->magic != VDI_REG_MAGIC ) {
+        freeblock(vdi_reg);
+        return NULL;
+    }
+
+    return vdi_reg;
+}
+
+
+/*
+ * vdi_create: create a new VDI and record it in the registry.
+ *
+ *   parent_snap - snapshot to fork from; when snap_get_id() does not find
+ *                 it, the VDI starts with a fresh radix root instead.
+ *   name        - human readable name; truncated to VDI_NAME_SZ-1 characters.
+ *
+ * Returns the new vdi (release with vdi_put()), or NULL on failure.
+ */
+vdi_t *vdi_create(snap_id_t *parent_snap, char *name)
+{
+    int ret;
+    vdi_t *vdi;
+    vdi_registry_t *vdi_reg;
+    snap_rec_t snap_rec;
+
+    /* create a vdi struct */
+    vdi = newblock();
+    if (vdi == NULL)
+        return NULL;
+
+    if ( snap_get_id(parent_snap, &snap_rec) == 0 ) {
+        vdi->radix_root = snapshot(snap_rec.radix_root);
+    } else {
+        vdi->radix_root = allocblock((void *)vdi); /* vdi is just zeros here */
+        vdi->radix_root = writable(vdi->radix_root); /* grr. */
+    }
+
+    /* create a snapshot log, and add it to the vdi struct */
+
+    ret = snap_block_create(parent_snap, &vdi->snap);
+    if ( ret != 0 ) {
+        DPRINTF("Error getting snap block in vdi_create.\n");
+        freeblock(vdi);
+        return NULL;
+    }
+
+    /* append the vdi to the registry, fill block and id.             */
+    /* implicit allocation means we have to write the vdi twice here. */
+    vdi_reg    = get_vdi_registry();
+    if ( vdi_reg == NULL ) {
+        freeblock(vdi);
+        return NULL;
+    }
+
+    vdi->block = allocblock((void *)vdi);
+    vdi->id    = vdi_reg->nr_vdis++;
+    /* name[] holds VDI_NAME_SZ bytes, so the terminator belongs in the */
+    /* last element, not one past it.                                   */
+    strncpy(vdi->name, name, VDI_NAME_SZ - 1);
+    vdi->name[VDI_NAME_SZ - 1] = '\0';
+    vdi->radix_lock = NULL; /* for tidiness */
+    writeblock(vdi->block, (void *)vdi);
+
+    update(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi->id, vdi->block);
+    writeblock(VDI_REG_BLOCK, (void *)vdi_reg);
+    freeblock(vdi_reg);
+
+    /* The lock is in-memory state only; it is allocated after the vdi */
+    /* has been written out with a NULL pointer in its place.          */
+    vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock));
+    if (vdi->radix_lock == NULL)
+    {
+        perror("couldn't malloc radix_lock for new vdi!");
+        freeblock(vdi);
+        return NULL;
+    }
+    radix_lock_init(vdi->radix_lock);
+
+    return vdi;
+}
+
+/* vdi_get and vdi_put currently act more like alloc/free -- they don't 
+ * do refcount-based allocation.  
+ */
+/*
+ * vdi_get: load the VDI with the given id from the block store and attach a
+ * freshly initialised radix lock to it.
+ *
+ * Returns the vdi (release with vdi_put()), or NULL if the id is not in the
+ * registry, the block cannot be read, or the lock cannot be allocated.
+ */
+vdi_t *vdi_get(u64 vdi_id)
+{
+    u64 vdi_blk;
+    vdi_t *vdi;
+
+    vdi_blk = lookup(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi_id);
+
+    if ( vdi_blk == 0 )
+        return NULL;
+
+    vdi = (vdi_t *)readblock(vdi_blk);
+    /* readblock() can return NULL (get_vdi_registry() checks for it too). */
+    if ( vdi == NULL )
+        return NULL;
+
+    vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock));
+    if (vdi->radix_lock == NULL)
+    {
+        perror("couldn't malloc radix_lock for new vdi!");
+        freeblock(vdi);
+        return NULL;
+    }
+    radix_lock_init(vdi->radix_lock);
+
+    return vdi;
+}
+
+/*
+ * vdi_put: release a vdi obtained from vdi_get()/vdi_create(), together
+ * with its radix lock.  A NULL vdi is ignored, like free(NULL).
+ */
+void vdi_put(vdi_t *vdi)
+{
+    if (vdi == NULL)
+        return;
+
+    free(vdi->radix_lock);
+    freeblock(vdi);
+}
+
+/*
+ * vdi_snapshot: take a snapshot of the VDI.  The current radix root is
+ * logged in the snapshot log, the VDI continues on snapshot(root), and the
+ * updated vdi is written back to its block.
+ */
+void vdi_snapshot(vdi_t *vdi)
+{
+    snap_rec_t entry;
+
+    /* Describe the state being frozen. */
+    entry.deleted    = 0;
+    entry.radix_root = vdi->radix_root;
+    gettimeofday(&entry.timestamp, NULL);
+
+    /* Move the live VDI onto a new root, then log the old one. */
+    vdi->radix_root = snapshot(entry.radix_root);
+    if ( snap_append(&vdi->snap, &entry, &vdi->snap) != 0 ) {
+        printf("snap_append returned failure\n");
+        return;
+    }
+
+    writeblock(vdi->block, vdi);
+}
+    
+/*
+ * __init_vdi: start-of-day initialisation for the VDI layer.
+ * Returns 0 on success, -1 if the VDI registry cannot be read or created.
+ */
+int __init_vdi()
+{
+    vdi_registry_t *registry;
+
+    /* sneak this in here for the moment. */
+    __rcache_init();
+
+    /* Fetching the registry creates it when it does not exist yet. */
+    registry = get_vdi_registry();
+    if (registry != NULL) {
+        freeblock(registry);
+        return 0;
+    }
+
+    printf("[vdi.c] Couldn't get/create a VDI registry!\n");
+    return -1;
+}
+    
+#ifdef VDI_STANDALONE
+
+#define TEST_VDIS      50
+#define NR_ITERS    50000
+#define FORK_POINTS   200
+#define INIT_VDIS       3
+#define INIT_SNAPS     40
+
+/* These must be of decreasing size: */
+#define NEW_FORK       (RAND_MAX-(RAND_MAX/1000))
+#define NEW_ROOT_VDI   (RAND_MAX-((RAND_MAX/1000)*2))
+#define NEW_FORK_VDI   (RAND_MAX-((RAND_MAX/1000)*3))
+
+#define GRAPH_DOT_FILE "vdi.dot"
+#define GRAPH_PS_FILE  "vdi.ps"
+
+
+typedef struct sh_st {
+    snap_id_t     id;
+    struct sh_st *next;
+} sh_t;
+
+#define SNAP_HASHSZ 1024
+sh_t *node_hash[SNAP_HASHSZ];
+#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ)
+
+#define SNAPID_EQUAL(_a,_b) \
+    (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
+/*
+ * sh_check_and_add: record a snapshot id in node_hash.
+ *
+ * Returns 1 if the id was already present, 0 if it has just been added.
+ * Exits the (test) program if the new entry cannot be allocated.
+ */
+int sh_check_and_add(snap_id_t *id)
+{
+    sh_t **s = &node_hash[SNAP_HASH(id)];
+    sh_t *entry;
+
+    while (*s != NULL) {
+        if (SNAPID_EQUAL(&((*s)->id), id))
+            return 1;
+        /* Advance the cursor; assigning through *s here would overwrite */
+        /* the bucket head and drop the entries already walked.          */
+        s = &(*s)->next;
+    }
+
+    entry = (sh_t *)malloc(sizeof(sh_t));
+    if (entry == NULL) {
+        perror("couldn't malloc snapshot hash entry!");
+        exit(1);
+    }
+    entry->id = *id;
+    entry->next = NULL;
+
+    /* s now points at the NULL link terminating the chain. */
+    *s = entry;
+
+    return 0;
+}
+
+/*
+ * Standalone VDI test driver (VDI_STANDALONE builds only).
+ *
+ * Creates a few seed VDIs, runs a random workload of snapshots, fork points
+ * and new VDIs, then dumps the resulting snapshot graph to GRAPH_DOT_FILE
+ * and renders it to GRAPH_PS_FILE with dot.
+ *
+ * Returns 0 on success, 1 if a seed VDI or the dot file cannot be created.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t *vdi_list[TEST_VDIS];
+    vdi_t *new_vdi;
+    snap_id_t id, fork_points[FORK_POINTS];
+    int nr_vdis = 0, nr_forks = 0;
+    int i, j, r;
+    FILE *f;
+    char name[VDI_NAME_SZ];
+
+    __init_blockstore();
+    __init_vdi();
+
+    printf("[o] Generating seed VDIs. (%d VDIs)\n", INIT_VDIS);
+
+    for (i=0; i<INIT_VDIS; i++) {
+        r=rand();
+
+        sprintf(name, "VDI Number %d", nr_vdis);
+        vdi_list[i] = vdi_create(NULL, name);
+        if (vdi_list[i] == NULL) {
+            printf("[!] Couldn't create seed VDI %d.\n", i);
+            return 1;
+        }
+        for (j=0; j<(r%INIT_SNAPS); j++)
+            vdi_snapshot(vdi_list[i]);
+        fork_points[i] = vdi_list[i]->snap;
+        nr_vdis++;
+        nr_forks++;
+    }
+
+    printf("[o] Running a random workload. (%d iterations)\n", NR_ITERS);
+
+    for (i=0; i<NR_ITERS; i++) {
+        r = rand();
+
+        if ( r > NEW_FORK ) {
+            /* fork_points[] has FORK_POINTS entries, so it is already */
+            /* full when nr_forks == FORK_POINTS.                      */
+            if ( nr_forks >= FORK_POINTS )
+                continue;
+            id = vdi_list[r%nr_vdis]->snap;
+            if ( ( id.block == 0 ) || ( id.index == 0 ) )
+                continue;
+            id.index--;
+            fork_points[nr_forks++] = id;
+
+        } else if ( r > NEW_ROOT_VDI ) {
+
+            if ( nr_vdis == TEST_VDIS )
+                continue;
+
+            sprintf(name, "VDI Number %d.", nr_vdis);
+            new_vdi = vdi_create(NULL, name);
+            if ( new_vdi != NULL )
+                vdi_list[nr_vdis++] = new_vdi;
+
+        } else if ( r > NEW_FORK_VDI ) {
+
+            if ( nr_vdis == TEST_VDIS )
+                continue;
+
+            sprintf(name, "VDI Number %d.", nr_vdis);
+            new_vdi = vdi_create(&fork_points[r%nr_forks], name);
+            if ( new_vdi != NULL )
+                vdi_list[nr_vdis++] = new_vdi;
+
+        } else /* SNAPSHOT */ {
+
+            vdi_snapshot(vdi_list[r%nr_vdis]);
+
+        }
+    }
+
+    /* now dump it out to a dot file. */
+    printf("[o] Dumping state to a dot graph. (%d VDIs)\n", nr_vdis);
+
+    f = fopen(GRAPH_DOT_FILE, "w");
+    if (f == NULL) {
+        perror("couldn't open " GRAPH_DOT_FILE);
+        return 1;
+    }
+
+    /* write graph preamble */
+    fprintf(f, "digraph G {\n");
+    fprintf(f, "   rankdir=LR\n");
+
+    for (i=0; i<nr_vdis; i++) {
+        char oldnode[255];
+        snap_block_t *blk;
+        snap_id_t id = vdi_list[i]->snap;
+        int nr_snaps, done=0;
+
+        /* add a node for the id */
+        printf("vdi: %d\n", i);
+        fprintf(f, "   n%Ld%d "
+                "[color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n",
+                id.block, id.index, vdi_list[i]->name,
+                id.block, id.index);
+        sprintf(oldnode, "n%Ld%d", id.block, id.index);
+
+        /* Walk back through the fork chain until the root (block 0) or a */
+        /* node that has already been emitted is reached.                 */
+        while (id.block != 0) {
+            blk = snap_get_block(id.block);
+            if (blk == NULL)
+                break;
+            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index);
+            id = blk->hdr.fork_block;
+
+            done = sh_check_and_add(&id);
+
+            /* add a node for the fork_id */
+            if (!done) {
+                fprintf(f, "   n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n",
+                    id.block, id.index,
+                    id.block, id.index);
+            }
+
+            /* add an edge between them */
+            fprintf(f, "   n%Ld%d -> %s [label=\"%u snapshots\"]\n",
+                    id.block, id.index, oldnode, nr_snaps);
+            sprintf(oldnode, "n%Ld%d", id.block, id.index);
+            freeblock(blk);
+
+            if (done) break;
+        }
+    }
+
+    /* write graph postamble */
+    fprintf(f, "}\n");
+    fclose(f);
+
+    printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE);
+    {
+        char cmd[255];
+        sprintf(cmd, "dot %s -Tps -o %s", GRAPH_DOT_FILE, GRAPH_PS_FILE);
+        system(cmd);
+    }
+    return 0;
+}
+
+#endif
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi.h
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi.h       Sun Jul  3 22:36:48 2005
@@ -0,0 +1,55 @@
+#ifndef _VDI_H_
+#define _VDI_H_
+/**************************************************************************
+ * 
+ * vdi.h
+ *
+ * Virtual Disk Image (VDI) Interfaces
+ *
+ */
+
+#ifndef __VDI_H__
+#define __VDI_H__
+
+#include "blktaplib.h"
+#include "snaplog.h"
+
+#define VDI_HEIGHT     27 /* Note that these are now hard-coded */
+#define VDI_REG_HEIGHT 27 /* in the async lookup code           */
+
+#define VDI_NAME_SZ 256
+
+
+/* Descriptor of one virtual disk image.  vdi.c obtains it with newblock()/ */
+/* readblock() and writes it back with writeblock(), so the field layout    */
+/* is part of the stored format.  radix_lock is only meaningful in memory:  */
+/* vdi_create()/vdi_get() malloc it after the block has been written/read.  */
+typedef struct vdi {
+    u64         id;               /* unique vdi id -- used by the registry   */
+    u64         block;            /* block where this vdi lives (also unique)*/
+    u64         radix_root;       /* radix root node for block mappings      */
+    snap_id_t   snap;             /* next snapshot slot for this VDI         */
+    struct vdi *next;             /* used to hash-chain in blkif.            */
+    blkif_vdev_t vdevice;         /* currently mounted as...                 */
+    struct radix_lock *radix_lock;/* per-line L1 RW lock for parallel reqs   */
+    char        name[VDI_NAME_SZ];/* human readable vdi name                 */
+} vdi_t;
+
+#define VDI_REG_MAGIC   0xff00ff0bb0ff00ffLL
+
+/* Header of the VDI registry block (see get_vdi_registry()). */
+typedef struct vdi_registry {
+    u64     magic;    /* must equal VDI_REG_MAGIC for the block to be valid */
+    u64     nr_vdis;  /* VDIs created so far; vdi_create() uses it as next id */
+} vdi_registry_t;
+
+
+int __init_vdi(void);
+
+vdi_t *vdi_get(u64 vdi_id);
+void vdi_put(vdi_t *vdi);
+vdi_registry_t *get_vdi_registry(void);
+vdi_t *vdi_create(snap_id_t *parent_snap, char *name);
+u64 vdi_lookup_block(vdi_t *vdi, u64 vdi_block, int *writable);
+void vdi_update_block(vdi_t *vdi, u64 vdi_block, u64 g_block);
+void vdi_snapshot(vdi_t *vdi);
+
+
+#endif /* __VDI_H__ */
+
+#endif //_VDI_H_
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/requests-async.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/requests-async.c    Sun Jul  3 22:36:48 2005
@@ -0,0 +1,762 @@
+/* requests-async.c
+ *
+ * asynchronous request dispatcher for radix access in parallax.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include <pthread.h>
+#include <err.h>
+#include <zlib.h> /* for crc32() */
+#include "requests-async.h"
+#include "vdi.h"
+#include "radix.h"
+
+#define L1_IDX(_a) (((_a) & 0x0000000007fc0000ULL) >> 18)
+#define L2_IDX(_a) (((_a) & 0x000000000003fe00ULL) >> 9)
+#define L3_IDX(_a) (((_a) & 0x00000000000001ffULL))
+
+
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* Per-block metadata kept in the L3 radix slot following the block's own */
+/* entry (see the bi pointers computed in read_cb/write_cb).              */
+struct block_info {
+    u32        crc;     /* crc32 of the data block, checked on read */
+    u32        unused;  /* currently set to debugging marker values */
+};
+
+/* State of one in-flight vdi_read()/vdi_write(); allocated by those calls */
+/* and carried through the read_cb/write_cb state machines as the callback */
+/* parameter.                                                              */
+struct io_req {
+    enum { IO_OP_READ, IO_OP_WRITE } op;
+    u64        root;   /* radix root of the vdi at the time of the call */
+    u64        vaddr;  /* virtual block address, already shifted left by 1 */
+    int        state;  /* current enum states value */
+    io_cb_t    cb;     /* caller's completion callback */
+    void      *param;  /* opaque argument passed back to cb */
+    struct radix_lock *lock; /* the vdi's radix lock */
+
+    /* internal stuff: */
+    struct io_ret     retval;/* holds the return while we unlock. */
+    char             *block; /* the block to write */
+    radix_tree_node   radix[3];      /* radix nodes, indexed by L1/L2/L3 */
+    u64               radix_addr[3]; /* block addresses of those nodes */
+    struct block_info bi;            /* crc info for the data block */
+};
+
+/* Mask every entry of a radix node with ONEMASK. */
+void clear_w_bits(radix_tree_node node)
+{
+    int slot = 0;
+
+    while (slot < RADIX_TREE_MAP_ENTRIES) {
+        node[slot] &= ONEMASK;
+        slot++;
+    }
+}
+
+/* Mask every second entry (the even slots) of a radix node with ONEMASK, */
+/* leaving the odd slots untouched.                                       */
+void clear_L3_w_bits(radix_tree_node node)
+{
+    int slot = 0;
+
+    while (slot < RADIX_TREE_MAP_ENTRIES) {
+        node[slot] &= ONEMASK;
+        slot += 2;
+    }
+}
+
+/* States of the read_cb/write_cb state machines, stored in io_req.state. */
+/* Each callback invocation handles the completion of the step named by   */
+/* the current state and moves the request on to the next one.            */
+enum states {
+    /* both */
+    READ_L1,
+    READ_L2,
+    READ_L3,
+
+    /* read */
+    READ_LOCKED,
+    READ_DATA,
+    READ_UNLOCKED,
+    RETURN_ZERO,
+
+    /* write */
+    WRITE_LOCKED,
+    WRITE_DATA,
+    WRITE_L3,
+    WRITE_UNLOCKED,
+    
+    /* L3 Zero Path */
+    ALLOC_DATA_L3z,
+    WRITE_L3_L3z,
+    
+    /* L3 Fault Path */
+    ALLOC_DATA_L3f,
+    WRITE_L3_L3f,
+    
+    /* L2 Zero Path */
+    ALLOC_DATA_L2z,
+    WRITE_L2_L2z,
+    ALLOC_L3_L2z,
+    WRITE_L2_L3z,
+    
+    /* L2 Fault Path */
+    READ_L3_L2f,
+    ALLOC_DATA_L2f,
+    WRITE_L2_L2f,
+    ALLOC_L3_L2f,
+    WRITE_L2_L3f,
+
+    /* L1 Zero Path */
+    ALLOC_DATA_L1z,
+    ALLOC_L3_L1z,
+    ALLOC_L2_L1z,
+    WRITE_L1_L1z,
+
+    /* L1 Fault Path */
+    READ_L2_L1f,
+    READ_L3_L1f,
+    ALLOC_DATA_L1f,
+    ALLOC_L3_L1f,
+    ALLOC_L2_L1f,
+    WRITE_L1_L1f,
+    
+};
+
+/* Indices into io_req.radix[] and io_req.radix_addr[] for each tree level. */
+enum radix_offsets {
+    L1 = 0, 
+    L2 = 1,
+    L3 = 2
+};
+
+
+static void read_cb(struct io_ret ret, void *param);
+static void write_cb(struct io_ret ret, void *param);
+
+/*
+ * vdi_read: start an asynchronous read of virtual block vaddr from vdi.
+ * cb(result, param) is called when the read finishes.
+ *
+ * Returns 0 once the request has been queued, ERR_BAD_VADDR for an invalid
+ * address, or ERR_NOMEM if the request record cannot be allocated (cb is
+ * not called in either error case).
+ */
+int vdi_read(vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param)
+{
+    struct io_req *ioreq;
+    int level;
+
+    if (!VALID_VADDR(vaddr))
+        return ERR_BAD_VADDR;
+
+    ioreq = (struct io_req *)malloc(sizeof (struct io_req));
+    if (ioreq == NULL)
+        return ERR_NOMEM;
+
+    /* Every second line in the bottom-level radix tree is used to store */
+    /* crc32 values etc.  We shift the vaddr here to achieve this.       */
+    vaddr <<= 1;
+
+    for (level = 0; level < 3; level++)
+        ioreq->radix[level] = NULL;
+
+    ioreq->op    = IO_OP_READ;
+    ioreq->state = READ_LOCKED;
+    ioreq->vaddr = vaddr;
+    ioreq->root  = vdi->radix_root;
+    ioreq->lock  = vdi->radix_lock;
+    ioreq->cb    = cb;
+    ioreq->param = param;
+
+    /* The state machine in read_cb starts once the read lock is granted. */
+    block_rlock(ioreq->lock, L1_IDX(vaddr), read_cb, ioreq);
+
+    return 0;
+}
+
+
+/*
+ * vdi_write: start an asynchronous write of block to virtual block vaddr of
+ * vdi.  cb(result, param) is called when the write finishes.
+ *
+ * Returns 0 once the request has been queued, ERR_BAD_VADDR for an invalid
+ * address, or ERR_NOMEM if the request record cannot be allocated (cb is
+ * not called in either error case).
+ */
+int   vdi_write(vdi_t *vdi, u64 vaddr, char *block, 
+                io_cb_t cb, void *param)
+{
+    struct io_req *ioreq;
+    u32 sum;
+    int level;
+
+    if (!VALID_VADDR(vaddr))
+        return ERR_BAD_VADDR;
+
+    ioreq = (struct io_req *)malloc(sizeof (struct io_req));
+    if (ioreq == NULL)
+        return ERR_NOMEM;
+
+    /* Every second line in the bottom-level radix tree is used to store */
+    /* crc32 values etc.  We shift the vaddr here to achieve this.       */
+    vaddr <<= 1;
+
+    for (level = 0; level < 3; level++)
+        ioreq->radix[level] = NULL;
+
+    ioreq->op    = IO_OP_WRITE;
+    ioreq->root  = vdi->radix_root;
+    ioreq->lock  = vdi->radix_lock;
+    ioreq->vaddr = vaddr;
+    ioreq->block = block;
+
+    /* Todo: add a pseudoheader to the block to include some location   */
+    /* information in the CRC as well.                                  */
+    sum = (u32) crc32(0L, Z_NULL, 0);
+    sum = (u32) crc32(sum, block, BLOCK_SIZE);
+    ioreq->bi.crc    = sum;
+    ioreq->bi.unused = 0xdeadbeef;
+
+    ioreq->cb    = cb;
+    ioreq->param = param;
+    ioreq->radix_addr[L1] = getid(ioreq->root); /* for consistency */
+    ioreq->state = WRITE_LOCKED;
+
+    /* The state machine in write_cb starts once the write lock is granted. */
+    block_wlock(ioreq->lock, L1_IDX(vaddr), write_cb, ioreq);
+
+    return 0;
+}
+
+/*
+ * read_cb: state machine behind vdi_read().
+ *
+ *   ret   - result of the step that has just completed (a block for the
+ *           READ_* states; ignored after lock/unlock steps).
+ *   param - the struct io_req allocated by vdi_read().
+ *
+ * Walks the radix tree L1 -> L2 -> L3 -> data under the read lock taken by
+ * vdi_read(), releases the lock, then frees the request and calls the
+ * caller's callback with the data block, a fresh block from newblock() for
+ * an unmapped address, or the failed result when a block read fails.
+ */
+static void read_cb(struct io_ret ret, void *param)
+{
+    struct io_req *req = (struct io_req *)param;
+    radix_tree_node node;
+    u64 idx;
+    char *block;
+    void *req_param;
+
+    DPRINTF("read_cb\n");
+    /* get record */
+    switch(req->state) {
+
+    case READ_LOCKED:
+
+        DPRINTF("READ_LOCKED\n");
+        req->state = READ_L1;
+        block_read(getid(req->root), read_cb, req);
+        break;
+
+    case READ_L1: /* block is the radix root */
+
+        DPRINTF("READ_L1\n");
+        block = IO_BLOCK(ret);
+        if (block == NULL) goto fail_locked;
+        node = (radix_tree_node) block;
+        idx  = getid( node[L1_IDX(req->vaddr)] );
+        free(block);
+        if ( idx == ZERO ) {
+            req->state = RETURN_ZERO;
+            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
+        } else {
+            req->state = READ_L2;
+            block_read(idx, read_cb, req);
+        }
+        break;
+
+    case READ_L2:
+
+        DPRINTF("READ_L2\n");
+        block = IO_BLOCK(ret);
+        if (block == NULL) goto fail_locked;
+        node = (radix_tree_node) block;
+        idx  = getid( node[L2_IDX(req->vaddr)] );
+        free(block);
+        if ( idx == ZERO ) {
+            req->state = RETURN_ZERO;
+            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
+        } else {
+            req->state = READ_L3;
+            block_read(idx, read_cb, req);
+        }
+        break;
+
+    case READ_L3:
+    {
+        struct block_info *bi;
+
+        DPRINTF("READ_L3\n");
+        block = IO_BLOCK(ret);
+        if (block == NULL) goto fail_locked;
+        node = (radix_tree_node) block;
+        idx  = getid( node[L3_IDX(req->vaddr)] );
+        /* The slot after the block's entry holds its block_info (crc). */
+        bi = (struct block_info *) &node[L3_IDX(req->vaddr) + 1];
+        req->bi = *bi;
+        free(block);
+        if ( idx == ZERO )  {
+            req->state = RETURN_ZERO;
+            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
+        } else {
+            req->state = READ_DATA;
+            block_read(idx, read_cb, req);
+        }
+        break;
+    }
+    case READ_DATA:
+    {
+        u32 crc;
+
+        DPRINTF("READ_DATA\n");
+        block = IO_BLOCK(ret);
+        if (block == NULL) goto fail_locked;
+
+        /* crc check */
+        crc = (u32) crc32(0L, Z_NULL, 0);
+        crc = (u32) crc32(crc, block, BLOCK_SIZE);
+        if (crc != req->bi.crc) {
+            /* TODO: add a retry loop here.                          */
+            /* Do this after the cache is added -- make sure to      */
+            /* invalidate the bad page before reissuing the read.    */
+
+            warn("Bad CRC on vaddr (%Lu:%d)\n", req->vaddr, req->bi.unused);
+#ifdef PRINT_BADCRC_PAGES
+            {
+                int j;
+                for (j=0; j<BLOCK_SIZE; j++) {
+                    if (isprint((unsigned char)block[j])) {
+                        printf("%c", block[j]);
+                    } else {
+                        printf(".");
+                    }
+                    if ((j % 64) == 0) printf("\n");
+                }
+            }
+#endif /* PRINT_BADCRC_PAGES */
+
+            /* fast and loose for the moment. */
+            /* goto fail;                     */
+        }
+
+        req->retval = ret;
+        req->state = READ_UNLOCKED;
+        block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
+        break;
+    }
+    case READ_UNLOCKED:
+    {
+        struct io_ret r;
+        io_cb_t cb;
+        DPRINTF("READ_UNLOCKED\n");
+        /* Copy what is needed out of req before freeing it. */
+        req_param = req->param;
+        r         = req->retval;
+        cb        = req->cb;
+        free(req);
+        cb(r, req_param);
+        break;
+    }
+
+    case RETURN_ZERO:
+    {
+        struct io_ret r;
+        io_cb_t cb;
+        DPRINTF("RETURN_ZERO\n");
+        req_param = req->param;
+        cb        = req->cb;
+        free(req);
+        /* Unmapped address: hand the caller a fresh block instead. */
+        r.type = IO_BLOCK_T;
+        r.u.b = newblock();
+        cb(r, req_param);
+        break;
+    }
+
+    default:
+        DPRINTF("*** Read: Bad state! (%d) ***\n", req->state);
+        goto fail;
+    }
+
+    return;
+
+ fail_locked:
+    /* A block read failed in one of the READ_L1..READ_DATA states.  The */
+    /* other exits from those states release the read lock with          */
+    /* block_runlock() before completing, so do the same here: stash the */
+    /* failed result and let READ_UNLOCKED deliver it to the caller.     */
+    DPRINTF("asyn_read had a read error.\n");
+    req->retval = ret;
+    req->state  = READ_UNLOCKED;
+    block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
+    return;
+
+ fail:
+    {
+        /* Unknown state: lock ownership cannot be determined, so just */
+        /* report the result as it is.                                 */
+        struct io_ret r;
+        io_cb_t cb;
+        req_param = req->param;
+        r         = ret;
+        cb        = req->cb;
+        free(req);
+        cb(r, req_param);
+    }
+
+
+}
+
+static void write_cb(struct io_ret r, void *param)
+{
+    struct io_req *req = (struct io_req *)param;
+    radix_tree_node node;
+    u64 a, addr;
+    void *req_param;
+    struct block_info *bi;
+
+    switch(req->state) {
+       
+    case WRITE_LOCKED:
+        
+        DPRINTF("WRITE_LOCKED (%llu)\n", L1_IDX(req->vaddr));
+       req->state = READ_L1;
+       block_read(getid(req->root), write_cb, req); 
+       break;
+       
+    case READ_L1: /* block is the radix root */
+
+        DPRINTF("READ_L1\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        if (node == NULL) goto fail;
+        a    = node[L1_IDX(req->vaddr)];
+        addr = getid(a);
+
+        req->radix_addr[L2] = addr;
+        req->radix[L1] = node;
+
+        if ( addr == ZERO ) {
+            /* L1 empty subtree: */
+            req->state = ALLOC_DATA_L1z;
+            block_alloc( req->block, write_cb, req );
+        } else if ( !iswritable(a) ) {
+            /* L1 fault: */
+            req->state = READ_L2_L1f;
+            block_read( addr, write_cb, req );
+        } else {
+            req->state = READ_L2;
+            block_read( addr, write_cb, req );
+        }
+        break;
+    
+    case READ_L2:
+
+        DPRINTF("READ_L2\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        if (node == NULL) goto fail;
+        a    = node[L2_IDX(req->vaddr)];
+        addr = getid(a);
+
+        req->radix_addr[L3] = addr;
+        req->radix[L2] = node;
+
+        if ( addr == ZERO ) {
+            /* L2 empty subtree: */
+            req->state = ALLOC_DATA_L2z;
+            block_alloc( req->block, write_cb, req );
+        } else if ( !iswritable(a) ) {
+            /* L2 fault: */
+            req->state = READ_L3_L2f;
+            block_read( addr, write_cb, req );
+        } else {
+            req->state = READ_L3;
+            block_read( addr, write_cb, req );
+        }
+        break;
+    
+    case READ_L3:
+
+        DPRINTF("READ_L3\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        if (node == NULL) goto fail;
+        a    = node[L3_IDX(req->vaddr)];
+        addr = getid(a);
+
+        req->radix[L3] = node;
+
+        if ( addr == ZERO ) {
+            /* L3 fault: */
+            req->state = ALLOC_DATA_L3z;
+            block_alloc( req->block, write_cb, req );
+        } else if ( !iswritable(a) ) {
+            /* L3 fault: */
+            req->state = ALLOC_DATA_L3f;
+            block_alloc( req->block, write_cb, req );
+        } else {
+            req->state = WRITE_DATA;
+            block_write( addr, req->block, write_cb, req );
+        }
+        break;
+    
+    case WRITE_DATA:
+
+        DPRINTF("WRITE_DATA\n");
+        /* The L3 radix points to the correct block, we just need to  */
+        /* update the crc.                                            */
+        if (IO_INT(r) < 0) goto fail;
+        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+        req->bi.unused = 101;
+        *bi = req->bi;
+        req->state = WRITE_L3;
+        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
+        break;
+    
+    /* L3 Zero Path: */
+
+    case ALLOC_DATA_L3z:
+
+        DPRINTF("ALLOC_DATA_L3z\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L3][L3_IDX(req->vaddr)] = a;
+        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+        req->bi.unused = 102;
+        *bi = req->bi;
+        req->state = WRITE_L3_L3z;
+        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
+        break;
+    
+    /* L3 Fault Path: */
+
+    case ALLOC_DATA_L3f:
+    
+        DPRINTF("ALLOC_DATA_L3f\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L3][L3_IDX(req->vaddr)] = a;
+        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+        req->bi.unused = 103;
+        *bi = req->bi;
+        req->state = WRITE_L3_L3f;
+        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
+        break;
+
+    /* L2 Zero Path: */
+        
+    case ALLOC_DATA_L2z:
+
+        DPRINTF("ALLOC_DATA_L2z\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L3] = newblock();
+        req->radix[L3][L3_IDX(req->vaddr)] = a;
+        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+        req->bi.unused = 104;
+        *bi = req->bi;
+        req->state = ALLOC_L3_L2z;
+        block_alloc( (char*)req->radix[L3], write_cb, req );
+        break;
+
+    case ALLOC_L3_L2z:
+
+        DPRINTF("ALLOC_L3_L2z\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L2][L2_IDX(req->vaddr)] = a;
+        req->state = WRITE_L2_L2z;
+        block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
+        break;
+        
+    /* L2 Fault Path: */
+        
+    case READ_L3_L2f:
+    
+       DPRINTF("READ_L3_L2f\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        clear_L3_w_bits(node);
+        if (node == NULL) goto fail;
+        a    = node[L2_IDX(req->vaddr)];
+        addr = getid(a);
+
+        req->radix[L3] = node;
+        req->state = ALLOC_DATA_L2f;
+        block_alloc( req->block, write_cb, req );
+        break;
+                
+    case ALLOC_DATA_L2f:
+
+        DPRINTF("ALLOC_DATA_L2f\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L3][L3_IDX(req->vaddr)] = a;
+        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+        req->bi.unused = 105;
+        *bi = req->bi;
+        req->state = ALLOC_L3_L2f;
+        block_alloc( (char*)req->radix[L3], write_cb, req );
+        break;
+
+    case ALLOC_L3_L2f:
+
+        DPRINTF("ALLOC_L3_L2f\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L2][L2_IDX(req->vaddr)] = a;
+        req->state = WRITE_L2_L2f;
+        block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
+        break;
+        
+    /* L1 Zero Path: */
+    
+    case ALLOC_DATA_L1z:
+
+        DPRINTF("ALLOC_DATA_L1z\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L3] = newblock();
+        req->radix[L3][L3_IDX(req->vaddr)] = a;
+        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+        req->bi.unused = 106;
+        *bi = req->bi;
+        req->state = ALLOC_L3_L1z;
+        block_alloc( (char*)req->radix[L3], write_cb, req );
+        break;
+        
+    case ALLOC_L3_L1z:
+
+        DPRINTF("ALLOC_L3_L1z\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L2] = newblock();
+        req->radix[L2][L2_IDX(req->vaddr)] = a;
+        req->state = ALLOC_L2_L1z;
+        block_alloc( (char*)req->radix[L2], write_cb, req );
+        break;
+
+    case ALLOC_L2_L1z:
+
+        DPRINTF("ALLOC_L2_L1z\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L1][L1_IDX(req->vaddr)] = a;
+        req->state = WRITE_L1_L1z;
+        block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
+        break;
+
+    /* L1 Fault Path: */
+        
+    case READ_L2_L1f:
+    
+       DPRINTF("READ_L2_L1f\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        clear_w_bits(node);
+        if (node == NULL) goto fail;
+        a    = node[L2_IDX(req->vaddr)];
+        addr = getid(a);
+
+        req->radix_addr[L3] = addr;
+        req->radix[L2] = node;
+        
+        if (addr == ZERO) {
+            /* nothing below L2, create an empty L3 and alloc data. */
+            /* (So skip READ_L3_L1f.) */
+            req->radix[L3] = newblock();
+            req->state = ALLOC_DATA_L1f;
+            block_alloc( req->block, write_cb, req );
+        } else {
+            req->state = READ_L3_L1f;
+            block_read( addr, write_cb, req );
+        }
+        break;
+        
+    case READ_L3_L1f:
+    
+       DPRINTF("READ_L3_L1f\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        clear_L3_w_bits(node);
+        if (node == NULL) goto fail;
+        a    = node[L2_IDX(req->vaddr)];
+        addr = getid(a);
+
+        req->radix[L3] = node;
+        req->state = ALLOC_DATA_L1f;
+        block_alloc( req->block, write_cb, req );
+        break;
+                
+    case ALLOC_DATA_L1f:
+
+        DPRINTF("ALLOC_DATA_L1f\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L3][L3_IDX(req->vaddr)] = a;
+        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+        req->bi.unused = 107;
+        *bi = req->bi;
+        req->state = ALLOC_L3_L1f;
+        block_alloc( (char*)req->radix[L3], write_cb, req );
+        break;
+
+    case ALLOC_L3_L1f:
+
+        DPRINTF("ALLOC_L3_L1f\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L2][L2_IDX(req->vaddr)] = a;
+        req->state = ALLOC_L2_L1f;
+        block_alloc( (char*)req->radix[L2], write_cb, req );
+        break;
+
+    case ALLOC_L2_L1f:
+
+        DPRINTF("ALLOC_L2_L1f\n");
+        addr = IO_ADDR(r);
+        a = writable(addr);
+        req->radix[L1][L1_IDX(req->vaddr)] = a;
+        req->state = WRITE_L1_L1f;
+        block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
+        break;
+
+    case WRITE_L3:
+    case WRITE_L3_L3z:
+    case WRITE_L3_L3f:
+    case WRITE_L2_L2z:
+    case WRITE_L2_L2f:
+    case WRITE_L1_L1z:
+    case WRITE_L1_L1f:
+    {
+       int i;
+        DPRINTF("DONE\n");
+        /* free any saved node vals. */
+        for (i=0; i<3; i++)
+            if (req->radix[i] != 0) free(req->radix[i]);
+        req->retval = r;
+        req->state = WRITE_UNLOCKED;
+        block_wunlock(req->lock, L1_IDX(req->vaddr), write_cb, req);
+        break;
+    }
+    case WRITE_UNLOCKED:
+    {
+        struct io_ret r;
+        io_cb_t cb;
+        DPRINTF("WRITE_UNLOCKED!\n");
+        req_param = req->param;
+        r         = req->retval;
+        cb        = req->cb;
+        free(req);
+        cb(r, req_param);
+        break;
+    }
+        
+    default:
+       DPRINTF("*** Write: Bad state! (%d) ***\n", req->state);
+       goto fail;
+    }
+    
+    return;
+    
+ fail:
+    {
+        struct io_ret r;
+        io_cb_t cb;
+        int i;
+
+        DPRINTF("asyn_write had a read error mid-way.\n");
+        req_param = req->param;
+        cb        = req->cb;
+        r.type = IO_INT_T;
+        r.u.i  = -1;
+        /* free any saved node vals. */
+        for (i=0; i<3; i++)
+            if (req->radix[i] != 0) free(req->radix[i]);
+        free(req);
+        cb(r, req_param);
+    }
+}
+
+/*
+ * vdi_read_s: blocking wrapper around vdi_read().
+ *
+ *   @vdi:   vdi to read from
+ *   @vaddr: virtual block address to read
+ *
+ *   @return: the block handed to the completion callback; stays NULL when
+ *            vdi_read() returns non-zero and the callback never runs.
+ */
+char *vdi_read_s(vdi_t *vdi, u64 vaddr)
+{
+    pthread_mutex_t done = PTHREAD_MUTEX_INITIALIZER;
+    char *result = NULL;
+
+    /* Completion handler (GCC nested function): stores the block in the
+     * enclosing frame and releases the mutex passed through param. */
+    void reads_cb(struct io_ret r, void *param) 
+    {
+        pthread_mutex_t *waiter = (pthread_mutex_t *)param;
+
+        result = IO_BLOCK(r);
+        pthread_mutex_unlock(waiter);
+    }
+
+    /* Hold the mutex before submitting, so that the second lock below
+     * blocks until reads_cb() has released it. */
+    pthread_mutex_lock(&done);
+
+    if (vdi_read(vdi, vaddr, reads_cb, &done) != 0)
+        return result;
+
+    pthread_mutex_lock(&done);
+    return result;
+}
+
+
+/*
+ * vdi_write_s: blocking wrapper around vdi_write().
+ *
+ *   @vdi:   vdi to write to
+ *   @vaddr: virtual block address to write
+ *   @block: data to write
+ *
+ *   @return: the integer result delivered to the completion callback, or
+ *            the non-zero value returned by vdi_write() when the request
+ *            is rejected before any callback can run.
+ */
+int vdi_write_s(vdi_t *vdi, u64 vaddr, char *block)
+{
+    pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
+    int ret, result = -1;
+
+    /* Completion handler (GCC nested function): records the result in the
+     * enclosing frame and releases the mutex passed through param. */
+    void writes_cb(struct io_ret r, void *param) 
+    {
+        result = IO_INT(r);
+        pthread_mutex_unlock((pthread_mutex_t *)param);
+    }
+
+    /* Hold the mutex before submitting, so that the second lock below
+     * blocks until writes_cb() has released it. */
+    pthread_mutex_lock(&m);
+    ret = vdi_write(vdi, vaddr, block, writes_cb, &m);
+
+    /* Rejected up front: there is no completion to wait for, and result
+     * was never written by the callback, so report vdi_write's own code. */
+    if (ret != 0)
+        return ret;
+
+    pthread_mutex_lock(&m);
+    
+    return result;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/requests-async.h
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/requests-async.h    Sun Jul  3 22:36:48 2005
@@ -0,0 +1,29 @@
+#ifndef _REQUESTSASYNC_H_
+#define _REQUESTSASYNC_H_
+
+#include "block-async.h"
+#include "blockstore.h" /* for newblock etc. */
+
+/*
+#define BLOCK_SIZE 4096
+#define ZERO 0ULL
+#define getid(x) (((x)>>1)&0x7fffffffffffffffLLU)
+#define iswritable(x) (((x) & 1LLU) != 0)
+#define writable(x) (((x) << 1) | 1LLU)
+#define readonly(x) ((u64)((x) << 1))
+*/
+
+/* Mask of the bits a virtual block address is allowed to use. */
+#define VADDR_MASK 0x0000000003ffffffLLU /* 26-bits = 256Gig */
+/* True when x has no bits set outside VADDR_MASK. */
+#define VALID_VADDR(x) (((x) & VADDR_MASK) == (x))
+
+/* Asynchronous entry points: cb is the completion callback and param is
+ * handed back to it.  The synchronous wrappers treat a return of 0 as
+ * "request accepted, callback will run".
+ * NOTE(review): non-zero returns are presumably the ERR_* codes below --
+ * confirm against requests-async.c. */
+int vdi_read (vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param);
+int vdi_write(vdi_t *vdi, u64 vaddr, char *block, io_cb_t cb, void *param);
+             
+/* synchronous versions: */
+char *vdi_read_s (vdi_t *vdi, u64 vaddr);
+int   vdi_write_s(vdi_t *vdi, u64 vaddr, char *block);
+
+/* Error codes (both negative). */
+#define ERR_BAD_VADDR  -1
+#define ERR_NOMEM      -2
+
+#endif //_REQUESTSASYNC_H_
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_unittest.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_unittest.c      Sun Jul  3 22:36:48 2005
@@ -0,0 +1,184 @@
+/**************************************************************************
+ * 
+ * vdi_unittest.c
+ *
+ * Run a small test workload to ensure that data access through a vdi
+ * is (at least superficially) correct.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "requests-async.h"
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+#define TEST_PAGES  32
+static char *zero_page;
+static char pages[TEST_PAGES][BLOCK_SIZE];
+static int next_page = 0;
+
+/*
+ * fill_test_pages: fill every entry of pages[] with random() data and
+ * allocate zero_page with newblock().
+ */
+void fill_test_pages(void)
+{
+    int i;
+    size_t j;
+    long *page;
+
+    for (i=0; i< TEST_PAGES; i++) {
+        page = (long *)pages[i];
+        /* Bound the loop by the number of longs that fit in one block;
+         * a fixed BLOCK_SIZE/4 overruns the page when long is 8 bytes. */
+        for (j=0; j<(BLOCK_SIZE/sizeof(long)); j++) {
+            page[j] = random();
+        }
+    }
+
+    zero_page = newblock();
+}
+
+/*
+ * make_vaddr: pack three radix indices into one virtual block address.
+ * L1 ends up in the highest bits; L2 and L3 are each OR-ed in after a
+ * 9-bit left shift of what came before.
+ */
+inline u64 make_vaddr(u64 L1, u64 L2, u64 L3)
+{
+    return (((L1 << 9) | L2) << 9) | L3;
+}
+
+/*
+ * touch_block: write the next unused test page to (L1, L2, L3) through the
+ * synchronous API, read it back, and report any difference on stdout.
+ *
+ * Consumes one entry of pages[] (advances next_page) on every call.
+ */
+void touch_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3)
+{
+    u64 vaddr;
+    char *page = pages[next_page++];
+    char *rpage = NULL;
+
+    printf("TOUCH (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
+
+    vaddr = make_vaddr(L1, L2, L3);
+    vdi_write_s(vdi, vaddr, page);
+    rpage = vdi_read_s(vdi, vaddr);
+
+    if (rpage == NULL) 
+    {
+        printf( "read %Lu returned NULL\n", vaddr); 
+        return; 
+    }
+
+    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
+        printf( "read %Lu returned a different page\n", vaddr);
+
+    /* Release the read-back block on the mismatch path as well. */
+    freeblock(rpage);
+}
+
+/*
+ * test_block: read (L1, L2, L3) through the synchronous API and compare
+ * the result with the expected page, reporting any problem on stdout.
+ *
+ *   @page: BLOCK_SIZE bytes of expected content
+ */
+void test_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3, char *page)
+{
+    u64 vaddr;
+    char *rpage = NULL;
+
+    printf("TEST  (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
+
+    vaddr = make_vaddr(L1, L2, L3);
+    rpage = vdi_read_s(vdi, vaddr);
+
+    if (rpage == NULL) 
+    {
+        printf( "read %Lu returned NULL\n", vaddr); 
+        return; 
+    }
+
+    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
+        printf( "read %Lu returned a different page\n", vaddr);
+
+    /* Release the read-back block on the mismatch path as well. */
+    freeblock(rpage);
+}
+
+/*
+ * coverage_test: drive a fixed sequence of writes and reads, with one
+ * snapshot in the middle, intended to exercise each path of the async
+ * radix request code.
+ */
+void coverage_test(vdi_t *vdi)
+{
+    /* i, j, k remember which test page was written to a given address
+     * before the snapshot, so it can be re-checked afterwards. */
+    int i, j, k;
+
+    /* Do a series of writes and reads to test all paths through the 
+     * async radix code.  The radix request code will dump CRC warnings
+     * if there are data problems here as well.
+     */
+
+    /* L1 Zero */
+    touch_block(vdi, 0, 0, 0);
+
+    /* L2 Zero */
+    i = next_page;
+    touch_block(vdi, 0, 1, 0);
+
+    /* L3 Zero */
+    j = next_page;
+    touch_block(vdi, 0, 0, 1);
+    k = next_page;
+    touch_block(vdi, 0, 1, 1);
+
+    /* Direct write */
+    touch_block(vdi, 0, 0, 0);
+
+    vdi_snapshot(vdi);
+
+    /* L1 fault */
+    touch_block(vdi, 0, 0, 0);
+    /* test the read-only branches that should have been copied over. */
+    test_block(vdi, 0, 1, 0, pages[i]);
+    test_block(vdi, 0, 0, 1, pages[j]);
+
+    /* L2 fault */
+    touch_block(vdi, 0, 1, 0);
+    test_block(vdi, 0, 1, 1, pages[k]);
+
+    /* L3 fault */
+    touch_block(vdi, 0, 0, 1);
+    
+    /* read - L1 zero */
+    test_block(vdi, 1, 0, 0, zero_page);
+    
+    /* read - L2 zero */
+    test_block(vdi, 0, 2, 0, zero_page);
+
+    /* read - L3 zero */
+    test_block(vdi, 0, 0, 2, zero_page);
+}
+
+/*
+ * Unit-test entry point: initialise the block store, the async block
+ * layer and the vdi layer, create a scratch VDI and run coverage_test()
+ * against it.  Exits with -1 if the VDI cannot be created.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t       *vdi;
+    
+    __init_blockstore();
+    init_block_async();
+    __init_vdi();
+        
+    vdi = vdi_create( NULL, "UNIT TEST VDI");
+    
+    if ( vdi == NULL ) {
+        /* Nothing to release here: vdi is NULL. */
+        printf("Failed to create VDI!\n");
+        exit(-1);
+    }
+
+    fill_test_pages();
+    coverage_test(vdi);
+    
+    freeblock(vdi);
+    
+    return (0);
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/block-async.h
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/block-async.h       Sun Jul  3 22:36:48 2005
@@ -0,0 +1,69 @@
+/* block-async.h
+ * 
+ * Asynchronous block wrappers for parallax.
+ */
+ 
+#ifndef _BLOCKASYNC_H_
+#define _BLOCKASYNC_H_
+
+#include <assert.h>
+#include <xc.h>
+#include "vdi.h"
+
+/*
+ * Result of an asynchronous block operation, passed by value to the
+ * completion callback.  'type' says which member of 'u' is meaningful;
+ * the IO_ADDR / IO_BLOCK / IO_INT accessors assert on it before reading.
+ */
+struct io_ret
+{
+    enum {IO_ADDR_T, IO_BLOCK_T, IO_INT_T} type;
+    union {
+        u64   a;   /* read via IO_ADDR(),  type == IO_ADDR_T  */
+        char *b;   /* read via IO_BLOCK(), type == IO_BLOCK_T */
+        int   i;   /* read via IO_INT(),   type == IO_INT_T   */
+    } u;
+};
+
+/* Completion callback: receives the result and the caller's param. */
+typedef void (*io_cb_t)(struct io_ret r, void *param);
+
+/* per-vdi lock structures to make sure requests run in a safe order. */
+/* One queued lock request: the kind of lock wanted plus the callback and
+ * param to use for it; entries chain through 'next'.
+ * NOTE(review): presumably cb is invoked once the lock is granted --
+ * confirm in block-async.c. */
+struct radix_wait {
+    enum {RLOCK, WLOCK} type;
+    io_cb_t  cb;
+    void    *param;
+    struct radix_wait *next;
+};
+
+/*
+ * Lock state: one mutex plus three parallel 1024-entry arrays.
+ * NOTE(review): the arrays are presumably indexed by the 'row' argument
+ * of block_rlock/block_wlock and friends, with 'lines' a per-row count
+ * and 'waiters' the head of that row's radix_wait list -- confirm in
+ * block-async.c.
+ */
+struct radix_lock {
+    pthread_mutex_t lock;
+    int                    lines[1024];
+    struct radix_wait     *waiters[1024];
+    enum {ANY, READ, STOP} state[1024];
+};
+void radix_lock_init(struct radix_lock *r);
+
+void block_read(u64 addr, io_cb_t cb, void *param);
+void block_write(u64 addr, char *block, io_cb_t cb, void *param);
+void block_alloc(char *block, io_cb_t cb, void *param);
+void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
+void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
+void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
+void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
+void init_block_async(void);
+
+/* Return the address member of r; asserts that r is tagged IO_ADDR_T. */
+static inline u64 IO_ADDR(struct io_ret r)
+{
+    assert(r.type == IO_ADDR_T);
+    return r.u.a;
+}
+
+/* Return the block pointer member of r; asserts that r is tagged
+ * IO_BLOCK_T.  Callers in this changeset treat a NULL result as failure. */
+static inline char *IO_BLOCK(struct io_ret r)
+{
+    assert(r.type == IO_BLOCK_T);
+    return r.u.b;
+}
+
+/* Return the integer member of r; asserts that r is tagged IO_INT_T.
+ * Callers in this changeset treat a negative result as failure. */
+static inline int IO_INT(struct io_ret r)
+{
+    assert(r.type == IO_INT_T);
+    return r.u.i;
+}
+
+
+#endif //_BLOCKASYNC_H_
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_snap.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_snap.c  Sun Jul  3 22:36:48 2005
@@ -0,0 +1,43 @@
+/**************************************************************************
+ * 
+ * vdi_snap.c
+ *
+ * Snapshot a vdi.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/*
+ * vdi_snap: snapshot the VDI whose id is given as the first argument.
+ * Exits with -1 on a missing argument or an unknown VDI id.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t  *vdi;
+    u64     id;
+    
+    __init_blockstore();
+    __init_vdi();
+    
+    if ( argc == 1 ) {
+        printf("usage: %s <VDI id>\n", argv[0]);
+        exit(-1);
+    }
+    
+    id = (u64) atoll(argv[1]);
+    
+    vdi = vdi_get(id);
+    
+    if ( vdi == NULL ) {
+        /* Nothing to release here: vdi is NULL. */
+        printf("couldn't find the requested VDI.\n");
+        exit(-1);
+    }
+    
+    vdi_snapshot(vdi);
+    
+    return 0;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_create.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_create.c        Sun Jul  3 22:36:48 2005
@@ -0,0 +1,52 @@
+/**************************************************************************
+ * 
+ * vdi_create.c
+ *
+ * Create a new vdi.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/*
+ * vdi_create: create a new VDI named argv[1].  When both a snapshot block
+ * (argv[2]) and snapshot index (argv[3]) are given, they are passed to
+ * vdi_create() as the snapshot id; otherwise NULL is passed.
+ * Exits with -1 on a missing name or if creation fails.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t       *vdi;
+    char         name[VDI_NAME_SZ] = "";
+    snap_id_t    id;
+    int          from_snap = 0;
+    
+    __init_blockstore();
+    __init_vdi();
+    
+    if ( argc == 1 ) {
+        printf("usage: %s <VDI Name> [<snap block> <snap idx>]\n", argv[0]);
+        exit(-1);
+    }
+    
+    /* strncpy does not terminate when the source fills the buffer, so
+     * force a terminator into the last valid slot of name[]. */
+    strncpy( name, argv[1], VDI_NAME_SZ);
+    name[VDI_NAME_SZ - 1] = '\0';    
+    
+    if ( argc > 3 ) {
+        id.block   = (u64)          atoll(argv[2]);
+        id.index   = (unsigned int) atol (argv[3]);
+        from_snap  = 1;
+    }
+    
+    vdi = vdi_create( from_snap ? &id : NULL, name);
+    
+    if ( vdi == NULL ) {
+        /* Nothing to release here: vdi is NULL. */
+        printf("Failed to create VDI!\n");
+        exit(-1);
+    }
+    
+    freeblock(vdi);
+    
+    return (0);
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_validate.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_validate.c      Sun Jul  3 22:36:48 2005
@@ -0,0 +1,97 @@
+/**************************************************************************
+ * 
+ * vdi_validate.c
+ *
+ * Intended to sanity-check vm_fill and the underlying vdi code.
+ *
+ * Block-by-block compare of a vdi with a file/device on the disk.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+#include "requests-async.h"
+
+/*
+ * vdi_validate: compare the VDI with id argv[1] block-by-block against the
+ * file or device argv[2].
+ *
+ * Exits with 0 when every block read from the file matches the VDI, and
+ * with -1 on usage errors, I/O errors, an unmapped VDI block or a
+ * mismatch.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t       *vdi;
+    u64          id;
+    int          fd;
+    struct stat  st;
+    u64          tot_size;
+    char         spage[BLOCK_SIZE], *dpage;
+    u64          vblock = 0;
+    ssize_t      count;   /* signed, so a read() error (-1) ends the loop */
+    
+    __init_blockstore();
+    init_block_async();
+    __init_vdi();
+    
+    if ( argc < 3 ) {
+        printf("usage: %s <VDI id> <filename>\n", argv[0]);
+        exit(-1);
+    }
+        
+    id = (u64) atoll(argv[1]);
+    
+    vdi = vdi_get( id );
+    
+    if ( vdi == NULL ) {
+        printf("Failed to retreive VDI %Ld!\n", id);
+        exit(-1);
+    }
+    
+    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
+    
+    if (fd < 0) {
+        printf("Couldn't open %s!\n", argv[2]);
+        exit(-1);
+    }
+    
+    if ( fstat(fd, &st) != 0 ) {
+        printf("Couldn't stat %s!\n", argv[2]);
+        exit(-1);
+    }
+    
+    tot_size = (u64) st.st_size;
+    printf("Testing VDI %Ld (%Ld bytes).\n", id, tot_size);
+    
+    printf("           ");
+    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
+
+        dpage = vdi_read_s(vdi, vblock);
+
+        if (dpage == NULL) {
+            printf("\n\nfound an unmapped VDI block (%Ld)\n", vblock);
+            exit(-1);
+        }
+
+        /* Compare only the bytes this read() delivered: after a short
+         * final read the rest of spage still holds the previous block. */
+        if (memcmp(spage, dpage, (size_t) count) != 0) {
+            printf("\n\nblocks don't match! (%Ld)\n", vblock);
+            exit(-1);
+        }
+        
+        freeblock(dpage);
+        
+        vblock++;
+        if ((vblock % 1024) == 0) {
+            printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
+            fflush(stdout);
+        }
+    }
+
+    if (count < 0) {
+        printf("\n\nError reading %s!\n", argv[2]);
+        exit(-1);
+    }
+    printf("\n");
+    
+    printf("VDI %Ld looks good!\n", id);
+    
+    freeblock(vdi);
+    
+    return (0);
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_fill.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_fill.c  Sun Jul  3 22:36:48 2005
@@ -0,0 +1,81 @@
+/**************************************************************************
+ * 
+ * vdi_fill.c
+ *
+ * Hoover a file or device into a vdi.
+ * You must first create the vdi with vdi_create.
+ *
+ */
+ 
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "requests-async.h"
+#include "vdi.h"
+
+/*
+ * vdi_fill: copy the file or device argv[2] into the VDI with id argv[1],
+ * one BLOCK_SIZE block per virtual block, starting at block 0.
+ *
+ * A short final read is written as a full block; the bytes past the end
+ * of the data read are whatever spage held from the previous iteration.
+ *
+ * Exits with -1 on usage or I/O errors.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t       *vdi;
+    u64          id;
+    int          fd;
+    struct stat  st;
+    u64          tot_size;
+    char         spage[BLOCK_SIZE];
+    u64          vblock = 0;
+    ssize_t      count;   /* signed, so a read() error (-1) ends the loop */
+    
+    __init_blockstore();
+    init_block_async();
+    __init_vdi();
+    
+    if ( argc < 3 ) {
+        printf("usage: %s <VDI id> <filename>\n", argv[0]);
+        exit(-1);
+    }
+        
+    id = (u64) atoll(argv[1]);
+    
+    vdi = vdi_get( id );
+    
+    if ( vdi == NULL ) {
+        printf("Failed to retreive VDI %Ld!\n", id);
+        exit(-1);
+    }
+    
+    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
+    
+    if (fd < 0) {
+        printf("Couldn't open %s!\n", argv[2]);
+        exit(-1);
+    }
+    
+    if ( fstat(fd, &st) != 0 ) {
+        printf("Couldn't stat %s!\n", argv[2]);
+        exit(-1);
+    }
+    
+    tot_size = (u64) st.st_size;
+    printf("Filling VDI %Ld with %Ld bytes.\n", id, tot_size);
+    
+    printf("%011Ld blocks total\n", tot_size / BLOCK_SIZE);    
+    printf("           ");
+    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
+        vdi_write_s(vdi, vblock, spage);
+        
+        vblock++;
+        /* Refresh the progress counter every 512 blocks. */
+        if ((vblock % 512) == 0) {
+            printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
+            fflush(stdout);
+        }
+    }
+
+    if (count < 0) {
+        printf("\n\nError reading %s!\n", argv[2]);
+        exit(-1);
+    }
+    printf("\n");
+    
+    freeblock(vdi);
+    
+    return (0);
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/radix.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/radix.c     Sun Jul  3 22:36:48 2005
@@ -0,0 +1,631 @@
+/*
+ * Radix tree for mapping (up to) 63-bit virtual block IDs to
+ * 63-bit global block IDs
+ *
+ * Pointers within the tree set aside the least significant bit to indicate
+ * whether or not the target block is writable from this node.
+ *
+ * The block with ID 0 is assumed to be an empty block of all zeros
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <pthread.h>
+#include "blockstore.h"
+#include "radix.h"
+
+#define RADIX_TREE_MAP_SHIFT 9
+#define RADIX_TREE_MAP_MASK 0x1ff
+#define RADIX_TREE_MAP_ENTRIES 512
+
+/*
+#define DEBUG
+*/
+
+/* Experimental radix cache. */
+
+static  pthread_mutex_t rcache_mutex = PTHREAD_MUTEX_INITIALIZER;
+static  int rcache_count = 0;
+#define RCACHE_MAX 1024
+
+typedef struct rcache_st {
+    radix_tree_node  *node;
+    u64               id;
+    struct rcache_st *hash_next;
+    struct rcache_st *cache_next;
+    struct rcache_st *cache_prev;
+} rcache_t;
+
+static rcache_t *rcache_head = NULL;
+static rcache_t *rcache_tail = NULL;
+
+#define RCHASH_SIZE 512ULL
+rcache_t *rcache[RCHASH_SIZE];
+#define RCACHE_HASH(_id) ((_id) & (RCHASH_SIZE - 1))
+
+/* Reset every bucket of the rcache hash table to NULL. */
+void __rcache_init(void)
+{
+    rcache_t **slot = rcache;
+    rcache_t **end  = rcache + RCHASH_SIZE;
+
+    while (slot != end)
+        *slot++ = NULL;
+}
+    
+
+/*
+ * rcache_write: store a private copy of a block in the radix cache.
+ *   @id:   block id used as the cache key
+ *   @node: BLOCK_SIZE bytes of block content to copy in
+ *
+ * If the id is already cached its copy is overwritten and the entry is
+ * moved to the head of the cache list.  Otherwise a new entry is added at
+ * the head, reusing the tail entry once RCACHE_MAX entries exist.
+ * If memory for the new entry cannot be obtained the cache is left
+ * unchanged.  Runs under rcache_mutex.
+ */
+void rcache_write(u64 id, radix_tree_node *node)
+{
+    rcache_t *r, *tmp, **curs;
+    radix_tree_node *copy;
+    
+    pthread_mutex_lock(&rcache_mutex);
+    
+    /* Is it already in the cache? */
+    for (r = rcache[RCACHE_HASH(id)]; r != NULL; r = r->hash_next) {
+        if (r->id != id)
+            continue;
+
+        memcpy(r->node, node, BLOCK_SIZE);
+            
+        /* bring to front. */
+        if (r != rcache_head) {
+                
+            if (r == rcache_tail) {
+                if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
+                rcache_tail->cache_next = NULL;
+            }
+
+            tmp = r->cache_next;
+            if (r->cache_next != NULL) r->cache_next->cache_prev 
+                                                 = r->cache_prev;
+            if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp;
+
+            r->cache_prev = NULL;
+            r->cache_next = rcache_head;
+            if (rcache_head != NULL) rcache_head->cache_prev = r;
+            rcache_head = r;
+        }
+
+        goto done;
+    }
+    
+    /* Get the block for the copy before touching any cache structure, so
+     * an allocation failure cannot leave a half-updated cache behind. */
+    copy = newblock();
+    if (copy == NULL)
+        goto done;
+
+    if ( rcache_count == RCACHE_MAX ) 
+    {
+        /* Remove an entry: detach the tail from the cache list and from
+         * its hash chain, then reuse its rcache_t for the new block. */
+        
+        r = rcache_tail;
+        if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
+        rcache_tail->cache_next = NULL;
+        freeblock(r->node);
+        
+        curs = &rcache[RCACHE_HASH(r->id)];
+        while ((*curs) != r)
+            curs = &(*curs)->hash_next;
+        *curs = r->hash_next;
+        
+    } else {
+        
+        r = (rcache_t *)malloc(sizeof(rcache_t));
+        if (r == NULL) {
+            freeblock(copy);
+            goto done;
+        }
+        rcache_count++;
+    }
+    
+    r->node = copy;
+    memcpy(r->node, node, BLOCK_SIZE);
+    r->id = id;
+    
+    /* Link at the head of the hash chain ... */
+    r->hash_next = rcache[RCACHE_HASH(id)];
+    rcache[RCACHE_HASH(id)] = r;
+    
+    /* ... and at the head of the cache list. */
+    r->cache_prev = NULL;
+    r->cache_next = rcache_head;
+    if (rcache_head != NULL) rcache_head->cache_prev = r;
+    rcache_head = r;
+    if (rcache_tail == NULL) rcache_tail = r;
+    
+done:
+    pthread_mutex_unlock(&rcache_mutex);
+}
+
+/*
+ * rcache_read: look a block up in the radix cache.
+ *   @id: block id to look up
+ *
+ *   @return: a newly allocated copy of the cached block, or NULL when the
+ *            id is not cached or the copy cannot be allocated.
+ *
+ * A hit moves the entry to the head of the cache list.  Runs under
+ * rcache_mutex.
+ */
+radix_tree_node *rcache_read(u64 id)
+{
+    rcache_t *r, *tmp;
+    radix_tree_node *node = NULL;
+    
+    pthread_mutex_lock(&rcache_mutex);
+
+    for (r = rcache[RCACHE_HASH(id)]; r != NULL; r = r->hash_next)
+        if (r->id == id) break;
+
+    /* Miss. */
+    if (r == NULL)
+        goto done;
+   
+    /* bring to front. */
+    if (r != rcache_head) 
+    {
+        if (r == rcache_tail) {
+            if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
+            rcache_tail->cache_next = NULL;
+        }
+        tmp = r->cache_next;
+        if (r->cache_next != NULL) r->cache_next->cache_prev = r->cache_prev;
+        if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp;
+
+        r->cache_prev = NULL;
+        r->cache_next = rcache_head;
+        if (rcache_head != NULL) rcache_head->cache_prev = r;
+        rcache_head = r;
+    }
+    
+    /* Hand out a copy; if it cannot be allocated, report a miss rather
+     * than copying into a NULL pointer. */
+    node = newblock();
+    if (node != NULL)
+        memcpy(node, r->node, BLOCK_SIZE);
+    
+done:
+    pthread_mutex_unlock(&rcache_mutex);
+    
+    return(node);
+}
+
+
+/*
+ * rc_readblock: read a block through the radix cache.
+ * Tries the cache first; on a miss it falls back to readblock() and, if
+ * that succeeds, stores the result in the cache.
+ * Returns the block, or NULL if readblock() returned NULL.
+ */
+void *rc_readblock(u64 id)
+{
+    void *blk = (void *)rcache_read(id);
+
+    if (blk == NULL) {
+        blk = readblock(id);
+        if (blk != NULL)
+            rcache_write(id, blk);
+    }
+
+    return blk;
+}
+
+/*
+ * rc_allocblock: allocate a block with allocblock() and, unless the
+ * returned id is ZERO, store its content in the radix cache.
+ * Returns the id from allocblock().
+ */
+u64 rc_allocblock(void *block)
+{
+    const u64 id = allocblock(block);
+
+    if (id == ZERO)
+        return id;
+
+    rcache_write(id, block);
+    return id;
+}
+
+/*
+ * rc_writeblock: write a block with writeblock() and mirror it into the
+ * radix cache.
+ * Returns the result of writeblock(); callers in this file treat a
+ * negative value as failure.
+ */
+int rc_writeblock(u64 id, void *block)
+{
+    int ret;
+    
+    ret = writeblock(id, block);
+
+    /* Only cache content that was actually written, so a failed write
+     * cannot leave the cache holding data the store does not have. */
+    if (ret >= 0)
+        rcache_write(id, block);
+    
+    return(ret);
+}
+
+
+/*
+ * block device interface and other helper functions
+ * with these functions, block id is just a 63-bit number, with
+ * no special consideration for the LSB
+ */
+radix_tree_node cloneblock(radix_tree_node block);
+
+/*
+ * main api
+ * with these functions, the LSB of root always indicates
+ * whether or not the block is writable, including the return
+ * values of update and snapshot
+ */
+u64 lookup(int height, u64 root, u64 key);
+u64 update(int height, u64 root, u64 key, u64 val);
+u64 snapshot(u64 root);
+
+/**
+ * cloneblock: clone an existing block in memory
+ *   @block: the old block (may be NULL)
+ *
+ *   @return: new malloc'ed block, with LSB cleared for every entry, or
+ *            NULL if block is NULL or memory cannot be allocated
+ */
+radix_tree_node cloneblock(radix_tree_node block) {
+    radix_tree_node node;
+    int i;
+
+    /* update() can pass on a failed read; there is nothing to clone. */
+    if (block == NULL)
+        return NULL;
+
+    node = (radix_tree_node) malloc(BLOCK_SIZE);
+    if (node == NULL) {
+        perror("cloneblock malloc");
+        return NULL;
+    }
+    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++)
+        node[i] = block[i] & ONEMASK;
+    return node;
+}
+
+/**
+ * lookup: walk the radix tree from root and fetch the entry for key
+ *   @height: height in bits of the radix tree
+ *   @root: root node id, with set LSB indicating writable node
+ *   @key: key to lookup
+ *
+ *   @return: the entry found, whose LSB is set only if every entry read
+ *            on the way down had its LSB set; zero if a ZERO id is met or
+ *            a block cannot be read
+ */
+
+u64 lookup(int height, u64 root, u64 key) {
+    radix_tree_node level;
+    u64 wbit = ONE;
+    int shift;
+    
+    assert(key >> height == 0);
+
+    /* the root block may be smaller to ensure all leaves are full */
+    shift = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
+
+    /* consume one RADIX_TREE_MAP_SHIFT-bit slice of the key per level */
+    for (;; shift -= RADIX_TREE_MAP_SHIFT) {
+
+#ifdef DEBUG
+        printf("lookup: height=%3d root=%3Ld offset=%3d%s\n", shift, root,
+                (int) ((key >> shift) & RADIX_TREE_MAP_MASK),
+                (iswritable(root) ? "" : " (readonly)"));
+#endif
+        
+        if (getid(root) == ZERO)
+            return ZERO;
+
+        level = (radix_tree_node) rc_readblock(getid(root));
+        if (level == NULL)
+            return ZERO;
+
+        /* step to the child and fold its LSB into the running mask */
+        root = level[(key >> shift) & RADIX_TREE_MAP_MASK];
+        wbit &= root;
+        freeblock(level);
+
+        if (shift == 0)
+            return ( root & ONEMASK ) | wbit;
+    }
+
+    return ZERO;
+}
+
+/*
+ * update: set a radix tree entry, doing copy-on-write as necessary
+ *   @height: height in bits of the radix tree
+ *   @root: root node id, with set LSB indicating writable node
+ *   @key: key to set
+ *   @val: value to set, s.t. radix(key)=val
+ *
+ *   @returns: (possibly new) root id on success (with LSB=1), 0 on failure
+ *
+ * Recurses one level per call.  A node whose id is not marked writable is
+ * cloned and saved under a newly allocated id instead of being
+ * overwritten.
+ */
+
+u64 update(int height, u64 root, u64 key, u64 val) {
+    int offset;
+    u64 child;
+    radix_tree_node node;
+    
+    /* base case--return val */
+    if (height == 0)
+        return val;
+
+    /* the root block may be smaller to ensure all leaves are full */
+    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
+    offset = (key >> height) & RADIX_TREE_MAP_MASK;
+
+#ifdef DEBUG
+    printf("update: height=%3d root=%3Ld offset=%3d%s\n", height, root,
+            offset, (iswritable(root)?"":" (clone)"));
+#endif
+
+    /* load a block, or create a new one */
+    if (root == ZERO) {
+        node = (radix_tree_node) newblock();
+    } else {
+        node = (radix_tree_node) rc_readblock(getid(root));
+
+        /* Only clone a block that was actually read; a failed read is
+         * reported by the NULL check below. */
+        if (node != NULL && !iswritable(root)) {
+            /* need to clone this node */
+            radix_tree_node oldnode = node;
+            node = cloneblock(node);
+            freeblock(oldnode);
+            /* ZERO marks the node as needing a fresh id when saved */
+            root = ZERO;
+        }
+    }
+
+    if (node == NULL) {
+#ifdef DEBUG
+        printf("update: node is null!\n");
+#endif
+        return ZERO;
+    }
+
+    child = update(height, node[offset], key, val);
+
+    if (child == ZERO) {
+        freeblock(node);
+        return ZERO;
+    } else if (child == node[offset]) {
+        /* no change, so we already owned the child */
+        assert(iswritable(root));
+
+        freeblock(node);
+        return root;
+    }
+
+    node[offset] = child;
+
+    /* new/cloned blocks need to be saved */
+    if (root == ZERO) {
+        /* mark this as an owned block */
+        root = rc_allocblock(node);
+        if (root)
+            root = writable(root);
+    } else if (rc_writeblock(getid(root), node) < 0) {
+        freeblock(node);
+        return ZERO;
+    }
+
+    freeblock(node);
+    return root;
+}
+
+/**
+ * snapshot: fork a tree by duplicating its root node
+ *   @root: root id of the tree to snapshot
+ *
+ * Reads the root node, clones it in memory and stores the clone as a newly
+ * allocated block.
+ *
+ *   @return: id of the new root with the writable bit set, 0 on error
+ */
+u64 snapshot(u64 root) {
+    radix_tree_node original, copy;
+    u64 newroot;
+
+    original = rc_readblock(getid(root));
+    if (original == NULL)
+        return ZERO;
+
+    /* the in-memory original is only needed long enough to clone it */
+    copy = cloneblock(original);
+    freeblock(original);
+    if (copy == NULL)
+        return ZERO;
+
+    newroot = rc_allocblock(copy);
+    freeblock(copy);
+
+    /* an allocation result of 0 means failure and is passed through */
+    if (newroot != ZERO)
+        return writable(newroot);
+
+    return ZERO;
+}
+
+/**
+ * collapse: collapse a parent onto a child.
+ *   @height: height in bits of the (sub)tree rooted at proot/croot
+ *   @proot: parent root id, LSB set if writable
+ *   @croot: child root id, LSB set if writable
+ *
+ * Walks both nodes in parallel.  Wherever the parent's link differs from
+ * the child's and the parent's link is writable, the pair is collapsed
+ * recursively.  Finally, if both roots are writable, the parent block is
+ * released.
+ *
+ *   @return: number of blocks released, or -1 if either root id is zero,
+ *            a block could not be read, or any recursive collapse failed
+ *
+ * NOTE: This assumes that parent and child really are, and further that
+ * there are no other children forked from this parent. (children of the
+ * child are okay...)
+ */
+
+int collapse(int height, u64 proot, u64 croot)
+{
+    int i, numlinks, ret, total = 0;
+    int failed = 0;
+    radix_tree_node pnode, cnode;
+
+    if (height == 0) {
+        height = -1; /* terminate recursion */
+    } else {
+        height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
+    }
+    numlinks = (1UL << RADIX_TREE_MAP_SHIFT);
+
+    /* Terminal cases: */
+
+    if ( (getid(proot) == ZERO) || (getid(croot) == ZERO) )
+        return -1;
+
+    /* get roots */
+    if ((pnode = readblock(getid(proot))) == NULL)
+        return -1;
+
+    if ((cnode = readblock(getid(croot))) == NULL)
+    {
+        freeblock(pnode);
+        return -1;
+    }
+
+    /* For each writable link in proot */
+    for (i=0; i<numlinks; i++)
+    {
+        if ( pnode[i] == cnode[i] ) continue;
+
+        /* collapse (next level) only if height != 0 and writable */
+        if (( height >= 0 ) && ( iswritable(pnode[i]) ) )
+        {
+            ret = collapse(height, pnode[i], cnode[i]);
+            if (ret == -1)
+                failed = 1;   /* remember it; later successes must not hide it */
+            else
+                total += ret;
+        }
+    }
+
+    /* both in-memory copies are done with; they were previously leaked */
+    freeblock(pnode);
+    freeblock(cnode);
+
+    /* if plink is writable, AND clink is writable -> free plink block */
+    if ( ( iswritable(proot) ) && ( iswritable(croot) ) )
+    {
+        releaseblock(getid(proot));
+        total++;
+    }
+
+    return failed ? -1 : total;
+}
+
+
+/**
+ * print_root: dump the tree below @root as a Graphviz "dot" graph
+ *   @root: root id of the (sub)tree to print, LSB set if writable
+ *   @height: height in bits of the (sub)tree
+ *   @dot_f: stream to append to, or NULL to create "radix.dot"
+ *
+ * When @dot_f is NULL this call owns the output file: it creates
+ * "radix.dot", writes the graph preamble and a node for @root, and on every
+ * exit path writes the closing brace and closes the file.  Recursive calls
+ * pass the open stream down and only append nodes and edges.  Writable
+ * nodes are drawn bold and blue.
+ */
+void print_root(u64 root, int height, FILE *dot_f)
+{
+    FILE *f = dot_f;   /* recursive calls write to the caller's stream */
+    int i;
+    radix_tree_node node;
+    char *style[2] = { "", "style=bold,color=blue," };
+
+    if (f == NULL) {
+        f = fopen("radix.dot", "w");
+        if (f == NULL) {
+            perror("print_root: open");
+            return;
+        }
+
+        /* write graph preamble */
+        fprintf(f, "digraph G {\n");
+
+        /* add a node for this root. */
+        fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n",
+                getid(root), style[iswritable(root)], getid(root));
+    }
+
+    printf("print_root(%Ld)\n", getid(root));
+
+    /* base case */
+    if (height == 0) {
+        /* add a node and edge for each child root */
+        node = (radix_tree_node) readblock(getid(root));
+        if (node == NULL)
+            goto out;
+
+        for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) {
+            if (node[i] != ZERO) {
+                fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n",
+                        getid(node[i]), style[iswritable(node[i])],
+                        getid(node[i]));
+                fprintf(f, "   n%Ld -> n%Ld [label=\"%d\"]\n", getid(root),
+                        getid(node[i]), i);
+            }
+        }
+        freeblock(node);
+        goto out;
+    }
+
+    /* the root block may be smaller to ensure all leaves are full */
+    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
+
+    if (getid(root) == ZERO)
+        goto out;
+
+    node = (radix_tree_node) readblock(getid(root));
+    if (node == NULL)
+        goto out;
+
+    /* add a node and edge for each child root */
+    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++)
+        if (node[i] != ZERO) {
+            fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n",
+                    getid(node[i]), style[iswritable(node[i])],
+                    getid(node[i]));
+
+            print_root(node[i], height-RADIX_TREE_MAP_SHIFT, f);
+            fprintf(f, "   n%Ld -> n%Ld [label=\"%d\"]\n", getid(root),
+                    getid(node[i]), i);
+        }
+
+    freeblock(node);
+
+out:
+    /* write graph postamble, but only in the call that opened the file */
+    if (dot_f == NULL) {
+        fprintf(f, "}\n");
+        fclose(f);
+    }
+}
+
+#ifdef RADIX_STANDALONE
+
+/*
+ * Interactive test driver for the radix tree.
+ *
+ * Reads one command per line from stdin (see the help text printed at
+ * start-up) and applies it to the current root.  The loop ends on "quit",
+ * end of input, or a read error.
+ */
+int main(int argc, char **argv) {
+    u64 key = ZERO, val = ZERO;
+    u64 root = writable(2ULL);
+    u64 p = ZERO, c = ZERO;
+    int v;
+    char buff[4096];
+
+    __init_blockstore();
+
+    /* store one all-zero block before accepting commands
+     * NOTE(review): presumably reserves the first block id -- confirm */
+    memset(buff, 0, sizeof(buff));
+    allocblock(buff);
+
+    printf("Recognized commands:\n"
+           "Note: the LSB of a node number indicates if it is writable\n"
+           "  root <node>               set root to <node>\n"
+           "  snapshot                  take a snapshot of the root\n"
+           "  set <key> <val>           set key=val\n"
+           "  get <key>                 query key\n"
+           "  c <proot> <croot>         collapse\n"
+           "  pr                        print tree to dot\n"
+           "  pf <1=verbose>            print freelist\n"
+           "  quit\n"
+           "\nroot = %Ld\n", root);
+    for (;;) {
+        printf("> ");
+        fflush(stdout);
+
+        /* stop on EOF *and* on read errors; testing only feof() would spin
+         * forever on a stale buffer after an error */
+        if (fgets(buff, sizeof(buff), stdin) == NULL)
+            break;
+
+        if (sscanf(buff, " root %Ld", &root) == 1) {
+            printf("root set to %Ld\n", root);
+        } else if (sscanf(buff, " set %Ld %Ld", &key, &val) == 2) {
+            root = update(34, root, key, val);
+            printf("root = %Ld\n", root);
+        } else if (sscanf(buff, " c %Ld %Ld", &p, &c) == 2) {
+            v = collapse(34, p, c);
+            printf("reclaimed %d blocks.\n", v);
+        } else if (sscanf(buff, " get %Ld", &key) == 1) {
+            val = lookup(34, root, key);
+            printf("value = %Ld\n", val);
+        } else if (!strcmp(buff, "quit\n")) {
+            break;
+        } else if (!strcmp(buff, "snapshot\n")) {
+            root = snapshot(root);
+            printf("new root = %Ld\n", root);
+        } else if (sscanf(buff, " pr %Ld", &root) == 1) {
+            /* "pr <node>" also makes <node> the current root */
+            print_root(root, 34, NULL);
+        } else if (!strcmp(buff, "pr\n")) {
+            /* bare "pr", as advertised in the help text: print current root */
+            print_root(root, 34, NULL);
+        } else if (sscanf(buff, " pf %d", &v) == 1) {
+            freelist_count(v);
+        } else if (!strcmp(buff, "pf\n")) {
+            freelist_count(0);
+        } else {
+            printf("command not recognized\n");
+        }
+    }
+    return 0;
+}
+
+#endif
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/radix.h
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/radix.h     Sun Jul  3 22:36:48 2005
@@ -0,0 +1,45 @@
+/*
+ * Radix tree for mapping (up to) 63-bit virtual block IDs to
+ * 63-bit global block IDs
+ *
+ * Pointers within the tree set aside the least significant bit to indicate
+ * whether or not the target block is writable from this node.
+ *
+ * The block with ID 0 is assumed to be an empty block of all zeros
+ */
+
+#ifndef __RADIX_H__
+#define __RADIX_H__
+
+/* I don't really like exposing these, but... */
+#define getid(x) (((x)>>1)&0x7fffffffffffffffLL)
+#define putid(x) ((x)<<1)
+#define writable(x) (((x)<<1)|1LL)
+#define iswritable(x) ((x)&1LL)
+#define ZERO 0LL
+#define ONE 1LL
+#define ONEMASK 0xffffffffffffffeLL
+
+#define RADIX_TREE_MAP_SHIFT 9
+#define RADIX_TREE_MAP_MASK 0x1ff
+#define RADIX_TREE_MAP_ENTRIES 512
+
+typedef u64 *radix_tree_node;
+
+
+/*
+ * main api
+ * with these functions, the LSB of root always indicates
+ * whether or not the block is writable, including the return
+ * values of update and snapshot
+ */
+u64 lookup(int height, u64 root, u64 key);
+u64 update(int height, u64 root, u64 key, u64 val);
+u64 snapshot(u64 root);
+int collapse(int height, u64 proot, u64 croot);
+int isprivate(int height, u64 root, u64 key);
+
+
+void __rcache_init(void);
+
+#endif /* __RADIX_H__ */
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/blockstored.c
--- /dev/null   Sun Jul  3 22:32:52 2005
+++ b/tools/blktap/parallax/blockstored.c       Sun Jul  3 22:36:48 2005
@@ -0,0 +1,276 @@
+/**************************************************************************
+ * 
+ * blockstored.c
+ *
+ * Block store daemon.
+ *
+ */
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <netinet/in.h>
+#include <errno.h>
+#include "blockstore.h"
+
+//#define BSDEBUG
+
+int readblock_into(u64 id, void *block);
+
+/**
+ * open_socket: create a datagram socket bound to @port
+ *   @port: port number in host byte order; bound on INADDR_ANY
+ *
+ *   @return: socket descriptor on success, -1 on failure
+ */
+int open_socket(u16 port) {
+    struct sockaddr_in addr;
+    int fd = socket(AF_INET, SOCK_DGRAM, 0);
+
+    if (fd < 0) {
+        perror("Bad socket");
+        return -1;
+    }
+
+    memset(&addr, 0, sizeof(addr));
+    addr.sin_family      = AF_INET;
+    addr.sin_port        = htons(port);
+    addr.sin_addr.s_addr = htonl(INADDR_ANY);
+
+    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) >= 0)
+        return fd;
+
+    /* bind failed: report it and do not leak the descriptor */
+    perror("bind");
+    close(fd);
+    return -1;
+}
+
+static int block_fp = -1;
+static int bssock = -1;
+
+/**
+ * send_reply: send a reply datagram on the daemon socket
+ *   @peer: address of the client to reply to
+ *   @buffer: message to send
+ *   @len: number of bytes of @buffer to send
+ *
+ *   @return: 0 on success, 1 if sendto() failed
+ */
+int send_reply(struct sockaddr_in *peer, void *buffer, int len) {
+
+    int rc;
+
+#ifdef BSDEBUG
+    fprintf(stdout, "TX: %u bytes op=%u id=0x%llx\n",
+            len, ((bsmsg_t *)buffer)->hdr.operation,
+            ((bsmsg_t *)buffer)->hdr.id);
+#endif
+    rc = sendto(bssock, buffer, len, 0, (struct sockaddr *)peer,
+                sizeof(*peer));
+    if (rc < 0) {
+        perror("send_reply");
+        return 1;
+    }
+
+    return 0;
+}
+
+static bsmsg_t msgbuf;
+
+/**
+ * service_loop: receive block store requests and answer them, forever
+ *
+ * Each datagram is read into the shared msgbuf and dispatched on
+ * hdr.operation.  Packets too short for their operation are logged and
+ * dropped without a reply.  A failed operation is answered with
+ * BSOP_FLAG_ERROR set and an id-sized message.  A successful read is
+ * answered with the full block; successful writes and allocations are
+ * answered with an id-sized message.  This function does not return.
+ */
+void service_loop(void) {
+
+    for (;;) {
+        int rc, len;
+        struct sockaddr_in from;
+        /* recvfrom() takes a socklen_t *, which need not match size_t */
+        socklen_t slen = sizeof(from);
+        u64 bid;
+
+        len = recvfrom(bssock, (void *)&msgbuf, sizeof(msgbuf), 0,
+                       (struct sockaddr *)&from, &slen);
+
+        if (len < 0) {
+            perror("recvfrom");
+            continue;
+        }
+
+        if (len < MSGBUFSIZE_OP) {
+            fprintf(stderr, "Short packet.\n");
+            continue;
+        }
+
+#ifdef BSDEBUG
+        fprintf(stdout, "RX: %u bytes op=%u id=0x%llx\n",
+                len, msgbuf.hdr.operation, msgbuf.hdr.id);
+#endif
+
+        switch (msgbuf.hdr.operation) {
+        case BSOP_READBLOCK:
+            if (len < MSGBUFSIZE_ID) {
+                fprintf(stderr, "Short packet (readblock %u).\n", len);
+                break;
+            }
+            rc = readblock_into(msgbuf.hdr.id, msgbuf.block);
+            if (rc < 0) {
+                fprintf(stderr, "readblock error\n");
+                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
+                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
+                break;
+            }
+            msgbuf.hdr.flags = 0;
+            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_BLOCK);
+            break;
+        case BSOP_WRITEBLOCK:
+            if (len < MSGBUFSIZE_BLOCK) {
+                fprintf(stderr, "Short packet (writeblock %u).\n", len);
+                break;
+            }
+            rc = writeblock(msgbuf.hdr.id, msgbuf.block);
+            if (rc < 0) {
+                fprintf(stderr, "writeblock error\n");
+                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
+                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
+                break;
+            }
+            msgbuf.hdr.flags = 0;
+            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
+            break;
+        case BSOP_ALLOCBLOCK:
+            if (len < MSGBUFSIZE_BLOCK) {
+                fprintf(stderr, "Short packet (allocblock %u).\n", len);
+                break;
+            }
+            bid = allocblock(msgbuf.block);
+            if (bid == ALLOCFAIL) {
+                fprintf(stderr, "allocblock error\n");
+                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
+                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
+                break;
+            }
+            msgbuf.hdr.id = bid;
+            msgbuf.hdr.flags = 0;
+            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
+            break;
+        default:
+            /* no reply is sent, as before, but leave a trace */
+            fprintf(stderr, "Unknown operation %u.\n",
+                    (unsigned int)msgbuf.hdr.operation);
+            break;
+        }
+
+    }
+}
+ 
+/**
+ * readblock_into: read a block from disk into a caller-supplied buffer
+ *   @id: block id to read; the block is located at (id - 1) * BLOCK_SIZE
+ *   @block: buffer that receives BLOCK_SIZE bytes
+ *
+ *   @return: 0 if OK, -1 on a seek error or an incomplete read
+ */
+
+int readblock_into(u64 id, void *block) {
+    off64_t offset = ((off64_t) id - 1LL) * BLOCK_SIZE;
+
+    if (lseek64(block_fp, offset, SEEK_SET) < 0) {
+        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
+        perror("readblock lseek");
+        return -1;
+    }
+
+    /* anything other than one whole block counts as a failure */
+    if (read(block_fp, block, BLOCK_SIZE) == BLOCK_SIZE)
+        return 0;
+
+    perror("readblock read");
+    return -1;
+}
+
+/**
+ * writeblock: write an existing block to disk
+ *   @id: block id; the block is located at (id - 1) * BLOCK_SIZE
+ *   @block: pointer to BLOCK_SIZE bytes to write
+ *
+ *   @return: zero on success, -1 on a seek error or an incomplete write
+ */
+int writeblock(u64 id, void *block) {
+    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
+        perror("writeblock lseek");
+        return -1;
+    }
+    /* a short write leaves a torn block on disk, so it is an error too;
+     * this matches the checks in allocblock() and readblock_into() */
+    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
+        perror("writeblock write");
+        return -1;
+    }
+    return 0;
+}
+
+/**
+ * allocblock: append a new block to the end of the store file
+ *   @block: pointer to BLOCK_SIZE bytes to store
+ *
+ * The id of the new block is its 1-based position in the file.  With
+ * BS_ALLOC_HACK defined, blocks keep being appended until the id reaches
+ * BS_ALLOC_SKIP.
+ *
+ *   @return: new id of block on disk, or ALLOCFAIL on error
+ */
+static u64 lastblock = 0;
+
+u64 allocblock(void *block) {
+    off64_t end;
+    u64 newid;
+
+    retry:
+    end = lseek64(block_fp, 0, SEEK_END);
+    if (end == (off64_t)-1) {
+        perror("allocblock lseek");
+        return ALLOCFAIL;
+    }
+
+    /* the file must consist of whole blocks for ids to map to offsets */
+    if ((end % BLOCK_SIZE) != 0) {
+        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
+        return ALLOCFAIL;
+    }
+
+    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
+        perror("allocblock write");
+        return ALLOCFAIL;
+    }
+
+    newid = end / BLOCK_SIZE + 1;
+
+#ifdef BS_ALLOC_HACK
+    if (newid < BS_ALLOC_SKIP)
+        goto retry;
+#endif
+
+    /* ids are expected to grow; complain if one is handed out twice */
+    if (lastblock >= newid)
+        printf("[*** %Ld alredy allocated! ***]\n", newid);
+
+    lastblock = newid;
+    return newid;
+}
+
+/**
+ * newblock: allocate a zero-filled in-memory block of BLOCK_SIZE bytes
+ *
+ *   @return: pointer to the new block, NULL if allocation failed
+ */
+void *newblock() {
+    /* calloc hands back memory that is already zeroed */
+    void *fresh = calloc(1, BLOCK_SIZE);
+
+    if (fresh != NULL)
+        return fresh;
+
+    perror("newblock");
+    return NULL;
+}
+
+
+/**
+ * freeblock: release an in-memory block
+ *   @block: block to be freed; NULL is accepted and ignored
+ */
+void freeblock(void *block) {
+    /* free() is itself a no-op for NULL, so no explicit test is needed */
+    free(block);
+}
+
+
+/*
+ * Block store daemon entry point: open (creating if necessary) the backing
+ * file "blockstore.dat" in the current directory, bind the service socket
+ * and serve requests.  Returns -1 if either resource cannot be set up.
+ */
+int main(int argc, char **argv)
+{
+    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
+
+    if (block_fp < 0) {
+        perror("open");
+        return -1;
+    }
+
+    bssock = open_socket(BLOCKSTORED_PORT);
+    if (bssock < 0) {
+        /* do not leave the store file open on the failure path */
+        close(block_fp);
+        return -1;
+    }
+
+    service_loop();
+
+    close(bssock);
+    close(block_fp);
+
+    return 0;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/README-PARALLAX
--- a/tools/blktap/README-PARALLAX      Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,177 +0,0 @@
-Parallax Quick Overview
-March 3, 2005
-
-This is intended to provide a quick set of instructions to let you
-guys play with the current parallax source.  In it's current form, the
-code will let you run an arbitrary number of VMs off of a single disk
-image, doing copy-on-write as they make updates.  Each domain is
-assigned a virtual disk image (VDI), which may be based on a snapshot
-of an existing image.  All of the VDI and snapshot management should
-currently work.
-
-The current implementation uses a single file as a blockstore for
-_everything_ this will soon be replaced by the fancier backend code
-and the local cache.  As it stands, Parallax will create
-"blockstore.dat" in the directory that you run it from, and use
-largefile support to make this grow to unfathomable girth.  So, you
-probably want to run the daemon off of a local disk, with a lot of
-free space.
-
-Here's how to get going:
-
-0. Setup:
----------
-
-Pick a local directory on a disk with lots of room.  You should be
-running from a privileged domain (e.g. dom0) with the blocktap
-configured in and block backend NOT.
-
-For convenience (for the moment) copy all of the vdi tools (vdi_*) and
-the parallax daemon from tools/blktap into this directory.
-
-1. Populate the blockstore:
----------------------------
-
-First you need to put at least one image into the blockstore.  You
-will need a disk image, either as a file or local partition.  My
-general approach has been to
-
-(a) make a really big sparse file with 
-
-        dd if=/dev/zero of=./image bs=4K count=1 seek=[big value]
-
-(b) put a filesystem into it
-
-        mkfs.ext3 ./image
-
-(c) mount it using loopback
-
-        mkdir ./mnt
-        mount -o loop ./image
-
-(d) cd into it and untar one of the image files from srg-roots.
-
-        cd mnt
-        tar ...
-
-NOTE: Beware if your system is FC3.  mkfs is not compatible with old
-versions of fedora, and so you don't have much choice but to install
-further fc3 images if you have used the fc3 version of mkfs.
-
-(e) unmount the image
-
-        cd ..
-        umount mnt
-
-(f) now, create a new VDI to hold the image 
-
-        ./vdi_create "My new FC3 VDI"
-
-(g) get the id of the new VDI.
-
-        ./vdi_list
-
-        |      0                     My new FC3 VDI
-
-(0 is the VDI id... create a few more if you want.)
-
-(h) hoover your image into the new VDI.
-
-        ./vdi_fill 0 ./image
-
-This will pull the entire image into the blockstore and set up a
-mapping tree for it for VDI 0.  Passing a device (i.e. /dev/sda3)
-should also work, but vdi_fill has NO notion of sparseness yet, so you
-are going to pump a block into the store for each block you read.
-
-vdi_fill will count up until it is done, and you should be ready to
-go.  If you want to be anal, you can use vdi_validate to test the VDI
-against the original image.
-
-2. Create some extra VDIs
--------------------------
-
-VDIs are actually a list of snapshots, and each snapshot is a full
-image of mappings.  So, to preserve an immutable copy of a current
-VDI, do this:
-
-(a) Snapshot your new VDI.
-
-        ./vdi_snap 0
-
-Snapshotting writes the current radix root to the VDI's snapshot log,
-and assigns it a new writable root.
-
-(b) look at the VDI's snapshot log.
-
-        ./vdi_snap_list 0
-
-        | 16   0      Thu Mar  3 19:27:48 2005 565111           31
-
-The first two columns constitute a snapshot id and represent the
-(block, offset) of the snapshot record.  The Date tells you when the
-snapshot was made, and 31 is the radix root node of the snapshot.
-
-(c) Create a new VDI, based on that snapshot, and look at the list.
-
-        ./vdi_create "FC3 - Copy 1" 16 0
-        ./vdi_list
-
-        |      0                     My new FC3 VDI
-        |      1                       FC3 - Copy 1
-
-NOTE: If you have Graphviz installed on your system, you can use
-vdi_tree to generate a postscript of your current set of VDIs and
-snapshots.
-
-
-Create as many VDIs as you need for the VMs that you want to run.
-
-3. Boot some VMs:
------------------
-
-Parallax currently uses a hack in xend to pass the VDI id, you need to
-modify the disk line of the VM config that is going to mount it.
-
-(a) set up your vm config, by using the following disk line:
-
-        disk = ['parallax:1,sda1,w,0' ]
-
-This example uses VDI 1 (from vdi_list above), presents it as sda1
-(writable), and uses dom 0 as the backend.  If you were running the
-daemon (and tap driver) in some domain other than 0, you would change
-this last parameter.
-
-NOTE: You'll need to have reinstalled xend/tools prior to booting the vm, so 
that it knows what to do with "parallax:".
-
-(b) Run parallax in the backend domain.
-
-        ./parallax
-
-(c) create your new domain.
-
-        xm create ...
-
----
-
-That's pretty much all there is to it at the moment.  Hope this is
-clear enough to get you going.  Now, a few serious caveats that will
-be sorted out in the almost immediate future:
-
-WARNINGS:
----------
-
-1. There is NO locking in the VDI tools at the moment, so I'd avoid
-running them in parallel, or more importantly, running them while the
-daemon is running.
-
-2. I doubt that xend will be very happy about restarting if you have
-parallax-using domains.  So if it dies while there are active parallax
-doms, you may need to reboot.
-
-3. I've turned off write-in-place.  So at the moment, EVERY block
-write is a log append on the blockstore.  I've been having some probs
-with the radix tree's marking of writable blocks after snapshots and
-will sort this out very soon.
-
-
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/block-async.c
--- a/tools/blktap/block-async.c        Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,393 +0,0 @@
-/* block-async.c
- * 
- * Asynchronous block wrappers for parallax.
- */
- 
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <pthread.h>
-#include "block-async.h"
-#include "blockstore.h"
-#include "vdi.h"
-
-
-#if 0
-#define DPRINTF(_f, _a...) printf ( _f , ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-/* We have a queue of outstanding I/O requests implemented as a 
- * circular producer-consumer ring with free-running buffers.
- * to allow reordering, this ring indirects to indexes in an 
- * ring of io_structs.
- * 
- * the block_* calls may either add an entry to this ring and return, 
- * or satisfy the request immediately and call the callback directly.
- * None of the io calls in parallax should be nested enough to worry 
- * about stack problems with this approach.
- */
-
-struct read_args {
-    u64 addr;
-};
-
-struct write_args {
-    u64   addr;
-    char *block;
-};
-
-struct alloc_args {
-    char *block;
-};
- 
-struct pending_io_req {
-    enum {IO_READ, IO_WRITE, IO_ALLOC, IO_RWAKE, IO_WWAKE} op;
-    union {
-        struct read_args  r;
-        struct write_args w;
-        struct alloc_args a;
-    } u;
-    io_cb_t cb;
-    void *param;
-};
-
-void radix_lock_init(struct radix_lock *r)
-{
-    int i;
-    
-    pthread_mutex_init(&r->lock, NULL);
-    for (i=0; i < 1024; i++) {
-        r->lines[i] = 0;
-        r->waiters[i] = NULL;
-        r->state[i] = ANY;
-    }
-}
-
-/* maximum outstanding I/O requests issued asynchronously */
-/* must be a power of 2.*/
-#define MAX_PENDING_IO 1024
-
-/* how many threads to concurrently issue I/O to the disk. */
-#define IO_POOL_SIZE   10
-
-static struct pending_io_req pending_io_reqs[MAX_PENDING_IO];
-static int pending_io_list[MAX_PENDING_IO];
-static unsigned long io_prod = 0, io_cons = 0, io_free = 0;
-#define PENDING_IO_MASK(_x) ((_x) & (MAX_PENDING_IO - 1))
-#define PENDING_IO_IDX(_x) ((_x) - pending_io_reqs)
-#define PENDING_IO_ENT(_x) \
-       (&pending_io_reqs[pending_io_list[PENDING_IO_MASK(_x)]])
-#define CAN_PRODUCE_PENDING_IO ((io_free + MAX_PENDING_IO) != io_prod)
-#define CAN_CONSUME_PENDING_IO (io_cons != io_prod)
-static pthread_mutex_t pending_io_lock = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t  pending_io_cond = PTHREAD_COND_INITIALIZER;
-
-static void init_pending_io(void)
-{
-    int i;
-       
-    for (i=0; i<MAX_PENDING_IO; i++)
-        pending_io_list[i] = i;
-               
-} 
-
-void block_read(u64 addr, io_cb_t cb, void *param)
-{
-    struct pending_io_req *req;
-    
-    pthread_mutex_lock(&pending_io_lock);
-    assert(CAN_PRODUCE_PENDING_IO);
-    
-    req = PENDING_IO_ENT(io_prod++);
-    DPRINTF("Produce (R) %lu (%p)\n", io_prod - 1, req);
-    req->op = IO_READ;
-    req->u.r.addr = addr;
-    req->cb = cb;
-    req->param = param;
-    
-    pthread_cond_signal(&pending_io_cond);
-    pthread_mutex_unlock(&pending_io_lock);    
-}
-
-
-void block_write(u64 addr, char *block, io_cb_t cb, void *param)
-{
-    struct pending_io_req *req;
-    
-    pthread_mutex_lock(&pending_io_lock);
-    assert(CAN_PRODUCE_PENDING_IO);
-    
-    req = PENDING_IO_ENT(io_prod++);
-    DPRINTF("Produce (W) %lu (%p)\n", io_prod - 1, req);
-    req->op = IO_WRITE;
-    req->u.w.addr  = addr;
-    req->u.w.block = block;
-    req->cb = cb;
-    req->param = param;
-    
-    pthread_cond_signal(&pending_io_cond);
-    pthread_mutex_unlock(&pending_io_lock);    
-}
-
-
-void block_alloc(char *block, io_cb_t cb, void *param)
-{
-    struct pending_io_req *req;
-       
-    pthread_mutex_lock(&pending_io_lock);
-    assert(CAN_PRODUCE_PENDING_IO);
-    
-    req = PENDING_IO_ENT(io_prod++);
-    req->op = IO_ALLOC;
-    req->u.a.block = block;
-    req->cb = cb;
-    req->param = param;
-    
-    pthread_cond_signal(&pending_io_cond);
-    pthread_mutex_unlock(&pending_io_lock);    
-}
-
-void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
-{
-    struct io_ret ret;
-    pthread_mutex_lock(&r->lock);
-    
-    if (( r->lines[row] >= 0 ) && (r->state[row] != STOP)) {
-        r->lines[row]++;
-        r->state[row] = READ;
-        DPRINTF("RLOCK  : %3d (row: %d)\n", r->lines[row], row);
-        pthread_mutex_unlock(&r->lock);
-        ret.type = IO_INT_T;
-        ret.u.i = 0;
-        cb(ret, param);
-    } else {
-        struct radix_wait **rwc;
-        struct radix_wait *rw = 
-            (struct radix_wait *) malloc (sizeof(struct radix_wait));
-        DPRINTF("RLOCK  : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
-        rw->type  = RLOCK;
-        rw->param = param;
-        rw->cb    = cb;
-        rw->next  = NULL;
-        /* append to waiters list. */
-        rwc = &r->waiters[row];
-        while (*rwc != NULL) rwc = &(*rwc)->next;
-        *rwc = rw;
-        pthread_mutex_unlock(&r->lock);
-        return;
-    }
-}
-
-
-void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
-{
-    struct io_ret ret;
-    pthread_mutex_lock(&r->lock);
-    
-    /* the second check here is redundant -- just here for debugging now. */
-    if ((r->state[row] == ANY) && ( r->lines[row] == 0 )) {
-        r->state[row] = STOP;
-        r->lines[row] = -1;
-        DPRINTF("WLOCK  : %3d (row: %d)\n", r->lines[row], row);
-        pthread_mutex_unlock(&r->lock);
-        ret.type = IO_INT_T;
-        ret.u.i = 0;
-        cb(ret, param);
-    } else {
-        struct radix_wait **rwc;
-        struct radix_wait *rw = 
-            (struct radix_wait *) malloc (sizeof(struct radix_wait));
-        DPRINTF("WLOCK  : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row);
-        rw->type  = WLOCK;
-        rw->param = param;
-        rw->cb    = cb;
-        rw->next  = NULL;
-        /* append to waiters list. */
-        rwc = &r->waiters[row];
-        while (*rwc != NULL) rwc = &(*rwc)->next;
-        *rwc = rw;
-        pthread_mutex_unlock(&r->lock);
-        return;
-    }
-       
-}
-
-/* called with radix_lock locked and lock count of zero. */
-static void wake_waiters(struct radix_lock *r, int row)
-{
-    struct pending_io_req *req;
-    struct radix_wait *rw;
-    
-    if (r->lines[row] != 0) return;
-    if (r->waiters[row] == NULL) return; 
-    
-    if (r->waiters[row]->type == WLOCK) {
-
-        rw = r->waiters[row];
-        pthread_mutex_lock(&pending_io_lock);
-        assert(CAN_PRODUCE_PENDING_IO);
-        
-        req = PENDING_IO_ENT(io_prod++);
-        req->op    = IO_WWAKE;
-        req->cb    = rw->cb;
-        req->param = rw->param;
-        r->lines[row] = -1; /* write lock the row. */
-        r->state[row] = STOP;
-        r->waiters[row] = rw->next;
-        free(rw);
-        pthread_mutex_unlock(&pending_io_lock);
-    
-    } else /* RLOCK */ {
-
-        while ((r->waiters[row] != NULL) && (r->waiters[row]->type == RLOCK)) {
-            rw = r->waiters[row];
-            pthread_mutex_lock(&pending_io_lock);
-            assert(CAN_PRODUCE_PENDING_IO);
-            
-            req = PENDING_IO_ENT(io_prod++);
-            req->op    = IO_RWAKE;
-            req->cb    = rw->cb;
-            req->param = rw->param;
-            r->lines[row]++; /* read lock the row. */
-            r->state[row] = READ; 
-            r->waiters[row] = rw->next;
-            free(rw);
-            pthread_mutex_unlock(&pending_io_lock);
-        }
-
-        if (r->waiters[row] != NULL) /* There is a write queued still */
-            r->state[row] = STOP;
-    }  
-    
-    pthread_mutex_lock(&pending_io_lock);
-    pthread_cond_signal(&pending_io_cond);
-    pthread_mutex_unlock(&pending_io_lock);
-}
-
-void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
-{
-    struct io_ret ret;
-       
-    pthread_mutex_lock(&r->lock);
-    assert(r->lines[row] > 0); /* try to catch misuse. */
-    r->lines[row]--;
-    if (r->lines[row] == 0) {
-        r->state[row] = ANY;
-        wake_waiters(r, row);
-    }
-    pthread_mutex_unlock(&r->lock);
-    cb(ret, param);
-}
-
-void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param)
-{
-    struct io_ret ret;
-    
-    pthread_mutex_lock(&r->lock);
-    assert(r->lines[row] == -1); /* try to catch misuse. */
-    r->lines[row] = 0;
-    r->state[row] = ANY;
-    wake_waiters(r, row);
-    pthread_mutex_unlock(&r->lock);
-    cb(ret, param);
-}
-
-/* consumer calls */
-static void do_next_io_req(struct pending_io_req *req)
-{
-    struct io_ret          ret;
-    void  *param;
-    
-    switch (req->op) {
-    case IO_READ:
-        ret.type = IO_BLOCK_T;
-        ret.u.b  = readblock(req->u.r.addr);
-        break;
-    case IO_WRITE:
-        ret.type = IO_INT_T;
-        ret.u.i  = writeblock(req->u.w.addr, req->u.w.block);
-        DPRINTF("wrote %d at %Lu\n", *(int *)(req->u.w.block), req->u.w.addr);
-        break;
-    case IO_ALLOC:
-        ret.type = IO_ADDR_T;
-        ret.u.a  = allocblock(req->u.a.block);
-        break;
-    case IO_RWAKE:
-        DPRINTF("WAKE DEFERRED RLOCK!\n");
-        ret.type = IO_INT_T;
-        ret.u.i  = 0;
-        break;
-    case IO_WWAKE:
-        DPRINTF("WAKE DEFERRED WLOCK!\n");
-        ret.type = IO_INT_T;
-        ret.u.i  = 0;
-        break;
-    default:
-        DPRINTF("Unknown IO operation on pending list!\n");
-        return;
-    }
-    
-    param = req->param;
-    pthread_mutex_lock(&pending_io_lock);
-    pending_io_list[PENDING_IO_MASK(io_free++)] = PENDING_IO_IDX(req);
-    pthread_mutex_unlock(&pending_io_lock);
-       
-    assert(req->cb != NULL);
-    req->cb(ret, param);
-    
-}
-
-void *io_thread(void *param) 
-{
-    int tid;
-    struct pending_io_req *req;
-    
-    /* Set this thread's tid. */
-    tid = *(int *)param;
-    free(param);
-    
-start:
-    pthread_mutex_lock(&pending_io_lock);
-    while (io_prod == io_cons) {
-        pthread_cond_wait(&pending_io_cond, &pending_io_lock);
-    }
-    
-    if (io_prod == io_cons) {
-        /* unnecessary wakeup. */
-        pthread_mutex_unlock(&pending_io_lock);
-        goto start;
-    }
-    
-    req = PENDING_IO_ENT(io_cons++);
-    pthread_mutex_unlock(&pending_io_lock);
-       
-    do_next_io_req(req);
-    
-    goto start;
-       
-}
-
-static pthread_t io_pool[IO_POOL_SIZE];
-void start_io_threads(void)
-
-{      
-    int i, tid=0;
-    
-    for (i=0; i < IO_POOL_SIZE; i++) {
-        int ret, *t;
-        t = (int *)malloc(sizeof(int));
-        *t = tid++;
-        ret = pthread_create(&io_pool[i], NULL, io_thread, t);
-        if (ret != 0) printf("Error starting thread %d\n", i);
-    }
-       
-}
-
-void init_block_async(void)
-{
-    init_pending_io();
-    start_io_threads();
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/block-async.h
--- a/tools/blktap/block-async.h        Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,69 +0,0 @@
-/* block-async.h
- * 
- * Asynchronous block wrappers for parallax.
- */
- 
-#ifndef _BLOCKASYNC_H_
-#define _BLOCKASYNC_H_
-
-#include <assert.h>
-#include <xc.h>
-#include "vdi.h"
-
-struct io_ret
-{
-    enum {IO_ADDR_T, IO_BLOCK_T, IO_INT_T} type;
-    union {
-        u64   a;
-        char *b;
-        int   i;
-    } u;
-};
-
-typedef void (*io_cb_t)(struct io_ret r, void *param);
-
-/* per-vdi lock structures to make sure requests run in a safe order. */
-struct radix_wait {
-    enum {RLOCK, WLOCK} type;
-    io_cb_t  cb;
-    void    *param;
-    struct radix_wait *next;
-};
-
-struct radix_lock {
-    pthread_mutex_t lock;
-    int                    lines[1024];
-    struct radix_wait     *waiters[1024];
-    enum {ANY, READ, STOP} state[1024];
-};
-void radix_lock_init(struct radix_lock *r);
-
-void block_read(u64 addr, io_cb_t cb, void *param);
-void block_write(u64 addr, char *block, io_cb_t cb, void *param);
-void block_alloc(char *block, io_cb_t cb, void *param);
-void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
-void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
-void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
-void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
-void init_block_async(void);
-
-static inline u64 IO_ADDR(struct io_ret r)
-{
-    assert(r.type == IO_ADDR_T);
-    return r.u.a;
-}
-
-static inline char *IO_BLOCK(struct io_ret r)
-{
-    assert(r.type == IO_BLOCK_T);
-    return r.u.b;
-}
-
-static inline int IO_INT(struct io_ret r)
-{
-    assert(r.type == IO_INT_T);
-    return r.u.i;
-}
-
-
-#endif //_BLOCKASYNC_H_
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/blockstore.c
--- a/tools/blktap/blockstore.c Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,1350 +0,0 @@
-/**************************************************************************
- * 
- * blockstore.c
- *
- * Simple block store interface
- *
- */
- 
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <stdarg.h>
-#include "blockstore.h"
-#include <pthread.h>
-
-//#define BLOCKSTORE_REMOTE
-//#define BSDEBUG
-
-#define RETRY_TIMEOUT 1000000 /* microseconds */
-
-/*****************************************************************************
- * Debugging
- */
-#ifdef BSDEBUG
-void DB(char *format, ...)
-{
-    va_list args;
-    fprintf(stderr, "[%05u] ", (int)pthread_getspecific(tid_key));
-    va_start(args, format);
-    vfprintf(stderr, format, args);
-    va_end(args);
-}
-#else
-#define DB(format, ...) (void)0
-#endif
-
-#ifdef BLOCKSTORE_REMOTE
-
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <netinet/in.h>
-#include <netdb.h>
-
-/*****************************************************************************
- * Network state                                                             *
- *****************************************************************************/
-
-/* The individual disk servers we talks to. These will be referenced by
- * an integer index into bsservers[].
- */
-bsserver_t bsservers[MAX_SERVERS];
-
-/* The cluster map. This is indexed by an integer cluster number.
- */
-bscluster_t bsclusters[MAX_CLUSTERS];
-
-/* Local socket.
- */
-struct sockaddr_in sin_local;
-int bssock = 0;
-
-/*****************************************************************************
- * Notification                                                              *
- *****************************************************************************/
-
-typedef struct pool_thread_t_struct {
-    pthread_mutex_t ptmutex;
-    pthread_cond_t ptcv;
-    int newdata;
-} pool_thread_t;
-
-pool_thread_t pool_thread[READ_POOL_SIZE+1];
-
-#define RECV_NOTIFY(tid) { \
-    pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
-    pool_thread[tid].newdata = 1; \
-    DB("CV Waking %u", tid); \
-    pthread_cond_signal(&(pool_thread[tid].ptcv)); \
-    pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
-#define RECV_AWAIT(tid) { \
-    pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \
-    if (pool_thread[tid].newdata) { \
-        pool_thread[tid].newdata = 0; \
-        DB("CV Woken %u", tid); \
-    } \
-    else { \
-        DB("CV Waiting %u", tid); \
-        pthread_cond_wait(&(pool_thread[tid].ptcv), \
-                          &(pool_thread[tid].ptmutex)); \
-    } \
-    pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); }
-
-/*****************************************************************************
- * Message queue management                                                  *
- *****************************************************************************/
-
-/* Protects the queue manipulation critcal regions.
- */
-pthread_mutex_t ptmutex_queue;
-#define ENTER_QUEUE_CR pthread_mutex_lock(&ptmutex_queue)
-#define LEAVE_QUEUE_CR pthread_mutex_unlock(&ptmutex_queue)
-
-pthread_mutex_t ptmutex_recv;
-#define ENTER_RECV_CR pthread_mutex_lock(&ptmutex_recv)
-#define LEAVE_RECV_CR pthread_mutex_unlock(&ptmutex_recv)
-
-/* A message queue entry. We allocate one of these for every request we send.
- * Asynchronous reply reception also used one of these.
- */
-typedef struct bsq_t_struct {
-    struct bsq_t_struct *prev;
-    struct bsq_t_struct *next;
-    int status;
-    int server;
-    int length;
-    struct msghdr msghdr;
-    struct iovec iov[2];
-    int tid;
-    struct timeval tv_sent;
-    bshdr_t message;
-    void *block;
-} bsq_t;
-
-#define BSQ_STATUS_MATCHED 1
-
-pthread_mutex_t ptmutex_luid;
-#define ENTER_LUID_CR pthread_mutex_lock(&ptmutex_luid)
-#define LEAVE_LUID_CR pthread_mutex_unlock(&ptmutex_luid)
-
-static u64 luid_cnt = 0x1000ULL;
-u64 new_luid(void) {
-    u64 luid;
-    ENTER_LUID_CR;
-    luid = luid_cnt++;
-    LEAVE_LUID_CR;
-    return luid;
-}
-
-/* Queue of outstanding requests.
- */
-bsq_t *bs_head = NULL;
-bsq_t *bs_tail = NULL;
-int bs_qlen = 0;
-
-/*
- */
-void queuedebug(char *msg) {
-    bsq_t *q;
-    ENTER_QUEUE_CR;
-    fprintf(stderr, "Q: %s len=%u\n", msg, bs_qlen);
-    for (q = bs_head; q; q = q->next) {
-        fprintf(stderr, "  luid=%016llx server=%u\n",
-                q->message.luid, q->server);
-    }
-    LEAVE_QUEUE_CR;
-}
-
-int enqueue(bsq_t *qe) {
-    ENTER_QUEUE_CR;
-    qe->next = NULL;
-    qe->prev = bs_tail;
-    if (!bs_head)
-        bs_head = qe;
-    else
-        bs_tail->next = qe;
-    bs_tail = qe;
-    bs_qlen++;
-    LEAVE_QUEUE_CR;
-#ifdef BSDEBUG
-    queuedebug("enqueue");
-#endif
-    return 0;
-}
-
-int dequeue(bsq_t *qe) {
-    bsq_t *q;
-    ENTER_QUEUE_CR;
-    for (q = bs_head; q; q = q->next) {
-        if (q == qe) {
-            if (q->prev)
-                q->prev->next = q->next;
-            else 
-                bs_head = q->next;
-            if (q->next)
-                q->next->prev = q->prev;
-            else
-                bs_tail = q->prev;
-            bs_qlen--;
-            goto found;
-        }
-    }
-
-    LEAVE_QUEUE_CR;
-#ifdef BSDEBUG
-    queuedebug("dequeue not found");
-#endif
-    return 0;
-
-    found:
-    LEAVE_QUEUE_CR;
-#ifdef BSDEBUG
-    queuedebug("dequeue not found");
-#endif
-    return 1;
-}
-
-bsq_t *queuesearch(bsq_t *qe) {
-    bsq_t *q;
-    ENTER_QUEUE_CR;
-    for (q = bs_head; q; q = q->next) {
-        if ((qe->server == q->server) &&
-            (qe->message.operation == q->message.operation) &&
-            (qe->message.luid == q->message.luid)) {
-
-            if ((q->message.operation == BSOP_READBLOCK) &&
-                ((q->message.flags & BSOP_FLAG_ERROR) == 0)) {
-                q->block = qe->block;
-                qe->block = NULL;
-            }
-            q->length = qe->length;
-            q->message.flags = qe->message.flags;
-            q->message.id = qe->message.id;
-            q->status |= BSQ_STATUS_MATCHED;
-
-            if (q->prev)
-                q->prev->next = q->next;
-            else 
-                bs_head = q->next;
-            if (q->next)
-                q->next->prev = q->prev;
-            else
-                bs_tail = q->prev;
-            q->next = NULL;
-            q->prev = NULL;
-            bs_qlen--;
-            goto found;
-        }
-    }
-
-    LEAVE_QUEUE_CR;
-#ifdef BSDEBUG
-    queuedebug("queuesearch not found");
-#endif
-    return NULL;
-
-    found:
-    LEAVE_QUEUE_CR;
-#ifdef BSDEBUG
-    queuedebug("queuesearch found");
-#endif
-    return q;
-}
-
-/*****************************************************************************
- * Network communication                                                     *
- *****************************************************************************/
-
-int send_message(bsq_t *qe) {
-    int rc;
-
-    qe->msghdr.msg_name = (void *)&(bsservers[qe->server].sin);
-    qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
-    qe->msghdr.msg_iov = qe->iov;
-    if (qe->block)
-        qe->msghdr.msg_iovlen = 2;
-    else
-        qe->msghdr.msg_iovlen = 1;
-    qe->msghdr.msg_control = NULL;
-    qe->msghdr.msg_controllen = 0;
-    qe->msghdr.msg_flags = 0;
-
-    qe->iov[0].iov_base = (void *)&(qe->message);
-    qe->iov[0].iov_len = MSGBUFSIZE_ID;
-
-    if (qe->block) {
-        qe->iov[1].iov_base = qe->block;
-        qe->iov[1].iov_len = BLOCK_SIZE;
-    }
-
-    qe->message.luid = new_luid();
-
-    qe->status = 0;
-    qe->tid = (int)pthread_getspecific(tid_key);
-    if (enqueue(qe) < 0) {
-        fprintf(stderr, "Error enqueuing request.\n");
-        return -1;
-    }
-
-    gettimeofday(&(qe->tv_sent), NULL);
-    DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid);
-    rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
-    //rc = sendto(bssock, (void *)&(qe->message), qe->length, 0,
-    //           (struct sockaddr *)&(bsservers[qe->server].sin),
-    //           sizeof(struct sockaddr_in));
-    if (rc < 0)
-        return rc;
-
-    return rc;
-}
-
-int recv_message(bsq_t *qe) {
-    struct sockaddr_in from;
-    //int flen = sizeof(from);
-    int rc;
-
-    qe->msghdr.msg_name = &from;
-    qe->msghdr.msg_namelen = sizeof(struct sockaddr_in);
-    qe->msghdr.msg_iov = qe->iov;
-    if (qe->block)
-        qe->msghdr.msg_iovlen = 2;
-    else
-        qe->msghdr.msg_iovlen = 1;
-    qe->msghdr.msg_control = NULL;
-    qe->msghdr.msg_controllen = 0;
-    qe->msghdr.msg_flags = 0;
-
-    qe->iov[0].iov_base = (void *)&(qe->message);
-    qe->iov[0].iov_len = MSGBUFSIZE_ID;
-    if (qe->block) {
-        qe->iov[1].iov_base = qe->block;
-        qe->iov[1].iov_len = BLOCK_SIZE;
-    }
-
-    rc = recvmsg(bssock, &(qe->msghdr), 0);
-
-    //return recvfrom(bssock, (void *)&(qe->message), sizeof(bsmsg_t), 0,
-    //               (struct sockaddr *)&from, &flen);
-    return rc;
-}
-
-int get_server_number(struct sockaddr_in *sin) {
-    int i;
-
-#ifdef BSDEBUG2
-    fprintf(stderr,
-            "get_server_number(%u.%u.%u.%u/%u)\n",
-            (unsigned int)sin->sin_addr.s_addr & 0xff,
-            ((unsigned int)sin->sin_addr.s_addr >> 8) & 0xff,
-            ((unsigned int)sin->sin_addr.s_addr >> 16) & 0xff,
-            ((unsigned int)sin->sin_addr.s_addr >> 24) & 0xff,
-            (unsigned int)sin->sin_port);
-#endif
-
-    for (i = 0; i < MAX_SERVERS; i++) {
-        if (bsservers[i].hostname) {
-#ifdef BSDEBUG2
-            fprintf(stderr,
-                    "get_server_number check %u.%u.%u.%u/%u\n",
-                    (unsigned int)bsservers[i].sin.sin_addr.s_addr&0xff,
-                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 8)&0xff,
-                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 16)&0xff,
-                    ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 24)&0xff,
-                    (unsigned int)bsservers[i].sin.sin_port);
-#endif
-            if ((sin->sin_family == bsservers[i].sin.sin_family) &&
-                (sin->sin_port == bsservers[i].sin.sin_port) &&
-                (memcmp((void *)&(sin->sin_addr),
-                        (void *)&(bsservers[i].sin.sin_addr),
-                        sizeof(struct in_addr)) == 0)) {
-                return i;
-            }
-        }        
-    }
-
-    return -1;
-}
-
-void *rx_buffer = NULL;
-bsq_t rx_qe;
-bsq_t *recv_any(void) {
-    struct sockaddr_in from;
-    int rc;
-    
-    DB("ENTER recv_any\n");
-
-    rx_qe.msghdr.msg_name = &from;
-    rx_qe.msghdr.msg_namelen = sizeof(struct sockaddr_in);
-    rx_qe.msghdr.msg_iov = rx_qe.iov;
-    if (!rx_buffer) {
-        rx_buffer = malloc(BLOCK_SIZE);
-        if (!rx_buffer) {
-            perror("recv_any malloc");
-            return NULL;
-        }
-    }
-    rx_qe.block = rx_buffer;
-    rx_buffer = NULL;
-    rx_qe.msghdr.msg_iovlen = 2;
-    rx_qe.msghdr.msg_control = NULL;
-    rx_qe.msghdr.msg_controllen = 0;
-    rx_qe.msghdr.msg_flags = 0;
-    
-    rx_qe.iov[0].iov_base = (void *)&(rx_qe.message);
-    rx_qe.iov[0].iov_len = MSGBUFSIZE_ID;
-    rx_qe.iov[1].iov_base = rx_qe.block;
-    rx_qe.iov[1].iov_len = BLOCK_SIZE;
-
-    rc = recvmsg(bssock, &(rx_qe.msghdr), 0);
-    if (rc < 0) {
-        perror("recv_any");
-        return NULL;
-    }
-
-    rx_qe.length = rc;    
-    rx_qe.server = get_server_number(&from);
-
-    DB("recv_any from %d luid=%016llx len=%u\n",
-       rx_qe.server, rx_qe.message.luid, rx_qe.length);
-
-    return &rx_qe;
-}
-
-void recv_recycle_buffer(bsq_t *q) {
-    if (q->block) {
-        rx_buffer = q->block;
-        q->block = NULL;
-    }
-}
-
-// cycle through reading any incoming, searching for a match in the
-// queue, until we have all we need.
-int wait_recv(bsq_t **reqs, int numreqs) {
-    bsq_t *q, *m;
-    unsigned int x, i;
-    int tid = (int)pthread_getspecific(tid_key);
-
-    DB("ENTER wait_recv %u\n", numreqs);
-
-    checkmatch:
-    x = 0xffffffff;
-    for (i = 0; i < numreqs; i++) {
-        x &= reqs[i]->status;
-    }
-    if ((x & BSQ_STATUS_MATCHED)) {
-        DB("LEAVE wait_recv\n");
-        return numreqs;
-    }
-
-    RECV_AWAIT(tid);
-
-    /*
-    rxagain:
-    ENTER_RECV_CR;
-    q = recv_any();
-    LEAVE_RECV_CR;
-    if (!q)
-        return -1;
-
-    m = queuesearch(q);
-    recv_recycle_buffer(q);
-    if (!m) {
-        fprintf(stderr, "Unmatched RX\n");
-        goto rxagain;
-    }
-    */
-
-    goto checkmatch;
-
-}
-
-/* retry
- */
-static int retry_count = 0;
-int retry(bsq_t *qe)
-{
-    int rc;
-    gettimeofday(&(qe->tv_sent), NULL);
-    DB("retry to %d luid=%016llx\n", qe->server, qe->message.luid);
-    retry_count++;
-    rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT);
-    if (rc < 0)
-        return rc;
-    return 0;
-}
-
-/* queue runner
- */
-void *queue_runner(void *arg)
-{
-    for (;;) {
-        struct timeval now;
-        long long nowus, sus;
-        bsq_t *q;
-        int r;
-
-        sleep(1);
-
-        gettimeofday(&now, NULL);
-        nowus = now.tv_usec + now.tv_sec * 1000000;
-        ENTER_QUEUE_CR;
-        r = retry_count;
-        for (q = bs_head; q; q = q->next) {
-            sus = q->tv_sent.tv_usec + q->tv_sent.tv_sec * 1000000;
-            if ((nowus - sus) > RETRY_TIMEOUT) {
-                if (retry(q) < 0) {
-                    fprintf(stderr, "Error on sendmsg retry.\n");
-                }
-            }
-        }
-        if (r != retry_count) {
-            fprintf(stderr, "RETRIES: %u %u\n", retry_count - r, retry_count);
-        }
-        LEAVE_QUEUE_CR;
-    }
-}
-
-/* receive loop
- */
-void *receive_loop(void *arg)
-{
-    bsq_t *q, *m;
-
-    for(;;) {
-        q = recv_any();
-        if (!q) {
-            fprintf(stderr, "recv_any error\n");
-        }
-        else {
-            m = queuesearch(q);
-            recv_recycle_buffer(q);
-            if (!m) {
-                fprintf(stderr, "Unmatched RX\n");
-            }
-            else {
-                DB("RX MATCH");
-                RECV_NOTIFY(m->tid);
-            }
-        }
-    }
-}
-pthread_t pthread_recv;
-
-/*****************************************************************************
- * Reading                                                                   *
- *****************************************************************************/
-
-void *readblock_indiv(int server, u64 id) {
-    void *block;
-    bsq_t *qe;
-    int len, rc;
-
-    qe = (bsq_t *)malloc(sizeof(bsq_t));
-    if (!qe) {
-        perror("readblock qe malloc");
-        return NULL;
-    }
-    qe->block = NULL;
-    
-    /*
-    qe->block = malloc(BLOCK_SIZE);
-    if (!qe->block) {
-        perror("readblock qe malloc");
-        free((void *)qe);
-        return NULL;
-    }
-    */
-
-    qe->server = server;
-
-    qe->message.operation = BSOP_READBLOCK;
-    qe->message.flags = 0;
-    qe->message.id = id;
-    qe->length = MSGBUFSIZE_ID;
-
-    if (send_message(qe) < 0) {
-        perror("readblock sendto");
-        goto err;
-    }
-    
-    /*len = recv_message(qe);
-    if (len < 0) {
-        perror("readblock recv");
-        goto err;
-    }*/
-
-    rc = wait_recv(&qe, 1);
-    if (rc < 0) {
-        perror("readblock recv");
-        goto err;
-    }
-
-    if ((qe->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "readblock server error\n");
-        goto err;
-    }
-    if (qe->length < MSGBUFSIZE_BLOCK) {
-        fprintf(stderr, "readblock recv short (%u)\n", len);
-        goto err;
-    }
-    /* if ((block = malloc(BLOCK_SIZE)) == NULL) {
-        perror("readblock malloc");
-        goto err;
-    }
-    memcpy(block, qe->message.block, BLOCK_SIZE);
-    */    
-    block = qe->block;
-
-    free((void *)qe);
-    return block;
-
-    err:
-    if (qe->block)
-        free(qe->block);
-    free((void *)qe);
-    return NULL;
-}
-
-/**
- * readblock: read a block from disk
- *   @id: block id to read
- *
- *   @return: pointer to block, NULL on error
- */
-void *readblock(u64 id) {
-    int map = (int)BSID_MAP(id);
-    u64 xid;
-    static int i = CLUSTER_MAX_REPLICAS - 1;
-    void *block = NULL;
-
-    /* special case for the "superblock" just use the first block on the
-     * first replica. (extend to blocks < 6 for vdi bug)
-     */
-    if (id < 6) {
-        block = readblock_indiv(bsclusters[map].servers[0], id);
-        goto out;
-    }
-
-    i++;
-    if (i >= CLUSTER_MAX_REPLICAS)
-        i = 0;
-    switch (i) {
-    case 0:
-        xid = BSID_REPLICA0(id);
-        break;
-    case 1:
-        xid = BSID_REPLICA1(id);
-        break;
-    case 2:
-        xid = BSID_REPLICA2(id);
-        break;
-    }
-    
-    block = readblock_indiv(bsclusters[map].servers[i], xid);
-
-    out:
-#ifdef BSDEBUG
-    if (block)
-        fprintf(stderr, "READ:  %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
-                id,
-                (unsigned int)((unsigned char *)block)[0],
-                (unsigned int)((unsigned char *)block)[1],
-                (unsigned int)((unsigned char *)block)[2],
-                (unsigned int)((unsigned char *)block)[3],
-                (unsigned int)((unsigned char *)block)[4],
-                (unsigned int)((unsigned char *)block)[5],
-                (unsigned int)((unsigned char *)block)[6],
-                (unsigned int)((unsigned char *)block)[7]);
-    else
-        fprintf(stderr, "READ:  %016llx NULL\n", id);
-#endif
-    return block;
-}
-
-/*****************************************************************************
- * Writing                                                                   *
- *****************************************************************************/
-
-bsq_t *writeblock_indiv(int server, u64 id, void *block) {
-
-    bsq_t *qe;
-    int len;
-
-    qe = (bsq_t *)malloc(sizeof(bsq_t));
-    if (!qe) {
-        perror("writeblock qe malloc");
-        goto err;
-    }
-    qe->server = server;
-
-    qe->message.operation = BSOP_WRITEBLOCK;
-    qe->message.flags = 0;
-    qe->message.id = id;
-    //memcpy(qe->message.block, block, BLOCK_SIZE);
-    qe->block = block;
-    qe->length = MSGBUFSIZE_BLOCK;
-
-    if (send_message(qe) < 0) {
-        perror("writeblock sendto");
-        goto err;
-    }
-
-    return qe;
-
-    err:
-    free((void *)qe);
-    return NULL;
-}
-    
-
-/**
- * writeblock: write an existing block to disk
- *   @id: block id
- *   @block: pointer to block
- *
- *   @return: zero on success, -1 on failure
- */
-int writeblock(u64 id, void *block) {
-    
-    int map = (int)BSID_MAP(id);
-    int rep0 = bsclusters[map].servers[0];
-    int rep1 = bsclusters[map].servers[1];
-    int rep2 = bsclusters[map].servers[2];
-    bsq_t *reqs[3];
-    int rc;
-
-    reqs[0] = reqs[1] = reqs[2] = NULL;
-
-#ifdef BSDEBUG
-    fprintf(stderr,
-            "WRITE: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
-            id,
-            (unsigned int)((unsigned char *)block)[0],
-            (unsigned int)((unsigned char *)block)[1],
-            (unsigned int)((unsigned char *)block)[2],
-            (unsigned int)((unsigned char *)block)[3],
-            (unsigned int)((unsigned char *)block)[4],
-            (unsigned int)((unsigned char *)block)[5],
-            (unsigned int)((unsigned char *)block)[6],
-            (unsigned int)((unsigned char *)block)[7]);
-#endif
-
-    /* special case for the "superblock" just use the first block on the
-     * first replica. (extend to blocks < 6 for vdi bug)
-     */
-    if (id < 6) {
-        reqs[0] = writeblock_indiv(rep0, id, block);
-        if (!reqs[0])
-            return -1;
-        rc = wait_recv(reqs, 1);
-        return rc;
-    }
-
-    reqs[0] = writeblock_indiv(rep0, BSID_REPLICA0(id), block);
-    if (!reqs[0])
-        goto err;
-    reqs[1] = writeblock_indiv(rep1, BSID_REPLICA1(id), block);
-    if (!reqs[1])
-        goto err;
-    reqs[2] = writeblock_indiv(rep2, BSID_REPLICA2(id), block);
-    if (!reqs[2])
-        goto err;
-
-    rc = wait_recv(reqs, 3);
-    if (rc < 0) {
-        perror("writeblock recv");
-        goto err;
-    }
-    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "writeblock server0 error\n");
-        goto err;
-    }
-    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "writeblock server1 error\n");
-        goto err;
-    }
-    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "writeblock server2 error\n");
-        goto err;
-    }
-
-
-    free((void *)reqs[0]);
-    free((void *)reqs[1]);
-    free((void *)reqs[2]);
-    return 0;
-
-    err:
-    if (reqs[0]) {
-        dequeue(reqs[0]);
-        free((void *)reqs[0]);
-    }
-    if (reqs[1]) {
-        dequeue(reqs[1]);
-        free((void *)reqs[1]);
-    }
-    if (reqs[2]) {
-        dequeue(reqs[2]);
-        free((void *)reqs[2]);
-    }
-    return -1;
-}
-
-/*****************************************************************************
- * Allocation                                                                *
- *****************************************************************************/
-
-/**
- * allocblock: write a new block to disk
- *   @block: pointer to block
- *
- *   @return: new id of block on disk
- */
-u64 allocblock(void *block) {
-    return allocblock_hint(block, 0);
-}
-
-bsq_t *allocblock_hint_indiv(int server, void *block, u64 hint) {
-    bsq_t *qe;
-    int len;
-
-    qe = (bsq_t *)malloc(sizeof(bsq_t));
-    if (!qe) {
-        perror("allocblock_hint qe malloc");
-        goto err;
-    }
-    qe->server = server;
-
-    qe->message.operation = BSOP_ALLOCBLOCK;
-    qe->message.flags = 0;
-    qe->message.id = hint;
-    //memcpy(qe->message.block, block, BLOCK_SIZE);
-    qe->block = block;
-    qe->length = MSGBUFSIZE_BLOCK;
-
-    if (send_message(qe) < 0) {
-        perror("allocblock_hint sendto");
-        goto err;
-    }
-    
-    return qe;
-
-    err:
-    free((void *)qe);
-    return NULL;
-}
-
-/**
- * allocblock_hint: write a new block to disk
- *   @block: pointer to block
- *   @hint: allocation hint
- *
- *   @return: new id of block on disk
- */
-u64 allocblock_hint(void *block, u64 hint) {
-    int map = (int)hint;
-    int rep0 = bsclusters[map].servers[0];
-    int rep1 = bsclusters[map].servers[1];
-    int rep2 = bsclusters[map].servers[2];
-    bsq_t *reqs[3];
-    int rc;
-    u64 id0, id1, id2;
-
-    reqs[0] = reqs[1] = reqs[2] = NULL;
-
-    DB("ENTER allocblock\n");
-
-    reqs[0] = allocblock_hint_indiv(rep0, block, hint);
-    if (!reqs[0])
-        goto err;
-    reqs[1] = allocblock_hint_indiv(rep1, block, hint);
-    if (!reqs[1])
-        goto err;
-    reqs[2] = allocblock_hint_indiv(rep2, block, hint);
-    if (!reqs[2])
-        goto err;
-
-    rc = wait_recv(reqs, 3);
-    if (rc < 0) {
-        perror("allocblock recv");
-        goto err;
-    }
-    if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "allocblock server0 error\n");
-        goto err;
-    }
-    if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "allocblock server1 error\n");
-        goto err;
-    }
-    if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) {
-        fprintf(stderr, "allocblock server2 error\n");
-        goto err;
-    }
-
-    id0 = reqs[0]->message.id;
-    id1 = reqs[1]->message.id;
-    id2 = reqs[2]->message.id;
-
-#ifdef BSDEBUG
-    fprintf(stderr, "ALLOC: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n",
-            BSID(map, id0, id1, id2),
-            (unsigned int)((unsigned char *)block)[0],
-            (unsigned int)((unsigned char *)block)[1],
-            (unsigned int)((unsigned char *)block)[2],
-            (unsigned int)((unsigned char *)block)[3],
-            (unsigned int)((unsigned char *)block)[4],
-            (unsigned int)((unsigned char *)block)[5],
-            (unsigned int)((unsigned char *)block)[6],
-            (unsigned int)((unsigned char *)block)[7]);
-#endif
-    
-    free((void *)reqs[0]);
-    free((void *)reqs[1]);
-    free((void *)reqs[2]);
-    return BSID(map, id0, id1, id2);
-
-    err:
-    if (reqs[0]) {
-        dequeue(reqs[0]);
-        free((void *)reqs[0]);
-    }
-    if (reqs[1]) {
-        dequeue(reqs[1]);
-        free((void *)reqs[1]);
-    }
-    if (reqs[2]) {
-        dequeue(reqs[2]);
-        free((void *)reqs[2]);
-    }
-    return 0;
-}
-
-#else /* /BLOCKSTORE_REMOTE */
-
-/*****************************************************************************
- * Local storage version                                                     *
- *****************************************************************************/
- 
-/**
- * readblock: read a block from disk
- *   @id: block id to read
- *
- *   @return: pointer to block, NULL on error
- */
-
-void *readblock(u64 id) {
-    void *block;
-    int block_fp;
-   
-//printf("readblock(%llu)\n", id); 
-    block_fp = open("blockstore.dat", O_RDONLY | O_CREAT | O_LARGEFILE, 0644);
-
-    if (block_fp < 0) {
-        perror("open");
-        return NULL;
-    }
-    
-    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
-        printf ("%Ld ", id);
-        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
-        perror("readblock lseek");
-        goto err;
-    }
-    if ((block = malloc(BLOCK_SIZE)) == NULL) {
-        perror("readblock malloc");
-        goto err;
-    }
-    if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
-        perror("readblock read");
-        free(block);
-        goto err;
-    }
-    close(block_fp);
-    return block;
-    
-err:
-    close(block_fp);
-    return NULL;
-}
-
-/**
- * writeblock: write an existing block to disk
- *   @id: block id
- *   @block: pointer to block
- *
- *   @return: zero on success, -1 on failure
- */
-int writeblock(u64 id, void *block) {
-    
-    int block_fp;
-    
-    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
-
-    if (block_fp < 0) {
-        perror("open");
-        return -1;
-    }
-
-    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
-        perror("writeblock lseek");
-        goto err;
-    }
-    if (write(block_fp, block, BLOCK_SIZE) < 0) {
-        perror("writeblock write");
-        goto err;
-    }
-    close(block_fp);
-    return 0;
-
-err:
-    close(block_fp);
-    return -1;
-}
-
-/**
- * allocblock: write a new block to disk
- *   @block: pointer to block
- *
- *   @return: new id of block on disk
- */
-
-u64 allocblock(void *block) {
-    u64 lb;
-    off64_t pos;
-    int block_fp;
-    
-    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
-
-    if (block_fp < 0) {
-        perror("open");
-        return 0;
-    }
-
-    pos = lseek64(block_fp, 0, SEEK_END);
-    if (pos == (off64_t)-1) {
-        perror("allocblock lseek");
-        goto err;
-    }
-    if (pos % BLOCK_SIZE != 0) {
-        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
-        goto err;
-    }
-    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
-        perror("allocblock write");
-        goto err;
-    }
-    lb = pos / BLOCK_SIZE + 1;
-//printf("alloc(%Ld)\n", lb);
-    close(block_fp);
-    return lb;
-    
-err:
-    close(block_fp);
-    return 0;
-    
-}
-
-/**
- * allocblock_hint: write a new block to disk
- *   @block: pointer to block
- *   @hint: allocation hint
- *
- *   @return: new id of block on disk
- */
-u64 allocblock_hint(void *block, u64 hint) {
-    return allocblock(block);
-}
-
-#endif /* BLOCKSTORE_REMOTE */
-
-/*****************************************************************************
- * Memory management                                                         *
- *****************************************************************************/
-
-/**
- * newblock: get a new in-memory block set to zeros
- *
- *   @return: pointer to new block, NULL on error
- */
-void *newblock() {
-    void *block = malloc(BLOCK_SIZE);
-    if (block == NULL) {
-        perror("newblock");
-        return NULL;
-    }
-    memset(block, 0, BLOCK_SIZE);
-    return block;
-}
-
-
-/**
- * freeblock: unallocate an in-memory block
- *   @id: block id (zero if this is only in-memory)
- *   @block: block to be freed
- */
-void freeblock(void *block) {
-    if (block != NULL)
-        free(block);
-}
-
-static freeblock_t *new_freeblock(void)
-{
-    freeblock_t *fb;
-    
-    fb = newblock();
-    
-    if (fb == NULL) return NULL;
-    
-    fb->magic = FREEBLOCK_MAGIC;
-    fb->next  = 0ULL;
-    fb->count = 0ULL;
-    memset(fb->list, 0, sizeof fb->list);
-    
-    return fb;
-}
-
-void releaseblock(u64 id)
-{
-    blockstore_super_t *bs_super;
-    freeblock_t *fl_current;
-    
-    /* get superblock */
-    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
-    
-    /* get freeblock_current */
-    if (bs_super->freelist_current == 0ULL) 
-    {
-        fl_current = new_freeblock();
-        bs_super->freelist_current = allocblock(fl_current);
-        writeblock(BLOCKSTORE_SUPER, bs_super);
-    } else {
-        fl_current = readblock(bs_super->freelist_current);
-    }
-    
-    /* if full, chain to superblock and allocate new current */
-    
-    if (fl_current->count == FREEBLOCK_SIZE) {
-        fl_current->next = bs_super->freelist_full;
-        writeblock(bs_super->freelist_current, fl_current);
-        bs_super->freelist_full = bs_super->freelist_current;
-        freeblock(fl_current);
-        fl_current = new_freeblock();
-        bs_super->freelist_current = allocblock(fl_current);
-        writeblock(BLOCKSTORE_SUPER, bs_super);
-    }
-    
-    /* append id to current */
-    fl_current->list[fl_current->count++] = id;
-    writeblock(bs_super->freelist_current, fl_current);
-    
-    freeblock(fl_current);
-    freeblock(bs_super);
-    
-    
-}
-
-/* freelist debug functions: */
-void freelist_count(int print_each)
-{
-    blockstore_super_t *bs_super;
-    freeblock_t *fb;
-    u64 total = 0, next;
-    
-    bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
-    
-    if (bs_super->freelist_current == 0ULL) {
-        printf("freelist is empty!\n");
-        return;
-    }
-    
-    fb = readblock(bs_super->freelist_current);
-    printf("%Ld entires on current.\n", fb->count);
-    total += fb->count;
-    if (print_each == 1)
-    {
-        int i;
-        for (i=0; i< fb->count; i++)
-            printf("  %Ld\n", fb->list[i]);
-    }
-    
-    freeblock(fb);
-    
-    if (bs_super->freelist_full == 0ULL) {
-        printf("freelist_full is empty!\n");
-        return;
-    }
-    
-    next = bs_super->freelist_full;
-    for (;;) {
-        fb = readblock(next);
-        total += fb->count;
-        if (print_each == 1)
-        {
-            int i;
-            for (i=0; i< fb->count; i++)
-                printf("  %Ld\n", fb->list[i]);
-        }
-        next = fb->next;
-        freeblock(fb);
-        if (next == 0ULL) break;
-    }
-    printf("Total of %Ld ids on freelist.\n", total);
-}
-
-/*****************************************************************************
- * Initialisation                                                            *
- *****************************************************************************/
-
-int __init_blockstore(void)
-{
-    int i;
-    blockstore_super_t *bs_super;
-    u64 ret;
-    int block_fp;
-    
-#ifdef BLOCKSTORE_REMOTE
-    struct hostent *addr;
-
-    pthread_mutex_init(&ptmutex_queue, NULL);
-    pthread_mutex_init(&ptmutex_luid, NULL);
-    pthread_mutex_init(&ptmutex_recv, NULL);
-    /*pthread_mutex_init(&ptmutex_notify, NULL);*/
-    for (i = 0; i <= READ_POOL_SIZE; i++) {
-        pool_thread[i].newdata = 0;
-        pthread_mutex_init(&(pool_thread[i].ptmutex), NULL);
-        pthread_cond_init(&(pool_thread[i].ptcv), NULL);
-    }
-
-    bsservers[0].hostname = "firebug.cl.cam.ac.uk";
-    bsservers[1].hostname = "planb.cl.cam.ac.uk";
-    bsservers[2].hostname = "simcity.cl.cam.ac.uk";
-    bsservers[3].hostname = NULL/*"gunfighter.cl.cam.ac.uk"*/;
-    bsservers[4].hostname = NULL/*"galaxian.cl.cam.ac.uk"*/;
-    bsservers[5].hostname = NULL/*"firetrack.cl.cam.ac.uk"*/;
-    bsservers[6].hostname = NULL/*"funfair.cl.cam.ac.uk"*/;
-    bsservers[7].hostname = NULL/*"felix.cl.cam.ac.uk"*/;
-    bsservers[8].hostname = NULL;
-    bsservers[9].hostname = NULL;
-    bsservers[10].hostname = NULL;
-    bsservers[11].hostname = NULL;
-    bsservers[12].hostname = NULL;
-    bsservers[13].hostname = NULL;
-    bsservers[14].hostname = NULL;
-    bsservers[15].hostname = NULL;
-
-    for (i = 0; i < MAX_SERVERS; i++) {
-        if (!bsservers[i].hostname)
-            continue;
-        addr = gethostbyname(bsservers[i].hostname);
-        if (!addr) {
-            perror("bad hostname");
-            return -1;
-        }
-        bsservers[i].sin.sin_family = addr->h_addrtype;
-        bsservers[i].sin.sin_port = htons(BLOCKSTORED_PORT);
-        bsservers[i].sin.sin_addr.s_addr = 
-            ((struct in_addr *)(addr->h_addr))->s_addr;
-    }
-
-    /* Cluster map
-     */
-    bsclusters[0].servers[0] = 0;
-    bsclusters[0].servers[1] = 1;
-    bsclusters[0].servers[2] = 2;
-    bsclusters[1].servers[0] = 1;
-    bsclusters[1].servers[1] = 2;
-    bsclusters[1].servers[2] = 3;
-    bsclusters[2].servers[0] = 2;
-    bsclusters[2].servers[1] = 3;
-    bsclusters[2].servers[2] = 4;
-    bsclusters[3].servers[0] = 3;
-    bsclusters[3].servers[1] = 4;
-    bsclusters[3].servers[2] = 5;
-    bsclusters[4].servers[0] = 4;
-    bsclusters[4].servers[1] = 5;
-    bsclusters[4].servers[2] = 6;
-    bsclusters[5].servers[0] = 5;
-    bsclusters[5].servers[1] = 6;
-    bsclusters[5].servers[2] = 7;
-    bsclusters[6].servers[0] = 6;
-    bsclusters[6].servers[1] = 7;
-    bsclusters[6].servers[2] = 0;
-    bsclusters[7].servers[0] = 7;
-    bsclusters[7].servers[1] = 0;
-    bsclusters[7].servers[2] = 1;
-
-    /* Local socket set up
-     */
-    bssock = socket(AF_INET, SOCK_DGRAM, 0);
-    if (bssock < 0) {
-        perror("Bad socket");
-        return -1;
-    }
-    memset(&sin_local, 0, sizeof(sin_local));
-    sin_local.sin_family = AF_INET;
-    sin_local.sin_port = htons(BLOCKSTORED_PORT);
-    sin_local.sin_addr.s_addr = htonl(INADDR_ANY);
-    if (bind(bssock, (struct sockaddr *)&sin_local, sizeof(sin_local)) < 0) {
-        perror("bind");
-        close(bssock);
-        return -1;
-    }
-
-    pthread_create(&pthread_recv, NULL, receive_loop, NULL);
-    pthread_create(&pthread_recv, NULL, queue_runner, NULL);
-
-#else /* /BLOCKSTORE_REMOTE */
-    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
-
-    if (block_fp < 0) {
-        perror("open");
-        return -1;
-        exit(-1);
-    }
-    
-    if (lseek(block_fp, 0, SEEK_END) == 0) {
-        bs_super = newblock();
-        bs_super->magic            = BLOCKSTORE_MAGIC;
-        bs_super->freelist_full    = 0LL;
-        bs_super->freelist_current = 0LL;
-        
-        ret = allocblock(bs_super);
-        
-        freeblock(bs_super);
-    } else {
-        bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER);
-        if (bs_super->magic != BLOCKSTORE_MAGIC)
-        {
-            printf("BLOCKSTORE IS CORRUPT! (no magic in superblock!)\n");
-            exit(-1);
-        }
-        freeblock(bs_super);
-    }
-        
-    close(block_fp);
-        
-#endif /*  BLOCKSTORE_REMOTE */   
-    return 0;
-}
-
-void __exit_blockstore(void)
-{
-    int i;
-#ifdef BLOCKSTORE_REMOTE
-    pthread_mutex_destroy(&ptmutex_recv);
-    pthread_mutex_destroy(&ptmutex_luid);
-    pthread_mutex_destroy(&ptmutex_queue);
-    /*pthread_mutex_destroy(&ptmutex_notify);
-      pthread_cond_destroy(&ptcv_notify);*/
-    for (i = 0; i <= READ_POOL_SIZE; i++) {
-        pthread_mutex_destroy(&(pool_thread[i].ptmutex));
-        pthread_cond_destroy(&(pool_thread[i].ptcv));
-    }
-#endif
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/blockstore.h
--- a/tools/blktap/blockstore.h Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,134 +0,0 @@
-/**************************************************************************
- * 
- * blockstore.h
- *
- * Simple block store interface
- *
- */
- 
-#ifndef __BLOCKSTORE_H__
-#define __BLOCKSTORE_H__
-
-#include <netinet/in.h>
-#include <xc.h>
-
-#define BLOCK_SIZE  4096
-#define BLOCK_SHIFT   12
-#define BLOCK_MASK  0xfffffffffffff000LL
-
-/* XXX SMH: where is the below supposed to be defined???? */
-#ifndef SECTOR_SHIFT 
-#define SECTOR_SHIFT   9 
-#endif
-
-#define FREEBLOCK_SIZE  (BLOCK_SIZE / sizeof(u64)) - (3 * sizeof(u64))
-#define FREEBLOCK_MAGIC 0x0fee0fee0fee0feeULL
-
-typedef struct {
-    u64 magic;
-    u64 next;
-    u64 count;
-    u64 list[FREEBLOCK_SIZE];
-} freeblock_t; 
-
-#define BLOCKSTORE_MAGIC 0xaaaaaaa00aaaaaaaULL
-#define BLOCKSTORE_SUPER 1ULL
-
-typedef struct {
-    u64 magic;
-    u64 freelist_full;
-    u64 freelist_current;
-} blockstore_super_t;
-
-extern void *newblock();
-extern void *readblock(u64 id);
-extern u64 allocblock(void *block);
-extern u64 allocblock_hint(void *block, u64 hint);
-extern int writeblock(u64 id, void *block);
-
-/* Add this blockid to a freelist, to be recycled by the allocator. */
-extern void releaseblock(u64 id);
-
-/* this is a memory free() operation for block-sized allocations */
-extern void freeblock(void *block);
-extern int __init_blockstore(void);
-
-/* debug for freelist. */
-void freelist_count(int print_each);
-#define ALLOCFAIL (((u64)(-1)))
-
-/* Distribution
- */
-#define BLOCKSTORED_PORT 9346
-
-struct bshdr_t_struct {
-    u32            operation;
-    u32            flags;
-    u64            id;
-    u64            luid;
-} __attribute__ ((packed));
-typedef struct bshdr_t_struct bshdr_t;
-
-struct bsmsg_t_struct {
-    bshdr_t        hdr;
-    unsigned char  block[BLOCK_SIZE];
-} __attribute__ ((packed));
-
-typedef struct bsmsg_t_struct bsmsg_t;
-
-#define MSGBUFSIZE_OP    sizeof(u32)
-#define MSGBUFSIZE_FLAGS (sizeof(u32) + sizeof(u32))
-#define MSGBUFSIZE_ID    (sizeof(u32) + sizeof(u32) + sizeof(u64) + sizeof(u64))
-#define MSGBUFSIZE_BLOCK sizeof(bsmsg_t)
-
-#define BSOP_READBLOCK  0x01
-#define BSOP_WRITEBLOCK 0x02
-#define BSOP_ALLOCBLOCK 0x03
-#define BSOP_FREEBLOCK  0x04
-
-#define BSOP_FLAG_ERROR 0x01
-
-#define BS_ALLOC_SKIP 10
-#define BS_ALLOC_HACK
-
-/* Remote hosts and cluster map - XXX need to generalise
- */
-
-/*
-
-  Interim ID format is
-
-  63 60 59                40 39                20 19                 0
-  +----+--------------------+--------------------+--------------------+
-  |map | replica 2          | replica 1          | replica 0          |
-  +----+--------------------+--------------------+--------------------+
-
-  The map is an index into a table detailing which machines form the
-  cluster.
-
- */
-
-#define BSID_REPLICA0(_id) ((_id)&0xfffffULL)
-#define BSID_REPLICA1(_id) (((_id)>>20)&0xfffffULL)
-#define BSID_REPLICA2(_id) (((_id)>>40)&0xfffffULL)
-#define BSID_MAP(_id)      (((_id)>>60)&0xfULL)
-
-#define BSID(_map, _rep0, _rep1, _rep2) ((((u64)(_map))<<60) | \
-                                         (((u64)(_rep2))<<40) | \
-                                         (((u64)(_rep1))<<20) | ((u64)(_rep0)))
-
-typedef struct bsserver_t_struct {
-    char              *hostname;
-    struct sockaddr_in sin;
-} bsserver_t;
-
-#define MAX_SERVERS 16
-
-#define CLUSTER_MAX_REPLICAS 3
-typedef struct bscluster_t_struct {
-    int servers[CLUSTER_MAX_REPLICAS];
-} bscluster_t;
-
-#define MAX_CLUSTERS 16
-
-#endif /* __BLOCKSTORE_H__ */
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/blockstored.c
--- a/tools/blktap/blockstored.c        Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,276 +0,0 @@
-/**************************************************************************
- * 
- * blockstored.c
- *
- * Block store daemon.
- *
- */
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <netinet/in.h>
-#include <errno.h>
-#include "blockstore.h"
-
-//#define BSDEBUG
-
-int readblock_into(u64 id, void *block);
-
-int open_socket(u16 port) {
-    
-    struct sockaddr_in sn;
-    int sock;
-
-    sock = socket(AF_INET, SOCK_DGRAM, 0);
-    if (sock < 0) {
-        perror("Bad socket");
-        return -1;
-    }
-    memset(&sn, 0, sizeof(sn));
-    sn.sin_family = AF_INET;
-    sn.sin_port = htons(port);
-    sn.sin_addr.s_addr = htonl(INADDR_ANY);
-    if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
-        perror("bind");
-        close(sock);
-        return -1;
-    }
-
-    return sock;
-}
-
-static int block_fp = -1;
-static int bssock = -1;
-
-int send_reply(struct sockaddr_in *peer, void *buffer, int len) {
-
-    int rc;
-    
-#ifdef BSDEBUG
-    fprintf(stdout, "TX: %u bytes op=%u id=0x%llx\n",
-            len, ((bsmsg_t *)buffer)->hdr.operation, ((bsmsg_t *)buffer)->hdr.id);
-#endif
-    rc = sendto(bssock, buffer, len, 0, (struct sockaddr *)peer, sizeof(*peer));
-    if (rc < 0) {
-        perror("send_reply");
-        return 1;
-    }
-
-
-    return 0;
-}
-
-static bsmsg_t msgbuf;
-
-void service_loop(void) {
-
-    for (;;) {
-        int rc, len;
-        struct sockaddr_in from;
-        size_t slen = sizeof(from);
-        u64 bid;
-
-        len = recvfrom(bssock, (void *)&msgbuf, sizeof(msgbuf), 0,
-                       (struct sockaddr *)&from, &slen);
-
-        if (len < 0) {
-            perror("recvfrom");
-            continue;
-        }
-
-        if (len < MSGBUFSIZE_OP) {
-            fprintf(stderr, "Short packet.\n");
-            continue;
-        }
-
-#ifdef BSDEBUG
-        fprintf(stdout, "RX: %u bytes op=%u id=0x%llx\n",
-                len, msgbuf.hdr.operation, msgbuf.hdr.id);
-#endif
-
-        switch (msgbuf.hdr.operation) {
-        case BSOP_READBLOCK:
-            if (len < MSGBUFSIZE_ID) {
-                fprintf(stderr, "Short packet (readblock %u).\n", len);
-                continue;
-            }
-            rc = readblock_into(msgbuf.hdr.id, msgbuf.block);
-            if (rc < 0) {
-                fprintf(stderr, "readblock error\n");
-                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
-                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
-                continue;
-            }
-            msgbuf.hdr.flags = 0;
-            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_BLOCK);
-            break;
-        case BSOP_WRITEBLOCK:
-            if (len < MSGBUFSIZE_BLOCK) {
-                fprintf(stderr, "Short packet (writeblock %u).\n", len);
-                continue;
-            }
-            rc = writeblock(msgbuf.hdr.id, msgbuf.block);
-            if (rc < 0) {
-                fprintf(stderr, "writeblock error\n");
-                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
-                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
-                continue;
-            }
-            msgbuf.hdr.flags = 0;
-            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
-            break;
-        case BSOP_ALLOCBLOCK:
-            if (len < MSGBUFSIZE_BLOCK) {
-                fprintf(stderr, "Short packet (allocblock %u).\n", len);
-                continue;
-            }
-            bid = allocblock(msgbuf.block);
-            if (bid == ALLOCFAIL) {
-                fprintf(stderr, "allocblock error\n");
-                msgbuf.hdr.flags = BSOP_FLAG_ERROR;
-                send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
-                continue;
-            }
-            msgbuf.hdr.id = bid;
-            msgbuf.hdr.flags = 0;
-            send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID);
-            break;
-        }
-
-    }
-}
- 
-/**
- * readblock: read a block from disk
- *   @id: block id to read
- *   @block: pointer to buffer to receive block
- *
- *   @return: 0 if OK, other on error
- */
-
-int readblock_into(u64 id, void *block) {
-    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
-        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
-        perror("readblock lseek");
-        return -1;
-    }
-    if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
-        perror("readblock read");
-        return -1;
-    }
-    return 0;
-}
-
-/**
- * writeblock: write an existing block to disk
- *   @id: block id
- *   @block: pointer to block
- *
- *   @return: zero on success, -1 on failure
- */
-int writeblock(u64 id, void *block) {
-    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
-        perror("writeblock lseek");
-        return -1;
-    }
-    if (write(block_fp, block, BLOCK_SIZE) < 0) {
-        perror("writeblock write");
-        return -1;
-    }
-    return 0;
-}
-
-/**
- * allocblock: write a new block to disk
- *   @block: pointer to block
- *
- *   @return: new id of block on disk
- */
-static u64 lastblock = 0;
-
-u64 allocblock(void *block) {
-    u64 lb;
-    off64_t pos;
-
-    retry:
-    pos = lseek64(block_fp, 0, SEEK_END);
-    if (pos == (off64_t)-1) {
-        perror("allocblock lseek");
-        return ALLOCFAIL;
-    }
-    if (pos % BLOCK_SIZE != 0) {
-        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
-        return ALLOCFAIL;
-    }
-    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
-        perror("allocblock write");
-        return ALLOCFAIL;
-    }
-    lb = pos / BLOCK_SIZE + 1;
-
-#ifdef BS_ALLOC_HACK
-    if (lb < BS_ALLOC_SKIP)
-        goto retry;
-#endif
-    
-    if (lb <= lastblock)
-        printf("[*** %Ld alredy allocated! ***]\n", lb);
-    
-    lastblock = lb;
-    return lb;
-}
-
-/**
- * newblock: get a new in-memory block set to zeros
- *
- *   @return: pointer to new block, NULL on error
- */
-void *newblock() {
-    void *block = malloc(BLOCK_SIZE);
-    if (block == NULL) {
-        perror("newblock");
-        return NULL;
-    }
-    memset(block, 0, BLOCK_SIZE);
-    return block;
-}
-
-
-/**
- * freeblock: unallocate an in-memory block
- *   @id: block id (zero if this is only in-memory)
- *   @block: block to be freed
- */
-void freeblock(void *block) {
-    if (block != NULL)
-        free(block);
-}
-
-
-int main(int argc, char **argv)
-{
-    block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644);
-
-    if (block_fp < 0) {
-        perror("open");
-        return -1;
-    }
-
-    bssock = open_socket(BLOCKSTORED_PORT);
-    if (bssock < 0) {
-        return -1;
-    }
-
-    service_loop();
-    
-    close(bssock);
-
-    return 0;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/bstest.c
--- a/tools/blktap/bstest.c     Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,191 +0,0 @@
-/**************************************************************************
- * 
- * bstest.c
- *
- * Block store daemon test program.
- *
- * usage: bstest <host>|X {r|w|a} ID 
- *
- */
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <netinet/in.h>
-#include <netdb.h>
-#include <errno.h>
-#include "blockstore.h"
-
-int direct(char *host, u32 op, u64 id, int len) {
-    struct sockaddr_in sn, peer;
-    int sock;
-    bsmsg_t msgbuf;
-    int rc, slen;
-    struct hostent *addr;
-
-    addr = gethostbyname(host);
-    if (!addr) {
-        perror("bad hostname");
-        exit(1);
-    }
-    peer.sin_family = addr->h_addrtype;
-    peer.sin_port = htons(BLOCKSTORED_PORT);
-    peer.sin_addr.s_addr =  ((struct in_addr *)(addr->h_addr))->s_addr;
-    fprintf(stderr, "Sending to: %u.%u.%u.%u\n",
-            (unsigned int)(unsigned char)addr->h_addr[0],
-            (unsigned int)(unsigned char)addr->h_addr[1],
-            (unsigned int)(unsigned char)addr->h_addr[2],
-            (unsigned int)(unsigned char)addr->h_addr[3]);
-
-    sock = socket(AF_INET, SOCK_DGRAM, 0);
-    if (sock < 0) {
-        perror("Bad socket");
-        exit(1);
-    }
-    memset(&sn, 0, sizeof(sn));
-    sn.sin_family = AF_INET;
-    sn.sin_port = htons(BLOCKSTORED_PORT);
-    sn.sin_addr.s_addr = htonl(INADDR_ANY);
-    if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) {
-        perror("bind");
-        close(sock);
-        exit(1);
-    }
-
-    memset((void *)&msgbuf, 0, sizeof(msgbuf));
-    msgbuf.operation = op;
-    msgbuf.id = id;
-
-    rc = sendto(sock, (void *)&msgbuf, len, 0,
-                (struct sockaddr *)&peer, sizeof(peer));
-    if (rc < 0) {
-        perror("sendto");
-        exit(1);
-    }
-
-    slen = sizeof(peer);
-    len = recvfrom(sock, (void *)&msgbuf, sizeof(msgbuf), 0,
-                   (struct sockaddr *)&peer, &slen);
-    if (len < 0) {
-        perror("recvfrom");
-        exit(1);
-    }
-
-    printf("Reply %u bytes:\n", len);
-    if (len >= MSGBUFSIZE_OP)
-        printf("  operation: %u\n", msgbuf.operation);
-    if (len >= MSGBUFSIZE_FLAGS)
-        printf("  flags: 0x%x\n", msgbuf.flags);
-    if (len >= MSGBUFSIZE_ID)
-        printf("  id: %llu\n", msgbuf.id);
-    if (len >= (MSGBUFSIZE_ID + 4))
-        printf("  data: %02x %02x %02x %02x...\n",
-               (unsigned int)msgbuf.block[0],
-               (unsigned int)msgbuf.block[1],
-               (unsigned int)msgbuf.block[2],
-               (unsigned int)msgbuf.block[3]);
-    
-    if (sock > 0)
-        close(sock);
-   
-    return 0;
-}
-
-int main (int argc, char **argv) {
-
-    u32 op = 0;
-    u64 id = 0;
-    int len = 0, rc;
-    void *block;
-
-    if (argc < 3) {
-        fprintf(stderr, "usage: bstest <host>|X {r|w|a} ID\n");
-        return 1;
-    }
-
-    switch (argv[2][0]) {
-    case 'r':
-    case 'R':
-        op = BSOP_READBLOCK;
-        len = MSGBUFSIZE_ID;
-        break;
-    case 'w':
-    case 'W':
-        op = BSOP_WRITEBLOCK;
-        len = MSGBUFSIZE_BLOCK;
-        break;
-    case 'a':
-    case 'A':
-        op = BSOP_ALLOCBLOCK;
-        len = MSGBUFSIZE_BLOCK;
-        break;
-    default:
-        fprintf(stderr, "Unknown action '%s'.\n", argv[2]);
-        return 1;
-    }
-
-    if (argc >= 4)
-        id = atoll(argv[3]);
-
-    if (strcmp(argv[1], "X") == 0) {
-        rc = __init_blockstore();
-        if (rc < 0) {
-            fprintf(stderr, "blockstore init failed.\n");
-            return 1;
-        }
-        switch(op) {
-        case BSOP_READBLOCK:
-            block = readblock(id);
-            if (block) {
-                printf("data: %02x %02x %02x %02x...\n",
-                       (unsigned int)((unsigned char*)block)[0],
-                       (unsigned int)((unsigned char*)block)[1],
-                       (unsigned int)((unsigned char*)block)[2],
-                       (unsigned int)((unsigned char*)block)[3]);
-            }
-            break;
-        case BSOP_WRITEBLOCK:
-            block = malloc(BLOCK_SIZE);
-            if (!block) {
-                perror("bstest malloc");
-                return 1;
-            }
-            memset(block, 0, BLOCK_SIZE);
-            rc = writeblock(id, block);
-            if (rc != 0) {
-                printf("error\n");
-            }
-            else {
-                printf("OK\n");
-            }
-            break;
-        case BSOP_ALLOCBLOCK:
-            block = malloc(BLOCK_SIZE);
-            if (!block) {
-                perror("bstest malloc");
-                return 1;
-            }
-            memset(block, 0, BLOCK_SIZE);
-            id = allocblock_hint(block, id);
-            if (id == 0) {
-                printf("error\n");
-            }
-            else {
-                printf("ID: %llu\n", id);
-            }
-            break;
-        }
-    }
-    else {
-        direct(argv[1], op, id, len);
-    }
-
-
-    return 0;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax.c
--- a/tools/blktap/parallax.c   Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,611 +0,0 @@
-/**************************************************************************
- * 
- * parallax.c
- *
- * The Parallax Storage Server
- *
- */
- 
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <pthread.h>
-#include "blktaplib.h"
-#include "blockstore.h"
-#include "vdi.h"
-#include "block-async.h"
-#include "requests-async.h"
-
-#define PARALLAX_DEV     61440
-#define SECTS_PER_NODE   8
-
-
-#if 0
-#define DPRINTF(_f, _a...) printf ( _f , ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-/* ------[ session records ]----------------------------------------------- */
-
-#define BLKIF_HASHSZ 1024
-#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
-
-#define VDI_HASHSZ 16
-#define VDI_HASH(_vd) ((((_vd)>>8)^(_vd))&(VDI_HASHSZ-1))
-
-typedef struct blkif {
-    domid_t       domid;
-    unsigned int  handle;
-    enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
-    vdi_t        *vdi_hash[VDI_HASHSZ];
-    struct blkif *hash_next;
-} blkif_t;
-
-static blkif_t      *blkif_hash[BLKIF_HASHSZ];
-
-blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
-{
-    if ( handle != 0 )
-        printf("blktap/parallax don't currently support non-0 dev handles!\n");
-    
-    blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)];
-    while ( (blkif != NULL) && 
-            ((blkif->domid != domid) || (blkif->handle != handle)) )
-        blkif = blkif->hash_next;
-    return blkif;
-}
-
-vdi_t *blkif_get_vdi(blkif_t *blkif, blkif_vdev_t device)
-{
-    vdi_t *vdi = blkif->vdi_hash[VDI_HASH(device)];
-    
-    while ((vdi != NULL) && (vdi->vdevice != device))
-        vdi = vdi->next;
-    
-    return vdi;
-}
-
-/* ------[ control message handling ]-------------------------------------- */
-
-void blkif_create(blkif_be_create_t *create)
-{
-    domid_t       domid  = create->domid;
-    unsigned int  handle = create->blkif_handle;
-    blkif_t     **pblkif, *blkif;
-
-    DPRINTF("parallax (blkif_create): create is %p\n", create); 
-    
-    if ( (blkif = (blkif_t *)malloc(sizeof(blkif_t))) == NULL )
-    {
-        DPRINTF("Could not create blkif: out of memory\n");
-        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
-        return;
-    }
-
-    memset(blkif, 0, sizeof(*blkif));
-    blkif->domid  = domid;
-    blkif->handle = handle;
-    blkif->status = DISCONNECTED;
-
-    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
-    while ( *pblkif != NULL )
-    {
-        if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
-        {
-            DPRINTF("Could not create blkif: already exists (%d,%d)\n",
-                domid, handle);
-            create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
-            free(blkif);
-            return;
-        }
-        pblkif = &(*pblkif)->hash_next;
-    }
-
-    blkif->hash_next = *pblkif;
-    *pblkif = blkif;
-
-    DPRINTF("Successfully created blkif\n");
-    create->status = BLKIF_BE_STATUS_OKAY;
-}
-
-void blkif_destroy(blkif_be_destroy_t *destroy)
-{
-    domid_t       domid  = destroy->domid;
-    unsigned int  handle = destroy->blkif_handle;
-    blkif_t     **pblkif, *blkif;
-
-    DPRINTF("parallax (blkif_destroy): destroy is %p\n", destroy); 
-    
-    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
-    while ( (blkif = *pblkif) != NULL )
-    {
-        if ( (blkif->domid == domid) && (blkif->handle == handle) )
-        {
-            if ( blkif->status != DISCONNECTED )
-                goto still_connected;
-            goto destroy;
-        }
-        pblkif = &blkif->hash_next;
-    }
-
-    destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
-    return;
-
- still_connected:
-    destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
-    return;
-
- destroy:
-    *pblkif = blkif->hash_next;
-    free(blkif);
-    destroy->status = BLKIF_BE_STATUS_OKAY;
-}
-
-void vbd_create(blkif_be_vbd_create_t *create)
-{
-    blkif_t            *blkif;
-    vdi_t              *vdi, **vdip;
-    blkif_vdev_t        vdevice = create->vdevice;
-
-    DPRINTF("parallax (vbd_create): create=%p\n", create); 
-    
-    blkif = blkif_find_by_handle(create->domid, create->blkif_handle);
-    if ( blkif == NULL )
-    {
-        DPRINTF("vbd_create attempted for non-existent blkif (%u,%u)\n", 
-                create->domid, create->blkif_handle); 
-        create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
-        return;
-    }
-
-    /* VDI identifier is in grow->extent.sector_start */
-    DPRINTF("vbd_create: create->dev_handle (id) is %lx\n", 
-            (unsigned long)create->dev_handle);
-
-    vdi = vdi_get(create->dev_handle);
-    if (vdi == NULL)
-    {
-        printf("parallax (vbd_create): VDI %lx not found.\n",
-               (unsigned long)create->dev_handle);
-        create->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
-        return;
-    }
-    
-    vdi->next = NULL;
-    vdi->vdevice = vdevice;
-    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
-    while (*vdip != NULL)
-        vdip = &(*vdip)->next;
-    *vdip = vdi;
-    
-    DPRINTF("blkif_create succeeded\n"); 
-    create->status = BLKIF_BE_STATUS_OKAY;
-}
-
-void vbd_destroy(blkif_be_vbd_destroy_t *destroy)
-{
-    blkif_t            *blkif;
-    vdi_t              *vdi, **vdip;
-    blkif_vdev_t        vdevice = destroy->vdevice;
-    
-    blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle);
-    if ( blkif == NULL )
-    {
-        DPRINTF("vbd_destroy attempted for non-existent blkif (%u,%u)\n", 
-                destroy->domid, destroy->blkif_handle); 
-        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
-        return;
-    }
-
-    vdip = &blkif->vdi_hash[VDI_HASH(vdevice)];
-    while ((*vdip != NULL) && ((*vdip)->vdevice != vdevice))
-        vdip = &(*vdip)->next;
-
-    if (*vdip != NULL) 
-    {
-        vdi = *vdip;
-        *vdip = vdi->next;
-        vdi_put(vdi);
-    }
-        
-}
-
-int parallax_control(control_msg_t *msg)
-{
-    domid_t  domid;
-    int      ret;
-
-    DPRINTF("parallax_control: msg is %p\n", msg); 
-    
-    if (msg->type != CMSG_BLKIF_BE) 
-    {
-        printf("Unexpected control message (%d)\n", msg->type);
-        return 0;
-    }
-    
-    switch(msg->subtype)
-    {
-    case CMSG_BLKIF_BE_CREATE:
-        if ( msg->length != sizeof(blkif_be_create_t) )
-            goto parse_error;
-        blkif_create((blkif_be_create_t *)msg->msg);
-        break;   
-        
-    case CMSG_BLKIF_BE_DESTROY:
-        if ( msg->length != sizeof(blkif_be_destroy_t) )
-            goto parse_error;
-        blkif_destroy((blkif_be_destroy_t *)msg->msg);
-        break;  
-        
-    case CMSG_BLKIF_BE_VBD_CREATE:
-        if ( msg->length != sizeof(blkif_be_vbd_create_t) )
-            goto parse_error;
-        vbd_create((blkif_be_vbd_create_t *)msg->msg);
-        break;
-        
-    case CMSG_BLKIF_BE_VBD_DESTROY:
-        if ( msg->length != sizeof(blkif_be_vbd_destroy_t) )
-            goto parse_error;
-        vbd_destroy((blkif_be_vbd_destroy_t *)msg->msg);
-        break;
-
-    case CMSG_BLKIF_BE_CONNECT:
-    case CMSG_BLKIF_BE_DISCONNECT:
-        /* we don't manage the device channel, the tap does. */
-        break;
-
-    default:
-        goto parse_error;
-    }
-    return 0;
-parse_error:
-    printf("Bad control message!\n");
-    return 0;
-    
-}    
-
-int parallax_probe(blkif_request_t *req, blkif_t *blkif)
-{
-    blkif_response_t *rsp;
-    vdisk_t *img_info;
-    vdi_t *vdi;
-    int i, nr_vdis = 0; 
-
-    DPRINTF("parallax_probe: req=%p, blkif=%p\n", req, blkif); 
-
-    /* We expect one buffer only. */
-    if ( req->nr_segments != 1 )
-      goto err;
-
-    /* Make sure the buffer is page-sized. */
-    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
-       (blkif_last_sect (req->frame_and_sects[0]) != 7) )
-      goto err;
-
-    /* fill the list of devices */
-    for (i=0; i<VDI_HASHSZ; i++) {
-        vdi = blkif->vdi_hash[i];
-        while (vdi) {
-            img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0);
-            img_info[nr_vdis].device   = vdi->vdevice;
-            img_info[nr_vdis].info     = 0;
-            /* The -1 here accounts for the LSB in the radix tree */
-            img_info[nr_vdis].capacity = 
-                    ((1LL << (VDI_HEIGHT-1)) * SECTS_PER_NODE);
-            nr_vdis++;
-            vdi = vdi->next;
-        }
-    }
-
-    
-    rsp = (blkif_response_t *)req;
-    rsp->id = req->id;
-    rsp->operation = BLKIF_OP_PROBE;
-    rsp->status = nr_vdis; /* number of disks */
-
-    DPRINTF("parallax_probe: send positive response (nr_vdis=%d)\n", nr_vdis);
-    return  BLKTAP_RESPOND;
-err:
-    rsp = (blkif_response_t *)req;
-    rsp->id = req->id;
-    rsp->operation = BLKIF_OP_PROBE;
-    rsp->status = BLKIF_RSP_ERROR;
-    
-    DPRINTF("parallax_probe: send error response\n"); 
-    return BLKTAP_RESPOND;  
-}
-
-typedef struct {
-    blkif_request_t *req;
-    int              count;
-    int              error;
-    pthread_mutex_t  mutex;
-} pending_t;
-
-#define MAX_REQUESTS 64
-pending_t pending_list[MAX_REQUESTS];
-
-struct cb_param {
-    pending_t *pent;
-    int       segment;
-    u64       sector; 
-    u64       vblock; /* for debug printing -- can be removed. */
-};
-
-static void read_cb(struct io_ret r, void *in_param)
-{
-    struct cb_param *param = (struct cb_param *)in_param;
-    pending_t *p = param->pent;
-    int segment = param->segment;
-    blkif_request_t *req = p->req;
-    unsigned long size, offset, start;
-    char *dpage, *spage;
-       
-    spage  = IO_BLOCK(r);
-    if (spage == NULL) { p->error++; goto finish; }
-    dpage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), segment);
-    
-    /* Calculate read size and offset within the read block. */
-
-    offset = (param->sector << SECTOR_SHIFT) % BLOCK_SIZE;
-    size = ( blkif_last_sect (req->frame_and_sects[segment]) -
-             blkif_first_sect(req->frame_and_sects[segment]) + 1
-        ) << SECTOR_SHIFT;
-    start = blkif_first_sect(req->frame_and_sects[segment]) 
-        << SECTOR_SHIFT;
-
-    DPRINTF("ParallaxRead: sect: %lld (%ld,%ld),  "
-            "vblock %llx, "
-            "size %lx\n", 
-            param->sector, blkif_first_sect(p->req->frame_and_sects[segment]),
-            blkif_last_sect (p->req->frame_and_sects[segment]),
-            param->vblock, size); 
-
-    memcpy(dpage + start, spage + offset, size);
-    freeblock(spage);
-    
-    /* Done the read.  Now update the pending record. */
- finish:
-    pthread_mutex_lock(&p->mutex);
-    p->count--;
-    
-    if (p->count == 0) {
-       blkif_response_t *rsp;
-       
-        rsp = (blkif_response_t *)req;
-        rsp->id = req->id;
-        rsp->operation = BLKIF_OP_READ;
-       if (p->error == 0) {
-            rsp->status = BLKIF_RSP_OKAY;
-       } else {
-            rsp->status = BLKIF_RSP_ERROR;
-       }
-        blktap_inject_response(rsp);       
-    }
-    
-    pthread_mutex_unlock(&p->mutex);
-       
-    free(param); /* TODO: replace with cached alloc/dealloc */
-}      
-
-int parallax_read(blkif_request_t *req, blkif_t *blkif)
-{
-    blkif_response_t *rsp;
-    u64 vblock, gblock;
-    vdi_t *vdi;
-    u64 sector;
-    int i;
-    char *dpage, *spage;
-    pending_t *pent;
-
-    vdi = blkif_get_vdi(blkif, req->device);
-    
-    if ( vdi == NULL )
-        goto err;
-        
-    pent = &pending_list[ID_TO_IDX(req->id)];
-    pent->count = req->nr_segments;
-    pent->req = req;
-    pthread_mutex_init(&pent->mutex, NULL);
-    
-    for (i = 0; i < req->nr_segments; i++) {
-        pthread_t tid;
-        int ret;
-        struct cb_param *p;
-        
-        /* Round the requested segment to a block address. */
-        sector  = req->sector_number + (8*i);
-        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
-        
-        /* TODO: Replace this call to malloc with a cached allocation */
-        p = (struct cb_param *)malloc(sizeof(struct cb_param));
-        p->pent = pent;
-        p->sector = sector; 
-        p->segment = i;     
-        p->vblock = vblock; /* dbg */
-        
-        /* Get that block from the store. */
-        vdi_read(vdi, vblock, read_cb, (void *)p);    
-    }
-    
-    return BLKTAP_STOLEN;
-
-err:
-    rsp = (blkif_response_t *)req;
-    rsp->id = req->id;
-    rsp->operation = BLKIF_OP_READ;
-    rsp->status = BLKIF_RSP_ERROR;
-    
-    return BLKTAP_RESPOND;  
-}
-
-static void write_cb(struct io_ret r, void *in_param)
-{
-    struct cb_param *param = (struct cb_param *)in_param;
-    pending_t *p = param->pent;
-    blkif_request_t *req = p->req;
-    
-    /* catch errors from the block code. */
-    if (IO_INT(r) < 0) p->error++;
-    
-    pthread_mutex_lock(&p->mutex);
-    p->count--;
-    
-    if (p->count == 0) {
-       blkif_response_t *rsp;
-       
-        rsp = (blkif_response_t *)req;
-        rsp->id = req->id;
-        rsp->operation = BLKIF_OP_WRITE;
-       if (p->error == 0) {
-            rsp->status = BLKIF_RSP_OKAY;
-       } else {
-            rsp->status = BLKIF_RSP_ERROR;
-       }
-        blktap_inject_response(rsp);       
-    }
-    
-    pthread_mutex_unlock(&p->mutex);
-       
-    free(param); /* TODO: replace with cached alloc/dealloc */
-}
-
-int parallax_write(blkif_request_t *req, blkif_t *blkif)
-{
-    blkif_response_t *rsp;
-    u64 sector;
-    int i, writable = 0;
-    u64 vblock, gblock;
-    char *spage;
-    unsigned long size, offset, start;
-    vdi_t *vdi;
-    pending_t *pent;
-
-    vdi = blkif_get_vdi(blkif, req->device);
-    
-    if ( vdi == NULL )
-        goto err;
-        
-    pent = &pending_list[ID_TO_IDX(req->id)];
-    pent->count = req->nr_segments;
-    pent->req = req;
-    pthread_mutex_init(&pent->mutex, NULL);
-    
-    for (i = 0; i < req->nr_segments; i++) {
-        struct cb_param *p;
-        
-        spage  = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i);
-        
-        /* Round the requested segment to a block address. */
-        
-        sector  = req->sector_number + (8*i);
-        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
-        
-        /* Calculate read size and offset within the read block. */
-        
-        offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE;
-        size = ( blkif_last_sect (req->frame_and_sects[i]) -
-                 blkif_first_sect(req->frame_and_sects[i]) + 1
-            ) << SECTOR_SHIFT;
-        start = blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
-
-        DPRINTF("ParallaxWrite: sect: %lld (%ld,%ld),  "
-                "vblock %llx, gblock %llx, "
-                "size %lx\n", 
-                sector, blkif_first_sect(req->frame_and_sects[i]),
-                blkif_last_sect (req->frame_and_sects[i]),
-                vblock, gblock, size); 
-      
-        /* XXX: For now we just freak out if they try to write a   */
-        /* non block-sized, block-aligned page.                    */
-        
-        if ((offset != 0) || (size != BLOCK_SIZE) || (start != 0)) {
-            printf("]\n] STRANGE WRITE!\n]\n");
-            goto err;
-        }
-        
-        /* TODO: Replace this call to malloc with a cached allocation */
-        p = (struct cb_param *)malloc(sizeof(struct cb_param));
-        p->pent = pent;
-        p->sector = sector; 
-        p->segment = i;     
-        p->vblock = vblock; /* dbg */
-        
-        /* Issue the write to the store. */
-        vdi_write(vdi, vblock, spage, write_cb, (void *)p);
-    }
-
-    return BLKTAP_STOLEN;
-
-err:
-    rsp = (blkif_response_t *)req;
-    rsp->id = req->id;
-    rsp->operation = BLKIF_OP_WRITE;
-    rsp->status = BLKIF_RSP_ERROR;
-    
-    return BLKTAP_RESPOND;  
-}
-
-int parallax_request(blkif_request_t *req)
-{
-    blkif_response_t *rsp;
-    domid_t  dom   = ID_TO_DOM(req->id);
-    blkif_t *blkif = blkif_find_by_handle(dom, 0);
-    
-    if (blkif == NULL)
-        goto err;
-    
-    if ( req->operation == BLKIF_OP_PROBE ) {
-        
-        return parallax_probe(req, blkif);
-        
-    } else if ( req->operation == BLKIF_OP_READ ) {
-        
-        return parallax_read(req, blkif);
-        
-    } else if ( req->operation == BLKIF_OP_WRITE ) {
-        
-        return parallax_write(req, blkif);
-        
-    } else {
-        printf("Unknown request message type!\n");
-        /* Unknown operation */
-        goto err;
-    }
-    
-err:
-    rsp = (blkif_response_t *)req;
-    rsp->operation = req->operation;
-    rsp->id = req->id;
-    rsp->status = BLKIF_RSP_ERROR;
-    return BLKTAP_RESPOND;  
-}
-
-void __init_parallax(void) 
-{
-    memset(blkif_hash, 0, sizeof(blkif_hash));
-}
-
-
-
-int main(int argc, char *argv[])
-{
-    DPRINTF("parallax: starting.\n"); 
-    __init_blockstore();
-    DPRINTF("parallax: initialized blockstore...\n"); 
-    init_block_async();
-    DPRINTF("parallax: initialized async blocks...\n"); 
-    __init_vdi();
-    DPRINTF("parallax: initialized vdi registry etc...\n"); 
-    __init_parallax();
-    DPRINTF("parallax: initialized local stuff..\n"); 
-
-    blktap_register_ctrl_hook("parallax_control", parallax_control);
-    blktap_register_request_hook("parallax_request", parallax_request);
-    DPRINTF("parallax: added ctrl + request hooks, starting listen...\n"); 
-    blktap_listen();
-    
-    return 0;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/radix.c
--- a/tools/blktap/radix.c      Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,631 +0,0 @@
-/*
- * Radix tree for mapping (up to) 63-bit virtual block IDs to
- * 63-bit global block IDs
- *
- * Pointers within the tree set aside the least significant bit to indicate
- * whther or not the target block is writable from this node.
- *
- * The block with ID 0 is assumed to be an empty block of all zeros
- */
-
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <string.h>
-#include <pthread.h>
-#include "blockstore.h"
-#include "radix.h"
-
-#define RADIX_TREE_MAP_SHIFT 9
-#define RADIX_TREE_MAP_MASK 0x1ff
-#define RADIX_TREE_MAP_ENTRIES 512
-
-/*
-#define DEBUG
-*/
-
-/* Experimental radix cache. */
-
-static  pthread_mutex_t rcache_mutex = PTHREAD_MUTEX_INITIALIZER;
-static  int rcache_count = 0;
-#define RCACHE_MAX 1024
-
-typedef struct rcache_st {
-    radix_tree_node  *node;
-    u64               id;
-    struct rcache_st *hash_next;
-    struct rcache_st *cache_next;
-    struct rcache_st *cache_prev;
-} rcache_t;
-
-static rcache_t *rcache_head = NULL;
-static rcache_t *rcache_tail = NULL;
-
-#define RCHASH_SIZE 512ULL
-rcache_t *rcache[RCHASH_SIZE];
-#define RCACHE_HASH(_id) ((_id) & (RCHASH_SIZE - 1))
-
-void __rcache_init(void)
-{
-    int i;
-
-    for (i=0; i<RCHASH_SIZE; i++)
-        rcache[i] = NULL;
-}
-    
-
-void rcache_write(u64 id, radix_tree_node *node)
-{
-    rcache_t *r, *tmp, **curs;
-    
-    pthread_mutex_lock(&rcache_mutex);
-    
-    /* Is it already in the cache? */
-    r = rcache[RCACHE_HASH(id)];
-    
-    for (;;) {
-        if (r == NULL) 
-            break;
-        if (r->id == id) 
-        {
-            memcpy(r->node, node, BLOCK_SIZE);
-            
-            /* bring to front. */
-            if (r != rcache_head) {
-                
-                if (r == rcache_tail) {
-                    if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
-                    rcache_tail->cache_next = NULL;
-                }
-
-                tmp = r->cache_next;
-                if (r->cache_next != NULL) r->cache_next->cache_prev 
-                                                     = r->cache_prev;
-                if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp;
-
-                r->cache_prev = NULL;
-                r->cache_next = rcache_head;
-                if (rcache_head != NULL) rcache_head->cache_prev = r;
-                rcache_head = r;
-            }
-
-//printf("Update (%Ld)\n", r->id);
-            goto done;
-        }
-        r = r->hash_next;
-    }
-    
-    if ( rcache_count == RCACHE_MAX ) 
-    {
-        /* Remove an entry */
-        
-        r = rcache_tail;
-        if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
-        rcache_tail->cache_next = NULL;
-        freeblock(r->node);
-        
-        curs = &rcache[RCACHE_HASH(r->id)];
-        while ((*curs) != r)
-            curs = &(*curs)->hash_next;
-        *curs = r->hash_next;
-//printf("Evict (%Ld)\n", r->id);
-        
-    } else {
-        
-        r = (rcache_t *)malloc(sizeof(rcache_t));
-        rcache_count++;
-    }
-    
-    r->node = newblock();
-    memcpy(r->node, node, BLOCK_SIZE);
-    r->id = id;
-    
-    r->hash_next = rcache[RCACHE_HASH(id)];
-    rcache[RCACHE_HASH(id)] = r;
-    
-    r->cache_prev = NULL;
-    r->cache_next = rcache_head;
-    if (rcache_head != NULL) rcache_head->cache_prev = r;
-    rcache_head = r;
-    if (rcache_tail == NULL) rcache_tail = r;
-    
-//printf("Added (%Ld, %p)\n", id, r->node);
-done:
-    pthread_mutex_unlock(&rcache_mutex);
-}
-
-radix_tree_node *rcache_read(u64 id)
-{
-    rcache_t *r, *tmp;
-    radix_tree_node *node = NULL;
-    
-    pthread_mutex_lock(&rcache_mutex);
-
-    r = rcache[RCACHE_HASH(id)];
-    
-    for (;;) {
-        if (r == NULL) {
-//printf("Miss (%Ld)\n", id);
-            goto done;
-        }
-        if (r->id == id) break;
-        r = r->hash_next;
-    }
-   
-    /* bring to front. */
-    if (r != rcache_head) 
-    {
-        if (r == rcache_tail) {
-            if (r->cache_prev != NULL) rcache_tail = r->cache_prev;
-            rcache_tail->cache_next = NULL;
-        }
-        tmp = r->cache_next;
-        if (r->cache_next != NULL) r->cache_next->cache_prev = r->cache_prev;
-        if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp;
-
-        r->cache_prev = NULL;
-        r->cache_next = rcache_head;
-        if (rcache_head != NULL) rcache_head->cache_prev = r;
-        rcache_head = r;
-    }
-    
-    node = newblock();
-    memcpy(node, r->node, BLOCK_SIZE);
-    
-//printf("Hit (%Ld, %p)\n", id, r->node);
-done:
-    pthread_mutex_unlock(&rcache_mutex);
-    
-    return(node);
-}
-
-
-void *rc_readblock(u64 id)
-{
-    void *ret;
-    
-    ret = (void *)rcache_read(id);
-    
-    if (ret != NULL) return ret;
-    
-    ret = readblock(id);
-    
-    if (ret != NULL)
-        rcache_write(id, ret);
-    
-    return(ret);
-}
-
-u64 rc_allocblock(void *block)
-{
-    u64 ret;
-    
-    ret = allocblock(block);
-    
-    if (ret != ZERO)
-        rcache_write(ret, block);
-    
-    return(ret);
-}
-
-int rc_writeblock(u64 id, void *block)
-{
-    int ret;
-    
-    ret = writeblock(id, block);
-    rcache_write(id, block);
-    
-    return(ret);
-}
-
-
-/*
- * block device interface and other helper functions
- * with these functions, block id is just a 63-bit number, with
- * no special consideration for the LSB
- */
-radix_tree_node cloneblock(radix_tree_node block);
-
-/*
- * main api
- * with these functions, the LSB of root always indicates
- * whether or not the block is writable, including the return
- * values of update and snapshot
- */
-u64 lookup(int height, u64 root, u64 key);
-u64 update(int height, u64 root, u64 key, u64 val);
-u64 snapshot(u64 root);
-
-/**
- * cloneblock: clone an existing block in memory
- *   @block: the old block
- *
- *   @return: new block, with LSB cleared for every entry
- */
-radix_tree_node cloneblock(radix_tree_node block) {
-    radix_tree_node node = (radix_tree_node) malloc(BLOCK_SIZE);
-    int i;
-    if (node == NULL) {
-        perror("cloneblock malloc");
-        return NULL;
-    }
-    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++)
-        node[i] = block[i] & ONEMASK;
-    return node;
-}
-
-/**
- * lookup: find a value given a key
- *   @height: height in bits of the radix tree
- *   @root: root node id, with set LSB indicating writable node
- *   @key: key to lookup
- *
- *   @return: value on success, zero on error
- */
-
-u64 lookup(int height, u64 root, u64 key) {
-    radix_tree_node node;
-    u64 mask = ONE;
-    
-    assert(key >> height == 0);
-
-    /* the root block may be smaller to ensure all leaves are full */
-    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
-
-    /* now carve off equal sized chunks at each step */
-    for (;;) {
-        u64 oldroot;
-
-#ifdef DEBUG
-        printf("lookup: height=%3d root=%3Ld offset=%3d%s\n", height, root,
-                (int) ((key >> height) & RADIX_TREE_MAP_MASK),
-                (iswritable(root) ? "" : " (readonly)"));
-#endif
-        
-        if (getid(root) == ZERO)
-            return ZERO;
-
-        oldroot = root;
-        node = (radix_tree_node) rc_readblock(getid(root));
-        if (node == NULL)
-            return ZERO;
-
-        root = node[(key >> height) & RADIX_TREE_MAP_MASK];
-        mask &= root;
-        freeblock(node);
-
-        if (height == 0)
-            return ( root & ONEMASK ) | mask;
-
-        height -= RADIX_TREE_MAP_SHIFT;
-    }
-
-    return ZERO;
-}
-
-/*
- * update: set a radix tree entry, doing copy-on-write as necessary
- *   @height: height in bits of the radix tree
- *   @root: root node id, with set LSB indicating writable node
- *   @key: key to set
- *   @val: value to set, s.t. radix(key)=val
- *
- *   @returns: (possibly new) root id on success (with LSB=1), 0 on failure
- */
-
-u64 update(int height, u64 root, u64 key, u64 val) {
-    int offset;
-    u64 child;
-    radix_tree_node node;
-    
-    /* base case--return val */
-    if (height == 0)
-        return val;
-
-    /* the root block may be smaller to ensure all leaves are full */
-    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
-    offset = (key >> height) & RADIX_TREE_MAP_MASK;
-
-#ifdef DEBUG
-    printf("update: height=%3d root=%3Ld offset=%3d%s\n", height, root,
-            offset, (iswritable(root)?"":" (clone)"));
-#endif
-
-    /* load a block, or create a new one */
-    if (root == ZERO) {
-        node = (radix_tree_node) newblock();
-    } else {
-        node = (radix_tree_node) rc_readblock(getid(root));
-
-        if (!iswritable(root)) {
-            /* need to clone this node */
-            radix_tree_node oldnode = node;
-            node = cloneblock(node);
-            freeblock(oldnode);
-            root = ZERO;
-        }
-    }
-
-    if (node == NULL) {
-#ifdef DEBUG
-        printf("update: node is null!\n");
-#endif
-        return ZERO;
-    }
-
-    child = update(height, node[offset], key, val);
-
-    if (child == ZERO) {
-        freeblock(node);
-        return ZERO;
-    } else if (child == node[offset]) {
-        /* no change, so we already owned the child */
-        assert(iswritable(root));
-
-        freeblock(node);
-        return root;
-    }
-
-    node[offset] = child;
-
-    /* new/cloned blocks need to be saved */
-    if (root == ZERO) {
-        /* mark this as an owned block */
-        root = rc_allocblock(node);
-        if (root)
-            root = writable(root);
-    } else if (rc_writeblock(getid(root), node) < 0) {
-        freeblock(node);
-        return ZERO;
-    }
-
-    freeblock(node);
-    return root;
-}
-
-/**
- * snapshot: create a snapshot
- *   @root: old root node
- *
- *   @return: new root node, 0 on error
- */
-u64 snapshot(u64 root) {
-    radix_tree_node node, newnode;
-
-    if ((node = rc_readblock(getid(root))) == NULL)
-        return ZERO;
-
-    newnode = cloneblock(node);
-    freeblock(node);
-    if (newnode == NULL)
-        return ZERO;
-    
-    root = rc_allocblock(newnode);
-    freeblock(newnode);
-
-    if (root == ZERO)
-        return ZERO;
-    else
-        return writable(root);
-}
-
-/**
- * collapse: collapse a parent onto a child.
- * 
- * NOTE: This assumes that parent and child really are, and further that
- * there are no other children forked from this parent. (children of the
- * child are okay...)
- */
-
-int collapse(int height, u64 proot, u64 croot)
-{
-    int i, numlinks, ret, total = 0;
-    radix_tree_node pnode, cnode;
-    
-    if (height == 0) {
-        height = -1; /* terminate recursion */
-    } else {        
-        height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
-    }
-    numlinks = (1UL << RADIX_TREE_MAP_SHIFT);
-
-    /* Terminal cases: */
-
-    if ( (getid(proot) == ZERO) || (getid(croot) == ZERO) )
-        return -1;
-    
-    /* get roots */
-    if ((pnode = readblock(getid(proot))) == NULL)
-        return -1;
-    
-    if ((cnode = readblock(getid(croot))) == NULL)
-    {
-        freeblock(pnode);
-        return -1;
-    }
-    
-    /* For each writable link in proot */
-    for (i=0; i<numlinks; i++)
-    {
-        if ( pnode[i] == cnode[i] ) continue;
-        
-        /* collapse (next level) */
-        /* if height != 0 and writable... */
-        if (( height >= 0 ) && ( iswritable(pnode[i]) ) )
-        {
-            //printf("   %Ld is writable (i=%d).\n", getid(pnode[i]), i);
-            ret = collapse(height, pnode[i], cnode[i]);
-            if (ret == -1) 
-            {
-                total = -1;
-            } else {
-                total += ret;
-            }
-        }
-    
-        
-    }
-    
-    /* if plink is writable, AND clink is writable -> free plink block */
-    if ( ( iswritable(proot) ) && ( iswritable(croot) ) ) 
-    {
-        releaseblock(getid(proot));
-        if (ret >=0) total++;
-        //printf("   Delete %Ld\n", getid(proot));
-    }
-//printf("done : %Ld\n", getid(proot));
-    return total;
-
-}
-
-
-void print_root(u64 root, int height, FILE *dot_f)
-{
-    FILE *f;
-    int i;
-    radix_tree_node node;
-    char *style[2] = { "", "style=bold,color=blue," };
-    
-    if (dot_f == NULL) {
-        f = fopen("radix.dot", "w");
-        if (f == NULL) {
-            perror("print_root: open");
-            return;
-        }
-
-        /* write graph preamble */
-        fprintf(f, "digraph G {\n");
-
-        /* add a node for this root. */
-        fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n", 
-                getid(root), style[iswritable(root)], getid(root));
-    }
-    
-    printf("print_root(%Ld)\n", getid(root));
-    
-    /* base case */
-    if (height == 0) {
-        /* add a node and edge for each child root */
-        node = (radix_tree_node) readblock(getid(root));
-        if (node == NULL)
-            return;
-        
-        for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) {
-            if (node[i] != ZERO) {
-                fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n", 
-                        getid(node[i]), style[iswritable(node[i])], 
-                        getid(node[i]));
-                fprintf(f, "   n%Ld -> n%Ld [label=\"%d\"]\n", getid(root), 
-                        getid(node[i]), i);
-            }
-        }
-        freeblock(node);
-        return;
-    }
-
-    /* the root block may be smaller to ensure all leaves are full */
-    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
-
-    if (getid(root) == ZERO)
-        return;
-
-    node = (radix_tree_node) readblock(getid(root));
-    if (node == NULL)
-        return;
-
-    /* add a node and edge for each child root */
-    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++)
-        if (node[i] != ZERO) {
-            fprintf(f, "   n%Ld [%sshape=box,label=\"%Ld\"];\n", 
-                    getid(node[i]), style[iswritable(node[i])], 
-                    getid(node[i]));
-
-            print_root(node[i], height-RADIX_TREE_MAP_SHIFT, f);
-            fprintf(f, "   n%Ld -> n%Ld [label=\"%d\"]\n", getid(root), 
-                    getid(node[i]), i);
-        }
-
-    freeblock(node);
-    
-    /* write graph postamble */
-    if (dot_f == NULL) {
-        fprintf(f, "}\n");
-        fclose(f);
-    }
-}
-
-#ifdef RADIX_STANDALONE
-
-int main(int argc, char **argv) {
-    u64 key = ZERO, val = ZERO;
-    u64 root = writable(2ULL);
-    u64 p = ZERO, c = ZERO;
-    int v;
-    char buff[4096];
-
-    __init_blockstore();
-    
-    memset(buff, 0, 4096);
-    /*fp = open("radix.dat", O_RDWR | O_CREAT, 0644);
-
-    if (fp < 3) {
-        perror("open");
-        return -1;
-    }
-    if (lseek(fp, 0, SEEK_END) == 0) {
-        write(fp, buff, 4096);
-    }*/
-        
-    allocblock(buff);
-            
-    printf("Recognized commands:\n"
-           "Note: the LSB of a node number indicates if it is writable\n"
-           "  root <node>               set root to <node>\n"
-           "  snapshot                  take a snapshot of the root\n"
-           "  set <key> <val>           set key=val\n"
-           "  get <key>                 query key\n"
-           "  c <proot> <croot>         collapse\n"
-           "  pr                        print tree to dot\n"
-           "  pf <1=verbose>            print freelist\n"
-           "  quit\n"
-           "\nroot = %Ld\n", root);
-    for (;;) {
-        //print_root(root, 34, NULL);
-        //system("dot radix.dot -Tps -o radix.ps");
-
-        printf("> ");
-        fflush(stdout);
-        fgets(buff, 1024, stdin);
-        if (feof(stdin))
-            break;
-        if (sscanf(buff, " root %Ld", &root) == 1) {
-            printf("root set to %Ld\n", root);
-        } else if (sscanf(buff, " set %Ld %Ld", &key, &val) == 2) {
-            root = update(34, root, key, val);
-            printf("root = %Ld\n", root);
-        } else if (sscanf(buff, " c %Ld %Ld", &p, &c) == 2) {
-            v = collapse(34, p, c);
-            printf("reclaimed %d blocks.\n", v);
-        } else if (sscanf(buff, " get %Ld", &key) == 1) {
-            val = lookup(34, root, key);
-            printf("value = %Ld\n", val);
-        } else if (!strcmp(buff, "quit\n")) {
-            break;
-        } else if (!strcmp(buff, "snapshot\n")) {
-            root = snapshot(root);
-            printf("new root = %Ld\n", root);
-        } else if (sscanf(buff, " pr %Ld", &root) == 1) {
-            print_root(root, 34, NULL);
-        } else if (sscanf(buff, " pf %d", &v) == 1) {
-            freelist_count(v);
-        } else if (!strcmp(buff, "pf\n")) {
-            freelist_count(0);
-        } else {
-            printf("command not recognized\n");
-        }
-    }
-    return 0;
-}
-
-#endif
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/radix.h
--- a/tools/blktap/radix.h      Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,45 +0,0 @@
-/*
- * Radix tree for mapping (up to) 63-bit virtual block IDs to
- * 63-bit global block IDs
- *
- * Pointers within the tree set aside the least significant bit to indicate
- * whther or not the target block is writable from this node.
- *
- * The block with ID 0 is assumed to be an empty block of all zeros
- */
-
-#ifndef __RADIX_H__
-#define __RADIX_H__
-
-/* I don't really like exposing these, but... */
-#define getid(x) (((x)>>1)&0x7fffffffffffffffLL)
-#define putid(x) ((x)<<1)
-#define writable(x) (((x)<<1)|1LL)
-#define iswritable(x) ((x)&1LL)
-#define ZERO 0LL
-#define ONE 1LL
-#define ONEMASK 0xffffffffffffffeLL
-
-#define RADIX_TREE_MAP_SHIFT 9
-#define RADIX_TREE_MAP_MASK 0x1ff
-#define RADIX_TREE_MAP_ENTRIES 512
-
-typedef u64 *radix_tree_node;
-
-
-/*
- * main api
- * with these functions, the LSB of root always indicates
- * whether or not the block is writable, including the return
- * values of update and snapshot
- */
-u64 lookup(int height, u64 root, u64 key);
-u64 update(int height, u64 root, u64 key, u64 val);
-u64 snapshot(u64 root);
-int collapse(int height, u64 proot, u64 croot);
-int isprivate(int height, u64 root, u64 key);
-
-
-void __rcache_init(void);
-
-#endif /* __RADIX_H__ */
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/requests-async.c
--- a/tools/blktap/requests-async.c     Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,762 +0,0 @@
-/* requests-async.c
- *
- * asynchronous request dispatcher for radix access in parallax.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include <assert.h>
-#include <pthread.h>
-#include <err.h>
-#include <zlib.h> /* for crc32() */
-#include "requests-async.h"
-#include "vdi.h"
-#include "radix.h"
-
-/*
- * Per-level slot index within a (shifted) virtual address:
- * bits 18-26 select the L1 slot, bits 9-17 the L2 slot, bits 0-8 the L3 slot.
- */
-#define L1_IDX(_a) (((_a) & 0x0000000007fc0000ULL) >> 18)
-#define L2_IDX(_a) (((_a) & 0x000000000003fe00ULL) >> 9)
-#define L3_IDX(_a) (((_a) & 0x00000000000001ffULL))
-
-
-/* Debug tracing: switch the "#if 0" to "#if 1" to print via printf(). */
-#if 0
-#define DPRINTF(_f, _a...) printf ( _f , ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-/*
- * Per-block metadata kept in the L3 node slot immediately after the block
- * pointer (see the "&node[L3_IDX(...) + 1]" accesses below).
- */
-struct block_info {
-    u32        crc;    /* crc32 of the data block, computed in vdi_write() */
-    u32        unused; /* filled with per-path debug tags by write_cb()    */
-};
-
-/* State carried across the callbacks of one asynchronous read or write. */
-struct io_req {
-    enum { IO_OP_READ, IO_OP_WRITE } op;
-    u64        root;   /* radix root entry taken from vdi->radix_root      */
-    u64        vaddr;  /* virtual address, already shifted left by one     */
-    int        state;  /* current enum states value                        */
-    io_cb_t    cb;     /* caller's completion callback                     */
-    void      *param;  /* opaque argument handed back to cb                */
-    struct radix_lock *lock;
-
-    /* internal stuff: */
-    struct io_ret     retval;/* holds the return while we unlock. */
-    char             *block; /* the block to write */
-    /* Nodes read or created so far, indexed by L1/L2/L3; freed on completion. */
-    radix_tree_node   radix[3];
-    /* Block addresses the nodes in radix[] are written back to. */
-    u64               radix_addr[3];
-    struct block_info bi;
-};
-
-/* AND every one of the node's RADIX_TREE_MAP_ENTRIES entries with ONEMASK. */
-void clear_w_bits(radix_tree_node node)
-{
-    radix_tree_node entry = node;
-    radix_tree_node end   = node + RADIX_TREE_MAP_ENTRIES;
-
-    while (entry != end) {
-        *entry &= ONEMASK;
-        entry++;
-    }
-}
-
-/*
- * AND every even-numbered entry of an L3 node with ONEMASK.  Odd slots are
- * skipped: in L3 nodes they hold the struct block_info that belongs to the
- * preceding pointer slot.
- */
-void clear_L3_w_bits(radix_tree_node node)
-{
-    int slot = 0;
-
-    while (slot < RADIX_TREE_MAP_ENTRIES) {
-        node[slot] &= ONEMASK;
-        slot += 2;
-    }
-}
-
-/*
- * States of the read_cb()/write_cb() state machines.  req->state names the
- * step whose completion the next callback invocation will handle.
- * Suffixes: "z" = the slot at that level was ZERO (empty subtree),
- *           "f" = the slot was present but not writable (fault).
- * NOTE(review): WRITE_L2_L3z and WRITE_L2_L3f are not referenced by the
- * callbacks visible in this file - confirm whether they are still needed.
- */
-enum states {
-    /* both */
-    READ_L1,
-    READ_L2,
-    READ_L3,
-
-    /* read */
-    READ_LOCKED,
-    READ_DATA,
-    READ_UNLOCKED,
-    RETURN_ZERO,
-
-    /* write */
-    WRITE_LOCKED,
-    WRITE_DATA,
-    WRITE_L3,
-    WRITE_UNLOCKED,
-    
-    /* L3 Zero Path */
-    ALLOC_DATA_L3z,
-    WRITE_L3_L3z,
-    
-    /* L3 Fault Path */
-    ALLOC_DATA_L3f,
-    WRITE_L3_L3f,
-    
-    /* L2 Zero Path */
-    ALLOC_DATA_L2z,
-    WRITE_L2_L2z,
-    ALLOC_L3_L2z,
-    WRITE_L2_L3z,
-    
-    /* L2 Fault Path */
-    READ_L3_L2f,
-    ALLOC_DATA_L2f,
-    WRITE_L2_L2f,
-    ALLOC_L3_L2f,
-    WRITE_L2_L3f,
-
-    /* L1 Zero Path */
-    ALLOC_DATA_L1z,
-    ALLOC_L3_L1z,
-    ALLOC_L2_L1z,
-    WRITE_L1_L1z,
-
-    /* L1 Fault Path */
-    READ_L2_L1f,
-    READ_L3_L1f,
-    ALLOC_DATA_L1f,
-    ALLOC_L3_L1f,
-    ALLOC_L2_L1f,
-    WRITE_L1_L1f,
-    
-};
-
-/* Indices into io_req.radix[] and io_req.radix_addr[] for each tree level. */
-enum radix_offsets {
-    L1 = 0, 
-    L2 = 1,
-    L3 = 2
-};
-
-
-static void read_cb(struct io_ret ret, void *param);
-static void write_cb(struct io_ret ret, void *param);
-
-/*
- * Start an asynchronous read of the block at vaddr in vdi.
- *
- * Returns ERR_BAD_VADDR if vaddr fails VALID_VADDR(), ERR_NOMEM if the
- * request cannot be allocated, and 0 once the read lock has been requested.
- * In the last case the result is delivered later through cb(result, param).
- */
-int vdi_read(vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param)
-{
-    struct io_req *request;
-    u64 shifted;
-    int level;
-
-    if (!VALID_VADDR(vaddr))
-        return ERR_BAD_VADDR;
-
-    request = (struct io_req *)malloc(sizeof(*request));
-    if (request == NULL)
-        return ERR_NOMEM;
-
-    /* Every second slot in the bottom-level radix node stores crc32     */
-    /* values etc., so the address is shifted to land on pointer slots.  */
-    shifted = vaddr << 1;
-
-    for (level = 0; level < 3; level++)
-        request->radix[level] = NULL;
-
-    request->state = READ_LOCKED;
-    request->param = param;
-    request->cb    = cb;
-    request->vaddr = shifted;
-    request->lock  = vdi->radix_lock;
-    request->root  = vdi->radix_root;
-    request->op    = IO_OP_READ;
-
-    block_rlock(request->lock, L1_IDX(shifted), read_cb, request);
-
-    return 0;
-}
-
-
-/*
- * Start an asynchronous write of block to vaddr in vdi.
- *
- * Returns ERR_BAD_VADDR if vaddr fails VALID_VADDR(), ERR_NOMEM if the
- * request cannot be allocated, and 0 once the write lock has been requested.
- * In the last case the result is delivered later through cb(result, param).
- */
-int   vdi_write(vdi_t *vdi, u64 vaddr, char *block,
-                io_cb_t cb, void *param)
-{
-    struct io_req *request;
-    u64 shifted;
-    u32 crc;
-    int level;
-
-    if (!VALID_VADDR(vaddr))
-        return ERR_BAD_VADDR;
-
-    request = (struct io_req *)malloc(sizeof(*request));
-    if (request == NULL)
-        return ERR_NOMEM;
-
-    /* Every second slot in the bottom-level radix node stores crc32     */
-    /* values etc., so the address is shifted to land on pointer slots.  */
-    shifted = vaddr << 1;
-
-    for (level = 0; level < 3; level++)
-        request->radix[level] = NULL;
-
-    request->op    = IO_OP_WRITE;
-    request->root  = vdi->radix_root;
-    request->lock  = vdi->radix_lock;
-    request->vaddr = shifted;
-    request->block = block;
-
-    /* Todo: add a pseudoheader to the block to include some location   */
-    /* information in the CRC as well.                                  */
-    crc = (u32) crc32(0L, Z_NULL, 0);
-    crc = (u32) crc32(crc, block, BLOCK_SIZE);
-    request->bi.crc    = crc;
-    request->bi.unused = 0xdeadbeef;
-
-    request->cb    = cb;
-    request->param = param;
-    request->radix_addr[L1] = getid(request->root); /* for consistency */
-    request->state = WRITE_LOCKED;
-
-    block_wlock(request->lock, L1_IDX(shifted), write_cb, request);
-
-    return 0;
-}
-
-/*
- * Callback-driven state machine for an asynchronous VDI read.
- *
- * ret   - result of the step that just completed (ignored in READ_LOCKED,
- *         READ_UNLOCKED and RETURN_ZERO).
- * param - the struct io_req created by vdi_read().
- *
- * The request walks L1 -> L2 -> L3 -> data.  A ZERO pointer at any level
- * ends the walk and the caller receives a fresh block from newblock().
- * The read lock taken by vdi_read() is released before the caller's
- * callback runs, on the success path and on I/O failure alike.
- */
-static void read_cb(struct io_ret ret, void *param)
-{
-    struct io_req *req = (struct io_req *)param;
-    radix_tree_node node;
-    u64 idx;
-    char *block;
-    void *req_param;
-
-    DPRINTF("read_cb\n");
-    /* get record */
-    switch(req->state) {
-
-    case READ_LOCKED:
-
-        DPRINTF("READ_LOCKED\n");
-        req->state = READ_L1;
-        block_read(getid(req->root), read_cb, req);
-        break;
-
-    case READ_L1: /* block is the radix root */
-
-        DPRINTF("READ_L1\n");
-        block = IO_BLOCK(ret);
-        if (block == NULL) goto fail_locked;
-        node = (radix_tree_node) block;
-        idx  = getid( node[L1_IDX(req->vaddr)] );
-        free(block);
-        if ( idx == ZERO ) {
-            req->state = RETURN_ZERO;
-            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
-        } else {
-            req->state = READ_L2;
-            block_read(idx, read_cb, req);
-        }
-        break;
-
-    case READ_L2:
-
-        DPRINTF("READ_L2\n");
-        block = IO_BLOCK(ret);
-        if (block == NULL) goto fail_locked;
-        node = (radix_tree_node) block;
-        idx  = getid( node[L2_IDX(req->vaddr)] );
-        free(block);
-        if ( idx == ZERO ) {
-            req->state = RETURN_ZERO;
-            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
-        } else {
-            req->state = READ_L3;
-            block_read(idx, read_cb, req);
-        }
-        break;
-
-    case READ_L3:
-    {
-        struct block_info *bi;
-
-        DPRINTF("READ_L3\n");
-        block = IO_BLOCK(ret);
-        if (block == NULL) goto fail_locked;
-        node = (radix_tree_node) block;
-        idx  = getid( node[L3_IDX(req->vaddr)] );
-        /* The slot after the pointer carries the block's crc. */
-        bi = (struct block_info *) &node[L3_IDX(req->vaddr) + 1];
-        req->bi = *bi;
-        free(block);
-        if ( idx == ZERO )  {
-            req->state = RETURN_ZERO;
-            block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
-        } else {
-            req->state = READ_DATA;
-            block_read(idx, read_cb, req);
-        }
-        break;
-    }
-    case READ_DATA:
-    {
-        u32 crc;
-
-        DPRINTF("READ_DATA\n");
-        block = IO_BLOCK(ret);
-        if (block == NULL) goto fail_locked;
-
-        /* crc check */
-        crc = (u32) crc32(0L, Z_NULL, 0);
-        crc = (u32) crc32(crc, block, BLOCK_SIZE);
-        if (crc != req->bi.crc) {
-            /* TODO: add a retry loop here.                          */
-            /* Do this after the cache is added -- make sure to      */
-            /* invalidate the bad page before reissuing the read.    */
-
-            warn("Bad CRC on vaddr (%Lu:%d)\n", req->vaddr, req->bi.unused);
-#ifdef PRINT_BADCRC_PAGES
-            {
-                int j;
-                for (j=0; j<BLOCK_SIZE; j++) {
-                    if (isprint((unsigned char)block[j])) {
-                        printf("%c", block[j]);
-                    } else {
-                        printf(".");
-                    }
-                    if ((j % 64) == 0) printf("\n");
-                }
-            }
-#endif /* PRINT_BADCRC_PAGES */
-
-            /* fast and loose for the moment. */
-            /* goto fail;                     */
-        }
-
-        /* The data block itself is handed to the caller once unlocked. */
-        req->retval = ret;
-        req->state = READ_UNLOCKED;
-        block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
-        break;
-    }
-    case READ_UNLOCKED:
-    {
-        struct io_ret r;
-        io_cb_t cb;
-        DPRINTF("READ_UNLOCKED\n");
-        /* Copy what is needed out of req before freeing it. */
-        req_param = req->param;
-        r         = req->retval;
-        cb        = req->cb;
-        free(req);
-        cb(r, req_param);
-        break;
-    }
-
-    case RETURN_ZERO:
-    {
-        struct io_ret r;
-        io_cb_t cb;
-        DPRINTF("RETURN_ZERO\n");
-        req_param = req->param;
-        cb        = req->cb;
-        free(req);
-        r.type = IO_BLOCK_T;
-        r.u.b = newblock();
-        cb(r, req_param);
-        break;
-    }
-
-    default:
-        DPRINTF("*** Read: Bad state! (%d) ***\n", req->state);
-        goto fail;
-    }
-
-    return;
-
-    /*
-     * A block read failed while the read lock was held.  Drop the lock
-     * first; the READ_UNLOCKED state then passes the failing result to
-     * the caller.  (Previously the lock was never released on this path.)
-     */
- fail_locked:
-    DPRINTF("asyn_read had a read error.\n");
-    req->retval = ret;
-    req->state  = READ_UNLOCKED;
-    block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req);
-    return;
-
-    /* Unknown state: lock ownership is unknown, so only report back. */
- fail:
-    {
-        struct io_ret r;
-        io_cb_t cb;
-        req_param = req->param;
-        r         = ret;
-        cb        = req->cb;
-        free(req);
-        cb(r, req_param);
-    }
-}
-
-/*
- * Point the request's L3 slot at data_addr (marked writable) and store the
- * request's block_info, tagged with `tag`, in the slot that follows it.
- */
-static void fill_l3_slot(struct io_req *req, u64 data_addr, u32 tag)
-{
-    struct block_info *bi;
-
-    req->radix[L3][L3_IDX(req->vaddr)] = writable(data_addr);
-    bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
-    req->bi.unused = tag;
-    *bi = req->bi;
-}
-
-/*
- * Callback-driven state machine for an asynchronous VDI write.
- *
- * r     - result of the step that just completed.
- * param - the struct io_req created by vdi_write().
- *
- * The walk reads L1, L2 and L3.  At each level the slot is either writable
- * (continue down), ZERO ("z" paths: allocate everything below) or present
- * but not writable ("f" paths: clear the writable bits in copies of the
- * nodes below and allocate new blocks for them).  Nodes are kept in
- * req->radix[] and freed when the write completes or fails.  The write lock
- * taken by vdi_write() is released before the caller's callback runs, on
- * success and on failure alike.
- */
-static void write_cb(struct io_ret r, void *param)
-{
-    struct io_req *req = (struct io_req *)param;
-    radix_tree_node node;
-    u64 a, addr;
-    void *req_param;
-    struct block_info *bi;
-
-    switch(req->state) {
-
-    case WRITE_LOCKED:
-
-        DPRINTF("WRITE_LOCKED (%llu)\n", L1_IDX(req->vaddr));
-        req->state = READ_L1;
-        block_read(getid(req->root), write_cb, req);
-        break;
-
-    case READ_L1: /* block is the radix root */
-
-        DPRINTF("READ_L1\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        if (node == NULL) goto fail_locked;
-        a    = node[L1_IDX(req->vaddr)];
-        addr = getid(a);
-
-        req->radix_addr[L2] = addr;
-        req->radix[L1] = node;
-
-        if ( addr == ZERO ) {
-            /* L1 empty subtree: */
-            req->state = ALLOC_DATA_L1z;
-            block_alloc( req->block, write_cb, req );
-        } else if ( !iswritable(a) ) {
-            /* L1 fault: */
-            req->state = READ_L2_L1f;
-            block_read( addr, write_cb, req );
-        } else {
-            req->state = READ_L2;
-            block_read( addr, write_cb, req );
-        }
-        break;
-
-    case READ_L2:
-
-        DPRINTF("READ_L2\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        if (node == NULL) goto fail_locked;
-        a    = node[L2_IDX(req->vaddr)];
-        addr = getid(a);
-
-        req->radix_addr[L3] = addr;
-        req->radix[L2] = node;
-
-        if ( addr == ZERO ) {
-            /* L2 empty subtree: */
-            req->state = ALLOC_DATA_L2z;
-            block_alloc( req->block, write_cb, req );
-        } else if ( !iswritable(a) ) {
-            /* L2 fault: */
-            req->state = READ_L3_L2f;
-            block_read( addr, write_cb, req );
-        } else {
-            req->state = READ_L3;
-            block_read( addr, write_cb, req );
-        }
-        break;
-
-    case READ_L3:
-
-        DPRINTF("READ_L3\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        if (node == NULL) goto fail_locked;
-        a    = node[L3_IDX(req->vaddr)];
-        addr = getid(a);
-
-        req->radix[L3] = node;
-
-        if ( addr == ZERO ) {
-            /* L3 empty slot: */
-            req->state = ALLOC_DATA_L3z;
-            block_alloc( req->block, write_cb, req );
-        } else if ( !iswritable(a) ) {
-            /* L3 fault: */
-            req->state = ALLOC_DATA_L3f;
-            block_alloc( req->block, write_cb, req );
-        } else {
-            req->state = WRITE_DATA;
-            block_write( addr, req->block, write_cb, req );
-        }
-        break;
-
-    case WRITE_DATA:
-
-        DPRINTF("WRITE_DATA\n");
-        /* The L3 radix points to the correct block, we just need to  */
-        /* update the crc.                                            */
-        if (IO_INT(r) < 0) goto fail_locked;
-        bi  = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
-        req->bi.unused = 101;
-        *bi = req->bi;
-        req->state = WRITE_L3;
-        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
-        break;
-
-    /* L3 Zero Path: */
-
-    case ALLOC_DATA_L3z:
-
-        DPRINTF("ALLOC_DATA_L3z\n");
-        fill_l3_slot(req, IO_ADDR(r), 102);
-        req->state = WRITE_L3_L3z;
-        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
-        break;
-
-    /* L3 Fault Path: */
-
-    case ALLOC_DATA_L3f:
-
-        DPRINTF("ALLOC_DATA_L3f\n");
-        fill_l3_slot(req, IO_ADDR(r), 103);
-        req->state = WRITE_L3_L3f;
-        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
-        break;
-
-    /* L2 Zero Path: */
-
-    case ALLOC_DATA_L2z:
-
-        DPRINTF("ALLOC_DATA_L2z\n");
-        req->radix[L3] = newblock();
-        if (req->radix[L3] == NULL) goto fail_locked;
-        fill_l3_slot(req, IO_ADDR(r), 104);
-        req->state = ALLOC_L3_L2z;
-        block_alloc( (char*)req->radix[L3], write_cb, req );
-        break;
-
-    case ALLOC_L3_L2z:
-
-        DPRINTF("ALLOC_L3_L2z\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L2][L2_IDX(req->vaddr)] = a;
-        req->state = WRITE_L2_L2z;
-        block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
-        break;
-
-    /* L2 Fault Path: */
-
-    case READ_L3_L2f:
-
-        DPRINTF("READ_L3_L2f\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        /* Check before touching the node: the read may have failed. */
-        if (node == NULL) goto fail_locked;
-        clear_L3_w_bits(node);
-
-        req->radix[L3] = node;
-        req->state = ALLOC_DATA_L2f;
-        block_alloc( req->block, write_cb, req );
-        break;
-
-    case ALLOC_DATA_L2f:
-
-        DPRINTF("ALLOC_DATA_L2f\n");
-        fill_l3_slot(req, IO_ADDR(r), 105);
-        req->state = ALLOC_L3_L2f;
-        block_alloc( (char*)req->radix[L3], write_cb, req );
-        break;
-
-    case ALLOC_L3_L2f:
-
-        DPRINTF("ALLOC_L3_L2f\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L2][L2_IDX(req->vaddr)] = a;
-        req->state = WRITE_L2_L2f;
-        block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
-        break;
-
-    /* L1 Zero Path: */
-
-    case ALLOC_DATA_L1z:
-
-        DPRINTF("ALLOC_DATA_L1z\n");
-        req->radix[L3] = newblock();
-        if (req->radix[L3] == NULL) goto fail_locked;
-        fill_l3_slot(req, IO_ADDR(r), 106);
-        req->state = ALLOC_L3_L1z;
-        block_alloc( (char*)req->radix[L3], write_cb, req );
-        break;
-
-    case ALLOC_L3_L1z:
-
-        DPRINTF("ALLOC_L3_L1z\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L2] = newblock();
-        if (req->radix[L2] == NULL) goto fail_locked;
-        req->radix[L2][L2_IDX(req->vaddr)] = a;
-        req->state = ALLOC_L2_L1z;
-        block_alloc( (char*)req->radix[L2], write_cb, req );
-        break;
-
-    case ALLOC_L2_L1z:
-
-        DPRINTF("ALLOC_L2_L1z\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L1][L1_IDX(req->vaddr)] = a;
-        req->state = WRITE_L1_L1z;
-        block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
-        break;
-
-    /* L1 Fault Path: */
-
-    case READ_L2_L1f:
-
-        DPRINTF("READ_L2_L1f\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        /* Check before touching the node: the read may have failed. */
-        if (node == NULL) goto fail_locked;
-        clear_w_bits(node);
-        a    = node[L2_IDX(req->vaddr)];
-        addr = getid(a);
-
-        req->radix_addr[L3] = addr;
-        req->radix[L2] = node;
-
-        if (addr == ZERO) {
-            /* nothing below L2, create an empty L3 and alloc data. */
-            /* (So skip READ_L3_L1f.) */
-            req->radix[L3] = newblock();
-            if (req->radix[L3] == NULL) goto fail_locked;
-            req->state = ALLOC_DATA_L1f;
-            block_alloc( req->block, write_cb, req );
-        } else {
-            req->state = READ_L3_L1f;
-            block_read( addr, write_cb, req );
-        }
-        break;
-
-    case READ_L3_L1f:
-
-        DPRINTF("READ_L3_L1f\n");
-        node = (radix_tree_node) IO_BLOCK(r);
-        /* Check before touching the node: the read may have failed. */
-        if (node == NULL) goto fail_locked;
-        clear_L3_w_bits(node);
-
-        req->radix[L3] = node;
-        req->state = ALLOC_DATA_L1f;
-        block_alloc( req->block, write_cb, req );
-        break;
-
-    case ALLOC_DATA_L1f:
-
-        DPRINTF("ALLOC_DATA_L1f\n");
-        fill_l3_slot(req, IO_ADDR(r), 107);
-        req->state = ALLOC_L3_L1f;
-        block_alloc( (char*)req->radix[L3], write_cb, req );
-        break;
-
-    case ALLOC_L3_L1f:
-
-        DPRINTF("ALLOC_L3_L1f\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L2][L2_IDX(req->vaddr)] = a;
-        req->state = ALLOC_L2_L1f;
-        block_alloc( (char*)req->radix[L2], write_cb, req );
-        break;
-
-    case ALLOC_L2_L1f:
-
-        DPRINTF("ALLOC_L2_L1f\n");
-        addr = IO_ADDR(r);
-        a = writable(addr);
-        req->radix[L1][L1_IDX(req->vaddr)] = a;
-        req->state = WRITE_L1_L1f;
-        block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
-        break;
-
-    /* Final node write of every path has completed. */
-    case WRITE_L3:
-    case WRITE_L3_L3z:
-    case WRITE_L3_L3f:
-    case WRITE_L2_L2z:
-    case WRITE_L2_L2f:
-    case WRITE_L1_L1z:
-    case WRITE_L1_L1f:
-    {
-        int i;
-        DPRINTF("DONE\n");
-        /* free any saved node vals. */
-        for (i=0; i<3; i++)
-            if (req->radix[i] != 0) free(req->radix[i]);
-        req->retval = r;
-        req->state = WRITE_UNLOCKED;
-        block_wunlock(req->lock, L1_IDX(req->vaddr), write_cb, req);
-        break;
-    }
-    case WRITE_UNLOCKED:
-    {
-        struct io_ret result;
-        io_cb_t cb;
-        DPRINTF("WRITE_UNLOCKED!\n");
-        /* Copy what is needed out of req before freeing it. */
-        req_param = req->param;
-        result    = req->retval;
-        cb        = req->cb;
-        free(req);
-        cb(result, req_param);
-        break;
-    }
-
-    default:
-        DPRINTF("*** Write: Bad state! (%d) ***\n", req->state);
-        goto fail;
-    }
-
-    return;
-
-    /*
-     * A step failed while the write lock was held.  Free the saved nodes,
-     * then drop the lock; the WRITE_UNLOCKED state reports -1 to the
-     * caller.  (Previously the lock was never released on this path.)
-     */
- fail_locked:
-    {
-        struct io_ret err;
-        int i;
-
-        DPRINTF("asyn_write had a read error mid-way.\n");
-        err.type = IO_INT_T;
-        err.u.i  = -1;
-        for (i=0; i<3; i++) {
-            if (req->radix[i] != 0) {
-                free(req->radix[i]);
-                req->radix[i] = NULL;
-            }
-        }
-        req->retval = err;
-        req->state  = WRITE_UNLOCKED;
-        block_wunlock(req->lock, L1_IDX(req->vaddr), write_cb, req);
-        return;
-    }
-
-    /* Unknown state: lock ownership is unknown, so only report back. */
- fail:
-    {
-        struct io_ret err;
-        io_cb_t cb;
-        int i;
-
-        req_param = req->param;
-        cb        = req->cb;
-        err.type = IO_INT_T;
-        err.u.i  = -1;
-        /* free any saved node vals. */
-        for (i=0; i<3; i++)
-            if (req->radix[i] != 0) free(req->radix[i]);
-        free(req);
-        cb(err, req_param);
-    }
-}
-
-/* Completion record shared between vdi_read_s() and its callback. */
-struct read_s_ctx {
-    pthread_mutex_t mtx;
-    pthread_cond_t  cond;
-    int             done;   /* set once the callback has run */
-    char           *block;  /* block delivered by the callback */
-};
-
-/* Callback for vdi_read_s(): record the block and wake the waiter. */
-static void reads_cb(struct io_ret r, void *param)
-{
-    struct read_s_ctx *ctx = (struct read_s_ctx *)param;
-
-    pthread_mutex_lock(&ctx->mtx);
-    ctx->block = IO_BLOCK(r);
-    ctx->done  = 1;
-    pthread_cond_signal(&ctx->cond);
-    pthread_mutex_unlock(&ctx->mtx);
-}
-
-/*
- * Synchronous wrapper around vdi_read().
- *
- * Returns the block passed to the completion callback, or NULL if
- * vdi_read() refused the request.
- *
- * The wait uses a condition variable with a "done" flag instead of
- * relocking a mutex that another thread unlocks, which POSIX leaves
- * undefined for default mutexes.  The callback is an ordinary static
- * function rather than a GCC nested function.
- */
-char *vdi_read_s(vdi_t *vdi, u64 vaddr)
-{
-    struct read_s_ctx ctx;
-    int ret;
-
-    pthread_mutex_init(&ctx.mtx, NULL);
-    pthread_cond_init(&ctx.cond, NULL);
-    ctx.done  = 0;
-    ctx.block = NULL;
-
-    ret = vdi_read(vdi, vaddr, reads_cb, &ctx);
-
-    if (ret == 0) {
-        /* Works whether the callback already ran or runs later. */
-        pthread_mutex_lock(&ctx.mtx);
-        while (!ctx.done)
-            pthread_cond_wait(&ctx.cond, &ctx.mtx);
-        pthread_mutex_unlock(&ctx.mtx);
-    }
-
-    pthread_cond_destroy(&ctx.cond);
-    pthread_mutex_destroy(&ctx.mtx);
-
-    return ctx.block;
-}
-
-
-/* Completion record shared between vdi_write_s() and its callback. */
-struct write_s_ctx {
-    pthread_mutex_t mtx;
-    pthread_cond_t  cond;
-    int             done;    /* set once the callback has run */
-    int             result;  /* IO_INT() of the completion result */
-};
-
-/* Callback for vdi_write_s(): record the result and wake the waiter. */
-static void writes_cb(struct io_ret r, void *param)
-{
-    struct write_s_ctx *ctx = (struct write_s_ctx *)param;
-
-    pthread_mutex_lock(&ctx->mtx);
-    ctx->result = IO_INT(r);
-    ctx->done   = 1;
-    pthread_cond_signal(&ctx->cond);
-    pthread_mutex_unlock(&ctx->mtx);
-}
-
-/*
- * Synchronous wrapper around vdi_write().
- *
- * Returns the integer result passed to the completion callback.  If
- * vdi_write() refuses the request, its error code is returned instead;
- * previously an uninitialised value was returned in that case.
- *
- * The wait uses a condition variable with a "done" flag instead of
- * relocking a mutex that another thread unlocks, which POSIX leaves
- * undefined for default mutexes.
- */
-int vdi_write_s(vdi_t *vdi, u64 vaddr, char *block)
-{
-    struct write_s_ctx ctx;
-    int ret;
-
-    pthread_mutex_init(&ctx.mtx, NULL);
-    pthread_cond_init(&ctx.cond, NULL);
-    ctx.done   = 0;
-    ctx.result = 0;
-
-    ret = vdi_write(vdi, vaddr, block, writes_cb, &ctx);
-
-    if (ret == 0) {
-        /* Works whether the callback already ran or runs later. */
-        pthread_mutex_lock(&ctx.mtx);
-        while (!ctx.done)
-            pthread_cond_wait(&ctx.cond, &ctx.mtx);
-        pthread_mutex_unlock(&ctx.mtx);
-    } else {
-        /* No callback will run: report why the request was rejected. */
-        ctx.result = ret;
-    }
-
-    pthread_cond_destroy(&ctx.cond);
-    pthread_mutex_destroy(&ctx.mtx);
-
-    return ctx.result;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/requests-async.h
--- a/tools/blktap/requests-async.h     Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,29 +0,0 @@
-/* Asynchronous and synchronous VDI block read/write entry points. */
-#ifndef _REQUESTSASYNC_H_
-#define _REQUESTSASYNC_H_
-
-#include "block-async.h"
-#include "blockstore.h" /* for newblock etc. */
-
-/*
-#define BLOCK_SIZE 4096
-#define ZERO 0ULL
-#define getid(x) (((x)>>1)&0x7fffffffffffffffLLU)
-#define iswritable(x) (((x) & 1LLU) != 0)
-#define writable(x) (((x) << 1) | 1LLU)
-#define readonly(x) ((u64)((x) << 1))
-*/
-
-#define VADDR_MASK 0x0000000003ffffffLLU /* 26-bits = 256Gig */
-/* True when x has no bits set outside VADDR_MASK. */
-#define VALID_VADDR(x) (((x) & VADDR_MASK) == (x))
-
-/*
- * Both return 0 when the request was queued (cb is invoked later with the
- * result), or one of the ERR_* codes below when it was rejected.
- */
-int vdi_read (vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param);
-int vdi_write(vdi_t *vdi, u64 vaddr, char *block, io_cb_t cb, void *param);
-
-/* synchronous versions: */
-char *vdi_read_s (vdi_t *vdi, u64 vaddr);
-int   vdi_write_s(vdi_t *vdi, u64 vaddr, char *block);
-
-/* Parenthesised so the values stay intact inside larger expressions. */
-#define ERR_BAD_VADDR  (-1)
-#define ERR_NOMEM      (-2)
-
-#endif /* _REQUESTSASYNC_H_ */
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/snaplog.c
--- a/tools/blktap/snaplog.c    Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,238 +0,0 @@
-/**************************************************************************
- * 
- * snaplog.c
- *
- * Snapshot log on-disk data structure.
- *
- */
- 
- /* VDI histories are made from chains of snapshot logs.  These logs record 
-  * the (radix) root and timestamp of individual snapshots.
-  *
-  * creation of a new VDI involves 'forking' a snapshot log, by creating a 
-  * new, empty log (in a new VDI) and parenting it off of a record in an 
-  * existing snapshot log.
-  *
-  * snapshot log blocks have at most one writer.
-  */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "snaplog.h"
-
-
-
-/*
- * Read the given block and return it as a snapshot log block.
- * Returns NULL if the read fails or the block does not carry SNAP_MAGIC;
- * otherwise the caller releases the result with freeblock().
- */
-snap_block_t *snap_get_block(u64 block)
-{
-    snap_block_t *candidate = (snap_block_t *)readblock(block);
-
-    if ( (candidate != NULL) && (candidate->hdr.magic != SNAP_MAGIC) ) {
-        freeblock(candidate);
-        candidate = NULL;
-    }
-
-    return candidate;
-}
-    
-/*
- * Copy the snapshot record named by id into *target.
- * Returns 0 on success, and -1 if id is NULL, the block cannot be loaded,
- * or id->index is greater than the block's nr_entries.
- * NOTE(review): index == nr_entries is accepted here - confirm that this
- * matches the index convention used by snap_append().
- */
-int snap_get_id(snap_id_t *id, snap_rec_t *target)
-{
-    snap_block_t *log;
-    int status = -1;
-
-    if ( id == NULL )
-        return status;
-
-    log = snap_get_block(id->block);
-    if ( log == NULL )
-        return status;
-
-    if ( !(id->index > log->hdr.nr_entries) ) {
-        *target = log->snaps[id->index];
-        status = 0;
-    }
-
-    freeblock(log);
-    return status;
-}
-
-/*
- * Allocate a new, empty snapshot log block.
- *
- * parent_id - record this block continues from, or NULL for a root log.
- * fork_id   - where the log was forked; required when parent_id is set.
- * new_id    - receives the id (block, index 0) of the new block.
- *
- * When the parent lives in a different, non-zero block than the fork point,
- * the parent's running log_entries count is carried over.
- *
- * Returns 0 on success, -1 if an argument is missing, a block cannot be
- * obtained or loaded, or allocblock() returns 0.
- * (Validation of parent_id/fork_id through snap_get_id() is disabled.)
- */
-int __snap_block_create(snap_id_t *parent_id, snap_id_t *fork_id,
-                                  snap_id_t *new_id)
-{
-    snap_block_t *blk, *pblk;
-
-    if ( (parent_id != NULL) && (fork_id == NULL) )
-        return -1;
-
-    blk = (snap_block_t *)newblock();
-    if ( blk == NULL )
-        return -1;
-
-    blk->hdr.magic  = SNAP_MAGIC;
-    blk->hdr.nr_entries  = 0;
-    blk->hdr.log_entries = 0;
-    blk->hdr.immutable   = 0;
-
-    if (   (parent_id  != NULL)
-        && (parent_id->block != fork_id->block)
-        && (parent_id->block != 0)) {
-
-        pblk = snap_get_block(parent_id->block);
-        if ( pblk == NULL ) {
-            /* Parent block unreadable or not a snap block. */
-            freeblock(blk);
-            return -1;
-        }
-        blk->hdr.log_entries = pblk->hdr.log_entries;
-        freeblock(pblk);
-    }
-
-    if (parent_id != NULL) {
-        blk->hdr.parent_block = *parent_id;
-        blk->hdr.fork_block   = *fork_id;
-    } else {
-        blk->hdr.parent_block = null_snap_id;
-        blk->hdr.fork_block   = null_snap_id;
-    }
-
-    new_id->index = 0;
-    new_id->block = allocblock(blk);
-    freeblock(blk);
-    if (new_id->block == 0)
-        return -1;
-
-    return 0;
-}
-
-/*
- * Create a new snapshot log block whose parent and fork point are both
- * parent_id.  Returns the status of __snap_block_create().
- */
-int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id)
-{
-    snap_id_t *fork_point = parent_id;
-
-    return __snap_block_create(parent_id, fork_point, new_id);
-}
-
-/*
- * Append rec to the snapshot log block named by old_id.
- *
- * If the block is full, a new block is chained after it, the old block is
- * marked immutable and written back, and the record goes into the new one.
- * new_id receives the block written to and its nr_entries after the append.
- *
- * Returns 0 on success, -1 if rec is marked deleted, the block is immutable,
- * or a block cannot be loaded or created.  The loaded block is released on
- * every path.
- */
-int snap_append(snap_id_t *old_id, snap_rec_t *rec, snap_id_t *new_id)
-{
-    snap_id_t id = *old_id;
-    snap_block_t *blk = snap_get_block(id.block);
-
-    if ( rec->deleted == 1 ) {
-        printf("Attempt to append a deleted snapshot!\n");
-        if ( blk != NULL ) freeblock(blk);
-        return -1;
-    }
-
-    if ( blk == NULL )
-        return -1;
-
-    if ( blk->hdr.immutable != 0 ) {
-        printf("Attempt to snap an immutable snap block!\n");
-        freeblock(blk);
-        return -1;
-    }
-
-    new_id->block = id.block;
-
-    if (blk->hdr.nr_entries == SNAPS_PER_BLOCK) {
-        int ret;
-
-        id.index--; /* make id point to the last full record */
-
-        ret = __snap_block_create(&id, &blk->hdr.fork_block, new_id);
-        if ( ret != 0 ) {
-            freeblock(blk);
-            return -1;
-        }
-
-        blk->hdr.immutable = 1;
-        writeblock(id.block, blk);
-        freeblock(blk);
-        blk = snap_get_block(new_id->block);
-        if ( blk == NULL )
-            return -1;
-        id = *new_id;
-    }
-
-    blk->snaps[blk->hdr.nr_entries] = *rec;
-    blk->hdr.nr_entries++;
-    blk->hdr.log_entries++;
-    new_id->index = blk->hdr.nr_entries;
-    //printf("snap: %u %u\n", blk->hdr.nr_entries, blk->hdr.log_entries);
-    writeblock(id.block, blk);
-    freeblock(blk);
-    return 0;
-}
-
-/*
- * Collapse the parent snapshot p_id into its child c_id.
- *
- * Only consecutive records in the same log block are supported.  The parent
- * record is marked deleted and written back, then collapse() is run on the
- * two radix roots.
- *
- * Returns the value of collapse(), or -1 if a block cannot be loaded or the
- * two snapshots are not consecutive.
- */
-int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id)
-{
-    snap_block_t *p_blk, *c_blk;
-    snap_rec_t   *p_rec, *c_rec;
-    int ret = -1;
-
-    p_blk = snap_get_block(p_id->block);
-
-    if (p_blk == NULL) return(-1);
-
-    if (c_id->block == p_id->block)
-    {
-        c_blk = p_blk;
-    } else {
-         c_blk = snap_get_block(c_id->block);
-    }
-
-    /* The child block is the one that may have failed to load here. */
-    if (c_blk == NULL) {
-        freeblock(p_blk);
-        return(-1);
-    }
-
-    /* parent and child must not be deleted. */
-    p_rec = &p_blk->snaps[p_id->index];
-    c_rec = &c_blk->snaps[c_id->index];
-    /*
-    if ( (p_rec->deleted == 1) || (c_rec->deleted == 1) ) {
-        printf("One of those snaps is already deleted.\n");
-        goto done;
-    }
-    */
-    /* first non-deleted thing in the log before child must be parent. */
-
-    /* XXX todo: test the range here for delete (and eventually fork) bits) */
-    /* for now, snaps must be consecutive, on the same log page: */
-
-    if ((p_id->block != c_id->block) || (p_id->index != c_id->index-1))
-    {
-        printf("Deleting non-consecutive snaps is not done yet.\n");
-        goto done;
-    }
-
-    /* mark parent as deleted XXX: may need to lock parent block here.*/
-    p_rec->deleted = 1;
-    writeblock(p_id->block, p_blk);
-
-    /* delete the parent */
-    printf("collapse(%Ld, %Ld)\n", p_rec->radix_root, c_rec->radix_root);
-    ret = collapse(height, p_rec->radix_root, c_rec->radix_root);
-
-    /* return the number of blocks reclaimed. */
-
-done:
-    if (c_blk != p_blk) freeblock(c_blk);
-    freeblock(p_blk);
-
-    return(ret);
-}
-
-/*
- * Print the snapshot history starting at snap_id: the records from
- * snap_id->index down to 0 in its block, then the same for each parent
- * block, starting from the index stored in that block's parent link.
- * Stops at a block with no parent (parent block 0) or one that cannot be
- * loaded.
- */
-void snap_print_history(snap_id_t *snap_id)
-{
-    snap_id_t id = *snap_id;
-    unsigned int idx = id.index;
-    snap_block_t *new_blk, *blk = snap_get_block(id.block);
-
-    while ( blk ) {
-        printf("[Snap block %Ld]:\n", id.block);
-        do {
-            printf("   %03u: root: %Ld ts: %ld.%ld\n", idx,
-                    blk->snaps[idx].radix_root,
-                    blk->snaps[idx].timestamp.tv_sec,
-                    blk->snaps[idx].timestamp.tv_usec);
-        } while (idx-- != 0);
-
-        id = blk->hdr.parent_block;
-        /* No parent ends the walk; previously new_blk was left unset. */
-        new_blk = NULL;
-        if (id.block != 0) {
-            new_blk = snap_get_block(id.block);
-            /* idx wrapped below zero above: restart from the parent id. */
-            idx = id.index;
-        }
-        freeblock(blk);
-        blk = new_blk;
-    }
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/snaplog.h
--- a/tools/blktap/snaplog.h    Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,61 +0,0 @@
-/**************************************************************************
- * 
- * snaplog.h
- *
- * Snapshot log on-disk data structure.
- *
- */
- 
-#include "radix.h"
-#include "blockstore.h"    /* for BLOCK_SIZE */
- 
-#ifndef __SNAPLOG_H__
-#define __SNAPLOG_H__
-
-/* Names one record in a snapshot log: the log block and a slot index. */
-typedef struct snap_id {
-    u64            block;
-    unsigned int   index;
-} snap_id_t;
-
-/* One snapshot: the radix root it captured and when. */
-typedef struct snap_rec {
-    u64            radix_root;
-    struct timeval timestamp;
-    /* flags: */
-    unsigned       deleted:1;
-} snap_rec_t;
-
-
-/* All of the int-returning functions below return -1 on failure. */
-int  snap_block_create(snap_id_t *parent_id, snap_id_t *new_id);
-int  snap_append(snap_id_t *id, snap_rec_t *rec, snap_id_t *new_id);
-int  snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id);
-void snap_print_history(snap_id_t *snap_id);
-int  snap_get_id(snap_id_t *id, snap_rec_t *target);
-
-
-/* exported for vdi debugging */
-#define SNAP_MAGIC 0xff00ff0aa0ff00ffLL
-
-/* Used for the parent and fork links of a log block that has no parent. */
-static const snap_id_t null_snap_id = { 0, 0 }; 
-
-typedef struct snap_block_hdr {
-    u64            magic;        /* SNAP_MAGIC for a valid snap block */
-    snap_id_t      parent_block; /* parent block within this chain */
-    snap_id_t      fork_block;   /* where this log was forked */
-    unsigned       log_entries;  /* total entries since forking */
-    unsigned short nr_entries;   /* entries in snaps[] */
-    unsigned short immutable;    /* has this snap page become immutable? */
-} snap_block_hdr_t;
-
-
-/* Number of records that fit in one block after the header. */
-#define SNAPS_PER_BLOCK \
-    ((BLOCK_SIZE - sizeof(snap_block_hdr_t)) / sizeof(snap_rec_t))
-
-typedef struct snap_block {
-    snap_block_hdr_t hdr;
-    snap_rec_t       snaps[SNAPS_PER_BLOCK];
-} snap_block_t;
-    
-
-/* Load a block and check its magic; returns NULL if it is not a snap block. */
-snap_block_t *snap_get_block(u64 block);
-
-#endif /* __SNAPLOG_H__ */
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi.c
--- a/tools/blktap/vdi.c        Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,367 +0,0 @@
-/**************************************************************************
- * 
- * vdi.c
- *
- * Virtual Disk Image (VDI) Interfaces
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <fcntl.h>
-#include <string.h>
-#include <sys/time.h>
-#include <pthread.h>
-#include "blockstore.h"
-#include "block-async.h"
-#include "requests-async.h"
-#include "radix.h"
-#include "vdi.h"
-                    
-#define VDI_REG_BLOCK   2LL
-#define VDI_RADIX_ROOT  writable(3)
-                                                            
-#if 0
-#define DPRINTF(_f, _a...) printf ( _f , ## _a )
-#else
-#define DPRINTF(_f, _a...) ((void)0)
-#endif
-
-/* I haven't decided about this registry stuff, so this is just a really
- * quick lash-up so that there is some way to track VDIs.
- *
- * (Most vdi access should be with a direct handle to the block, so this
- *  registry is just for start-of-day lookup and other control operations.)
- */
-
-vdi_registry_t *create_vdi_registry(void)
-{
-    vdi_registry_t *reg = (vdi_registry_t *)newblock();
-    
-    if (reg == NULL)
-        return NULL;
-    
-    /* zero-fill the vdi radix root while we have an empty block. */
-    writeblock(VDI_RADIX_ROOT, (void *)reg);
-    
-    
-    DPRINTF("[vdi.c] Creating VDI registry!\n");
-    reg->magic      = VDI_REG_MAGIC;
-    reg->nr_vdis    = 0;
-    
-    writeblock(VDI_REG_BLOCK, (void *)reg);
-    
-    return reg;
-}
-    
-vdi_registry_t *get_vdi_registry(void)
-{
-    vdi_registry_t *vdi_reg = (vdi_registry_t *)readblock(VDI_REG_BLOCK);
-    
-    if ( vdi_reg == NULL )
-        vdi_reg = create_vdi_registry();
-    
-    if ( vdi_reg->magic != VDI_REG_MAGIC ) {
-        freeblock(vdi_reg);
-        return NULL;
-    }
-    
-    return vdi_reg;
-}
-
-
-vdi_t *vdi_create(snap_id_t *parent_snap, char *name)
-{
-    int ret;
-    vdi_t *vdi;
-    vdi_registry_t *vdi_reg;
-    snap_rec_t snap_rec;
-    
-    /* create a vdi struct */
-    vdi = newblock();
-    if (vdi == NULL) 
-        return NULL;
-    
-    if ( snap_get_id(parent_snap, &snap_rec) == 0 ) {
-        vdi->radix_root = snapshot(snap_rec.radix_root);
-    } else {
-        vdi->radix_root = allocblock((void *)vdi); /* vdi is just zeros here */
-        vdi->radix_root = writable(vdi->radix_root); /* grr. */
-    }
-    
-    /* create a snapshot log, and add it to the vdi struct */
-    
-    ret = snap_block_create(parent_snap, &vdi->snap);
-    if ( ret != 0 ) {
-        DPRINTF("Error getting snap block in vdi_create.\n");
-        freeblock(vdi);
-        return NULL;
-    }
-            
-    /* append the vdi to the registry, fill block and id.             */
-    /* implicit allocation means we have to write the vdi twice here. */
-    vdi_reg    = get_vdi_registry();
-    if ( vdi_reg == NULL ) {
-        freeblock(vdi);
-        return NULL;
-    }
-    
-    vdi->block = allocblock((void *)vdi);
-    vdi->id    = vdi_reg->nr_vdis++;
-    strncpy(vdi->name, name, VDI_NAME_SZ);
-    vdi->name[VDI_NAME_SZ] = '\0';
-    vdi->radix_lock = NULL; /* for tidiness */
-    writeblock(vdi->block, (void *)vdi);
-    
-    update(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi->id, vdi->block);
-    writeblock(VDI_REG_BLOCK, (void *)vdi_reg);
-    freeblock(vdi_reg);
-    
-    vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock));
-    if (vdi->radix_lock == NULL) 
-    {
-       perror("couldn't malloc radix_lock for new vdi!");
-       freeblock(vdi);
-       return NULL;
-    }
-    radix_lock_init(vdi->radix_lock);
-    
-    return vdi;
-}
-
-/* vdi_get and vdi_put currently act more like alloc/free -- they don't 
- * do refcount-based allocation.  
- */
-vdi_t *vdi_get(u64 vdi_id)
-{
-    u64 vdi_blk;
-    vdi_t *vdi;
-    
-    vdi_blk = lookup(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi_id);
-    
-    if ( vdi_blk == 0 )
-        return NULL;
-    
-    vdi = (vdi_t *)readblock(vdi_blk);
-    
-    vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock));
-    if (vdi->radix_lock == NULL) 
-    {
-       perror("couldn't malloc radix_lock for new vdi!");
-       freeblock(vdi);
-       return NULL;
-    }
-    radix_lock_init(vdi->radix_lock);
-    
-    return vdi;
-}
-
-void vdi_put(vdi_t *vdi)
-{
-    free(vdi->radix_lock);
-    freeblock(vdi);
-}
-
-void vdi_snapshot(vdi_t *vdi)
-{
-    snap_rec_t rec;
-    int ret;
-    
-    rec.radix_root = vdi->radix_root;
-    gettimeofday(&rec.timestamp, NULL);
-    rec.deleted = 0;
-    
-    vdi->radix_root = snapshot(vdi->radix_root);
-    ret = snap_append(&vdi->snap, &rec, &vdi->snap);
-    if ( ret != 0 ) {
-        printf("snap_append returned failure\n");
-        return;
-    }
-    writeblock(vdi->block, vdi);
-}
-    
-int __init_vdi()
-{
-    /* sneak this in here for the moment. */
-    __rcache_init();
-    
-    /* force the registry to be created if it doesn't exist. */
-    vdi_registry_t *vdi_reg = get_vdi_registry();
-    if (vdi_reg == NULL) {
-        printf("[vdi.c] Couldn't get/create a VDI registry!\n");
-        return -1;
-    }
-    freeblock(vdi_reg);
-    
-    
-    return 0;
-}
-    
-#ifdef VDI_STANDALONE
-
-#define TEST_VDIS      50
-#define NR_ITERS    50000
-#define FORK_POINTS   200
-#define INIT_VDIS       3
-#define INIT_SNAPS     40
-
-/* These must be of decreasing size: */
-#define NEW_FORK       (RAND_MAX-(RAND_MAX/1000))
-#define NEW_ROOT_VDI   (RAND_MAX-((RAND_MAX/1000)*2))
-#define NEW_FORK_VDI   (RAND_MAX-((RAND_MAX/1000)*3))
-
-#define GRAPH_DOT_FILE "vdi.dot"
-#define GRAPH_PS_FILE  "vdi.ps"
-
-
-typedef struct sh_st {
-    snap_id_t     id;
-    struct sh_st *next;
-} sh_t;
-
-#define SNAP_HASHSZ 1024
-sh_t *node_hash[SNAP_HASHSZ];
-#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ)
-
-#define SNAPID_EQUAL(_a,_b) \
-    (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
-int sh_check_and_add(snap_id_t *id)
-{
-    sh_t **s = &node_hash[SNAP_HASH(id)];
-    
-    while (*s != NULL) {
-        if (SNAPID_EQUAL(&((*s)->id), id))
-            return 1;
-        *s = (*s)->next;
-    }
-    
-    *s = (sh_t *)malloc(sizeof(sh_t));
-    (*s)->id = *id;
-    (*s)->next = NULL;
-    
-    return 0;
-}
-
-int main(int argc, char *argv[])
-{
-    vdi_t *vdi_list[TEST_VDIS];
-    snap_id_t id, fork_points[FORK_POINTS];
-    int nr_vdis = 0, nr_forks = 0;
-    int i, j, r;
-    FILE *f;
-    char name[VDI_NAME_SZ];
-    
-    __init_blockstore();
-    __init_vdi();
-    
-    printf("[o] Generating seed VDIs. (%d VDIs)\n", INIT_VDIS);
-    
-    for (i=0; i<INIT_VDIS; i++) {
-        r=rand();
-        
-        sprintf(name, "VDI Number %d", nr_vdis);
-        vdi_list[i] = vdi_create(NULL, name);
-        for (j=0; j<(r%INIT_SNAPS); j++)
-            vdi_snapshot(vdi_list[i]);
-        fork_points[i] = vdi_list[i]->snap;
-        nr_vdis++;
-        nr_forks++;
-    }
-    
-    printf("[o] Running a random workload. (%d iterations)\n", NR_ITERS);
-            
-    for (i=0; i<NR_ITERS; i++) {
-        r = rand();
-        
-        if ( r > NEW_FORK ) {
-            if ( nr_forks > FORK_POINTS )
-                continue;
-            id = vdi_list[r%nr_vdis]->snap;
-            if ( ( id.block == 0 ) || ( id.index == 0 ) )
-                continue;
-            id.index--;
-            fork_points[nr_forks++] = id;
-            
-        } else if ( r > NEW_ROOT_VDI ) {
-            
-            if ( nr_vdis == TEST_VDIS )
-                continue;
-            
-            sprintf(name, "VDI Number %d.", nr_vdis);
-            vdi_list[nr_vdis++] = vdi_create(NULL, name);
-            
-        } else if ( r > NEW_FORK_VDI ) {
-            
-            if ( nr_vdis == TEST_VDIS )
-                continue;
-            
-            sprintf(name, "VDI Number %d.", nr_vdis);
-            vdi_list[nr_vdis++] = vdi_create(&fork_points[r%nr_forks], name);
-            
-        } else /* SNAPSHOT */ {
-            
-            vdi_snapshot(vdi_list[r%nr_vdis]);
-            
-        }
-    }
-    
-    /* now dump it out to a dot file. */
-    printf("[o] Dumping state to a dot graph. (%d VDIs)\n", nr_vdis);
-    
-    f = fopen(GRAPH_DOT_FILE, "w");
-    
-    /* write graph preamble */
-    fprintf(f, "digraph G {\n");
-    fprintf(f, "   rankdir=LR\n");
-    
-    for (i=0; i<nr_vdis; i++) {
-        char oldnode[255];
-        snap_block_t *blk;
-        snap_id_t id = vdi_list[i]->snap;
-        int nr_snaps, done=0;
-        
-        /* add a node for the id */
-printf("vdi: %d\n", i);
-        fprintf(f, "   n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n", 
-                id.block, id.index, vdi_list[i]->name,
-                id.block, id.index);
-        sprintf(oldnode, "n%Ld%d", id.block, id.index);
-        
-        while (id.block != 0) {
-            blk = snap_get_block(id.block);
-            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index);
-            id = blk->hdr.fork_block;
-            
-            done = sh_check_and_add(&id);
-            
-            /* add a node for the fork_id */
-            if (!done) {
-                fprintf(f, "   n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n", 
-                    id.block, id.index,
-                    id.block, id.index);
-            }
-            
-            /* add an edge between them */
-            fprintf(f, "   n%Ld%d -> %s [label=\"%u snapshots\"]\n",
-                    id.block, id.index, oldnode, nr_snaps);
-            sprintf(oldnode, "n%Ld%d", id.block, id.index);
-            freeblock(blk);
-            
-            if (done) break;
-        }
-    }
-    
-    /* write graph postamble */
-    fprintf(f, "}\n");
-    fclose(f);
-    
-    printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE);
-    {
-        char cmd[255];
-        sprintf(cmd, "dot %s -Tps -o %s", GRAPH_DOT_FILE, GRAPH_PS_FILE);
-        system(cmd);
-    }
-    return 0;
-}
-
-#endif
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi.h
--- a/tools/blktap/vdi.h        Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,55 +0,0 @@
-#ifndef _VDI_H_
-#define _VDI_H_
-/**************************************************************************
- * 
- * vdi.h
- *
- * Virtual Disk Image (VDI) Interfaces
- *
- */
-
-#ifndef __VDI_H__
-#define __VDI_H__
-
-#include "blktaplib.h"
-#include "snaplog.h"
-
-#define VDI_HEIGHT     27 /* Note that these are now hard-coded */
-#define VDI_REG_HEIGHT 27 /* in the async lookup code           */
-
-#define VDI_NAME_SZ 256
-
-
-typedef struct vdi {
-    u64         id;               /* unique vdi id -- used by the registry   */
-    u64         block;            /* block where this vdi lives (also unique)*/
-    u64         radix_root;       /* radix root node for block mappings      */
-    snap_id_t   snap;             /* next snapshot slot for this VDI         */
-    struct vdi *next;             /* used to hash-chain in blkif.            */
-    blkif_vdev_t vdevice;         /* currently mounted as...                 */
-    struct radix_lock *radix_lock;/* per-line L1 RW lock for parallel reqs   */
-    char        name[VDI_NAME_SZ];/* human readable vdi name                 */
-} vdi_t;
-
-#define VDI_REG_MAGIC   0xff00ff0bb0ff00ffLL
-
-typedef struct vdi_registry {
-    u64     magic;
-    u64     nr_vdis;
-} vdi_registry_t;
-
-
-int __init_vdi(void);
-
-vdi_t *vdi_get(u64 vdi_id);
-void vdi_put(vdi_t *vdi);
-vdi_registry_t *get_vdi_registry(void);
-vdi_t *vdi_create(snap_id_t *parent_snap, char *name);
-u64 vdi_lookup_block(vdi_t *vdi, u64 vdi_block, int *writable);
-void vdi_update_block(vdi_t *vdi, u64 vdi_block, u64 g_block);
-void vdi_snapshot(vdi_t *vdi);
-
-
-#endif /* __VDI_H__ */
-
-#endif //_VDI_H_
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_create.c
--- a/tools/blktap/vdi_create.c Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,52 +0,0 @@
-/**************************************************************************
- * 
- * vdi_create.c
- *
- * Create a new vdi.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-int main(int argc, char *argv[])
-{
-    vdi_t       *vdi;
-    char         name[VDI_NAME_SZ] = "";
-    snap_id_t    id;
-    int          from_snap = 0;
-    
-    __init_blockstore();
-    __init_vdi();
-    
-    if ( argc == 1 ) {
-        printf("usage: %s <VDI Name> [<snap block> <snap idx>]\n", argv[0]);
-        exit(-1);
-    }
-    
-    strncpy( name, argv[1], VDI_NAME_SZ);
-    name[VDI_NAME_SZ] = '\0';    
-    
-    if ( argc > 3 ) {
-        id.block   = (u64)          atoll(argv[2]);
-        id.index   = (unsigned int) atol (argv[3]);
-        from_snap  = 1;
-    }
-    
-    vdi = vdi_create( from_snap ? &id : NULL, name);
-    
-    if ( vdi == NULL ) {
-        printf("Failed to create VDI!\n");
-        freeblock(vdi);
-        exit(-1);
-    }
-    
-    freeblock(vdi);
-    
-    return (0);
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_fill.c
--- a/tools/blktap/vdi_fill.c   Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,81 +0,0 @@
-/**************************************************************************
- * 
- * vdi_fill.c
- *
- * Hoover a file or device into a vdi.
- * You must first create the vdi with vdi_create.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "requests-async.h"
-#include "vdi.h"
-
-int main(int argc, char *argv[])
-{
-    vdi_t       *vdi;
-    u64          id;
-    int          fd;
-    struct stat  st;
-    u64          tot_size;
-    char         spage[BLOCK_SIZE];
-    char        *dpage;
-    u64          vblock = 0, count=0;
-    
-    __init_blockstore();
-    init_block_async();
-    __init_vdi();
-    
-    if ( argc < 3 ) {
-        printf("usage: %s <VDI id> <filename>\n", argv[0]);
-        exit(-1);
-    }
-        
-    id = (u64) atoll(argv[1]);
-    
-    vdi = vdi_get( id );
-    
-    if ( vdi == NULL ) {
-        printf("Failed to retreive VDI %Ld!\n", id);
-        exit(-1);
-    }
-    
-    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
-    
-    if (fd < 0) {
-        printf("Couldn't open %s!\n", argv[2]);
-        exit(-1);
-    }
-    
-    if ( fstat(fd, &st) != 0 ) {
-        printf("Couldn't stat %s!\n", argv[2]);
-        exit(-1);
-    }
-    
-    tot_size = (u64) st.st_size;
-    printf("Filling VDI %Ld with %Ld bytes.\n", id, tot_size);
-    
-    printf("%011Ld blocks total\n", tot_size / BLOCK_SIZE);    
-    printf("           ");
-    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
-        vdi_write_s(vdi, vblock, spage);
-        
-        vblock++;
-        if ((vblock % 512) == 0)
-        printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
-        fflush(stdout);
-    }
-    printf("\n");
-    
-    freeblock(vdi);
-    
-    return (0);
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_list.c
--- a/tools/blktap/vdi_list.c   Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,47 +0,0 @@
-/**************************************************************************
- * 
- * vdi_list.c
- *
- * Print a list of VDIs on the block store.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-int main(int argc, char *argv[])
-{
-    vdi_registry_t *reg;
-    vdi_t *vdi;
-    int i;
-    
-    __init_blockstore();
-    __init_vdi();
-    
-    reg = get_vdi_registry();
-    
-    if ( reg == NULL ) {
-        printf("couldn't get VDI registry.\n");
-        exit(-1);
-    }
-    
-    for (i=0; i < reg->nr_vdis; i++) {
-        vdi = vdi_get(i);
-        
-        if ( vdi != NULL ) {
-            
-            printf("%10Ld %60s\n", vdi->id, vdi->name);
-            freeblock(vdi);
-            
-        }
-    }
-    
-    freeblock(reg);
-    
-    return 0;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_snap.c
--- a/tools/blktap/vdi_snap.c   Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,43 +0,0 @@
-/**************************************************************************
- * 
- * vdi_snap.c
- *
- * Snapshot a vdi.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-int main(int argc, char *argv[])
-{
-    vdi_t  *vdi;
-    u64     id;
-    
-    __init_blockstore();
-    __init_vdi();
-    
-    if ( argc == 1 ) {
-        printf("usage: %s <VDI id>\n", argv[0]);
-        exit(-1);
-    }
-    
-    id = (u64) atoll(argv[1]);
-    
-    vdi = vdi_get(id);
-    
-    if ( vdi == NULL ) {
-        printf("couldn't find the requested VDI.\n");
-        freeblock(vdi);
-        exit(-1);
-    }
-    
-    vdi_snapshot(vdi);
-    
-    return 0;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_snap_delete.c
--- a/tools/blktap/vdi_snap_delete.c    Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,48 +0,0 @@
-/**************************************************************************
- * 
- * vdi_snap_delete.c
- *
- * Delete a snapshot.
- *
- * This is not finished:  right now it takes a snap n and calls 
- * snap_collapse(n,n+1).
- *
- * TODO: support for non-consecutive, non-same-block snaps
- *       Avoid forking probs.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "snaplog.h"
-#include "radix.h"
-#include "vdi.h"
-
-int main(int argc, char *argv[])
-{
-    snap_id_t    id, c_id;
-    int ret;
-    
-    __init_blockstore();
-    __init_vdi();
-    
-    if ( argc != 3 ) {
-        printf("usage: %s <snap block> <snap idx>\n", argv[0]);
-        exit(-1);
-    }
-    
-    id.block   = (u64)          atoll(argv[1]);
-    id.index   = (unsigned int) atol (argv[2]);
-    
-    c_id = id;
-    c_id.index++;
-    
-    ret = snap_collapse(VDI_HEIGHT, &id, &c_id);
-    
-    printf("Freed %d blocks.\n", ret);
-    
-    return 0;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_snap_list.c
--- a/tools/blktap/vdi_snap_list.c      Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,82 +0,0 @@
-/**************************************************************************
- * 
- * vdi_snap_list.c
- *
- * Print a list of snapshots for the specified vdi.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-int main(int argc, char *argv[])
-{
-    vdi_t        *vdi;
-    u64           id;
-    int           i, max_snaps = -1;
-    snap_block_t *blk;
-    snap_id_t     sid;
-    char         *t;
-    
-    __init_blockstore();
-    __init_vdi();
-    
-    if ( argc == 1 ) {
-        printf("usage: %s <VDI id> [max snaps]\n", argv[0]);
-        exit(-1);
-    }
-    
-    id = (u64) atoll(argv[1]);
-    
-    if ( argc > 2 ) {
-        max_snaps = atoi(argv[2]);
-    }
-    
-    vdi = vdi_get(id);
-    
-    if ( vdi == NULL ) {
-        printf("couldn't find the requested VDI.\n");
-        freeblock(vdi);
-        exit(-1);
-    }
-    
-    sid = vdi->snap;
-    sid.index--;
-    
-    //printf("%8s%4s%21s %12s %1s\n", "Block", "idx", "timestamp", 
-    //    "radix root", "d");
-    printf("%8s%4s%37s %12s %1s\n", "Block", "idx", "timestamp", 
-            "radix root", "d");
-     
-    while (sid.block != 0) {
-        blk = snap_get_block(sid.block);
-        for (i = sid.index; i >= 0; i--) {
-            if ( max_snaps == 0  ) {
-                freeblock(blk);
-                goto done;
-            }
-            t = ctime(&blk->snaps[i].timestamp.tv_sec);
-            t[strlen(t)-1] = '\0';
-            //printf("%8Ld%4u%14lu.%06lu %12Ld %1s\n",
-            printf("%8Ld%4u%30s %06lu %12Ld %1s\n",
-                    sid.block, i, 
-                    //blk->snaps[i].timestamp.tv_sec,
-                    t,
-                    blk->snaps[i].timestamp.tv_usec,
-                    blk->snaps[i].radix_root,
-                    blk->snaps[i].deleted ? "*" : " ");
-            if ( max_snaps != -1 ) 
-                max_snaps--;
-        }
-        sid = blk->hdr.parent_block;
-        freeblock(blk);
-    }
-done:            
-    return 0;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_tree.c
--- a/tools/blktap/vdi_tree.c   Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,132 +0,0 @@
-/**************************************************************************
- * 
- * vdi_tree.c
- *
- * Output current vdi tree to dot and postscript.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-#define GRAPH_DOT_FILE "vdi.dot"
-#define GRAPH_PS_FILE  "vdi.ps"
-
-typedef struct sh_st {
-    snap_id_t     id;
-    struct sh_st *next;
-} sh_t;
-
-#define SNAP_HASHSZ 1024
-sh_t *node_hash[SNAP_HASHSZ];
-#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ)
-
-#define SNAPID_EQUAL(_a,_b) \
-    (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
-int sh_check_and_add(snap_id_t *id)
-{
-    sh_t **s = &node_hash[SNAP_HASH(id)];
-    
-    while (*s != NULL) {
-        if (SNAPID_EQUAL(&((*s)->id), id))
-            return 1;
-        *s = (*s)->next;
-    }
-    
-    *s = (sh_t *)malloc(sizeof(sh_t));
-    (*s)->id = *id;
-    (*s)->next = NULL;
-    
-    return 0;
-}
-
-int main(int argc, char *argv[])
-{
-    FILE *f;
-    char dot_file[255] = GRAPH_DOT_FILE;
-    char  ps_file[255] = GRAPH_PS_FILE;
-    int nr_vdis = 0, nr_forks = 0;
-    vdi_registry_t *reg;
-    vdi_t *vdi;
-    int i;
-    
-    __init_blockstore();
-    __init_vdi();
-    
-    reg = get_vdi_registry();
-    
-    if ( reg == NULL ) {
-        printf("couldn't get VDI registry.\n");
-        exit(-1);
-    }
-    
-    if ( argc > 1 ) {
-        strncpy(ps_file, argv[1], 255);
-        ps_file[255] = '\0';
-    }
-    
-    /* now dump it out to a dot file. */
-    printf("[o] Dumping state to a dot graph. (%d VDIs)\n", nr_vdis);
-    
-    f = fopen(dot_file, "w");
-    
-    /* write graph preamble */
-    fprintf(f, "digraph G {\n");
-    fprintf(f, "   rankdir=LR\n");
-    
-    for (i=0; i<reg->nr_vdis; i++) {
-        char oldnode[255];
-        snap_block_t *blk;
-        snap_id_t id;
-        int nr_snaps, done=0;
-        
-        vdi = vdi_get(i);
-        id = vdi->snap;
-        /* add a node for the id */
-printf("vdi: %d\n", i);
-        fprintf(f, "   n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n", 
-                id.block, id.index, vdi->name,
-                id.block, id.index);
-        sprintf(oldnode, "n%Ld%d", id.block, id.index);
-        
-        while (id.block != 0) {
-            blk = snap_get_block(id.block);
-            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index);
-            id = blk->hdr.fork_block;
-            
-            done = sh_check_and_add(&id);
-            
-            /* add a node for the fork_id */
-            if (!done) {
-                fprintf(f, "   n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n", 
-                    id.block, id.index,
-                    id.block, id.index);
-            }
-            
-            /* add an edge between them */
-            fprintf(f, "   n%Ld%d -> %s [label=\"%u snapshots\"]\n",
-                    id.block, id.index, oldnode, nr_snaps);
-            sprintf(oldnode, "n%Ld%d", id.block, id.index);
-            freeblock(blk);
-            
-            if (done) break;
-        }
-    }
-    
-    /* write graph postamble */
-    fprintf(f, "}\n");
-    fclose(f);
-    
-    printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE);
-    {
-        char cmd[255];
-        sprintf(cmd, "dot %s -Tps -o %s", dot_file, ps_file);
-        system(cmd);
-    }
-    return 0;
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_unittest.c
--- a/tools/blktap/vdi_unittest.c       Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,184 +0,0 @@
-/**************************************************************************
- * 
- * vdi_unittest.c
- *
- * Run a small test workload to ensure that data access through a vdi
- * is (at least superficially) correct.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include "requests-async.h"
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-
-#define TEST_PAGES  32
-static char *zero_page;
-static char pages[TEST_PAGES][BLOCK_SIZE];
-static int next_page = 0;
-
-void fill_test_pages(void)
-{
-    int i, j;
-    long *page;
-
-    for (i=0; i< TEST_PAGES; i++) {
-        page = (unsigned long *)pages[i];
-        for (j=0; j<(BLOCK_SIZE/4); j++) {
-            page[j] = random();
-        }
-    }
-
-    zero_page = newblock();
-}
-
-inline u64 make_vaddr(u64 L1, u64 L2, u64 L3)
-{
-    u64 ret = L1;
-
-    ret = (ret << 9) | L2;
-    ret = (ret << 9) | L3;
-
-    return ret;
-}
-
-void touch_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3)
-{
-    u64 vaddr;
-    char *page = pages[next_page++];
-    char *rpage = NULL;
-
-    printf("TOUCH (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
-
-    vaddr = make_vaddr(L1, L2, L3);
-    vdi_write_s(vdi, vaddr, page);
-    rpage = vdi_read_s(vdi, vaddr);
-
-    if (rpage == NULL) 
-    {
-        printf( "read %Lu returned NULL\n", vaddr); 
-        return; 
-    }
-
-    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
-    {
-        printf( "read %Lu returned a different page\n", vaddr);
-        return;
-    }
-
-    freeblock(rpage);
-}
-
-void test_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3, char *page)
-{
-    u64 vaddr;
-    char *rpage = NULL;
-
-    printf("TEST  (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
-
-    vaddr = make_vaddr(L1, L2, L3);
-    rpage = vdi_read_s(vdi, vaddr);
-
-    if (rpage == NULL) 
-    {
-        printf( "read %Lu returned NULL\n", vaddr); 
-        return; 
-    }
-
-    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
-    {
-        printf( "read %Lu returned a different page\n", vaddr);
-        return;
-    }
-
-    freeblock(rpage);
-}
-
-void coverage_test(vdi_t *vdi)
-{
-    u64 vaddr;
-    int i, j, k;
-
-    /* Do a series of writes and reads to test all paths through the 
-     * async radix code.  The radix request code will dump CRC warnings
-     * if there are data problems here as well.
-     */
-
-    /* L1 Zero */
-    touch_block(vdi, 0, 0, 0);
-
-    /* L2 Zero */
-    i = next_page;
-    touch_block(vdi, 0, 1, 0);
-
-    /* L3 Zero */
-    j = next_page;
-    touch_block(vdi, 0, 0, 1);
-    k = next_page;
-    touch_block(vdi, 0, 1, 1);
-
-    /* Direct write */
-    touch_block(vdi, 0, 0, 0);
-
-    vdi_snapshot(vdi);
-
-    /* L1 fault */
-    touch_block(vdi, 0, 0, 0);
-    /* test the read-only branches that should have been copied over. */
-    test_block(vdi, 0, 1, 0, pages[i]);
-    test_block(vdi, 0, 0, 1, pages[j]);
-
-    /* L2 fault */
-    touch_block(vdi, 0, 1, 0);
-    test_block(vdi, 0, 1, 1, pages[k]);
-
-    /* L3 fault */
-    touch_block(vdi, 0, 0, 1);
-    
-    /* read - L1 zero */
-    test_block(vdi, 1, 0, 0, zero_page);
-    
-    /* read - L2 zero */
-    test_block(vdi, 0, 2, 0, zero_page);
-
-    /* read - L3 zero */
-    test_block(vdi, 0, 0, 2, zero_page);
-}
-
-int main(int argc, char *argv[])
-{
-    vdi_t       *vdi;
-    u64          id;
-    int          fd;
-    struct stat  st;
-    u64          tot_size;
-    char         spage[BLOCK_SIZE];
-    char        *dpage;
-    u64          vblock = 0, count=0;
-    
-    __init_blockstore();
-    init_block_async();
-    __init_vdi();
-        
-    vdi = vdi_create( NULL, "UNIT TEST VDI");
-    
-    if ( vdi == NULL ) {
-        printf("Failed to create VDI!\n");
-        freeblock(vdi);
-        exit(-1);
-    }
-
-    fill_test_pages();
-    coverage_test(vdi);
-    
-    freeblock(vdi);
-    
-    return (0);
-}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_validate.c
--- a/tools/blktap/vdi_validate.c       Sun Jul  3 22:32:52 2005
+++ /dev/null   Sun Jul  3 22:36:48 2005
@@ -1,97 +0,0 @@
-/**************************************************************************
- * 
- * vdi_validate.c
- *
- * Intended to sanity-check vm_fill and the underlying vdi code.
- *
- * Block-by-block compare of a vdi with a file/device on the disk.
- *
- */
- 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include "blockstore.h"
-#include "radix.h"
-#include "vdi.h"
-#include "requests-async.h"
-
-int main(int argc, char *argv[])
-{
-    vdi_t       *vdi;
-    u64          id;
-    int          fd;
-    struct stat  st;
-    u64          tot_size;
-    char         spage[BLOCK_SIZE], *dpage;
-    char        *vpage;
-    u64          vblock = 0, count=0;
-    
-    __init_blockstore();
-    init_block_async();
-    __init_vdi();
-    
-    if ( argc < 3 ) {
-        printf("usage: %s <VDI id> <filename>\n", argv[0]);
-        exit(-1);
-    }
-        
-    id = (u64) atoll(argv[1]);
-    
-    vdi = vdi_get( id );
-    
-    if ( vdi == NULL ) {
-        printf("Failed to retreive VDI %Ld!\n", id);
-        exit(-1);
-    }
-    
-    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
-    
-    if (fd < 0) {
-        printf("Couldn't open %s!\n", argv[2]);
-        exit(-1);
-    }
-    
-    if ( fstat(fd, &st) != 0 ) {
-        printf("Couldn't stat %s!\n", argv[2]);
-        exit(-1);
-    }
-    
-    tot_size = (u64) st.st_size;
-    printf("Testing VDI %Ld (%Ld bytes).\n", id, tot_size);
-    
-    printf("           ");
-    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
-
-        dpage = vdi_read_s(vdi, vblock);
-
-        if (dpage == NULL) {
-            printf("\n\nfound an unmapped VDI block (%Ld)\n", vblock);
-            exit(0);
-        }
-
-        if (memcmp(spage, dpage, BLOCK_SIZE) != 0) {
-            printf("\n\nblocks don't match! (%Ld)\n", vblock);
-            exit(0);
-        }
-        
-        freeblock(dpage);
-        
-        vblock++;
-        if ((vblock % 1024) == 0) {
-            printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
-            fflush(stdout);
-        }
-    }
-    printf("\n");
-    
-    printf("VDI %Ld looks good!\n", id);
-    
-    freeblock(vdi);
-    
-    return (0);
-}

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.