[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-changelog] Manual merge.
# HG changeset patch # User kaf24@xxxxxxxxxxxxxxxxxxxx # Node ID f8acd354e1295226fbda14aaf8bd164e07b93742 # Parent 80d5dd14711eccf379e475000f3b156df286d279 # Parent 09067ce923038c4ba6dcb9630fb848cce0d1c5fa Manual merge. diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/Makefile --- a/tools/blktap/Makefile Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/Makefile Sun Jul 3 22:36:48 2005 @@ -2,43 +2,24 @@ MINOR = 0 SONAME = libblktap.so.$(MAJOR) -CC = gcc - XEN_ROOT = ../.. include $(XEN_ROOT)/tools/Rules.mk -BLKTAP_INSTALL_DIR = /usr/sbin +SUBDIRS := +SUBDIRS += parallax -INSTALL = install -INSTALL_PROG = $(INSTALL) -m0755 -INSTALL_DIR = $(INSTALL) -d -m0755 +BLKTAP_INSTALL_DIR = /usr/sbin -INCLUDES += +INSTALL = install +INSTALL_PROG = $(INSTALL) -m0755 +INSTALL_DIR = $(INSTALL) -d -m0755 + +INCLUDES += -I. -I $(XEN_LIBXC) LIBS := -lpthread -lz SRCS := SRCS += blktaplib.c - -PLX_SRCS := -PLX_SRCS += vdi.c -PLX_SRCS += radix.c -PLX_SRCS += snaplog.c -PLX_SRCS += blockstore.c -PLX_SRCS += block-async.c -PLX_SRCS += requests-async.c -VDI_SRCS := $(PLX_SRCS) -PLX_SRCS += parallax.c - -VDI_TOOLS := -VDI_TOOLS += vdi_create -VDI_TOOLS += vdi_list -VDI_TOOLS += vdi_snap -VDI_TOOLS += vdi_snap_list -VDI_TOOLS += vdi_snap_delete -VDI_TOOLS += vdi_fill -VDI_TOOLS += vdi_tree -VDI_TOOLS += vdi_validate CFLAGS += -Wall CFLAGS += -Werror @@ -46,20 +27,21 @@ #CFLAGS += -O3 CFLAGS += -g3 CFLAGS += -fno-strict-aliasing -CFLAGS += -I $(XEN_LIBXC) -CFLAGS += $(INCLUDES) -I. CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE # Get gcc to generate the dependencies for us. 
CFLAGS += -Wp,-MD,.$(@F).d +CFLAGS += $(INCLUDES) DEPS = .*.d OBJS = $(patsubst %.c,%.o,$(SRCS)) -IBINS = blkdump parallax $(VDI_TOOLS) +IBINS = blkdump LIB = libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR) -all: mk-symlinks blkdump $(VDI_TOOLS) parallax blockstored - $(MAKE) $(LIB) +all: mk-symlinks libblktap.so blkdump + @set -e; for subdir in $(SUBDIRS); do \ + $(MAKE) -C $$subdir $@; \ + done LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse) mk-symlinks: @@ -77,10 +59,16 @@ $(INSTALL_DIR) -p $(DESTDIR)/usr/include $(INSTALL_PROG) $(LIB) $(DESTDIR)/usr/$(LIBDIR) $(INSTALL_PROG) blktaplib.h $(DESTDIR)/usr/include - $(INSTALL_PROG) $(IBINS) $(DESTDIR)/$(BLKTAP_INSTALL_DIR) + $(INSTALL_PROG) $(IBINS) $(DESTDIR)$(BLKTAP_INSTALL_DIR) + @set -e; for subdir in $(SUBDIRS); do \ + $(MAKE) -C $$subdir $@; \ + done clean: - rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump $(VDI_TOOLS) parallax vdi_unittest + rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump + @set -e; for subdir in $(SUBDIRS); do \ + $(MAKE) -C $$subdir $@; \ + done rpm: all rm -rf staging @@ -91,52 +79,17 @@ mv staging/i386/*.rpm . rm -rf staging -libblktap.so: +libblktap.so: $(OBJS) + $(CC) $(CFLAGS) -Wl,-soname -Wl,$(SONAME) -shared -o \ + libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS) + ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR) ln -sf libblktap.so.$(MAJOR) $@ -libblktap.so.$(MAJOR): - ln -sf libblktap.so.$(MAJOR).$(MINOR) $@ -libblktap.so.$(MAJOR).$(MINOR): $(OBJS) - $(CC) -Wl,-soname -Wl,$(SONAME) -shared -o $@ $^ $(LIBS) -blkdump: $(LIB) +blkdump: libblktap.so $(CC) $(CFLAGS) -o blkdump -L$(XEN_LIBXC) -L. -l blktap blkdump.c -parallax: $(LIB) $(PLX_SRCS) - $(CC) $(CFLAGS) -o parallax -L$(XEN_LIBXC) -L. 
-lblktap $(LIBS) $(PLX_SRCS) +.PHONY: TAGS clean install mk-symlinks rpm -vdi_list: $(LIB) vdi_list.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_list vdi_list.c $(LIBS) $(VDI_SRCS) - -vdi_create: $(LIB) vdi_create.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_create vdi_create.c $(LIBS) $(VDI_SRCS) - -vdi_snap: $(LIB) vdi_snap.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_snap vdi_snap.c $(LIBS) $(VDI_SRCS) - -vdi_snap_list: $(LIB) vdi_snap_list.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_snap_list vdi_snap_list.c $(LIBS) $(VDI_SRCS) - -vdi_snap_delete: $(LIB) vdi_snap_delete.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_snap_delete vdi_snap_delete.c $(LIBS) $(VDI_SRCS) - -vdi_tree: $(LIB) vdi_tree.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_tree vdi_tree.c $(LIBS) $(VDI_SRCS) - -vdi_fill: $(LIB) vdi_fill.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_fill vdi_fill.c $(LIBS) $(VDI_SRCS) - -vdi_validate: $(LIB) vdi_validate.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_validate vdi_validate.c $(LIBS) $(VDI_SRCS) - -vdi_unittest: $(LIB) vdi_unittest.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_unittest vdi_unittest.c $(LIBS) $(VDI_SRCS) - -blockstored: blockstored.c - $(CC) $(CFLAGS) -g3 -o blockstored $(LIBS) blockstored.c -bstest: bstest.c blockstore.c - $(CC) $(CFLAGS) -g3 -o bstest bstest.c $(LIBS) blockstore.c - -.PHONY: TAGS clean install mk-symlinks rpm TAGS: etags -t $(SRCS) *.h diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_tree.c --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/vdi_tree.c Sun Jul 3 22:36:48 2005 @@ -0,0 +1,132 @@ +/************************************************************************** + * + * vdi_tree.c + * + * Output current vdi tree to dot and postscript. 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/time.h> +#include "blockstore.h" +#include "radix.h" +#include "vdi.h" + +#define GRAPH_DOT_FILE "vdi.dot" +#define GRAPH_PS_FILE "vdi.ps" + +typedef struct sh_st { + snap_id_t id; + struct sh_st *next; +} sh_t; + +#define SNAP_HASHSZ 1024 +sh_t *node_hash[SNAP_HASHSZ]; +#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ) + +#define SNAPID_EQUAL(_a,_b) \ + (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index)) +int sh_check_and_add(snap_id_t *id) +{ + sh_t **s = &node_hash[SNAP_HASH(id)]; + + while (*s != NULL) { + if (SNAPID_EQUAL(&((*s)->id), id)) + return 1; + *s = (*s)->next; + } + + *s = (sh_t *)malloc(sizeof(sh_t)); + (*s)->id = *id; + (*s)->next = NULL; + + return 0; +} + +int main(int argc, char *argv[]) +{ + FILE *f; + char dot_file[255] = GRAPH_DOT_FILE; + char ps_file[255] = GRAPH_PS_FILE; + int nr_vdis = 0, nr_forks = 0; + vdi_registry_t *reg; + vdi_t *vdi; + int i; + + __init_blockstore(); + __init_vdi(); + + reg = get_vdi_registry(); + + if ( reg == NULL ) { + printf("couldn't get VDI registry.\n"); + exit(-1); + } + + if ( argc > 1 ) { + strncpy(ps_file, argv[1], 255); + ps_file[255] = '\0'; + } + + /* now dump it out to a dot file. */ + printf("[o] Dumping state to a dot graph. 
(%d VDIs)\n", nr_vdis); + + f = fopen(dot_file, "w"); + + /* write graph preamble */ + fprintf(f, "digraph G {\n"); + fprintf(f, " rankdir=LR\n"); + + for (i=0; i<reg->nr_vdis; i++) { + char oldnode[255]; + snap_block_t *blk; + snap_id_t id; + int nr_snaps, done=0; + + vdi = vdi_get(i); + id = vdi->snap; + /* add a node for the id */ +printf("vdi: %d\n", i); + fprintf(f, " n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n", + id.block, id.index, vdi->name, + id.block, id.index); + sprintf(oldnode, "n%Ld%d", id.block, id.index); + + while (id.block != 0) { + blk = snap_get_block(id.block); + nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index); + id = blk->hdr.fork_block; + + done = sh_check_and_add(&id); + + /* add a node for the fork_id */ + if (!done) { + fprintf(f, " n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n", + id.block, id.index, + id.block, id.index); + } + + /* add an edge between them */ + fprintf(f, " n%Ld%d -> %s [label=\"%u snapshots\"]\n", + id.block, id.index, oldnode, nr_snaps); + sprintf(oldnode, "n%Ld%d", id.block, id.index); + freeblock(blk); + + if (done) break; + } + } + + /* write graph postamble */ + fprintf(f, "}\n"); + fclose(f); + + printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE); + { + char cmd[255]; + sprintf(cmd, "dot %s -Tps -o %s", dot_file, ps_file); + system(cmd); + } + return 0; +} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/snaplog.c --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/snaplog.c Sun Jul 3 22:36:48 2005 @@ -0,0 +1,238 @@ +/************************************************************************** + * + * snaplog.c + * + * Snapshot log on-disk data structure. + * + */ + + /* VDI histories are made from chains of snapshot logs. These logs record + * the (radix) root and timestamp of individual snapshots. 
+ * + * creation of a new VDI involves 'forking' a snapshot log, by creating a + * new, empty log (in a new VDI) and parenting it off of a record in an + * existing snapshot log. + * + * snapshot log blocks have at most one writer. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <sys/time.h> +#include "blockstore.h" +#include "snaplog.h" + + + +snap_block_t *snap_get_block(u64 block) +{ + snap_block_t *blk = (snap_block_t *)readblock(block); + + if ( blk == NULL) + return NULL; + if ( blk->hdr.magic != SNAP_MAGIC ) { + freeblock(blk); + return NULL; + } + + return blk; +} + +int snap_get_id(snap_id_t *id, snap_rec_t *target) +{ + snap_block_t *blk; + + if ( id == NULL ) + return -1; + + blk = snap_get_block(id->block); + + if ( blk == NULL ) + return -1; + + if ( id->index > blk->hdr.nr_entries ) { + freeblock(blk); + return -1; + } + + *target = blk->snaps[id->index]; + freeblock(blk); + return 0; +} + +int __snap_block_create(snap_id_t *parent_id, snap_id_t *fork_id, + snap_id_t *new_id) +{ + snap_rec_t parent_rec, fork_rec; + snap_block_t *blk, *pblk; + /* + if ( (parent_id != NULL) && (snap_get_id(parent_id, &parent_rec) != 0) ) + return -1; + + if ( (fork_id != NULL) && (snap_get_id(fork_id, &fork_rec) != 0) ) + return -1; +*/ + blk = (snap_block_t *)newblock(); + blk->hdr.magic = SNAP_MAGIC; + blk->hdr.nr_entries = 0; + blk->hdr.log_entries = 0; + blk->hdr.immutable = 0; + + if ( (parent_id != NULL) + && (parent_id->block != fork_id->block) + && (parent_id->block != 0)) { + + pblk = snap_get_block(parent_id->block); + blk->hdr.log_entries = pblk->hdr.log_entries; + freeblock(pblk); + } + + if (parent_id != NULL) { + blk->hdr.parent_block = *parent_id; + blk->hdr.fork_block = *fork_id; + } else { + blk->hdr.parent_block = null_snap_id; + blk->hdr.fork_block = null_snap_id; + } + + new_id->index = 0; + new_id->block = allocblock(blk); + freeblock(blk); + if (new_id->block == 0) + return -1; + + return 0; +} + +int snap_block_create(snap_id_t 
*parent_id, snap_id_t *new_id) +{ + return __snap_block_create(parent_id, parent_id, new_id); +} + +int snap_append(snap_id_t *old_id, snap_rec_t *rec, snap_id_t *new_id) +{ + snap_id_t id = *old_id; + snap_block_t *blk = snap_get_block(id.block); + + if ( rec->deleted == 1 ) { + printf("Attempt to append a deleted snapshot!\n"); + return -1; + } + + if ( blk->hdr.immutable != 0 ) { + printf("Attempt to snap an immutable snap block!\n"); + return -1; + } + + new_id->block = id.block; + + if (blk->hdr.nr_entries == SNAPS_PER_BLOCK) { + int ret; + + id.index--; /* make id point to the last full record */ + + ret = __snap_block_create(&id, &blk->hdr.fork_block, new_id); + if ( ret != 0 ) { + freeblock(blk); + return -1; + } + + blk->hdr.immutable = 1; + writeblock(id.block, blk); + freeblock(blk); + blk = snap_get_block(new_id->block); + id = *new_id; + } + + blk->snaps[blk->hdr.nr_entries] = *rec; + blk->hdr.nr_entries++; + blk->hdr.log_entries++; + new_id->index = blk->hdr.nr_entries; + //printf("snap: %u %u\n", blk->hdr.nr_entries, blk->hdr.log_entries); + writeblock(id.block, blk); + freeblock(blk); + return 0; +} + +int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id) +{ + snap_block_t *p_blk, *c_blk, *blk; + snap_rec_t *p_rec, *c_rec; + int ret = -1; + + p_blk = snap_get_block(p_id->block); + + if (p_blk == NULL) return(-1); + + if (c_id->block == p_id->block) + { + c_blk = p_blk; + } else { + c_blk = snap_get_block(c_id->block); + } + + if (p_blk == NULL) { + freeblock(p_blk); + return(-1); + } + + /* parent and child must not be deleted. */ + p_rec = &p_blk->snaps[p_id->index]; + c_rec = &c_blk->snaps[c_id->index]; + /* + if ( (p_rec->deleted == 1) || (c_rec->deleted == 1) ) { + printf("One of those snaps is already deleted.\n"); + goto done; + } + */ + /* first non-deleted thing in the log before child must be parent. 
*/ + + /* XXX todo: text the range here for delete (and eventually fork) bits) */ + /* for now, snaps must be consecutive, on the same log page: */ + + if ((p_id->block != c_id->block) || (p_id->index != c_id->index-1)) + { + printf("Deleting non-consecutive snaps is not done yet.\n"); + goto done; + } + + /* mark parent as deleted XXX: may need to lock parent block here.*/ + p_rec->deleted = 1; + writeblock(p_id->block, p_blk); + + /* delete the parent */ + printf("collapse(%Ld, %Ld)\n", p_rec->radix_root, c_rec->radix_root); + ret = collapse(height, p_rec->radix_root, c_rec->radix_root); + + /* return the number of blocks reclaimed. */ + +done: + if (c_blk != p_blk) freeblock(c_blk); + freeblock(p_blk); + + return(ret); +} + +void snap_print_history(snap_id_t *snap_id) +{ + snap_id_t id = *snap_id; + unsigned int idx = id.index; + snap_block_t *new_blk, *blk = snap_get_block(id.block); + + while ( blk ) { + printf("[Snap block %Ld]:\n", id.block); + do { + printf(" %03u: root: %Ld ts: %ld.%ld\n", idx, + blk->snaps[idx].radix_root, + blk->snaps[idx].timestamp.tv_sec, + blk->snaps[idx].timestamp.tv_usec); + } while (idx-- != 0); + + id = blk->hdr.parent_block; + if (id.block != 0) { + new_blk = snap_get_block(id.block); + } + freeblock(blk); + blk = new_blk; + } +} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/snaplog.h --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/snaplog.h Sun Jul 3 22:36:48 2005 @@ -0,0 +1,61 @@ +/************************************************************************** + * + * snaplog.h + * + * Snapshot log on-disk data structure. 
+ * + */ + +#include "radix.h" +#include "blockstore.h" /* for BLOCK_SIZE */ + +#ifndef __SNAPLOG_H__ +#define __SNAPLOG_H__ + +typedef struct snap_id { + u64 block; + unsigned int index; +} snap_id_t; + +typedef struct snap_rec { + u64 radix_root; + struct timeval timestamp; + /* flags: */ + unsigned deleted:1; +} snap_rec_t; + + +int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id); +int snap_append(snap_id_t *id, snap_rec_t *rec, snap_id_t *new_id); +int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id); +void snap_print_history(snap_id_t *snap_id); +int snap_get_id(snap_id_t *id, snap_rec_t *target); + + +/* exported for vdi debugging */ +#define SNAP_MAGIC 0xff00ff0aa0ff00ffLL + +static const snap_id_t null_snap_id = { 0, 0 }; + +typedef struct snap_block_hdr { + u64 magic; + snap_id_t parent_block; /* parent block within this chain */ + snap_id_t fork_block; /* where this log was forked */ + unsigned log_entries; /* total entries since forking */ + unsigned short nr_entries; /* entries in snaps[] */ + unsigned short immutable; /* has this snap page become immutable? */ +} snap_block_hdr_t; + + +#define SNAPS_PER_BLOCK \ + ((BLOCK_SIZE - sizeof(snap_block_hdr_t)) / sizeof(snap_rec_t)) + +typedef struct snap_block { + snap_block_hdr_t hdr; + snap_rec_t snaps[SNAPS_PER_BLOCK]; +} snap_block_t; + + +snap_block_t *snap_get_block(u64 block); + +#endif /* __SNAPLOG_H__ */ diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/README --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/README Sun Jul 3 22:36:48 2005 @@ -0,0 +1,177 @@ +Parallax Quick Overview +March 3, 2005 + +This is intended to provide a quick set of instructions to let you +guys play with the current parallax source. In it's current form, the +code will let you run an arbitrary number of VMs off of a single disk +image, doing copy-on-write as they make updates. 
Each domain is +assigned a virtual disk image (VDI), which may be based on a snapshot +of an existing image. All of the VDI and snapshot management should +currently work. + +The current implementation uses a single file as a blockstore for +_everything_; this will soon be replaced by the fancier backend code +and the local cache. As it stands, Parallax will create +"blockstore.dat" in the directory that you run it from, and use +largefile support to make this grow to unfathomable girth. So, you +probably want to run the daemon off of a local disk, with a lot of +free space. + +Here's how to get going: + +0. Setup: +--------- + +Pick a local directory on a disk with lots of room. You should be +running from a privileged domain (e.g. dom0) with the blktap driver +configured in and the block backend NOT configured. + +For convenience (for the moment) copy all of the vdi tools (vdi_*) and +the parallax daemon from tools/blktap/parallax into this directory. + +1. Populate the blockstore: +--------------------------- + +First you need to put at least one image into the blockstore. You +will need a disk image, either as a file or local partition. My +general approach has been to + +(a) make a really big sparse file with + + dd if=/dev/zero of=./image bs=4K count=1 seek=[big value] + +(b) put a filesystem into it + + mkfs.ext3 ./image + +(c) mount it using loopback + + mkdir ./mnt + mount -o loop ./image ./mnt + +(d) cd into it and untar one of the image files from srg-roots. + + cd mnt + tar ... + +NOTE: Beware if your system is FC3. mkfs is not compatible with old +versions of fedora, and so you don't have much choice but to install +further fc3 images if you have used the fc3 version of mkfs. + +(e) unmount the image + + cd .. + umount mnt + +(f) now, create a new VDI to hold the image + + ./vdi_create "My new FC3 VDI" + +(g) get the id of the new VDI. + + ./vdi_list + + | 0 My new FC3 VDI + +(0 is the VDI id... create a few more if you want.) + +(h) hoover your image into the new VDI. 
+ + ./vdi_fill 0 ./image + +This will pull the entire image into the blockstore and set up a +mapping tree for it as VDI 0. Passing a device (e.g. /dev/sda3) +should also work, but vdi_fill has NO notion of sparseness yet, so you +are going to pump a block into the store for each block you read. + +vdi_fill will count up until it is done, and you should be ready to +go. If you want to be anal, you can use vdi_validate to test the VDI +against the original image. + +2. Create some extra VDIs +------------------------- + +VDIs are actually a list of snapshots, and each snapshot is a full +image of mappings. So, to preserve an immutable copy of a current +VDI, do this: + +(a) Snapshot your new VDI. + + ./vdi_snap 0 + +Snapshotting writes the current radix root to the VDI's snapshot log, +and assigns it a new writable root. + +(b) look at the VDI's snapshot log. + + ./vdi_snap_list 0 + + | 16 0 Thu Mar 3 19:27:48 2005 565111 31 + +The first two columns constitute a snapshot id and represent the +(block, offset) of the snapshot record. The Date tells you when the +snapshot was made, and 31 is the radix root node of the snapshot. + +(c) Create a new VDI, based on that snapshot, and look at the list. + + ./vdi_create "FC3 - Copy 1" 16 0 + ./vdi_list + + | 0 My new FC3 VDI + | 1 FC3 - Copy 1 + +NOTE: If you have Graphviz installed on your system, you can use +vdi_tree to generate a PostScript graph of your current set of VDIs and +snapshots. + + +Create as many VDIs as you need for the VMs that you want to run. + +3. Boot some VMs: +----------------- + +Parallax currently uses a hack in xend to pass the VDI id; you need to +modify the disk line of the VM config that is going to mount it. + +(a) set up your vm config, by using the following disk line: + + disk = ['parallax:1,sda1,w,0' ] + +This example uses VDI 1 (from vdi_list above), presents it as sda1 +(writable), and uses dom 0 as the backend. 
If you were running the +daemon (and tap driver) in some domain other than 0, you would change +this last parameter. + +NOTE: You'll need to have reinstalled xend/tools prior to booting the vm, so that it knows what to do with "parallax:". + +(b) Run parallax in the backend domain. + + ./parallax + +(c) create your new domain. + + xm create ... + +--- + +That's pretty much all there is to it at the moment. Hope this is +clear enough to get you going. Now, a few serious caveats that will +be sorted out in the almost immediate future: + +WARNINGS: +--------- + +1. There is NO locking in the VDI tools at the moment, so I'd avoid +running them in parallel, or more importantly, running them while the +daemon is running. + +2. I doubt that xend will be very happy about restarting if you have +parallax-using domains. So if it dies while there are active parallax +doms, you may need to reboot. + +3. I've turned off write-in-place. So at the moment, EVERY block +write is a log append on the blockstore. I've been having some probs +with the radix tree's marking of writable blocks after snapshots and +will sort this out very soon. + + diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/bstest.c --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/bstest.c Sun Jul 3 22:36:48 2005 @@ -0,0 +1,191 @@ +/************************************************************************** + * + * bstest.c + * + * Block store daemon test program. 
+ * + * usage: bstest <host>|X {r|w|a} ID + * + */ + +#include <fcntl.h> +#include <unistd.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <netinet/in.h> +#include <netdb.h> +#include <errno.h> +#include "blockstore.h" + +int direct(char *host, u32 op, u64 id, int len) { + struct sockaddr_in sn, peer; + int sock; + bsmsg_t msgbuf; + int rc, slen; + struct hostent *addr; + + addr = gethostbyname(host); + if (!addr) { + perror("bad hostname"); + exit(1); + } + peer.sin_family = addr->h_addrtype; + peer.sin_port = htons(BLOCKSTORED_PORT); + peer.sin_addr.s_addr = ((struct in_addr *)(addr->h_addr))->s_addr; + fprintf(stderr, "Sending to: %u.%u.%u.%u\n", + (unsigned int)(unsigned char)addr->h_addr[0], + (unsigned int)(unsigned char)addr->h_addr[1], + (unsigned int)(unsigned char)addr->h_addr[2], + (unsigned int)(unsigned char)addr->h_addr[3]); + + sock = socket(AF_INET, SOCK_DGRAM, 0); + if (sock < 0) { + perror("Bad socket"); + exit(1); + } + memset(&sn, 0, sizeof(sn)); + sn.sin_family = AF_INET; + sn.sin_port = htons(BLOCKSTORED_PORT); + sn.sin_addr.s_addr = htonl(INADDR_ANY); + if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) { + perror("bind"); + close(sock); + exit(1); + } + + memset((void *)&msgbuf, 0, sizeof(msgbuf)); + msgbuf.operation = op; + msgbuf.id = id; + + rc = sendto(sock, (void *)&msgbuf, len, 0, + (struct sockaddr *)&peer, sizeof(peer)); + if (rc < 0) { + perror("sendto"); + exit(1); + } + + slen = sizeof(peer); + len = recvfrom(sock, (void *)&msgbuf, sizeof(msgbuf), 0, + (struct sockaddr *)&peer, &slen); + if (len < 0) { + perror("recvfrom"); + exit(1); + } + + printf("Reply %u bytes:\n", len); + if (len >= MSGBUFSIZE_OP) + printf(" operation: %u\n", msgbuf.operation); + if (len >= MSGBUFSIZE_FLAGS) + printf(" flags: 0x%x\n", msgbuf.flags); + if (len >= MSGBUFSIZE_ID) + printf(" id: %llu\n", msgbuf.id); + if (len >= 
(MSGBUFSIZE_ID + 4)) + printf(" data: %02x %02x %02x %02x...\n", + (unsigned int)msgbuf.block[0], + (unsigned int)msgbuf.block[1], + (unsigned int)msgbuf.block[2], + (unsigned int)msgbuf.block[3]); + + if (sock > 0) + close(sock); + + return 0; +} + +int main (int argc, char **argv) { + + u32 op = 0; + u64 id = 0; + int len = 0, rc; + void *block; + + if (argc < 3) { + fprintf(stderr, "usage: bstest <host>|X {r|w|a} ID\n"); + return 1; + } + + switch (argv[2][0]) { + case 'r': + case 'R': + op = BSOP_READBLOCK; + len = MSGBUFSIZE_ID; + break; + case 'w': + case 'W': + op = BSOP_WRITEBLOCK; + len = MSGBUFSIZE_BLOCK; + break; + case 'a': + case 'A': + op = BSOP_ALLOCBLOCK; + len = MSGBUFSIZE_BLOCK; + break; + default: + fprintf(stderr, "Unknown action '%s'.\n", argv[2]); + return 1; + } + + if (argc >= 4) + id = atoll(argv[3]); + + if (strcmp(argv[1], "X") == 0) { + rc = __init_blockstore(); + if (rc < 0) { + fprintf(stderr, "blockstore init failed.\n"); + return 1; + } + switch(op) { + case BSOP_READBLOCK: + block = readblock(id); + if (block) { + printf("data: %02x %02x %02x %02x...\n", + (unsigned int)((unsigned char*)block)[0], + (unsigned int)((unsigned char*)block)[1], + (unsigned int)((unsigned char*)block)[2], + (unsigned int)((unsigned char*)block)[3]); + } + break; + case BSOP_WRITEBLOCK: + block = malloc(BLOCK_SIZE); + if (!block) { + perror("bstest malloc"); + return 1; + } + memset(block, 0, BLOCK_SIZE); + rc = writeblock(id, block); + if (rc != 0) { + printf("error\n"); + } + else { + printf("OK\n"); + } + break; + case BSOP_ALLOCBLOCK: + block = malloc(BLOCK_SIZE); + if (!block) { + perror("bstest malloc"); + return 1; + } + memset(block, 0, BLOCK_SIZE); + id = allocblock_hint(block, id); + if (id == 0) { + printf("error\n"); + } + else { + printf("ID: %llu\n", id); + } + break; + } + } + else { + direct(argv[1], op, id, len); + } + + + return 0; +} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_snap_delete.c --- /dev/null Sun Jul 3 
22:32:52 2005 +++ b/tools/blktap/parallax/vdi_snap_delete.c Sun Jul 3 22:36:48 2005 @@ -0,0 +1,48 @@ +/************************************************************************** + * + * vdi_snap_delete.c + * + * Delete a snapshot. + * + * This is not finished: right now it takes a snap n and calls + * snap_collapse(n,n+1). + * + * TODO: support for non-consecutive, non-same-block snaps + * Avoid forking probs. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/time.h> +#include "blockstore.h" +#include "snaplog.h" +#include "radix.h" +#include "vdi.h" + +int main(int argc, char *argv[]) +{ + snap_id_t id, c_id; + int ret; + + __init_blockstore(); + __init_vdi(); + + if ( argc != 3 ) { + printf("usage: %s <snap block> <snap idx>\n", argv[0]); + exit(-1); + } + + id.block = (u64) atoll(argv[1]); + id.index = (unsigned int) atol (argv[2]); + + c_id = id; + c_id.index++; + + ret = snap_collapse(VDI_HEIGHT, &id, &c_id); + + printf("Freed %d blocks.\n", ret); + + return 0; +} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/block-async.c --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/block-async.c Sun Jul 3 22:36:48 2005 @@ -0,0 +1,393 @@ +/* block-async.c + * + * Asynchronous block wrappers for parallax. + */ + + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <pthread.h> +#include "block-async.h" +#include "blockstore.h" +#include "vdi.h" + + +#if 0 +#define DPRINTF(_f, _a...) printf ( _f , ## _a ) +#else +#define DPRINTF(_f, _a...) ((void)0) +#endif + +/* We have a queue of outstanding I/O requests implemented as a + * circular producer-consumer ring with free-running buffers. + * to allow reordering, this ring indirects to indexes in an + * ring of io_structs. + * + * the block_* calls may either add an entry to this ring and return, + * or satisfy the request immediately and call the callback directly. 
+ * None of the io calls in parallax should be nested enough to worry + * about stack problems with this approach. + */ + +struct read_args { + u64 addr; +}; + +struct write_args { + u64 addr; + char *block; +}; + +struct alloc_args { + char *block; +}; + +struct pending_io_req { + enum {IO_READ, IO_WRITE, IO_ALLOC, IO_RWAKE, IO_WWAKE} op; + union { + struct read_args r; + struct write_args w; + struct alloc_args a; + } u; + io_cb_t cb; + void *param; +}; + +void radix_lock_init(struct radix_lock *r) +{ + int i; + + pthread_mutex_init(&r->lock, NULL); + for (i=0; i < 1024; i++) { + r->lines[i] = 0; + r->waiters[i] = NULL; + r->state[i] = ANY; + } +} + +/* maximum outstanding I/O requests issued asynchronously */ +/* must be a power of 2.*/ +#define MAX_PENDING_IO 1024 + +/* how many threads to concurrently issue I/O to the disk. */ +#define IO_POOL_SIZE 10 + +static struct pending_io_req pending_io_reqs[MAX_PENDING_IO]; +static int pending_io_list[MAX_PENDING_IO]; +static unsigned long io_prod = 0, io_cons = 0, io_free = 0; +#define PENDING_IO_MASK(_x) ((_x) & (MAX_PENDING_IO - 1)) +#define PENDING_IO_IDX(_x) ((_x) - pending_io_reqs) +#define PENDING_IO_ENT(_x) \ + (&pending_io_reqs[pending_io_list[PENDING_IO_MASK(_x)]]) +#define CAN_PRODUCE_PENDING_IO ((io_free + MAX_PENDING_IO) != io_prod) +#define CAN_CONSUME_PENDING_IO (io_cons != io_prod) +static pthread_mutex_t pending_io_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t pending_io_cond = PTHREAD_COND_INITIALIZER; + +static void init_pending_io(void) +{ + int i; + + for (i=0; i<MAX_PENDING_IO; i++) + pending_io_list[i] = i; + +} + +void block_read(u64 addr, io_cb_t cb, void *param) +{ + struct pending_io_req *req; + + pthread_mutex_lock(&pending_io_lock); + assert(CAN_PRODUCE_PENDING_IO); + + req = PENDING_IO_ENT(io_prod++); + DPRINTF("Produce (R) %lu (%p)\n", io_prod - 1, req); + req->op = IO_READ; + req->u.r.addr = addr; + req->cb = cb; + req->param = param; + + pthread_cond_signal(&pending_io_cond); 
+ pthread_mutex_unlock(&pending_io_lock); +} + + +void block_write(u64 addr, char *block, io_cb_t cb, void *param) +{ + struct pending_io_req *req; + + pthread_mutex_lock(&pending_io_lock); + assert(CAN_PRODUCE_PENDING_IO); + + req = PENDING_IO_ENT(io_prod++); + DPRINTF("Produce (W) %lu (%p)\n", io_prod - 1, req); + req->op = IO_WRITE; + req->u.w.addr = addr; + req->u.w.block = block; + req->cb = cb; + req->param = param; + + pthread_cond_signal(&pending_io_cond); + pthread_mutex_unlock(&pending_io_lock); +} + + +void block_alloc(char *block, io_cb_t cb, void *param) +{ + struct pending_io_req *req; + + pthread_mutex_lock(&pending_io_lock); + assert(CAN_PRODUCE_PENDING_IO); + + req = PENDING_IO_ENT(io_prod++); + req->op = IO_ALLOC; + req->u.a.block = block; + req->cb = cb; + req->param = param; + + pthread_cond_signal(&pending_io_cond); + pthread_mutex_unlock(&pending_io_lock); +} + +void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param) +{ + struct io_ret ret; + pthread_mutex_lock(&r->lock); + + if (( r->lines[row] >= 0 ) && (r->state[row] != STOP)) { + r->lines[row]++; + r->state[row] = READ; + DPRINTF("RLOCK : %3d (row: %d)\n", r->lines[row], row); + pthread_mutex_unlock(&r->lock); + ret.type = IO_INT_T; + ret.u.i = 0; + cb(ret, param); + } else { + struct radix_wait **rwc; + struct radix_wait *rw = + (struct radix_wait *) malloc (sizeof(struct radix_wait)); + DPRINTF("RLOCK : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row); + rw->type = RLOCK; + rw->param = param; + rw->cb = cb; + rw->next = NULL; + /* append to waiters list. */ + rwc = &r->waiters[row]; + while (*rwc != NULL) rwc = &(*rwc)->next; + *rwc = rw; + pthread_mutex_unlock(&r->lock); + return; + } +} + + +void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param) +{ + struct io_ret ret; + pthread_mutex_lock(&r->lock); + + /* the second check here is redundant -- just here for debugging now. 
*/ + if ((r->state[row] == ANY) && ( r->lines[row] == 0 )) { + r->state[row] = STOP; + r->lines[row] = -1; + DPRINTF("WLOCK : %3d (row: %d)\n", r->lines[row], row); + pthread_mutex_unlock(&r->lock); + ret.type = IO_INT_T; + ret.u.i = 0; + cb(ret, param); + } else { + struct radix_wait **rwc; + struct radix_wait *rw = + (struct radix_wait *) malloc (sizeof(struct radix_wait)); + DPRINTF("WLOCK : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row); + rw->type = WLOCK; + rw->param = param; + rw->cb = cb; + rw->next = NULL; + /* append to waiters list. */ + rwc = &r->waiters[row]; + while (*rwc != NULL) rwc = &(*rwc)->next; + *rwc = rw; + pthread_mutex_unlock(&r->lock); + return; + } + +} + +/* called with radix_lock locked and lock count of zero. */ +static void wake_waiters(struct radix_lock *r, int row) +{ + struct pending_io_req *req; + struct radix_wait *rw; + + if (r->lines[row] != 0) return; + if (r->waiters[row] == NULL) return; + + if (r->waiters[row]->type == WLOCK) { + + rw = r->waiters[row]; + pthread_mutex_lock(&pending_io_lock); + assert(CAN_PRODUCE_PENDING_IO); + + req = PENDING_IO_ENT(io_prod++); + req->op = IO_WWAKE; + req->cb = rw->cb; + req->param = rw->param; + r->lines[row] = -1; /* write lock the row. */ + r->state[row] = STOP; + r->waiters[row] = rw->next; + free(rw); + pthread_mutex_unlock(&pending_io_lock); + + } else /* RLOCK */ { + + while ((r->waiters[row] != NULL) && (r->waiters[row]->type == RLOCK)) { + rw = r->waiters[row]; + pthread_mutex_lock(&pending_io_lock); + assert(CAN_PRODUCE_PENDING_IO); + + req = PENDING_IO_ENT(io_prod++); + req->op = IO_RWAKE; + req->cb = rw->cb; + req->param = rw->param; + r->lines[row]++; /* read lock the row. 
*/ + r->state[row] = READ; + r->waiters[row] = rw->next; + free(rw); + pthread_mutex_unlock(&pending_io_lock); + } + + if (r->waiters[row] != NULL) /* There is a write queued still */ + r->state[row] = STOP; + } + + pthread_mutex_lock(&pending_io_lock); + pthread_cond_signal(&pending_io_cond); + pthread_mutex_unlock(&pending_io_lock); +} + +void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param) +{ + struct io_ret ret; + + pthread_mutex_lock(&r->lock); + assert(r->lines[row] > 0); /* try to catch misuse. */ + r->lines[row]--; + if (r->lines[row] == 0) { + r->state[row] = ANY; + wake_waiters(r, row); + } + pthread_mutex_unlock(&r->lock); + cb(ret, param); +} + +void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param) +{ + struct io_ret ret; + + pthread_mutex_lock(&r->lock); + assert(r->lines[row] == -1); /* try to catch misuse. */ + r->lines[row] = 0; + r->state[row] = ANY; + wake_waiters(r, row); + pthread_mutex_unlock(&r->lock); + cb(ret, param); +} + +/* consumer calls */ +static void do_next_io_req(struct pending_io_req *req) +{ + struct io_ret ret; + void *param; + + switch (req->op) { + case IO_READ: + ret.type = IO_BLOCK_T; + ret.u.b = readblock(req->u.r.addr); + break; + case IO_WRITE: + ret.type = IO_INT_T; + ret.u.i = writeblock(req->u.w.addr, req->u.w.block); + DPRINTF("wrote %d at %Lu\n", *(int *)(req->u.w.block), req->u.w.addr); + break; + case IO_ALLOC: + ret.type = IO_ADDR_T; + ret.u.a = allocblock(req->u.a.block); + break; + case IO_RWAKE: + DPRINTF("WAKE DEFERRED RLOCK!\n"); + ret.type = IO_INT_T; + ret.u.i = 0; + break; + case IO_WWAKE: + DPRINTF("WAKE DEFERRED WLOCK!\n"); + ret.type = IO_INT_T; + ret.u.i = 0; + break; + default: + DPRINTF("Unknown IO operation on pending list!\n"); + return; + } + + param = req->param; + pthread_mutex_lock(&pending_io_lock); + pending_io_list[PENDING_IO_MASK(io_free++)] = PENDING_IO_IDX(req); + pthread_mutex_unlock(&pending_io_lock); + + assert(req->cb != NULL); + req->cb(ret, 
param); + +} + +void *io_thread(void *param) +{ + int tid; + struct pending_io_req *req; + + /* Set this thread's tid. */ + tid = *(int *)param; + free(param); + +start: + pthread_mutex_lock(&pending_io_lock); + while (io_prod == io_cons) { + pthread_cond_wait(&pending_io_cond, &pending_io_lock); + } + + if (io_prod == io_cons) { + /* unnecessary wakeup. */ + pthread_mutex_unlock(&pending_io_lock); + goto start; + } + + req = PENDING_IO_ENT(io_cons++); + pthread_mutex_unlock(&pending_io_lock); + + do_next_io_req(req); + + goto start; + +} + +static pthread_t io_pool[IO_POOL_SIZE]; +void start_io_threads(void) + +{ + int i, tid=0; + + for (i=0; i < IO_POOL_SIZE; i++) { + int ret, *t; + t = (int *)malloc(sizeof(int)); + *t = tid++; + ret = pthread_create(&io_pool[i], NULL, io_thread, t); + if (ret != 0) printf("Error starting thread %d\n", i); + } + +} + +void init_block_async(void) +{ + init_pending_io(); + start_io_threads(); +} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_snap_list.c --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/vdi_snap_list.c Sun Jul 3 22:36:48 2005 @@ -0,0 +1,82 @@ +/************************************************************************** + * + * vdi_snap_list.c + * + * Print a list of snapshots for the specified vdi. 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <sys/time.h> +#include "blockstore.h" +#include "radix.h" +#include "vdi.h" + +int main(int argc, char *argv[]) +{ + vdi_t *vdi; + u64 id; + int i, max_snaps = -1; + snap_block_t *blk; + snap_id_t sid; + char *t; + + __init_blockstore(); + __init_vdi(); + + if ( argc == 1 ) { + printf("usage: %s <VDI id> [max snaps]\n", argv[0]); + exit(-1); + } + + id = (u64) atoll(argv[1]); + + if ( argc > 2 ) { + max_snaps = atoi(argv[2]); + } + + vdi = vdi_get(id); + + if ( vdi == NULL ) { + printf("couldn't find the requested VDI.\n"); + freeblock(vdi); + exit(-1); + } + + sid = vdi->snap; + sid.index--; + + //printf("%8s%4s%21s %12s %1s\n", "Block", "idx", "timestamp", + // "radix root", "d"); + printf("%8s%4s%37s %12s %1s\n", "Block", "idx", "timestamp", + "radix root", "d"); + + while (sid.block != 0) { + blk = snap_get_block(sid.block); + for (i = sid.index; i >= 0; i--) { + if ( max_snaps == 0 ) { + freeblock(blk); + goto done; + } + t = ctime(&blk->snaps[i].timestamp.tv_sec); + t[strlen(t)-1] = '\0'; + //printf("%8Ld%4u%14lu.%06lu %12Ld %1s\n", + printf("%8Ld%4u%30s %06lu %12Ld %1s\n", + sid.block, i, + //blk->snaps[i].timestamp.tv_sec, + t, + blk->snaps[i].timestamp.tv_usec, + blk->snaps[i].radix_root, + blk->snaps[i].deleted ? "*" : " "); + if ( max_snaps != -1 ) + max_snaps--; + } + sid = blk->hdr.parent_block; + freeblock(blk); + } +done: + return 0; +} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_list.c --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/vdi_list.c Sun Jul 3 22:36:48 2005 @@ -0,0 +1,47 @@ +/************************************************************************** + * + * vdi_list.c + * + * Print a list of VDIs on the block store. 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/time.h> +#include "blockstore.h" +#include "radix.h" +#include "vdi.h" + +int main(int argc, char *argv[]) +{ + vdi_registry_t *reg; + vdi_t *vdi; + int i; + + __init_blockstore(); + __init_vdi(); + + reg = get_vdi_registry(); + + if ( reg == NULL ) { + printf("couldn't get VDI registry.\n"); + exit(-1); + } + + for (i=0; i < reg->nr_vdis; i++) { + vdi = vdi_get(i); + + if ( vdi != NULL ) { + + printf("%10Ld %60s\n", vdi->id, vdi->name); + freeblock(vdi); + + } + } + + freeblock(reg); + + return 0; +} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/blockstore.c --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/blockstore.c Sun Jul 3 22:36:48 2005 @@ -0,0 +1,1350 @@ +/************************************************************************** + * + * blockstore.c + * + * Simple block store interface + * + */ + +#include <fcntl.h> +#include <unistd.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <stdarg.h> +#include "blockstore.h" +#include <pthread.h> + +//#define BLOCKSTORE_REMOTE +//#define BSDEBUG + +#define RETRY_TIMEOUT 1000000 /* microseconds */ + +/***************************************************************************** + * Debugging + */ +#ifdef BSDEBUG +void DB(char *format, ...) +{ + va_list args; + fprintf(stderr, "[%05u] ", (int)pthread_getspecific(tid_key)); + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); +} +#else +#define DB(format, ...) (void)0 +#endif + +#ifdef BLOCKSTORE_REMOTE + +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <netinet/in.h> +#include <netdb.h> + +/***************************************************************************** + * Network state * + *****************************************************************************/ + +/* The individual disk servers we talks to. 
These will be referenced by + * an integer index into bsservers[]. + */ +bsserver_t bsservers[MAX_SERVERS]; + +/* The cluster map. This is indexed by an integer cluster number. + */ +bscluster_t bsclusters[MAX_CLUSTERS]; + +/* Local socket. + */ +struct sockaddr_in sin_local; +int bssock = 0; + +/***************************************************************************** + * Notification * + *****************************************************************************/ + +typedef struct pool_thread_t_struct { + pthread_mutex_t ptmutex; + pthread_cond_t ptcv; + int newdata; +} pool_thread_t; + +pool_thread_t pool_thread[READ_POOL_SIZE+1]; + +#define RECV_NOTIFY(tid) { \ + pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \ + pool_thread[tid].newdata = 1; \ + DB("CV Waking %u", tid); \ + pthread_cond_signal(&(pool_thread[tid].ptcv)); \ + pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); } +#define RECV_AWAIT(tid) { \ + pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \ + if (pool_thread[tid].newdata) { \ + pool_thread[tid].newdata = 0; \ + DB("CV Woken %u", tid); \ + } \ + else { \ + DB("CV Waiting %u", tid); \ + pthread_cond_wait(&(pool_thread[tid].ptcv), \ + &(pool_thread[tid].ptmutex)); \ + } \ + pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); } + +/***************************************************************************** + * Message queue management * + *****************************************************************************/ + +/* Protects the queue manipulation critcal regions. + */ +pthread_mutex_t ptmutex_queue; +#define ENTER_QUEUE_CR pthread_mutex_lock(&ptmutex_queue) +#define LEAVE_QUEUE_CR pthread_mutex_unlock(&ptmutex_queue) + +pthread_mutex_t ptmutex_recv; +#define ENTER_RECV_CR pthread_mutex_lock(&ptmutex_recv) +#define LEAVE_RECV_CR pthread_mutex_unlock(&ptmutex_recv) + +/* A message queue entry. We allocate one of these for every request we send. + * Asynchronous reply reception also used one of these. 
+ */ +typedef struct bsq_t_struct { + struct bsq_t_struct *prev; + struct bsq_t_struct *next; + int status; + int server; + int length; + struct msghdr msghdr; + struct iovec iov[2]; + int tid; + struct timeval tv_sent; + bshdr_t message; + void *block; +} bsq_t; + +#define BSQ_STATUS_MATCHED 1 + +pthread_mutex_t ptmutex_luid; +#define ENTER_LUID_CR pthread_mutex_lock(&ptmutex_luid) +#define LEAVE_LUID_CR pthread_mutex_unlock(&ptmutex_luid) + +static u64 luid_cnt = 0x1000ULL; +u64 new_luid(void) { + u64 luid; + ENTER_LUID_CR; + luid = luid_cnt++; + LEAVE_LUID_CR; + return luid; +} + +/* Queue of outstanding requests. + */ +bsq_t *bs_head = NULL; +bsq_t *bs_tail = NULL; +int bs_qlen = 0; + +/* + */ +void queuedebug(char *msg) { + bsq_t *q; + ENTER_QUEUE_CR; + fprintf(stderr, "Q: %s len=%u\n", msg, bs_qlen); + for (q = bs_head; q; q = q->next) { + fprintf(stderr, " luid=%016llx server=%u\n", + q->message.luid, q->server); + } + LEAVE_QUEUE_CR; +} + +int enqueue(bsq_t *qe) { + ENTER_QUEUE_CR; + qe->next = NULL; + qe->prev = bs_tail; + if (!bs_head) + bs_head = qe; + else + bs_tail->next = qe; + bs_tail = qe; + bs_qlen++; + LEAVE_QUEUE_CR; +#ifdef BSDEBUG + queuedebug("enqueue"); +#endif + return 0; +} + +int dequeue(bsq_t *qe) { + bsq_t *q; + ENTER_QUEUE_CR; + for (q = bs_head; q; q = q->next) { + if (q == qe) { + if (q->prev) + q->prev->next = q->next; + else + bs_head = q->next; + if (q->next) + q->next->prev = q->prev; + else + bs_tail = q->prev; + bs_qlen--; + goto found; + } + } + + LEAVE_QUEUE_CR; +#ifdef BSDEBUG + queuedebug("dequeue not found"); +#endif + return 0; + + found: + LEAVE_QUEUE_CR; +#ifdef BSDEBUG + queuedebug("dequeue not found"); +#endif + return 1; +} + +bsq_t *queuesearch(bsq_t *qe) { + bsq_t *q; + ENTER_QUEUE_CR; + for (q = bs_head; q; q = q->next) { + if ((qe->server == q->server) && + (qe->message.operation == q->message.operation) && + (qe->message.luid == q->message.luid)) { + + if ((q->message.operation == BSOP_READBLOCK) && + 
((q->message.flags & BSOP_FLAG_ERROR) == 0)) { + q->block = qe->block; + qe->block = NULL; + } + q->length = qe->length; + q->message.flags = qe->message.flags; + q->message.id = qe->message.id; + q->status |= BSQ_STATUS_MATCHED; + + if (q->prev) + q->prev->next = q->next; + else + bs_head = q->next; + if (q->next) + q->next->prev = q->prev; + else + bs_tail = q->prev; + q->next = NULL; + q->prev = NULL; + bs_qlen--; + goto found; + } + } + + LEAVE_QUEUE_CR; +#ifdef BSDEBUG + queuedebug("queuesearch not found"); +#endif + return NULL; + + found: + LEAVE_QUEUE_CR; +#ifdef BSDEBUG + queuedebug("queuesearch found"); +#endif + return q; +} + +/***************************************************************************** + * Network communication * + *****************************************************************************/ + +int send_message(bsq_t *qe) { + int rc; + + qe->msghdr.msg_name = (void *)&(bsservers[qe->server].sin); + qe->msghdr.msg_namelen = sizeof(struct sockaddr_in); + qe->msghdr.msg_iov = qe->iov; + if (qe->block) + qe->msghdr.msg_iovlen = 2; + else + qe->msghdr.msg_iovlen = 1; + qe->msghdr.msg_control = NULL; + qe->msghdr.msg_controllen = 0; + qe->msghdr.msg_flags = 0; + + qe->iov[0].iov_base = (void *)&(qe->message); + qe->iov[0].iov_len = MSGBUFSIZE_ID; + + if (qe->block) { + qe->iov[1].iov_base = qe->block; + qe->iov[1].iov_len = BLOCK_SIZE; + } + + qe->message.luid = new_luid(); + + qe->status = 0; + qe->tid = (int)pthread_getspecific(tid_key); + if (enqueue(qe) < 0) { + fprintf(stderr, "Error enqueuing request.\n"); + return -1; + } + + gettimeofday(&(qe->tv_sent), NULL); + DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid); + rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT); + //rc = sendto(bssock, (void *)&(qe->message), qe->length, 0, + // (struct sockaddr *)&(bsservers[qe->server].sin), + // sizeof(struct sockaddr_in)); + if (rc < 0) + return rc; + + return rc; +} + +int recv_message(bsq_t *qe) { + struct sockaddr_in 
from; + //int flen = sizeof(from); + int rc; + + qe->msghdr.msg_name = &from; + qe->msghdr.msg_namelen = sizeof(struct sockaddr_in); + qe->msghdr.msg_iov = qe->iov; + if (qe->block) + qe->msghdr.msg_iovlen = 2; + else + qe->msghdr.msg_iovlen = 1; + qe->msghdr.msg_control = NULL; + qe->msghdr.msg_controllen = 0; + qe->msghdr.msg_flags = 0; + + qe->iov[0].iov_base = (void *)&(qe->message); + qe->iov[0].iov_len = MSGBUFSIZE_ID; + if (qe->block) { + qe->iov[1].iov_base = qe->block; + qe->iov[1].iov_len = BLOCK_SIZE; + } + + rc = recvmsg(bssock, &(qe->msghdr), 0); + + //return recvfrom(bssock, (void *)&(qe->message), sizeof(bsmsg_t), 0, + // (struct sockaddr *)&from, &flen); + return rc; +} + +int get_server_number(struct sockaddr_in *sin) { + int i; + +#ifdef BSDEBUG2 + fprintf(stderr, + "get_server_number(%u.%u.%u.%u/%u)\n", + (unsigned int)sin->sin_addr.s_addr & 0xff, + ((unsigned int)sin->sin_addr.s_addr >> 8) & 0xff, + ((unsigned int)sin->sin_addr.s_addr >> 16) & 0xff, + ((unsigned int)sin->sin_addr.s_addr >> 24) & 0xff, + (unsigned int)sin->sin_port); +#endif + + for (i = 0; i < MAX_SERVERS; i++) { + if (bsservers[i].hostname) { +#ifdef BSDEBUG2 + fprintf(stderr, + "get_server_number check %u.%u.%u.%u/%u\n", + (unsigned int)bsservers[i].sin.sin_addr.s_addr&0xff, + ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 8)&0xff, + ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 16)&0xff, + ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 24)&0xff, + (unsigned int)bsservers[i].sin.sin_port); +#endif + if ((sin->sin_family == bsservers[i].sin.sin_family) && + (sin->sin_port == bsservers[i].sin.sin_port) && + (memcmp((void *)&(sin->sin_addr), + (void *)&(bsservers[i].sin.sin_addr), + sizeof(struct in_addr)) == 0)) { + return i; + } + } + } + + return -1; +} + +void *rx_buffer = NULL; +bsq_t rx_qe; +bsq_t *recv_any(void) { + struct sockaddr_in from; + int rc; + + DB("ENTER recv_any\n"); + + rx_qe.msghdr.msg_name = &from; + rx_qe.msghdr.msg_namelen = sizeof(struct 
sockaddr_in); + rx_qe.msghdr.msg_iov = rx_qe.iov; + if (!rx_buffer) { + rx_buffer = malloc(BLOCK_SIZE); + if (!rx_buffer) { + perror("recv_any malloc"); + return NULL; + } + } + rx_qe.block = rx_buffer; + rx_buffer = NULL; + rx_qe.msghdr.msg_iovlen = 2; + rx_qe.msghdr.msg_control = NULL; + rx_qe.msghdr.msg_controllen = 0; + rx_qe.msghdr.msg_flags = 0; + + rx_qe.iov[0].iov_base = (void *)&(rx_qe.message); + rx_qe.iov[0].iov_len = MSGBUFSIZE_ID; + rx_qe.iov[1].iov_base = rx_qe.block; + rx_qe.iov[1].iov_len = BLOCK_SIZE; + + rc = recvmsg(bssock, &(rx_qe.msghdr), 0); + if (rc < 0) { + perror("recv_any"); + return NULL; + } + + rx_qe.length = rc; + rx_qe.server = get_server_number(&from); + + DB("recv_any from %d luid=%016llx len=%u\n", + rx_qe.server, rx_qe.message.luid, rx_qe.length); + + return &rx_qe; +} + +void recv_recycle_buffer(bsq_t *q) { + if (q->block) { + rx_buffer = q->block; + q->block = NULL; + } +} + +// cycle through reading any incoming, searching for a match in the +// queue, until we have all we need. 
+int wait_recv(bsq_t **reqs, int numreqs) { + bsq_t *q, *m; + unsigned int x, i; + int tid = (int)pthread_getspecific(tid_key); + + DB("ENTER wait_recv %u\n", numreqs); + + checkmatch: + x = 0xffffffff; + for (i = 0; i < numreqs; i++) { + x &= reqs[i]->status; + } + if ((x & BSQ_STATUS_MATCHED)) { + DB("LEAVE wait_recv\n"); + return numreqs; + } + + RECV_AWAIT(tid); + + /* + rxagain: + ENTER_RECV_CR; + q = recv_any(); + LEAVE_RECV_CR; + if (!q) + return -1; + + m = queuesearch(q); + recv_recycle_buffer(q); + if (!m) { + fprintf(stderr, "Unmatched RX\n"); + goto rxagain; + } + */ + + goto checkmatch; + +} + +/* retry + */ +static int retry_count = 0; +int retry(bsq_t *qe) +{ + int rc; + gettimeofday(&(qe->tv_sent), NULL); + DB("retry to %d luid=%016llx\n", qe->server, qe->message.luid); + retry_count++; + rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT); + if (rc < 0) + return rc; + return 0; +} + +/* queue runner + */ +void *queue_runner(void *arg) +{ + for (;;) { + struct timeval now; + long long nowus, sus; + bsq_t *q; + int r; + + sleep(1); + + gettimeofday(&now, NULL); + nowus = now.tv_usec + now.tv_sec * 1000000; + ENTER_QUEUE_CR; + r = retry_count; + for (q = bs_head; q; q = q->next) { + sus = q->tv_sent.tv_usec + q->tv_sent.tv_sec * 1000000; + if ((nowus - sus) > RETRY_TIMEOUT) { + if (retry(q) < 0) { + fprintf(stderr, "Error on sendmsg retry.\n"); + } + } + } + if (r != retry_count) { + fprintf(stderr, "RETRIES: %u %u\n", retry_count - r, retry_count); + } + LEAVE_QUEUE_CR; + } +} + +/* receive loop + */ +void *receive_loop(void *arg) +{ + bsq_t *q, *m; + + for(;;) { + q = recv_any(); + if (!q) { + fprintf(stderr, "recv_any error\n"); + } + else { + m = queuesearch(q); + recv_recycle_buffer(q); + if (!m) { + fprintf(stderr, "Unmatched RX\n"); + } + else { + DB("RX MATCH"); + RECV_NOTIFY(m->tid); + } + } + } +} +pthread_t pthread_recv; + +/***************************************************************************** + * Reading * + 
*****************************************************************************/ + +void *readblock_indiv(int server, u64 id) { + void *block; + bsq_t *qe; + int len, rc; + + qe = (bsq_t *)malloc(sizeof(bsq_t)); + if (!qe) { + perror("readblock qe malloc"); + return NULL; + } + qe->block = NULL; + + /* + qe->block = malloc(BLOCK_SIZE); + if (!qe->block) { + perror("readblock qe malloc"); + free((void *)qe); + return NULL; + } + */ + + qe->server = server; + + qe->message.operation = BSOP_READBLOCK; + qe->message.flags = 0; + qe->message.id = id; + qe->length = MSGBUFSIZE_ID; + + if (send_message(qe) < 0) { + perror("readblock sendto"); + goto err; + } + + /*len = recv_message(qe); + if (len < 0) { + perror("readblock recv"); + goto err; + }*/ + + rc = wait_recv(&qe, 1); + if (rc < 0) { + perror("readblock recv"); + goto err; + } + + if ((qe->message.flags & BSOP_FLAG_ERROR)) { + fprintf(stderr, "readblock server error\n"); + goto err; + } + if (qe->length < MSGBUFSIZE_BLOCK) { + fprintf(stderr, "readblock recv short (%u)\n", len); + goto err; + } + /* if ((block = malloc(BLOCK_SIZE)) == NULL) { + perror("readblock malloc"); + goto err; + } + memcpy(block, qe->message.block, BLOCK_SIZE); + */ + block = qe->block; + + free((void *)qe); + return block; + + err: + if (qe->block) + free(qe->block); + free((void *)qe); + return NULL; +} + +/** + * readblock: read a block from disk + * @id: block id to read + * + * @return: pointer to block, NULL on error + */ +void *readblock(u64 id) { + int map = (int)BSID_MAP(id); + u64 xid; + static int i = CLUSTER_MAX_REPLICAS - 1; + void *block = NULL; + + /* special case for the "superblock" just use the first block on the + * first replica. 
(extend to blocks < 6 for vdi bug) + */ + if (id < 6) { + block = readblock_indiv(bsclusters[map].servers[0], id); + goto out; + } + + i++; + if (i >= CLUSTER_MAX_REPLICAS) + i = 0; + switch (i) { + case 0: + xid = BSID_REPLICA0(id); + break; + case 1: + xid = BSID_REPLICA1(id); + break; + case 2: + xid = BSID_REPLICA2(id); + break; + } + + block = readblock_indiv(bsclusters[map].servers[i], xid); + + out: +#ifdef BSDEBUG + if (block) + fprintf(stderr, "READ: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n", + id, + (unsigned int)((unsigned char *)block)[0], + (unsigned int)((unsigned char *)block)[1], + (unsigned int)((unsigned char *)block)[2], + (unsigned int)((unsigned char *)block)[3], + (unsigned int)((unsigned char *)block)[4], + (unsigned int)((unsigned char *)block)[5], + (unsigned int)((unsigned char *)block)[6], + (unsigned int)((unsigned char *)block)[7]); + else + fprintf(stderr, "READ: %016llx NULL\n", id); +#endif + return block; +} + +/***************************************************************************** + * Writing * + *****************************************************************************/ + +bsq_t *writeblock_indiv(int server, u64 id, void *block) { + + bsq_t *qe; + int len; + + qe = (bsq_t *)malloc(sizeof(bsq_t)); + if (!qe) { + perror("writeblock qe malloc"); + goto err; + } + qe->server = server; + + qe->message.operation = BSOP_WRITEBLOCK; + qe->message.flags = 0; + qe->message.id = id; + //memcpy(qe->message.block, block, BLOCK_SIZE); + qe->block = block; + qe->length = MSGBUFSIZE_BLOCK; + + if (send_message(qe) < 0) { + perror("writeblock sendto"); + goto err; + } + + return qe; + + err: + free((void *)qe); + return NULL; +} + + +/** + * writeblock: write an existing block to disk + * @id: block id + * @block: pointer to block + * + * @return: zero on success, -1 on failure + */ +int writeblock(u64 id, void *block) { + + int map = (int)BSID_MAP(id); + int rep0 = bsclusters[map].servers[0]; + int rep1 = 
bsclusters[map].servers[1]; + int rep2 = bsclusters[map].servers[2]; + bsq_t *reqs[3]; + int rc; + + reqs[0] = reqs[1] = reqs[2] = NULL; + +#ifdef BSDEBUG + fprintf(stderr, + "WRITE: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n", + id, + (unsigned int)((unsigned char *)block)[0], + (unsigned int)((unsigned char *)block)[1], + (unsigned int)((unsigned char *)block)[2], + (unsigned int)((unsigned char *)block)[3], + (unsigned int)((unsigned char *)block)[4], + (unsigned int)((unsigned char *)block)[5], + (unsigned int)((unsigned char *)block)[6], + (unsigned int)((unsigned char *)block)[7]); +#endif + + /* special case for the "superblock" just use the first block on the + * first replica. (extend to blocks < 6 for vdi bug) + */ + if (id < 6) { + reqs[0] = writeblock_indiv(rep0, id, block); + if (!reqs[0]) + return -1; + rc = wait_recv(reqs, 1); + return rc; + } + + reqs[0] = writeblock_indiv(rep0, BSID_REPLICA0(id), block); + if (!reqs[0]) + goto err; + reqs[1] = writeblock_indiv(rep1, BSID_REPLICA1(id), block); + if (!reqs[1]) + goto err; + reqs[2] = writeblock_indiv(rep2, BSID_REPLICA2(id), block); + if (!reqs[2]) + goto err; + + rc = wait_recv(reqs, 3); + if (rc < 0) { + perror("writeblock recv"); + goto err; + } + if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) { + fprintf(stderr, "writeblock server0 error\n"); + goto err; + } + if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) { + fprintf(stderr, "writeblock server1 error\n"); + goto err; + } + if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) { + fprintf(stderr, "writeblock server2 error\n"); + goto err; + } + + + free((void *)reqs[0]); + free((void *)reqs[1]); + free((void *)reqs[2]); + return 0; + + err: + if (reqs[0]) { + dequeue(reqs[0]); + free((void *)reqs[0]); + } + if (reqs[1]) { + dequeue(reqs[1]); + free((void *)reqs[1]); + } + if (reqs[2]) { + dequeue(reqs[2]); + free((void *)reqs[2]); + } + return -1; +} + +/***************************************************************************** + * Allocation * 
+ *****************************************************************************/ + +/** + * allocblock: write a new block to disk + * @block: pointer to block + * + * @return: new id of block on disk + */ +u64 allocblock(void *block) { + return allocblock_hint(block, 0); +} + +bsq_t *allocblock_hint_indiv(int server, void *block, u64 hint) { + bsq_t *qe; + int len; + + qe = (bsq_t *)malloc(sizeof(bsq_t)); + if (!qe) { + perror("allocblock_hint qe malloc"); + goto err; + } + qe->server = server; + + qe->message.operation = BSOP_ALLOCBLOCK; + qe->message.flags = 0; + qe->message.id = hint; + //memcpy(qe->message.block, block, BLOCK_SIZE); + qe->block = block; + qe->length = MSGBUFSIZE_BLOCK; + + if (send_message(qe) < 0) { + perror("allocblock_hint sendto"); + goto err; + } + + return qe; + + err: + free((void *)qe); + return NULL; +} + +/** + * allocblock_hint: write a new block to disk + * @block: pointer to block + * @hint: allocation hint + * + * @return: new id of block on disk + */ +u64 allocblock_hint(void *block, u64 hint) { + int map = (int)hint; + int rep0 = bsclusters[map].servers[0]; + int rep1 = bsclusters[map].servers[1]; + int rep2 = bsclusters[map].servers[2]; + bsq_t *reqs[3]; + int rc; + u64 id0, id1, id2; + + reqs[0] = reqs[1] = reqs[2] = NULL; + + DB("ENTER allocblock\n"); + + reqs[0] = allocblock_hint_indiv(rep0, block, hint); + if (!reqs[0]) + goto err; + reqs[1] = allocblock_hint_indiv(rep1, block, hint); + if (!reqs[1]) + goto err; + reqs[2] = allocblock_hint_indiv(rep2, block, hint); + if (!reqs[2]) + goto err; + + rc = wait_recv(reqs, 3); + if (rc < 0) { + perror("allocblock recv"); + goto err; + } + if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) { + fprintf(stderr, "allocblock server0 error\n"); + goto err; + } + if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) { + fprintf(stderr, "allocblock server1 error\n"); + goto err; + } + if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) { + fprintf(stderr, "allocblock server2 error\n"); + goto err; + 
} + + id0 = reqs[0]->message.id; + id1 = reqs[1]->message.id; + id2 = reqs[2]->message.id; + +#ifdef BSDEBUG + fprintf(stderr, "ALLOC: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n", + BSID(map, id0, id1, id2), + (unsigned int)((unsigned char *)block)[0], + (unsigned int)((unsigned char *)block)[1], + (unsigned int)((unsigned char *)block)[2], + (unsigned int)((unsigned char *)block)[3], + (unsigned int)((unsigned char *)block)[4], + (unsigned int)((unsigned char *)block)[5], + (unsigned int)((unsigned char *)block)[6], + (unsigned int)((unsigned char *)block)[7]); +#endif + + free((void *)reqs[0]); + free((void *)reqs[1]); + free((void *)reqs[2]); + return BSID(map, id0, id1, id2); + + err: + if (reqs[0]) { + dequeue(reqs[0]); + free((void *)reqs[0]); + } + if (reqs[1]) { + dequeue(reqs[1]); + free((void *)reqs[1]); + } + if (reqs[2]) { + dequeue(reqs[2]); + free((void *)reqs[2]); + } + return 0; +} + +#else /* /BLOCKSTORE_REMOTE */ + +/***************************************************************************** + * Local storage version * + *****************************************************************************/ + +/** + * readblock: read a block from disk + * @id: block id to read + * + * @return: pointer to block, NULL on error + */ + +void *readblock(u64 id) { + void *block; + int block_fp; + +//printf("readblock(%llu)\n", id); + block_fp = open("blockstore.dat", O_RDONLY | O_CREAT | O_LARGEFILE, 0644); + + if (block_fp < 0) { + perror("open"); + return NULL; + } + + if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { + printf ("%Ld ", id); + printf ("%Ld\n", (id - 1) * BLOCK_SIZE); + perror("readblock lseek"); + goto err; + } + if ((block = malloc(BLOCK_SIZE)) == NULL) { + perror("readblock malloc"); + goto err; + } + if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) { + perror("readblock read"); + free(block); + goto err; + } + close(block_fp); + return block; + +err: + close(block_fp); + return NULL; +} + +/** + * writeblock: 
write an existing block to disk + * @id: block id + * @block: pointer to block + * + * @return: zero on success, -1 on failure + */ +int writeblock(u64 id, void *block) { + + int block_fp; + + block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644); + + if (block_fp < 0) { + perror("open"); + return -1; + } + + if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { + perror("writeblock lseek"); + goto err; + } + if (write(block_fp, block, BLOCK_SIZE) < 0) { + perror("writeblock write"); + goto err; + } + close(block_fp); + return 0; + +err: + close(block_fp); + return -1; +} + +/** + * allocblock: write a new block to disk + * @block: pointer to block + * + * @return: new id of block on disk + */ + +u64 allocblock(void *block) { + u64 lb; + off64_t pos; + int block_fp; + + block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644); + + if (block_fp < 0) { + perror("open"); + return 0; + } + + pos = lseek64(block_fp, 0, SEEK_END); + if (pos == (off64_t)-1) { + perror("allocblock lseek"); + goto err; + } + if (pos % BLOCK_SIZE != 0) { + fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE); + goto err; + } + if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) { + perror("allocblock write"); + goto err; + } + lb = pos / BLOCK_SIZE + 1; +//printf("alloc(%Ld)\n", lb); + close(block_fp); + return lb; + +err: + close(block_fp); + return 0; + +} + +/** + * allocblock_hint: write a new block to disk + * @block: pointer to block + * @hint: allocation hint + * + * @return: new id of block on disk + */ +u64 allocblock_hint(void *block, u64 hint) { + return allocblock(block); +} + +#endif /* BLOCKSTORE_REMOTE */ + +/***************************************************************************** + * Memory management * + *****************************************************************************/ + +/** + * newblock: get a new in-memory block set to zeros + * + * @return: pointer to new block, NULL on error + */ +void 
*newblock() { + void *block = malloc(BLOCK_SIZE); + if (block == NULL) { + perror("newblock"); + return NULL; + } + memset(block, 0, BLOCK_SIZE); + return block; +} + + +/** + * freeblock: unallocate an in-memory block + * @id: block id (zero if this is only in-memory) + * @block: block to be freed + */ +void freeblock(void *block) { + if (block != NULL) + free(block); +} + +static freeblock_t *new_freeblock(void) +{ + freeblock_t *fb; + + fb = newblock(); + + if (fb == NULL) return NULL; + + fb->magic = FREEBLOCK_MAGIC; + fb->next = 0ULL; + fb->count = 0ULL; + memset(fb->list, 0, sizeof fb->list); + + return fb; +} + +void releaseblock(u64 id) +{ + blockstore_super_t *bs_super; + freeblock_t *fl_current; + + /* get superblock */ + bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER); + + /* get freeblock_current */ + if (bs_super->freelist_current == 0ULL) + { + fl_current = new_freeblock(); + bs_super->freelist_current = allocblock(fl_current); + writeblock(BLOCKSTORE_SUPER, bs_super); + } else { + fl_current = readblock(bs_super->freelist_current); + } + + /* if full, chain to superblock and allocate new current */ + + if (fl_current->count == FREEBLOCK_SIZE) { + fl_current->next = bs_super->freelist_full; + writeblock(bs_super->freelist_current, fl_current); + bs_super->freelist_full = bs_super->freelist_current; + freeblock(fl_current); + fl_current = new_freeblock(); + bs_super->freelist_current = allocblock(fl_current); + writeblock(BLOCKSTORE_SUPER, bs_super); + } + + /* append id to current */ + fl_current->list[fl_current->count++] = id; + writeblock(bs_super->freelist_current, fl_current); + + freeblock(fl_current); + freeblock(bs_super); + + +} + +/* freelist debug functions: */ +void freelist_count(int print_each) +{ + blockstore_super_t *bs_super; + freeblock_t *fb; + u64 total = 0, next; + + bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER); + + if (bs_super->freelist_current == 0ULL) { + printf("freelist is empty!\n"); + return; 
+ } + + fb = readblock(bs_super->freelist_current); + printf("%Ld entires on current.\n", fb->count); + total += fb->count; + if (print_each == 1) + { + int i; + for (i=0; i< fb->count; i++) + printf(" %Ld\n", fb->list[i]); + } + + freeblock(fb); + + if (bs_super->freelist_full == 0ULL) { + printf("freelist_full is empty!\n"); + return; + } + + next = bs_super->freelist_full; + for (;;) { + fb = readblock(next); + total += fb->count; + if (print_each == 1) + { + int i; + for (i=0; i< fb->count; i++) + printf(" %Ld\n", fb->list[i]); + } + next = fb->next; + freeblock(fb); + if (next == 0ULL) break; + } + printf("Total of %Ld ids on freelist.\n", total); +} + +/***************************************************************************** + * Initialisation * + *****************************************************************************/ + +int __init_blockstore(void) +{ + int i; + blockstore_super_t *bs_super; + u64 ret; + int block_fp; + +#ifdef BLOCKSTORE_REMOTE + struct hostent *addr; + + pthread_mutex_init(&ptmutex_queue, NULL); + pthread_mutex_init(&ptmutex_luid, NULL); + pthread_mutex_init(&ptmutex_recv, NULL); + /*pthread_mutex_init(&ptmutex_notify, NULL);*/ + for (i = 0; i <= READ_POOL_SIZE; i++) { + pool_thread[i].newdata = 0; + pthread_mutex_init(&(pool_thread[i].ptmutex), NULL); + pthread_cond_init(&(pool_thread[i].ptcv), NULL); + } + + bsservers[0].hostname = "firebug.cl.cam.ac.uk"; + bsservers[1].hostname = "planb.cl.cam.ac.uk"; + bsservers[2].hostname = "simcity.cl.cam.ac.uk"; + bsservers[3].hostname = NULL/*"gunfighter.cl.cam.ac.uk"*/; + bsservers[4].hostname = NULL/*"galaxian.cl.cam.ac.uk"*/; + bsservers[5].hostname = NULL/*"firetrack.cl.cam.ac.uk"*/; + bsservers[6].hostname = NULL/*"funfair.cl.cam.ac.uk"*/; + bsservers[7].hostname = NULL/*"felix.cl.cam.ac.uk"*/; + bsservers[8].hostname = NULL; + bsservers[9].hostname = NULL; + bsservers[10].hostname = NULL; + bsservers[11].hostname = NULL; + bsservers[12].hostname = NULL; + bsservers[13].hostname = 
NULL; + bsservers[14].hostname = NULL; + bsservers[15].hostname = NULL; + + for (i = 0; i < MAX_SERVERS; i++) { + if (!bsservers[i].hostname) + continue; + addr = gethostbyname(bsservers[i].hostname); + if (!addr) { + perror("bad hostname"); + return -1; + } + bsservers[i].sin.sin_family = addr->h_addrtype; + bsservers[i].sin.sin_port = htons(BLOCKSTORED_PORT); + bsservers[i].sin.sin_addr.s_addr = + ((struct in_addr *)(addr->h_addr))->s_addr; + } + + /* Cluster map + */ + bsclusters[0].servers[0] = 0; + bsclusters[0].servers[1] = 1; + bsclusters[0].servers[2] = 2; + bsclusters[1].servers[0] = 1; + bsclusters[1].servers[1] = 2; + bsclusters[1].servers[2] = 3; + bsclusters[2].servers[0] = 2; + bsclusters[2].servers[1] = 3; + bsclusters[2].servers[2] = 4; + bsclusters[3].servers[0] = 3; + bsclusters[3].servers[1] = 4; + bsclusters[3].servers[2] = 5; + bsclusters[4].servers[0] = 4; + bsclusters[4].servers[1] = 5; + bsclusters[4].servers[2] = 6; + bsclusters[5].servers[0] = 5; + bsclusters[5].servers[1] = 6; + bsclusters[5].servers[2] = 7; + bsclusters[6].servers[0] = 6; + bsclusters[6].servers[1] = 7; + bsclusters[6].servers[2] = 0; + bsclusters[7].servers[0] = 7; + bsclusters[7].servers[1] = 0; + bsclusters[7].servers[2] = 1; + + /* Local socket set up + */ + bssock = socket(AF_INET, SOCK_DGRAM, 0); + if (bssock < 0) { + perror("Bad socket"); + return -1; + } + memset(&sin_local, 0, sizeof(sin_local)); + sin_local.sin_family = AF_INET; + sin_local.sin_port = htons(BLOCKSTORED_PORT); + sin_local.sin_addr.s_addr = htonl(INADDR_ANY); + if (bind(bssock, (struct sockaddr *)&sin_local, sizeof(sin_local)) < 0) { + perror("bind"); + close(bssock); + return -1; + } + + pthread_create(&pthread_recv, NULL, receive_loop, NULL); + pthread_create(&pthread_recv, NULL, queue_runner, NULL); + +#else /* /BLOCKSTORE_REMOTE */ + block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644); + + if (block_fp < 0) { + perror("open"); + return -1; + exit(-1); + } + + if 
(lseek(block_fp, 0, SEEK_END) == 0) { + bs_super = newblock(); + bs_super->magic = BLOCKSTORE_MAGIC; + bs_super->freelist_full = 0LL; + bs_super->freelist_current = 0LL; + + ret = allocblock(bs_super); + + freeblock(bs_super); + } else { + bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER); + if (bs_super->magic != BLOCKSTORE_MAGIC) + { + printf("BLOCKSTORE IS CORRUPT! (no magic in superblock!)\n"); + exit(-1); + } + freeblock(bs_super); + } + + close(block_fp); + +#endif /* BLOCKSTORE_REMOTE */ + return 0; +} + +void __exit_blockstore(void) +{ + int i; +#ifdef BLOCKSTORE_REMOTE + pthread_mutex_destroy(&ptmutex_recv); + pthread_mutex_destroy(&ptmutex_luid); + pthread_mutex_destroy(&ptmutex_queue); + /*pthread_mutex_destroy(&ptmutex_notify); + pthread_cond_destroy(&ptcv_notify);*/ + for (i = 0; i <= READ_POOL_SIZE; i++) { + pthread_mutex_destroy(&(pool_thread[i].ptmutex)); + pthread_cond_destroy(&(pool_thread[i].ptcv)); + } +#endif +} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/blockstore.h --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/blockstore.h Sun Jul 3 22:36:48 2005 @@ -0,0 +1,134 @@ +/************************************************************************** + * + * blockstore.h + * + * Simple block store interface + * + */ + +#ifndef __BLOCKSTORE_H__ +#define __BLOCKSTORE_H__ + +#include <netinet/in.h> +#include <xc.h> + +#define BLOCK_SIZE 4096 +#define BLOCK_SHIFT 12 +#define BLOCK_MASK 0xfffffffffffff000LL + +/* XXX SMH: where is the below supposed to be defined???? 
*/ +#ifndef SECTOR_SHIFT +#define SECTOR_SHIFT 9 +#endif + +#define FREEBLOCK_SIZE (BLOCK_SIZE / sizeof(u64)) - (3 * sizeof(u64)) +#define FREEBLOCK_MAGIC 0x0fee0fee0fee0feeULL + +typedef struct { + u64 magic; + u64 next; + u64 count; + u64 list[FREEBLOCK_SIZE]; +} freeblock_t; + +#define BLOCKSTORE_MAGIC 0xaaaaaaa00aaaaaaaULL +#define BLOCKSTORE_SUPER 1ULL + +typedef struct { + u64 magic; + u64 freelist_full; + u64 freelist_current; +} blockstore_super_t; + +extern void *newblock(); +extern void *readblock(u64 id); +extern u64 allocblock(void *block); +extern u64 allocblock_hint(void *block, u64 hint); +extern int writeblock(u64 id, void *block); + +/* Add this blockid to a freelist, to be recycled by the allocator. */ +extern void releaseblock(u64 id); + +/* this is a memory free() operation for block-sized allocations */ +extern void freeblock(void *block); +extern int __init_blockstore(void); + +/* debug for freelist. */ +void freelist_count(int print_each); +#define ALLOCFAIL (((u64)(-1))) + +/* Distribution + */ +#define BLOCKSTORED_PORT 9346 + +struct bshdr_t_struct { + u32 operation; + u32 flags; + u64 id; + u64 luid; +} __attribute__ ((packed)); +typedef struct bshdr_t_struct bshdr_t; + +struct bsmsg_t_struct { + bshdr_t hdr; + unsigned char block[BLOCK_SIZE]; +} __attribute__ ((packed)); + +typedef struct bsmsg_t_struct bsmsg_t; + +#define MSGBUFSIZE_OP sizeof(u32) +#define MSGBUFSIZE_FLAGS (sizeof(u32) + sizeof(u32)) +#define MSGBUFSIZE_ID (sizeof(u32) + sizeof(u32) + sizeof(u64) + sizeof(u64)) +#define MSGBUFSIZE_BLOCK sizeof(bsmsg_t) + +#define BSOP_READBLOCK 0x01 +#define BSOP_WRITEBLOCK 0x02 +#define BSOP_ALLOCBLOCK 0x03 +#define BSOP_FREEBLOCK 0x04 + +#define BSOP_FLAG_ERROR 0x01 + +#define BS_ALLOC_SKIP 10 +#define BS_ALLOC_HACK + +/* Remote hosts and cluster map - XXX need to generalise + */ + +/* + + Interim ID format is + + 63 60 59 40 39 20 19 0 + +----+--------------------+--------------------+--------------------+ + |map | replica 2 | replica 
1 | replica 0 | + +----+--------------------+--------------------+--------------------+ + + The map is an index into a table detailing which machines form the + cluster. + + */ + +#define BSID_REPLICA0(_id) ((_id)&0xfffffULL) +#define BSID_REPLICA1(_id) (((_id)>>20)&0xfffffULL) +#define BSID_REPLICA2(_id) (((_id)>>40)&0xfffffULL) +#define BSID_MAP(_id) (((_id)>>60)&0xfULL) + +#define BSID(_map, _rep0, _rep1, _rep2) ((((u64)(_map))<<60) | \ + (((u64)(_rep2))<<40) | \ + (((u64)(_rep1))<<20) | ((u64)(_rep0))) + +typedef struct bsserver_t_struct { + char *hostname; + struct sockaddr_in sin; +} bsserver_t; + +#define MAX_SERVERS 16 + +#define CLUSTER_MAX_REPLICAS 3 +typedef struct bscluster_t_struct { + int servers[CLUSTER_MAX_REPLICAS]; +} bscluster_t; + +#define MAX_CLUSTERS 16 + +#endif /* __BLOCKSTORE_H__ */ diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/parallax.c --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/parallax.c Sun Jul 3 22:36:48 2005 @@ -0,0 +1,611 @@ +/************************************************************************** + * + * parallax.c + * + * The Parallax Storage Server + * + */ + + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <pthread.h> +#include "blktaplib.h" +#include "blockstore.h" +#include "vdi.h" +#include "block-async.h" +#include "requests-async.h" + +#define PARALLAX_DEV 61440 +#define SECTS_PER_NODE 8 + + +#if 0 +#define DPRINTF(_f, _a...) printf ( _f , ## _a ) +#else +#define DPRINTF(_f, _a...) 
((void)0) +#endif + +/* ------[ session records ]----------------------------------------------- */ + +#define BLKIF_HASHSZ 1024 +#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1)) + +#define VDI_HASHSZ 16 +#define VDI_HASH(_vd) ((((_vd)>>8)^(_vd))&(VDI_HASHSZ-1)) + +typedef struct blkif { + domid_t domid; + unsigned int handle; + enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; + vdi_t *vdi_hash[VDI_HASHSZ]; + struct blkif *hash_next; +} blkif_t; + +static blkif_t *blkif_hash[BLKIF_HASHSZ]; + +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle) +{ + if ( handle != 0 ) + printf("blktap/parallax don't currently support non-0 dev handles!\n"); + + blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)]; + while ( (blkif != NULL) && + ((blkif->domid != domid) || (blkif->handle != handle)) ) + blkif = blkif->hash_next; + return blkif; +} + +vdi_t *blkif_get_vdi(blkif_t *blkif, blkif_vdev_t device) +{ + vdi_t *vdi = blkif->vdi_hash[VDI_HASH(device)]; + + while ((vdi != NULL) && (vdi->vdevice != device)) + vdi = vdi->next; + + return vdi; +} + +/* ------[ control message handling ]-------------------------------------- */ + +void blkif_create(blkif_be_create_t *create) +{ + domid_t domid = create->domid; + unsigned int handle = create->blkif_handle; + blkif_t **pblkif, *blkif; + + DPRINTF("parallax (blkif_create): create is %p\n", create); + + if ( (blkif = (blkif_t *)malloc(sizeof(blkif_t))) == NULL ) + { + DPRINTF("Could not create blkif: out of memory\n"); + create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + blkif->handle = handle; + blkif->status = DISCONNECTED; + + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( *pblkif != NULL ) + { + if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) ) + { + DPRINTF("Could not create blkif: already exists (%d,%d)\n", + domid, handle); + create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS; + 
free(blkif); + return; + } + pblkif = &(*pblkif)->hash_next; + } + + blkif->hash_next = *pblkif; + *pblkif = blkif; + + DPRINTF("Successfully created blkif\n"); + create->status = BLKIF_BE_STATUS_OKAY; +} + +void blkif_destroy(blkif_be_destroy_t *destroy) +{ + domid_t domid = destroy->domid; + unsigned int handle = destroy->blkif_handle; + blkif_t **pblkif, *blkif; + + DPRINTF("parallax (blkif_destroy): destroy is %p\n", destroy); + + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( (blkif = *pblkif) != NULL ) + { + if ( (blkif->domid == domid) && (blkif->handle == handle) ) + { + if ( blkif->status != DISCONNECTED ) + goto still_connected; + goto destroy; + } + pblkif = &blkif->hash_next; + } + + destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + + still_connected: + destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; + return; + + destroy: + *pblkif = blkif->hash_next; + free(blkif); + destroy->status = BLKIF_BE_STATUS_OKAY; +} + +void vbd_create(blkif_be_vbd_create_t *create) +{ + blkif_t *blkif; + vdi_t *vdi, **vdip; + blkif_vdev_t vdevice = create->vdevice; + + DPRINTF("parallax (vbd_create): create=%p\n", create); + + blkif = blkif_find_by_handle(create->domid, create->blkif_handle); + if ( blkif == NULL ) + { + DPRINTF("vbd_create attempted for non-existent blkif (%u,%u)\n", + create->domid, create->blkif_handle); + create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + /* VDI identifier is in grow->extent.sector_start */ + DPRINTF("vbd_create: create->dev_handle (id) is %lx\n", + (unsigned long)create->dev_handle); + + vdi = vdi_get(create->dev_handle); + if (vdi == NULL) + { + printf("parallax (vbd_create): VDI %lx not found.\n", + (unsigned long)create->dev_handle); + create->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; + return; + } + + vdi->next = NULL; + vdi->vdevice = vdevice; + vdip = &blkif->vdi_hash[VDI_HASH(vdevice)]; + while (*vdip != NULL) + vdip = &(*vdip)->next; + *vdip = vdi; + + DPRINTF("blkif_create 
succeeded\n"); + create->status = BLKIF_BE_STATUS_OKAY; +} + +void vbd_destroy(blkif_be_vbd_destroy_t *destroy) +{ + blkif_t *blkif; + vdi_t *vdi, **vdip; + blkif_vdev_t vdevice = destroy->vdevice; + + blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle); + if ( blkif == NULL ) + { + DPRINTF("vbd_destroy attempted for non-existent blkif (%u,%u)\n", + destroy->domid, destroy->blkif_handle); + destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + vdip = &blkif->vdi_hash[VDI_HASH(vdevice)]; + while ((*vdip != NULL) && ((*vdip)->vdevice != vdevice)) + vdip = &(*vdip)->next; + + if (*vdip != NULL) + { + vdi = *vdip; + *vdip = vdi->next; + vdi_put(vdi); + } + +} + +int parallax_control(control_msg_t *msg) +{ + domid_t domid; + int ret; + + DPRINTF("parallax_control: msg is %p\n", msg); + + if (msg->type != CMSG_BLKIF_BE) + { + printf("Unexpected control message (%d)\n", msg->type); + return 0; + } + + switch(msg->subtype) + { + case CMSG_BLKIF_BE_CREATE: + if ( msg->length != sizeof(blkif_be_create_t) ) + goto parse_error; + blkif_create((blkif_be_create_t *)msg->msg); + break; + + case CMSG_BLKIF_BE_DESTROY: + if ( msg->length != sizeof(blkif_be_destroy_t) ) + goto parse_error; + blkif_destroy((blkif_be_destroy_t *)msg->msg); + break; + + case CMSG_BLKIF_BE_VBD_CREATE: + if ( msg->length != sizeof(blkif_be_vbd_create_t) ) + goto parse_error; + vbd_create((blkif_be_vbd_create_t *)msg->msg); + break; + + case CMSG_BLKIF_BE_VBD_DESTROY: + if ( msg->length != sizeof(blkif_be_vbd_destroy_t) ) + goto parse_error; + vbd_destroy((blkif_be_vbd_destroy_t *)msg->msg); + break; + + case CMSG_BLKIF_BE_CONNECT: + case CMSG_BLKIF_BE_DISCONNECT: + /* we don't manage the device channel, the tap does. 
*/ + break; + + default: + goto parse_error; + } + return 0; +parse_error: + printf("Bad control message!\n"); + return 0; + +} + +int parallax_probe(blkif_request_t *req, blkif_t *blkif) +{ + blkif_response_t *rsp; + vdisk_t *img_info; + vdi_t *vdi; + int i, nr_vdis = 0; + + DPRINTF("parallax_probe: req=%p, blkif=%p\n", req, blkif); + + /* We expect one buffer only. */ + if ( req->nr_segments != 1 ) + goto err; + + /* Make sure the buffer is page-sized. */ + if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) || + (blkif_last_sect (req->frame_and_sects[0]) != 7) ) + goto err; + + /* fill the list of devices */ + for (i=0; i<VDI_HASHSZ; i++) { + vdi = blkif->vdi_hash[i]; + while (vdi) { + img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0); + img_info[nr_vdis].device = vdi->vdevice; + img_info[nr_vdis].info = 0; + /* The -1 here accounts for the LSB in the radix tree */ + img_info[nr_vdis].capacity = + ((1LL << (VDI_HEIGHT-1)) * SECTS_PER_NODE); + nr_vdis++; + vdi = vdi->next; + } + } + + + rsp = (blkif_response_t *)req; + rsp->id = req->id; + rsp->operation = BLKIF_OP_PROBE; + rsp->status = nr_vdis; /* number of disks */ + + DPRINTF("parallax_probe: send positive response (nr_vdis=%d)\n", nr_vdis); + return BLKTAP_RESPOND; +err: + rsp = (blkif_response_t *)req; + rsp->id = req->id; + rsp->operation = BLKIF_OP_PROBE; + rsp->status = BLKIF_RSP_ERROR; + + DPRINTF("parallax_probe: send error response\n"); + return BLKTAP_RESPOND; +} + +typedef struct { + blkif_request_t *req; + int count; + int error; + pthread_mutex_t mutex; +} pending_t; + +#define MAX_REQUESTS 64 +pending_t pending_list[MAX_REQUESTS]; + +struct cb_param { + pending_t *pent; + int segment; + u64 sector; + u64 vblock; /* for debug printing -- can be removed. 
*/ +}; + +static void read_cb(struct io_ret r, void *in_param) +{ + struct cb_param *param = (struct cb_param *)in_param; + pending_t *p = param->pent; + int segment = param->segment; + blkif_request_t *req = p->req; + unsigned long size, offset, start; + char *dpage, *spage; + + spage = IO_BLOCK(r); + if (spage == NULL) { p->error++; goto finish; } + dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), segment); + + /* Calculate read size and offset within the read block. */ + + offset = (param->sector << SECTOR_SHIFT) % BLOCK_SIZE; + size = ( blkif_last_sect (req->frame_and_sects[segment]) - + blkif_first_sect(req->frame_and_sects[segment]) + 1 + ) << SECTOR_SHIFT; + start = blkif_first_sect(req->frame_and_sects[segment]) + << SECTOR_SHIFT; + + DPRINTF("ParallaxRead: sect: %lld (%ld,%ld), " + "vblock %llx, " + "size %lx\n", + param->sector, blkif_first_sect(p->req->frame_and_sects[segment]), + blkif_last_sect (p->req->frame_and_sects[segment]), + param->vblock, size); + + memcpy(dpage + start, spage + offset, size); + freeblock(spage); + + /* Done the read. Now update the pending record. 
*/ + finish: + pthread_mutex_lock(&p->mutex); + p->count--; + + if (p->count == 0) { + blkif_response_t *rsp; + + rsp = (blkif_response_t *)req; + rsp->id = req->id; + rsp->operation = BLKIF_OP_READ; + if (p->error == 0) { + rsp->status = BLKIF_RSP_OKAY; + } else { + rsp->status = BLKIF_RSP_ERROR; + } + blktap_inject_response(rsp); + } + + pthread_mutex_unlock(&p->mutex); + + free(param); /* TODO: replace with cached alloc/dealloc */ +} + +int parallax_read(blkif_request_t *req, blkif_t *blkif) +{ + blkif_response_t *rsp; + u64 vblock, gblock; + vdi_t *vdi; + u64 sector; + int i; + char *dpage, *spage; + pending_t *pent; + + vdi = blkif_get_vdi(blkif, req->device); + + if ( vdi == NULL ) + goto err; + + pent = &pending_list[ID_TO_IDX(req->id)]; + pent->count = req->nr_segments; + pent->req = req; + pthread_mutex_init(&pent->mutex, NULL); + + for (i = 0; i < req->nr_segments; i++) { + pthread_t tid; + int ret; + struct cb_param *p; + + /* Round the requested segment to a block address. */ + sector = req->sector_number + (8*i); + vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT; + + /* TODO: Replace this call to malloc with a cached allocation */ + p = (struct cb_param *)malloc(sizeof(struct cb_param)); + p->pent = pent; + p->sector = sector; + p->segment = i; + p->vblock = vblock; /* dbg */ + + /* Get that block from the store. */ + vdi_read(vdi, vblock, read_cb, (void *)p); + } + + return BLKTAP_STOLEN; + +err: + rsp = (blkif_response_t *)req; + rsp->id = req->id; + rsp->operation = BLKIF_OP_READ; + rsp->status = BLKIF_RSP_ERROR; + + return BLKTAP_RESPOND; +} + +static void write_cb(struct io_ret r, void *in_param) +{ + struct cb_param *param = (struct cb_param *)in_param; + pending_t *p = param->pent; + blkif_request_t *req = p->req; + + /* catch errors from the block code. 
*/ + if (IO_INT(r) < 0) p->error++; + + pthread_mutex_lock(&p->mutex); + p->count--; + + if (p->count == 0) { + blkif_response_t *rsp; + + rsp = (blkif_response_t *)req; + rsp->id = req->id; + rsp->operation = BLKIF_OP_WRITE; + if (p->error == 0) { + rsp->status = BLKIF_RSP_OKAY; + } else { + rsp->status = BLKIF_RSP_ERROR; + } + blktap_inject_response(rsp); + } + + pthread_mutex_unlock(&p->mutex); + + free(param); /* TODO: replace with cached alloc/dealloc */ +} + +int parallax_write(blkif_request_t *req, blkif_t *blkif) +{ + blkif_response_t *rsp; + u64 sector; + int i, writable = 0; + u64 vblock, gblock; + char *spage; + unsigned long size, offset, start; + vdi_t *vdi; + pending_t *pent; + + vdi = blkif_get_vdi(blkif, req->device); + + if ( vdi == NULL ) + goto err; + + pent = &pending_list[ID_TO_IDX(req->id)]; + pent->count = req->nr_segments; + pent->req = req; + pthread_mutex_init(&pent->mutex, NULL); + + for (i = 0; i < req->nr_segments; i++) { + struct cb_param *p; + + spage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); + + /* Round the requested segment to a block address. */ + + sector = req->sector_number + (8*i); + vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT; + + /* Calculate read size and offset within the read block. */ + + offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE; + size = ( blkif_last_sect (req->frame_and_sects[i]) - + blkif_first_sect(req->frame_and_sects[i]) + 1 + ) << SECTOR_SHIFT; + start = blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT; + + DPRINTF("ParallaxWrite: sect: %lld (%ld,%ld), " + "vblock %llx, gblock %llx, " + "size %lx\n", + sector, blkif_first_sect(req->frame_and_sects[i]), + blkif_last_sect (req->frame_and_sects[i]), + vblock, gblock, size); + + /* XXX: For now we just freak out if they try to write a */ + /* non block-sized, block-aligned page. 
*/ + + if ((offset != 0) || (size != BLOCK_SIZE) || (start != 0)) { + printf("]\n] STRANGE WRITE!\n]\n"); + goto err; + } + + /* TODO: Replace this call to malloc with a cached allocation */ + p = (struct cb_param *)malloc(sizeof(struct cb_param)); + p->pent = pent; + p->sector = sector; + p->segment = i; + p->vblock = vblock; /* dbg */ + + /* Issue the write to the store. */ + vdi_write(vdi, vblock, spage, write_cb, (void *)p); + } + + return BLKTAP_STOLEN; + +err: + rsp = (blkif_response_t *)req; + rsp->id = req->id; + rsp->operation = BLKIF_OP_WRITE; + rsp->status = BLKIF_RSP_ERROR; + + return BLKTAP_RESPOND; +} + +int parallax_request(blkif_request_t *req) +{ + blkif_response_t *rsp; + domid_t dom = ID_TO_DOM(req->id); + blkif_t *blkif = blkif_find_by_handle(dom, 0); + + if (blkif == NULL) + goto err; + + if ( req->operation == BLKIF_OP_PROBE ) { + + return parallax_probe(req, blkif); + + } else if ( req->operation == BLKIF_OP_READ ) { + + return parallax_read(req, blkif); + + } else if ( req->operation == BLKIF_OP_WRITE ) { + + return parallax_write(req, blkif); + + } else { + printf("Unknown request message type!\n"); + /* Unknown operation */ + goto err; + } + +err: + rsp = (blkif_response_t *)req; + rsp->operation = req->operation; + rsp->id = req->id; + rsp->status = BLKIF_RSP_ERROR; + return BLKTAP_RESPOND; +} + +void __init_parallax(void) +{ + memset(blkif_hash, 0, sizeof(blkif_hash)); +} + + + +int main(int argc, char *argv[]) +{ + DPRINTF("parallax: starting.\n"); + __init_blockstore(); + DPRINTF("parallax: initialized blockstore...\n"); + init_block_async(); + DPRINTF("parallax: initialized async blocks...\n"); + __init_vdi(); + DPRINTF("parallax: initialized vdi registry etc...\n"); + __init_parallax(); + DPRINTF("parallax: initialized local stuff..\n"); + + blktap_register_ctrl_hook("parallax_control", parallax_control); + blktap_register_request_hook("parallax_request", parallax_request); + DPRINTF("parallax: added ctrl + request hooks, starting 
listen...\n"); + blktap_listen(); + + return 0; +} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi.c --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/vdi.c Sun Jul 3 22:36:48 2005 @@ -0,0 +1,367 @@ +/************************************************************************** + * + * vdi.c + * + * Virtual Disk Image (VDI) Interfaces + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <fcntl.h> +#include <string.h> +#include <sys/time.h> +#include <pthread.h> +#include "blockstore.h" +#include "block-async.h" +#include "requests-async.h" +#include "radix.h" +#include "vdi.h" + +#define VDI_REG_BLOCK 2LL +#define VDI_RADIX_ROOT writable(3) + +#if 0 +#define DPRINTF(_f, _a...) printf ( _f , ## _a ) +#else +#define DPRINTF(_f, _a...) ((void)0) +#endif + +/* I haven't decided about this registry stuff, so this is just a really + * quick lash-up so that there is some way to track VDIs. + * + * (Most vdi access should be with a direct handle to the block, so this + * registry is just for start-of-day lookup and other control operations.) + */ + +vdi_registry_t *create_vdi_registry(void) +{ + vdi_registry_t *reg = (vdi_registry_t *)newblock(); + + if (reg == NULL) + return NULL; + + /* zero-fill the vdi radix root while we have an empty block. 
*/ + writeblock(VDI_RADIX_ROOT, (void *)reg); + + + DPRINTF("[vdi.c] Creating VDI registry!\n"); + reg->magic = VDI_REG_MAGIC; + reg->nr_vdis = 0; + + writeblock(VDI_REG_BLOCK, (void *)reg); + + return reg; +} + +vdi_registry_t *get_vdi_registry(void) +{ + vdi_registry_t *vdi_reg = (vdi_registry_t *)readblock(VDI_REG_BLOCK); + + if ( vdi_reg == NULL ) + vdi_reg = create_vdi_registry(); + + if ( vdi_reg->magic != VDI_REG_MAGIC ) { + freeblock(vdi_reg); + return NULL; + } + + return vdi_reg; +} + + +vdi_t *vdi_create(snap_id_t *parent_snap, char *name) +{ + int ret; + vdi_t *vdi; + vdi_registry_t *vdi_reg; + snap_rec_t snap_rec; + + /* create a vdi struct */ + vdi = newblock(); + if (vdi == NULL) + return NULL; + + if ( snap_get_id(parent_snap, &snap_rec) == 0 ) { + vdi->radix_root = snapshot(snap_rec.radix_root); + } else { + vdi->radix_root = allocblock((void *)vdi); /* vdi is just zeros here */ + vdi->radix_root = writable(vdi->radix_root); /* grr. */ + } + + /* create a snapshot log, and add it to the vdi struct */ + + ret = snap_block_create(parent_snap, &vdi->snap); + if ( ret != 0 ) { + DPRINTF("Error getting snap block in vdi_create.\n"); + freeblock(vdi); + return NULL; + } + + /* append the vdi to the registry, fill block and id. */ + /* implicit allocation means we have to write the vdi twice here. 
*/ + vdi_reg = get_vdi_registry(); + if ( vdi_reg == NULL ) { + freeblock(vdi); + return NULL; + } + + vdi->block = allocblock((void *)vdi); + vdi->id = vdi_reg->nr_vdis++; + strncpy(vdi->name, name, VDI_NAME_SZ); + vdi->name[VDI_NAME_SZ] = '\0'; + vdi->radix_lock = NULL; /* for tidiness */ + writeblock(vdi->block, (void *)vdi); + + update(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi->id, vdi->block); + writeblock(VDI_REG_BLOCK, (void *)vdi_reg); + freeblock(vdi_reg); + + vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock)); + if (vdi->radix_lock == NULL) + { + perror("couldn't malloc radix_lock for new vdi!"); + freeblock(vdi); + return NULL; + } + radix_lock_init(vdi->radix_lock); + + return vdi; +} + +/* vdi_get and vdi_put currently act more like alloc/free -- they don't + * do refcount-based allocation. + */ +vdi_t *vdi_get(u64 vdi_id) +{ + u64 vdi_blk; + vdi_t *vdi; + + vdi_blk = lookup(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi_id); + + if ( vdi_blk == 0 ) + return NULL; + + vdi = (vdi_t *)readblock(vdi_blk); + + vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock)); + if (vdi->radix_lock == NULL) + { + perror("couldn't malloc radix_lock for new vdi!"); + freeblock(vdi); + return NULL; + } + radix_lock_init(vdi->radix_lock); + + return vdi; +} + +void vdi_put(vdi_t *vdi) +{ + free(vdi->radix_lock); + freeblock(vdi); +} + +void vdi_snapshot(vdi_t *vdi) +{ + snap_rec_t rec; + int ret; + + rec.radix_root = vdi->radix_root; + gettimeofday(&rec.timestamp, NULL); + rec.deleted = 0; + + vdi->radix_root = snapshot(vdi->radix_root); + ret = snap_append(&vdi->snap, &rec, &vdi->snap); + if ( ret != 0 ) { + printf("snap_append returned failure\n"); + return; + } + writeblock(vdi->block, vdi); +} + +int __init_vdi() +{ + /* sneak this in here for the moment. */ + __rcache_init(); + + /* force the registry to be created if it doesn't exist. 
*/ + vdi_registry_t *vdi_reg = get_vdi_registry(); + if (vdi_reg == NULL) { + printf("[vdi.c] Couldn't get/create a VDI registry!\n"); + return -1; + } + freeblock(vdi_reg); + + + return 0; +} + +#ifdef VDI_STANDALONE + +#define TEST_VDIS 50 +#define NR_ITERS 50000 +#define FORK_POINTS 200 +#define INIT_VDIS 3 +#define INIT_SNAPS 40 + +/* These must be of decreasing size: */ +#define NEW_FORK (RAND_MAX-(RAND_MAX/1000)) +#define NEW_ROOT_VDI (RAND_MAX-((RAND_MAX/1000)*2)) +#define NEW_FORK_VDI (RAND_MAX-((RAND_MAX/1000)*3)) + +#define GRAPH_DOT_FILE "vdi.dot" +#define GRAPH_PS_FILE "vdi.ps" + + +typedef struct sh_st { + snap_id_t id; + struct sh_st *next; +} sh_t; + +#define SNAP_HASHSZ 1024 +sh_t *node_hash[SNAP_HASHSZ]; +#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ) + +#define SNAPID_EQUAL(_a,_b) \ + (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index)) +int sh_check_and_add(snap_id_t *id) +{ + sh_t **s = &node_hash[SNAP_HASH(id)]; + + while (*s != NULL) { + if (SNAPID_EQUAL(&((*s)->id), id)) + return 1; + *s = (*s)->next; + } + + *s = (sh_t *)malloc(sizeof(sh_t)); + (*s)->id = *id; + (*s)->next = NULL; + + return 0; +} + +int main(int argc, char *argv[]) +{ + vdi_t *vdi_list[TEST_VDIS]; + snap_id_t id, fork_points[FORK_POINTS]; + int nr_vdis = 0, nr_forks = 0; + int i, j, r; + FILE *f; + char name[VDI_NAME_SZ]; + + __init_blockstore(); + __init_vdi(); + + printf("[o] Generating seed VDIs. (%d VDIs)\n", INIT_VDIS); + + for (i=0; i<INIT_VDIS; i++) { + r=rand(); + + sprintf(name, "VDI Number %d", nr_vdis); + vdi_list[i] = vdi_create(NULL, name); + for (j=0; j<(r%INIT_SNAPS); j++) + vdi_snapshot(vdi_list[i]); + fork_points[i] = vdi_list[i]->snap; + nr_vdis++; + nr_forks++; + } + + printf("[o] Running a random workload. 
(%d iterations)\n", NR_ITERS); + + for (i=0; i<NR_ITERS; i++) { + r = rand(); + + if ( r > NEW_FORK ) { + if ( nr_forks > FORK_POINTS ) + continue; + id = vdi_list[r%nr_vdis]->snap; + if ( ( id.block == 0 ) || ( id.index == 0 ) ) + continue; + id.index--; + fork_points[nr_forks++] = id; + + } else if ( r > NEW_ROOT_VDI ) { + + if ( nr_vdis == TEST_VDIS ) + continue; + + sprintf(name, "VDI Number %d.", nr_vdis); + vdi_list[nr_vdis++] = vdi_create(NULL, name); + + } else if ( r > NEW_FORK_VDI ) { + + if ( nr_vdis == TEST_VDIS ) + continue; + + sprintf(name, "VDI Number %d.", nr_vdis); + vdi_list[nr_vdis++] = vdi_create(&fork_points[r%nr_forks], name); + + } else /* SNAPSHOT */ { + + vdi_snapshot(vdi_list[r%nr_vdis]); + + } + } + + /* now dump it out to a dot file. */ + printf("[o] Dumping state to a dot graph. (%d VDIs)\n", nr_vdis); + + f = fopen(GRAPH_DOT_FILE, "w"); + + /* write graph preamble */ + fprintf(f, "digraph G {\n"); + fprintf(f, " rankdir=LR\n"); + + for (i=0; i<nr_vdis; i++) { + char oldnode[255]; + snap_block_t *blk; + snap_id_t id = vdi_list[i]->snap; + int nr_snaps, done=0; + + /* add a node for the id */ +printf("vdi: %d\n", i); + fprintf(f, " n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n", + id.block, id.index, vdi_list[i]->name, + id.block, id.index); + sprintf(oldnode, "n%Ld%d", id.block, id.index); + + while (id.block != 0) { + blk = snap_get_block(id.block); + nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index); + id = blk->hdr.fork_block; + + done = sh_check_and_add(&id); + + /* add a node for the fork_id */ + if (!done) { + fprintf(f, " n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n", + id.block, id.index, + id.block, id.index); + } + + /* add an edge between them */ + fprintf(f, " n%Ld%d -> %s [label=\"%u snapshots\"]\n", + id.block, id.index, oldnode, nr_snaps); + sprintf(oldnode, "n%Ld%d", id.block, id.index); + freeblock(blk); + + if (done) break; + } + } + + /* write graph postamble */ + fprintf(f, "}\n"); 
+ fclose(f); + + printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE); + { + char cmd[255]; + sprintf(cmd, "dot %s -Tps -o %s", GRAPH_DOT_FILE, GRAPH_PS_FILE); + system(cmd); + } + return 0; +} + +#endif diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi.h --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/vdi.h Sun Jul 3 22:36:48 2005 @@ -0,0 +1,55 @@ +#ifndef _VDI_H_ +#define _VDI_H_ +/************************************************************************** + * + * vdi.h + * + * Virtual Disk Image (VDI) Interfaces + * + */ + +#ifndef __VDI_H__ +#define __VDI_H__ + +#include "blktaplib.h" +#include "snaplog.h" + +#define VDI_HEIGHT 27 /* Note that these are now hard-coded */ +#define VDI_REG_HEIGHT 27 /* in the async lookup code */ + +#define VDI_NAME_SZ 256 + + +typedef struct vdi { + u64 id; /* unique vdi id -- used by the registry */ + u64 block; /* block where this vdi lives (also unique)*/ + u64 radix_root; /* radix root node for block mappings */ + snap_id_t snap; /* next snapshot slot for this VDI */ + struct vdi *next; /* used to hash-chain in blkif. */ + blkif_vdev_t vdevice; /* currently mounted as... 
*/ + struct radix_lock *radix_lock;/* per-line L1 RW lock for parallel reqs */ + char name[VDI_NAME_SZ];/* human readable vdi name */ +} vdi_t; + +#define VDI_REG_MAGIC 0xff00ff0bb0ff00ffLL + +typedef struct vdi_registry { + u64 magic; + u64 nr_vdis; +} vdi_registry_t; + + +int __init_vdi(void); + +vdi_t *vdi_get(u64 vdi_id); +void vdi_put(vdi_t *vdi); +vdi_registry_t *get_vdi_registry(void); +vdi_t *vdi_create(snap_id_t *parent_snap, char *name); +u64 vdi_lookup_block(vdi_t *vdi, u64 vdi_block, int *writable); +void vdi_update_block(vdi_t *vdi, u64 vdi_block, u64 g_block); +void vdi_snapshot(vdi_t *vdi); + + +#endif /* __VDI_H__ */ + +#endif //_VDI_H_ diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/requests-async.c --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/requests-async.c Sun Jul 3 22:36:48 2005 @@ -0,0 +1,762 @@ +/* requests-async.c + * + * asynchronous request dispatcher for radix access in parallax. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <assert.h> +#include <pthread.h> +#include <err.h> +#include <zlib.h> /* for crc32() */ +#include "requests-async.h" +#include "vdi.h" +#include "radix.h" + +#define L1_IDX(_a) (((_a) & 0x0000000007fc0000ULL) >> 18) +#define L2_IDX(_a) (((_a) & 0x000000000003fe00ULL) >> 9) +#define L3_IDX(_a) (((_a) & 0x00000000000001ffULL)) + + +#if 0 +#define DPRINTF(_f, _a...) printf ( _f , ## _a ) +#else +#define DPRINTF(_f, _a...) ((void)0) +#endif + +struct block_info { + u32 crc; + u32 unused; +}; + +struct io_req { + enum { IO_OP_READ, IO_OP_WRITE } op; + u64 root; + u64 vaddr; + int state; + io_cb_t cb; + void *param; + struct radix_lock *lock; + + /* internal stuff: */ + struct io_ret retval;/* holds the return while we unlock. 
*/ + char *block; /* the block to write */ + radix_tree_node radix[3]; + u64 radix_addr[3]; + struct block_info bi; +}; + +void clear_w_bits(radix_tree_node node) +{ + int i; + for (i=0; i<RADIX_TREE_MAP_ENTRIES; i++) + node[i] = node[i] & ONEMASK; + return; +} + +void clear_L3_w_bits(radix_tree_node node) +{ + int i; + for (i=0; i<RADIX_TREE_MAP_ENTRIES; i+=2) + node[i] = node[i] & ONEMASK; + return; +} + +enum states { + /* both */ + READ_L1, + READ_L2, + READ_L3, + + /* read */ + READ_LOCKED, + READ_DATA, + READ_UNLOCKED, + RETURN_ZERO, + + /* write */ + WRITE_LOCKED, + WRITE_DATA, + WRITE_L3, + WRITE_UNLOCKED, + + /* L3 Zero Path */ + ALLOC_DATA_L3z, + WRITE_L3_L3z, + + /* L3 Fault Path */ + ALLOC_DATA_L3f, + WRITE_L3_L3f, + + /* L2 Zero Path */ + ALLOC_DATA_L2z, + WRITE_L2_L2z, + ALLOC_L3_L2z, + WRITE_L2_L3z, + + /* L2 Fault Path */ + READ_L3_L2f, + ALLOC_DATA_L2f, + WRITE_L2_L2f, + ALLOC_L3_L2f, + WRITE_L2_L3f, + + /* L1 Zero Path */ + ALLOC_DATA_L1z, + ALLOC_L3_L1z, + ALLOC_L2_L1z, + WRITE_L1_L1z, + + /* L1 Fault Path */ + READ_L2_L1f, + READ_L3_L1f, + ALLOC_DATA_L1f, + ALLOC_L3_L1f, + ALLOC_L2_L1f, + WRITE_L1_L1f, + +}; + +enum radix_offsets { + L1 = 0, + L2 = 1, + L3 = 2 +}; + + +static void read_cb(struct io_ret ret, void *param); +static void write_cb(struct io_ret ret, void *param); + +int vdi_read(vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param) +{ + struct io_req *req; + + if (!VALID_VADDR(vaddr)) return ERR_BAD_VADDR; + /* Every second line in the bottom-level radix tree is used to */ + /* store crc32 values etc. We shift the vadder here to achied this. 
*/ + vaddr <<= 1; + + req = (struct io_req *)malloc(sizeof (struct io_req)); + if (req == NULL) return ERR_NOMEM; + + req->radix[0] = req->radix[1] = req->radix[2] = NULL; + req->op = IO_OP_READ; + req->root = vdi->radix_root; + req->lock = vdi->radix_lock; + req->vaddr = vaddr; + req->cb = cb; + req->param = param; + req->state = READ_LOCKED; + + block_rlock(req->lock, L1_IDX(vaddr), read_cb, req); + + return 0; +} + + +int vdi_write(vdi_t *vdi, u64 vaddr, char *block, + io_cb_t cb, void *param) +{ + struct io_req *req; + + if (!VALID_VADDR(vaddr)) return ERR_BAD_VADDR; + /* Every second line in the bottom-level radix tree is used to */ + /* store crc32 values etc. We shift the vadder here to achied this. */ + vaddr <<= 1; + + req = (struct io_req *)malloc(sizeof (struct io_req)); + if (req == NULL) return ERR_NOMEM; + + req->radix[0] = req->radix[1] = req->radix[2] = NULL; + req->op = IO_OP_WRITE; + req->root = vdi->radix_root; + req->lock = vdi->radix_lock; + req->vaddr = vaddr; + req->block = block; + /* Todo: add a pseodoheader to the block to include some location */ + /* information in the CRC as well. 
*/ + req->bi.crc = (u32) crc32(0L, Z_NULL, 0); + req->bi.crc = (u32) crc32(req->bi.crc, block, BLOCK_SIZE); + req->bi.unused = 0xdeadbeef; + + req->cb = cb; + req->param = param; + req->radix_addr[L1] = getid(req->root); /* for consistency */ + req->state = WRITE_LOCKED; + + block_wlock(req->lock, L1_IDX(vaddr), write_cb, req); + + + return 0; +} + +static void read_cb(struct io_ret ret, void *param) +{ + struct io_req *req = (struct io_req *)param; + radix_tree_node node; + u64 idx; + char *block; + void *req_param; + + DPRINTF("read_cb\n"); + /* get record */ + switch(req->state) { + + case READ_LOCKED: + + DPRINTF("READ_LOCKED\n"); + req->state = READ_L1; + block_read(getid(req->root), read_cb, req); + break; + + case READ_L1: /* block is the radix root */ + + DPRINTF("READ_L1\n"); + block = IO_BLOCK(ret); + if (block == NULL) goto fail; + node = (radix_tree_node) block; + idx = getid( node[L1_IDX(req->vaddr)] ); + free(block); + if ( idx == ZERO ) { + req->state = RETURN_ZERO; + block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req); + } else { + req->state = READ_L2; + block_read(idx, read_cb, req); + } + break; + + case READ_L2: + + DPRINTF("READ_L2\n"); + block = IO_BLOCK(ret); + if (block == NULL) goto fail; + node = (radix_tree_node) block; + idx = getid( node[L2_IDX(req->vaddr)] ); + free(block); + if ( idx == ZERO ) { + req->state = RETURN_ZERO; + block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req); + } else { + req->state = READ_L3; + block_read(idx, read_cb, req); + } + break; + + case READ_L3: + { + struct block_info *bi; + + DPRINTF("READ_L3\n"); + block = IO_BLOCK(ret); + if (block == NULL) goto fail; + node = (radix_tree_node) block; + idx = getid( node[L3_IDX(req->vaddr)] ); + bi = (struct block_info *) &node[L3_IDX(req->vaddr) + 1]; + req->bi = *bi; + free(block); + if ( idx == ZERO ) { + req->state = RETURN_ZERO; + block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req); + } else { + req->state = READ_DATA; + block_read(idx, 
read_cb, req); + } + break; + } + case READ_DATA: + { + u32 crc; + + DPRINTF("READ_DATA\n"); + block = IO_BLOCK(ret); + if (block == NULL) goto fail; + + /* crc check */ + crc = (u32) crc32(0L, Z_NULL, 0); + crc = (u32) crc32(crc, block, BLOCK_SIZE); + if (crc != req->bi.crc) { + /* TODO: add a retry loop here. */ + /* Do this after the cache is added -- make sure to */ + /* invalidate the bad page before reissuing the read. */ + + warn("Bad CRC on vaddr (%Lu:%d)\n", req->vaddr, req->bi.unused); +#ifdef PRINT_BADCRC_PAGES + { + int j; + for (j=0; j<BLOCK_SIZE; j++) { + if isprint(block[j]) { + printf("%c", block[j]); + } else { + printf("."); + } + if ((j % 64) == 0) printf("\n"); + } + } +#endif /* PRINT_BADCRC_PAGES */ + + /* fast and loose for the moment. */ + /* goto fail; */ + } + + req->retval = ret; + req->state = READ_UNLOCKED; + block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req); + break; + } + case READ_UNLOCKED: + { + struct io_ret r; + io_cb_t cb; + DPRINTF("READ_UNLOCKED\n"); + req_param = req->param; + r = req->retval; + cb = req->cb; + free(req); + cb(r, req_param); + break; + } + + case RETURN_ZERO: + { + struct io_ret r; + io_cb_t cb; + DPRINTF("RETURN_ZERO\n"); + req_param = req->param; + cb = req->cb; + free(req); + r.type = IO_BLOCK_T; + r.u.b = newblock(); + cb(r, req_param); + break; + } + + default: + DPRINTF("*** Write: Bad state! 
(%d) ***\n", req->state);
+        goto fail;
+    }
+
+    return;
+
+ fail:
+    {
+        struct io_ret r;
+        io_cb_t cb;
+        DPRINTF("asyn_read had a read error.\n");
+        req_param = req->param;
+        r         = ret;
+        cb        = req->cb;
+        free(req);
+        cb(r, req_param);
+    }
+
+
+}
+
+/*
+ * write_cb_set_bi: store the request's block_info (crc + tag) in the L3
+ * slot that follows the data pointer for req->vaddr.
+ * @req: request whose req->radix[L3] node is already in memory
+ * @tag: value recorded in the 'unused' field; every write path passes its
+ *       own constant (101..107).
+ */
+static void write_cb_set_bi(struct io_req *req, u32 tag)
+{
+    struct block_info *bi;
+
+    bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1];
+    req->bi.unused = tag;
+    *bi = req->bi;
+}
+
+/*
+ * write_cb_free_nodes: release every radix node buffered in the request
+ * and clear the slots so they cannot be freed twice.
+ */
+static void write_cb_free_nodes(struct io_req *req)
+{
+    int i;
+
+    for (i=0; i<3; i++) {
+        free(req->radix[i]);        /* free(NULL) is a no-op */
+        req->radix[i] = NULL;
+    }
+}
+
+/*
+ * write_cb: state machine behind vdi_write().
+ * @r:     result of the operation issued by the previous state
+ * @param: the struct io_req being serviced
+ *
+ * Each invocation consumes the result of one asynchronous block
+ * operation, records the next state in req->state and issues the next
+ * operation with itself as the callback.  The walk reads L1, L2 and L3;
+ * at each level an empty entry takes the "zero" path (allocate everything
+ * below), a non-writable entry takes the "fault" path (copy the nodes
+ * below into freshly allocated blocks), and a writable entry continues
+ * down.  Every path ends in one of the WRITE_* states, which releases the
+ * write lock and then, in WRITE_UNLOCKED, frees the request and calls the
+ * user's callback with the result of the final write.
+ *
+ * On any failure the buffered nodes are freed, the write lock is released
+ * and the user's callback receives an IO_INT_T result of -1.
+ */
+static void write_cb(struct io_ret r, void *param)
+{
+    struct io_req *req = (struct io_req *)param;
+    radix_tree_node node;
+    u64 a, addr;
+
+    switch(req->state) {
+
+    case WRITE_LOCKED:
+
+        DPRINTF("WRITE_LOCKED (%llu)\n", L1_IDX(req->vaddr));
+        req->state = READ_L1;
+        block_read(getid(req->root), write_cb, req);
+        break;
+
+    case READ_L1: /* block is the radix root */
+
+        DPRINTF("READ_L1\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        if (node == NULL) goto fail;
+        a    = node[L1_IDX(req->vaddr)];
+        addr = getid(a);
+
+        req->radix_addr[L2] = addr;
+        req->radix[L1]      = node;
+
+        if ( addr == ZERO ) {
+            /* L1 empty subtree: */
+            req->state = ALLOC_DATA_L1z;
+            block_alloc( req->block, write_cb, req );
+        } else if ( !iswritable(a) ) {
+            /* L1 fault: */
+            req->state = READ_L2_L1f;
+            block_read( addr, write_cb, req );
+        } else {
+            req->state = READ_L2;
+            block_read( addr, write_cb, req );
+        }
+        break;
+
+    case READ_L2:
+
+        DPRINTF("READ_L2\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        if (node == NULL) goto fail;
+        a    = node[L2_IDX(req->vaddr)];
+        addr = getid(a);
+
+        req->radix_addr[L3] = addr;
+        req->radix[L2]      = node;
+
+        if ( addr == ZERO ) {
+            /* L2 empty subtree: */
+            req->state = ALLOC_DATA_L2z;
+            block_alloc( req->block, write_cb, req );
+        } else if ( !iswritable(a) ) {
+            /* L2 fault: */
+            req->state = READ_L3_L2f;
+            block_read( addr, write_cb, req );
+        } else {
+            req->state = READ_L3;
+            block_read( addr, write_cb, req );
+        }
+        break;
+
+    case READ_L3:
+
+        DPRINTF("READ_L3\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        if (node == NULL) goto fail;
+        a    = node[L3_IDX(req->vaddr)];
+        addr = getid(a);
+
+        req->radix[L3] = node;
+
+        if ( addr == ZERO ) {
+            /* L3 empty entry: */
+            req->state = ALLOC_DATA_L3z;
+            block_alloc( req->block, write_cb, req );
+        } else if ( !iswritable(a) ) {
+            /* L3 fault: */
+            req->state = ALLOC_DATA_L3f;
+            block_alloc( req->block, write_cb, req );
+        } else {
+            req->state = WRITE_DATA;
+            block_write( addr, req->block, write_cb, req );
+        }
+        break;
+
+    case WRITE_DATA:
+
+        DPRINTF("WRITE_DATA\n");
+        /* The L3 radix points to the correct block, we just need to  */
+        /* update the crc.                                            */
+        if (IO_INT(r) < 0) goto fail;
+        write_cb_set_bi(req, 101);
+        req->state = WRITE_L3;
+        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
+        break;
+
+    /* L3 Zero Path: */
+
+    case ALLOC_DATA_L3z:
+
+        DPRINTF("ALLOC_DATA_L3z\n");
+        addr = IO_ADDR(r);
+        req->radix[L3][L3_IDX(req->vaddr)] = writable(addr);
+        write_cb_set_bi(req, 102);
+        req->state = WRITE_L3_L3z;
+        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
+        break;
+
+    /* L3 Fault Path: */
+
+    case ALLOC_DATA_L3f:
+
+        DPRINTF("ALLOC_DATA_L3f\n");
+        addr = IO_ADDR(r);
+        req->radix[L3][L3_IDX(req->vaddr)] = writable(addr);
+        write_cb_set_bi(req, 103);
+        req->state = WRITE_L3_L3f;
+        block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req);
+        break;
+
+    /* L2 Zero Path: */
+
+    case ALLOC_DATA_L2z:
+
+        DPRINTF("ALLOC_DATA_L2z\n");
+        addr = IO_ADDR(r);
+        req->radix[L3] = newblock();
+        if (req->radix[L3] == NULL) goto fail;
+        req->radix[L3][L3_IDX(req->vaddr)] = writable(addr);
+        write_cb_set_bi(req, 104);
+        req->state = ALLOC_L3_L2z;
+        block_alloc( (char*)req->radix[L3], write_cb, req );
+        break;
+
+    case ALLOC_L3_L2z:
+
+        DPRINTF("ALLOC_L3_L2z\n");
+        addr = IO_ADDR(r);
+        req->radix[L2][L2_IDX(req->vaddr)] = writable(addr);
+        req->state = WRITE_L2_L2z;
+        block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
+        break;
+
+    /* L2 Fault Path: */
+
+    case READ_L3_L2f:
+
+        DPRINTF("READ_L3_L2f\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        /* check before touching the node: the read may have failed */
+        if (node == NULL) goto fail;
+        clear_L3_w_bits(node);
+
+        req->radix[L3] = node;
+        req->state = ALLOC_DATA_L2f;
+        block_alloc( req->block, write_cb, req );
+        break;
+
+    case ALLOC_DATA_L2f:
+
+        DPRINTF("ALLOC_DATA_L2f\n");
+        addr = IO_ADDR(r);
+        req->radix[L3][L3_IDX(req->vaddr)] = writable(addr);
+        write_cb_set_bi(req, 105);
+        req->state = ALLOC_L3_L2f;
+        block_alloc( (char*)req->radix[L3], write_cb, req );
+        break;
+
+    case ALLOC_L3_L2f:
+
+        DPRINTF("ALLOC_L3_L2f\n");
+        addr = IO_ADDR(r);
+        req->radix[L2][L2_IDX(req->vaddr)] = writable(addr);
+        req->state = WRITE_L2_L2f;
+        block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req);
+        break;
+
+    /* L1 Zero Path: */
+
+    case ALLOC_DATA_L1z:
+
+        DPRINTF("ALLOC_DATA_L1z\n");
+        addr = IO_ADDR(r);
+        req->radix[L3] = newblock();
+        if (req->radix[L3] == NULL) goto fail;
+        req->radix[L3][L3_IDX(req->vaddr)] = writable(addr);
+        write_cb_set_bi(req, 106);
+        req->state = ALLOC_L3_L1z;
+        block_alloc( (char*)req->radix[L3], write_cb, req );
+        break;
+
+    case ALLOC_L3_L1z:
+
+        DPRINTF("ALLOC_L3_L1z\n");
+        addr = IO_ADDR(r);
+        req->radix[L2] = newblock();
+        if (req->radix[L2] == NULL) goto fail;
+        req->radix[L2][L2_IDX(req->vaddr)] = writable(addr);
+        req->state = ALLOC_L2_L1z;
+        block_alloc( (char*)req->radix[L2], write_cb, req );
+        break;
+
+    case ALLOC_L2_L1z:
+
+        DPRINTF("ALLOC_L2_L1z\n");
+        addr = IO_ADDR(r);
+        req->radix[L1][L1_IDX(req->vaddr)] = writable(addr);
+        req->state = WRITE_L1_L1z;
+        block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
+        break;
+
+    /* L1 Fault Path: */
+
+    case READ_L2_L1f:
+
+        DPRINTF("READ_L2_L1f\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        /* check before touching the node: the read may have failed */
+        if (node == NULL) goto fail;
+        clear_w_bits(node);
+        a    = node[L2_IDX(req->vaddr)];
+        addr = getid(a);
+
+        req->radix_addr[L3] = addr;
+        req->radix[L2]      = node;
+
+        if (addr == ZERO) {
+            /* nothing below L2, create an empty L3 and alloc data. */
+            /* (So skip READ_L3_L1f.) */
+            req->radix[L3] = newblock();
+            if (req->radix[L3] == NULL) goto fail;
+            req->state = ALLOC_DATA_L1f;
+            block_alloc( req->block, write_cb, req );
+        } else {
+            req->state = READ_L3_L1f;
+            block_read( addr, write_cb, req );
+        }
+        break;
+
+    case READ_L3_L1f:
+
+        DPRINTF("READ_L3_L1f\n");
+        node = (radix_tree_node) IO_BLOCK(r);
+        /* check before touching the node: the read may have failed */
+        if (node == NULL) goto fail;
+        clear_L3_w_bits(node);
+
+        req->radix[L3] = node;
+        req->state = ALLOC_DATA_L1f;
+        block_alloc( req->block, write_cb, req );
+        break;
+
+    case ALLOC_DATA_L1f:
+
+        DPRINTF("ALLOC_DATA_L1f\n");
+        addr = IO_ADDR(r);
+        req->radix[L3][L3_IDX(req->vaddr)] = writable(addr);
+        write_cb_set_bi(req, 107);
+        req->state = ALLOC_L3_L1f;
+        block_alloc( (char*)req->radix[L3], write_cb, req );
+        break;
+
+    case ALLOC_L3_L1f:
+
+        DPRINTF("ALLOC_L3_L1f\n");
+        addr = IO_ADDR(r);
+        req->radix[L2][L2_IDX(req->vaddr)] = writable(addr);
+        req->state = ALLOC_L2_L1f;
+        block_alloc( (char*)req->radix[L2], write_cb, req );
+        break;
+
+    case ALLOC_L2_L1f:
+
+        DPRINTF("ALLOC_L2_L1f\n");
+        addr = IO_ADDR(r);
+        req->radix[L1][L1_IDX(req->vaddr)] = writable(addr);
+        req->state = WRITE_L1_L1f;
+        block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req);
+        break;
+
+    case WRITE_L3:
+    case WRITE_L3_L3z:
+    case WRITE_L3_L3f:
+    case WRITE_L2_L2z:
+    case WRITE_L2_L2f:
+    case WRITE_L1_L1z:
+    case WRITE_L1_L1f:
+
+        DPRINTF("DONE\n");
+        /* free any saved node vals. */
+        write_cb_free_nodes(req);
+        req->retval = r;
+        req->state = WRITE_UNLOCKED;
+        block_wunlock(req->lock, L1_IDX(req->vaddr), write_cb, req);
+        break;
+
+    case WRITE_UNLOCKED:
+    {
+        /* copy what is needed out of req before it is freed */
+        struct io_ret  result    = req->retval;
+        io_cb_t        cb        = req->cb;
+        void          *req_param = req->param;
+
+        DPRINTF("WRITE_UNLOCKED!\n");
+        free(req);
+        cb(result, req_param);
+        break;
+    }
+
+    default:
+        DPRINTF("*** Write: Bad state! (%d) ***\n", req->state);
+        goto fail;
+    }
+
+    return;
+
+ fail:
+    DPRINTF("asyn_write had a read error mid-way.\n");
+    /* free any saved node vals. */
+    write_cb_free_nodes(req);
+    /* Every state handled above runs with the write lock held, so the  */
+    /* error has to travel through the unlock step as well; otherwise   */
+    /* the L1 line would stay locked.  WRITE_UNLOCKED delivers retval.  */
+    req->retval.type = IO_INT_T;
+    req->retval.u.i  = -1;
+    req->state       = WRITE_UNLOCKED;
+    block_wunlock(req->lock, L1_IDX(req->vaddr), write_cb, req);
+}
+
+/*
+ * Completion record shared between a synchronous wrapper and the callback
+ * it hands to the asynchronous call.  'done' is protected by 'lock'.
+ */
+struct sync_req {
+    pthread_mutex_t lock;
+    pthread_cond_t  cond;
+    int             done;
+    struct io_ret   ret;
+};
+
+static void sync_req_init(struct sync_req *s)
+{
+    pthread_mutex_init(&s->lock, NULL);
+    pthread_cond_init(&s->cond, NULL);
+    s->done = 0;
+}
+
+static void sync_req_destroy(struct sync_req *s)
+{
+    pthread_cond_destroy(&s->cond);
+    pthread_mutex_destroy(&s->lock);
+}
+
+/* io_cb_t callback: record the result and wake the waiting caller. */
+static void sync_req_cb(struct io_ret r, void *param)
+{
+    struct sync_req *s = (struct sync_req *)param;
+
+    pthread_mutex_lock(&s->lock);
+    s->ret  = r;
+    s->done = 1;
+    pthread_cond_signal(&s->cond);
+    pthread_mutex_unlock(&s->lock);
+}
+
+/* Block until sync_req_cb() has run, then hand back the result. */
+static struct io_ret sync_req_wait(struct sync_req *s)
+{
+    struct io_ret r;
+
+    pthread_mutex_lock(&s->lock);
+    while (!s->done)
+        pthread_cond_wait(&s->cond, &s->lock);
+    r = s->ret;
+    pthread_mutex_unlock(&s->lock);
+
+    return r;
+}
+
+/*
+ * vdi_read_s: synchronous vdi_read().
+ * @vdi:   vdi to read from
+ * @vaddr: virtual block address
+ *
+ * @return: the block delivered to the read callback, or NULL if
+ *          vdi_read() refused the request or the read failed.
+ */
+char *vdi_read_s(vdi_t *vdi, u64 vaddr)
+{
+    struct sync_req s;
+    char *block = NULL;
+
+    sync_req_init(&s);
+    if (vdi_read(vdi, vaddr, sync_req_cb, &s) == 0)
+        block = IO_BLOCK(sync_req_wait(&s));
+    sync_req_destroy(&s);
+
+    return block;
+}
+
+
+/*
+ * vdi_write_s: synchronous vdi_write().
+ * @vdi:   vdi to write to
+ * @vaddr: virtual block address
+ * @block: data to write
+ *
+ * @return: the integer result delivered to the write callback, or the
+ *          (negative) error code of vdi_write() if the request was never
+ *          started.
+ */
+int vdi_write_s(vdi_t *vdi, u64 vaddr, char *block)
+{
+    struct sync_req s;
+    int ret;
+
+    sync_req_init(&s);
+    ret = vdi_write(vdi, vaddr, block, sync_req_cb, &s);
+    if (ret == 0)
+        ret = IO_INT(sync_req_wait(&s));
+    sync_req_destroy(&s);
+
+    return ret;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/requests-async.h
--- /dev/null Sun Jul 3 22:32:52 2005
+++ b/tools/blktap/parallax/requests-async.h Sun Jul 3 22:36:48 2005
@@ -0,0 +1,29 @@
+#ifndef _REQUESTSASYNC_H_
+#define _REQUESTSASYNC_H_
+
+#include "block-async.h"
+#include "blockstore.h" /* for newblock etc. 
*/
+
+/*
+#define BLOCK_SIZE 4096
+#define ZERO 0ULL
+#define getid(x) (((x)>>1)&0x7fffffffffffffffLLU)
+#define iswritable(x) (((x) & 1LLU) != 0)
+#define writable(x) (((x) << 1) | 1LLU)
+#define readonly(x) ((u64)((x) << 1))
+*/
+
+/* A virtual block address is valid only if it fits in the low 26 bits. */
+#define VADDR_MASK 0x0000000003ffffffLLU /* 26-bits = 256Gig */
+#define VALID_VADDR(x) (((x) & VADDR_MASK) == (x))
+
+/*
+ * Asynchronous block access through a vdi.
+ *
+ * Both calls return ERR_BAD_VADDR if vaddr fails VALID_VADDR(),
+ * ERR_NOMEM if the request record cannot be allocated, and 0 once the
+ * request has been queued; in the last case the outcome is delivered
+ * later through cb(result, param).
+ *
+ * vdi_write keeps the block pointer in the request and hands it to the
+ * block layer in later steps, so the buffer has to stay valid until cb
+ * has been called.
+ */
+int vdi_read (vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param);
+int vdi_write(vdi_t *vdi, u64 vaddr, char *block, io_cb_t cb, void *param);
+
+/* synchronous versions: */
+/* vdi_read_s returns the block that was read (NULL on failure);       */
+/* vdi_write_s returns the integer result reported by the write path.  */
+char *vdi_read_s (vdi_t *vdi, u64 vaddr);
+int vdi_write_s(vdi_t *vdi, u64 vaddr, char *block);
+
+/* Error codes returned directly by vdi_read() / vdi_write(). */
+#define ERR_BAD_VADDR -1
+#define ERR_NOMEM -2
+
+#endif //_REQUESTSASYNC_H_
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_unittest.c
--- /dev/null Sun Jul 3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_unittest.c Sun Jul 3 22:36:48 2005
@@ -0,0 +1,184 @@
+/**************************************************************************
+ * 
+ * vdi_unittest.c
+ *
+ * Run a small test workload to ensure that data access through a vdi
+ * is (at least superficially) correct. 
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "requests-async.h"
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+#define TEST_PAGES  32
+static char *zero_page;
+static char pages[TEST_PAGES][BLOCK_SIZE];
+static int next_page = 0;
+static int failures = 0;   /* number of checks that did not pass */
+
+/*
+ * fill_test_pages: fill every test page with pseudo-random bytes and
+ * allocate the page used for the read-from-hole comparisons.
+ *
+ * The pages are filled a byte at a time so that exactly BLOCK_SIZE bytes
+ * are written per page whatever the size of 'long' is.
+ */
+void fill_test_pages(void)
+{
+    int i, j;
+
+    for (i=0; i< TEST_PAGES; i++) {
+        for (j=0; j<BLOCK_SIZE; j++) {
+            pages[i][j] = (char) (random() & 0xff);
+        }
+    }
+
+    /* presumably newblock() hands back a zero-filled block, which is */
+    /* what a read of an unmapped address is compared against.        */
+    zero_page = newblock();
+    if (zero_page == NULL) {
+        printf("Failed to allocate the zero page!\n");
+        exit(-1);
+    }
+}
+
+/*
+ * make_vaddr: build a virtual block address from three 9-bit indices.
+ */
+static inline u64 make_vaddr(u64 L1, u64 L2, u64 L3)
+{
+    u64 ret = L1;
+
+    ret = (ret << 9) | L2;
+    ret = (ret << 9) | L3;
+
+    return ret;
+}
+
+/*
+ * compare_block: read vaddr back and compare it with the expected page.
+ * Any problem is printed and counted in 'failures'.
+ */
+static void compare_block(vdi_t *vdi, u64 vaddr, char *page)
+{
+    char *rpage = vdi_read_s(vdi, vaddr);
+
+    if (rpage == NULL)
+    {
+        printf( "read %Lu returned NULL\n", vaddr);
+        failures++;
+        return;
+    }
+
+    if (memcmp(page, rpage, BLOCK_SIZE) != 0)
+    {
+        printf( "read %Lu returned a different page\n", vaddr);
+        failures++;
+    }
+
+    freeblock(rpage);
+}
+
+/*
+ * touch_block: write the next unused test page to (L1, L2, L3) and check
+ * that reading the same address returns the same data.
+ */
+void touch_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3)
+{
+    u64 vaddr;
+    char *page;
+
+    if (next_page >= TEST_PAGES) {
+        printf("out of test pages (max %d)\n", TEST_PAGES);
+        failures++;
+        return;
+    }
+    page = pages[next_page++];
+
+    printf("TOUCH (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
+
+    vaddr = make_vaddr(L1, L2, L3);
+    if (vdi_write_s(vdi, vaddr, page) < 0) {
+        printf( "write %Lu failed\n", vaddr);
+        failures++;
+        return;
+    }
+
+    compare_block(vdi, vaddr, page);
+}
+
+/*
+ * test_block: check that (L1, L2, L3) currently reads back as 'page'.
+ */
+void test_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3, char *page)
+{
+    printf("TEST (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3);
+
+    compare_block(vdi, make_vaddr(L1, L2, L3), page);
+}
+
+void coverage_test(vdi_t *vdi)
+{
+    int i, j, k;
+
+    /* Do a series of writes and reads to test all paths through the 
+     * async radix code.  The radix request code will dump CRC warnings
+     * if there are data problems here as well.
+     */
+
+    /* L1 Zero */
+    touch_block(vdi, 0, 0, 0);
+
+    /* L2 Zero */
+    i = next_page;
+    touch_block(vdi, 0, 1, 0);
+
+    /* L3 Zero */
+    j = next_page;
+    touch_block(vdi, 0, 0, 1);
+    k = next_page;
+    touch_block(vdi, 0, 1, 1);
+
+    /* Direct write */
+    touch_block(vdi, 0, 0, 0);
+
+    vdi_snapshot(vdi);
+
+    /* L1 fault */
+    touch_block(vdi, 0, 0, 0);
+    /* test the read-only branches that should have been copied over. */
+    test_block(vdi, 0, 1, 0, pages[i]);
+    test_block(vdi, 0, 0, 1, pages[j]);
+
+    /* L2 fault */
+    touch_block(vdi, 0, 1, 0);
+    test_block(vdi, 0, 1, 1, pages[k]);
+
+    /* L3 fault */
+    touch_block(vdi, 0, 0, 1);
+
+    /* read - L1 zero */
+    test_block(vdi, 1, 0, 0, zero_page);
+
+    /* read - L2 zero */
+    test_block(vdi, 0, 2, 0, zero_page);
+
+    /* read - L3 zero */
+    test_block(vdi, 0, 0, 2, zero_page);
+}
+
+/*
+ * Creates a scratch VDI, runs the coverage workload against it and
+ * returns non-zero if any check failed.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t *vdi;
+
+    __init_blockstore();
+    init_block_async();
+    __init_vdi();
+
+    vdi = vdi_create( NULL, "UNIT TEST VDI");
+
+    if ( vdi == NULL ) {
+        printf("Failed to create VDI!\n");
+        exit(-1);
+    }
+
+    fill_test_pages();
+    coverage_test(vdi);
+
+    freeblock(vdi);
+
+    if (failures != 0) {
+        printf("%d check(s) failed\n", failures);
+        return 1;
+    }
+
+    return (0);
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/block-async.h
--- /dev/null Sun Jul 3 22:32:52 2005
+++ b/tools/blktap/parallax/block-async.h Sun Jul 3 22:36:48 2005
@@ -0,0 +1,69 @@
+/* block-async.h
+ * 
+ * Asynchronous block wrappers for parallax.
+ */
+
+#ifndef _BLOCKASYNC_H_
+#define _BLOCKASYNC_H_
+
+#include <assert.h>
+#include <xc.h>
+#include "vdi.h"
+
+struct io_ret
+{
+    enum {IO_ADDR_T, IO_BLOCK_T, IO_INT_T} type;
+    union {
+        u64   a;
+        char *b;
+        int   i;
+    } u;
+};
+
+typedef void (*io_cb_t)(struct io_ret r, void *param);
+
+/* per-vdi lock structures to make sure requests run in a safe order. 
*/
+/* One queued lock request: the kind of lock wanted and the callback    */
+/* (with its argument) that goes with it.  Entries chain through 'next'. */
+struct radix_wait {
+    enum {RLOCK, WLOCK} type;
+    io_cb_t  cb;
+    void    *param;
+    struct radix_wait *next;
+};
+
+/* Lock state, kept as parallel arrays with one slot per row.           */
+/* NOTE(review): callers in requests-async.c pass L1_IDX(vaddr) as the  */
+/* row, so a row appears to be one L1 radix line -- confirm against the */
+/* implementation, which is not part of this header.                    */
+struct radix_lock {
+    pthread_mutex_t lock;
+    int                    lines[1024];
+    struct radix_wait     *waiters[1024];
+    enum {ANY, READ, STOP} state[1024];
+};
+void radix_lock_init(struct radix_lock *r);
+
+/* Asynchronous operations.  Each takes a callback and an opaque param; */
+/* the result reaches the callback as a struct io_ret.                  */
+void block_read(u64 addr, io_cb_t cb, void *param);
+void block_write(u64 addr, char *block, io_cb_t cb, void *param);
+void block_alloc(char *block, io_cb_t cb, void *param);
+void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
+void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
+void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
+void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param);
+void init_block_async(void);
+
+/* Typed accessors for struct io_ret: each asserts that the result      */
+/* carries the expected type tag before returning that union member.    */
+static inline u64 IO_ADDR(struct io_ret r)
+{
+    assert(r.type == IO_ADDR_T);
+    return r.u.a;
+}
+
+static inline char *IO_BLOCK(struct io_ret r)
+{
+    assert(r.type == IO_BLOCK_T);
+    return r.u.b;
+}
+
+static inline int IO_INT(struct io_ret r)
+{
+    assert(r.type == IO_INT_T);
+    return r.u.i;
+}
+
+
+#endif //_BLOCKASYNC_H_
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_snap.c
--- /dev/null Sun Jul 3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_snap.c Sun Jul 3 22:36:48 2005
@@ -0,0 +1,43 @@
+/**************************************************************************
+ * 
+ * vdi_snap.c
+ *
+ * Snapshot a vdi. 
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/*
+ * usage: vdi_snap <VDI id>
+ * Looks the VDI up with vdi_get() and calls vdi_snapshot() on it.
+ * Exits with -1 if the id is malformed or no such VDI is found.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t *vdi;
+    u64    id;
+    char  *end;
+
+    __init_blockstore();
+    __init_vdi();
+
+    if ( argc == 1 ) {
+        printf("usage: %s <VDI id>\n", argv[0]);
+        exit(-1);
+    }
+
+    /* atoll() silently turned garbage into id 0; reject it instead. */
+    id = (u64) strtoull(argv[1], &end, 10);
+    if ( end == argv[1] || *end != '\0' ) {
+        printf("invalid VDI id '%s'\n", argv[1]);
+        exit(-1);
+    }
+
+    vdi = vdi_get(id);
+
+    if ( vdi == NULL ) {
+        printf("couldn't find the requested VDI.\n");
+        exit(-1);
+    }
+
+    vdi_snapshot(vdi);
+
+    return 0;
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_create.c
--- /dev/null Sun Jul 3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_create.c Sun Jul 3 22:36:48 2005
@@ -0,0 +1,80 @@
+/**************************************************************************
+ * 
+ * vdi_create.c
+ *
+ * Create a new vdi.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/*
+ * parse_u64: convert a decimal command-line argument.
+ * Returns 0 and stores the value in *out, or -1 if s is not a number.
+ */
+static int parse_u64(const char *s, u64 *out)
+{
+    char *end;
+    unsigned long long v = strtoull(s, &end, 10);
+
+    if ( end == s || *end != '\0' )
+        return -1;
+
+    *out = (u64) v;
+    return 0;
+}
+
+/*
+ * usage: vdi_create <VDI Name> [<snap block> <snap idx>]
+ * Creates a VDI with the given name; when both optional arguments are
+ * given they are passed to vdi_create() as the parent snapshot id.
+ * Exits with -1 on bad arguments or if the VDI cannot be created.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t     *vdi;
+    char       name[VDI_NAME_SZ] = "";
+    snap_id_t  id;
+    u64        blk, idx;
+    int        from_snap = 0;
+
+    __init_blockstore();
+    __init_vdi();
+
+    if ( argc == 1 ) {
+        printf("usage: %s <VDI Name> [<snap block> <snap idx>]\n", argv[0]);
+        exit(-1);
+    }
+
+    /* snprintf always NUL-terminates inside the buffer; the previous
+     * strncpy + name[VDI_NAME_SZ] = '\0' stored one byte past the array. */
+    snprintf(name, sizeof(name), "%s", argv[1]);
+
+    if ( argc > 3 ) {
+        if ( parse_u64(argv[2], &blk) < 0 ||
+             parse_u64(argv[3], &idx) < 0 ) {
+            printf("invalid snapshot id '%s %s'\n", argv[2], argv[3]);
+            exit(-1);
+        }
+        id.block  = blk;
+        id.index  = (unsigned int) idx;
+        from_snap = 1;
+    }
+
+    vdi = vdi_create( from_snap ? &id : NULL, name);
+
+    if ( vdi == NULL ) {
+        printf("Failed to create VDI!\n");
+        exit(-1);
+    }
+
+    freeblock(vdi);
+
+    return (0);
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_validate.c
--- /dev/null Sun Jul 3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_validate.c Sun Jul 3 22:36:48 2005
@@ -0,0 +1,155 @@
+/**************************************************************************
+ * 
+ * vdi_validate.c
+ *
+ * Intended to sanity-check vdi_fill and the underlying vdi code.
+ *
+ * Block-by-block compare of a vdi with a file/device on the disk.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+#include "requests-async.h"
+
+/*
+ * parse_u64: convert a decimal command-line argument.
+ * Returns 0 and stores the value in *out, or -1 if s is not a number.
+ */
+static int parse_u64(const char *s, u64 *out)
+{
+    char *end;
+    unsigned long long v = strtoull(s, &end, 10);
+
+    if ( end == s || *end != '\0' )
+        return -1;
+
+    *out = (u64) v;
+    return 0;
+}
+
+/*
+ * read_block: fill buf with the next block of fd.
+ * read() may return fewer bytes than requested, so keep reading until a
+ * whole block is in, or end of file is reached.
+ * Returns the number of bytes stored (0 at end of file, less than
+ * BLOCK_SIZE only for a trailing partial block), or -1 on a read error.
+ */
+static ssize_t read_block(int fd, char *buf)
+{
+    size_t got = 0;
+
+    while ( got < BLOCK_SIZE ) {
+        ssize_t n = read(fd, buf + got, BLOCK_SIZE - got);
+
+        if ( n < 0 )
+            return -1;
+        if ( n == 0 )
+            break;
+        got += (size_t) n;
+    }
+
+    return (ssize_t) got;
+}
+
+/*
+ * usage: vdi_validate <VDI id> <filename>
+ * Compares the file block by block with the VDI and reports the first
+ * block that is unmapped or differs.
+ * Exit status: 0 if every block matched, 1 on the first mismatch,
+ * -1 on a usage or I/O error.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t      *vdi;
+    u64         id;
+    int         fd;
+    struct stat st;
+    u64         tot_size;
+    char        spage[BLOCK_SIZE], *dpage;
+    u64         vblock = 0;
+    ssize_t     count;
+
+    __init_blockstore();
+    init_block_async();
+    __init_vdi();
+
+    if ( argc < 3 ) {
+        printf("usage: %s <VDI id> <filename>\n", argv[0]);
+        exit(-1);
+    }
+
+    if ( parse_u64(argv[1], &id) < 0 ) {
+        printf("invalid VDI id '%s'\n", argv[1]);
+        exit(-1);
+    }
+
+    vdi = vdi_get( id );
+
+    if ( vdi == NULL ) {
+        printf("Failed to retreive VDI %Ld!\n", id);
+        exit(-1);
+    }
+
+    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
+
+    if (fd < 0) {
+        printf("Couldn't open %s!\n", argv[2]);
+        exit(-1);
+    }
+
+    if ( fstat(fd, &st) != 0 ) {
+        printf("Couldn't stat %s!\n", argv[2]);
+        exit(-1);
+    }
+
+    tot_size = (u64) st.st_size;
+    printf("Testing VDI %Ld (%Ld bytes).\n", id, tot_size);
+
+    printf(" ");
+    while ( ( count = read_block(fd, spage) ) > 0 ) {
+
+        dpage = vdi_read_s(vdi, vblock);
+
+        if (dpage == NULL) {
+            printf("\n\nfound an unmapped VDI block (%Ld)\n", vblock);
+            exit(1);
+        }
+
+        /* only the bytes actually read from the file are compared */
+        if (memcmp(spage, dpage, (size_t) count) != 0) {
+            printf("\n\nblocks don't match! (%Ld)\n", vblock);
+            exit(1);
+        }
+
+        freeblock(dpage);
+
+        vblock++;
+        if ((vblock % 1024) == 0) {
+            printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
+            fflush(stdout);
+        }
+    }
+    printf("\n");
+
+    if ( count < 0 ) {
+        printf("Couldn't read %s!\n", argv[2]);
+        exit(-1);
+    }
+
+    close(fd);
+
+    printf("VDI %Ld looks good!\n", id);
+
+    freeblock(vdi);
+
+    return (0);
+}
diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/vdi_fill.c
--- /dev/null Sun Jul 3 22:32:52 2005
+++ b/tools/blktap/parallax/vdi_fill.c Sun Jul 3 22:36:48 2005
@@ -0,0 +1,81 @@
+/**************************************************************************
+ * 
+ * vdi_fill.c
+ *
+ * Hoover a file or device into a vdi.
+ * You must first create the vdi with vdi_create.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "requests-async.h"
+#include "vdi.h"
+
+/*
+ * parse_u64: convert a decimal command-line argument.
+ * Returns 0 and stores the value in *out, or -1 if s is not a number.
+ */
+static int parse_u64(const char *s, u64 *out)
+{
+    char *end;
+    unsigned long long v = strtoull(s, &end, 10);
+
+    if ( end == s || *end != '\0' )
+        return -1;
+
+    *out = (u64) v;
+    return 0;
+}
+
+/*
+ * read_block: fill buf with the next block of fd.
+ * read() may return fewer bytes than requested, so keep reading until a
+ * whole block is in, or end of file is reached.
+ * Returns the number of bytes stored (0 at end of file, less than
+ * BLOCK_SIZE only for a trailing partial block), or -1 on a read error.
+ */
+static ssize_t read_block(int fd, char *buf)
+{
+    size_t got = 0;
+
+    while ( got < BLOCK_SIZE ) {
+        ssize_t n = read(fd, buf + got, BLOCK_SIZE - got);
+
+        if ( n < 0 )
+            return -1;
+        if ( n == 0 )
+            break;
+        got += (size_t) n;
+    }
+
+    return (ssize_t) got;
+}
+
+/*
+ * usage: vdi_fill <VDI id> <filename>
+ * Copies the file into the VDI one block at a time, starting at block 0.
+ * A trailing partial block is padded with zeros.
+ * Exits with -1 on a usage, read or write error.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t      *vdi;
+    u64         id;
+    int         fd;
+    struct stat st;
+    u64         tot_size;
+    char        spage[BLOCK_SIZE];
+    u64         vblock = 0;
+    ssize_t     count;
+
+    __init_blockstore();
+    init_block_async();
+    __init_vdi();
+
+    if ( argc < 3 ) {
+        printf("usage: %s <VDI id> <filename>\n", argv[0]);
+        exit(-1);
+    }
+
+    if ( parse_u64(argv[1], &id) < 0 ) {
+        printf("invalid VDI id '%s'\n", argv[1]);
+        exit(-1);
+    }
+
+    vdi = vdi_get( id );
+
+    if ( vdi == NULL ) {
+        printf("Failed to retreive VDI %Ld!\n", id);
+        exit(-1);
+    }
+
+    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
+
+    if (fd < 0) {
+        printf("Couldn't open %s!\n", argv[2]);
+        exit(-1);
+    }
+
+    if ( fstat(fd, &st) != 0 ) {
+        printf("Couldn't stat %s!\n", argv[2]);
+        exit(-1);
+    }
+
+    tot_size = (u64) st.st_size;
+    printf("Filling VDI %Ld with %Ld bytes.\n", id, tot_size);
+
+    printf("%011Ld blocks total\n", tot_size / BLOCK_SIZE);
+    printf(" ");
+    while ( ( count = read_block(fd, spage) ) > 0 ) {
+        /* do not store leftovers of the previous block after a short tail */
+        if ( count < BLOCK_SIZE )
+            memset(spage + count, 0, BLOCK_SIZE - (size_t) count);
+
+        if ( vdi_write_s(vdi, vblock, spage) < 0 ) {
+            printf("\n\nfailed to write VDI block (%Ld)\n", vblock);
+            exit(-1);
+        }
+
+        vblock++;
+        if ((vblock % 512) == 0) {
+            printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
+            fflush(stdout);
+        }
+    }
+
+    if ( count < 0 ) {
+        printf("\n\nCouldn't read %s!\n", argv[2]);
+        exit(-1);
+    }
+
+    close(fd);
+    printf("\n");
+
+    
freeblock(vdi); + + return (0); +} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/radix.c --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/radix.c Sun Jul 3 22:36:48 2005 @@ -0,0 +1,631 @@ +/* + * Radix tree for mapping (up to) 63-bit virtual block IDs to + * 63-bit global block IDs + * + * Pointers within the tree set aside the least significant bit to indicate + * whther or not the target block is writable from this node. + * + * The block with ID 0 is assumed to be an empty block of all zeros + */ + +#include <unistd.h> +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <string.h> +#include <pthread.h> +#include "blockstore.h" +#include "radix.h" + +#define RADIX_TREE_MAP_SHIFT 9 +#define RADIX_TREE_MAP_MASK 0x1ff +#define RADIX_TREE_MAP_ENTRIES 512 + +/* +#define DEBUG +*/ + +/* Experimental radix cache. */ + +static pthread_mutex_t rcache_mutex = PTHREAD_MUTEX_INITIALIZER; +static int rcache_count = 0; +#define RCACHE_MAX 1024 + +typedef struct rcache_st { + radix_tree_node *node; + u64 id; + struct rcache_st *hash_next; + struct rcache_st *cache_next; + struct rcache_st *cache_prev; +} rcache_t; + +static rcache_t *rcache_head = NULL; +static rcache_t *rcache_tail = NULL; + +#define RCHASH_SIZE 512ULL +rcache_t *rcache[RCHASH_SIZE]; +#define RCACHE_HASH(_id) ((_id) & (RCHASH_SIZE - 1)) + +void __rcache_init(void) +{ + int i; + + for (i=0; i<RCHASH_SIZE; i++) + rcache[i] = NULL; +} + + +void rcache_write(u64 id, radix_tree_node *node) +{ + rcache_t *r, *tmp, **curs; + + pthread_mutex_lock(&rcache_mutex); + + /* Is it already in the cache? */ + r = rcache[RCACHE_HASH(id)]; + + for (;;) { + if (r == NULL) + break; + if (r->id == id) + { + memcpy(r->node, node, BLOCK_SIZE); + + /* bring to front. 
*/ + if (r != rcache_head) { + + if (r == rcache_tail) { + if (r->cache_prev != NULL) rcache_tail = r->cache_prev; + rcache_tail->cache_next = NULL; + } + + tmp = r->cache_next; + if (r->cache_next != NULL) r->cache_next->cache_prev + = r->cache_prev; + if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp; + + r->cache_prev = NULL; + r->cache_next = rcache_head; + if (rcache_head != NULL) rcache_head->cache_prev = r; + rcache_head = r; + } + +//printf("Update (%Ld)\n", r->id); + goto done; + } + r = r->hash_next; + } + + if ( rcache_count == RCACHE_MAX ) + { + /* Remove an entry */ + + r = rcache_tail; + if (r->cache_prev != NULL) rcache_tail = r->cache_prev; + rcache_tail->cache_next = NULL; + freeblock(r->node); + + curs = &rcache[RCACHE_HASH(r->id)]; + while ((*curs) != r) + curs = &(*curs)->hash_next; + *curs = r->hash_next; +//printf("Evict (%Ld)\n", r->id); + + } else { + + r = (rcache_t *)malloc(sizeof(rcache_t)); + rcache_count++; + } + + r->node = newblock(); + memcpy(r->node, node, BLOCK_SIZE); + r->id = id; + + r->hash_next = rcache[RCACHE_HASH(id)]; + rcache[RCACHE_HASH(id)] = r; + + r->cache_prev = NULL; + r->cache_next = rcache_head; + if (rcache_head != NULL) rcache_head->cache_prev = r; + rcache_head = r; + if (rcache_tail == NULL) rcache_tail = r; + +//printf("Added (%Ld, %p)\n", id, r->node); +done: + pthread_mutex_unlock(&rcache_mutex); +} + +radix_tree_node *rcache_read(u64 id) +{ + rcache_t *r, *tmp; + radix_tree_node *node = NULL; + + pthread_mutex_lock(&rcache_mutex); + + r = rcache[RCACHE_HASH(id)]; + + for (;;) { + if (r == NULL) { +//printf("Miss (%Ld)\n", id); + goto done; + } + if (r->id == id) break; + r = r->hash_next; + } + + /* bring to front. 
*/ + if (r != rcache_head) + { + if (r == rcache_tail) { + if (r->cache_prev != NULL) rcache_tail = r->cache_prev; + rcache_tail->cache_next = NULL; + } + tmp = r->cache_next; + if (r->cache_next != NULL) r->cache_next->cache_prev = r->cache_prev; + if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp; + + r->cache_prev = NULL; + r->cache_next = rcache_head; + if (rcache_head != NULL) rcache_head->cache_prev = r; + rcache_head = r; + } + + node = newblock(); + memcpy(node, r->node, BLOCK_SIZE); + +//printf("Hit (%Ld, %p)\n", id, r->node); +done: + pthread_mutex_unlock(&rcache_mutex); + + return(node); +} + + +void *rc_readblock(u64 id) +{ + void *ret; + + ret = (void *)rcache_read(id); + + if (ret != NULL) return ret; + + ret = readblock(id); + + if (ret != NULL) + rcache_write(id, ret); + + return(ret); +} + +u64 rc_allocblock(void *block) +{ + u64 ret; + + ret = allocblock(block); + + if (ret != ZERO) + rcache_write(ret, block); + + return(ret); +} + +int rc_writeblock(u64 id, void *block) +{ + int ret; + + ret = writeblock(id, block); + rcache_write(id, block); + + return(ret); +} + + +/* + * block device interface and other helper functions + * with these functions, block id is just a 63-bit number, with + * no special consideration for the LSB + */ +radix_tree_node cloneblock(radix_tree_node block); + +/* + * main api + * with these functions, the LSB of root always indicates + * whether or not the block is writable, including the return + * values of update and snapshot + */ +u64 lookup(int height, u64 root, u64 key); +u64 update(int height, u64 root, u64 key, u64 val); +u64 snapshot(u64 root); + +/** + * cloneblock: clone an existing block in memory + * @block: the old block + * + * @return: new block, with LSB cleared for every entry + */ +radix_tree_node cloneblock(radix_tree_node block) { + radix_tree_node node = (radix_tree_node) malloc(BLOCK_SIZE); + int i; + if (node == NULL) { + perror("cloneblock malloc"); + return NULL; + } + for (i = 0; i < 
RADIX_TREE_MAP_ENTRIES; i++) + node[i] = block[i] & ONEMASK; + return node; +} + +/** + * lookup: find a value given a key + * @height: height in bits of the radix tree + * @root: root node id, with set LSB indicating writable node + * @key: key to lookup + * + * @return: value on success, zero on error + */ + +u64 lookup(int height, u64 root, u64 key) { + radix_tree_node node; + u64 mask = ONE; + + assert(key >> height == 0); + + /* the root block may be smaller to ensure all leaves are full */ + height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT; + + /* now carve off equal sized chunks at each step */ + for (;;) { + u64 oldroot; + +#ifdef DEBUG + printf("lookup: height=%3d root=%3Ld offset=%3d%s\n", height, root, + (int) ((key >> height) & RADIX_TREE_MAP_MASK), + (iswritable(root) ? "" : " (readonly)")); +#endif + + if (getid(root) == ZERO) + return ZERO; + + oldroot = root; + node = (radix_tree_node) rc_readblock(getid(root)); + if (node == NULL) + return ZERO; + + root = node[(key >> height) & RADIX_TREE_MAP_MASK]; + mask &= root; + freeblock(node); + + if (height == 0) + return ( root & ONEMASK ) | mask; + + height -= RADIX_TREE_MAP_SHIFT; + } + + return ZERO; +} + +/* + * update: set a radix tree entry, doing copy-on-write as necessary + * @height: height in bits of the radix tree + * @root: root node id, with set LSB indicating writable node + * @key: key to set + * @val: value to set, s.t. 
radix(key)=val + * + * @returns: (possibly new) root id on success (with LSB=1), 0 on failure + */ + +u64 update(int height, u64 root, u64 key, u64 val) { + int offset; + u64 child; + radix_tree_node node; + + /* base case--return val */ + if (height == 0) + return val; + + /* the root block may be smaller to ensure all leaves are full */ + height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT; + offset = (key >> height) & RADIX_TREE_MAP_MASK; + +#ifdef DEBUG + printf("update: height=%3d root=%3Ld offset=%3d%s\n", height, root, + offset, (iswritable(root)?"":" (clone)")); +#endif + + /* load a block, or create a new one */ + if (root == ZERO) { + node = (radix_tree_node) newblock(); + } else { + node = (radix_tree_node) rc_readblock(getid(root)); + + if (!iswritable(root)) { + /* need to clone this node */ + radix_tree_node oldnode = node; + node = cloneblock(node); + freeblock(oldnode); + root = ZERO; + } + } + + if (node == NULL) { +#ifdef DEBUG + printf("update: node is null!\n"); +#endif + return ZERO; + } + + child = update(height, node[offset], key, val); + + if (child == ZERO) { + freeblock(node); + return ZERO; + } else if (child == node[offset]) { + /* no change, so we already owned the child */ + assert(iswritable(root)); + + freeblock(node); + return root; + } + + node[offset] = child; + + /* new/cloned blocks need to be saved */ + if (root == ZERO) { + /* mark this as an owned block */ + root = rc_allocblock(node); + if (root) + root = writable(root); + } else if (rc_writeblock(getid(root), node) < 0) { + freeblock(node); + return ZERO; + } + + freeblock(node); + return root; +} + +/** + * snapshot: create a snapshot + * @root: old root node + * + * @return: new root node, 0 on error + */ +u64 snapshot(u64 root) { + radix_tree_node node, newnode; + + if ((node = rc_readblock(getid(root))) == NULL) + return ZERO; + + newnode = cloneblock(node); + freeblock(node); + if (newnode == NULL) + return ZERO; + + root = rc_allocblock(newnode); + 
freeblock(newnode); + + if (root == ZERO) + return ZERO; + else + return writable(root); +} + +/** + * collapse: collapse a parent onto a child. + * + * NOTE: This assumes that parent and child really are, and further that + * there are no other children forked from this parent. (children of the + * child are okay...) + */ + +int collapse(int height, u64 proot, u64 croot) +{ + int i, numlinks, ret, total = 0; + radix_tree_node pnode, cnode; + + if (height == 0) { + height = -1; /* terminate recursion */ + } else { + height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT; + } + numlinks = (1UL << RADIX_TREE_MAP_SHIFT); + + /* Terminal cases: */ + + if ( (getid(proot) == ZERO) || (getid(croot) == ZERO) ) + return -1; + + /* get roots */ + if ((pnode = readblock(getid(proot))) == NULL) + return -1; + + if ((cnode = readblock(getid(croot))) == NULL) + { + freeblock(pnode); + return -1; + } + + /* For each writable link in proot */ + for (i=0; i<numlinks; i++) + { + if ( pnode[i] == cnode[i] ) continue; + + /* collapse (next level) */ + /* if height != 0 and writable... 
*/ + if (( height >= 0 ) && ( iswritable(pnode[i]) ) ) + { + //printf(" %Ld is writable (i=%d).\n", getid(pnode[i]), i); + ret = collapse(height, pnode[i], cnode[i]); + if (ret == -1) + { + total = -1; + } else { + total += ret; + } + } + + + } + + /* if plink is writable, AND clink is writable -> free plink block */ + if ( ( iswritable(proot) ) && ( iswritable(croot) ) ) + { + releaseblock(getid(proot)); + if (ret >=0) total++; + //printf(" Delete %Ld\n", getid(proot)); + } +//printf("done : %Ld\n", getid(proot)); + return total; + +} + + +void print_root(u64 root, int height, FILE *dot_f) +{ + FILE *f; + int i; + radix_tree_node node; + char *style[2] = { "", "style=bold,color=blue," }; + + if (dot_f == NULL) { + f = fopen("radix.dot", "w"); + if (f == NULL) { + perror("print_root: open"); + return; + } + + /* write graph preamble */ + fprintf(f, "digraph G {\n"); + + /* add a node for this root. */ + fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n", + getid(root), style[iswritable(root)], getid(root)); + } + + printf("print_root(%Ld)\n", getid(root)); + + /* base case */ + if (height == 0) { + /* add a node and edge for each child root */ + node = (radix_tree_node) readblock(getid(root)); + if (node == NULL) + return; + + for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) { + if (node[i] != ZERO) { + fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n", + getid(node[i]), style[iswritable(node[i])], + getid(node[i])); + fprintf(f, " n%Ld -> n%Ld [label=\"%d\"]\n", getid(root), + getid(node[i]), i); + } + } + freeblock(node); + return; + } + + /* the root block may be smaller to ensure all leaves are full */ + height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT; + + if (getid(root) == ZERO) + return; + + node = (radix_tree_node) readblock(getid(root)); + if (node == NULL) + return; + + /* add a node and edge for each child root */ + for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) + if (node[i] != ZERO) { + fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n", 
+ getid(node[i]), style[iswritable(node[i])], + getid(node[i])); + + print_root(node[i], height-RADIX_TREE_MAP_SHIFT, f); + fprintf(f, " n%Ld -> n%Ld [label=\"%d\"]\n", getid(root), + getid(node[i]), i); + } + + freeblock(node); + + /* write graph postamble */ + if (dot_f == NULL) { + fprintf(f, "}\n"); + fclose(f); + } +} + +#ifdef RADIX_STANDALONE + +int main(int argc, char **argv) { + u64 key = ZERO, val = ZERO; + u64 root = writable(2ULL); + u64 p = ZERO, c = ZERO; + int v; + char buff[4096]; + + __init_blockstore(); + + memset(buff, 0, 4096); + /*fp = open("radix.dat", O_RDWR | O_CREAT, 0644); + + if (fp < 3) { + perror("open"); + return -1; + } + if (lseek(fp, 0, SEEK_END) == 0) { + write(fp, buff, 4096); + }*/ + + allocblock(buff); + + printf("Recognized commands:\n" + "Note: the LSB of a node number indicates if it is writable\n" + " root <node> set root to <node>\n" + " snapshot take a snapshot of the root\n" + " set <key> <val> set key=val\n" + " get <key> query key\n" + " c <proot> <croot> collapse\n" + " pr print tree to dot\n" + " pf <1=verbose> print freelist\n" + " quit\n" + "\nroot = %Ld\n", root); + for (;;) { + //print_root(root, 34, NULL); + //system("dot radix.dot -Tps -o radix.ps"); + + printf("> "); + fflush(stdout); + fgets(buff, 1024, stdin); + if (feof(stdin)) + break; + if (sscanf(buff, " root %Ld", &root) == 1) { + printf("root set to %Ld\n", root); + } else if (sscanf(buff, " set %Ld %Ld", &key, &val) == 2) { + root = update(34, root, key, val); + printf("root = %Ld\n", root); + } else if (sscanf(buff, " c %Ld %Ld", &p, &c) == 2) { + v = collapse(34, p, c); + printf("reclaimed %d blocks.\n", v); + } else if (sscanf(buff, " get %Ld", &key) == 1) { + val = lookup(34, root, key); + printf("value = %Ld\n", val); + } else if (!strcmp(buff, "quit\n")) { + break; + } else if (!strcmp(buff, "snapshot\n")) { + root = snapshot(root); + printf("new root = %Ld\n", root); + } else if (sscanf(buff, " pr %Ld", &root) == 1) { + print_root(root, 34, 
NULL); + } else if (sscanf(buff, " pf %d", &v) == 1) { + freelist_count(v); + } else if (!strcmp(buff, "pf\n")) { + freelist_count(0); + } else { + printf("command not recognized\n"); + } + } + return 0; +} + +#endif diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/radix.h --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/radix.h Sun Jul 3 22:36:48 2005 @@ -0,0 +1,45 @@ +/* + * Radix tree for mapping (up to) 63-bit virtual block IDs to + * 63-bit global block IDs + * + * Pointers within the tree set aside the least significant bit to indicate + * whther or not the target block is writable from this node. + * + * The block with ID 0 is assumed to be an empty block of all zeros + */ + +#ifndef __RADIX_H__ +#define __RADIX_H__ + +/* I don't really like exposing these, but... */ +#define getid(x) (((x)>>1)&0x7fffffffffffffffLL) +#define putid(x) ((x)<<1) +#define writable(x) (((x)<<1)|1LL) +#define iswritable(x) ((x)&1LL) +#define ZERO 0LL +#define ONE 1LL +#define ONEMASK 0xffffffffffffffeLL + +#define RADIX_TREE_MAP_SHIFT 9 +#define RADIX_TREE_MAP_MASK 0x1ff +#define RADIX_TREE_MAP_ENTRIES 512 + +typedef u64 *radix_tree_node; + + +/* + * main api + * with these functions, the LSB of root always indicates + * whether or not the block is writable, including the return + * values of update and snapshot + */ +u64 lookup(int height, u64 root, u64 key); +u64 update(int height, u64 root, u64 key, u64 val); +u64 snapshot(u64 root); +int collapse(int height, u64 proot, u64 croot); +int isprivate(int height, u64 root, u64 key); + + +void __rcache_init(void); + +#endif /* __RADIX_H__ */ diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax/blockstored.c --- /dev/null Sun Jul 3 22:32:52 2005 +++ b/tools/blktap/parallax/blockstored.c Sun Jul 3 22:36:48 2005 @@ -0,0 +1,276 @@ +/************************************************************************** + * + * blockstored.c + * + * Block store daemon. 
+ * + */ + +#include <fcntl.h> +#include <unistd.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <netinet/in.h> +#include <errno.h> +#include "blockstore.h" + +//#define BSDEBUG + +int readblock_into(u64 id, void *block); + +int open_socket(u16 port) { + + struct sockaddr_in sn; + int sock; + + sock = socket(AF_INET, SOCK_DGRAM, 0); + if (sock < 0) { + perror("Bad socket"); + return -1; + } + memset(&sn, 0, sizeof(sn)); + sn.sin_family = AF_INET; + sn.sin_port = htons(port); + sn.sin_addr.s_addr = htonl(INADDR_ANY); + if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) { + perror("bind"); + close(sock); + return -1; + } + + return sock; +} + +static int block_fp = -1; +static int bssock = -1; + +int send_reply(struct sockaddr_in *peer, void *buffer, int len) { + + int rc; + +#ifdef BSDEBUG + fprintf(stdout, "TX: %u bytes op=%u id=0x%llx\n", + len, ((bsmsg_t *)buffer)->hdr.operation, ((bsmsg_t *)buffer)->hdr.id); +#endif + rc = sendto(bssock, buffer, len, 0, (struct sockaddr *)peer, sizeof(*peer)); + if (rc < 0) { + perror("send_reply"); + return 1; + } + + + return 0; +} + +static bsmsg_t msgbuf; + +void service_loop(void) { + + for (;;) { + int rc, len; + struct sockaddr_in from; + size_t slen = sizeof(from); + u64 bid; + + len = recvfrom(bssock, (void *)&msgbuf, sizeof(msgbuf), 0, + (struct sockaddr *)&from, &slen); + + if (len < 0) { + perror("recvfrom"); + continue; + } + + if (len < MSGBUFSIZE_OP) { + fprintf(stderr, "Short packet.\n"); + continue; + } + +#ifdef BSDEBUG + fprintf(stdout, "RX: %u bytes op=%u id=0x%llx\n", + len, msgbuf.hdr.operation, msgbuf.hdr.id); +#endif + + switch (msgbuf.hdr.operation) { + case BSOP_READBLOCK: + if (len < MSGBUFSIZE_ID) { + fprintf(stderr, "Short packet (readblock %u).\n", len); + continue; + } + rc = readblock_into(msgbuf.hdr.id, msgbuf.block); + if (rc < 0) { + fprintf(stderr, "readblock 
error\n"); + msgbuf.hdr.flags = BSOP_FLAG_ERROR; + send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID); + continue; + } + msgbuf.hdr.flags = 0; + send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_BLOCK); + break; + case BSOP_WRITEBLOCK: + if (len < MSGBUFSIZE_BLOCK) { + fprintf(stderr, "Short packet (writeblock %u).\n", len); + continue; + } + rc = writeblock(msgbuf.hdr.id, msgbuf.block); + if (rc < 0) { + fprintf(stderr, "writeblock error\n"); + msgbuf.hdr.flags = BSOP_FLAG_ERROR; + send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID); + continue; + } + msgbuf.hdr.flags = 0; + send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID); + break; + case BSOP_ALLOCBLOCK: + if (len < MSGBUFSIZE_BLOCK) { + fprintf(stderr, "Short packet (allocblock %u).\n", len); + continue; + } + bid = allocblock(msgbuf.block); + if (bid == ALLOCFAIL) { + fprintf(stderr, "allocblock error\n"); + msgbuf.hdr.flags = BSOP_FLAG_ERROR; + send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID); + continue; + } + msgbuf.hdr.id = bid; + msgbuf.hdr.flags = 0; + send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID); + break; + } + + } +} + +/** + * readblock: read a block from disk + * @id: block id to read + * @block: pointer to buffer to receive block + * + * @return: 0 if OK, other on error + */ + +int readblock_into(u64 id, void *block) { + if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { + printf ("%Ld\n", (id - 1) * BLOCK_SIZE); + perror("readblock lseek"); + return -1; + } + if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) { + perror("readblock read"); + return -1; + } + return 0; +} + +/** + * writeblock: write an existing block to disk + * @id: block id + * @block: pointer to block + * + * @return: zero on success, -1 on failure + */ +int writeblock(u64 id, void *block) { + if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { + perror("writeblock lseek"); + return -1; + } + if (write(block_fp, block, BLOCK_SIZE) < 0) { + perror("writeblock write"); + return -1; + } 
+ return 0; +} + +/** + * allocblock: write a new block to disk + * @block: pointer to block + * + * @return: new id of block on disk + */ +static u64 lastblock = 0; + +u64 allocblock(void *block) { + u64 lb; + off64_t pos; + + retry: + pos = lseek64(block_fp, 0, SEEK_END); + if (pos == (off64_t)-1) { + perror("allocblock lseek"); + return ALLOCFAIL; + } + if (pos % BLOCK_SIZE != 0) { + fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE); + return ALLOCFAIL; + } + if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) { + perror("allocblock write"); + return ALLOCFAIL; + } + lb = pos / BLOCK_SIZE + 1; + +#ifdef BS_ALLOC_HACK + if (lb < BS_ALLOC_SKIP) + goto retry; +#endif + + if (lb <= lastblock) + printf("[*** %Ld alredy allocated! ***]\n", lb); + + lastblock = lb; + return lb; +} + +/** + * newblock: get a new in-memory block set to zeros + * + * @return: pointer to new block, NULL on error + */ +void *newblock() { + void *block = malloc(BLOCK_SIZE); + if (block == NULL) { + perror("newblock"); + return NULL; + } + memset(block, 0, BLOCK_SIZE); + return block; +} + + +/** + * freeblock: unallocate an in-memory block + * @id: block id (zero if this is only in-memory) + * @block: block to be freed + */ +void freeblock(void *block) { + if (block != NULL) + free(block); +} + + +int main(int argc, char **argv) +{ + block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644); + + if (block_fp < 0) { + perror("open"); + return -1; + } + + bssock = open_socket(BLOCKSTORED_PORT); + if (bssock < 0) { + return -1; + } + + service_loop(); + + close(bssock); + + return 0; +} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/README-PARALLAX --- a/tools/blktap/README-PARALLAX Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,177 +0,0 @@ -Parallax Quick Overview -March 3, 2005 - -This is intended to provide a quick set of instructions to let you -guys play with the current parallax source. 
In it's current form, the -code will let you run an arbitrary number of VMs off of a single disk -image, doing copy-on-write as they make updates. Each domain is -assigned a virtual disk image (VDI), which may be based on a snapshot -of an existing image. All of the VDI and snapshot management should -currently work. - -The current implementation uses a single file as a blockstore for -_everything_ this will soon be replaced by the fancier backend code -and the local cache. As it stands, Parallax will create -"blockstore.dat" in the directory that you run it from, and use -largefile support to make this grow to unfathomable girth. So, you -probably want to run the daemon off of a local disk, with a lot of -free space. - -Here's how to get going: - -0. Setup: ---------- - -Pick a local directory on a disk with lots of room. You should be -running from a privileged domain (e.g. dom0) with the blocktap -configured in and block backend NOT. - -For convenience (for the moment) copy all of the vdi tools (vdi_*) and -the parallax daemon from tools/blktap into this directory. - -1. Populate the blockstore: ---------------------------- - -First you need to put at least one image into the blockstore. You -will need a disk image, either as a file or local partition. My -general approach has been to - -(a) make a really big sparse file with - - dd if=/dev/zero of=./image bs=4K count=1 seek=[big value] - -(b) put a filesystem into it - - mkfs.ext3 ./image - -(c) mount it using loopback - - mkdir ./mnt - mount -o loop ./image - -(d) cd into it and untar one of the image files from srg-roots. - - cd mnt - tar ... - -NOTE: Beware if your system is FC3. mkfs is not compatible with old -versions of fedora, and so you don't have much choice but to install -further fc3 images if you have used the fc3 version of mkfs. - -(e) unmount the image - - cd .. - umount mnt - -(f) now, create a new VDI to hold the image - - ./vdi_create "My new FC3 VDI" - -(g) get the id of the new VDI. 
- - ./vdi_list - - | 0 My new FC3 VDI - -(0 is the VDI id... create a few more if you want.) - -(h) hoover your image into the new VDI. - - ./vdi_fill 0 ./image - -This will pull the entire image into the blockstore and set up a -mapping tree for it for VDI 0. Passing a device (i.e. /dev/sda3) -should also work, but vdi_fill has NO notion of sparseness yet, so you -are going to pump a block into the store for each block you read. - -vdi_fill will count up until it is done, and you should be ready to -go. If you want to be anal, you can use vdi_validate to test the VDI -against the original image. - -2. Create some extra VDIs -------------------------- - -VDIs are actually a list of snapshots, and each snapshot is a full -image of mappings. So, to preserve an immutable copy of a current -VDI, do this: - -(a) Snapshot your new VDI. - - ./vdi_snap 0 - -Snapshotting writes the current radix root to the VDI's snapshot log, -and assigns it a new writable root. - -(b) look at the VDI's snapshot log. - - ./vdi_snap_list 0 - - | 16 0 Thu Mar 3 19:27:48 2005 565111 31 - -The first two columns constitute a snapshot id and represent the -(block, offset) of the snapshot record. The Date tells you when the -snapshot was made, and 31 is the radix root node of the snapshot. - -(c) Create a new VDI, based on that snapshot, and look at the list. - - ./vdi_create "FC3 - Copy 1" 16 0 - ./vdi_list - - | 0 My new FC3 VDI - | 1 FC3 - Copy 1 - -NOTE: If you have Graphviz installed on your system, you can use -vdi_tree to generate a postscript of your current set of VDIs and -snapshots. - - -Create as many VDIs as you need for the VMs that you want to run. - -3. Boot some VMs: ------------------ - -Parallax currently uses a hack in xend to pass the VDI id, you need to -modify the disk line of the VM config that is going to mount it. 
- -(a) set up your vm config, by using the following disk line: - - disk = ['parallax:1,sda1,w,0' ] - -This example uses VDI 1 (from vdi_list above), presents it as sda1 -(writable), and uses dom 0 as the backend. If you were running the -daemon (and tap driver) in some domain other than 0, you would change -this last parameter. - -NOTE: You'll need to have reinstalled xend/tools prior to booting the vm, so that it knows what to do with "parallax:". - -(b) Run parallax in the backend domain. - - ./parallax - -(c) create your new domain. - - xm create ... - ---- - -That's pretty much all there is to it at the moment. Hope this is -clear enough to get you going. Now, a few serious caveats that will -be sorted out in the almost immediate future: - -WARNINGS: ---------- - -1. There is NO locking in the VDI tools at the moment, so I'd avoid -running them in parallel, or more importantly, running them while the -daemon is running. - -2. I doubt that xend will be very happy about restarting if you have -parallax-using domains. So if it dies while there are active parallax -doms, you may need to reboot. - -3. I've turned off write-in-place. So at the moment, EVERY block -write is a log append on the blockstore. I've been having some probs -with the radix tree's marking of writable blocks after snapshots and -will sort this out very soon. - - diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/block-async.c --- a/tools/blktap/block-async.c Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,393 +0,0 @@ -/* block-async.c - * - * Asynchronous block wrappers for parallax. - */ - - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <pthread.h> -#include "block-async.h" -#include "blockstore.h" -#include "vdi.h" - - -#if 0 -#define DPRINTF(_f, _a...) printf ( _f , ## _a ) -#else -#define DPRINTF(_f, _a...) 
((void)0) -#endif - -/* We have a queue of outstanding I/O requests implemented as a - * circular producer-consumer ring with free-running buffers. - * to allow reordering, this ring indirects to indexes in an - * ring of io_structs. - * - * the block_* calls may either add an entry to this ring and return, - * or satisfy the request immediately and call the callback directly. - * None of the io calls in parallax should be nested enough to worry - * about stack problems with this approach. - */ - -struct read_args { - u64 addr; -}; - -struct write_args { - u64 addr; - char *block; -}; - -struct alloc_args { - char *block; -}; - -struct pending_io_req { - enum {IO_READ, IO_WRITE, IO_ALLOC, IO_RWAKE, IO_WWAKE} op; - union { - struct read_args r; - struct write_args w; - struct alloc_args a; - } u; - io_cb_t cb; - void *param; -}; - -void radix_lock_init(struct radix_lock *r) -{ - int i; - - pthread_mutex_init(&r->lock, NULL); - for (i=0; i < 1024; i++) { - r->lines[i] = 0; - r->waiters[i] = NULL; - r->state[i] = ANY; - } -} - -/* maximum outstanding I/O requests issued asynchronously */ -/* must be a power of 2.*/ -#define MAX_PENDING_IO 1024 - -/* how many threads to concurrently issue I/O to the disk. 
*/ -#define IO_POOL_SIZE 10 - -static struct pending_io_req pending_io_reqs[MAX_PENDING_IO]; -static int pending_io_list[MAX_PENDING_IO]; -static unsigned long io_prod = 0, io_cons = 0, io_free = 0; -#define PENDING_IO_MASK(_x) ((_x) & (MAX_PENDING_IO - 1)) -#define PENDING_IO_IDX(_x) ((_x) - pending_io_reqs) -#define PENDING_IO_ENT(_x) \ - (&pending_io_reqs[pending_io_list[PENDING_IO_MASK(_x)]]) -#define CAN_PRODUCE_PENDING_IO ((io_free + MAX_PENDING_IO) != io_prod) -#define CAN_CONSUME_PENDING_IO (io_cons != io_prod) -static pthread_mutex_t pending_io_lock = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t pending_io_cond = PTHREAD_COND_INITIALIZER; - -static void init_pending_io(void) -{ - int i; - - for (i=0; i<MAX_PENDING_IO; i++) - pending_io_list[i] = i; - -} - -void block_read(u64 addr, io_cb_t cb, void *param) -{ - struct pending_io_req *req; - - pthread_mutex_lock(&pending_io_lock); - assert(CAN_PRODUCE_PENDING_IO); - - req = PENDING_IO_ENT(io_prod++); - DPRINTF("Produce (R) %lu (%p)\n", io_prod - 1, req); - req->op = IO_READ; - req->u.r.addr = addr; - req->cb = cb; - req->param = param; - - pthread_cond_signal(&pending_io_cond); - pthread_mutex_unlock(&pending_io_lock); -} - - -void block_write(u64 addr, char *block, io_cb_t cb, void *param) -{ - struct pending_io_req *req; - - pthread_mutex_lock(&pending_io_lock); - assert(CAN_PRODUCE_PENDING_IO); - - req = PENDING_IO_ENT(io_prod++); - DPRINTF("Produce (W) %lu (%p)\n", io_prod - 1, req); - req->op = IO_WRITE; - req->u.w.addr = addr; - req->u.w.block = block; - req->cb = cb; - req->param = param; - - pthread_cond_signal(&pending_io_cond); - pthread_mutex_unlock(&pending_io_lock); -} - - -void block_alloc(char *block, io_cb_t cb, void *param) -{ - struct pending_io_req *req; - - pthread_mutex_lock(&pending_io_lock); - assert(CAN_PRODUCE_PENDING_IO); - - req = PENDING_IO_ENT(io_prod++); - req->op = IO_ALLOC; - req->u.a.block = block; - req->cb = cb; - req->param = param; - - 
pthread_cond_signal(&pending_io_cond); - pthread_mutex_unlock(&pending_io_lock); -} - -void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param) -{ - struct io_ret ret; - pthread_mutex_lock(&r->lock); - - if (( r->lines[row] >= 0 ) && (r->state[row] != STOP)) { - r->lines[row]++; - r->state[row] = READ; - DPRINTF("RLOCK : %3d (row: %d)\n", r->lines[row], row); - pthread_mutex_unlock(&r->lock); - ret.type = IO_INT_T; - ret.u.i = 0; - cb(ret, param); - } else { - struct radix_wait **rwc; - struct radix_wait *rw = - (struct radix_wait *) malloc (sizeof(struct radix_wait)); - DPRINTF("RLOCK : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row); - rw->type = RLOCK; - rw->param = param; - rw->cb = cb; - rw->next = NULL; - /* append to waiters list. */ - rwc = &r->waiters[row]; - while (*rwc != NULL) rwc = &(*rwc)->next; - *rwc = rw; - pthread_mutex_unlock(&r->lock); - return; - } -} - - -void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param) -{ - struct io_ret ret; - pthread_mutex_lock(&r->lock); - - /* the second check here is redundant -- just here for debugging now. */ - if ((r->state[row] == ANY) && ( r->lines[row] == 0 )) { - r->state[row] = STOP; - r->lines[row] = -1; - DPRINTF("WLOCK : %3d (row: %d)\n", r->lines[row], row); - pthread_mutex_unlock(&r->lock); - ret.type = IO_INT_T; - ret.u.i = 0; - cb(ret, param); - } else { - struct radix_wait **rwc; - struct radix_wait *rw = - (struct radix_wait *) malloc (sizeof(struct radix_wait)); - DPRINTF("WLOCK : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row); - rw->type = WLOCK; - rw->param = param; - rw->cb = cb; - rw->next = NULL; - /* append to waiters list. */ - rwc = &r->waiters[row]; - while (*rwc != NULL) rwc = &(*rwc)->next; - *rwc = rw; - pthread_mutex_unlock(&r->lock); - return; - } - -} - -/* called with radix_lock locked and lock count of zero. 
*/ -static void wake_waiters(struct radix_lock *r, int row) -{ - struct pending_io_req *req; - struct radix_wait *rw; - - if (r->lines[row] != 0) return; - if (r->waiters[row] == NULL) return; - - if (r->waiters[row]->type == WLOCK) { - - rw = r->waiters[row]; - pthread_mutex_lock(&pending_io_lock); - assert(CAN_PRODUCE_PENDING_IO); - - req = PENDING_IO_ENT(io_prod++); - req->op = IO_WWAKE; - req->cb = rw->cb; - req->param = rw->param; - r->lines[row] = -1; /* write lock the row. */ - r->state[row] = STOP; - r->waiters[row] = rw->next; - free(rw); - pthread_mutex_unlock(&pending_io_lock); - - } else /* RLOCK */ { - - while ((r->waiters[row] != NULL) && (r->waiters[row]->type == RLOCK)) { - rw = r->waiters[row]; - pthread_mutex_lock(&pending_io_lock); - assert(CAN_PRODUCE_PENDING_IO); - - req = PENDING_IO_ENT(io_prod++); - req->op = IO_RWAKE; - req->cb = rw->cb; - req->param = rw->param; - r->lines[row]++; /* read lock the row. */ - r->state[row] = READ; - r->waiters[row] = rw->next; - free(rw); - pthread_mutex_unlock(&pending_io_lock); - } - - if (r->waiters[row] != NULL) /* There is a write queued still */ - r->state[row] = STOP; - } - - pthread_mutex_lock(&pending_io_lock); - pthread_cond_signal(&pending_io_cond); - pthread_mutex_unlock(&pending_io_lock); -} - -void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param) -{ - struct io_ret ret; - - pthread_mutex_lock(&r->lock); - assert(r->lines[row] > 0); /* try to catch misuse. */ - r->lines[row]--; - if (r->lines[row] == 0) { - r->state[row] = ANY; - wake_waiters(r, row); - } - pthread_mutex_unlock(&r->lock); - cb(ret, param); -} - -void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param) -{ - struct io_ret ret; - - pthread_mutex_lock(&r->lock); - assert(r->lines[row] == -1); /* try to catch misuse. 
*/ - r->lines[row] = 0; - r->state[row] = ANY; - wake_waiters(r, row); - pthread_mutex_unlock(&r->lock); - cb(ret, param); -} - -/* consumer calls */ -static void do_next_io_req(struct pending_io_req *req) -{ - struct io_ret ret; - void *param; - - switch (req->op) { - case IO_READ: - ret.type = IO_BLOCK_T; - ret.u.b = readblock(req->u.r.addr); - break; - case IO_WRITE: - ret.type = IO_INT_T; - ret.u.i = writeblock(req->u.w.addr, req->u.w.block); - DPRINTF("wrote %d at %Lu\n", *(int *)(req->u.w.block), req->u.w.addr); - break; - case IO_ALLOC: - ret.type = IO_ADDR_T; - ret.u.a = allocblock(req->u.a.block); - break; - case IO_RWAKE: - DPRINTF("WAKE DEFERRED RLOCK!\n"); - ret.type = IO_INT_T; - ret.u.i = 0; - break; - case IO_WWAKE: - DPRINTF("WAKE DEFERRED WLOCK!\n"); - ret.type = IO_INT_T; - ret.u.i = 0; - break; - default: - DPRINTF("Unknown IO operation on pending list!\n"); - return; - } - - param = req->param; - pthread_mutex_lock(&pending_io_lock); - pending_io_list[PENDING_IO_MASK(io_free++)] = PENDING_IO_IDX(req); - pthread_mutex_unlock(&pending_io_lock); - - assert(req->cb != NULL); - req->cb(ret, param); - -} - -void *io_thread(void *param) -{ - int tid; - struct pending_io_req *req; - - /* Set this thread's tid. */ - tid = *(int *)param; - free(param); - -start: - pthread_mutex_lock(&pending_io_lock); - while (io_prod == io_cons) { - pthread_cond_wait(&pending_io_cond, &pending_io_lock); - } - - if (io_prod == io_cons) { - /* unnecessary wakeup. 
*/ - pthread_mutex_unlock(&pending_io_lock); - goto start; - } - - req = PENDING_IO_ENT(io_cons++); - pthread_mutex_unlock(&pending_io_lock); - - do_next_io_req(req); - - goto start; - -} - -static pthread_t io_pool[IO_POOL_SIZE]; -void start_io_threads(void) - -{ - int i, tid=0; - - for (i=0; i < IO_POOL_SIZE; i++) { - int ret, *t; - t = (int *)malloc(sizeof(int)); - *t = tid++; - ret = pthread_create(&io_pool[i], NULL, io_thread, t); - if (ret != 0) printf("Error starting thread %d\n", i); - } - -} - -void init_block_async(void) -{ - init_pending_io(); - start_io_threads(); -} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/block-async.h --- a/tools/blktap/block-async.h Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,69 +0,0 @@ -/* block-async.h - * - * Asynchronous block wrappers for parallax. - */ - -#ifndef _BLOCKASYNC_H_ -#define _BLOCKASYNC_H_ - -#include <assert.h> -#include <xc.h> -#include "vdi.h" - -struct io_ret -{ - enum {IO_ADDR_T, IO_BLOCK_T, IO_INT_T} type; - union { - u64 a; - char *b; - int i; - } u; -}; - -typedef void (*io_cb_t)(struct io_ret r, void *param); - -/* per-vdi lock structures to make sure requests run in a safe order. 
*/ -struct radix_wait { - enum {RLOCK, WLOCK} type; - io_cb_t cb; - void *param; - struct radix_wait *next; -}; - -struct radix_lock { - pthread_mutex_t lock; - int lines[1024]; - struct radix_wait *waiters[1024]; - enum {ANY, READ, STOP} state[1024]; -}; -void radix_lock_init(struct radix_lock *r); - -void block_read(u64 addr, io_cb_t cb, void *param); -void block_write(u64 addr, char *block, io_cb_t cb, void *param); -void block_alloc(char *block, io_cb_t cb, void *param); -void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param); -void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param); -void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param); -void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param); -void init_block_async(void); - -static inline u64 IO_ADDR(struct io_ret r) -{ - assert(r.type == IO_ADDR_T); - return r.u.a; -} - -static inline char *IO_BLOCK(struct io_ret r) -{ - assert(r.type == IO_BLOCK_T); - return r.u.b; -} - -static inline int IO_INT(struct io_ret r) -{ - assert(r.type == IO_INT_T); - return r.u.i; -} - - -#endif //_BLOCKASYNC_H_ diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/blockstore.c --- a/tools/blktap/blockstore.c Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,1350 +0,0 @@ -/************************************************************************** - * - * blockstore.c - * - * Simple block store interface - * - */ - -#include <fcntl.h> -#include <unistd.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <sys/time.h> -#include <stdarg.h> -#include "blockstore.h" -#include <pthread.h> - -//#define BLOCKSTORE_REMOTE -//#define BSDEBUG - -#define RETRY_TIMEOUT 1000000 /* microseconds */ - -/***************************************************************************** - * Debugging - */ -#ifdef BSDEBUG -void DB(char *format, ...) 
-{ - va_list args; - fprintf(stderr, "[%05u] ", (int)pthread_getspecific(tid_key)); - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); -} -#else -#define DB(format, ...) (void)0 -#endif - -#ifdef BLOCKSTORE_REMOTE - -#include <sys/socket.h> -#include <sys/ioctl.h> -#include <netinet/in.h> -#include <netdb.h> - -/***************************************************************************** - * Network state * - *****************************************************************************/ - -/* The individual disk servers we talks to. These will be referenced by - * an integer index into bsservers[]. - */ -bsserver_t bsservers[MAX_SERVERS]; - -/* The cluster map. This is indexed by an integer cluster number. - */ -bscluster_t bsclusters[MAX_CLUSTERS]; - -/* Local socket. - */ -struct sockaddr_in sin_local; -int bssock = 0; - -/***************************************************************************** - * Notification * - *****************************************************************************/ - -typedef struct pool_thread_t_struct { - pthread_mutex_t ptmutex; - pthread_cond_t ptcv; - int newdata; -} pool_thread_t; - -pool_thread_t pool_thread[READ_POOL_SIZE+1]; - -#define RECV_NOTIFY(tid) { \ - pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \ - pool_thread[tid].newdata = 1; \ - DB("CV Waking %u", tid); \ - pthread_cond_signal(&(pool_thread[tid].ptcv)); \ - pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); } -#define RECV_AWAIT(tid) { \ - pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \ - if (pool_thread[tid].newdata) { \ - pool_thread[tid].newdata = 0; \ - DB("CV Woken %u", tid); \ - } \ - else { \ - DB("CV Waiting %u", tid); \ - pthread_cond_wait(&(pool_thread[tid].ptcv), \ - &(pool_thread[tid].ptmutex)); \ - } \ - pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); } - -/***************************************************************************** - * Message queue management * - 
*****************************************************************************/ - -/* Protects the queue manipulation critcal regions. - */ -pthread_mutex_t ptmutex_queue; -#define ENTER_QUEUE_CR pthread_mutex_lock(&ptmutex_queue) -#define LEAVE_QUEUE_CR pthread_mutex_unlock(&ptmutex_queue) - -pthread_mutex_t ptmutex_recv; -#define ENTER_RECV_CR pthread_mutex_lock(&ptmutex_recv) -#define LEAVE_RECV_CR pthread_mutex_unlock(&ptmutex_recv) - -/* A message queue entry. We allocate one of these for every request we send. - * Asynchronous reply reception also used one of these. - */ -typedef struct bsq_t_struct { - struct bsq_t_struct *prev; - struct bsq_t_struct *next; - int status; - int server; - int length; - struct msghdr msghdr; - struct iovec iov[2]; - int tid; - struct timeval tv_sent; - bshdr_t message; - void *block; -} bsq_t; - -#define BSQ_STATUS_MATCHED 1 - -pthread_mutex_t ptmutex_luid; -#define ENTER_LUID_CR pthread_mutex_lock(&ptmutex_luid) -#define LEAVE_LUID_CR pthread_mutex_unlock(&ptmutex_luid) - -static u64 luid_cnt = 0x1000ULL; -u64 new_luid(void) { - u64 luid; - ENTER_LUID_CR; - luid = luid_cnt++; - LEAVE_LUID_CR; - return luid; -} - -/* Queue of outstanding requests. 
- */ -bsq_t *bs_head = NULL; -bsq_t *bs_tail = NULL; -int bs_qlen = 0; - -/* - */ -void queuedebug(char *msg) { - bsq_t *q; - ENTER_QUEUE_CR; - fprintf(stderr, "Q: %s len=%u\n", msg, bs_qlen); - for (q = bs_head; q; q = q->next) { - fprintf(stderr, " luid=%016llx server=%u\n", - q->message.luid, q->server); - } - LEAVE_QUEUE_CR; -} - -int enqueue(bsq_t *qe) { - ENTER_QUEUE_CR; - qe->next = NULL; - qe->prev = bs_tail; - if (!bs_head) - bs_head = qe; - else - bs_tail->next = qe; - bs_tail = qe; - bs_qlen++; - LEAVE_QUEUE_CR; -#ifdef BSDEBUG - queuedebug("enqueue"); -#endif - return 0; -} - -int dequeue(bsq_t *qe) { - bsq_t *q; - ENTER_QUEUE_CR; - for (q = bs_head; q; q = q->next) { - if (q == qe) { - if (q->prev) - q->prev->next = q->next; - else - bs_head = q->next; - if (q->next) - q->next->prev = q->prev; - else - bs_tail = q->prev; - bs_qlen--; - goto found; - } - } - - LEAVE_QUEUE_CR; -#ifdef BSDEBUG - queuedebug("dequeue not found"); -#endif - return 0; - - found: - LEAVE_QUEUE_CR; -#ifdef BSDEBUG - queuedebug("dequeue not found"); -#endif - return 1; -} - -bsq_t *queuesearch(bsq_t *qe) { - bsq_t *q; - ENTER_QUEUE_CR; - for (q = bs_head; q; q = q->next) { - if ((qe->server == q->server) && - (qe->message.operation == q->message.operation) && - (qe->message.luid == q->message.luid)) { - - if ((q->message.operation == BSOP_READBLOCK) && - ((q->message.flags & BSOP_FLAG_ERROR) == 0)) { - q->block = qe->block; - qe->block = NULL; - } - q->length = qe->length; - q->message.flags = qe->message.flags; - q->message.id = qe->message.id; - q->status |= BSQ_STATUS_MATCHED; - - if (q->prev) - q->prev->next = q->next; - else - bs_head = q->next; - if (q->next) - q->next->prev = q->prev; - else - bs_tail = q->prev; - q->next = NULL; - q->prev = NULL; - bs_qlen--; - goto found; - } - } - - LEAVE_QUEUE_CR; -#ifdef BSDEBUG - queuedebug("queuesearch not found"); -#endif - return NULL; - - found: - LEAVE_QUEUE_CR; -#ifdef BSDEBUG - queuedebug("queuesearch found"); -#endif - 
return q; -} - -/***************************************************************************** - * Network communication * - *****************************************************************************/ - -int send_message(bsq_t *qe) { - int rc; - - qe->msghdr.msg_name = (void *)&(bsservers[qe->server].sin); - qe->msghdr.msg_namelen = sizeof(struct sockaddr_in); - qe->msghdr.msg_iov = qe->iov; - if (qe->block) - qe->msghdr.msg_iovlen = 2; - else - qe->msghdr.msg_iovlen = 1; - qe->msghdr.msg_control = NULL; - qe->msghdr.msg_controllen = 0; - qe->msghdr.msg_flags = 0; - - qe->iov[0].iov_base = (void *)&(qe->message); - qe->iov[0].iov_len = MSGBUFSIZE_ID; - - if (qe->block) { - qe->iov[1].iov_base = qe->block; - qe->iov[1].iov_len = BLOCK_SIZE; - } - - qe->message.luid = new_luid(); - - qe->status = 0; - qe->tid = (int)pthread_getspecific(tid_key); - if (enqueue(qe) < 0) { - fprintf(stderr, "Error enqueuing request.\n"); - return -1; - } - - gettimeofday(&(qe->tv_sent), NULL); - DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid); - rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT); - //rc = sendto(bssock, (void *)&(qe->message), qe->length, 0, - // (struct sockaddr *)&(bsservers[qe->server].sin), - // sizeof(struct sockaddr_in)); - if (rc < 0) - return rc; - - return rc; -} - -int recv_message(bsq_t *qe) { - struct sockaddr_in from; - //int flen = sizeof(from); - int rc; - - qe->msghdr.msg_name = &from; - qe->msghdr.msg_namelen = sizeof(struct sockaddr_in); - qe->msghdr.msg_iov = qe->iov; - if (qe->block) - qe->msghdr.msg_iovlen = 2; - else - qe->msghdr.msg_iovlen = 1; - qe->msghdr.msg_control = NULL; - qe->msghdr.msg_controllen = 0; - qe->msghdr.msg_flags = 0; - - qe->iov[0].iov_base = (void *)&(qe->message); - qe->iov[0].iov_len = MSGBUFSIZE_ID; - if (qe->block) { - qe->iov[1].iov_base = qe->block; - qe->iov[1].iov_len = BLOCK_SIZE; - } - - rc = recvmsg(bssock, &(qe->msghdr), 0); - - //return recvfrom(bssock, (void *)&(qe->message), 
sizeof(bsmsg_t), 0, - // (struct sockaddr *)&from, &flen); - return rc; -} - -int get_server_number(struct sockaddr_in *sin) { - int i; - -#ifdef BSDEBUG2 - fprintf(stderr, - "get_server_number(%u.%u.%u.%u/%u)\n", - (unsigned int)sin->sin_addr.s_addr & 0xff, - ((unsigned int)sin->sin_addr.s_addr >> 8) & 0xff, - ((unsigned int)sin->sin_addr.s_addr >> 16) & 0xff, - ((unsigned int)sin->sin_addr.s_addr >> 24) & 0xff, - (unsigned int)sin->sin_port); -#endif - - for (i = 0; i < MAX_SERVERS; i++) { - if (bsservers[i].hostname) { -#ifdef BSDEBUG2 - fprintf(stderr, - "get_server_number check %u.%u.%u.%u/%u\n", - (unsigned int)bsservers[i].sin.sin_addr.s_addr&0xff, - ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 8)&0xff, - ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 16)&0xff, - ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 24)&0xff, - (unsigned int)bsservers[i].sin.sin_port); -#endif - if ((sin->sin_family == bsservers[i].sin.sin_family) && - (sin->sin_port == bsservers[i].sin.sin_port) && - (memcmp((void *)&(sin->sin_addr), - (void *)&(bsservers[i].sin.sin_addr), - sizeof(struct in_addr)) == 0)) { - return i; - } - } - } - - return -1; -} - -void *rx_buffer = NULL; -bsq_t rx_qe; -bsq_t *recv_any(void) { - struct sockaddr_in from; - int rc; - - DB("ENTER recv_any\n"); - - rx_qe.msghdr.msg_name = &from; - rx_qe.msghdr.msg_namelen = sizeof(struct sockaddr_in); - rx_qe.msghdr.msg_iov = rx_qe.iov; - if (!rx_buffer) { - rx_buffer = malloc(BLOCK_SIZE); - if (!rx_buffer) { - perror("recv_any malloc"); - return NULL; - } - } - rx_qe.block = rx_buffer; - rx_buffer = NULL; - rx_qe.msghdr.msg_iovlen = 2; - rx_qe.msghdr.msg_control = NULL; - rx_qe.msghdr.msg_controllen = 0; - rx_qe.msghdr.msg_flags = 0; - - rx_qe.iov[0].iov_base = (void *)&(rx_qe.message); - rx_qe.iov[0].iov_len = MSGBUFSIZE_ID; - rx_qe.iov[1].iov_base = rx_qe.block; - rx_qe.iov[1].iov_len = BLOCK_SIZE; - - rc = recvmsg(bssock, &(rx_qe.msghdr), 0); - if (rc < 0) { - perror("recv_any"); - return NULL; - 
} - - rx_qe.length = rc; - rx_qe.server = get_server_number(&from); - - DB("recv_any from %d luid=%016llx len=%u\n", - rx_qe.server, rx_qe.message.luid, rx_qe.length); - - return &rx_qe; -} - -void recv_recycle_buffer(bsq_t *q) { - if (q->block) { - rx_buffer = q->block; - q->block = NULL; - } -} - -// cycle through reading any incoming, searching for a match in the -// queue, until we have all we need. -int wait_recv(bsq_t **reqs, int numreqs) { - bsq_t *q, *m; - unsigned int x, i; - int tid = (int)pthread_getspecific(tid_key); - - DB("ENTER wait_recv %u\n", numreqs); - - checkmatch: - x = 0xffffffff; - for (i = 0; i < numreqs; i++) { - x &= reqs[i]->status; - } - if ((x & BSQ_STATUS_MATCHED)) { - DB("LEAVE wait_recv\n"); - return numreqs; - } - - RECV_AWAIT(tid); - - /* - rxagain: - ENTER_RECV_CR; - q = recv_any(); - LEAVE_RECV_CR; - if (!q) - return -1; - - m = queuesearch(q); - recv_recycle_buffer(q); - if (!m) { - fprintf(stderr, "Unmatched RX\n"); - goto rxagain; - } - */ - - goto checkmatch; - -} - -/* retry - */ -static int retry_count = 0; -int retry(bsq_t *qe) -{ - int rc; - gettimeofday(&(qe->tv_sent), NULL); - DB("retry to %d luid=%016llx\n", qe->server, qe->message.luid); - retry_count++; - rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT); - if (rc < 0) - return rc; - return 0; -} - -/* queue runner - */ -void *queue_runner(void *arg) -{ - for (;;) { - struct timeval now; - long long nowus, sus; - bsq_t *q; - int r; - - sleep(1); - - gettimeofday(&now, NULL); - nowus = now.tv_usec + now.tv_sec * 1000000; - ENTER_QUEUE_CR; - r = retry_count; - for (q = bs_head; q; q = q->next) { - sus = q->tv_sent.tv_usec + q->tv_sent.tv_sec * 1000000; - if ((nowus - sus) > RETRY_TIMEOUT) { - if (retry(q) < 0) { - fprintf(stderr, "Error on sendmsg retry.\n"); - } - } - } - if (r != retry_count) { - fprintf(stderr, "RETRIES: %u %u\n", retry_count - r, retry_count); - } - LEAVE_QUEUE_CR; - } -} - -/* receive loop - */ -void *receive_loop(void *arg) -{ - bsq_t *q, *m; - - 
for(;;) { - q = recv_any(); - if (!q) { - fprintf(stderr, "recv_any error\n"); - } - else { - m = queuesearch(q); - recv_recycle_buffer(q); - if (!m) { - fprintf(stderr, "Unmatched RX\n"); - } - else { - DB("RX MATCH"); - RECV_NOTIFY(m->tid); - } - } - } -} -pthread_t pthread_recv; - -/***************************************************************************** - * Reading * - *****************************************************************************/ - -void *readblock_indiv(int server, u64 id) { - void *block; - bsq_t *qe; - int len, rc; - - qe = (bsq_t *)malloc(sizeof(bsq_t)); - if (!qe) { - perror("readblock qe malloc"); - return NULL; - } - qe->block = NULL; - - /* - qe->block = malloc(BLOCK_SIZE); - if (!qe->block) { - perror("readblock qe malloc"); - free((void *)qe); - return NULL; - } - */ - - qe->server = server; - - qe->message.operation = BSOP_READBLOCK; - qe->message.flags = 0; - qe->message.id = id; - qe->length = MSGBUFSIZE_ID; - - if (send_message(qe) < 0) { - perror("readblock sendto"); - goto err; - } - - /*len = recv_message(qe); - if (len < 0) { - perror("readblock recv"); - goto err; - }*/ - - rc = wait_recv(&qe, 1); - if (rc < 0) { - perror("readblock recv"); - goto err; - } - - if ((qe->message.flags & BSOP_FLAG_ERROR)) { - fprintf(stderr, "readblock server error\n"); - goto err; - } - if (qe->length < MSGBUFSIZE_BLOCK) { - fprintf(stderr, "readblock recv short (%u)\n", len); - goto err; - } - /* if ((block = malloc(BLOCK_SIZE)) == NULL) { - perror("readblock malloc"); - goto err; - } - memcpy(block, qe->message.block, BLOCK_SIZE); - */ - block = qe->block; - - free((void *)qe); - return block; - - err: - if (qe->block) - free(qe->block); - free((void *)qe); - return NULL; -} - -/** - * readblock: read a block from disk - * @id: block id to read - * - * @return: pointer to block, NULL on error - */ -void *readblock(u64 id) { - int map = (int)BSID_MAP(id); - u64 xid; - static int i = CLUSTER_MAX_REPLICAS - 1; - void *block = NULL; - - /* 
special case for the "superblock" just use the first block on the - * first replica. (extend to blocks < 6 for vdi bug) - */ - if (id < 6) { - block = readblock_indiv(bsclusters[map].servers[0], id); - goto out; - } - - i++; - if (i >= CLUSTER_MAX_REPLICAS) - i = 0; - switch (i) { - case 0: - xid = BSID_REPLICA0(id); - break; - case 1: - xid = BSID_REPLICA1(id); - break; - case 2: - xid = BSID_REPLICA2(id); - break; - } - - block = readblock_indiv(bsclusters[map].servers[i], xid); - - out: -#ifdef BSDEBUG - if (block) - fprintf(stderr, "READ: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n", - id, - (unsigned int)((unsigned char *)block)[0], - (unsigned int)((unsigned char *)block)[1], - (unsigned int)((unsigned char *)block)[2], - (unsigned int)((unsigned char *)block)[3], - (unsigned int)((unsigned char *)block)[4], - (unsigned int)((unsigned char *)block)[5], - (unsigned int)((unsigned char *)block)[6], - (unsigned int)((unsigned char *)block)[7]); - else - fprintf(stderr, "READ: %016llx NULL\n", id); -#endif - return block; -} - -/***************************************************************************** - * Writing * - *****************************************************************************/ - -bsq_t *writeblock_indiv(int server, u64 id, void *block) { - - bsq_t *qe; - int len; - - qe = (bsq_t *)malloc(sizeof(bsq_t)); - if (!qe) { - perror("writeblock qe malloc"); - goto err; - } - qe->server = server; - - qe->message.operation = BSOP_WRITEBLOCK; - qe->message.flags = 0; - qe->message.id = id; - //memcpy(qe->message.block, block, BLOCK_SIZE); - qe->block = block; - qe->length = MSGBUFSIZE_BLOCK; - - if (send_message(qe) < 0) { - perror("writeblock sendto"); - goto err; - } - - return qe; - - err: - free((void *)qe); - return NULL; -} - - -/** - * writeblock: write an existing block to disk - * @id: block id - * @block: pointer to block - * - * @return: zero on success, -1 on failure - */ -int writeblock(u64 id, void *block) { - - int map = 
(int)BSID_MAP(id); - int rep0 = bsclusters[map].servers[0]; - int rep1 = bsclusters[map].servers[1]; - int rep2 = bsclusters[map].servers[2]; - bsq_t *reqs[3]; - int rc; - - reqs[0] = reqs[1] = reqs[2] = NULL; - -#ifdef BSDEBUG - fprintf(stderr, - "WRITE: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n", - id, - (unsigned int)((unsigned char *)block)[0], - (unsigned int)((unsigned char *)block)[1], - (unsigned int)((unsigned char *)block)[2], - (unsigned int)((unsigned char *)block)[3], - (unsigned int)((unsigned char *)block)[4], - (unsigned int)((unsigned char *)block)[5], - (unsigned int)((unsigned char *)block)[6], - (unsigned int)((unsigned char *)block)[7]); -#endif - - /* special case for the "superblock" just use the first block on the - * first replica. (extend to blocks < 6 for vdi bug) - */ - if (id < 6) { - reqs[0] = writeblock_indiv(rep0, id, block); - if (!reqs[0]) - return -1; - rc = wait_recv(reqs, 1); - return rc; - } - - reqs[0] = writeblock_indiv(rep0, BSID_REPLICA0(id), block); - if (!reqs[0]) - goto err; - reqs[1] = writeblock_indiv(rep1, BSID_REPLICA1(id), block); - if (!reqs[1]) - goto err; - reqs[2] = writeblock_indiv(rep2, BSID_REPLICA2(id), block); - if (!reqs[2]) - goto err; - - rc = wait_recv(reqs, 3); - if (rc < 0) { - perror("writeblock recv"); - goto err; - } - if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) { - fprintf(stderr, "writeblock server0 error\n"); - goto err; - } - if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) { - fprintf(stderr, "writeblock server1 error\n"); - goto err; - } - if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) { - fprintf(stderr, "writeblock server2 error\n"); - goto err; - } - - - free((void *)reqs[0]); - free((void *)reqs[1]); - free((void *)reqs[2]); - return 0; - - err: - if (reqs[0]) { - dequeue(reqs[0]); - free((void *)reqs[0]); - } - if (reqs[1]) { - dequeue(reqs[1]); - free((void *)reqs[1]); - } - if (reqs[2]) { - dequeue(reqs[2]); - free((void *)reqs[2]); - } - return -1; -} - 
-/***************************************************************************** - * Allocation * - *****************************************************************************/ - -/** - * allocblock: write a new block to disk - * @block: pointer to block - * - * @return: new id of block on disk - */ -u64 allocblock(void *block) { - return allocblock_hint(block, 0); -} - -bsq_t *allocblock_hint_indiv(int server, void *block, u64 hint) { - bsq_t *qe; - int len; - - qe = (bsq_t *)malloc(sizeof(bsq_t)); - if (!qe) { - perror("allocblock_hint qe malloc"); - goto err; - } - qe->server = server; - - qe->message.operation = BSOP_ALLOCBLOCK; - qe->message.flags = 0; - qe->message.id = hint; - //memcpy(qe->message.block, block, BLOCK_SIZE); - qe->block = block; - qe->length = MSGBUFSIZE_BLOCK; - - if (send_message(qe) < 0) { - perror("allocblock_hint sendto"); - goto err; - } - - return qe; - - err: - free((void *)qe); - return NULL; -} - -/** - * allocblock_hint: write a new block to disk - * @block: pointer to block - * @hint: allocation hint - * - * @return: new id of block on disk - */ -u64 allocblock_hint(void *block, u64 hint) { - int map = (int)hint; - int rep0 = bsclusters[map].servers[0]; - int rep1 = bsclusters[map].servers[1]; - int rep2 = bsclusters[map].servers[2]; - bsq_t *reqs[3]; - int rc; - u64 id0, id1, id2; - - reqs[0] = reqs[1] = reqs[2] = NULL; - - DB("ENTER allocblock\n"); - - reqs[0] = allocblock_hint_indiv(rep0, block, hint); - if (!reqs[0]) - goto err; - reqs[1] = allocblock_hint_indiv(rep1, block, hint); - if (!reqs[1]) - goto err; - reqs[2] = allocblock_hint_indiv(rep2, block, hint); - if (!reqs[2]) - goto err; - - rc = wait_recv(reqs, 3); - if (rc < 0) { - perror("allocblock recv"); - goto err; - } - if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) { - fprintf(stderr, "allocblock server0 error\n"); - goto err; - } - if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) { - fprintf(stderr, "allocblock server1 error\n"); - goto err; - } - if 
((reqs[2]->message.flags & BSOP_FLAG_ERROR)) { - fprintf(stderr, "allocblock server2 error\n"); - goto err; - } - - id0 = reqs[0]->message.id; - id1 = reqs[1]->message.id; - id2 = reqs[2]->message.id; - -#ifdef BSDEBUG - fprintf(stderr, "ALLOC: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n", - BSID(map, id0, id1, id2), - (unsigned int)((unsigned char *)block)[0], - (unsigned int)((unsigned char *)block)[1], - (unsigned int)((unsigned char *)block)[2], - (unsigned int)((unsigned char *)block)[3], - (unsigned int)((unsigned char *)block)[4], - (unsigned int)((unsigned char *)block)[5], - (unsigned int)((unsigned char *)block)[6], - (unsigned int)((unsigned char *)block)[7]); -#endif - - free((void *)reqs[0]); - free((void *)reqs[1]); - free((void *)reqs[2]); - return BSID(map, id0, id1, id2); - - err: - if (reqs[0]) { - dequeue(reqs[0]); - free((void *)reqs[0]); - } - if (reqs[1]) { - dequeue(reqs[1]); - free((void *)reqs[1]); - } - if (reqs[2]) { - dequeue(reqs[2]); - free((void *)reqs[2]); - } - return 0; -} - -#else /* /BLOCKSTORE_REMOTE */ - -/***************************************************************************** - * Local storage version * - *****************************************************************************/ - -/** - * readblock: read a block from disk - * @id: block id to read - * - * @return: pointer to block, NULL on error - */ - -void *readblock(u64 id) { - void *block; - int block_fp; - -//printf("readblock(%llu)\n", id); - block_fp = open("blockstore.dat", O_RDONLY | O_CREAT | O_LARGEFILE, 0644); - - if (block_fp < 0) { - perror("open"); - return NULL; - } - - if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { - printf ("%Ld ", id); - printf ("%Ld\n", (id - 1) * BLOCK_SIZE); - perror("readblock lseek"); - goto err; - } - if ((block = malloc(BLOCK_SIZE)) == NULL) { - perror("readblock malloc"); - goto err; - } - if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) { - perror("readblock read"); - free(block); - goto 
err; - } - close(block_fp); - return block; - -err: - close(block_fp); - return NULL; -} - -/** - * writeblock: write an existing block to disk - * @id: block id - * @block: pointer to block - * - * @return: zero on success, -1 on failure - */ -int writeblock(u64 id, void *block) { - - int block_fp; - - block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644); - - if (block_fp < 0) { - perror("open"); - return -1; - } - - if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { - perror("writeblock lseek"); - goto err; - } - if (write(block_fp, block, BLOCK_SIZE) < 0) { - perror("writeblock write"); - goto err; - } - close(block_fp); - return 0; - -err: - close(block_fp); - return -1; -} - -/** - * allocblock: write a new block to disk - * @block: pointer to block - * - * @return: new id of block on disk - */ - -u64 allocblock(void *block) { - u64 lb; - off64_t pos; - int block_fp; - - block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644); - - if (block_fp < 0) { - perror("open"); - return 0; - } - - pos = lseek64(block_fp, 0, SEEK_END); - if (pos == (off64_t)-1) { - perror("allocblock lseek"); - goto err; - } - if (pos % BLOCK_SIZE != 0) { - fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE); - goto err; - } - if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) { - perror("allocblock write"); - goto err; - } - lb = pos / BLOCK_SIZE + 1; -//printf("alloc(%Ld)\n", lb); - close(block_fp); - return lb; - -err: - close(block_fp); - return 0; - -} - -/** - * allocblock_hint: write a new block to disk - * @block: pointer to block - * @hint: allocation hint - * - * @return: new id of block on disk - */ -u64 allocblock_hint(void *block, u64 hint) { - return allocblock(block); -} - -#endif /* BLOCKSTORE_REMOTE */ - -/***************************************************************************** - * Memory management * - *****************************************************************************/ - -/** - * 
newblock: get a new in-memory block set to zeros - * - * @return: pointer to new block, NULL on error - */ -void *newblock() { - void *block = malloc(BLOCK_SIZE); - if (block == NULL) { - perror("newblock"); - return NULL; - } - memset(block, 0, BLOCK_SIZE); - return block; -} - - -/** - * freeblock: unallocate an in-memory block - * @id: block id (zero if this is only in-memory) - * @block: block to be freed - */ -void freeblock(void *block) { - if (block != NULL) - free(block); -} - -static freeblock_t *new_freeblock(void) -{ - freeblock_t *fb; - - fb = newblock(); - - if (fb == NULL) return NULL; - - fb->magic = FREEBLOCK_MAGIC; - fb->next = 0ULL; - fb->count = 0ULL; - memset(fb->list, 0, sizeof fb->list); - - return fb; -} - -void releaseblock(u64 id) -{ - blockstore_super_t *bs_super; - freeblock_t *fl_current; - - /* get superblock */ - bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER); - - /* get freeblock_current */ - if (bs_super->freelist_current == 0ULL) - { - fl_current = new_freeblock(); - bs_super->freelist_current = allocblock(fl_current); - writeblock(BLOCKSTORE_SUPER, bs_super); - } else { - fl_current = readblock(bs_super->freelist_current); - } - - /* if full, chain to superblock and allocate new current */ - - if (fl_current->count == FREEBLOCK_SIZE) { - fl_current->next = bs_super->freelist_full; - writeblock(bs_super->freelist_current, fl_current); - bs_super->freelist_full = bs_super->freelist_current; - freeblock(fl_current); - fl_current = new_freeblock(); - bs_super->freelist_current = allocblock(fl_current); - writeblock(BLOCKSTORE_SUPER, bs_super); - } - - /* append id to current */ - fl_current->list[fl_current->count++] = id; - writeblock(bs_super->freelist_current, fl_current); - - freeblock(fl_current); - freeblock(bs_super); - - -} - -/* freelist debug functions: */ -void freelist_count(int print_each) -{ - blockstore_super_t *bs_super; - freeblock_t *fb; - u64 total = 0, next; - - bs_super = (blockstore_super_t *) 
readblock(BLOCKSTORE_SUPER); - - if (bs_super->freelist_current == 0ULL) { - printf("freelist is empty!\n"); - return; - } - - fb = readblock(bs_super->freelist_current); - printf("%Ld entires on current.\n", fb->count); - total += fb->count; - if (print_each == 1) - { - int i; - for (i=0; i< fb->count; i++) - printf(" %Ld\n", fb->list[i]); - } - - freeblock(fb); - - if (bs_super->freelist_full == 0ULL) { - printf("freelist_full is empty!\n"); - return; - } - - next = bs_super->freelist_full; - for (;;) { - fb = readblock(next); - total += fb->count; - if (print_each == 1) - { - int i; - for (i=0; i< fb->count; i++) - printf(" %Ld\n", fb->list[i]); - } - next = fb->next; - freeblock(fb); - if (next == 0ULL) break; - } - printf("Total of %Ld ids on freelist.\n", total); -} - -/***************************************************************************** - * Initialisation * - *****************************************************************************/ - -int __init_blockstore(void) -{ - int i; - blockstore_super_t *bs_super; - u64 ret; - int block_fp; - -#ifdef BLOCKSTORE_REMOTE - struct hostent *addr; - - pthread_mutex_init(&ptmutex_queue, NULL); - pthread_mutex_init(&ptmutex_luid, NULL); - pthread_mutex_init(&ptmutex_recv, NULL); - /*pthread_mutex_init(&ptmutex_notify, NULL);*/ - for (i = 0; i <= READ_POOL_SIZE; i++) { - pool_thread[i].newdata = 0; - pthread_mutex_init(&(pool_thread[i].ptmutex), NULL); - pthread_cond_init(&(pool_thread[i].ptcv), NULL); - } - - bsservers[0].hostname = "firebug.cl.cam.ac.uk"; - bsservers[1].hostname = "planb.cl.cam.ac.uk"; - bsservers[2].hostname = "simcity.cl.cam.ac.uk"; - bsservers[3].hostname = NULL/*"gunfighter.cl.cam.ac.uk"*/; - bsservers[4].hostname = NULL/*"galaxian.cl.cam.ac.uk"*/; - bsservers[5].hostname = NULL/*"firetrack.cl.cam.ac.uk"*/; - bsservers[6].hostname = NULL/*"funfair.cl.cam.ac.uk"*/; - bsservers[7].hostname = NULL/*"felix.cl.cam.ac.uk"*/; - bsservers[8].hostname = NULL; - bsservers[9].hostname = NULL; - 
bsservers[10].hostname = NULL; - bsservers[11].hostname = NULL; - bsservers[12].hostname = NULL; - bsservers[13].hostname = NULL; - bsservers[14].hostname = NULL; - bsservers[15].hostname = NULL; - - for (i = 0; i < MAX_SERVERS; i++) { - if (!bsservers[i].hostname) - continue; - addr = gethostbyname(bsservers[i].hostname); - if (!addr) { - perror("bad hostname"); - return -1; - } - bsservers[i].sin.sin_family = addr->h_addrtype; - bsservers[i].sin.sin_port = htons(BLOCKSTORED_PORT); - bsservers[i].sin.sin_addr.s_addr = - ((struct in_addr *)(addr->h_addr))->s_addr; - } - - /* Cluster map - */ - bsclusters[0].servers[0] = 0; - bsclusters[0].servers[1] = 1; - bsclusters[0].servers[2] = 2; - bsclusters[1].servers[0] = 1; - bsclusters[1].servers[1] = 2; - bsclusters[1].servers[2] = 3; - bsclusters[2].servers[0] = 2; - bsclusters[2].servers[1] = 3; - bsclusters[2].servers[2] = 4; - bsclusters[3].servers[0] = 3; - bsclusters[3].servers[1] = 4; - bsclusters[3].servers[2] = 5; - bsclusters[4].servers[0] = 4; - bsclusters[4].servers[1] = 5; - bsclusters[4].servers[2] = 6; - bsclusters[5].servers[0] = 5; - bsclusters[5].servers[1] = 6; - bsclusters[5].servers[2] = 7; - bsclusters[6].servers[0] = 6; - bsclusters[6].servers[1] = 7; - bsclusters[6].servers[2] = 0; - bsclusters[7].servers[0] = 7; - bsclusters[7].servers[1] = 0; - bsclusters[7].servers[2] = 1; - - /* Local socket set up - */ - bssock = socket(AF_INET, SOCK_DGRAM, 0); - if (bssock < 0) { - perror("Bad socket"); - return -1; - } - memset(&sin_local, 0, sizeof(sin_local)); - sin_local.sin_family = AF_INET; - sin_local.sin_port = htons(BLOCKSTORED_PORT); - sin_local.sin_addr.s_addr = htonl(INADDR_ANY); - if (bind(bssock, (struct sockaddr *)&sin_local, sizeof(sin_local)) < 0) { - perror("bind"); - close(bssock); - return -1; - } - - pthread_create(&pthread_recv, NULL, receive_loop, NULL); - pthread_create(&pthread_recv, NULL, queue_runner, NULL); - -#else /* /BLOCKSTORE_REMOTE */ - block_fp = open("blockstore.dat", 
O_RDWR | O_CREAT | O_LARGEFILE, 0644); - - if (block_fp < 0) { - perror("open"); - return -1; - exit(-1); - } - - if (lseek(block_fp, 0, SEEK_END) == 0) { - bs_super = newblock(); - bs_super->magic = BLOCKSTORE_MAGIC; - bs_super->freelist_full = 0LL; - bs_super->freelist_current = 0LL; - - ret = allocblock(bs_super); - - freeblock(bs_super); - } else { - bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER); - if (bs_super->magic != BLOCKSTORE_MAGIC) - { - printf("BLOCKSTORE IS CORRUPT! (no magic in superblock!)\n"); - exit(-1); - } - freeblock(bs_super); - } - - close(block_fp); - -#endif /* BLOCKSTORE_REMOTE */ - return 0; -} - -void __exit_blockstore(void) -{ - int i; -#ifdef BLOCKSTORE_REMOTE - pthread_mutex_destroy(&ptmutex_recv); - pthread_mutex_destroy(&ptmutex_luid); - pthread_mutex_destroy(&ptmutex_queue); - /*pthread_mutex_destroy(&ptmutex_notify); - pthread_cond_destroy(&ptcv_notify);*/ - for (i = 0; i <= READ_POOL_SIZE; i++) { - pthread_mutex_destroy(&(pool_thread[i].ptmutex)); - pthread_cond_destroy(&(pool_thread[i].ptcv)); - } -#endif -} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/blockstore.h --- a/tools/blktap/blockstore.h Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,134 +0,0 @@ -/************************************************************************** - * - * blockstore.h - * - * Simple block store interface - * - */ - -#ifndef __BLOCKSTORE_H__ -#define __BLOCKSTORE_H__ - -#include <netinet/in.h> -#include <xc.h> - -#define BLOCK_SIZE 4096 -#define BLOCK_SHIFT 12 -#define BLOCK_MASK 0xfffffffffffff000LL - -/* XXX SMH: where is the below supposed to be defined???? 
*/ -#ifndef SECTOR_SHIFT -#define SECTOR_SHIFT 9 -#endif - -#define FREEBLOCK_SIZE (BLOCK_SIZE / sizeof(u64)) - (3 * sizeof(u64)) -#define FREEBLOCK_MAGIC 0x0fee0fee0fee0feeULL - -typedef struct { - u64 magic; - u64 next; - u64 count; - u64 list[FREEBLOCK_SIZE]; -} freeblock_t; - -#define BLOCKSTORE_MAGIC 0xaaaaaaa00aaaaaaaULL -#define BLOCKSTORE_SUPER 1ULL - -typedef struct { - u64 magic; - u64 freelist_full; - u64 freelist_current; -} blockstore_super_t; - -extern void *newblock(); -extern void *readblock(u64 id); -extern u64 allocblock(void *block); -extern u64 allocblock_hint(void *block, u64 hint); -extern int writeblock(u64 id, void *block); - -/* Add this blockid to a freelist, to be recycled by the allocator. */ -extern void releaseblock(u64 id); - -/* this is a memory free() operation for block-sized allocations */ -extern void freeblock(void *block); -extern int __init_blockstore(void); - -/* debug for freelist. */ -void freelist_count(int print_each); -#define ALLOCFAIL (((u64)(-1))) - -/* Distribution - */ -#define BLOCKSTORED_PORT 9346 - -struct bshdr_t_struct { - u32 operation; - u32 flags; - u64 id; - u64 luid; -} __attribute__ ((packed)); -typedef struct bshdr_t_struct bshdr_t; - -struct bsmsg_t_struct { - bshdr_t hdr; - unsigned char block[BLOCK_SIZE]; -} __attribute__ ((packed)); - -typedef struct bsmsg_t_struct bsmsg_t; - -#define MSGBUFSIZE_OP sizeof(u32) -#define MSGBUFSIZE_FLAGS (sizeof(u32) + sizeof(u32)) -#define MSGBUFSIZE_ID (sizeof(u32) + sizeof(u32) + sizeof(u64) + sizeof(u64)) -#define MSGBUFSIZE_BLOCK sizeof(bsmsg_t) - -#define BSOP_READBLOCK 0x01 -#define BSOP_WRITEBLOCK 0x02 -#define BSOP_ALLOCBLOCK 0x03 -#define BSOP_FREEBLOCK 0x04 - -#define BSOP_FLAG_ERROR 0x01 - -#define BS_ALLOC_SKIP 10 -#define BS_ALLOC_HACK - -/* Remote hosts and cluster map - XXX need to generalise - */ - -/* - - Interim ID format is - - 63 60 59 40 39 20 19 0 - +----+--------------------+--------------------+--------------------+ - |map | replica 2 | replica 
1 | replica 0 | - +----+--------------------+--------------------+--------------------+ - - The map is an index into a table detailing which machines form the - cluster. - - */ - -#define BSID_REPLICA0(_id) ((_id)&0xfffffULL) -#define BSID_REPLICA1(_id) (((_id)>>20)&0xfffffULL) -#define BSID_REPLICA2(_id) (((_id)>>40)&0xfffffULL) -#define BSID_MAP(_id) (((_id)>>60)&0xfULL) - -#define BSID(_map, _rep0, _rep1, _rep2) ((((u64)(_map))<<60) | \ - (((u64)(_rep2))<<40) | \ - (((u64)(_rep1))<<20) | ((u64)(_rep0))) - -typedef struct bsserver_t_struct { - char *hostname; - struct sockaddr_in sin; -} bsserver_t; - -#define MAX_SERVERS 16 - -#define CLUSTER_MAX_REPLICAS 3 -typedef struct bscluster_t_struct { - int servers[CLUSTER_MAX_REPLICAS]; -} bscluster_t; - -#define MAX_CLUSTERS 16 - -#endif /* __BLOCKSTORE_H__ */ diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/blockstored.c --- a/tools/blktap/blockstored.c Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,276 +0,0 @@ -/************************************************************************** - * - * blockstored.c - * - * Block store daemon. 
- * - */ - -#include <fcntl.h> -#include <unistd.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <sys/socket.h> -#include <sys/ioctl.h> -#include <netinet/in.h> -#include <errno.h> -#include "blockstore.h" - -//#define BSDEBUG - -int readblock_into(u64 id, void *block); - -int open_socket(u16 port) { - - struct sockaddr_in sn; - int sock; - - sock = socket(AF_INET, SOCK_DGRAM, 0); - if (sock < 0) { - perror("Bad socket"); - return -1; - } - memset(&sn, 0, sizeof(sn)); - sn.sin_family = AF_INET; - sn.sin_port = htons(port); - sn.sin_addr.s_addr = htonl(INADDR_ANY); - if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) { - perror("bind"); - close(sock); - return -1; - } - - return sock; -} - -static int block_fp = -1; -static int bssock = -1; - -int send_reply(struct sockaddr_in *peer, void *buffer, int len) { - - int rc; - -#ifdef BSDEBUG - fprintf(stdout, "TX: %u bytes op=%u id=0x%llx\n", - len, ((bsmsg_t *)buffer)->hdr.operation, ((bsmsg_t *)buffer)->hdr.id); -#endif - rc = sendto(bssock, buffer, len, 0, (struct sockaddr *)peer, sizeof(*peer)); - if (rc < 0) { - perror("send_reply"); - return 1; - } - - - return 0; -} - -static bsmsg_t msgbuf; - -void service_loop(void) { - - for (;;) { - int rc, len; - struct sockaddr_in from; - size_t slen = sizeof(from); - u64 bid; - - len = recvfrom(bssock, (void *)&msgbuf, sizeof(msgbuf), 0, - (struct sockaddr *)&from, &slen); - - if (len < 0) { - perror("recvfrom"); - continue; - } - - if (len < MSGBUFSIZE_OP) { - fprintf(stderr, "Short packet.\n"); - continue; - } - -#ifdef BSDEBUG - fprintf(stdout, "RX: %u bytes op=%u id=0x%llx\n", - len, msgbuf.hdr.operation, msgbuf.hdr.id); -#endif - - switch (msgbuf.hdr.operation) { - case BSOP_READBLOCK: - if (len < MSGBUFSIZE_ID) { - fprintf(stderr, "Short packet (readblock %u).\n", len); - continue; - } - rc = readblock_into(msgbuf.hdr.id, msgbuf.block); - if (rc < 0) { - fprintf(stderr, "readblock 
error\n"); - msgbuf.hdr.flags = BSOP_FLAG_ERROR; - send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID); - continue; - } - msgbuf.hdr.flags = 0; - send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_BLOCK); - break; - case BSOP_WRITEBLOCK: - if (len < MSGBUFSIZE_BLOCK) { - fprintf(stderr, "Short packet (writeblock %u).\n", len); - continue; - } - rc = writeblock(msgbuf.hdr.id, msgbuf.block); - if (rc < 0) { - fprintf(stderr, "writeblock error\n"); - msgbuf.hdr.flags = BSOP_FLAG_ERROR; - send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID); - continue; - } - msgbuf.hdr.flags = 0; - send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID); - break; - case BSOP_ALLOCBLOCK: - if (len < MSGBUFSIZE_BLOCK) { - fprintf(stderr, "Short packet (allocblock %u).\n", len); - continue; - } - bid = allocblock(msgbuf.block); - if (bid == ALLOCFAIL) { - fprintf(stderr, "allocblock error\n"); - msgbuf.hdr.flags = BSOP_FLAG_ERROR; - send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID); - continue; - } - msgbuf.hdr.id = bid; - msgbuf.hdr.flags = 0; - send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID); - break; - } - - } -} - -/** - * readblock: read a block from disk - * @id: block id to read - * @block: pointer to buffer to receive block - * - * @return: 0 if OK, other on error - */ - -int readblock_into(u64 id, void *block) { - if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { - printf ("%Ld\n", (id - 1) * BLOCK_SIZE); - perror("readblock lseek"); - return -1; - } - if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) { - perror("readblock read"); - return -1; - } - return 0; -} - -/** - * writeblock: write an existing block to disk - * @id: block id - * @block: pointer to block - * - * @return: zero on success, -1 on failure - */ -int writeblock(u64 id, void *block) { - if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { - perror("writeblock lseek"); - return -1; - } - if (write(block_fp, block, BLOCK_SIZE) < 0) { - perror("writeblock write"); - return -1; - } 
- return 0; -} - -/** - * allocblock: write a new block to disk - * @block: pointer to block - * - * @return: new id of block on disk - */ -static u64 lastblock = 0; - -u64 allocblock(void *block) { - u64 lb; - off64_t pos; - - retry: - pos = lseek64(block_fp, 0, SEEK_END); - if (pos == (off64_t)-1) { - perror("allocblock lseek"); - return ALLOCFAIL; - } - if (pos % BLOCK_SIZE != 0) { - fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE); - return ALLOCFAIL; - } - if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) { - perror("allocblock write"); - return ALLOCFAIL; - } - lb = pos / BLOCK_SIZE + 1; - -#ifdef BS_ALLOC_HACK - if (lb < BS_ALLOC_SKIP) - goto retry; -#endif - - if (lb <= lastblock) - printf("[*** %Ld alredy allocated! ***]\n", lb); - - lastblock = lb; - return lb; -} - -/** - * newblock: get a new in-memory block set to zeros - * - * @return: pointer to new block, NULL on error - */ -void *newblock() { - void *block = malloc(BLOCK_SIZE); - if (block == NULL) { - perror("newblock"); - return NULL; - } - memset(block, 0, BLOCK_SIZE); - return block; -} - - -/** - * freeblock: unallocate an in-memory block - * @id: block id (zero if this is only in-memory) - * @block: block to be freed - */ -void freeblock(void *block) { - if (block != NULL) - free(block); -} - - -int main(int argc, char **argv) -{ - block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644); - - if (block_fp < 0) { - perror("open"); - return -1; - } - - bssock = open_socket(BLOCKSTORED_PORT); - if (bssock < 0) { - return -1; - } - - service_loop(); - - close(bssock); - - return 0; -} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/bstest.c --- a/tools/blktap/bstest.c Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,191 +0,0 @@ -/************************************************************************** - * - * bstest.c - * - * Block store daemon test program. 
- * - * usage: bstest <host>|X {r|w|a} ID - * - */ - -#include <fcntl.h> -#include <unistd.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <sys/socket.h> -#include <sys/ioctl.h> -#include <netinet/in.h> -#include <netdb.h> -#include <errno.h> -#include "blockstore.h" - -int direct(char *host, u32 op, u64 id, int len) { - struct sockaddr_in sn, peer; - int sock; - bsmsg_t msgbuf; - int rc, slen; - struct hostent *addr; - - addr = gethostbyname(host); - if (!addr) { - perror("bad hostname"); - exit(1); - } - peer.sin_family = addr->h_addrtype; - peer.sin_port = htons(BLOCKSTORED_PORT); - peer.sin_addr.s_addr = ((struct in_addr *)(addr->h_addr))->s_addr; - fprintf(stderr, "Sending to: %u.%u.%u.%u\n", - (unsigned int)(unsigned char)addr->h_addr[0], - (unsigned int)(unsigned char)addr->h_addr[1], - (unsigned int)(unsigned char)addr->h_addr[2], - (unsigned int)(unsigned char)addr->h_addr[3]); - - sock = socket(AF_INET, SOCK_DGRAM, 0); - if (sock < 0) { - perror("Bad socket"); - exit(1); - } - memset(&sn, 0, sizeof(sn)); - sn.sin_family = AF_INET; - sn.sin_port = htons(BLOCKSTORED_PORT); - sn.sin_addr.s_addr = htonl(INADDR_ANY); - if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) { - perror("bind"); - close(sock); - exit(1); - } - - memset((void *)&msgbuf, 0, sizeof(msgbuf)); - msgbuf.operation = op; - msgbuf.id = id; - - rc = sendto(sock, (void *)&msgbuf, len, 0, - (struct sockaddr *)&peer, sizeof(peer)); - if (rc < 0) { - perror("sendto"); - exit(1); - } - - slen = sizeof(peer); - len = recvfrom(sock, (void *)&msgbuf, sizeof(msgbuf), 0, - (struct sockaddr *)&peer, &slen); - if (len < 0) { - perror("recvfrom"); - exit(1); - } - - printf("Reply %u bytes:\n", len); - if (len >= MSGBUFSIZE_OP) - printf(" operation: %u\n", msgbuf.operation); - if (len >= MSGBUFSIZE_FLAGS) - printf(" flags: 0x%x\n", msgbuf.flags); - if (len >= MSGBUFSIZE_ID) - printf(" id: %llu\n", msgbuf.id); - if (len >= 
(MSGBUFSIZE_ID + 4)) - printf(" data: %02x %02x %02x %02x...\n", - (unsigned int)msgbuf.block[0], - (unsigned int)msgbuf.block[1], - (unsigned int)msgbuf.block[2], - (unsigned int)msgbuf.block[3]); - - if (sock > 0) - close(sock); - - return 0; -} - -int main (int argc, char **argv) { - - u32 op = 0; - u64 id = 0; - int len = 0, rc; - void *block; - - if (argc < 3) { - fprintf(stderr, "usage: bstest <host>|X {r|w|a} ID\n"); - return 1; - } - - switch (argv[2][0]) { - case 'r': - case 'R': - op = BSOP_READBLOCK; - len = MSGBUFSIZE_ID; - break; - case 'w': - case 'W': - op = BSOP_WRITEBLOCK; - len = MSGBUFSIZE_BLOCK; - break; - case 'a': - case 'A': - op = BSOP_ALLOCBLOCK; - len = MSGBUFSIZE_BLOCK; - break; - default: - fprintf(stderr, "Unknown action '%s'.\n", argv[2]); - return 1; - } - - if (argc >= 4) - id = atoll(argv[3]); - - if (strcmp(argv[1], "X") == 0) { - rc = __init_blockstore(); - if (rc < 0) { - fprintf(stderr, "blockstore init failed.\n"); - return 1; - } - switch(op) { - case BSOP_READBLOCK: - block = readblock(id); - if (block) { - printf("data: %02x %02x %02x %02x...\n", - (unsigned int)((unsigned char*)block)[0], - (unsigned int)((unsigned char*)block)[1], - (unsigned int)((unsigned char*)block)[2], - (unsigned int)((unsigned char*)block)[3]); - } - break; - case BSOP_WRITEBLOCK: - block = malloc(BLOCK_SIZE); - if (!block) { - perror("bstest malloc"); - return 1; - } - memset(block, 0, BLOCK_SIZE); - rc = writeblock(id, block); - if (rc != 0) { - printf("error\n"); - } - else { - printf("OK\n"); - } - break; - case BSOP_ALLOCBLOCK: - block = malloc(BLOCK_SIZE); - if (!block) { - perror("bstest malloc"); - return 1; - } - memset(block, 0, BLOCK_SIZE); - id = allocblock_hint(block, id); - if (id == 0) { - printf("error\n"); - } - else { - printf("ID: %llu\n", id); - } - break; - } - } - else { - direct(argv[1], op, id, len); - } - - - return 0; -} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/parallax.c --- a/tools/blktap/parallax.c Sun Jul 3 
22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,611 +0,0 @@ -/************************************************************************** - * - * parallax.c - * - * The Parallax Storage Server - * - */ - - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <pthread.h> -#include "blktaplib.h" -#include "blockstore.h" -#include "vdi.h" -#include "block-async.h" -#include "requests-async.h" - -#define PARALLAX_DEV 61440 -#define SECTS_PER_NODE 8 - - -#if 0 -#define DPRINTF(_f, _a...) printf ( _f , ## _a ) -#else -#define DPRINTF(_f, _a...) ((void)0) -#endif - -/* ------[ session records ]----------------------------------------------- */ - -#define BLKIF_HASHSZ 1024 -#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1)) - -#define VDI_HASHSZ 16 -#define VDI_HASH(_vd) ((((_vd)>>8)^(_vd))&(VDI_HASHSZ-1)) - -typedef struct blkif { - domid_t domid; - unsigned int handle; - enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; - vdi_t *vdi_hash[VDI_HASHSZ]; - struct blkif *hash_next; -} blkif_t; - -static blkif_t *blkif_hash[BLKIF_HASHSZ]; - -blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle) -{ - if ( handle != 0 ) - printf("blktap/parallax don't currently support non-0 dev handles!\n"); - - blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)]; - while ( (blkif != NULL) && - ((blkif->domid != domid) || (blkif->handle != handle)) ) - blkif = blkif->hash_next; - return blkif; -} - -vdi_t *blkif_get_vdi(blkif_t *blkif, blkif_vdev_t device) -{ - vdi_t *vdi = blkif->vdi_hash[VDI_HASH(device)]; - - while ((vdi != NULL) && (vdi->vdevice != device)) - vdi = vdi->next; - - return vdi; -} - -/* ------[ control message handling ]-------------------------------------- */ - -void blkif_create(blkif_be_create_t *create) -{ - domid_t domid = create->domid; - unsigned int handle = create->blkif_handle; - blkif_t **pblkif, *blkif; - - DPRINTF("parallax (blkif_create): create is %p\n", create); - - if ( (blkif = (blkif_t 
*)malloc(sizeof(blkif_t))) == NULL ) - { - DPRINTF("Could not create blkif: out of memory\n"); - create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; - return; - } - - memset(blkif, 0, sizeof(*blkif)); - blkif->domid = domid; - blkif->handle = handle; - blkif->status = DISCONNECTED; - - pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; - while ( *pblkif != NULL ) - { - if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) ) - { - DPRINTF("Could not create blkif: already exists (%d,%d)\n", - domid, handle); - create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS; - free(blkif); - return; - } - pblkif = &(*pblkif)->hash_next; - } - - blkif->hash_next = *pblkif; - *pblkif = blkif; - - DPRINTF("Successfully created blkif\n"); - create->status = BLKIF_BE_STATUS_OKAY; -} - -void blkif_destroy(blkif_be_destroy_t *destroy) -{ - domid_t domid = destroy->domid; - unsigned int handle = destroy->blkif_handle; - blkif_t **pblkif, *blkif; - - DPRINTF("parallax (blkif_destroy): destroy is %p\n", destroy); - - pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; - while ( (blkif = *pblkif) != NULL ) - { - if ( (blkif->domid == domid) && (blkif->handle == handle) ) - { - if ( blkif->status != DISCONNECTED ) - goto still_connected; - goto destroy; - } - pblkif = &blkif->hash_next; - } - - destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - - still_connected: - destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; - return; - - destroy: - *pblkif = blkif->hash_next; - free(blkif); - destroy->status = BLKIF_BE_STATUS_OKAY; -} - -void vbd_create(blkif_be_vbd_create_t *create) -{ - blkif_t *blkif; - vdi_t *vdi, **vdip; - blkif_vdev_t vdevice = create->vdevice; - - DPRINTF("parallax (vbd_create): create=%p\n", create); - - blkif = blkif_find_by_handle(create->domid, create->blkif_handle); - if ( blkif == NULL ) - { - DPRINTF("vbd_create attempted for non-existent blkif (%u,%u)\n", - create->domid, create->blkif_handle); - create->status = 
BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - } - - /* VDI identifier is in grow->extent.sector_start */ - DPRINTF("vbd_create: create->dev_handle (id) is %lx\n", - (unsigned long)create->dev_handle); - - vdi = vdi_get(create->dev_handle); - if (vdi == NULL) - { - printf("parallax (vbd_create): VDI %lx not found.\n", - (unsigned long)create->dev_handle); - create->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; - return; - } - - vdi->next = NULL; - vdi->vdevice = vdevice; - vdip = &blkif->vdi_hash[VDI_HASH(vdevice)]; - while (*vdip != NULL) - vdip = &(*vdip)->next; - *vdip = vdi; - - DPRINTF("blkif_create succeeded\n"); - create->status = BLKIF_BE_STATUS_OKAY; -} - -void vbd_destroy(blkif_be_vbd_destroy_t *destroy) -{ - blkif_t *blkif; - vdi_t *vdi, **vdip; - blkif_vdev_t vdevice = destroy->vdevice; - - blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle); - if ( blkif == NULL ) - { - DPRINTF("vbd_destroy attempted for non-existent blkif (%u,%u)\n", - destroy->domid, destroy->blkif_handle); - destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - } - - vdip = &blkif->vdi_hash[VDI_HASH(vdevice)]; - while ((*vdip != NULL) && ((*vdip)->vdevice != vdevice)) - vdip = &(*vdip)->next; - - if (*vdip != NULL) - { - vdi = *vdip; - *vdip = vdi->next; - vdi_put(vdi); - } - -} - -int parallax_control(control_msg_t *msg) -{ - domid_t domid; - int ret; - - DPRINTF("parallax_control: msg is %p\n", msg); - - if (msg->type != CMSG_BLKIF_BE) - { - printf("Unexpected control message (%d)\n", msg->type); - return 0; - } - - switch(msg->subtype) - { - case CMSG_BLKIF_BE_CREATE: - if ( msg->length != sizeof(blkif_be_create_t) ) - goto parse_error; - blkif_create((blkif_be_create_t *)msg->msg); - break; - - case CMSG_BLKIF_BE_DESTROY: - if ( msg->length != sizeof(blkif_be_destroy_t) ) - goto parse_error; - blkif_destroy((blkif_be_destroy_t *)msg->msg); - break; - - case CMSG_BLKIF_BE_VBD_CREATE: - if ( msg->length != sizeof(blkif_be_vbd_create_t) ) - goto 
parse_error; - vbd_create((blkif_be_vbd_create_t *)msg->msg); - break; - - case CMSG_BLKIF_BE_VBD_DESTROY: - if ( msg->length != sizeof(blkif_be_vbd_destroy_t) ) - goto parse_error; - vbd_destroy((blkif_be_vbd_destroy_t *)msg->msg); - break; - - case CMSG_BLKIF_BE_CONNECT: - case CMSG_BLKIF_BE_DISCONNECT: - /* we don't manage the device channel, the tap does. */ - break; - - default: - goto parse_error; - } - return 0; -parse_error: - printf("Bad control message!\n"); - return 0; - -} - -int parallax_probe(blkif_request_t *req, blkif_t *blkif) -{ - blkif_response_t *rsp; - vdisk_t *img_info; - vdi_t *vdi; - int i, nr_vdis = 0; - - DPRINTF("parallax_probe: req=%p, blkif=%p\n", req, blkif); - - /* We expect one buffer only. */ - if ( req->nr_segments != 1 ) - goto err; - - /* Make sure the buffer is page-sized. */ - if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) || - (blkif_last_sect (req->frame_and_sects[0]) != 7) ) - goto err; - - /* fill the list of devices */ - for (i=0; i<VDI_HASHSZ; i++) { - vdi = blkif->vdi_hash[i]; - while (vdi) { - img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0); - img_info[nr_vdis].device = vdi->vdevice; - img_info[nr_vdis].info = 0; - /* The -1 here accounts for the LSB in the radix tree */ - img_info[nr_vdis].capacity = - ((1LL << (VDI_HEIGHT-1)) * SECTS_PER_NODE); - nr_vdis++; - vdi = vdi->next; - } - } - - - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_PROBE; - rsp->status = nr_vdis; /* number of disks */ - - DPRINTF("parallax_probe: send positive response (nr_vdis=%d)\n", nr_vdis); - return BLKTAP_RESPOND; -err: - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_PROBE; - rsp->status = BLKIF_RSP_ERROR; - - DPRINTF("parallax_probe: send error response\n"); - return BLKTAP_RESPOND; -} - -typedef struct { - blkif_request_t *req; - int count; - int error; - pthread_mutex_t mutex; -} pending_t; - -#define MAX_REQUESTS 64 -pending_t pending_list[MAX_REQUESTS]; - 
-struct cb_param { - pending_t *pent; - int segment; - u64 sector; - u64 vblock; /* for debug printing -- can be removed. */ -}; - -static void read_cb(struct io_ret r, void *in_param) -{ - struct cb_param *param = (struct cb_param *)in_param; - pending_t *p = param->pent; - int segment = param->segment; - blkif_request_t *req = p->req; - unsigned long size, offset, start; - char *dpage, *spage; - - spage = IO_BLOCK(r); - if (spage == NULL) { p->error++; goto finish; } - dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), segment); - - /* Calculate read size and offset within the read block. */ - - offset = (param->sector << SECTOR_SHIFT) % BLOCK_SIZE; - size = ( blkif_last_sect (req->frame_and_sects[segment]) - - blkif_first_sect(req->frame_and_sects[segment]) + 1 - ) << SECTOR_SHIFT; - start = blkif_first_sect(req->frame_and_sects[segment]) - << SECTOR_SHIFT; - - DPRINTF("ParallaxRead: sect: %lld (%ld,%ld), " - "vblock %llx, " - "size %lx\n", - param->sector, blkif_first_sect(p->req->frame_and_sects[segment]), - blkif_last_sect (p->req->frame_and_sects[segment]), - param->vblock, size); - - memcpy(dpage + start, spage + offset, size); - freeblock(spage); - - /* Done the read. Now update the pending record. 
*/ - finish: - pthread_mutex_lock(&p->mutex); - p->count--; - - if (p->count == 0) { - blkif_response_t *rsp; - - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_READ; - if (p->error == 0) { - rsp->status = BLKIF_RSP_OKAY; - } else { - rsp->status = BLKIF_RSP_ERROR; - } - blktap_inject_response(rsp); - } - - pthread_mutex_unlock(&p->mutex); - - free(param); /* TODO: replace with cached alloc/dealloc */ -} - -int parallax_read(blkif_request_t *req, blkif_t *blkif) -{ - blkif_response_t *rsp; - u64 vblock, gblock; - vdi_t *vdi; - u64 sector; - int i; - char *dpage, *spage; - pending_t *pent; - - vdi = blkif_get_vdi(blkif, req->device); - - if ( vdi == NULL ) - goto err; - - pent = &pending_list[ID_TO_IDX(req->id)]; - pent->count = req->nr_segments; - pent->req = req; - pthread_mutex_init(&pent->mutex, NULL); - - for (i = 0; i < req->nr_segments; i++) { - pthread_t tid; - int ret; - struct cb_param *p; - - /* Round the requested segment to a block address. */ - sector = req->sector_number + (8*i); - vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT; - - /* TODO: Replace this call to malloc with a cached allocation */ - p = (struct cb_param *)malloc(sizeof(struct cb_param)); - p->pent = pent; - p->sector = sector; - p->segment = i; - p->vblock = vblock; /* dbg */ - - /* Get that block from the store. */ - vdi_read(vdi, vblock, read_cb, (void *)p); - } - - return BLKTAP_STOLEN; - -err: - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_READ; - rsp->status = BLKIF_RSP_ERROR; - - return BLKTAP_RESPOND; -} - -static void write_cb(struct io_ret r, void *in_param) -{ - struct cb_param *param = (struct cb_param *)in_param; - pending_t *p = param->pent; - blkif_request_t *req = p->req; - - /* catch errors from the block code. 
*/ - if (IO_INT(r) < 0) p->error++; - - pthread_mutex_lock(&p->mutex); - p->count--; - - if (p->count == 0) { - blkif_response_t *rsp; - - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_WRITE; - if (p->error == 0) { - rsp->status = BLKIF_RSP_OKAY; - } else { - rsp->status = BLKIF_RSP_ERROR; - } - blktap_inject_response(rsp); - } - - pthread_mutex_unlock(&p->mutex); - - free(param); /* TODO: replace with cached alloc/dealloc */ -} - -int parallax_write(blkif_request_t *req, blkif_t *blkif) -{ - blkif_response_t *rsp; - u64 sector; - int i, writable = 0; - u64 vblock, gblock; - char *spage; - unsigned long size, offset, start; - vdi_t *vdi; - pending_t *pent; - - vdi = blkif_get_vdi(blkif, req->device); - - if ( vdi == NULL ) - goto err; - - pent = &pending_list[ID_TO_IDX(req->id)]; - pent->count = req->nr_segments; - pent->req = req; - pthread_mutex_init(&pent->mutex, NULL); - - for (i = 0; i < req->nr_segments; i++) { - struct cb_param *p; - - spage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); - - /* Round the requested segment to a block address. */ - - sector = req->sector_number + (8*i); - vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT; - - /* Calculate read size and offset within the read block. */ - - offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE; - size = ( blkif_last_sect (req->frame_and_sects[i]) - - blkif_first_sect(req->frame_and_sects[i]) + 1 - ) << SECTOR_SHIFT; - start = blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT; - - DPRINTF("ParallaxWrite: sect: %lld (%ld,%ld), " - "vblock %llx, gblock %llx, " - "size %lx\n", - sector, blkif_first_sect(req->frame_and_sects[i]), - blkif_last_sect (req->frame_and_sects[i]), - vblock, gblock, size); - - /* XXX: For now we just freak out if they try to write a */ - /* non block-sized, block-aligned page. 
*/ - - if ((offset != 0) || (size != BLOCK_SIZE) || (start != 0)) { - printf("]\n] STRANGE WRITE!\n]\n"); - goto err; - } - - /* TODO: Replace this call to malloc with a cached allocation */ - p = (struct cb_param *)malloc(sizeof(struct cb_param)); - p->pent = pent; - p->sector = sector; - p->segment = i; - p->vblock = vblock; /* dbg */ - - /* Issue the write to the store. */ - vdi_write(vdi, vblock, spage, write_cb, (void *)p); - } - - return BLKTAP_STOLEN; - -err: - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_WRITE; - rsp->status = BLKIF_RSP_ERROR; - - return BLKTAP_RESPOND; -} - -int parallax_request(blkif_request_t *req) -{ - blkif_response_t *rsp; - domid_t dom = ID_TO_DOM(req->id); - blkif_t *blkif = blkif_find_by_handle(dom, 0); - - if (blkif == NULL) - goto err; - - if ( req->operation == BLKIF_OP_PROBE ) { - - return parallax_probe(req, blkif); - - } else if ( req->operation == BLKIF_OP_READ ) { - - return parallax_read(req, blkif); - - } else if ( req->operation == BLKIF_OP_WRITE ) { - - return parallax_write(req, blkif); - - } else { - printf("Unknown request message type!\n"); - /* Unknown operation */ - goto err; - } - -err: - rsp = (blkif_response_t *)req; - rsp->operation = req->operation; - rsp->id = req->id; - rsp->status = BLKIF_RSP_ERROR; - return BLKTAP_RESPOND; -} - -void __init_parallax(void) -{ - memset(blkif_hash, 0, sizeof(blkif_hash)); -} - - - -int main(int argc, char *argv[]) -{ - DPRINTF("parallax: starting.\n"); - __init_blockstore(); - DPRINTF("parallax: initialized blockstore...\n"); - init_block_async(); - DPRINTF("parallax: initialized async blocks...\n"); - __init_vdi(); - DPRINTF("parallax: initialized vdi registry etc...\n"); - __init_parallax(); - DPRINTF("parallax: initialized local stuff..\n"); - - blktap_register_ctrl_hook("parallax_control", parallax_control); - blktap_register_request_hook("parallax_request", parallax_request); - DPRINTF("parallax: added ctrl + request hooks, starting 
listen...\n"); - blktap_listen(); - - return 0; -} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/radix.c --- a/tools/blktap/radix.c Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,631 +0,0 @@ -/* - * Radix tree for mapping (up to) 63-bit virtual block IDs to - * 63-bit global block IDs - * - * Pointers within the tree set aside the least significant bit to indicate - * whther or not the target block is writable from this node. - * - * The block with ID 0 is assumed to be an empty block of all zeros - */ - -#include <unistd.h> -#include <stdio.h> -#include <stdlib.h> -#include <assert.h> -#include <string.h> -#include <pthread.h> -#include "blockstore.h" -#include "radix.h" - -#define RADIX_TREE_MAP_SHIFT 9 -#define RADIX_TREE_MAP_MASK 0x1ff -#define RADIX_TREE_MAP_ENTRIES 512 - -/* -#define DEBUG -*/ - -/* Experimental radix cache. */ - -static pthread_mutex_t rcache_mutex = PTHREAD_MUTEX_INITIALIZER; -static int rcache_count = 0; -#define RCACHE_MAX 1024 - -typedef struct rcache_st { - radix_tree_node *node; - u64 id; - struct rcache_st *hash_next; - struct rcache_st *cache_next; - struct rcache_st *cache_prev; -} rcache_t; - -static rcache_t *rcache_head = NULL; -static rcache_t *rcache_tail = NULL; - -#define RCHASH_SIZE 512ULL -rcache_t *rcache[RCHASH_SIZE]; -#define RCACHE_HASH(_id) ((_id) & (RCHASH_SIZE - 1)) - -void __rcache_init(void) -{ - int i; - - for (i=0; i<RCHASH_SIZE; i++) - rcache[i] = NULL; -} - - -void rcache_write(u64 id, radix_tree_node *node) -{ - rcache_t *r, *tmp, **curs; - - pthread_mutex_lock(&rcache_mutex); - - /* Is it already in the cache? */ - r = rcache[RCACHE_HASH(id)]; - - for (;;) { - if (r == NULL) - break; - if (r->id == id) - { - memcpy(r->node, node, BLOCK_SIZE); - - /* bring to front. 
*/ - if (r != rcache_head) { - - if (r == rcache_tail) { - if (r->cache_prev != NULL) rcache_tail = r->cache_prev; - rcache_tail->cache_next = NULL; - } - - tmp = r->cache_next; - if (r->cache_next != NULL) r->cache_next->cache_prev - = r->cache_prev; - if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp; - - r->cache_prev = NULL; - r->cache_next = rcache_head; - if (rcache_head != NULL) rcache_head->cache_prev = r; - rcache_head = r; - } - -//printf("Update (%Ld)\n", r->id); - goto done; - } - r = r->hash_next; - } - - if ( rcache_count == RCACHE_MAX ) - { - /* Remove an entry */ - - r = rcache_tail; - if (r->cache_prev != NULL) rcache_tail = r->cache_prev; - rcache_tail->cache_next = NULL; - freeblock(r->node); - - curs = &rcache[RCACHE_HASH(r->id)]; - while ((*curs) != r) - curs = &(*curs)->hash_next; - *curs = r->hash_next; -//printf("Evict (%Ld)\n", r->id); - - } else { - - r = (rcache_t *)malloc(sizeof(rcache_t)); - rcache_count++; - } - - r->node = newblock(); - memcpy(r->node, node, BLOCK_SIZE); - r->id = id; - - r->hash_next = rcache[RCACHE_HASH(id)]; - rcache[RCACHE_HASH(id)] = r; - - r->cache_prev = NULL; - r->cache_next = rcache_head; - if (rcache_head != NULL) rcache_head->cache_prev = r; - rcache_head = r; - if (rcache_tail == NULL) rcache_tail = r; - -//printf("Added (%Ld, %p)\n", id, r->node); -done: - pthread_mutex_unlock(&rcache_mutex); -} - -radix_tree_node *rcache_read(u64 id) -{ - rcache_t *r, *tmp; - radix_tree_node *node = NULL; - - pthread_mutex_lock(&rcache_mutex); - - r = rcache[RCACHE_HASH(id)]; - - for (;;) { - if (r == NULL) { -//printf("Miss (%Ld)\n", id); - goto done; - } - if (r->id == id) break; - r = r->hash_next; - } - - /* bring to front. 
*/ - if (r != rcache_head) - { - if (r == rcache_tail) { - if (r->cache_prev != NULL) rcache_tail = r->cache_prev; - rcache_tail->cache_next = NULL; - } - tmp = r->cache_next; - if (r->cache_next != NULL) r->cache_next->cache_prev = r->cache_prev; - if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp; - - r->cache_prev = NULL; - r->cache_next = rcache_head; - if (rcache_head != NULL) rcache_head->cache_prev = r; - rcache_head = r; - } - - node = newblock(); - memcpy(node, r->node, BLOCK_SIZE); - -//printf("Hit (%Ld, %p)\n", id, r->node); -done: - pthread_mutex_unlock(&rcache_mutex); - - return(node); -} - - -void *rc_readblock(u64 id) -{ - void *ret; - - ret = (void *)rcache_read(id); - - if (ret != NULL) return ret; - - ret = readblock(id); - - if (ret != NULL) - rcache_write(id, ret); - - return(ret); -} - -u64 rc_allocblock(void *block) -{ - u64 ret; - - ret = allocblock(block); - - if (ret != ZERO) - rcache_write(ret, block); - - return(ret); -} - -int rc_writeblock(u64 id, void *block) -{ - int ret; - - ret = writeblock(id, block); - rcache_write(id, block); - - return(ret); -} - - -/* - * block device interface and other helper functions - * with these functions, block id is just a 63-bit number, with - * no special consideration for the LSB - */ -radix_tree_node cloneblock(radix_tree_node block); - -/* - * main api - * with these functions, the LSB of root always indicates - * whether or not the block is writable, including the return - * values of update and snapshot - */ -u64 lookup(int height, u64 root, u64 key); -u64 update(int height, u64 root, u64 key, u64 val); -u64 snapshot(u64 root); - -/** - * cloneblock: clone an existing block in memory - * @block: the old block - * - * @return: new block, with LSB cleared for every entry - */ -radix_tree_node cloneblock(radix_tree_node block) { - radix_tree_node node = (radix_tree_node) malloc(BLOCK_SIZE); - int i; - if (node == NULL) { - perror("cloneblock malloc"); - return NULL; - } - for (i = 0; i < 
RADIX_TREE_MAP_ENTRIES; i++) - node[i] = block[i] & ONEMASK; - return node; -} - -/** - * lookup: find a value given a key - * @height: height in bits of the radix tree - * @root: root node id, with set LSB indicating writable node - * @key: key to lookup - * - * @return: value on success, zero on error - */ - -u64 lookup(int height, u64 root, u64 key) { - radix_tree_node node; - u64 mask = ONE; - - assert(key >> height == 0); - - /* the root block may be smaller to ensure all leaves are full */ - height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT; - - /* now carve off equal sized chunks at each step */ - for (;;) { - u64 oldroot; - -#ifdef DEBUG - printf("lookup: height=%3d root=%3Ld offset=%3d%s\n", height, root, - (int) ((key >> height) & RADIX_TREE_MAP_MASK), - (iswritable(root) ? "" : " (readonly)")); -#endif - - if (getid(root) == ZERO) - return ZERO; - - oldroot = root; - node = (radix_tree_node) rc_readblock(getid(root)); - if (node == NULL) - return ZERO; - - root = node[(key >> height) & RADIX_TREE_MAP_MASK]; - mask &= root; - freeblock(node); - - if (height == 0) - return ( root & ONEMASK ) | mask; - - height -= RADIX_TREE_MAP_SHIFT; - } - - return ZERO; -} - -/* - * update: set a radix tree entry, doing copy-on-write as necessary - * @height: height in bits of the radix tree - * @root: root node id, with set LSB indicating writable node - * @key: key to set - * @val: value to set, s.t. 
radix(key)=val - * - * @returns: (possibly new) root id on success (with LSB=1), 0 on failure - */ - -u64 update(int height, u64 root, u64 key, u64 val) { - int offset; - u64 child; - radix_tree_node node; - - /* base case--return val */ - if (height == 0) - return val; - - /* the root block may be smaller to ensure all leaves are full */ - height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT; - offset = (key >> height) & RADIX_TREE_MAP_MASK; - -#ifdef DEBUG - printf("update: height=%3d root=%3Ld offset=%3d%s\n", height, root, - offset, (iswritable(root)?"":" (clone)")); -#endif - - /* load a block, or create a new one */ - if (root == ZERO) { - node = (radix_tree_node) newblock(); - } else { - node = (radix_tree_node) rc_readblock(getid(root)); - - if (!iswritable(root)) { - /* need to clone this node */ - radix_tree_node oldnode = node; - node = cloneblock(node); - freeblock(oldnode); - root = ZERO; - } - } - - if (node == NULL) { -#ifdef DEBUG - printf("update: node is null!\n"); -#endif - return ZERO; - } - - child = update(height, node[offset], key, val); - - if (child == ZERO) { - freeblock(node); - return ZERO; - } else if (child == node[offset]) { - /* no change, so we already owned the child */ - assert(iswritable(root)); - - freeblock(node); - return root; - } - - node[offset] = child; - - /* new/cloned blocks need to be saved */ - if (root == ZERO) { - /* mark this as an owned block */ - root = rc_allocblock(node); - if (root) - root = writable(root); - } else if (rc_writeblock(getid(root), node) < 0) { - freeblock(node); - return ZERO; - } - - freeblock(node); - return root; -} - -/** - * snapshot: create a snapshot - * @root: old root node - * - * @return: new root node, 0 on error - */ -u64 snapshot(u64 root) { - radix_tree_node node, newnode; - - if ((node = rc_readblock(getid(root))) == NULL) - return ZERO; - - newnode = cloneblock(node); - freeblock(node); - if (newnode == NULL) - return ZERO; - - root = rc_allocblock(newnode); - 
freeblock(newnode); - - if (root == ZERO) - return ZERO; - else - return writable(root); -} - -/** - * collapse: collapse a parent onto a child. - * - * NOTE: This assumes that parent and child really are, and further that - * there are no other children forked from this parent. (children of the - * child are okay...) - */ - -int collapse(int height, u64 proot, u64 croot) -{ - int i, numlinks, ret, total = 0; - radix_tree_node pnode, cnode; - - if (height == 0) { - height = -1; /* terminate recursion */ - } else { - height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT; - } - numlinks = (1UL << RADIX_TREE_MAP_SHIFT); - - /* Terminal cases: */ - - if ( (getid(proot) == ZERO) || (getid(croot) == ZERO) ) - return -1; - - /* get roots */ - if ((pnode = readblock(getid(proot))) == NULL) - return -1; - - if ((cnode = readblock(getid(croot))) == NULL) - { - freeblock(pnode); - return -1; - } - - /* For each writable link in proot */ - for (i=0; i<numlinks; i++) - { - if ( pnode[i] == cnode[i] ) continue; - - /* collapse (next level) */ - /* if height != 0 and writable... 
*/ - if (( height >= 0 ) && ( iswritable(pnode[i]) ) ) - { - //printf(" %Ld is writable (i=%d).\n", getid(pnode[i]), i); - ret = collapse(height, pnode[i], cnode[i]); - if (ret == -1) - { - total = -1; - } else { - total += ret; - } - } - - - } - - /* if plink is writable, AND clink is writable -> free plink block */ - if ( ( iswritable(proot) ) && ( iswritable(croot) ) ) - { - releaseblock(getid(proot)); - if (ret >=0) total++; - //printf(" Delete %Ld\n", getid(proot)); - } -//printf("done : %Ld\n", getid(proot)); - return total; - -} - - -void print_root(u64 root, int height, FILE *dot_f) -{ - FILE *f; - int i; - radix_tree_node node; - char *style[2] = { "", "style=bold,color=blue," }; - - if (dot_f == NULL) { - f = fopen("radix.dot", "w"); - if (f == NULL) { - perror("print_root: open"); - return; - } - - /* write graph preamble */ - fprintf(f, "digraph G {\n"); - - /* add a node for this root. */ - fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n", - getid(root), style[iswritable(root)], getid(root)); - } - - printf("print_root(%Ld)\n", getid(root)); - - /* base case */ - if (height == 0) { - /* add a node and edge for each child root */ - node = (radix_tree_node) readblock(getid(root)); - if (node == NULL) - return; - - for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) { - if (node[i] != ZERO) { - fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n", - getid(node[i]), style[iswritable(node[i])], - getid(node[i])); - fprintf(f, " n%Ld -> n%Ld [label=\"%d\"]\n", getid(root), - getid(node[i]), i); - } - } - freeblock(node); - return; - } - - /* the root block may be smaller to ensure all leaves are full */ - height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT; - - if (getid(root) == ZERO) - return; - - node = (radix_tree_node) readblock(getid(root)); - if (node == NULL) - return; - - /* add a node and edge for each child root */ - for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) - if (node[i] != ZERO) { - fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n", 
- getid(node[i]), style[iswritable(node[i])], - getid(node[i])); - - print_root(node[i], height-RADIX_TREE_MAP_SHIFT, f); - fprintf(f, " n%Ld -> n%Ld [label=\"%d\"]\n", getid(root), - getid(node[i]), i); - } - - freeblock(node); - - /* write graph postamble */ - if (dot_f == NULL) { - fprintf(f, "}\n"); - fclose(f); - } -} - -#ifdef RADIX_STANDALONE - -int main(int argc, char **argv) { - u64 key = ZERO, val = ZERO; - u64 root = writable(2ULL); - u64 p = ZERO, c = ZERO; - int v; - char buff[4096]; - - __init_blockstore(); - - memset(buff, 0, 4096); - /*fp = open("radix.dat", O_RDWR | O_CREAT, 0644); - - if (fp < 3) { - perror("open"); - return -1; - } - if (lseek(fp, 0, SEEK_END) == 0) { - write(fp, buff, 4096); - }*/ - - allocblock(buff); - - printf("Recognized commands:\n" - "Note: the LSB of a node number indicates if it is writable\n" - " root <node> set root to <node>\n" - " snapshot take a snapshot of the root\n" - " set <key> <val> set key=val\n" - " get <key> query key\n" - " c <proot> <croot> collapse\n" - " pr print tree to dot\n" - " pf <1=verbose> print freelist\n" - " quit\n" - "\nroot = %Ld\n", root); - for (;;) { - //print_root(root, 34, NULL); - //system("dot radix.dot -Tps -o radix.ps"); - - printf("> "); - fflush(stdout); - fgets(buff, 1024, stdin); - if (feof(stdin)) - break; - if (sscanf(buff, " root %Ld", &root) == 1) { - printf("root set to %Ld\n", root); - } else if (sscanf(buff, " set %Ld %Ld", &key, &val) == 2) { - root = update(34, root, key, val); - printf("root = %Ld\n", root); - } else if (sscanf(buff, " c %Ld %Ld", &p, &c) == 2) { - v = collapse(34, p, c); - printf("reclaimed %d blocks.\n", v); - } else if (sscanf(buff, " get %Ld", &key) == 1) { - val = lookup(34, root, key); - printf("value = %Ld\n", val); - } else if (!strcmp(buff, "quit\n")) { - break; - } else if (!strcmp(buff, "snapshot\n")) { - root = snapshot(root); - printf("new root = %Ld\n", root); - } else if (sscanf(buff, " pr %Ld", &root) == 1) { - print_root(root, 34, 
NULL); - } else if (sscanf(buff, " pf %d", &v) == 1) { - freelist_count(v); - } else if (!strcmp(buff, "pf\n")) { - freelist_count(0); - } else { - printf("command not recognized\n"); - } - } - return 0; -} - -#endif diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/radix.h --- a/tools/blktap/radix.h Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,45 +0,0 @@ -/* - * Radix tree for mapping (up to) 63-bit virtual block IDs to - * 63-bit global block IDs - * - * Pointers within the tree set aside the least significant bit to indicate - * whther or not the target block is writable from this node. - * - * The block with ID 0 is assumed to be an empty block of all zeros - */ - -#ifndef __RADIX_H__ -#define __RADIX_H__ - -/* I don't really like exposing these, but... */ -#define getid(x) (((x)>>1)&0x7fffffffffffffffLL) -#define putid(x) ((x)<<1) -#define writable(x) (((x)<<1)|1LL) -#define iswritable(x) ((x)&1LL) -#define ZERO 0LL -#define ONE 1LL -#define ONEMASK 0xffffffffffffffeLL - -#define RADIX_TREE_MAP_SHIFT 9 -#define RADIX_TREE_MAP_MASK 0x1ff -#define RADIX_TREE_MAP_ENTRIES 512 - -typedef u64 *radix_tree_node; - - -/* - * main api - * with these functions, the LSB of root always indicates - * whether or not the block is writable, including the return - * values of update and snapshot - */ -u64 lookup(int height, u64 root, u64 key); -u64 update(int height, u64 root, u64 key, u64 val); -u64 snapshot(u64 root); -int collapse(int height, u64 proot, u64 croot); -int isprivate(int height, u64 root, u64 key); - - -void __rcache_init(void); - -#endif /* __RADIX_H__ */ diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/requests-async.c --- a/tools/blktap/requests-async.c Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,762 +0,0 @@ -/* requests-async.c - * - * asynchronous request dispatcher for radix access in parallax. 
- */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <ctype.h> -#include <assert.h> -#include <pthread.h> -#include <err.h> -#include <zlib.h> /* for crc32() */ -#include "requests-async.h" -#include "vdi.h" -#include "radix.h" - -#define L1_IDX(_a) (((_a) & 0x0000000007fc0000ULL) >> 18) -#define L2_IDX(_a) (((_a) & 0x000000000003fe00ULL) >> 9) -#define L3_IDX(_a) (((_a) & 0x00000000000001ffULL)) - - -#if 0 -#define DPRINTF(_f, _a...) printf ( _f , ## _a ) -#else -#define DPRINTF(_f, _a...) ((void)0) -#endif - -struct block_info { - u32 crc; - u32 unused; -}; - -struct io_req { - enum { IO_OP_READ, IO_OP_WRITE } op; - u64 root; - u64 vaddr; - int state; - io_cb_t cb; - void *param; - struct radix_lock *lock; - - /* internal stuff: */ - struct io_ret retval;/* holds the return while we unlock. */ - char *block; /* the block to write */ - radix_tree_node radix[3]; - u64 radix_addr[3]; - struct block_info bi; -}; - -void clear_w_bits(radix_tree_node node) -{ - int i; - for (i=0; i<RADIX_TREE_MAP_ENTRIES; i++) - node[i] = node[i] & ONEMASK; - return; -} - -void clear_L3_w_bits(radix_tree_node node) -{ - int i; - for (i=0; i<RADIX_TREE_MAP_ENTRIES; i+=2) - node[i] = node[i] & ONEMASK; - return; -} - -enum states { - /* both */ - READ_L1, - READ_L2, - READ_L3, - - /* read */ - READ_LOCKED, - READ_DATA, - READ_UNLOCKED, - RETURN_ZERO, - - /* write */ - WRITE_LOCKED, - WRITE_DATA, - WRITE_L3, - WRITE_UNLOCKED, - - /* L3 Zero Path */ - ALLOC_DATA_L3z, - WRITE_L3_L3z, - - /* L3 Fault Path */ - ALLOC_DATA_L3f, - WRITE_L3_L3f, - - /* L2 Zero Path */ - ALLOC_DATA_L2z, - WRITE_L2_L2z, - ALLOC_L3_L2z, - WRITE_L2_L3z, - - /* L2 Fault Path */ - READ_L3_L2f, - ALLOC_DATA_L2f, - WRITE_L2_L2f, - ALLOC_L3_L2f, - WRITE_L2_L3f, - - /* L1 Zero Path */ - ALLOC_DATA_L1z, - ALLOC_L3_L1z, - ALLOC_L2_L1z, - WRITE_L1_L1z, - - /* L1 Fault Path */ - READ_L2_L1f, - READ_L3_L1f, - ALLOC_DATA_L1f, - ALLOC_L3_L1f, - ALLOC_L2_L1f, - WRITE_L1_L1f, - -}; - -enum radix_offsets { 
- L1 = 0, - L2 = 1, - L3 = 2 -}; - - -static void read_cb(struct io_ret ret, void *param); -static void write_cb(struct io_ret ret, void *param); - -int vdi_read(vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param) -{ - struct io_req *req; - - if (!VALID_VADDR(vaddr)) return ERR_BAD_VADDR; - /* Every second line in the bottom-level radix tree is used to */ - /* store crc32 values etc. We shift the vadder here to achied this. */ - vaddr <<= 1; - - req = (struct io_req *)malloc(sizeof (struct io_req)); - if (req == NULL) return ERR_NOMEM; - - req->radix[0] = req->radix[1] = req->radix[2] = NULL; - req->op = IO_OP_READ; - req->root = vdi->radix_root; - req->lock = vdi->radix_lock; - req->vaddr = vaddr; - req->cb = cb; - req->param = param; - req->state = READ_LOCKED; - - block_rlock(req->lock, L1_IDX(vaddr), read_cb, req); - - return 0; -} - - -int vdi_write(vdi_t *vdi, u64 vaddr, char *block, - io_cb_t cb, void *param) -{ - struct io_req *req; - - if (!VALID_VADDR(vaddr)) return ERR_BAD_VADDR; - /* Every second line in the bottom-level radix tree is used to */ - /* store crc32 values etc. We shift the vadder here to achied this. */ - vaddr <<= 1; - - req = (struct io_req *)malloc(sizeof (struct io_req)); - if (req == NULL) return ERR_NOMEM; - - req->radix[0] = req->radix[1] = req->radix[2] = NULL; - req->op = IO_OP_WRITE; - req->root = vdi->radix_root; - req->lock = vdi->radix_lock; - req->vaddr = vaddr; - req->block = block; - /* Todo: add a pseodoheader to the block to include some location */ - /* information in the CRC as well. 
*/ - req->bi.crc = (u32) crc32(0L, Z_NULL, 0); - req->bi.crc = (u32) crc32(req->bi.crc, block, BLOCK_SIZE); - req->bi.unused = 0xdeadbeef; - - req->cb = cb; - req->param = param; - req->radix_addr[L1] = getid(req->root); /* for consistency */ - req->state = WRITE_LOCKED; - - block_wlock(req->lock, L1_IDX(vaddr), write_cb, req); - - - return 0; -} - -static void read_cb(struct io_ret ret, void *param) -{ - struct io_req *req = (struct io_req *)param; - radix_tree_node node; - u64 idx; - char *block; - void *req_param; - - DPRINTF("read_cb\n"); - /* get record */ - switch(req->state) { - - case READ_LOCKED: - - DPRINTF("READ_LOCKED\n"); - req->state = READ_L1; - block_read(getid(req->root), read_cb, req); - break; - - case READ_L1: /* block is the radix root */ - - DPRINTF("READ_L1\n"); - block = IO_BLOCK(ret); - if (block == NULL) goto fail; - node = (radix_tree_node) block; - idx = getid( node[L1_IDX(req->vaddr)] ); - free(block); - if ( idx == ZERO ) { - req->state = RETURN_ZERO; - block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req); - } else { - req->state = READ_L2; - block_read(idx, read_cb, req); - } - break; - - case READ_L2: - - DPRINTF("READ_L2\n"); - block = IO_BLOCK(ret); - if (block == NULL) goto fail; - node = (radix_tree_node) block; - idx = getid( node[L2_IDX(req->vaddr)] ); - free(block); - if ( idx == ZERO ) { - req->state = RETURN_ZERO; - block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req); - } else { - req->state = READ_L3; - block_read(idx, read_cb, req); - } - break; - - case READ_L3: - { - struct block_info *bi; - - DPRINTF("READ_L3\n"); - block = IO_BLOCK(ret); - if (block == NULL) goto fail; - node = (radix_tree_node) block; - idx = getid( node[L3_IDX(req->vaddr)] ); - bi = (struct block_info *) &node[L3_IDX(req->vaddr) + 1]; - req->bi = *bi; - free(block); - if ( idx == ZERO ) { - req->state = RETURN_ZERO; - block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req); - } else { - req->state = READ_DATA; - block_read(idx, 
read_cb, req); - } - break; - } - case READ_DATA: - { - u32 crc; - - DPRINTF("READ_DATA\n"); - block = IO_BLOCK(ret); - if (block == NULL) goto fail; - - /* crc check */ - crc = (u32) crc32(0L, Z_NULL, 0); - crc = (u32) crc32(crc, block, BLOCK_SIZE); - if (crc != req->bi.crc) { - /* TODO: add a retry loop here. */ - /* Do this after the cache is added -- make sure to */ - /* invalidate the bad page before reissuing the read. */ - - warn("Bad CRC on vaddr (%Lu:%d)\n", req->vaddr, req->bi.unused); -#ifdef PRINT_BADCRC_PAGES - { - int j; - for (j=0; j<BLOCK_SIZE; j++) { - if isprint(block[j]) { - printf("%c", block[j]); - } else { - printf("."); - } - if ((j % 64) == 0) printf("\n"); - } - } -#endif /* PRINT_BADCRC_PAGES */ - - /* fast and loose for the moment. */ - /* goto fail; */ - } - - req->retval = ret; - req->state = READ_UNLOCKED; - block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req); - break; - } - case READ_UNLOCKED: - { - struct io_ret r; - io_cb_t cb; - DPRINTF("READ_UNLOCKED\n"); - req_param = req->param; - r = req->retval; - cb = req->cb; - free(req); - cb(r, req_param); - break; - } - - case RETURN_ZERO: - { - struct io_ret r; - io_cb_t cb; - DPRINTF("RETURN_ZERO\n"); - req_param = req->param; - cb = req->cb; - free(req); - r.type = IO_BLOCK_T; - r.u.b = newblock(); - cb(r, req_param); - break; - } - - default: - DPRINTF("*** Write: Bad state! 
(%d) ***\n", req->state); - goto fail; - } - - return; - - fail: - { - struct io_ret r; - io_cb_t cb; - DPRINTF("asyn_read had a read error.\n"); - req_param = req->param; - r = ret; - cb = req->cb; - free(req); - cb(r, req_param); - } - - -} - -static void write_cb(struct io_ret r, void *param) -{ - struct io_req *req = (struct io_req *)param; - radix_tree_node node; - u64 a, addr; - void *req_param; - struct block_info *bi; - - switch(req->state) { - - case WRITE_LOCKED: - - DPRINTF("WRITE_LOCKED (%llu)\n", L1_IDX(req->vaddr)); - req->state = READ_L1; - block_read(getid(req->root), write_cb, req); - break; - - case READ_L1: /* block is the radix root */ - - DPRINTF("READ_L1\n"); - node = (radix_tree_node) IO_BLOCK(r); - if (node == NULL) goto fail; - a = node[L1_IDX(req->vaddr)]; - addr = getid(a); - - req->radix_addr[L2] = addr; - req->radix[L1] = node; - - if ( addr == ZERO ) { - /* L1 empty subtree: */ - req->state = ALLOC_DATA_L1z; - block_alloc( req->block, write_cb, req ); - } else if ( !iswritable(a) ) { - /* L1 fault: */ - req->state = READ_L2_L1f; - block_read( addr, write_cb, req ); - } else { - req->state = READ_L2; - block_read( addr, write_cb, req ); - } - break; - - case READ_L2: - - DPRINTF("READ_L2\n"); - node = (radix_tree_node) IO_BLOCK(r); - if (node == NULL) goto fail; - a = node[L2_IDX(req->vaddr)]; - addr = getid(a); - - req->radix_addr[L3] = addr; - req->radix[L2] = node; - - if ( addr == ZERO ) { - /* L2 empty subtree: */ - req->state = ALLOC_DATA_L2z; - block_alloc( req->block, write_cb, req ); - } else if ( !iswritable(a) ) { - /* L2 fault: */ - req->state = READ_L3_L2f; - block_read( addr, write_cb, req ); - } else { - req->state = READ_L3; - block_read( addr, write_cb, req ); - } - break; - - case READ_L3: - - DPRINTF("READ_L3\n"); - node = (radix_tree_node) IO_BLOCK(r); - if (node == NULL) goto fail; - a = node[L3_IDX(req->vaddr)]; - addr = getid(a); - - req->radix[L3] = node; - - if ( addr == ZERO ) { - /* L3 fault: */ - req->state = 
ALLOC_DATA_L3z; - block_alloc( req->block, write_cb, req ); - } else if ( !iswritable(a) ) { - /* L3 fault: */ - req->state = ALLOC_DATA_L3f; - block_alloc( req->block, write_cb, req ); - } else { - req->state = WRITE_DATA; - block_write( addr, req->block, write_cb, req ); - } - break; - - case WRITE_DATA: - - DPRINTF("WRITE_DATA\n"); - /* The L3 radix points to the correct block, we just need to */ - /* update the crc. */ - if (IO_INT(r) < 0) goto fail; - bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1]; - req->bi.unused = 101; - *bi = req->bi; - req->state = WRITE_L3; - block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req); - break; - - /* L3 Zero Path: */ - - case ALLOC_DATA_L3z: - - DPRINTF("ALLOC_DATA_L3z\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L3][L3_IDX(req->vaddr)] = a; - bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1]; - req->bi.unused = 102; - *bi = req->bi; - req->state = WRITE_L3_L3z; - block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req); - break; - - /* L3 Fault Path: */ - - case ALLOC_DATA_L3f: - - DPRINTF("ALLOC_DATA_L3f\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L3][L3_IDX(req->vaddr)] = a; - bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1]; - req->bi.unused = 103; - *bi = req->bi; - req->state = WRITE_L3_L3f; - block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req); - break; - - /* L2 Zero Path: */ - - case ALLOC_DATA_L2z: - - DPRINTF("ALLOC_DATA_L2z\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L3] = newblock(); - req->radix[L3][L3_IDX(req->vaddr)] = a; - bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1]; - req->bi.unused = 104; - *bi = req->bi; - req->state = ALLOC_L3_L2z; - block_alloc( (char*)req->radix[L3], write_cb, req ); - break; - - case ALLOC_L3_L2z: - - DPRINTF("ALLOC_L3_L2z\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L2][L2_IDX(req->vaddr)] = a; - 
req->state = WRITE_L2_L2z; - block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req); - break; - - /* L2 Fault Path: */ - - case READ_L3_L2f: - - DPRINTF("READ_L3_L2f\n"); - node = (radix_tree_node) IO_BLOCK(r); - clear_L3_w_bits(node); - if (node == NULL) goto fail; - a = node[L2_IDX(req->vaddr)]; - addr = getid(a); - - req->radix[L3] = node; - req->state = ALLOC_DATA_L2f; - block_alloc( req->block, write_cb, req ); - break; - - case ALLOC_DATA_L2f: - - DPRINTF("ALLOC_DATA_L2f\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L3][L3_IDX(req->vaddr)] = a; - bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1]; - req->bi.unused = 105; - *bi = req->bi; - req->state = ALLOC_L3_L2f; - block_alloc( (char*)req->radix[L3], write_cb, req ); - break; - - case ALLOC_L3_L2f: - - DPRINTF("ALLOC_L3_L2f\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L2][L2_IDX(req->vaddr)] = a; - req->state = WRITE_L2_L2f; - block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req); - break; - - /* L1 Zero Path: */ - - case ALLOC_DATA_L1z: - - DPRINTF("ALLOC_DATA_L1z\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L3] = newblock(); - req->radix[L3][L3_IDX(req->vaddr)] = a; - bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1]; - req->bi.unused = 106; - *bi = req->bi; - req->state = ALLOC_L3_L1z; - block_alloc( (char*)req->radix[L3], write_cb, req ); - break; - - case ALLOC_L3_L1z: - - DPRINTF("ALLOC_L3_L1z\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L2] = newblock(); - req->radix[L2][L2_IDX(req->vaddr)] = a; - req->state = ALLOC_L2_L1z; - block_alloc( (char*)req->radix[L2], write_cb, req ); - break; - - case ALLOC_L2_L1z: - - DPRINTF("ALLOC_L2_L1z\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L1][L1_IDX(req->vaddr)] = a; - req->state = WRITE_L1_L1z; - block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req); - break; - - /* L1 Fault Path: */ - - case 
READ_L2_L1f: - - DPRINTF("READ_L2_L1f\n"); - node = (radix_tree_node) IO_BLOCK(r); - clear_w_bits(node); - if (node == NULL) goto fail; - a = node[L2_IDX(req->vaddr)]; - addr = getid(a); - - req->radix_addr[L3] = addr; - req->radix[L2] = node; - - if (addr == ZERO) { - /* nothing below L2, create an empty L3 and alloc data. */ - /* (So skip READ_L3_L1f.) */ - req->radix[L3] = newblock(); - req->state = ALLOC_DATA_L1f; - block_alloc( req->block, write_cb, req ); - } else { - req->state = READ_L3_L1f; - block_read( addr, write_cb, req ); - } - break; - - case READ_L3_L1f: - - DPRINTF("READ_L3_L1f\n"); - node = (radix_tree_node) IO_BLOCK(r); - clear_L3_w_bits(node); - if (node == NULL) goto fail; - a = node[L2_IDX(req->vaddr)]; - addr = getid(a); - - req->radix[L3] = node; - req->state = ALLOC_DATA_L1f; - block_alloc( req->block, write_cb, req ); - break; - - case ALLOC_DATA_L1f: - - DPRINTF("ALLOC_DATA_L1f\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L3][L3_IDX(req->vaddr)] = a; - bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1]; - req->bi.unused = 107; - *bi = req->bi; - req->state = ALLOC_L3_L1f; - block_alloc( (char*)req->radix[L3], write_cb, req ); - break; - - case ALLOC_L3_L1f: - - DPRINTF("ALLOC_L3_L1f\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L2][L2_IDX(req->vaddr)] = a; - req->state = ALLOC_L2_L1f; - block_alloc( (char*)req->radix[L2], write_cb, req ); - break; - - case ALLOC_L2_L1f: - - DPRINTF("ALLOC_L2_L1f\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L1][L1_IDX(req->vaddr)] = a; - req->state = WRITE_L1_L1f; - block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req); - break; - - case WRITE_L3: - case WRITE_L3_L3z: - case WRITE_L3_L3f: - case WRITE_L2_L2z: - case WRITE_L2_L2f: - case WRITE_L1_L1z: - case WRITE_L1_L1f: - { - int i; - DPRINTF("DONE\n"); - /* free any saved node vals. 
*/ - for (i=0; i<3; i++) - if (req->radix[i] != 0) free(req->radix[i]); - req->retval = r; - req->state = WRITE_UNLOCKED; - block_wunlock(req->lock, L1_IDX(req->vaddr), write_cb, req); - break; - } - case WRITE_UNLOCKED: - { - struct io_ret r; - io_cb_t cb; - DPRINTF("WRITE_UNLOCKED!\n"); - req_param = req->param; - r = req->retval; - cb = req->cb; - free(req); - cb(r, req_param); - break; - } - - default: - DPRINTF("*** Write: Bad state! (%d) ***\n", req->state); - goto fail; - } - - return; - - fail: - { - struct io_ret r; - io_cb_t cb; - int i; - - DPRINTF("asyn_write had a read error mid-way.\n"); - req_param = req->param; - cb = req->cb; - r.type = IO_INT_T; - r.u.i = -1; - /* free any saved node vals. */ - for (i=0; i<3; i++) - if (req->radix[i] != 0) free(req->radix[i]); - free(req); - cb(r, req_param); - } -} - -char *vdi_read_s(vdi_t *vdi, u64 vaddr) -{ - pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; - char *block = NULL; - int ret; - - void reads_cb(struct io_ret r, void *param) - { - block = IO_BLOCK(r); - pthread_mutex_unlock((pthread_mutex_t *)param); - } - - pthread_mutex_lock(&m); - ret = vdi_read(vdi, vaddr, reads_cb, &m); - - if (ret == 0) pthread_mutex_lock(&m); - - return block; -} - - -int vdi_write_s(vdi_t *vdi, u64 vaddr, char *block) -{ - pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; - int ret, result; - - void writes_cb(struct io_ret r, void *param) - { - result = IO_INT(r); - pthread_mutex_unlock((pthread_mutex_t *)param); - } - - pthread_mutex_lock(&m); - ret = vdi_write(vdi, vaddr, block, writes_cb, &m); - - if (ret == 0) pthread_mutex_lock(&m); - - return result; -} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/requests-async.h --- a/tools/blktap/requests-async.h Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,29 +0,0 @@ -#ifndef _REQUESTSASYNC_H_ -#define _REQUESTSASYNC_H_ - -#include "block-async.h" -#include "blockstore.h" /* for newblock etc. 
*/ - -/* -#define BLOCK_SIZE 4096 -#define ZERO 0ULL -#define getid(x) (((x)>>1)&0x7fffffffffffffffLLU) -#define iswritable(x) (((x) & 1LLU) != 0) -#define writable(x) (((x) << 1) | 1LLU) -#define readonly(x) ((u64)((x) << 1)) -*/ - -#define VADDR_MASK 0x0000000003ffffffLLU /* 26-bits = 256Gig */ -#define VALID_VADDR(x) (((x) & VADDR_MASK) == (x)) - -int vdi_read (vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param); -int vdi_write(vdi_t *vdi, u64 vaddr, char *block, io_cb_t cb, void *param); - -/* synchronous versions: */ -char *vdi_read_s (vdi_t *vdi, u64 vaddr); -int vdi_write_s(vdi_t *vdi, u64 vaddr, char *block); - -#define ERR_BAD_VADDR -1 -#define ERR_NOMEM -2 - -#endif //_REQUESTSASYNC_H_ diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/snaplog.c --- a/tools/blktap/snaplog.c Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,238 +0,0 @@ -/************************************************************************** - * - * snaplog.c - * - * Snapshot log on-disk data structure. - * - */ - - /* VDI histories are made from chains of snapshot logs. These logs record - * the (radix) root and timestamp of individual snapshots. - * - * creation of a new VDI involves 'forking' a snapshot log, by creating a - * new, empty log (in a new VDI) and parenting it off of a record in an - * existing snapshot log. - * - * snapshot log blocks have at most one writer. 
- */ - -#include <stdio.h> -#include <stdlib.h> -#include <sys/time.h> -#include "blockstore.h" -#include "snaplog.h" - - - -snap_block_t *snap_get_block(u64 block) -{ - snap_block_t *blk = (snap_block_t *)readblock(block); - - if ( blk == NULL) - return NULL; - if ( blk->hdr.magic != SNAP_MAGIC ) { - freeblock(blk); - return NULL; - } - - return blk; -} - -int snap_get_id(snap_id_t *id, snap_rec_t *target) -{ - snap_block_t *blk; - - if ( id == NULL ) - return -1; - - blk = snap_get_block(id->block); - - if ( blk == NULL ) - return -1; - - if ( id->index > blk->hdr.nr_entries ) { - freeblock(blk); - return -1; - } - - *target = blk->snaps[id->index]; - freeblock(blk); - return 0; -} - -int __snap_block_create(snap_id_t *parent_id, snap_id_t *fork_id, - snap_id_t *new_id) -{ - snap_rec_t parent_rec, fork_rec; - snap_block_t *blk, *pblk; - /* - if ( (parent_id != NULL) && (snap_get_id(parent_id, &parent_rec) != 0) ) - return -1; - - if ( (fork_id != NULL) && (snap_get_id(fork_id, &fork_rec) != 0) ) - return -1; -*/ - blk = (snap_block_t *)newblock(); - blk->hdr.magic = SNAP_MAGIC; - blk->hdr.nr_entries = 0; - blk->hdr.log_entries = 0; - blk->hdr.immutable = 0; - - if ( (parent_id != NULL) - && (parent_id->block != fork_id->block) - && (parent_id->block != 0)) { - - pblk = snap_get_block(parent_id->block); - blk->hdr.log_entries = pblk->hdr.log_entries; - freeblock(pblk); - } - - if (parent_id != NULL) { - blk->hdr.parent_block = *parent_id; - blk->hdr.fork_block = *fork_id; - } else { - blk->hdr.parent_block = null_snap_id; - blk->hdr.fork_block = null_snap_id; - } - - new_id->index = 0; - new_id->block = allocblock(blk); - freeblock(blk); - if (new_id->block == 0) - return -1; - - return 0; -} - -int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id) -{ - return __snap_block_create(parent_id, parent_id, new_id); -} - -int snap_append(snap_id_t *old_id, snap_rec_t *rec, snap_id_t *new_id) -{ - snap_id_t id = *old_id; - snap_block_t *blk = 
snap_get_block(id.block); - - if ( rec->deleted == 1 ) { - printf("Attempt to append a deleted snapshot!\n"); - return -1; - } - - if ( blk->hdr.immutable != 0 ) { - printf("Attempt to snap an immutable snap block!\n"); - return -1; - } - - new_id->block = id.block; - - if (blk->hdr.nr_entries == SNAPS_PER_BLOCK) { - int ret; - - id.index--; /* make id point to the last full record */ - - ret = __snap_block_create(&id, &blk->hdr.fork_block, new_id); - if ( ret != 0 ) { - freeblock(blk); - return -1; - } - - blk->hdr.immutable = 1; - writeblock(id.block, blk); - freeblock(blk); - blk = snap_get_block(new_id->block); - id = *new_id; - } - - blk->snaps[blk->hdr.nr_entries] = *rec; - blk->hdr.nr_entries++; - blk->hdr.log_entries++; - new_id->index = blk->hdr.nr_entries; - //printf("snap: %u %u\n", blk->hdr.nr_entries, blk->hdr.log_entries); - writeblock(id.block, blk); - freeblock(blk); - return 0; -} - -int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id) -{ - snap_block_t *p_blk, *c_blk, *blk; - snap_rec_t *p_rec, *c_rec; - int ret = -1; - - p_blk = snap_get_block(p_id->block); - - if (p_blk == NULL) return(-1); - - if (c_id->block == p_id->block) - { - c_blk = p_blk; - } else { - c_blk = snap_get_block(c_id->block); - } - - if (p_blk == NULL) { - freeblock(p_blk); - return(-1); - } - - /* parent and child must not be deleted. */ - p_rec = &p_blk->snaps[p_id->index]; - c_rec = &c_blk->snaps[c_id->index]; - /* - if ( (p_rec->deleted == 1) || (c_rec->deleted == 1) ) { - printf("One of those snaps is already deleted.\n"); - goto done; - } - */ - /* first non-deleted thing in the log before child must be parent. 
*/ - - /* XXX todo: text the range here for delete (and eventually fork) bits) */ - /* for now, snaps must be consecutive, on the same log page: */ - - if ((p_id->block != c_id->block) || (p_id->index != c_id->index-1)) - { - printf("Deleting non-consecutive snaps is not done yet.\n"); - goto done; - } - - /* mark parent as deleted XXX: may need to lock parent block here.*/ - p_rec->deleted = 1; - writeblock(p_id->block, p_blk); - - /* delete the parent */ - printf("collapse(%Ld, %Ld)\n", p_rec->radix_root, c_rec->radix_root); - ret = collapse(height, p_rec->radix_root, c_rec->radix_root); - - /* return the number of blocks reclaimed. */ - -done: - if (c_blk != p_blk) freeblock(c_blk); - freeblock(p_blk); - - return(ret); -} - -void snap_print_history(snap_id_t *snap_id) -{ - snap_id_t id = *snap_id; - unsigned int idx = id.index; - snap_block_t *new_blk, *blk = snap_get_block(id.block); - - while ( blk ) { - printf("[Snap block %Ld]:\n", id.block); - do { - printf(" %03u: root: %Ld ts: %ld.%ld\n", idx, - blk->snaps[idx].radix_root, - blk->snaps[idx].timestamp.tv_sec, - blk->snaps[idx].timestamp.tv_usec); - } while (idx-- != 0); - - id = blk->hdr.parent_block; - if (id.block != 0) { - new_blk = snap_get_block(id.block); - } - freeblock(blk); - blk = new_blk; - } -} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/snaplog.h --- a/tools/blktap/snaplog.h Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,61 +0,0 @@ -/************************************************************************** - * - * snaplog.h - * - * Snapshot log on-disk data structure. 
- * - */ - -#include "radix.h" -#include "blockstore.h" /* for BLOCK_SIZE */ - -#ifndef __SNAPLOG_H__ -#define __SNAPLOG_H__ - -typedef struct snap_id { - u64 block; - unsigned int index; -} snap_id_t; - -typedef struct snap_rec { - u64 radix_root; - struct timeval timestamp; - /* flags: */ - unsigned deleted:1; -} snap_rec_t; - - -int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id); -int snap_append(snap_id_t *id, snap_rec_t *rec, snap_id_t *new_id); -int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id); -void snap_print_history(snap_id_t *snap_id); -int snap_get_id(snap_id_t *id, snap_rec_t *target); - - -/* exported for vdi debugging */ -#define SNAP_MAGIC 0xff00ff0aa0ff00ffLL - -static const snap_id_t null_snap_id = { 0, 0 }; - -typedef struct snap_block_hdr { - u64 magic; - snap_id_t parent_block; /* parent block within this chain */ - snap_id_t fork_block; /* where this log was forked */ - unsigned log_entries; /* total entries since forking */ - unsigned short nr_entries; /* entries in snaps[] */ - unsigned short immutable; /* has this snap page become immutable? 
*/ -} snap_block_hdr_t; - - -#define SNAPS_PER_BLOCK \ - ((BLOCK_SIZE - sizeof(snap_block_hdr_t)) / sizeof(snap_rec_t)) - -typedef struct snap_block { - snap_block_hdr_t hdr; - snap_rec_t snaps[SNAPS_PER_BLOCK]; -} snap_block_t; - - -snap_block_t *snap_get_block(u64 block); - -#endif /* __SNAPLOG_H__ */ diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi.c --- a/tools/blktap/vdi.c Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,367 +0,0 @@ -/************************************************************************** - * - * vdi.c - * - * Virtual Disk Image (VDI) Interfaces - * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <fcntl.h> -#include <string.h> -#include <sys/time.h> -#include <pthread.h> -#include "blockstore.h" -#include "block-async.h" -#include "requests-async.h" -#include "radix.h" -#include "vdi.h" - -#define VDI_REG_BLOCK 2LL -#define VDI_RADIX_ROOT writable(3) - -#if 0 -#define DPRINTF(_f, _a...) printf ( _f , ## _a ) -#else -#define DPRINTF(_f, _a...) ((void)0) -#endif - -/* I haven't decided about this registry stuff, so this is just a really - * quick lash-up so that there is some way to track VDIs. - * - * (Most vdi access should be with a direct handle to the block, so this - * registry is just for start-of-day lookup and other control operations.) - */ - -vdi_registry_t *create_vdi_registry(void) -{ - vdi_registry_t *reg = (vdi_registry_t *)newblock(); - - if (reg == NULL) - return NULL; - - /* zero-fill the vdi radix root while we have an empty block. 
*/ - writeblock(VDI_RADIX_ROOT, (void *)reg); - - - DPRINTF("[vdi.c] Creating VDI registry!\n"); - reg->magic = VDI_REG_MAGIC; - reg->nr_vdis = 0; - - writeblock(VDI_REG_BLOCK, (void *)reg); - - return reg; -} - -vdi_registry_t *get_vdi_registry(void) -{ - vdi_registry_t *vdi_reg = (vdi_registry_t *)readblock(VDI_REG_BLOCK); - - if ( vdi_reg == NULL ) - vdi_reg = create_vdi_registry(); - - if ( vdi_reg->magic != VDI_REG_MAGIC ) { - freeblock(vdi_reg); - return NULL; - } - - return vdi_reg; -} - - -vdi_t *vdi_create(snap_id_t *parent_snap, char *name) -{ - int ret; - vdi_t *vdi; - vdi_registry_t *vdi_reg; - snap_rec_t snap_rec; - - /* create a vdi struct */ - vdi = newblock(); - if (vdi == NULL) - return NULL; - - if ( snap_get_id(parent_snap, &snap_rec) == 0 ) { - vdi->radix_root = snapshot(snap_rec.radix_root); - } else { - vdi->radix_root = allocblock((void *)vdi); /* vdi is just zeros here */ - vdi->radix_root = writable(vdi->radix_root); /* grr. */ - } - - /* create a snapshot log, and add it to the vdi struct */ - - ret = snap_block_create(parent_snap, &vdi->snap); - if ( ret != 0 ) { - DPRINTF("Error getting snap block in vdi_create.\n"); - freeblock(vdi); - return NULL; - } - - /* append the vdi to the registry, fill block and id. */ - /* implicit allocation means we have to write the vdi twice here. 
*/ - vdi_reg = get_vdi_registry(); - if ( vdi_reg == NULL ) { - freeblock(vdi); - return NULL; - } - - vdi->block = allocblock((void *)vdi); - vdi->id = vdi_reg->nr_vdis++; - strncpy(vdi->name, name, VDI_NAME_SZ); - vdi->name[VDI_NAME_SZ] = '\0'; - vdi->radix_lock = NULL; /* for tidiness */ - writeblock(vdi->block, (void *)vdi); - - update(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi->id, vdi->block); - writeblock(VDI_REG_BLOCK, (void *)vdi_reg); - freeblock(vdi_reg); - - vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock)); - if (vdi->radix_lock == NULL) - { - perror("couldn't malloc radix_lock for new vdi!"); - freeblock(vdi); - return NULL; - } - radix_lock_init(vdi->radix_lock); - - return vdi; -} - -/* vdi_get and vdi_put currently act more like alloc/free -- they don't - * do refcount-based allocation. - */ -vdi_t *vdi_get(u64 vdi_id) -{ - u64 vdi_blk; - vdi_t *vdi; - - vdi_blk = lookup(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi_id); - - if ( vdi_blk == 0 ) - return NULL; - - vdi = (vdi_t *)readblock(vdi_blk); - - vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock)); - if (vdi->radix_lock == NULL) - { - perror("couldn't malloc radix_lock for new vdi!"); - freeblock(vdi); - return NULL; - } - radix_lock_init(vdi->radix_lock); - - return vdi; -} - -void vdi_put(vdi_t *vdi) -{ - free(vdi->radix_lock); - freeblock(vdi); -} - -void vdi_snapshot(vdi_t *vdi) -{ - snap_rec_t rec; - int ret; - - rec.radix_root = vdi->radix_root; - gettimeofday(&rec.timestamp, NULL); - rec.deleted = 0; - - vdi->radix_root = snapshot(vdi->radix_root); - ret = snap_append(&vdi->snap, &rec, &vdi->snap); - if ( ret != 0 ) { - printf("snap_append returned failure\n"); - return; - } - writeblock(vdi->block, vdi); -} - -int __init_vdi() -{ - /* sneak this in here for the moment. */ - __rcache_init(); - - /* force the registry to be created if it doesn't exist. 
*/ - vdi_registry_t *vdi_reg = get_vdi_registry(); - if (vdi_reg == NULL) { - printf("[vdi.c] Couldn't get/create a VDI registry!\n"); - return -1; - } - freeblock(vdi_reg); - - - return 0; -} - -#ifdef VDI_STANDALONE - -#define TEST_VDIS 50 -#define NR_ITERS 50000 -#define FORK_POINTS 200 -#define INIT_VDIS 3 -#define INIT_SNAPS 40 - -/* These must be of decreasing size: */ -#define NEW_FORK (RAND_MAX-(RAND_MAX/1000)) -#define NEW_ROOT_VDI (RAND_MAX-((RAND_MAX/1000)*2)) -#define NEW_FORK_VDI (RAND_MAX-((RAND_MAX/1000)*3)) - -#define GRAPH_DOT_FILE "vdi.dot" -#define GRAPH_PS_FILE "vdi.ps" - - -typedef struct sh_st { - snap_id_t id; - struct sh_st *next; -} sh_t; - -#define SNAP_HASHSZ 1024 -sh_t *node_hash[SNAP_HASHSZ]; -#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ) - -#define SNAPID_EQUAL(_a,_b) \ - (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index)) -int sh_check_and_add(snap_id_t *id) -{ - sh_t **s = &node_hash[SNAP_HASH(id)]; - - while (*s != NULL) { - if (SNAPID_EQUAL(&((*s)->id), id)) - return 1; - *s = (*s)->next; - } - - *s = (sh_t *)malloc(sizeof(sh_t)); - (*s)->id = *id; - (*s)->next = NULL; - - return 0; -} - -int main(int argc, char *argv[]) -{ - vdi_t *vdi_list[TEST_VDIS]; - snap_id_t id, fork_points[FORK_POINTS]; - int nr_vdis = 0, nr_forks = 0; - int i, j, r; - FILE *f; - char name[VDI_NAME_SZ]; - - __init_blockstore(); - __init_vdi(); - - printf("[o] Generating seed VDIs. (%d VDIs)\n", INIT_VDIS); - - for (i=0; i<INIT_VDIS; i++) { - r=rand(); - - sprintf(name, "VDI Number %d", nr_vdis); - vdi_list[i] = vdi_create(NULL, name); - for (j=0; j<(r%INIT_SNAPS); j++) - vdi_snapshot(vdi_list[i]); - fork_points[i] = vdi_list[i]->snap; - nr_vdis++; - nr_forks++; - } - - printf("[o] Running a random workload. 
(%d iterations)\n", NR_ITERS); - - for (i=0; i<NR_ITERS; i++) { - r = rand(); - - if ( r > NEW_FORK ) { - if ( nr_forks > FORK_POINTS ) - continue; - id = vdi_list[r%nr_vdis]->snap; - if ( ( id.block == 0 ) || ( id.index == 0 ) ) - continue; - id.index--; - fork_points[nr_forks++] = id; - - } else if ( r > NEW_ROOT_VDI ) { - - if ( nr_vdis == TEST_VDIS ) - continue; - - sprintf(name, "VDI Number %d.", nr_vdis); - vdi_list[nr_vdis++] = vdi_create(NULL, name); - - } else if ( r > NEW_FORK_VDI ) { - - if ( nr_vdis == TEST_VDIS ) - continue; - - sprintf(name, "VDI Number %d.", nr_vdis); - vdi_list[nr_vdis++] = vdi_create(&fork_points[r%nr_forks], name); - - } else /* SNAPSHOT */ { - - vdi_snapshot(vdi_list[r%nr_vdis]); - - } - } - - /* now dump it out to a dot file. */ - printf("[o] Dumping state to a dot graph. (%d VDIs)\n", nr_vdis); - - f = fopen(GRAPH_DOT_FILE, "w"); - - /* write graph preamble */ - fprintf(f, "digraph G {\n"); - fprintf(f, " rankdir=LR\n"); - - for (i=0; i<nr_vdis; i++) { - char oldnode[255]; - snap_block_t *blk; - snap_id_t id = vdi_list[i]->snap; - int nr_snaps, done=0; - - /* add a node for the id */ -printf("vdi: %d\n", i); - fprintf(f, " n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n", - id.block, id.index, vdi_list[i]->name, - id.block, id.index); - sprintf(oldnode, "n%Ld%d", id.block, id.index); - - while (id.block != 0) { - blk = snap_get_block(id.block); - nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index); - id = blk->hdr.fork_block; - - done = sh_check_and_add(&id); - - /* add a node for the fork_id */ - if (!done) { - fprintf(f, " n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n", - id.block, id.index, - id.block, id.index); - } - - /* add an edge between them */ - fprintf(f, " n%Ld%d -> %s [label=\"%u snapshots\"]\n", - id.block, id.index, oldnode, nr_snaps); - sprintf(oldnode, "n%Ld%d", id.block, id.index); - freeblock(blk); - - if (done) break; - } - } - - /* write graph postamble */ - fprintf(f, "}\n"); 
- fclose(f); - - printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE); - { - char cmd[255]; - sprintf(cmd, "dot %s -Tps -o %s", GRAPH_DOT_FILE, GRAPH_PS_FILE); - system(cmd); - } - return 0; -} - -#endif diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi.h --- a/tools/blktap/vdi.h Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,55 +0,0 @@ -#ifndef _VDI_H_ -#define _VDI_H_ -/************************************************************************** - * - * vdi.h - * - * Virtual Disk Image (VDI) Interfaces - * - */ - -#ifndef __VDI_H__ -#define __VDI_H__ - -#include "blktaplib.h" -#include "snaplog.h" - -#define VDI_HEIGHT 27 /* Note that these are now hard-coded */ -#define VDI_REG_HEIGHT 27 /* in the async lookup code */ - -#define VDI_NAME_SZ 256 - - -typedef struct vdi { - u64 id; /* unique vdi id -- used by the registry */ - u64 block; /* block where this vdi lives (also unique)*/ - u64 radix_root; /* radix root node for block mappings */ - snap_id_t snap; /* next snapshot slot for this VDI */ - struct vdi *next; /* used to hash-chain in blkif. */ - blkif_vdev_t vdevice; /* currently mounted as... 
*/ - struct radix_lock *radix_lock;/* per-line L1 RW lock for parallel reqs */ - char name[VDI_NAME_SZ];/* human readable vdi name */ -} vdi_t; - -#define VDI_REG_MAGIC 0xff00ff0bb0ff00ffLL - -typedef struct vdi_registry { - u64 magic; - u64 nr_vdis; -} vdi_registry_t; - - -int __init_vdi(void); - -vdi_t *vdi_get(u64 vdi_id); -void vdi_put(vdi_t *vdi); -vdi_registry_t *get_vdi_registry(void); -vdi_t *vdi_create(snap_id_t *parent_snap, char *name); -u64 vdi_lookup_block(vdi_t *vdi, u64 vdi_block, int *writable); -void vdi_update_block(vdi_t *vdi, u64 vdi_block, u64 g_block); -void vdi_snapshot(vdi_t *vdi); - - -#endif /* __VDI_H__ */ - -#endif //_VDI_H_ diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_create.c --- a/tools/blktap/vdi_create.c Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,52 +0,0 @@ -/************************************************************************** - * - * vdi_create.c - * - * Create a new vdi. - * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/time.h> -#include "blockstore.h" -#include "radix.h" -#include "vdi.h" - -int main(int argc, char *argv[]) -{ - vdi_t *vdi; - char name[VDI_NAME_SZ] = ""; - snap_id_t id; - int from_snap = 0; - - __init_blockstore(); - __init_vdi(); - - if ( argc == 1 ) { - printf("usage: %s <VDI Name> [<snap block> <snap idx>]\n", argv[0]); - exit(-1); - } - - strncpy( name, argv[1], VDI_NAME_SZ); - name[VDI_NAME_SZ] = '\0'; - - if ( argc > 3 ) { - id.block = (u64) atoll(argv[2]); - id.index = (unsigned int) atol (argv[3]); - from_snap = 1; - } - - vdi = vdi_create( from_snap ? 
&id : NULL, name); - - if ( vdi == NULL ) { - printf("Failed to create VDI!\n"); - freeblock(vdi); - exit(-1); - } - - freeblock(vdi); - - return (0); -} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_fill.c --- a/tools/blktap/vdi_fill.c Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,81 +0,0 @@ -/************************************************************************** - * - * vdi_fill.c - * - * Hoover a file or device into a vdi. - * You must first create the vdi with vdi_create. - * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> -#include "blockstore.h" -#include "radix.h" -#include "requests-async.h" -#include "vdi.h" - -int main(int argc, char *argv[]) -{ - vdi_t *vdi; - u64 id; - int fd; - struct stat st; - u64 tot_size; - char spage[BLOCK_SIZE]; - char *dpage; - u64 vblock = 0, count=0; - - __init_blockstore(); - init_block_async(); - __init_vdi(); - - if ( argc < 3 ) { - printf("usage: %s <VDI id> <filename>\n", argv[0]); - exit(-1); - } - - id = (u64) atoll(argv[1]); - - vdi = vdi_get( id ); - - if ( vdi == NULL ) { - printf("Failed to retreive VDI %Ld!\n", id); - exit(-1); - } - - fd = open(argv[2], O_RDONLY | O_LARGEFILE); - - if (fd < 0) { - printf("Couldn't open %s!\n", argv[2]); - exit(-1); - } - - if ( fstat(fd, &st) != 0 ) { - printf("Couldn't stat %s!\n", argv[2]); - exit(-1); - } - - tot_size = (u64) st.st_size; - printf("Filling VDI %Ld with %Ld bytes.\n", id, tot_size); - - printf("%011Ld blocks total\n", tot_size / BLOCK_SIZE); - printf(" "); - while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) { - vdi_write_s(vdi, vblock, spage); - - vblock++; - if ((vblock % 512) == 0) - printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock); - fflush(stdout); - } - printf("\n"); - - freeblock(vdi); - - return (0); -} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_list.c --- a/tools/blktap/vdi_list.c Sun Jul 3 22:32:52 
2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,47 +0,0 @@ -/************************************************************************** - * - * vdi_list.c - * - * Print a list of VDIs on the block store. - * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/time.h> -#include "blockstore.h" -#include "radix.h" -#include "vdi.h" - -int main(int argc, char *argv[]) -{ - vdi_registry_t *reg; - vdi_t *vdi; - int i; - - __init_blockstore(); - __init_vdi(); - - reg = get_vdi_registry(); - - if ( reg == NULL ) { - printf("couldn't get VDI registry.\n"); - exit(-1); - } - - for (i=0; i < reg->nr_vdis; i++) { - vdi = vdi_get(i); - - if ( vdi != NULL ) { - - printf("%10Ld %60s\n", vdi->id, vdi->name); - freeblock(vdi); - - } - } - - freeblock(reg); - - return 0; -} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_snap.c --- a/tools/blktap/vdi_snap.c Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,43 +0,0 @@ -/************************************************************************** - * - * vdi_snap.c - * - * Snapshot a vdi. - * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/time.h> -#include "blockstore.h" -#include "radix.h" -#include "vdi.h" - -int main(int argc, char *argv[]) -{ - vdi_t *vdi; - u64 id; - - __init_blockstore(); - __init_vdi(); - - if ( argc == 1 ) { - printf("usage: %s <VDI id>\n", argv[0]); - exit(-1); - } - - id = (u64) atoll(argv[1]); - - vdi = vdi_get(id); - - if ( vdi == NULL ) { - printf("couldn't find the requested VDI.\n"); - freeblock(vdi); - exit(-1); - } - - vdi_snapshot(vdi); - - return 0; -} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_snap_delete.c --- a/tools/blktap/vdi_snap_delete.c Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,48 +0,0 @@ -/************************************************************************** - * - * vdi_snap_delete.c - * - * Delete a snapshot. 
- * - * This is not finished: right now it takes a snap n and calls - * snap_collapse(n,n+1). - * - * TODO: support for non-consecutive, non-same-block snaps - * Avoid forking probs. - * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/time.h> -#include "blockstore.h" -#include "snaplog.h" -#include "radix.h" -#include "vdi.h" - -int main(int argc, char *argv[]) -{ - snap_id_t id, c_id; - int ret; - - __init_blockstore(); - __init_vdi(); - - if ( argc != 3 ) { - printf("usage: %s <snap block> <snap idx>\n", argv[0]); - exit(-1); - } - - id.block = (u64) atoll(argv[1]); - id.index = (unsigned int) atol (argv[2]); - - c_id = id; - c_id.index++; - - ret = snap_collapse(VDI_HEIGHT, &id, &c_id); - - printf("Freed %d blocks.\n", ret); - - return 0; -} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_snap_list.c --- a/tools/blktap/vdi_snap_list.c Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,82 +0,0 @@ -/************************************************************************** - * - * vdi_snap_list.c - * - * Print a list of snapshots for the specified vdi. 
- * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <time.h> -#include <sys/time.h> -#include "blockstore.h" -#include "radix.h" -#include "vdi.h" - -int main(int argc, char *argv[]) -{ - vdi_t *vdi; - u64 id; - int i, max_snaps = -1; - snap_block_t *blk; - snap_id_t sid; - char *t; - - __init_blockstore(); - __init_vdi(); - - if ( argc == 1 ) { - printf("usage: %s <VDI id> [max snaps]\n", argv[0]); - exit(-1); - } - - id = (u64) atoll(argv[1]); - - if ( argc > 2 ) { - max_snaps = atoi(argv[2]); - } - - vdi = vdi_get(id); - - if ( vdi == NULL ) { - printf("couldn't find the requested VDI.\n"); - freeblock(vdi); - exit(-1); - } - - sid = vdi->snap; - sid.index--; - - //printf("%8s%4s%21s %12s %1s\n", "Block", "idx", "timestamp", - // "radix root", "d"); - printf("%8s%4s%37s %12s %1s\n", "Block", "idx", "timestamp", - "radix root", "d"); - - while (sid.block != 0) { - blk = snap_get_block(sid.block); - for (i = sid.index; i >= 0; i--) { - if ( max_snaps == 0 ) { - freeblock(blk); - goto done; - } - t = ctime(&blk->snaps[i].timestamp.tv_sec); - t[strlen(t)-1] = '\0'; - //printf("%8Ld%4u%14lu.%06lu %12Ld %1s\n", - printf("%8Ld%4u%30s %06lu %12Ld %1s\n", - sid.block, i, - //blk->snaps[i].timestamp.tv_sec, - t, - blk->snaps[i].timestamp.tv_usec, - blk->snaps[i].radix_root, - blk->snaps[i].deleted ? "*" : " "); - if ( max_snaps != -1 ) - max_snaps--; - } - sid = blk->hdr.parent_block; - freeblock(blk); - } -done: - return 0; -} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_tree.c --- a/tools/blktap/vdi_tree.c Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,132 +0,0 @@ -/************************************************************************** - * - * vdi_tree.c - * - * Output current vdi tree to dot and postscript. 
- * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/time.h> -#include "blockstore.h" -#include "radix.h" -#include "vdi.h" - -#define GRAPH_DOT_FILE "vdi.dot" -#define GRAPH_PS_FILE "vdi.ps" - -typedef struct sh_st { - snap_id_t id; - struct sh_st *next; -} sh_t; - -#define SNAP_HASHSZ 1024 -sh_t *node_hash[SNAP_HASHSZ]; -#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ) - -#define SNAPID_EQUAL(_a,_b) \ - (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index)) -int sh_check_and_add(snap_id_t *id) -{ - sh_t **s = &node_hash[SNAP_HASH(id)]; - - while (*s != NULL) { - if (SNAPID_EQUAL(&((*s)->id), id)) - return 1; - *s = (*s)->next; - } - - *s = (sh_t *)malloc(sizeof(sh_t)); - (*s)->id = *id; - (*s)->next = NULL; - - return 0; -} - -int main(int argc, char *argv[]) -{ - FILE *f; - char dot_file[255] = GRAPH_DOT_FILE; - char ps_file[255] = GRAPH_PS_FILE; - int nr_vdis = 0, nr_forks = 0; - vdi_registry_t *reg; - vdi_t *vdi; - int i; - - __init_blockstore(); - __init_vdi(); - - reg = get_vdi_registry(); - - if ( reg == NULL ) { - printf("couldn't get VDI registry.\n"); - exit(-1); - } - - if ( argc > 1 ) { - strncpy(ps_file, argv[1], 255); - ps_file[255] = '\0'; - } - - /* now dump it out to a dot file. */ - printf("[o] Dumping state to a dot graph. 
(%d VDIs)\n", nr_vdis); - - f = fopen(dot_file, "w"); - - /* write graph preamble */ - fprintf(f, "digraph G {\n"); - fprintf(f, " rankdir=LR\n"); - - for (i=0; i<reg->nr_vdis; i++) { - char oldnode[255]; - snap_block_t *blk; - snap_id_t id; - int nr_snaps, done=0; - - vdi = vdi_get(i); - id = vdi->snap; - /* add a node for the id */ -printf("vdi: %d\n", i); - fprintf(f, " n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n", - id.block, id.index, vdi->name, - id.block, id.index); - sprintf(oldnode, "n%Ld%d", id.block, id.index); - - while (id.block != 0) { - blk = snap_get_block(id.block); - nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index); - id = blk->hdr.fork_block; - - done = sh_check_and_add(&id); - - /* add a node for the fork_id */ - if (!done) { - fprintf(f, " n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n", - id.block, id.index, - id.block, id.index); - } - - /* add an edge between them */ - fprintf(f, " n%Ld%d -> %s [label=\"%u snapshots\"]\n", - id.block, id.index, oldnode, nr_snaps); - sprintf(oldnode, "n%Ld%d", id.block, id.index); - freeblock(blk); - - if (done) break; - } - } - - /* write graph postamble */ - fprintf(f, "}\n"); - fclose(f); - - printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE); - { - char cmd[255]; - sprintf(cmd, "dot %s -Tps -o %s", dot_file, ps_file); - system(cmd); - } - return 0; -} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_unittest.c --- a/tools/blktap/vdi_unittest.c Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,184 +0,0 @@ -/************************************************************************** - * - * vdi_unittest.c - * - * Run a small test workload to ensure that data access through a vdi - * is (at least superficially) correct. 
- * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> -#include "requests-async.h" -#include "blockstore.h" -#include "radix.h" -#include "vdi.h" - -#define TEST_PAGES 32 -static char *zero_page; -static char pages[TEST_PAGES][BLOCK_SIZE]; -static int next_page = 0; - -void fill_test_pages(void) -{ - int i, j; - long *page; - - for (i=0; i< TEST_PAGES; i++) { - page = (unsigned long *)pages[i]; - for (j=0; j<(BLOCK_SIZE/4); j++) { - page[j] = random(); - } - } - - zero_page = newblock(); -} - -inline u64 make_vaddr(u64 L1, u64 L2, u64 L3) -{ - u64 ret = L1; - - ret = (ret << 9) | L2; - ret = (ret << 9) | L3; - - return ret; -} - -void touch_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3) -{ - u64 vaddr; - char *page = pages[next_page++]; - char *rpage = NULL; - - printf("TOUCH (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3); - - vaddr = make_vaddr(L1, L2, L3); - vdi_write_s(vdi, vaddr, page); - rpage = vdi_read_s(vdi, vaddr); - - if (rpage == NULL) - { - printf( "read %Lu returned NULL\n", vaddr); - return; - } - - if (memcmp(page, rpage, BLOCK_SIZE) != 0) - { - printf( "read %Lu returned a different page\n", vaddr); - return; - } - - freeblock(rpage); -} - -void test_block(vdi_t *vdi, u64 L1, u64 L2, u64 L3, char *page) -{ - u64 vaddr; - char *rpage = NULL; - - printf("TEST (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3); - - vaddr = make_vaddr(L1, L2, L3); - rpage = vdi_read_s(vdi, vaddr); - - if (rpage == NULL) - { - printf( "read %Lu returned NULL\n", vaddr); - return; - } - - if (memcmp(page, rpage, BLOCK_SIZE) != 0) - { - printf( "read %Lu returned a different page\n", vaddr); - return; - } - - freeblock(rpage); -} - -void coverage_test(vdi_t *vdi) -{ - u64 vaddr; - int i, j, k; - - /* Do a series of writes and reads to test all paths through the - * async radix code. The radix request code will dump CRC warnings - * if there are data problems here as well. 
- */ - - /* L1 Zero */ - touch_block(vdi, 0, 0, 0); - - /* L2 Zero */ - i = next_page; - touch_block(vdi, 0, 1, 0); - - /* L3 Zero */ - j = next_page; - touch_block(vdi, 0, 0, 1); - k = next_page; - touch_block(vdi, 0, 1, 1); - - /* Direct write */ - touch_block(vdi, 0, 0, 0); - - vdi_snapshot(vdi); - - /* L1 fault */ - touch_block(vdi, 0, 0, 0); - /* test the read-only branches that should have been copied over. */ - test_block(vdi, 0, 1, 0, pages[i]); - test_block(vdi, 0, 0, 1, pages[j]); - - /* L2 fault */ - touch_block(vdi, 0, 1, 0); - test_block(vdi, 0, 1, 1, pages[k]); - - /* L3 fault */ - touch_block(vdi, 0, 0, 1); - - /* read - L1 zero */ - test_block(vdi, 1, 0, 0, zero_page); - - /* read - L2 zero */ - test_block(vdi, 0, 2, 0, zero_page); - - /* read - L3 zero */ - test_block(vdi, 0, 0, 2, zero_page); -} - -int main(int argc, char *argv[]) -{ - vdi_t *vdi; - u64 id; - int fd; - struct stat st; - u64 tot_size; - char spage[BLOCK_SIZE]; - char *dpage; - u64 vblock = 0, count=0; - - __init_blockstore(); - init_block_async(); - __init_vdi(); - - vdi = vdi_create( NULL, "UNIT TEST VDI"); - - if ( vdi == NULL ) { - printf("Failed to create VDI!\n"); - freeblock(vdi); - exit(-1); - } - - fill_test_pages(); - coverage_test(vdi); - - freeblock(vdi); - - return (0); -} diff -r 80d5dd14711e -r f8acd354e129 tools/blktap/vdi_validate.c --- a/tools/blktap/vdi_validate.c Sun Jul 3 22:32:52 2005 +++ /dev/null Sun Jul 3 22:36:48 2005 @@ -1,97 +0,0 @@ -/************************************************************************** - * - * vdi_validate.c - * - * Intended to sanity-check vm_fill and the underlying vdi code. - * - * Block-by-block compare of a vdi with a file/device on the disk. 
- * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> -#include "blockstore.h" -#include "radix.h" -#include "vdi.h" -#include "requests-async.h" - -int main(int argc, char *argv[]) -{ - vdi_t *vdi; - u64 id; - int fd; - struct stat st; - u64 tot_size; - char spage[BLOCK_SIZE], *dpage; - char *vpage; - u64 vblock = 0, count=0; - - __init_blockstore(); - init_block_async(); - __init_vdi(); - - if ( argc < 3 ) { - printf("usage: %s <VDI id> <filename>\n", argv[0]); - exit(-1); - } - - id = (u64) atoll(argv[1]); - - vdi = vdi_get( id ); - - if ( vdi == NULL ) { - printf("Failed to retreive VDI %Ld!\n", id); - exit(-1); - } - - fd = open(argv[2], O_RDONLY | O_LARGEFILE); - - if (fd < 0) { - printf("Couldn't open %s!\n", argv[2]); - exit(-1); - } - - if ( fstat(fd, &st) != 0 ) { - printf("Couldn't stat %s!\n", argv[2]); - exit(-1); - } - - tot_size = (u64) st.st_size; - printf("Testing VDI %Ld (%Ld bytes).\n", id, tot_size); - - printf(" "); - while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) { - - dpage = vdi_read_s(vdi, vblock); - - if (dpage == NULL) { - printf("\n\nfound an unmapped VDI block (%Ld)\n", vblock); - exit(0); - } - - if (memcmp(spage, dpage, BLOCK_SIZE) != 0) { - printf("\n\nblocks don't match! (%Ld)\n", vblock); - exit(0); - } - - freeblock(dpage); - - vblock++; - if ((vblock % 1024) == 0) { - printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock); - fflush(stdout); - } - } - printf("\n"); - - printf("VDI %Ld looks good!\n", id); - - freeblock(vdi); - - return (0); -} _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |