[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-changelog] [xen-unstable] Remove old blktap tools.
# HG changeset patch # User akw@xxxxxxxxxxxxxxxxxxxxx # Node ID 840f33e54054270e3f4b9704111ed52bd381653b # Parent 533bad7c0883189e26c2a7f43011801c417b01fe Remove old blktap tools. Signed-off-by: Andrew Warfield <andrew.warfield@xxxxxxxxxxxx> --- tools/blktap/Makefile | 93 -- tools/blktap/README | 137 --- tools/blktap/README.sept05 | 33 tools/blktap/blkdump.c | 62 - tools/blktap/blkif.c | 212 ----- tools/blktap/blktaplib.c | 453 ---------- tools/blktap/blktaplib.h | 171 ---- tools/blktap/list.h | 55 - tools/blktap/parallax/Makefile | 62 - tools/blktap/parallax/README | 171 ---- tools/blktap/parallax/block-async.c | 393 --------- tools/blktap/parallax/block-async.h | 69 - tools/blktap/parallax/blockstore.c | 1348 -------------------------------- tools/blktap/parallax/blockstore.h | 134 --- tools/blktap/parallax/blockstored.c | 275 ------ tools/blktap/parallax/bstest.c | 191 ---- tools/blktap/parallax/parallax.c | 608 -------------- tools/blktap/parallax/radix.c | 631 -------------- tools/blktap/parallax/radix.h | 45 - tools/blktap/parallax/requests-async.c | 762 ------------------ tools/blktap/parallax/requests-async.h | 29 tools/blktap/parallax/snaplog.c | 238 ----- tools/blktap/parallax/snaplog.h | 61 - tools/blktap/parallax/vdi.c | 367 -------- tools/blktap/parallax/vdi.h | 55 - tools/blktap/parallax/vdi_create.c | 52 - tools/blktap/parallax/vdi_fill.c | 81 - tools/blktap/parallax/vdi_list.c | 47 - tools/blktap/parallax/vdi_snap.c | 43 - tools/blktap/parallax/vdi_snap_delete.c | 48 - tools/blktap/parallax/vdi_snap_list.c | 82 - tools/blktap/parallax/vdi_tree.c | 132 --- tools/blktap/parallax/vdi_unittest.c | 184 ---- tools/blktap/parallax/vdi_validate.c | 97 -- tools/blktap/ublkback/Makefile | 40 tools/blktap/ublkback/ublkback.c | 18 tools/blktap/ublkback/ublkbacklib.c | 473 ----------- tools/blktap/ublkback/ublkbacklib.h | 16 tools/blktap/xenbus.c | 568 ------------- 39 files changed, 8536 deletions(-) diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/Makefile 
--- a/tools/blktap/Makefile Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,94 +0,0 @@ -MAJOR = 3.0 -MINOR = 0 -SONAME = libblktap.so.$(MAJOR) - -XEN_ROOT = ../.. -include $(XEN_ROOT)/tools/Rules.mk - -SUBDIRS := -SUBDIRS += ublkback -#SUBDIRS += parallax - -BLKTAP_INSTALL_DIR = /usr/sbin - -INSTALL = install -INSTALL_PROG = $(INSTALL) -m0755 -INSTALL_DIR = $(INSTALL) -d -m0755 - -INCLUDES += -I. -I $(XEN_LIBXC) -I $(XEN_XENSTORE) - -LIBS := -lpthread -lz - -SRCS := -SRCS += blktaplib.c xenbus.c blkif.c - -CFLAGS += -Werror -CFLAGS += -Wno-unused -CFLAGS += -fno-strict-aliasing -CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -# get asprintf(): -CFLAGS += -D _GNU_SOURCE -# Get gcc to generate the dependencies for us. -CFLAGS += -Wp,-MD,.$(@F).d -CFLAGS += $(INCLUDES) -DEPS = .*.d - -OBJS = $(patsubst %.c,%.o,$(SRCS)) -IBINS := -#IBINS += blkdump - -LIB = libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR) - -.PHONY: all -all: mk-symlinks libblktap.so #blkdump - @set -e; for subdir in $(SUBDIRS); do \ - $(MAKE) -C $$subdir $@; \ - done - -.PHONY: install -install: all - $(INSTALL_DIR) -p $(DESTDIR)/usr/$(LIBDIR) - $(INSTALL_DIR) -p $(DESTDIR)/usr/include - $(INSTALL_PROG) $(LIB) $(DESTDIR)/usr/$(LIBDIR) - $(INSTALL_PROG) blktaplib.h $(DESTDIR)/usr/include - #$(INSTALL_PROG) $(IBINS) $(DESTDIR)$(BLKTAP_INSTALL_DIR) - @set -e; for subdir in $(SUBDIRS); do \ - $(MAKE) -C $$subdir $@; \ - done - -.PHONY: clean -clean: - rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump - @set -e; for subdir in $(SUBDIRS); do \ - $(MAKE) -C $$subdir $@; \ - done - -.PHONY: rpm -rpm: all - rm -rf staging - mkdir staging - mkdir staging/i386 - rpmbuild --define "staging$$PWD/staging" --define '_builddir.' \ - --define "_rpmdir$$PWD/staging" -bb rpm.spec - mv staging/i386/*.rpm . 
- rm -rf staging - -libblktap.so: $(OBJS) - $(CC) $(CFLAGS) -Wl,-soname -Wl,$(SONAME) -shared \ - -L$(XEN_XENSTORE) -l xenstore \ - -o libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS) - ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR) - ln -sf libblktap.so.$(MAJOR) $@ - -blkdump: libblktap.so - $(CC) $(CFLAGS) -o blkdump -L$(XEN_LIBXC) -L. \ - -l blktap blkdump.c - -.PHONY: TAGS clean install mk-symlinks rpm - -.PHONY: TAGS -TAGS: - etags -t $(SRCS) *.h - --include $(DEPS) - diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/README --- a/tools/blktap/README Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,149 +0,0 @@ -Block Tap User-level Interfaces -Andrew Warfield -andrew.warfield@xxxxxxxxxxxx -February 8, 2005 - -NOTE #1: The blktap is _experimental_ code. It works for me. Your -mileage may vary. Don't use it for anything important. Please. ;) - -NOTE #2: All of the interfaces here are likely to change. This is all -early code, and I am checking it in because others want to play with -it. If you use it for anything, please let me know! - -Overview: ---------- - -This directory contains a library and set of example applications for -the block tap device. The block tap hooks into the split block device -interfaces above Xen allowing them to be extended. This extension can -be done in userspace with the help of a library. - -The tap can be installed either as an interposition domain in between -a frontend and backend driver pair, or as a terminating backend, in -which case it is responsible for serving all requests itself. - -There are two reasons that you might want to use the tap, -corresponding to these configurations: - - 1. To examine or modify a stream of block requests while they are - in-flight (e.g. to encrypt data, or add data-driven watchpoints) - - 2. To prototype a new backend driver, serving requests from the tap - rather than passing them along to the XenLinux blkback driver. - (e.g. 
to forward block requests to a remote host) - - -Interface: ----------- - -At the moment, the tap interface is similar in spirit to that of the -Linux netfilter. Requests are messages from a client (frontend) -domain to a disk (backend) domain. Responses are messages travelling -back, acknowledging the completion of a request. the library allows -chains of functions to be attached to these events. In addition, -hooks may be attached to handle control messages, which signify things -like connections from new domains. - -At present the control messages especially expose a lot of the -underlying driver interfaces. This may change in the future in order -to simplify writing hooks. - -Here are the public interfaces: - -These allow hook functions to be chained: - - void blktap_register_ctrl_hook(char *name, int (*ch)(control_msg_t *)); - void blktap_register_request_hook(char *name, int (*rh)(blkif_request_t *)); - void blktap_register_response_hook(char *name, int (*rh)(blkif_response_t *)); - -This allows a response to be injected, in the case where a request has -been removed using BLKTAP_STOLEN. - - void blktap_inject_response(blkif_response_t *); - -These let you add file descriptors and handlers to the main poll loop: - - int blktap_attach_poll(int fd, short events, int (*func)(int)); - void blktap_detach_poll(int fd); - -This starts the main poll loop: - - int blktap_listen(void); - -Example: --------- - -blkimage.c uses an image on the local file system to serve requests to -a domain. Here's what it looks like: - ----[blkimg.c]--- - -/* blkimg.c - * - * file-backed disk. - */ - -#include "blktaplib.h" -#include "blkimglib.h" - - -int main(int argc, char *argv[]) -{ - image_init(); - - blktap_register_ctrl_hook("image_control", image_control); - blktap_register_request_hook("image_request", image_request); - blktap_listen(); - - return 0; -} - ----------------- - -All of the real work is in blkimglib.c, but this illustrates the -actual tap interface well enough. 
image_control() will be called with -all control messages. image_request() handles requests. As it reads -from an on-disk image file, no requests are ever passed on to a -backend, and so there will be no responses to process -- so there is -nothing registered as a response hook. - -Other examples: ---------------- - -Here is a list of other examples in the directory: - -Things that terminate a block request stream: - - blkimg - Use a image file/device to serve requests - blkgnbd - Use a remote gnbd server to serve requests - blkaio - Use libaio... (DOES NOT WORK) - -Things that don't: - - blkdump - Print in-flight requests. - blkcow - Really inefficient copy-on-write disks using libdb to store - writes. - -There are examples of plugging these things together, for instance -blkcowgnbd is a read-only gnbd device with copy-on-write to a local -file. - -TODO: ------ - -- Make session tracking work. At the moment these generally just handle a - single front-end client at a time. - -- Integrate with Xend. Need to cleanly pass a image identifier in the connect - message. - -- Make an asynchronous file-io terminator. The libaio attempt is - tragically stalled because mapped foreign pages make pfn_valid fail - (they are VM_IO), and so cannot be passed to aio as targets. A - better solution may be to tear the disk interfaces out of the real - backend and expose them somehow. - -- Make CoW suck less. - -- Do something more along the lines of dynamic linking for the - plugins, so thatthey don't all need a new main(). diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/README.sept05 --- a/tools/blktap/README.sept05 Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -The blktap has been rewritten substantially based on the current -blkback driver. I've removed passthrough support, as this is broken -by the move to grant tables and the lack of transitive grants. A -blktap VM is now only capable of terminating block requests in -userspace. 
- -ublkback/ contains a _very_ initial cut at a user-level version of the block -backend driver. It gives a working example of how the current tap -interfaces are used, in particular w.r.t. the vbd directories in -xenstore. - -parallax/ contains fairly recent parallax code. This does not run on -the changed blktap interface, but should only be a couple of hours -work to get going again. - -All of the tricky bits are done, but there is plenty of cleaning to -do, and the top-level functionality is not here yet. At the moment, -the daemon ignores the pdev requested by the tools and opens the file -or device specified by TMP_IMAGE_FILE_NAME in ublkback.c. - -TODO: -1. Fix to allow pdev in the store to specify the device to open. -2. Add support (to tools as well) to mount arbitrary files... - just write the filename to mount into the store, instead of pdev. -3. Reeximine blkif refcounting, it is almost certainly broken at the moment. - - creating a blkif should take a reference. - - each inflight request should take a reference on dequeue in blktaplib - - sending responses should drop refs. - - blkif should be implicitly freed when refcounts fall to 0. -4. Modify the parallax req/rsp code as per ublkback to use the new tap - interfaces. -5. Write a front end that allows parallax and normal mounts to coexist -6. Allow blkback and blktap to run at the same time. diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/blkdump.c --- a/tools/blktap/blkdump.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ -/* blkdump.c - * - * show a running trace of block requests as they fly by. - * - * (c) 2004 Andrew Warfield. 
- */ - -#include <stdio.h> -#include "blktaplib.h" - -int request_print(blkif_request_t *req) -{ - int i; - - if ( (req->operation == BLKIF_OP_READ) || - (req->operation == BLKIF_OP_WRITE) ) - { - printf("[%2u:%2u<%5s] (nr_segs: %03u, dev: %03u, %010llu)\n", - ID_TO_DOM(req->id), ID_TO_IDX(req->id), - blkif_op_name[req->operation], - req->nr_segments, req->handle, - req->sector_number); - - - for (i=0; i < req->nr_segments; i++) { - printf(" (gref: 0x%8x start: %u stop: %u)\n", - req->seg[i].gref, - req->seg[i].first_sect, - req->seg[i].last_sect); - } - - } else { - printf("Unknown request message type.\n"); - } - - return BLKTAP_PASS; -} - -int response_print(blkif_response_t *rsp) -{ - if ( (rsp->operation == BLKIF_OP_READ) || - (rsp->operation == BLKIF_OP_WRITE) ) - { - printf("[%2u:%2u>%5s] (status: %d)\n", - ID_TO_DOM(rsp->id), ID_TO_IDX(rsp->id), - blkif_op_name[rsp->operation], - rsp->status); - - } else { - printf("Unknown request message type.\n"); - } - return BLKTAP_PASS; -} - -int main(int argc, char *argv[]) -{ - blktap_register_request_hook("request_print", request_print); - blktap_register_response_hook("response_print", response_print); - blktap_listen(); - - return 0; -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/blkif.c --- a/tools/blktap/blkif.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,212 +0,0 @@ -/* - * blkif.c - * - * The blkif interface for blktap. A blkif describes an in-use virtual disk. - */ - -#include <stdio.h> -#include <stdlib.h> -#include <errno.h> -#include <string.h> -#include <err.h> - -#include "blktaplib.h" - -#if 1 -#define DPRINTF(_f, _a...) printf ( _f , ## _a ) -#else -#define DPRINTF(_f, _a...) 
((void)0) -#endif - -#define BLKIF_HASHSZ 1024 -#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1)) - -static blkif_t *blkif_hash[BLKIF_HASHSZ]; - -blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle) -{ - blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)]; - while ( (blkif != NULL) && - ((blkif->domid != domid) || (blkif->handle != handle)) ) - blkif = blkif->hash_next; - return blkif; -} - -blkif_t *alloc_blkif(domid_t domid) -{ - blkif_t *blkif; - - blkif = (blkif_t *)malloc(sizeof(blkif_t)); - if (!blkif) - return NULL; - - memset(blkif, 0, sizeof(*blkif)); - blkif->domid = domid; - - return blkif; -} - -static int (*new_blkif_hook)(blkif_t *blkif) = NULL; -void register_new_blkif_hook(int (*fn)(blkif_t *blkif)) -{ - new_blkif_hook = fn; -} - -int blkif_init(blkif_t *blkif, long int handle, long int pdev, - long int readonly) -{ - domid_t domid; - blkif_t **pblkif; - - if (blkif == NULL) - return -EINVAL; - - domid = blkif->domid; - blkif->handle = handle; - blkif->pdev = pdev; - blkif->readonly = readonly; - - /* - * Call out to the new_blkif_hook. The tap application should define this, - * and it should return having set blkif->ops - * - */ - if (new_blkif_hook == NULL) - { - warn("Probe detected a new blkif, but no new_blkif_hook!"); - return -1; - } - new_blkif_hook(blkif); - - /* Now wire it in. 
*/ - pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; - while ( *pblkif != NULL ) - { - if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) ) - { - DPRINTF("Could not create blkif: already exists\n"); - return -1; - } - pblkif = &(*pblkif)->hash_next; - } - blkif->hash_next = NULL; - *pblkif = blkif; - - return 0; -} - -void free_blkif(blkif_t *blkif) -{ - blkif_t **pblkif, *curs; - - pblkif = &blkif_hash[BLKIF_HASH(blkif->domid, blkif->handle)]; - while ( (curs = *pblkif) != NULL ) - { - if ( blkif == curs ) - { - *pblkif = curs->hash_next; - } - pblkif = &curs->hash_next; - } - free(blkif); -} - -void blkif_register_request_hook(blkif_t *blkif, char *name, - int (*rh)(blkif_t *, blkif_request_t *, int)) -{ - request_hook_t *rh_ent, **c; - - rh_ent = (request_hook_t *)malloc(sizeof(request_hook_t)); - if (!rh_ent) - { - warn("couldn't allocate a new hook"); - return; - } - - rh_ent->func = rh; - rh_ent->next = NULL; - if (asprintf(&rh_ent->name, "%s", name) == -1) - { - free(rh_ent); - warn("couldn't allocate a new hook name"); - return; - } - - c = &blkif->request_hook_chain; - while (*c != NULL) { - c = &(*c)->next; - } - *c = rh_ent; -} - -void blkif_register_response_hook(blkif_t *blkif, char *name, - int (*rh)(blkif_t *, blkif_response_t *, int)) -{ - response_hook_t *rh_ent, **c; - - rh_ent = (response_hook_t *)malloc(sizeof(response_hook_t)); - if (!rh_ent) - { - warn("couldn't allocate a new hook"); - return; - } - - rh_ent->func = rh; - rh_ent->next = NULL; - if (asprintf(&rh_ent->name, "%s", name) == -1) - { - free(rh_ent); - warn("couldn't allocate a new hook name"); - return; - } - - c = &blkif->response_hook_chain; - while (*c != NULL) { - c = &(*c)->next; - } - *c = rh_ent; -} - -void blkif_print_hooks(blkif_t *blkif) -{ - request_hook_t *req_hook; - response_hook_t *rsp_hook; - - DPRINTF("Request Hooks:\n"); - req_hook = blkif->request_hook_chain; - while (req_hook != NULL) - { - DPRINTF(" [0x%p] %s\n", req_hook->func, req_hook->name); 
- req_hook = req_hook->next; - } - - DPRINTF("Response Hooks:\n"); - rsp_hook = blkif->response_hook_chain; - while (rsp_hook != NULL) - { - DPRINTF(" [0x%p] %s\n", rsp_hook->func, rsp_hook->name); - rsp_hook = rsp_hook->next; - } -} - - -long int vbd_size(blkif_t *blkif) -{ - return 1000000000; -} - -long int vbd_secsize(blkif_t *blkif) -{ - return 512; -} - -unsigned vbd_info(blkif_t *blkif) -{ - return 0; -} - - -void __init_blkif(void) -{ - memset(blkif_hash, 0, sizeof(blkif_hash)); -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/blktaplib.c --- a/tools/blktap/blktaplib.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,453 +0,0 @@ -/* - * blktaplib.c - * - * userspace interface routines for the blktap driver. - * - * (threadsafe(r) version) - * - * (c) 2004 Andrew Warfield. - */ - -#include <stdio.h> -#include <stdlib.h> -#include <sys/mman.h> -#include <sys/user.h> -#include <err.h> -#include <errno.h> -#include <sys/types.h> -#include <linux/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <signal.h> -#include <sys/poll.h> -#include <sys/ioctl.h> -#include <string.h> -#include <unistd.h> -#include <pthread.h> -#include <xs.h> - -#define __COMPILING_BLKTAP_LIB -#include "blktaplib.h" - -#if 0 -#define DPRINTF(_f, _a...) printf ( _f , ## _a ) -#else -#define DPRINTF(_f, _a...) ((void)0) -#endif -#define DEBUG_RING_IDXS 0 - -#define POLLRDNORM 0x040 - -#define BLKTAP_IOCTL_KICK 1 - - -void got_sig_bus(); -void got_sig_int(); - -/* in kernel these are opposite, but we are a consumer now. 
*/ -blkif_back_ring_t fe_ring; /* slightly counterintuitive ;) */ -blkif_front_ring_t be_ring; - -unsigned long mmap_vstart = 0; -char *blktap_mem; -int fd = 0; - -#define BLKTAP_RING_PAGES 1 /* Front */ -#define BLKTAP_MMAP_REGION_SIZE (BLKTAP_RING_PAGES + MMAP_PAGES) - -int bad_count = 0; -void bad(void) -{ - bad_count ++; - if (bad_count > 50) exit(0); -} -/*-----[ ID Manipulation from tap driver code ]--------------------------*/ - -#define ACTIVE_RING_IDX unsigned short - -inline unsigned long MAKE_ID(domid_t fe_dom, ACTIVE_RING_IDX idx) -{ - return ( (fe_dom << 16) | idx ); -} - -inline unsigned int ID_TO_IDX(unsigned long id) -{ - return ( id & 0x0000ffff ); -} - -inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); } - -static int (*request_hook)(blkif_request_t *req) = NULL; -static int (*response_hook)(blkif_response_t *req) = NULL; - -/*-----[ Data to/from Backend (server) VM ]------------------------------*/ - -/* - -inline int write_req_to_be_ring(blkif_request_t *req) -{ - blkif_request_t *req_d; - static pthread_mutex_t be_prod_mutex = PTHREAD_MUTEX_INITIALIZER; - - pthread_mutex_lock(&be_prod_mutex); - req_d = RING_GET_REQUEST(&be_ring, be_ring.req_prod_pvt); - memcpy(req_d, req, sizeof(blkif_request_t)); - wmb(); - be_ring.req_prod_pvt++; - pthread_mutex_unlock(&be_prod_mutex); - - return 0; -} -*/ - -inline int write_rsp_to_fe_ring(blkif_response_t *rsp) -{ - blkif_response_t *rsp_d; - static pthread_mutex_t fe_prod_mutex = PTHREAD_MUTEX_INITIALIZER; - - pthread_mutex_lock(&fe_prod_mutex); - rsp_d = RING_GET_RESPONSE(&fe_ring, fe_ring.rsp_prod_pvt); - memcpy(rsp_d, rsp, sizeof(blkif_response_t)); - wmb(); - fe_ring.rsp_prod_pvt++; - pthread_mutex_unlock(&fe_prod_mutex); - - return 0; -} - -static void apply_rsp_hooks(blkif_t *blkif, blkif_response_t *rsp) -{ - response_hook_t *rsp_hook; - - rsp_hook = blkif->response_hook_chain; - while (rsp_hook != NULL) - { - switch(rsp_hook->func(blkif, rsp, 1)) - { - case BLKTAP_PASS: - break; - 
default: - printf("Only PASS is supported for resp hooks!\n"); - } - rsp_hook = rsp_hook->next; - } -} - - -static pthread_mutex_t push_mutex = PTHREAD_MUTEX_INITIALIZER; - -void blkif_inject_response(blkif_t *blkif, blkif_response_t *rsp) -{ - - apply_rsp_hooks(blkif, rsp); - - write_rsp_to_fe_ring(rsp); -} - -void blktap_kick_responses(void) -{ - pthread_mutex_lock(&push_mutex); - - RING_PUSH_RESPONSES(&fe_ring); - ioctl(fd, BLKTAP_IOCTL_KICK_FE); - - pthread_mutex_unlock(&push_mutex); -} - -/*-----[ Polling fd listeners ]------------------------------------------*/ - -#define MAX_POLLFDS 64 - -typedef struct { - int (*func)(int fd); - struct pollfd *pfd; - int fd; - short events; - int active; -} pollhook_t; - -static struct pollfd pfd[MAX_POLLFDS+2]; /* tap and store are extra */ -static pollhook_t pollhooks[MAX_POLLFDS]; -static unsigned int ph_freelist[MAX_POLLFDS]; -static unsigned int ph_cons, ph_prod; -#define nr_pollhooks() (MAX_POLLFDS - (ph_prod - ph_cons)) -#define PH_IDX(x) (x % MAX_POLLFDS) - -int blktap_attach_poll(int fd, short events, int (*func)(int fd)) -{ - pollhook_t *ph; - - if (nr_pollhooks() == MAX_POLLFDS) { - printf("Too many pollhooks!\n"); - return -1; - } - - ph = &pollhooks[ph_freelist[PH_IDX(ph_cons++)]]; - - ph->func = func; - ph->fd = fd; - ph->events = events; - ph->active = 1; - - DPRINTF("Added fd %d at ph index %d, now %d phs.\n", fd, ph_cons-1, - nr_pollhooks()); - - return 0; -} - -void blktap_detach_poll(int fd) -{ - int i; - - for (i=0; i<MAX_POLLFDS; i++) - if ((pollhooks[i].active) && (pollhooks[i].pfd->fd == fd)) { - ph_freelist[PH_IDX(ph_prod++)] = i; - pollhooks[i].pfd->fd = -1; - pollhooks[i].active = 0; - break; - } - - DPRINTF("Removed fd %d at ph index %d, now %d phs.\n", fd, i, - nr_pollhooks()); -} - -void pollhook_init(void) -{ - int i; - - for (i=0; i < MAX_POLLFDS; i++) { - ph_freelist[i] = (i+1) % MAX_POLLFDS; - pollhooks[i].active = 0; - } - - ph_cons = 0; - ph_prod = MAX_POLLFDS; -} - -void __attribute__ 
((constructor)) blktaplib_init(void) -{ - pollhook_init(); -} - -/*-----[ The main listen loop ]------------------------------------------*/ - -int blktap_listen(void) -{ - int notify_be, notify_fe, tap_pfd, store_pfd, xs_fd, ret; - struct xs_handle *h; - blkif_t *blkif; - - /* comms rings: */ - blkif_request_t *req; - blkif_response_t *rsp; - blkif_sring_t *sring; - RING_IDX rp, i, pfd_count; - - /* pending rings */ - blkif_request_t req_pending[BLK_RING_SIZE]; - /* blkif_response_t rsp_pending[BLK_RING_SIZE] */; - - /* handler hooks: */ - request_hook_t *req_hook; - response_hook_t *rsp_hook; - - signal (SIGBUS, got_sig_bus); - signal (SIGINT, got_sig_int); - - __init_blkif(); - - fd = open("/dev/blktap", O_RDWR); - if (fd == -1) - err(-1, "open failed!"); - - blktap_mem = mmap(0, PAGE_SIZE * BLKTAP_MMAP_REGION_SIZE, - PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - - if ((int)blktap_mem == -1) - err(-1, "mmap failed!"); - - /* assign the rings to the mapped memory */ -/* - sring = (blkif_sring_t *)((unsigned long)blktap_mem + PAGE_SIZE); - FRONT_RING_INIT(&be_ring, sring, PAGE_SIZE); -*/ - sring = (blkif_sring_t *)((unsigned long)blktap_mem); - BACK_RING_INIT(&fe_ring, sring, PAGE_SIZE); - - mmap_vstart = (unsigned long)blktap_mem +(BLKTAP_RING_PAGES << PAGE_SHIFT); - - - /* Set up store connection and watch. 
*/ - h = xs_daemon_open(); - if (h == NULL) - err(-1, "xs_daemon_open"); - - ret = add_blockdevice_probe_watch(h, "Domain-0"); - if (ret != 0) - err(0, "adding device probewatch"); - - ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE ); - - while(1) { - int ret; - - /* build the poll list */ - pfd_count = 0; - for ( i=0; i < MAX_POLLFDS; i++ ) { - pollhook_t *ph = &pollhooks[i]; - - if (ph->active) { - pfd[pfd_count].fd = ph->fd; - pfd[pfd_count].events = ph->events; - ph->pfd = &pfd[pfd_count]; - pfd_count++; - } - } - - tap_pfd = pfd_count++; - pfd[tap_pfd].fd = fd; - pfd[tap_pfd].events = POLLIN; - - store_pfd = pfd_count++; - pfd[store_pfd].fd = xs_fileno(h); - pfd[store_pfd].events = POLLIN; - - if ( (ret = (poll(pfd, pfd_count, 10000)) == 0) ) { - if (DEBUG_RING_IDXS) - ioctl(fd, BLKTAP_IOCTL_PRINT_IDXS); - continue; - } - - for (i=0; i < MAX_POLLFDS; i++) { - if ( (pollhooks[i].active ) && (pollhooks[i].pfd->revents ) ) - pollhooks[i].func(pollhooks[i].pfd->fd); - } - - if (pfd[store_pfd].revents) { - ret = xs_fire_next_watch(h); - } - - if (pfd[tap_pfd].revents) - { - /* empty the fe_ring */ - notify_fe = 0; - notify_be = RING_HAS_UNCONSUMED_REQUESTS(&fe_ring); - rp = fe_ring.sring->req_prod; - rmb(); - for (i = fe_ring.req_cons; i != rp; i++) - { - int done = 0; - - req = RING_GET_REQUEST(&fe_ring, i); - memcpy(&req_pending[ID_TO_IDX(req->id)], req, sizeof(*req)); - req = &req_pending[ID_TO_IDX(req->id)]; - - blkif = blkif_find_by_handle(ID_TO_DOM(req->id), req->handle); - - if (blkif != NULL) - { - req_hook = blkif->request_hook_chain; - while (req_hook != NULL) - { - switch(req_hook->func(blkif, req, ((i+1) == rp))) - { - case BLKTAP_RESPOND: - apply_rsp_hooks(blkif, (blkif_response_t *)req); - write_rsp_to_fe_ring((blkif_response_t *)req); - notify_fe = 1; - done = 1; - break; - case BLKTAP_STOLEN: - done = 1; - break; - case BLKTAP_PASS: - break; - default: - printf("Unknown request hook return value!\n"); - } - if (done) break; - req_hook = 
req_hook->next; - } - } - - if (done == 0) - { - /* this was: */ - /* write_req_to_be_ring(req); */ - - unsigned long id = req->id; - unsigned short operation = req->operation; - printf("Unterminated request!\n"); - rsp = (blkif_response_t *)req; - rsp->id = id; - rsp->operation = operation; - rsp->status = BLKIF_RSP_ERROR; - write_rsp_to_fe_ring(rsp); - notify_fe = 1; - done = 1; - } - - } - fe_ring.req_cons = i; - - /* empty the be_ring */ -/* - notify_fe |= RING_HAS_UNCONSUMED_RESPONSES(&be_ring); - rp = be_ring.sring->rsp_prod; - rmb(); - for (i = be_ring.rsp_cons; i != rp; i++) - { - - rsp = RING_GET_RESPONSE(&be_ring, i); - memcpy(&rsp_pending[ID_TO_IDX(rsp->id)], rsp, sizeof(*rsp)); - rsp = &rsp_pending[ID_TO_IDX(rsp->id)]; - - DPRINTF("copying a be request\n"); - - apply_rsp_hooks(rsp); - write_rsp_to_fe_ring(rsp); - } - be_ring.rsp_cons = i; -*/ - /* notify the domains */ -/* - if (notify_be) { - DPRINTF("notifying be\n"); -pthread_mutex_lock(&push_mutex); - RING_PUSH_REQUESTS(&be_ring); - ioctl(fd, BLKTAP_IOCTL_KICK_BE); -pthread_mutex_unlock(&push_mutex); - } -*/ - if (notify_fe) { - DPRINTF("notifying fe\n"); - pthread_mutex_lock(&push_mutex); - RING_PUSH_RESPONSES(&fe_ring); - ioctl(fd, BLKTAP_IOCTL_KICK_FE); - pthread_mutex_unlock(&push_mutex); - } - } - } - - - munmap(blktap_mem, PAGE_SIZE); - - mmap_failed: - close(fd); - - open_failed: - return 0; -} - -void got_sig_bus() { - printf("Attempted to access a page that isn't.\n"); - exit(-1); -} - -void got_sig_int() { - DPRINTF("quitting -- returning to passthrough mode.\n"); - if (fd > 0) ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_PASSTHROUGH ); - close(fd); - fd = 0; - exit(0); -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/blktaplib.h --- a/tools/blktap/blktaplib.h Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,171 +0,0 @@ -/* blktaplib.h - * - * userland accessors to the block tap. 
- * - * Sept 2/05 -- I'm scaling this back to only support block remappings - * to user in a backend domain. Passthrough and interposition can be readded - * once transitive grants are available. - */ - -#ifndef __BLKTAPLIB_H__ -#define __BLKTAPLIB_H__ - -#include <xenctrl.h> -#include <sys/user.h> -#include <xen/xen.h> -#include <xen/io/blkif.h> -#include <xen/io/ring.h> -#include <xen/io/domain_controller.h> -#include <xs.h> - -#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE) - -/* /dev/xen/blktap resides at device number major=10, minor=202 */ -#define BLKTAP_MINOR 202 - -/* size of the extra VMA area to map in attached pages. */ -#define BLKTAP_VMA_PAGES BLK_RING_SIZE - -/* blktap IOCTLs: */ -#define BLKTAP_IOCTL_KICK_FE 1 -#define BLKTAP_IOCTL_KICK_BE 2 -#define BLKTAP_IOCTL_SETMODE 3 -#define BLKTAP_IOCTL_PRINT_IDXS 100 - -/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */ -#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */ -#define BLKTAP_MODE_INTERCEPT_FE 0x00000001 -#define BLKTAP_MODE_INTERCEPT_BE 0x00000002 -#define BLKTAP_MODE_COPY_FE 0x00000004 -#define BLKTAP_MODE_COPY_BE 0x00000008 -#define BLKTAP_MODE_COPY_FE_PAGES 0x00000010 -#define BLKTAP_MODE_COPY_BE_PAGES 0x00000020 - -#define BLKTAP_MODE_INTERPOSE \ - (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE) - -#define BLKTAP_MODE_COPY_BOTH \ - (BLKTAP_MODE_COPY_FE | BLKTAP_MODE_COPY_BE) - -#define BLKTAP_MODE_COPY_BOTH_PAGES \ - (BLKTAP_MODE_COPY_FE_PAGES | BLKTAP_MODE_COPY_BE_PAGES) - -static inline int BLKTAP_MODE_VALID(unsigned long arg) -{ - return ( - ( arg == BLKTAP_MODE_PASSTHROUGH ) || - ( arg == BLKTAP_MODE_INTERCEPT_FE ) || - ( arg == BLKTAP_MODE_INTERPOSE ) ); -/* - return ( - ( arg == BLKTAP_MODE_PASSTHROUGH ) || - ( arg == BLKTAP_MODE_INTERCEPT_FE ) || - ( arg == BLKTAP_MODE_INTERCEPT_BE ) || - ( arg == BLKTAP_MODE_INTERPOSE ) || - ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE ) || - ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == 
BLKTAP_MODE_COPY_BE ) || - ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH ) - ); -*/ -} - -/* Return values for handling messages in hooks. */ -#define BLKTAP_PASS 0 /* Keep passing this request as normal. */ -#define BLKTAP_RESPOND 1 /* Request is now a reply. Return it. */ -#define BLKTAP_STOLEN 2 /* Hook has stolen request. */ - -//#define domid_t unsigned short - -inline unsigned int ID_TO_IDX(unsigned long id); -inline domid_t ID_TO_DOM(unsigned long id); - -int blktap_attach_poll(int fd, short events, int (*func)(int)); -void blktap_detach_poll(int fd); -int blktap_listen(void); - -struct blkif; - -typedef struct request_hook_st { - char *name; - int (*func)(struct blkif *, blkif_request_t *, int); - struct request_hook_st *next; -} request_hook_t; - -typedef struct response_hook_st { - char *name; - int (*func)(struct blkif *, blkif_response_t *, int); - struct response_hook_st *next; -} response_hook_t; - -struct blkif_ops { - long int (*get_size)(struct blkif *blkif); - long int (*get_secsize)(struct blkif *blkif); - unsigned (*get_info)(struct blkif *blkif); -}; - -typedef struct blkif { - domid_t domid; - long int handle; - - long int pdev; - long int readonly; - - enum { DISCONNECTED, CONNECTED } state; - - struct blkif_ops *ops; - request_hook_t *request_hook_chain; - response_hook_t *response_hook_chain; - - struct blkif *hash_next; - - void *prv; /* device-specific data */ -} blkif_t; - -void register_new_blkif_hook(int (*fn)(blkif_t *blkif)); -blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle); -blkif_t *alloc_blkif(domid_t domid); -int blkif_init(blkif_t *blkif, long int handle, long int pdev, - long int readonly); -void free_blkif(blkif_t *blkif); -void __init_blkif(void); - - -/* xenstore/xenbus: */ -extern int add_blockdevice_probe_watch(struct xs_handle *h, - const char *domname); -int xs_fire_next_watch(struct xs_handle *h); - - -void blkif_print_hooks(blkif_t *blkif); -void blkif_register_request_hook(blkif_t 
*blkif, char *name, - int (*rh)(blkif_t *, blkif_request_t *, int)); -void blkif_register_response_hook(blkif_t *blkif, char *name, - int (*rh)(blkif_t *, blkif_response_t *, int)); -void blkif_inject_response(blkif_t *blkif, blkif_response_t *); -void blktap_kick_responses(void); - -/* this must match the underlying driver... */ -#define MAX_PENDING_REQS 64 - -/* Accessing attached data page mappings */ -#define MMAP_PAGES \ - (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) -#define MMAP_VADDR(_req,_seg) \ - (mmap_vstart + \ - ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ - ((_seg) * PAGE_SIZE)) - -extern unsigned long mmap_vstart; - -/* Defines that are only used by library clients */ - -#ifndef __COMPILING_BLKTAP_LIB - -static char *blkif_op_name[] = { - [BLKIF_OP_READ] = "READ", - [BLKIF_OP_WRITE] = "WRITE", -}; - -#endif /* __COMPILING_BLKTAP_LIB */ - -#endif /* __BLKTAPLIB_H__ */ diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/list.h --- a/tools/blktap/list.h Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ -/* - * list.h - * - * This is a subset of linux's list.h intended to be used in user-space. 
- * - */ - -#ifndef __LIST_H__ -#define __LIST_H__ - -#define LIST_POISON1 ((void *) 0x00100100) -#define LIST_POISON2 ((void *) 0x00200200) - -struct list_head { - struct list_head *next, *prev; -}; - -#define LIST_HEAD_INIT(name) { &(name), &(name) } - -#define LIST_HEAD(name) \ - struct list_head name = LIST_HEAD_INIT(name) - -static inline void __list_add(struct list_head *new, - struct list_head *prev, - struct list_head *next) -{ - next->prev = new; - new->next = next; - new->prev = prev; - prev->next = new; -} - -static inline void list_add(struct list_head *new, struct list_head *head) -{ - __list_add(new, head, head->next); -} -static inline void __list_del(struct list_head * prev, struct list_head * next) -{ - next->prev = prev; - prev->next = next; -} -static inline void list_del(struct list_head *entry) -{ - __list_del(entry->prev, entry->next); - entry->next = LIST_POISON1; - entry->prev = LIST_POISON2; -} -#define list_entry(ptr, type, member) \ - ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) -#define list_for_each_entry(pos, head, member) \ - for (pos = list_entry((head)->next, typeof(*pos), member); \ - &pos->member != (head); \ - pos = list_entry(pos->member.next, typeof(*pos), member)) - -#endif /* __LIST_H__ */ diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/Makefile --- a/tools/blktap/parallax/Makefile Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ -XEN_ROOT = ../../.. -include $(XEN_ROOT)/tools/Rules.mk - -PARALLAX_INSTALL_DIR = /usr/sbin - -INSTALL = install -INSTALL_PROG = $(INSTALL) -m0755 -INSTALL_DIR = $(INSTALL) -d -m0755 - -INCLUDES += -I.. -I/usr/include -I $(XEN_LIBXC) - -LDFLAGS = -L.. 
-lpthread -lz -lblktap - -#PLX_SRCS := -PLX_SRCS := vdi.c -PLX_SRCS += radix.c -PLX_SRCS += snaplog.c -PLX_SRCS += blockstore.c -PLX_SRCS += block-async.c -PLX_SRCS += requests-async.c -VDI_SRCS := $(PLX_SRCS) -PLX_SRCS += parallax.c - -#VDI_TOOLS := -VDI_TOOLS := vdi_create -VDI_TOOLS += vdi_list -VDI_TOOLS += vdi_snap -VDI_TOOLS += vdi_snap_list -VDI_TOOLS += vdi_snap_delete -VDI_TOOLS += vdi_fill -VDI_TOOLS += vdi_tree -VDI_TOOLS += vdi_validate - -CFLAGS += -Werror -CFLAGS += -Wno-unused -CFLAGS += -fno-strict-aliasing -CFLAGS += $(INCLUDES) -CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -# Get gcc to generate the dependencies for us. -CFLAGS += -Wp,-MD,.$(@F).d -DEPS = .*.d - -OBJS = $(patsubst %.c,%.o,$(SRCS)) -IBINS = parallax $(VDI_TOOLS) - -.PHONY: all -all: $(VDI_TOOLS) parallax blockstored - -.PHONY: install -install: all - $(INSTALL_PROG) $(IBINS) $(DESTDIR)$(PARALLAX_INSTALL_DIR) - -.PHONY: clean -clean: - rm -rf *.o *~ $(DEPS) xen TAGS $(VDI_TOOLS) parallax vdi_unittest - -parallax: $(PLX_SRCS) - $(CC) $(CFLAGS) -o parallax -L.. $(LDFLAGS) $(PLX_SRCS) - -${VDI_TOOLS}: %: %.c $(VDI_SRCS) - $(CC) $(CFLAGS) -o $@ $@.c $(LDFLAGS) $(VDI_SRCS) - --include $(DEPS) diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/README --- a/tools/blktap/parallax/README Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,177 +0,0 @@ -Parallax Quick Overview -March 3, 2005 - -This is intended to provide a quick set of instructions to let you -guys play with the current parallax source. In it's current form, the -code will let you run an arbitrary number of VMs off of a single disk -image, doing copy-on-write as they make updates. Each domain is -assigned a virtual disk image (VDI), which may be based on a snapshot -of an existing image. All of the VDI and snapshot management should -currently work. 
- -The current implementation uses a single file as a blockstore for -_everything_; this will soon be replaced by the fancier backend code -and the local cache. As it stands, Parallax will create -"blockstore.dat" in the directory that you run it from, and use -largefile support to make this grow to unfathomable girth. So, you -probably want to run the daemon off of a local disk, with a lot of -free space. - -Here's how to get going: - -0. Setup: ---------- - -Pick a local directory on a disk with lots of room. You should be -running from a privileged domain (e.g. dom0) with the blocktap -configured in and block backend NOT. - -For convenience (for the moment) copy all of the vdi tools (vdi_*) and -the parallax daemon from tools/blktap into this directory. - -1. Populate the blockstore: ---------------------------- - -First you need to put at least one image into the blockstore. You -will need a disk image, either as a file or local partition. My -general approach has been to - -(a) make a really big sparse file with - - dd if=/dev/zero of=./image bs=4K count=1 seek=[big value] - -(b) put a filesystem into it - - mkfs.ext3 ./image - -(c) mount it using loopback - - mkdir ./mnt - mount -o loop ./image ./mnt - -(d) cd into it and untar one of the image files from srg-roots. - - cd mnt - tar ... - -NOTE: Beware if your system is FC3. mkfs is not compatible with old -versions of fedora, and so you don't have much choice but to install -further fc3 images if you have used the fc3 version of mkfs. - -(e) unmount the image - - cd .. - umount mnt - -(f) now, create a new VDI to hold the image - - ./vdi_create "My new FC3 VDI" - -(g) get the id of the new VDI. - - ./vdi_list - - | 0 My new FC3 VDI - -(0 is the VDI id... create a few more if you want.) - -(h) hoover your image into the new VDI. - - ./vdi_fill 0 ./image - -This will pull the entire image into the blockstore and set up a -mapping tree for it for VDI 0. Passing a device (i.e. 
/dev/sda3) -should also work, but vdi_fill has NO notion of sparseness yet, so you -are going to pump a block into the store for each block you read. - -vdi_fill will count up until it is done, and you should be ready to -go. If you want to be thorough, you can use vdi_validate to test the VDI -against the original image. - -2. Create some extra VDIs -------------------------- - -A VDI is actually a list of snapshots, and each snapshot is a full -image of mappings. So, to preserve an immutable copy of a current -VDI, do this: - -(a) Snapshot your new VDI. - - ./vdi_snap 0 - -Snapshotting writes the current radix root to the VDI's snapshot log, -and assigns it a new writable root. - -(b) look at the VDI's snapshot log. - - ./vdi_snap_list 0 - - | 16 0 Thu Mar 3 19:27:48 2005 565111 31 - -The first two columns constitute a snapshot id and represent the -(block, offset) of the snapshot record. The Date tells you when the -snapshot was made, and 31 is the radix root node of the snapshot. - -(c) Create a new VDI, based on that snapshot, and look at the list. - - ./vdi_create "FC3 - Copy 1" 16 0 - ./vdi_list - - | 0 My new FC3 VDI - | 1 FC3 - Copy 1 - -NOTE: If you have Graphviz installed on your system, you can use -vdi_tree to generate a postscript of your current set of VDIs and -snapshots. - - -Create as many VDIs as you need for the VMs that you want to run. - -3. Boot some VMs: ------------------ - -Parallax currently uses a hack in xend to pass the VDI id, so you need to -modify the disk line of the VM config that is going to mount it. - -(a) set up your vm config, by using the following disk line: - - disk = ['parallax:1,sda1,w,0' ] - -This example uses VDI 1 (from vdi_list above), presents it as sda1 -(writable), and uses dom 0 as the backend. If you were running the -daemon (and tap driver) in some domain other than 0, you would change -this last parameter. 
- -NOTE: You'll need to have reinstalled xend/tools prior to booting the vm, so that it knows what to do with "parallax:". - -(b) Run parallax in the backend domain. - - ./parallax - -(c) create your new domain. - - xm create ... - ---- - -That's pretty much all there is to it at the moment. Hope this is -clear enough to get you going. Now, a few serious caveats that will -be sorted out in the almost immediate future: - -WARNINGS: ---------- - -1. There is NO locking in the VDI tools at the moment, so I'd avoid -running them in parallel, or more importantly, running them while the -daemon is running. - -2. I doubt that xend will be very happy about restarting if you have -parallax-using domains. So if it dies while there are active parallax -doms, you may need to reboot. - -3. I've turned off write-in-place. So at the moment, EVERY block -write is a log append on the blockstore. I've been having some probs -with the radix tree's marking of writable blocks after snapshots and -will sort this out very soon. - - diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/block-async.c --- a/tools/blktap/parallax/block-async.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,393 +0,0 @@ -/* block-async.c - * - * Asynchronous block wrappers for parallax. - */ - - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <pthread.h> -#include "block-async.h" -#include "blockstore.h" -#include "vdi.h" - - -#if 0 -#define DPRINTF(_f, _a...) printf ( _f , ## _a ) -#else -#define DPRINTF(_f, _a...) ((void)0) -#endif - -/* We have a queue of outstanding I/O requests implemented as a - * circular producer-consumer ring with free-running buffers. - * to allow reordering, this ring indirects to indexes in an - * ring of io_structs. - * - * the block_* calls may either add an entry to this ring and return, - * or satisfy the request immediately and call the callback directly. 
- * None of the io calls in parallax should be nested enough to worry - * about stack problems with this approach. - */ - -struct read_args { - uint64_t addr; -}; - -struct write_args { - uint64_t addr; - char *block; -}; - -struct alloc_args { - char *block; -}; - -struct pending_io_req { - enum {IO_READ, IO_WRITE, IO_ALLOC, IO_RWAKE, IO_WWAKE} op; - union { - struct read_args r; - struct write_args w; - struct alloc_args a; - } u; - io_cb_t cb; - void *param; -}; - -void radix_lock_init(struct radix_lock *r) -{ - int i; - - pthread_mutex_init(&r->lock, NULL); - for (i=0; i < 1024; i++) { - r->lines[i] = 0; - r->waiters[i] = NULL; - r->state[i] = ANY; - } -} - -/* maximum outstanding I/O requests issued asynchronously */ -/* must be a power of 2.*/ -#define MAX_PENDING_IO 1024 - -/* how many threads to concurrently issue I/O to the disk. */ -#define IO_POOL_SIZE 10 - -static struct pending_io_req pending_io_reqs[MAX_PENDING_IO]; -static int pending_io_list[MAX_PENDING_IO]; -static unsigned long io_prod = 0, io_cons = 0, io_free = 0; -#define PENDING_IO_MASK(_x) ((_x) & (MAX_PENDING_IO - 1)) -#define PENDING_IO_IDX(_x) ((_x) - pending_io_reqs) -#define PENDING_IO_ENT(_x) \ - (&pending_io_reqs[pending_io_list[PENDING_IO_MASK(_x)]]) -#define CAN_PRODUCE_PENDING_IO ((io_free + MAX_PENDING_IO) != io_prod) -#define CAN_CONSUME_PENDING_IO (io_cons != io_prod) -static pthread_mutex_t pending_io_lock = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t pending_io_cond = PTHREAD_COND_INITIALIZER; - -static void init_pending_io(void) -{ - int i; - - for (i=0; i<MAX_PENDING_IO; i++) - pending_io_list[i] = i; - -} - -void block_read(uint64_t addr, io_cb_t cb, void *param) -{ - struct pending_io_req *req; - - pthread_mutex_lock(&pending_io_lock); - assert(CAN_PRODUCE_PENDING_IO); - - req = PENDING_IO_ENT(io_prod++); - DPRINTF("Produce (R) %lu (%p)\n", io_prod - 1, req); - req->op = IO_READ; - req->u.r.addr = addr; - req->cb = cb; - req->param = param; - - 
pthread_cond_signal(&pending_io_cond); - pthread_mutex_unlock(&pending_io_lock); -} - - -void block_write(uint64_t addr, char *block, io_cb_t cb, void *param) -{ - struct pending_io_req *req; - - pthread_mutex_lock(&pending_io_lock); - assert(CAN_PRODUCE_PENDING_IO); - - req = PENDING_IO_ENT(io_prod++); - DPRINTF("Produce (W) %lu (%p)\n", io_prod - 1, req); - req->op = IO_WRITE; - req->u.w.addr = addr; - req->u.w.block = block; - req->cb = cb; - req->param = param; - - pthread_cond_signal(&pending_io_cond); - pthread_mutex_unlock(&pending_io_lock); -} - - -void block_alloc(char *block, io_cb_t cb, void *param) -{ - struct pending_io_req *req; - - pthread_mutex_lock(&pending_io_lock); - assert(CAN_PRODUCE_PENDING_IO); - - req = PENDING_IO_ENT(io_prod++); - req->op = IO_ALLOC; - req->u.a.block = block; - req->cb = cb; - req->param = param; - - pthread_cond_signal(&pending_io_cond); - pthread_mutex_unlock(&pending_io_lock); -} - -void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param) -{ - struct io_ret ret; - pthread_mutex_lock(&r->lock); - - if (( r->lines[row] >= 0 ) && (r->state[row] != STOP)) { - r->lines[row]++; - r->state[row] = READ; - DPRINTF("RLOCK : %3d (row: %d)\n", r->lines[row], row); - pthread_mutex_unlock(&r->lock); - ret.type = IO_INT_T; - ret.u.i = 0; - cb(ret, param); - } else { - struct radix_wait **rwc; - struct radix_wait *rw = - (struct radix_wait *) malloc (sizeof(struct radix_wait)); - DPRINTF("RLOCK : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row); - rw->type = RLOCK; - rw->param = param; - rw->cb = cb; - rw->next = NULL; - /* append to waiters list. */ - rwc = &r->waiters[row]; - while (*rwc != NULL) rwc = &(*rwc)->next; - *rwc = rw; - pthread_mutex_unlock(&r->lock); - return; - } -} - - -void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param) -{ - struct io_ret ret; - pthread_mutex_lock(&r->lock); - - /* the second check here is redundant -- just here for debugging now. 
*/ - if ((r->state[row] == ANY) && ( r->lines[row] == 0 )) { - r->state[row] = STOP; - r->lines[row] = -1; - DPRINTF("WLOCK : %3d (row: %d)\n", r->lines[row], row); - pthread_mutex_unlock(&r->lock); - ret.type = IO_INT_T; - ret.u.i = 0; - cb(ret, param); - } else { - struct radix_wait **rwc; - struct radix_wait *rw = - (struct radix_wait *) malloc (sizeof(struct radix_wait)); - DPRINTF("WLOCK : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row); - rw->type = WLOCK; - rw->param = param; - rw->cb = cb; - rw->next = NULL; - /* append to waiters list. */ - rwc = &r->waiters[row]; - while (*rwc != NULL) rwc = &(*rwc)->next; - *rwc = rw; - pthread_mutex_unlock(&r->lock); - return; - } - -} - -/* called with radix_lock locked and lock count of zero. */ -static void wake_waiters(struct radix_lock *r, int row) -{ - struct pending_io_req *req; - struct radix_wait *rw; - - if (r->lines[row] != 0) return; - if (r->waiters[row] == NULL) return; - - if (r->waiters[row]->type == WLOCK) { - - rw = r->waiters[row]; - pthread_mutex_lock(&pending_io_lock); - assert(CAN_PRODUCE_PENDING_IO); - - req = PENDING_IO_ENT(io_prod++); - req->op = IO_WWAKE; - req->cb = rw->cb; - req->param = rw->param; - r->lines[row] = -1; /* write lock the row. */ - r->state[row] = STOP; - r->waiters[row] = rw->next; - free(rw); - pthread_mutex_unlock(&pending_io_lock); - - } else /* RLOCK */ { - - while ((r->waiters[row] != NULL) && (r->waiters[row]->type == RLOCK)) { - rw = r->waiters[row]; - pthread_mutex_lock(&pending_io_lock); - assert(CAN_PRODUCE_PENDING_IO); - - req = PENDING_IO_ENT(io_prod++); - req->op = IO_RWAKE; - req->cb = rw->cb; - req->param = rw->param; - r->lines[row]++; /* read lock the row. 
*/ - r->state[row] = READ; - r->waiters[row] = rw->next; - free(rw); - pthread_mutex_unlock(&pending_io_lock); - } - - if (r->waiters[row] != NULL) /* There is a write queued still */ - r->state[row] = STOP; - } - - pthread_mutex_lock(&pending_io_lock); - pthread_cond_signal(&pending_io_cond); - pthread_mutex_unlock(&pending_io_lock); -} - -void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param) -{ - struct io_ret ret; - - pthread_mutex_lock(&r->lock); - assert(r->lines[row] > 0); /* try to catch misuse. */ - r->lines[row]--; - if (r->lines[row] == 0) { - r->state[row] = ANY; - wake_waiters(r, row); - } - pthread_mutex_unlock(&r->lock); - cb(ret, param); -} - -void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param) -{ - struct io_ret ret; - - pthread_mutex_lock(&r->lock); - assert(r->lines[row] == -1); /* try to catch misuse. */ - r->lines[row] = 0; - r->state[row] = ANY; - wake_waiters(r, row); - pthread_mutex_unlock(&r->lock); - cb(ret, param); -} - -/* consumer calls */ -static void do_next_io_req(struct pending_io_req *req) -{ - struct io_ret ret; - void *param; - - switch (req->op) { - case IO_READ: - ret.type = IO_BLOCK_T; - ret.u.b = readblock(req->u.r.addr); - break; - case IO_WRITE: - ret.type = IO_INT_T; - ret.u.i = writeblock(req->u.w.addr, req->u.w.block); - DPRINTF("wrote %d at %Lu\n", *(int *)(req->u.w.block), req->u.w.addr); - break; - case IO_ALLOC: - ret.type = IO_ADDR_T; - ret.u.a = allocblock(req->u.a.block); - break; - case IO_RWAKE: - DPRINTF("WAKE DEFERRED RLOCK!\n"); - ret.type = IO_INT_T; - ret.u.i = 0; - break; - case IO_WWAKE: - DPRINTF("WAKE DEFERRED WLOCK!\n"); - ret.type = IO_INT_T; - ret.u.i = 0; - break; - default: - DPRINTF("Unknown IO operation on pending list!\n"); - return; - } - - param = req->param; - pthread_mutex_lock(&pending_io_lock); - pending_io_list[PENDING_IO_MASK(io_free++)] = PENDING_IO_IDX(req); - pthread_mutex_unlock(&pending_io_lock); - - assert(req->cb != NULL); - req->cb(ret, 
param); - -} - -void *io_thread(void *param) -{ - int tid; - struct pending_io_req *req; - - /* Set this thread's tid. */ - tid = *(int *)param; - free(param); - -start: - pthread_mutex_lock(&pending_io_lock); - while (io_prod == io_cons) { - pthread_cond_wait(&pending_io_cond, &pending_io_lock); - } - - if (io_prod == io_cons) { - /* unnecessary wakeup. */ - pthread_mutex_unlock(&pending_io_lock); - goto start; - } - - req = PENDING_IO_ENT(io_cons++); - pthread_mutex_unlock(&pending_io_lock); - - do_next_io_req(req); - - goto start; - -} - -static pthread_t io_pool[IO_POOL_SIZE]; -void start_io_threads(void) - -{ - int i, tid=0; - - for (i=0; i < IO_POOL_SIZE; i++) { - int ret, *t; - t = (int *)malloc(sizeof(int)); - *t = tid++; - ret = pthread_create(&io_pool[i], NULL, io_thread, t); - if (ret != 0) printf("Error starting thread %d\n", i); - } - -} - -void init_block_async(void) -{ - init_pending_io(); - start_io_threads(); -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/block-async.h --- a/tools/blktap/parallax/block-async.h Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ -/* block-async.h - * - * Asynchronous block wrappers for parallax. - */ - -#ifndef _BLOCKASYNC_H_ -#define _BLOCKASYNC_H_ - -#include <assert.h> -#include <xenctrl.h> -#include "vdi.h" - -struct io_ret -{ - enum {IO_ADDR_T, IO_BLOCK_T, IO_INT_T} type; - union { - uint64_t a; - char *b; - int i; - } u; -}; - -typedef void (*io_cb_t)(struct io_ret r, void *param); - -/* per-vdi lock structures to make sure requests run in a safe order. 
*/ -struct radix_wait { - enum {RLOCK, WLOCK} type; - io_cb_t cb; - void *param; - struct radix_wait *next; -}; - -struct radix_lock { - pthread_mutex_t lock; - int lines[1024]; - struct radix_wait *waiters[1024]; - enum {ANY, READ, STOP} state[1024]; -}; -void radix_lock_init(struct radix_lock *r); - -void block_read(uint64_t addr, io_cb_t cb, void *param); -void block_write(uint64_t addr, char *block, io_cb_t cb, void *param); -void block_alloc(char *block, io_cb_t cb, void *param); -void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param); -void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param); -void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param); -void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param); -void init_block_async(void); - -static inline uint64_t IO_ADDR(struct io_ret r) -{ - assert(r.type == IO_ADDR_T); - return r.u.a; -} - -static inline char *IO_BLOCK(struct io_ret r) -{ - assert(r.type == IO_BLOCK_T); - return r.u.b; -} - -static inline int IO_INT(struct io_ret r) -{ - assert(r.type == IO_INT_T); - return r.u.i; -} - - -#endif //_BLOCKASYNC_H_ diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/blockstore.c --- a/tools/blktap/parallax/blockstore.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1348 +0,0 @@ -/************************************************************************** - * - * blockstore.c - * - * Simple block store interface - * - */ - -#include <fcntl.h> -#include <unistd.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <sys/time.h> -#include <stdarg.h> -#include "blockstore.h" -#include <pthread.h> - -//#define BLOCKSTORE_REMOTE -//#define BSDEBUG - -#define RETRY_TIMEOUT 1000000 /* microseconds */ - -/***************************************************************************** - * Debugging - */ -#ifdef BSDEBUG -void DB(char *format, ...) 
-{ - va_list args; - fprintf(stderr, "[%05u] ", (int)pthread_getspecific(tid_key)); - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); -} -#else -#define DB(format, ...) (void)0 -#endif - -#ifdef BLOCKSTORE_REMOTE - -#include <sys/socket.h> -#include <sys/ioctl.h> -#include <netinet/in.h> -#include <netdb.h> - -/***************************************************************************** - * Network state * - *****************************************************************************/ - -/* The individual disk servers we talks to. These will be referenced by - * an integer index into bsservers[]. - */ -bsserver_t bsservers[MAX_SERVERS]; - -/* The cluster map. This is indexed by an integer cluster number. - */ -bscluster_t bsclusters[MAX_CLUSTERS]; - -/* Local socket. - */ -struct sockaddr_in sin_local; -int bssock = 0; - -/***************************************************************************** - * Notification * - *****************************************************************************/ - -typedef struct pool_thread_t_struct { - pthread_mutex_t ptmutex; - pthread_cond_t ptcv; - int newdata; -} pool_thread_t; - -pool_thread_t pool_thread[READ_POOL_SIZE+1]; - -#define RECV_NOTIFY(tid) { \ - pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \ - pool_thread[tid].newdata = 1; \ - DB("CV Waking %u", tid); \ - pthread_cond_signal(&(pool_thread[tid].ptcv)); \ - pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); } -#define RECV_AWAIT(tid) { \ - pthread_mutex_lock(&(pool_thread[tid].ptmutex)); \ - if (pool_thread[tid].newdata) { \ - pool_thread[tid].newdata = 0; \ - DB("CV Woken %u", tid); \ - } \ - else { \ - DB("CV Waiting %u", tid); \ - pthread_cond_wait(&(pool_thread[tid].ptcv), \ - &(pool_thread[tid].ptmutex)); \ - } \ - pthread_mutex_unlock(&(pool_thread[tid].ptmutex)); } - -/***************************************************************************** - * Message queue management * - 
*****************************************************************************/ - -/* Protects the queue manipulation critcal regions. - */ -pthread_mutex_t ptmutex_queue; -#define ENTER_QUEUE_CR pthread_mutex_lock(&ptmutex_queue) -#define LEAVE_QUEUE_CR pthread_mutex_unlock(&ptmutex_queue) - -pthread_mutex_t ptmutex_recv; -#define ENTER_RECV_CR pthread_mutex_lock(&ptmutex_recv) -#define LEAVE_RECV_CR pthread_mutex_unlock(&ptmutex_recv) - -/* A message queue entry. We allocate one of these for every request we send. - * Asynchronous reply reception also used one of these. - */ -typedef struct bsq_t_struct { - struct bsq_t_struct *prev; - struct bsq_t_struct *next; - int status; - int server; - int length; - struct msghdr msghdr; - struct iovec iov[2]; - int tid; - struct timeval tv_sent; - bshdr_t message; - void *block; -} bsq_t; - -#define BSQ_STATUS_MATCHED 1 - -pthread_mutex_t ptmutex_luid; -#define ENTER_LUID_CR pthread_mutex_lock(&ptmutex_luid) -#define LEAVE_LUID_CR pthread_mutex_unlock(&ptmutex_luid) - -static uint64_t luid_cnt = 0x1000ULL; -uint64_t new_luid(void) { - uint64_t luid; - ENTER_LUID_CR; - luid = luid_cnt++; - LEAVE_LUID_CR; - return luid; -} - -/* Queue of outstanding requests. 
- */ -bsq_t *bs_head = NULL; -bsq_t *bs_tail = NULL; -int bs_qlen = 0; - -/* - */ -void queuedebug(char *msg) { - bsq_t *q; - ENTER_QUEUE_CR; - fprintf(stderr, "Q: %s len=%u\n", msg, bs_qlen); - for (q = bs_head; q; q = q->next) { - fprintf(stderr, " luid=%016llx server=%u\n", - q->message.luid, q->server); - } - LEAVE_QUEUE_CR; -} - -int enqueue(bsq_t *qe) { - ENTER_QUEUE_CR; - qe->next = NULL; - qe->prev = bs_tail; - if (!bs_head) - bs_head = qe; - else - bs_tail->next = qe; - bs_tail = qe; - bs_qlen++; - LEAVE_QUEUE_CR; -#ifdef BSDEBUG - queuedebug("enqueue"); -#endif - return 0; -} - -int dequeue(bsq_t *qe) { - bsq_t *q; - ENTER_QUEUE_CR; - for (q = bs_head; q; q = q->next) { - if (q == qe) { - if (q->prev) - q->prev->next = q->next; - else - bs_head = q->next; - if (q->next) - q->next->prev = q->prev; - else - bs_tail = q->prev; - bs_qlen--; - goto found; - } - } - - LEAVE_QUEUE_CR; -#ifdef BSDEBUG - queuedebug("dequeue not found"); -#endif - return 0; - - found: - LEAVE_QUEUE_CR; -#ifdef BSDEBUG - queuedebug("dequeue not found"); -#endif - return 1; -} - -bsq_t *queuesearch(bsq_t *qe) { - bsq_t *q; - ENTER_QUEUE_CR; - for (q = bs_head; q; q = q->next) { - if ((qe->server == q->server) && - (qe->message.operation == q->message.operation) && - (qe->message.luid == q->message.luid)) { - - if ((q->message.operation == BSOP_READBLOCK) && - ((q->message.flags & BSOP_FLAG_ERROR) == 0)) { - q->block = qe->block; - qe->block = NULL; - } - q->length = qe->length; - q->message.flags = qe->message.flags; - q->message.id = qe->message.id; - q->status |= BSQ_STATUS_MATCHED; - - if (q->prev) - q->prev->next = q->next; - else - bs_head = q->next; - if (q->next) - q->next->prev = q->prev; - else - bs_tail = q->prev; - q->next = NULL; - q->prev = NULL; - bs_qlen--; - goto found; - } - } - - LEAVE_QUEUE_CR; -#ifdef BSDEBUG - queuedebug("queuesearch not found"); -#endif - return NULL; - - found: - LEAVE_QUEUE_CR; -#ifdef BSDEBUG - queuedebug("queuesearch found"); -#endif - 
return q; -} - -/***************************************************************************** - * Network communication * - *****************************************************************************/ - -int send_message(bsq_t *qe) { - int rc; - - qe->msghdr.msg_name = (void *)&(bsservers[qe->server].sin); - qe->msghdr.msg_namelen = sizeof(struct sockaddr_in); - qe->msghdr.msg_iov = qe->iov; - if (qe->block) - qe->msghdr.msg_iovlen = 2; - else - qe->msghdr.msg_iovlen = 1; - qe->msghdr.msg_control = NULL; - qe->msghdr.msg_controllen = 0; - qe->msghdr.msg_flags = 0; - - qe->iov[0].iov_base = (void *)&(qe->message); - qe->iov[0].iov_len = MSGBUFSIZE_ID; - - if (qe->block) { - qe->iov[1].iov_base = qe->block; - qe->iov[1].iov_len = BLOCK_SIZE; - } - - qe->message.luid = new_luid(); - - qe->status = 0; - qe->tid = (int)pthread_getspecific(tid_key); - if (enqueue(qe) < 0) { - fprintf(stderr, "Error enqueuing request.\n"); - return -1; - } - - gettimeofday(&(qe->tv_sent), NULL); - DB("send_message to %d luid=%016llx\n", qe->server, qe->message.luid); - rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT); - //rc = sendto(bssock, (void *)&(qe->message), qe->length, 0, - // (struct sockaddr *)&(bsservers[qe->server].sin), - // sizeof(struct sockaddr_in)); - if (rc < 0) - return rc; - - return rc; -} - -int recv_message(bsq_t *qe) { - struct sockaddr_in from; - //int flen = sizeof(from); - int rc; - - qe->msghdr.msg_name = &from; - qe->msghdr.msg_namelen = sizeof(struct sockaddr_in); - qe->msghdr.msg_iov = qe->iov; - if (qe->block) - qe->msghdr.msg_iovlen = 2; - else - qe->msghdr.msg_iovlen = 1; - qe->msghdr.msg_control = NULL; - qe->msghdr.msg_controllen = 0; - qe->msghdr.msg_flags = 0; - - qe->iov[0].iov_base = (void *)&(qe->message); - qe->iov[0].iov_len = MSGBUFSIZE_ID; - if (qe->block) { - qe->iov[1].iov_base = qe->block; - qe->iov[1].iov_len = BLOCK_SIZE; - } - - rc = recvmsg(bssock, &(qe->msghdr), 0); - - //return recvfrom(bssock, (void *)&(qe->message), 
sizeof(bsmsg_t), 0, - // (struct sockaddr *)&from, &flen); - return rc; -} - -int get_server_number(struct sockaddr_in *sin) { - int i; - -#ifdef BSDEBUG2 - fprintf(stderr, - "get_server_number(%u.%u.%u.%u/%u)\n", - (unsigned int)sin->sin_addr.s_addr & 0xff, - ((unsigned int)sin->sin_addr.s_addr >> 8) & 0xff, - ((unsigned int)sin->sin_addr.s_addr >> 16) & 0xff, - ((unsigned int)sin->sin_addr.s_addr >> 24) & 0xff, - (unsigned int)sin->sin_port); -#endif - - for (i = 0; i < MAX_SERVERS; i++) { - if (bsservers[i].hostname) { -#ifdef BSDEBUG2 - fprintf(stderr, - "get_server_number check %u.%u.%u.%u/%u\n", - (unsigned int)bsservers[i].sin.sin_addr.s_addr&0xff, - ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 8)&0xff, - ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 16)&0xff, - ((unsigned int)bsservers[i].sin.sin_addr.s_addr >> 24)&0xff, - (unsigned int)bsservers[i].sin.sin_port); -#endif - if ((sin->sin_family == bsservers[i].sin.sin_family) && - (sin->sin_port == bsservers[i].sin.sin_port) && - (memcmp((void *)&(sin->sin_addr), - (void *)&(bsservers[i].sin.sin_addr), - sizeof(struct in_addr)) == 0)) { - return i; - } - } - } - - return -1; -} - -void *rx_buffer = NULL; -bsq_t rx_qe; -bsq_t *recv_any(void) { - struct sockaddr_in from; - int rc; - - DB("ENTER recv_any\n"); - - rx_qe.msghdr.msg_name = &from; - rx_qe.msghdr.msg_namelen = sizeof(struct sockaddr_in); - rx_qe.msghdr.msg_iov = rx_qe.iov; - if (!rx_buffer) { - rx_buffer = malloc(BLOCK_SIZE); - if (!rx_buffer) { - perror("recv_any malloc"); - return NULL; - } - } - rx_qe.block = rx_buffer; - rx_buffer = NULL; - rx_qe.msghdr.msg_iovlen = 2; - rx_qe.msghdr.msg_control = NULL; - rx_qe.msghdr.msg_controllen = 0; - rx_qe.msghdr.msg_flags = 0; - - rx_qe.iov[0].iov_base = (void *)&(rx_qe.message); - rx_qe.iov[0].iov_len = MSGBUFSIZE_ID; - rx_qe.iov[1].iov_base = rx_qe.block; - rx_qe.iov[1].iov_len = BLOCK_SIZE; - - rc = recvmsg(bssock, &(rx_qe.msghdr), 0); - if (rc < 0) { - perror("recv_any"); - return NULL; - 
} - - rx_qe.length = rc; - rx_qe.server = get_server_number(&from); - - DB("recv_any from %d luid=%016llx len=%u\n", - rx_qe.server, rx_qe.message.luid, rx_qe.length); - - return &rx_qe; -} - -void recv_recycle_buffer(bsq_t *q) { - if (q->block) { - rx_buffer = q->block; - q->block = NULL; - } -} - -// cycle through reading any incoming, searching for a match in the -// queue, until we have all we need. -int wait_recv(bsq_t **reqs, int numreqs) { - bsq_t *q, *m; - unsigned int x, i; - int tid = (int)pthread_getspecific(tid_key); - - DB("ENTER wait_recv %u\n", numreqs); - - checkmatch: - x = 0xffffffff; - for (i = 0; i < numreqs; i++) { - x &= reqs[i]->status; - } - if ((x & BSQ_STATUS_MATCHED)) { - DB("LEAVE wait_recv\n"); - return numreqs; - } - - RECV_AWAIT(tid); - - /* - rxagain: - ENTER_RECV_CR; - q = recv_any(); - LEAVE_RECV_CR; - if (!q) - return -1; - - m = queuesearch(q); - recv_recycle_buffer(q); - if (!m) { - fprintf(stderr, "Unmatched RX\n"); - goto rxagain; - } - */ - - goto checkmatch; - -} - -/* retry - */ -static int retry_count = 0; -int retry(bsq_t *qe) -{ - int rc; - gettimeofday(&(qe->tv_sent), NULL); - DB("retry to %d luid=%016llx\n", qe->server, qe->message.luid); - retry_count++; - rc = sendmsg(bssock, &(qe->msghdr), MSG_DONTWAIT); - if (rc < 0) - return rc; - return 0; -} - -/* queue runner - */ -void *queue_runner(void *arg) -{ - for (;;) { - struct timeval now; - long long nowus, sus; - bsq_t *q; - int r; - - sleep(1); - - gettimeofday(&now, NULL); - nowus = now.tv_usec + now.tv_sec * 1000000; - ENTER_QUEUE_CR; - r = retry_count; - for (q = bs_head; q; q = q->next) { - sus = q->tv_sent.tv_usec + q->tv_sent.tv_sec * 1000000; - if ((nowus - sus) > RETRY_TIMEOUT) { - if (retry(q) < 0) { - fprintf(stderr, "Error on sendmsg retry.\n"); - } - } - } - if (r != retry_count) { - fprintf(stderr, "RETRIES: %u %u\n", retry_count - r, retry_count); - } - LEAVE_QUEUE_CR; - } -} - -/* receive loop - */ -void *receive_loop(void *arg) -{ - bsq_t *q, *m; - - 
for(;;) { - q = recv_any(); - if (!q) { - fprintf(stderr, "recv_any error\n"); - } - else { - m = queuesearch(q); - recv_recycle_buffer(q); - if (!m) { - fprintf(stderr, "Unmatched RX\n"); - } - else { - DB("RX MATCH"); - RECV_NOTIFY(m->tid); - } - } - } -} -pthread_t pthread_recv; - -/***************************************************************************** - * Reading * - *****************************************************************************/ - -void *readblock_indiv(int server, uint64_t id) { - void *block; - bsq_t *qe; - int len, rc; - - qe = (bsq_t *)malloc(sizeof(bsq_t)); - if (!qe) { - perror("readblock qe malloc"); - return NULL; - } - qe->block = NULL; - - /* - qe->block = malloc(BLOCK_SIZE); - if (!qe->block) { - perror("readblock qe malloc"); - free((void *)qe); - return NULL; - } - */ - - qe->server = server; - - qe->message.operation = BSOP_READBLOCK; - qe->message.flags = 0; - qe->message.id = id; - qe->length = MSGBUFSIZE_ID; - - if (send_message(qe) < 0) { - perror("readblock sendto"); - goto err; - } - - /*len = recv_message(qe); - if (len < 0) { - perror("readblock recv"); - goto err; - }*/ - - rc = wait_recv(&qe, 1); - if (rc < 0) { - perror("readblock recv"); - goto err; - } - - if ((qe->message.flags & BSOP_FLAG_ERROR)) { - fprintf(stderr, "readblock server error\n"); - goto err; - } - if (qe->length < MSGBUFSIZE_BLOCK) { - fprintf(stderr, "readblock recv short (%u)\n", len); - goto err; - } - /* if ((block = malloc(BLOCK_SIZE)) == NULL) { - perror("readblock malloc"); - goto err; - } - memcpy(block, qe->message.block, BLOCK_SIZE); - */ - block = qe->block; - - free((void *)qe); - return block; - - err: - free(qe->block); - free((void *)qe); - return NULL; -} - -/** - * readblock: read a block from disk - * @id: block id to read - * - * @return: pointer to block, NULL on error - */ -void *readblock(uint64_t id) { - int map = (int)BSID_MAP(id); - uint64_t xid; - static int i = CLUSTER_MAX_REPLICAS - 1; - void *block = NULL; - - /* 
special case for the "superblock" just use the first block on the - * first replica. (extend to blocks < 6 for vdi bug) - */ - if (id < 6) { - block = readblock_indiv(bsclusters[map].servers[0], id); - goto out; - } - - i++; - if (i >= CLUSTER_MAX_REPLICAS) - i = 0; - switch (i) { - case 0: - xid = BSID_REPLICA0(id); - break; - case 1: - xid = BSID_REPLICA1(id); - break; - case 2: - xid = BSID_REPLICA2(id); - break; - } - - block = readblock_indiv(bsclusters[map].servers[i], xid); - - out: -#ifdef BSDEBUG - if (block) - fprintf(stderr, "READ: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n", - id, - (unsigned int)((unsigned char *)block)[0], - (unsigned int)((unsigned char *)block)[1], - (unsigned int)((unsigned char *)block)[2], - (unsigned int)((unsigned char *)block)[3], - (unsigned int)((unsigned char *)block)[4], - (unsigned int)((unsigned char *)block)[5], - (unsigned int)((unsigned char *)block)[6], - (unsigned int)((unsigned char *)block)[7]); - else - fprintf(stderr, "READ: %016llx NULL\n", id); -#endif - return block; -} - -/***************************************************************************** - * Writing * - *****************************************************************************/ - -bsq_t *writeblock_indiv(int server, uint64_t id, void *block) { - - bsq_t *qe; - int len; - - qe = (bsq_t *)malloc(sizeof(bsq_t)); - if (!qe) { - perror("writeblock qe malloc"); - goto err; - } - qe->server = server; - - qe->message.operation = BSOP_WRITEBLOCK; - qe->message.flags = 0; - qe->message.id = id; - //memcpy(qe->message.block, block, BLOCK_SIZE); - qe->block = block; - qe->length = MSGBUFSIZE_BLOCK; - - if (send_message(qe) < 0) { - perror("writeblock sendto"); - goto err; - } - - return qe; - - err: - free((void *)qe); - return NULL; -} - - -/** - * writeblock: write an existing block to disk - * @id: block id - * @block: pointer to block - * - * @return: zero on success, -1 on failure - */ -int writeblock(uint64_t id, void *block) { - - int map = 
(int)BSID_MAP(id); - int rep0 = bsclusters[map].servers[0]; - int rep1 = bsclusters[map].servers[1]; - int rep2 = bsclusters[map].servers[2]; - bsq_t *reqs[3]; - int rc; - - reqs[0] = reqs[1] = reqs[2] = NULL; - -#ifdef BSDEBUG - fprintf(stderr, - "WRITE: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n", - id, - (unsigned int)((unsigned char *)block)[0], - (unsigned int)((unsigned char *)block)[1], - (unsigned int)((unsigned char *)block)[2], - (unsigned int)((unsigned char *)block)[3], - (unsigned int)((unsigned char *)block)[4], - (unsigned int)((unsigned char *)block)[5], - (unsigned int)((unsigned char *)block)[6], - (unsigned int)((unsigned char *)block)[7]); -#endif - - /* special case for the "superblock" just use the first block on the - * first replica. (extend to blocks < 6 for vdi bug) - */ - if (id < 6) { - reqs[0] = writeblock_indiv(rep0, id, block); - if (!reqs[0]) - return -1; - rc = wait_recv(reqs, 1); - return rc; - } - - reqs[0] = writeblock_indiv(rep0, BSID_REPLICA0(id), block); - if (!reqs[0]) - goto err; - reqs[1] = writeblock_indiv(rep1, BSID_REPLICA1(id), block); - if (!reqs[1]) - goto err; - reqs[2] = writeblock_indiv(rep2, BSID_REPLICA2(id), block); - if (!reqs[2]) - goto err; - - rc = wait_recv(reqs, 3); - if (rc < 0) { - perror("writeblock recv"); - goto err; - } - if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) { - fprintf(stderr, "writeblock server0 error\n"); - goto err; - } - if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) { - fprintf(stderr, "writeblock server1 error\n"); - goto err; - } - if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) { - fprintf(stderr, "writeblock server2 error\n"); - goto err; - } - - - free((void *)reqs[0]); - free((void *)reqs[1]); - free((void *)reqs[2]); - return 0; - - err: - if (reqs[0]) { - dequeue(reqs[0]); - free((void *)reqs[0]); - } - if (reqs[1]) { - dequeue(reqs[1]); - free((void *)reqs[1]); - } - if (reqs[2]) { - dequeue(reqs[2]); - free((void *)reqs[2]); - } - return -1; -} - 
-/***************************************************************************** - * Allocation * - *****************************************************************************/ - -/** - * allocblock: write a new block to disk - * @block: pointer to block - * - * @return: new id of block on disk - */ -uint64_t allocblock(void *block) { - return allocblock_hint(block, 0); -} - -bsq_t *allocblock_hint_indiv(int server, void *block, uint64_t hint) { - bsq_t *qe; - int len; - - qe = (bsq_t *)malloc(sizeof(bsq_t)); - if (!qe) { - perror("allocblock_hint qe malloc"); - goto err; - } - qe->server = server; - - qe->message.operation = BSOP_ALLOCBLOCK; - qe->message.flags = 0; - qe->message.id = hint; - //memcpy(qe->message.block, block, BLOCK_SIZE); - qe->block = block; - qe->length = MSGBUFSIZE_BLOCK; - - if (send_message(qe) < 0) { - perror("allocblock_hint sendto"); - goto err; - } - - return qe; - - err: - free((void *)qe); - return NULL; -} - -/** - * allocblock_hint: write a new block to disk - * @block: pointer to block - * @hint: allocation hint - * - * @return: new id of block on disk - */ -uint64_t allocblock_hint(void *block, uint64_t hint) { - int map = (int)hint; - int rep0 = bsclusters[map].servers[0]; - int rep1 = bsclusters[map].servers[1]; - int rep2 = bsclusters[map].servers[2]; - bsq_t *reqs[3]; - int rc; - uint64_t id0, id1, id2; - - reqs[0] = reqs[1] = reqs[2] = NULL; - - DB("ENTER allocblock\n"); - - reqs[0] = allocblock_hint_indiv(rep0, block, hint); - if (!reqs[0]) - goto err; - reqs[1] = allocblock_hint_indiv(rep1, block, hint); - if (!reqs[1]) - goto err; - reqs[2] = allocblock_hint_indiv(rep2, block, hint); - if (!reqs[2]) - goto err; - - rc = wait_recv(reqs, 3); - if (rc < 0) { - perror("allocblock recv"); - goto err; - } - if ((reqs[0]->message.flags & BSOP_FLAG_ERROR)) { - fprintf(stderr, "allocblock server0 error\n"); - goto err; - } - if ((reqs[1]->message.flags & BSOP_FLAG_ERROR)) { - fprintf(stderr, "allocblock server1 error\n"); - goto 
err; - } - if ((reqs[2]->message.flags & BSOP_FLAG_ERROR)) { - fprintf(stderr, "allocblock server2 error\n"); - goto err; - } - - id0 = reqs[0]->message.id; - id1 = reqs[1]->message.id; - id2 = reqs[2]->message.id; - -#ifdef BSDEBUG - fprintf(stderr, "ALLOC: %016llx %02x%02x %02x%02x %02x%02x %02x%02x\n", - BSID(map, id0, id1, id2), - (unsigned int)((unsigned char *)block)[0], - (unsigned int)((unsigned char *)block)[1], - (unsigned int)((unsigned char *)block)[2], - (unsigned int)((unsigned char *)block)[3], - (unsigned int)((unsigned char *)block)[4], - (unsigned int)((unsigned char *)block)[5], - (unsigned int)((unsigned char *)block)[6], - (unsigned int)((unsigned char *)block)[7]); -#endif - - free((void *)reqs[0]); - free((void *)reqs[1]); - free((void *)reqs[2]); - return BSID(map, id0, id1, id2); - - err: - if (reqs[0]) { - dequeue(reqs[0]); - free((void *)reqs[0]); - } - if (reqs[1]) { - dequeue(reqs[1]); - free((void *)reqs[1]); - } - if (reqs[2]) { - dequeue(reqs[2]); - free((void *)reqs[2]); - } - return 0; -} - -#else /* /BLOCKSTORE_REMOTE */ - -/***************************************************************************** - * Local storage version * - *****************************************************************************/ - -/** - * readblock: read a block from disk - * @id: block id to read - * - * @return: pointer to block, NULL on error - */ - -void *readblock(uint64_t id) { - void *block; - int block_fp; - -//printf("readblock(%llu)\n", id); - block_fp = open("blockstore.dat", O_RDONLY | O_CREAT | O_LARGEFILE, 0644); - - if (block_fp < 0) { - perror("open"); - return NULL; - } - - if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { - printf ("%Ld ", id); - printf ("%Ld\n", (id - 1) * BLOCK_SIZE); - perror("readblock lseek"); - goto err; - } - if ((block = malloc(BLOCK_SIZE)) == NULL) { - perror("readblock malloc"); - goto err; - } - if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) { - perror("readblock read"); - 
free(block); - goto err; - } - close(block_fp); - return block; - -err: - close(block_fp); - return NULL; -} - -/** - * writeblock: write an existing block to disk - * @id: block id - * @block: pointer to block - * - * @return: zero on success, -1 on failure - */ -int writeblock(uint64_t id, void *block) { - - int block_fp; - - block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644); - - if (block_fp < 0) { - perror("open"); - return -1; - } - - if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { - perror("writeblock lseek"); - goto err; - } - if (write(block_fp, block, BLOCK_SIZE) < 0) { - perror("writeblock write"); - goto err; - } - close(block_fp); - return 0; - -err: - close(block_fp); - return -1; -} - -/** - * allocblock: write a new block to disk - * @block: pointer to block - * - * @return: new id of block on disk - */ - -uint64_t allocblock(void *block) { - uint64_t lb; - off64_t pos; - int block_fp; - - block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644); - - if (block_fp < 0) { - perror("open"); - return 0; - } - - pos = lseek64(block_fp, 0, SEEK_END); - if (pos == (off64_t)-1) { - perror("allocblock lseek"); - goto err; - } - if (pos % BLOCK_SIZE != 0) { - fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE); - goto err; - } - if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) { - perror("allocblock write"); - goto err; - } - lb = pos / BLOCK_SIZE + 1; -//printf("alloc(%Ld)\n", lb); - close(block_fp); - return lb; - -err: - close(block_fp); - return 0; - -} - -/** - * allocblock_hint: write a new block to disk - * @block: pointer to block - * @hint: allocation hint - * - * @return: new id of block on disk - */ -uint64_t allocblock_hint(void *block, uint64_t hint) { - return allocblock(block); -} - -#endif /* BLOCKSTORE_REMOTE */ - -/***************************************************************************** - * Memory management * - 
*****************************************************************************/ - -/** - * newblock: get a new in-memory block set to zeros - * - * @return: pointer to new block, NULL on error - */ -void *newblock(void) { - void *block = malloc(BLOCK_SIZE); - if (block == NULL) { - perror("newblock"); - return NULL; - } - memset(block, 0, BLOCK_SIZE); - return block; -} - - -/** - * freeblock: unallocate an in-memory block - * @id: block id (zero if this is only in-memory) - * @block: block to be freed - */ -void freeblock(void *block) { - free(block); -} - -static freeblock_t *new_freeblock(void) -{ - freeblock_t *fb; - - fb = newblock(); - - if (fb == NULL) return NULL; - - fb->magic = FREEBLOCK_MAGIC; - fb->next = 0ULL; - fb->count = 0ULL; - memset(fb->list, 0, sizeof fb->list); - - return fb; -} - -void releaseblock(uint64_t id) -{ - blockstore_super_t *bs_super; - freeblock_t *fl_current; - - /* get superblock */ - bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER); - - /* get freeblock_current */ - if (bs_super->freelist_current == 0ULL) - { - fl_current = new_freeblock(); - bs_super->freelist_current = allocblock(fl_current); - writeblock(BLOCKSTORE_SUPER, bs_super); - } else { - fl_current = readblock(bs_super->freelist_current); - } - - /* if full, chain to superblock and allocate new current */ - - if (fl_current->count == FREEBLOCK_SIZE) { - fl_current->next = bs_super->freelist_full; - writeblock(bs_super->freelist_current, fl_current); - bs_super->freelist_full = bs_super->freelist_current; - freeblock(fl_current); - fl_current = new_freeblock(); - bs_super->freelist_current = allocblock(fl_current); - writeblock(BLOCKSTORE_SUPER, bs_super); - } - - /* append id to current */ - fl_current->list[fl_current->count++] = id; - writeblock(bs_super->freelist_current, fl_current); - - freeblock(fl_current); - freeblock(bs_super); - - -} - -/* freelist debug functions: */ -void freelist_count(int print_each) -{ - blockstore_super_t *bs_super; - 
freeblock_t *fb; - uint64_t total = 0, next; - - bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER); - - if (bs_super->freelist_current == 0ULL) { - printf("freelist is empty!\n"); - return; - } - - fb = readblock(bs_super->freelist_current); - printf("%Ld entires on current.\n", fb->count); - total += fb->count; - if (print_each == 1) - { - int i; - for (i=0; i< fb->count; i++) - printf(" %Ld\n", fb->list[i]); - } - - freeblock(fb); - - if (bs_super->freelist_full == 0ULL) { - printf("freelist_full is empty!\n"); - return; - } - - next = bs_super->freelist_full; - for (;;) { - fb = readblock(next); - total += fb->count; - if (print_each == 1) - { - int i; - for (i=0; i< fb->count; i++) - printf(" %Ld\n", fb->list[i]); - } - next = fb->next; - freeblock(fb); - if (next == 0ULL) break; - } - printf("Total of %Ld ids on freelist.\n", total); -} - -/***************************************************************************** - * Initialisation * - *****************************************************************************/ - -int __init_blockstore(void) -{ - int i; - blockstore_super_t *bs_super; - uint64_t ret; - int block_fp; - -#ifdef BLOCKSTORE_REMOTE - struct hostent *addr; - - pthread_mutex_init(&ptmutex_queue, NULL); - pthread_mutex_init(&ptmutex_luid, NULL); - pthread_mutex_init(&ptmutex_recv, NULL); - /*pthread_mutex_init(&ptmutex_notify, NULL);*/ - for (i = 0; i <= READ_POOL_SIZE; i++) { - pool_thread[i].newdata = 0; - pthread_mutex_init(&(pool_thread[i].ptmutex), NULL); - pthread_cond_init(&(pool_thread[i].ptcv), NULL); - } - - bsservers[0].hostname = "firebug.cl.cam.ac.uk"; - bsservers[1].hostname = "planb.cl.cam.ac.uk"; - bsservers[2].hostname = "simcity.cl.cam.ac.uk"; - bsservers[3].hostname = NULL/*"gunfighter.cl.cam.ac.uk"*/; - bsservers[4].hostname = NULL/*"galaxian.cl.cam.ac.uk"*/; - bsservers[5].hostname = NULL/*"firetrack.cl.cam.ac.uk"*/; - bsservers[6].hostname = NULL/*"funfair.cl.cam.ac.uk"*/; - bsservers[7].hostname = 
NULL/*"felix.cl.cam.ac.uk"*/; - bsservers[8].hostname = NULL; - bsservers[9].hostname = NULL; - bsservers[10].hostname = NULL; - bsservers[11].hostname = NULL; - bsservers[12].hostname = NULL; - bsservers[13].hostname = NULL; - bsservers[14].hostname = NULL; - bsservers[15].hostname = NULL; - - for (i = 0; i < MAX_SERVERS; i++) { - if (!bsservers[i].hostname) - continue; - addr = gethostbyname(bsservers[i].hostname); - if (!addr) { - perror("bad hostname"); - return -1; - } - bsservers[i].sin.sin_family = addr->h_addrtype; - bsservers[i].sin.sin_port = htons(BLOCKSTORED_PORT); - bsservers[i].sin.sin_addr.s_addr = - ((struct in_addr *)(addr->h_addr))->s_addr; - } - - /* Cluster map - */ - bsclusters[0].servers[0] = 0; - bsclusters[0].servers[1] = 1; - bsclusters[0].servers[2] = 2; - bsclusters[1].servers[0] = 1; - bsclusters[1].servers[1] = 2; - bsclusters[1].servers[2] = 3; - bsclusters[2].servers[0] = 2; - bsclusters[2].servers[1] = 3; - bsclusters[2].servers[2] = 4; - bsclusters[3].servers[0] = 3; - bsclusters[3].servers[1] = 4; - bsclusters[3].servers[2] = 5; - bsclusters[4].servers[0] = 4; - bsclusters[4].servers[1] = 5; - bsclusters[4].servers[2] = 6; - bsclusters[5].servers[0] = 5; - bsclusters[5].servers[1] = 6; - bsclusters[5].servers[2] = 7; - bsclusters[6].servers[0] = 6; - bsclusters[6].servers[1] = 7; - bsclusters[6].servers[2] = 0; - bsclusters[7].servers[0] = 7; - bsclusters[7].servers[1] = 0; - bsclusters[7].servers[2] = 1; - - /* Local socket set up - */ - bssock = socket(AF_INET, SOCK_DGRAM, 0); - if (bssock < 0) { - perror("Bad socket"); - return -1; - } - memset(&sin_local, 0, sizeof(sin_local)); - sin_local.sin_family = AF_INET; - sin_local.sin_port = htons(BLOCKSTORED_PORT); - sin_local.sin_addr.s_addr = htonl(INADDR_ANY); - if (bind(bssock, (struct sockaddr *)&sin_local, sizeof(sin_local)) < 0) { - perror("bind"); - close(bssock); - return -1; - } - - pthread_create(&pthread_recv, NULL, receive_loop, NULL); - pthread_create(&pthread_recv, 
NULL, queue_runner, NULL); - -#else /* /BLOCKSTORE_REMOTE */ - block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644); - - if (block_fp < 0) { - perror("open"); - return -1; - exit(-1); - } - - if (lseek(block_fp, 0, SEEK_END) == 0) { - bs_super = newblock(); - bs_super->magic = BLOCKSTORE_MAGIC; - bs_super->freelist_full = 0LL; - bs_super->freelist_current = 0LL; - - ret = allocblock(bs_super); - - freeblock(bs_super); - } else { - bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER); - if (bs_super->magic != BLOCKSTORE_MAGIC) - { - printf("BLOCKSTORE IS CORRUPT! (no magic in superblock!)\n"); - exit(-1); - } - freeblock(bs_super); - } - - close(block_fp); - -#endif /* BLOCKSTORE_REMOTE */ - return 0; -} - -void __exit_blockstore(void) -{ - int i; -#ifdef BLOCKSTORE_REMOTE - pthread_mutex_destroy(&ptmutex_recv); - pthread_mutex_destroy(&ptmutex_luid); - pthread_mutex_destroy(&ptmutex_queue); - /*pthread_mutex_destroy(&ptmutex_notify); - pthread_cond_destroy(&ptcv_notify);*/ - for (i = 0; i <= READ_POOL_SIZE; i++) { - pthread_mutex_destroy(&(pool_thread[i].ptmutex)); - pthread_cond_destroy(&(pool_thread[i].ptcv)); - } -#endif -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/blockstore.h --- a/tools/blktap/parallax/blockstore.h Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,134 +0,0 @@ -/************************************************************************** - * - * blockstore.h - * - * Simple block store interface - * - */ - -#ifndef __BLOCKSTORE_H__ -#define __BLOCKSTORE_H__ - -#include <netinet/in.h> -#include <xenctrl.h> - -#define BLOCK_SIZE 4096 -#define BLOCK_SHIFT 12 -#define BLOCK_MASK 0xfffffffffffff000LL - -/* XXX SMH: where is the below supposed to be defined???? 
*/ -#ifndef SECTOR_SHIFT -#define SECTOR_SHIFT 9 -#endif - -#define FREEBLOCK_SIZE (BLOCK_SIZE / sizeof(uint64_t)) - (3 * sizeof(uint64_t)) -#define FREEBLOCK_MAGIC 0x0fee0fee0fee0feeULL - -typedef struct { - uint64_t magic; - uint64_t next; - uint64_t count; - uint64_t list[FREEBLOCK_SIZE]; -} freeblock_t; - -#define BLOCKSTORE_MAGIC 0xaaaaaaa00aaaaaaaULL -#define BLOCKSTORE_SUPER 1ULL - -typedef struct { - uint64_t magic; - uint64_t freelist_full; - uint64_t freelist_current; -} blockstore_super_t; - -extern void *newblock(); -extern void *readblock(uint64_t id); -extern uint64_t allocblock(void *block); -extern uint64_t allocblock_hint(void *block, uint64_t hint); -extern int writeblock(uint64_t id, void *block); - -/* Add this blockid to a freelist, to be recycled by the allocator. */ -extern void releaseblock(uint64_t id); - -/* this is a memory free() operation for block-sized allocations */ -extern void freeblock(void *block); -extern int __init_blockstore(void); - -/* debug for freelist. 
*/ -void freelist_count(int print_each); -#define ALLOCFAIL (((uint64_t)(-1))) - -/* Distribution - */ -#define BLOCKSTORED_PORT 9346 - -struct bshdr_t_struct { - uint32_t operation; - uint32_t flags; - uint64_t id; - uint64_t luid; -} __attribute__ ((packed)); -typedef struct bshdr_t_struct bshdr_t; - -struct bsmsg_t_struct { - bshdr_t hdr; - unsigned char block[BLOCK_SIZE]; -} __attribute__ ((packed)); - -typedef struct bsmsg_t_struct bsmsg_t; - -#define MSGBUFSIZE_OP sizeof(uint32_t) -#define MSGBUFSIZE_FLAGS (sizeof(uint32_t) + sizeof(uint32_t)) -#define MSGBUFSIZE_ID (sizeof(uint32_t) + sizeof(uint32_t) + sizeof(uint64_t) + sizeof(uint64_t)) -#define MSGBUFSIZE_BLOCK sizeof(bsmsg_t) - -#define BSOP_READBLOCK 0x01 -#define BSOP_WRITEBLOCK 0x02 -#define BSOP_ALLOCBLOCK 0x03 -#define BSOP_FREEBLOCK 0x04 - -#define BSOP_FLAG_ERROR 0x01 - -#define BS_ALLOC_SKIP 10 -#define BS_ALLOC_HACK - -/* Remote hosts and cluster map - XXX need to generalise - */ - -/* - - Interim ID format is - - 63 60 59 40 39 20 19 0 - +----+--------------------+--------------------+--------------------+ - |map | replica 2 | replica 1 | replica 0 | - +----+--------------------+--------------------+--------------------+ - - The map is an index into a table detailing which machines form the - cluster. 
- - */ - -#define BSID_REPLICA0(_id) ((_id)&0xfffffULL) -#define BSID_REPLICA1(_id) (((_id)>>20)&0xfffffULL) -#define BSID_REPLICA2(_id) (((_id)>>40)&0xfffffULL) -#define BSID_MAP(_id) (((_id)>>60)&0xfULL) - -#define BSID(_map, _rep0, _rep1, _rep2) ((((uint64_t)(_map))<<60) | \ - (((uint64_t)(_rep2))<<40) | \ - (((uint64_t)(_rep1))<<20) | ((uint64_t)(_rep0))) - -typedef struct bsserver_t_struct { - char *hostname; - struct sockaddr_in sin; -} bsserver_t; - -#define MAX_SERVERS 16 - -#define CLUSTER_MAX_REPLICAS 3 -typedef struct bscluster_t_struct { - int servers[CLUSTER_MAX_REPLICAS]; -} bscluster_t; - -#define MAX_CLUSTERS 16 - -#endif /* __BLOCKSTORE_H__ */ diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/blockstored.c --- a/tools/blktap/parallax/blockstored.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,275 +0,0 @@ -/************************************************************************** - * - * blockstored.c - * - * Block store daemon. 
- * - */ - -#include <fcntl.h> -#include <unistd.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <sys/socket.h> -#include <sys/ioctl.h> -#include <netinet/in.h> -#include <errno.h> -#include "blockstore.h" - -//#define BSDEBUG - -int readblock_into(uint64_t id, void *block); - -int open_socket(uint16_t port) { - - struct sockaddr_in sn; - int sock; - - sock = socket(AF_INET, SOCK_DGRAM, 0); - if (sock < 0) { - perror("Bad socket"); - return -1; - } - memset(&sn, 0, sizeof(sn)); - sn.sin_family = AF_INET; - sn.sin_port = htons(port); - sn.sin_addr.s_addr = htonl(INADDR_ANY); - if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) { - perror("bind"); - close(sock); - return -1; - } - - return sock; -} - -static int block_fp = -1; -static int bssock = -1; - -int send_reply(struct sockaddr_in *peer, void *buffer, int len) { - - int rc; - -#ifdef BSDEBUG - fprintf(stdout, "TX: %u bytes op=%u id=0x%llx\n", - len, ((bsmsg_t *)buffer)->hdr.operation, ((bsmsg_t *)buffer)->hdr.id); -#endif - rc = sendto(bssock, buffer, len, 0, (struct sockaddr *)peer, sizeof(*peer)); - if (rc < 0) { - perror("send_reply"); - return 1; - } - - - return 0; -} - -static bsmsg_t msgbuf; - -void service_loop(void) { - - for (;;) { - int rc, len; - struct sockaddr_in from; - size_t slen = sizeof(from); - uint64_t bid; - - len = recvfrom(bssock, (void *)&msgbuf, sizeof(msgbuf), 0, - (struct sockaddr *)&from, &slen); - - if (len < 0) { - perror("recvfrom"); - continue; - } - - if (len < MSGBUFSIZE_OP) { - fprintf(stderr, "Short packet.\n"); - continue; - } - -#ifdef BSDEBUG - fprintf(stdout, "RX: %u bytes op=%u id=0x%llx\n", - len, msgbuf.hdr.operation, msgbuf.hdr.id); -#endif - - switch (msgbuf.hdr.operation) { - case BSOP_READBLOCK: - if (len < MSGBUFSIZE_ID) { - fprintf(stderr, "Short packet (readblock %u).\n", len); - continue; - } - rc = readblock_into(msgbuf.hdr.id, msgbuf.block); - if (rc < 0) { - fprintf(stderr, 
"readblock error\n"); - msgbuf.hdr.flags = BSOP_FLAG_ERROR; - send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID); - continue; - } - msgbuf.hdr.flags = 0; - send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_BLOCK); - break; - case BSOP_WRITEBLOCK: - if (len < MSGBUFSIZE_BLOCK) { - fprintf(stderr, "Short packet (writeblock %u).\n", len); - continue; - } - rc = writeblock(msgbuf.hdr.id, msgbuf.block); - if (rc < 0) { - fprintf(stderr, "writeblock error\n"); - msgbuf.hdr.flags = BSOP_FLAG_ERROR; - send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID); - continue; - } - msgbuf.hdr.flags = 0; - send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID); - break; - case BSOP_ALLOCBLOCK: - if (len < MSGBUFSIZE_BLOCK) { - fprintf(stderr, "Short packet (allocblock %u).\n", len); - continue; - } - bid = allocblock(msgbuf.block); - if (bid == ALLOCFAIL) { - fprintf(stderr, "allocblock error\n"); - msgbuf.hdr.flags = BSOP_FLAG_ERROR; - send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID); - continue; - } - msgbuf.hdr.id = bid; - msgbuf.hdr.flags = 0; - send_reply(&from, (void *)&msgbuf, MSGBUFSIZE_ID); - break; - } - - } -} - -/** - * readblock: read a block from disk - * @id: block id to read - * @block: pointer to buffer to receive block - * - * @return: 0 if OK, other on error - */ - -int readblock_into(uint64_t id, void *block) { - if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { - printf ("%Ld\n", (id - 1) * BLOCK_SIZE); - perror("readblock lseek"); - return -1; - } - if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) { - perror("readblock read"); - return -1; - } - return 0; -} - -/** - * writeblock: write an existing block to disk - * @id: block id - * @block: pointer to block - * - * @return: zero on success, -1 on failure - */ -int writeblock(uint64_t id, void *block) { - if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { - perror("writeblock lseek"); - return -1; - } - if (write(block_fp, block, BLOCK_SIZE) < 0) { - perror("writeblock 
write"); - return -1; - } - return 0; -} - -/** - * allocblock: write a new block to disk - * @block: pointer to block - * - * @return: new id of block on disk - */ -static uint64_t lastblock = 0; - -uint64_t allocblock(void *block) { - uint64_t lb; - off64_t pos; - - retry: - pos = lseek64(block_fp, 0, SEEK_END); - if (pos == (off64_t)-1) { - perror("allocblock lseek"); - return ALLOCFAIL; - } - if (pos % BLOCK_SIZE != 0) { - fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE); - return ALLOCFAIL; - } - if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) { - perror("allocblock write"); - return ALLOCFAIL; - } - lb = pos / BLOCK_SIZE + 1; - -#ifdef BS_ALLOC_HACK - if (lb < BS_ALLOC_SKIP) - goto retry; -#endif - - if (lb <= lastblock) - printf("[*** %Ld alredy allocated! ***]\n", lb); - - lastblock = lb; - return lb; -} - -/** - * newblock: get a new in-memory block set to zeros - * - * @return: pointer to new block, NULL on error - */ -void *newblock(void) { - void *block = malloc(BLOCK_SIZE); - if (block == NULL) { - perror("newblock"); - return NULL; - } - memset(block, 0, BLOCK_SIZE); - return block; -} - - -/** - * freeblock: unallocate an in-memory block - * @id: block id (zero if this is only in-memory) - * @block: block to be freed - */ -void freeblock(void *block) { - free(block); -} - - -int main(int argc, char **argv) -{ - block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644); - - if (block_fp < 0) { - perror("open"); - return -1; - } - - bssock = open_socket(BLOCKSTORED_PORT); - if (bssock < 0) { - return -1; - } - - service_loop(); - - close(bssock); - - return 0; -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/bstest.c --- a/tools/blktap/parallax/bstest.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,191 +0,0 @@ -/************************************************************************** - * - * bstest.c - * - * Block store daemon test program. 
- * - * usage: bstest <host>|X {r|w|a} ID - * - */ - -#include <fcntl.h> -#include <unistd.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <sys/socket.h> -#include <sys/ioctl.h> -#include <netinet/in.h> -#include <netdb.h> -#include <errno.h> -#include "blockstore.h" - -int direct(char *host, uint32_t op, uint64_t id, int len) { - struct sockaddr_in sn, peer; - int sock; - bsmsg_t msgbuf; - int rc, slen; - struct hostent *addr; - - addr = gethostbyname(host); - if (!addr) { - perror("bad hostname"); - exit(1); - } - peer.sin_family = addr->h_addrtype; - peer.sin_port = htons(BLOCKSTORED_PORT); - peer.sin_addr.s_addr = ((struct in_addr *)(addr->h_addr))->s_addr; - fprintf(stderr, "Sending to: %u.%u.%u.%u\n", - (unsigned int)(unsigned char)addr->h_addr[0], - (unsigned int)(unsigned char)addr->h_addr[1], - (unsigned int)(unsigned char)addr->h_addr[2], - (unsigned int)(unsigned char)addr->h_addr[3]); - - sock = socket(AF_INET, SOCK_DGRAM, 0); - if (sock < 0) { - perror("Bad socket"); - exit(1); - } - memset(&sn, 0, sizeof(sn)); - sn.sin_family = AF_INET; - sn.sin_port = htons(BLOCKSTORED_PORT); - sn.sin_addr.s_addr = htonl(INADDR_ANY); - if (bind(sock, (struct sockaddr *)&sn, sizeof(sn)) < 0) { - perror("bind"); - close(sock); - exit(1); - } - - memset((void *)&msgbuf, 0, sizeof(msgbuf)); - msgbuf.operation = op; - msgbuf.id = id; - - rc = sendto(sock, (void *)&msgbuf, len, 0, - (struct sockaddr *)&peer, sizeof(peer)); - if (rc < 0) { - perror("sendto"); - exit(1); - } - - slen = sizeof(peer); - len = recvfrom(sock, (void *)&msgbuf, sizeof(msgbuf), 0, - (struct sockaddr *)&peer, &slen); - if (len < 0) { - perror("recvfrom"); - exit(1); - } - - printf("Reply %u bytes:\n", len); - if (len >= MSGBUFSIZE_OP) - printf(" operation: %u\n", msgbuf.operation); - if (len >= MSGBUFSIZE_FLAGS) - printf(" flags: 0x%x\n", msgbuf.flags); - if (len >= MSGBUFSIZE_ID) - printf(" id: %llu\n", msgbuf.id); - if (len 
>= (MSGBUFSIZE_ID + 4)) - printf(" data: %02x %02x %02x %02x...\n", - (unsigned int)msgbuf.block[0], - (unsigned int)msgbuf.block[1], - (unsigned int)msgbuf.block[2], - (unsigned int)msgbuf.block[3]); - - if (sock > 0) - close(sock); - - return 0; -} - -int main (int argc, char **argv) { - - uint32_t op = 0; - uint64_t id = 0; - int len = 0, rc; - void *block; - - if (argc < 3) { - fprintf(stderr, "usage: bstest <host>|X {r|w|a} ID\n"); - return 1; - } - - switch (argv[2][0]) { - case 'r': - case 'R': - op = BSOP_READBLOCK; - len = MSGBUFSIZE_ID; - break; - case 'w': - case 'W': - op = BSOP_WRITEBLOCK; - len = MSGBUFSIZE_BLOCK; - break; - case 'a': - case 'A': - op = BSOP_ALLOCBLOCK; - len = MSGBUFSIZE_BLOCK; - break; - default: - fprintf(stderr, "Unknown action '%s'.\n", argv[2]); - return 1; - } - - if (argc >= 4) - id = atoll(argv[3]); - - if (strcmp(argv[1], "X") == 0) { - rc = __init_blockstore(); - if (rc < 0) { - fprintf(stderr, "blockstore init failed.\n"); - return 1; - } - switch(op) { - case BSOP_READBLOCK: - block = readblock(id); - if (block) { - printf("data: %02x %02x %02x %02x...\n", - (unsigned int)((unsigned char*)block)[0], - (unsigned int)((unsigned char*)block)[1], - (unsigned int)((unsigned char*)block)[2], - (unsigned int)((unsigned char*)block)[3]); - } - break; - case BSOP_WRITEBLOCK: - block = malloc(BLOCK_SIZE); - if (!block) { - perror("bstest malloc"); - return 1; - } - memset(block, 0, BLOCK_SIZE); - rc = writeblock(id, block); - if (rc != 0) { - printf("error\n"); - } - else { - printf("OK\n"); - } - break; - case BSOP_ALLOCBLOCK: - block = malloc(BLOCK_SIZE); - if (!block) { - perror("bstest malloc"); - return 1; - } - memset(block, 0, BLOCK_SIZE); - id = allocblock_hint(block, id); - if (id == 0) { - printf("error\n"); - } - else { - printf("ID: %llu\n", id); - } - break; - } - } - else { - direct(argv[1], op, id, len); - } - - - return 0; -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/parallax.c --- 
a/tools/blktap/parallax/parallax.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,608 +0,0 @@ -/************************************************************************** - * - * parallax.c - * - * The Parallax Storage Server - * - */ - - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <pthread.h> -#include "blktaplib.h" -#include "blockstore.h" -#include "vdi.h" -#include "block-async.h" -#include "requests-async.h" - -#define PARALLAX_DEV 61440 -#define SECTS_PER_NODE 8 - - -#if 0 -#define DPRINTF(_f, _a...) printf ( _f , ## _a ) -#else -#define DPRINTF(_f, _a...) ((void)0) -#endif - -/* ------[ session records ]----------------------------------------------- */ - -#define BLKIF_HASHSZ 1024 -#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1)) - -#define VDI_HASHSZ 16 -#define VDI_HASH(_vd) ((((_vd)>>8)^(_vd))&(VDI_HASHSZ-1)) - -typedef struct blkif { - domid_t domid; - unsigned int handle; - enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; - vdi_t *vdi_hash[VDI_HASHSZ]; - struct blkif *hash_next; -} blkif_t; - -static blkif_t *blkif_hash[BLKIF_HASHSZ]; - -blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle) -{ - if ( handle != 0 ) - printf("blktap/parallax don't currently support non-0 dev handles!\n"); - - blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)]; - while ( (blkif != NULL) && - ((blkif->domid != domid) || (blkif->handle != handle)) ) - blkif = blkif->hash_next; - return blkif; -} - -vdi_t *blkif_get_vdi(blkif_t *blkif, blkif_vdev_t device) -{ - vdi_t *vdi = blkif->vdi_hash[VDI_HASH(device)]; - - while ((vdi != NULL) && (vdi->vdevice != device)) - vdi = vdi->next; - - return vdi; -} - -/* ------[ control message handling ]-------------------------------------- */ - -void blkif_create(blkif_be_create_t *create) -{ - domid_t domid = create->domid; - unsigned int handle = create->blkif_handle; - blkif_t **pblkif, *blkif; - - DPRINTF("parallax (blkif_create): 
create is %p\n", create); - - if ( (blkif = (blkif_t *)malloc(sizeof(blkif_t))) == NULL ) - { - DPRINTF("Could not create blkif: out of memory\n"); - create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; - return; - } - - memset(blkif, 0, sizeof(*blkif)); - blkif->domid = domid; - blkif->handle = handle; - blkif->status = DISCONNECTED; - - pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; - while ( *pblkif != NULL ) - { - if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) ) - { - DPRINTF("Could not create blkif: already exists (%d,%d)\n", - domid, handle); - create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS; - free(blkif); - return; - } - pblkif = &(*pblkif)->hash_next; - } - - blkif->hash_next = *pblkif; - *pblkif = blkif; - - DPRINTF("Successfully created blkif\n"); - create->status = BLKIF_BE_STATUS_OKAY; -} - -void blkif_destroy(blkif_be_destroy_t *destroy) -{ - domid_t domid = destroy->domid; - unsigned int handle = destroy->blkif_handle; - blkif_t **pblkif, *blkif; - - DPRINTF("parallax (blkif_destroy): destroy is %p\n", destroy); - - pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; - while ( (blkif = *pblkif) != NULL ) - { - if ( (blkif->domid == domid) && (blkif->handle == handle) ) - { - if ( blkif->status != DISCONNECTED ) - goto still_connected; - goto destroy; - } - pblkif = &blkif->hash_next; - } - - destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - - still_connected: - destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; - return; - - destroy: - *pblkif = blkif->hash_next; - free(blkif); - destroy->status = BLKIF_BE_STATUS_OKAY; -} - -void vbd_create(blkif_be_vbd_create_t *create) -{ - blkif_t *blkif; - vdi_t *vdi, **vdip; - blkif_vdev_t vdevice = create->vdevice; - - DPRINTF("parallax (vbd_create): create=%p\n", create); - - blkif = blkif_find_by_handle(create->domid, create->blkif_handle); - if ( blkif == NULL ) - { - DPRINTF("vbd_create attempted for non-existent blkif (%u,%u)\n", - create->domid, create->blkif_handle); - 
create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - } - - /* VDI identifier is in grow->extent.sector_start */ - DPRINTF("vbd_create: create->dev_handle (id) is %lx\n", - (unsigned long)create->dev_handle); - - vdi = vdi_get(create->dev_handle); - if (vdi == NULL) - { - printf("parallax (vbd_create): VDI %lx not found.\n", - (unsigned long)create->dev_handle); - create->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; - return; - } - - vdi->next = NULL; - vdi->vdevice = vdevice; - vdip = &blkif->vdi_hash[VDI_HASH(vdevice)]; - while (*vdip != NULL) - vdip = &(*vdip)->next; - *vdip = vdi; - - DPRINTF("blkif_create succeeded\n"); - create->status = BLKIF_BE_STATUS_OKAY; -} - -void vbd_destroy(blkif_be_vbd_destroy_t *destroy) -{ - blkif_t *blkif; - vdi_t *vdi, **vdip; - blkif_vdev_t vdevice = destroy->vdevice; - - blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle); - if ( blkif == NULL ) - { - DPRINTF("vbd_destroy attempted for non-existent blkif (%u,%u)\n", - destroy->domid, destroy->blkif_handle); - destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - } - - vdip = &blkif->vdi_hash[VDI_HASH(vdevice)]; - while ((*vdip != NULL) && ((*vdip)->vdevice != vdevice)) - vdip = &(*vdip)->next; - - if (*vdip != NULL) - { - vdi = *vdip; - *vdip = vdi->next; - vdi_put(vdi); - } - -} - -int parallax_control(control_msg_t *msg) -{ - domid_t domid; - int ret; - - DPRINTF("parallax_control: msg is %p\n", msg); - - if (msg->type != CMSG_BLKIF_BE) - { - printf("Unexpected control message (%d)\n", msg->type); - return 0; - } - - switch(msg->subtype) - { - case CMSG_BLKIF_BE_CREATE: - if ( msg->length != sizeof(blkif_be_create_t) ) - goto parse_error; - blkif_create((blkif_be_create_t *)msg->msg); - break; - - case CMSG_BLKIF_BE_DESTROY: - if ( msg->length != sizeof(blkif_be_destroy_t) ) - goto parse_error; - blkif_destroy((blkif_be_destroy_t *)msg->msg); - break; - - case CMSG_BLKIF_BE_VBD_CREATE: - if ( msg->length != sizeof(blkif_be_vbd_create_t) ) - 
goto parse_error; - vbd_create((blkif_be_vbd_create_t *)msg->msg); - break; - - case CMSG_BLKIF_BE_VBD_DESTROY: - if ( msg->length != sizeof(blkif_be_vbd_destroy_t) ) - goto parse_error; - vbd_destroy((blkif_be_vbd_destroy_t *)msg->msg); - break; - - case CMSG_BLKIF_BE_CONNECT: - case CMSG_BLKIF_BE_DISCONNECT: - /* we don't manage the device channel, the tap does. */ - break; - - default: - goto parse_error; - } - return 0; -parse_error: - printf("Bad control message!\n"); - return 0; - -} - -int parallax_probe(blkif_request_t *req, blkif_t *blkif) -{ - blkif_response_t *rsp; - vdisk_t *img_info; - vdi_t *vdi; - int i, nr_vdis = 0; - - DPRINTF("parallax_probe: req=%p, blkif=%p\n", req, blkif); - - /* We expect one buffer only. */ - if ( req->nr_segments != 1 ) - goto err; - - /* Make sure the buffer is page-sized. */ - if ( (req->seg[0].first_sect != 0) || (req->seg[0].last_sect != 7) ) - goto err; - - /* fill the list of devices */ - for (i=0; i<VDI_HASHSZ; i++) { - vdi = blkif->vdi_hash[i]; - while (vdi) { - img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0); - img_info[nr_vdis].device = vdi->vdevice; - img_info[nr_vdis].info = 0; - /* The -1 here accounts for the LSB in the radix tree */ - img_info[nr_vdis].capacity = - ((1LL << (VDI_HEIGHT-1)) * SECTS_PER_NODE); - nr_vdis++; - vdi = vdi->next; - } - } - - - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_PROBE; - rsp->status = nr_vdis; /* number of disks */ - - DPRINTF("parallax_probe: send positive response (nr_vdis=%d)\n", nr_vdis); - return BLKTAP_RESPOND; -err: - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_PROBE; - rsp->status = BLKIF_RSP_ERROR; - - DPRINTF("parallax_probe: send error response\n"); - return BLKTAP_RESPOND; -} - -typedef struct { - blkif_request_t *req; - int count; - int error; - pthread_mutex_t mutex; -} pending_t; - -#define MAX_REQUESTS 64 -pending_t pending_list[MAX_REQUESTS]; - -struct cb_param { - pending_t 
*pent; - int segment; - uint64_t sector; - uint64_t vblock; /* for debug printing -- can be removed. */ -}; - -static void read_cb(struct io_ret r, void *in_param) -{ - struct cb_param *param = (struct cb_param *)in_param; - pending_t *p = param->pent; - int segment = param->segment; - blkif_request_t *req = p->req; - unsigned long size, offset, start; - char *dpage, *spage; - - spage = IO_BLOCK(r); - if (spage == NULL) { p->error++; goto finish; } - dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), segment); - - /* Calculate read size and offset within the read block. */ - - offset = (param->sector << SECTOR_SHIFT) % BLOCK_SIZE; - size = (req->seg[segment].last_sect - req->seg[segment].first_sect + 1) << - SECTOR_SHIFT; - start = req->seg[segment].first_sect << SECTOR_SHIFT; - - DPRINTF("ParallaxRead: sect: %lld (%ld,%ld), " - "vblock %llx, " - "size %lx\n", - param->sector, - p->req->seg[segment].first_sect, - p->req->seg[segment].last_sect, - param->vblock, size); - - memcpy(dpage + start, spage + offset, size); - freeblock(spage); - - /* Done the read. Now update the pending record. 
*/ - finish: - pthread_mutex_lock(&p->mutex); - p->count--; - - if (p->count == 0) { - blkif_response_t *rsp; - - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_READ; - if (p->error == 0) { - rsp->status = BLKIF_RSP_OKAY; - } else { - rsp->status = BLKIF_RSP_ERROR; - } - blktap_inject_response(rsp); - } - - pthread_mutex_unlock(&p->mutex); - - free(param); /* TODO: replace with cached alloc/dealloc */ -} - -int parallax_read(blkif_request_t *req, blkif_t *blkif) -{ - blkif_response_t *rsp; - uint64_t vblock, gblock; - vdi_t *vdi; - uint64_t sector; - int i; - char *dpage, *spage; - pending_t *pent; - - vdi = blkif_get_vdi(blkif, req->device); - - if ( vdi == NULL ) - goto err; - - pent = &pending_list[ID_TO_IDX(req->id)]; - pent->count = req->nr_segments; - pent->req = req; - pthread_mutex_init(&pent->mutex, NULL); - - for (i = 0; i < req->nr_segments; i++) { - pthread_t tid; - int ret; - struct cb_param *p; - - /* Round the requested segment to a block address. */ - sector = req->sector_number + (8*i); - vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT; - - /* TODO: Replace this call to malloc with a cached allocation */ - p = (struct cb_param *)malloc(sizeof(struct cb_param)); - p->pent = pent; - p->sector = sector; - p->segment = i; - p->vblock = vblock; /* dbg */ - - /* Get that block from the store. */ - vdi_read(vdi, vblock, read_cb, (void *)p); - } - - return BLKTAP_STOLEN; - -err: - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_READ; - rsp->status = BLKIF_RSP_ERROR; - - return BLKTAP_RESPOND; -} - -static void write_cb(struct io_ret r, void *in_param) -{ - struct cb_param *param = (struct cb_param *)in_param; - pending_t *p = param->pent; - blkif_request_t *req = p->req; - - /* catch errors from the block code. 
*/ - if (IO_INT(r) < 0) p->error++; - - pthread_mutex_lock(&p->mutex); - p->count--; - - if (p->count == 0) { - blkif_response_t *rsp; - - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_WRITE; - if (p->error == 0) { - rsp->status = BLKIF_RSP_OKAY; - } else { - rsp->status = BLKIF_RSP_ERROR; - } - blktap_inject_response(rsp); - } - - pthread_mutex_unlock(&p->mutex); - - free(param); /* TODO: replace with cached alloc/dealloc */ -} - -int parallax_write(blkif_request_t *req, blkif_t *blkif) -{ - blkif_response_t *rsp; - uint64_t sector; - int i, writable = 0; - uint64_t vblock, gblock; - char *spage; - unsigned long size, offset, start; - vdi_t *vdi; - pending_t *pent; - - vdi = blkif_get_vdi(blkif, req->device); - - if ( vdi == NULL ) - goto err; - - pent = &pending_list[ID_TO_IDX(req->id)]; - pent->count = req->nr_segments; - pent->req = req; - pthread_mutex_init(&pent->mutex, NULL); - - for (i = 0; i < req->nr_segments; i++) { - struct cb_param *p; - - spage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); - - /* Round the requested segment to a block address. */ - - sector = req->sector_number + (8*i); - vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT; - - /* Calculate read size and offset within the read block. */ - - offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE; - size = (req->seg[i].last_sect - req->seg[i].first_sect + 1) << - SECTOR_SHIFT; - start = req->seg[i].first_sect << SECTOR_SHIFT; - - DPRINTF("ParallaxWrite: sect: %lld (%ld,%ld), " - "vblock %llx, gblock %llx, " - "size %lx\n", - sector, - req->seg[i].first_sect, req->seg[i].last_sect, - vblock, gblock, size); - - /* XXX: For now we just freak out if they try to write a */ - /* non block-sized, block-aligned page. 
*/ - - if ((offset != 0) || (size != BLOCK_SIZE) || (start != 0)) { - printf("]\n] STRANGE WRITE!\n]\n"); - goto err; - } - - /* TODO: Replace this call to malloc with a cached allocation */ - p = (struct cb_param *)malloc(sizeof(struct cb_param)); - p->pent = pent; - p->sector = sector; - p->segment = i; - p->vblock = vblock; /* dbg */ - - /* Issue the write to the store. */ - vdi_write(vdi, vblock, spage, write_cb, (void *)p); - } - - return BLKTAP_STOLEN; - -err: - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_WRITE; - rsp->status = BLKIF_RSP_ERROR; - - return BLKTAP_RESPOND; -} - -int parallax_request(blkif_request_t *req) -{ - blkif_response_t *rsp; - domid_t dom = ID_TO_DOM(req->id); - blkif_t *blkif = blkif_find_by_handle(dom, 0); - - if (blkif == NULL) - goto err; - - if ( req->operation == BLKIF_OP_PROBE ) { - - return parallax_probe(req, blkif); - - } else if ( req->operation == BLKIF_OP_READ ) { - - return parallax_read(req, blkif); - - } else if ( req->operation == BLKIF_OP_WRITE ) { - - return parallax_write(req, blkif); - - } else { - printf("Unknown request message type!\n"); - /* Unknown operation */ - goto err; - } - -err: - rsp = (blkif_response_t *)req; - rsp->operation = req->operation; - rsp->id = req->id; - rsp->status = BLKIF_RSP_ERROR; - return BLKTAP_RESPOND; -} - -void __init_parallax(void) -{ - memset(blkif_hash, 0, sizeof(blkif_hash)); -} - - - -int main(int argc, char *argv[]) -{ - DPRINTF("parallax: starting.\n"); - __init_blockstore(); - DPRINTF("parallax: initialized blockstore...\n"); - init_block_async(); - DPRINTF("parallax: initialized async blocks...\n"); - __init_vdi(); - DPRINTF("parallax: initialized vdi registry etc...\n"); - __init_parallax(); - DPRINTF("parallax: initialized local stuff..\n"); - - blktap_register_ctrl_hook("parallax_control", parallax_control); - blktap_register_request_hook("parallax_request", parallax_request); - DPRINTF("parallax: added ctrl + request hooks, starting 
listen...\n"); - blktap_listen(); - - return 0; -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/radix.c --- a/tools/blktap/parallax/radix.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,631 +0,0 @@ -/* - * Radix tree for mapping (up to) 63-bit virtual block IDs to - * 63-bit global block IDs - * - * Pointers within the tree set aside the least significant bit to indicate - * whther or not the target block is writable from this node. - * - * The block with ID 0 is assumed to be an empty block of all zeros - */ - -#include <unistd.h> -#include <stdio.h> -#include <stdlib.h> -#include <assert.h> -#include <string.h> -#include <pthread.h> -#include "blockstore.h" -#include "radix.h" - -#define RADIX_TREE_MAP_SHIFT 9 -#define RADIX_TREE_MAP_MASK 0x1ff -#define RADIX_TREE_MAP_ENTRIES 512 - -/* -#define DEBUG -*/ - -/* Experimental radix cache. */ - -static pthread_mutex_t rcache_mutex = PTHREAD_MUTEX_INITIALIZER; -static int rcache_count = 0; -#define RCACHE_MAX 1024 - -typedef struct rcache_st { - radix_tree_node *node; - uint64_t id; - struct rcache_st *hash_next; - struct rcache_st *cache_next; - struct rcache_st *cache_prev; -} rcache_t; - -static rcache_t *rcache_head = NULL; -static rcache_t *rcache_tail = NULL; - -#define RCHASH_SIZE 512ULL -rcache_t *rcache[RCHASH_SIZE]; -#define RCACHE_HASH(_id) ((_id) & (RCHASH_SIZE - 1)) - -void __rcache_init(void) -{ - int i; - - for (i=0; i<RCHASH_SIZE; i++) - rcache[i] = NULL; -} - - -void rcache_write(uint64_t id, radix_tree_node *node) -{ - rcache_t *r, *tmp, **curs; - - pthread_mutex_lock(&rcache_mutex); - - /* Is it already in the cache? */ - r = rcache[RCACHE_HASH(id)]; - - for (;;) { - if (r == NULL) - break; - if (r->id == id) - { - memcpy(r->node, node, BLOCK_SIZE); - - /* bring to front. 
*/ - if (r != rcache_head) { - - if (r == rcache_tail) { - if (r->cache_prev != NULL) rcache_tail = r->cache_prev; - rcache_tail->cache_next = NULL; - } - - tmp = r->cache_next; - if (r->cache_next != NULL) r->cache_next->cache_prev - = r->cache_prev; - if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp; - - r->cache_prev = NULL; - r->cache_next = rcache_head; - if (rcache_head != NULL) rcache_head->cache_prev = r; - rcache_head = r; - } - -//printf("Update (%Ld)\n", r->id); - goto done; - } - r = r->hash_next; - } - - if ( rcache_count == RCACHE_MAX ) - { - /* Remove an entry */ - - r = rcache_tail; - if (r->cache_prev != NULL) rcache_tail = r->cache_prev; - rcache_tail->cache_next = NULL; - freeblock(r->node); - - curs = &rcache[RCACHE_HASH(r->id)]; - while ((*curs) != r) - curs = &(*curs)->hash_next; - *curs = r->hash_next; -//printf("Evict (%Ld)\n", r->id); - - } else { - - r = (rcache_t *)malloc(sizeof(rcache_t)); - rcache_count++; - } - - r->node = newblock(); - memcpy(r->node, node, BLOCK_SIZE); - r->id = id; - - r->hash_next = rcache[RCACHE_HASH(id)]; - rcache[RCACHE_HASH(id)] = r; - - r->cache_prev = NULL; - r->cache_next = rcache_head; - if (rcache_head != NULL) rcache_head->cache_prev = r; - rcache_head = r; - if (rcache_tail == NULL) rcache_tail = r; - -//printf("Added (%Ld, %p)\n", id, r->node); -done: - pthread_mutex_unlock(&rcache_mutex); -} - -radix_tree_node *rcache_read(uint64_t id) -{ - rcache_t *r, *tmp; - radix_tree_node *node = NULL; - - pthread_mutex_lock(&rcache_mutex); - - r = rcache[RCACHE_HASH(id)]; - - for (;;) { - if (r == NULL) { -//printf("Miss (%Ld)\n", id); - goto done; - } - if (r->id == id) break; - r = r->hash_next; - } - - /* bring to front. 
*/ - if (r != rcache_head) - { - if (r == rcache_tail) { - if (r->cache_prev != NULL) rcache_tail = r->cache_prev; - rcache_tail->cache_next = NULL; - } - tmp = r->cache_next; - if (r->cache_next != NULL) r->cache_next->cache_prev = r->cache_prev; - if (r->cache_prev != NULL) r->cache_prev->cache_next = tmp; - - r->cache_prev = NULL; - r->cache_next = rcache_head; - if (rcache_head != NULL) rcache_head->cache_prev = r; - rcache_head = r; - } - - node = newblock(); - memcpy(node, r->node, BLOCK_SIZE); - -//printf("Hit (%Ld, %p)\n", id, r->node); -done: - pthread_mutex_unlock(&rcache_mutex); - - return(node); -} - - -void *rc_readblock(uint64_t id) -{ - void *ret; - - ret = (void *)rcache_read(id); - - if (ret != NULL) return ret; - - ret = readblock(id); - - if (ret != NULL) - rcache_write(id, ret); - - return(ret); -} - -uint64_t rc_allocblock(void *block) -{ - uint64_t ret; - - ret = allocblock(block); - - if (ret != ZERO) - rcache_write(ret, block); - - return(ret); -} - -int rc_writeblock(uint64_t id, void *block) -{ - int ret; - - ret = writeblock(id, block); - rcache_write(id, block); - - return(ret); -} - - -/* - * block device interface and other helper functions - * with these functions, block id is just a 63-bit number, with - * no special consideration for the LSB - */ -radix_tree_node cloneblock(radix_tree_node block); - -/* - * main api - * with these functions, the LSB of root always indicates - * whether or not the block is writable, including the return - * values of update and snapshot - */ -uint64_t lookup(int height, uint64_t root, uint64_t key); -uint64_t update(int height, uint64_t root, uint64_t key, uint64_t val); -uint64_t snapshot(uint64_t root); - -/** - * cloneblock: clone an existing block in memory - * @block: the old block - * - * @return: new block, with LSB cleared for every entry - */ -radix_tree_node cloneblock(radix_tree_node block) { - radix_tree_node node = (radix_tree_node) malloc(BLOCK_SIZE); - int i; - if (node == NULL) { - 
perror("cloneblock malloc"); - return NULL; - } - for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) - node[i] = block[i] & ONEMASK; - return node; -} - -/** - * lookup: find a value given a key - * @height: height in bits of the radix tree - * @root: root node id, with set LSB indicating writable node - * @key: key to lookup - * - * @return: value on success, zero on error - */ - -uint64_t lookup(int height, uint64_t root, uint64_t key) { - radix_tree_node node; - uint64_t mask = ONE; - - assert(key >> height == 0); - - /* the root block may be smaller to ensure all leaves are full */ - height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT; - - /* now carve off equal sized chunks at each step */ - for (;;) { - uint64_t oldroot; - -#ifdef DEBUG - printf("lookup: height=%3d root=%3Ld offset=%3d%s\n", height, root, - (int) ((key >> height) & RADIX_TREE_MAP_MASK), - (iswritable(root) ? "" : " (readonly)")); -#endif - - if (getid(root) == ZERO) - return ZERO; - - oldroot = root; - node = (radix_tree_node) rc_readblock(getid(root)); - if (node == NULL) - return ZERO; - - root = node[(key >> height) & RADIX_TREE_MAP_MASK]; - mask &= root; - freeblock(node); - - if (height == 0) - return ( root & ONEMASK ) | mask; - - height -= RADIX_TREE_MAP_SHIFT; - } - - return ZERO; -} - -/* - * update: set a radix tree entry, doing copy-on-write as necessary - * @height: height in bits of the radix tree - * @root: root node id, with set LSB indicating writable node - * @key: key to set - * @val: value to set, s.t. 
radix(key)=val - * - * @returns: (possibly new) root id on success (with LSB=1), 0 on failure - */ - -uint64_t update(int height, uint64_t root, uint64_t key, uint64_t val) { - int offset; - uint64_t child; - radix_tree_node node; - - /* base case--return val */ - if (height == 0) - return val; - - /* the root block may be smaller to ensure all leaves are full */ - height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT; - offset = (key >> height) & RADIX_TREE_MAP_MASK; - -#ifdef DEBUG - printf("update: height=%3d root=%3Ld offset=%3d%s\n", height, root, - offset, (iswritable(root)?"":" (clone)")); -#endif - - /* load a block, or create a new one */ - if (root == ZERO) { - node = (radix_tree_node) newblock(); - } else { - node = (radix_tree_node) rc_readblock(getid(root)); - - if (!iswritable(root)) { - /* need to clone this node */ - radix_tree_node oldnode = node; - node = cloneblock(node); - freeblock(oldnode); - root = ZERO; - } - } - - if (node == NULL) { -#ifdef DEBUG - printf("update: node is null!\n"); -#endif - return ZERO; - } - - child = update(height, node[offset], key, val); - - if (child == ZERO) { - freeblock(node); - return ZERO; - } else if (child == node[offset]) { - /* no change, so we already owned the child */ - assert(iswritable(root)); - - freeblock(node); - return root; - } - - node[offset] = child; - - /* new/cloned blocks need to be saved */ - if (root == ZERO) { - /* mark this as an owned block */ - root = rc_allocblock(node); - if (root) - root = writable(root); - } else if (rc_writeblock(getid(root), node) < 0) { - freeblock(node); - return ZERO; - } - - freeblock(node); - return root; -} - -/** - * snapshot: create a snapshot - * @root: old root node - * - * @return: new root node, 0 on error - */ -uint64_t snapshot(uint64_t root) { - radix_tree_node node, newnode; - - if ((node = rc_readblock(getid(root))) == NULL) - return ZERO; - - newnode = cloneblock(node); - freeblock(node); - if (newnode == NULL) - return ZERO; - - 
root = rc_allocblock(newnode); - freeblock(newnode); - - if (root == ZERO) - return ZERO; - else - return writable(root); -} - -/** - * collapse: collapse a parent onto a child. - * - * NOTE: This assumes that parent and child really are, and further that - * there are no other children forked from this parent. (children of the - * child are okay...) - */ - -int collapse(int height, uint64_t proot, uint64_t croot) -{ - int i, numlinks, ret, total = 0; - radix_tree_node pnode, cnode; - - if (height == 0) { - height = -1; /* terminate recursion */ - } else { - height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT; - } - numlinks = (1UL << RADIX_TREE_MAP_SHIFT); - - /* Terminal cases: */ - - if ( (getid(proot) == ZERO) || (getid(croot) == ZERO) ) - return -1; - - /* get roots */ - if ((pnode = readblock(getid(proot))) == NULL) - return -1; - - if ((cnode = readblock(getid(croot))) == NULL) - { - freeblock(pnode); - return -1; - } - - /* For each writable link in proot */ - for (i=0; i<numlinks; i++) - { - if ( pnode[i] == cnode[i] ) continue; - - /* collapse (next level) */ - /* if height != 0 and writable... 
*/ - if (( height >= 0 ) && ( iswritable(pnode[i]) ) ) - { - //printf(" %Ld is writable (i=%d).\n", getid(pnode[i]), i); - ret = collapse(height, pnode[i], cnode[i]); - if (ret == -1) - { - total = -1; - } else { - total += ret; - } - } - - - } - - /* if plink is writable, AND clink is writable -> free plink block */ - if ( ( iswritable(proot) ) && ( iswritable(croot) ) ) - { - releaseblock(getid(proot)); - if (ret >=0) total++; - //printf(" Delete %Ld\n", getid(proot)); - } -//printf("done : %Ld\n", getid(proot)); - return total; - -} - - -void print_root(uint64_t root, int height, FILE *dot_f) -{ - FILE *f; - int i; - radix_tree_node node; - char *style[2] = { "", "style=bold,color=blue," }; - - if (dot_f == NULL) { - f = fopen("radix.dot", "w"); - if (f == NULL) { - perror("print_root: open"); - return; - } - - /* write graph preamble */ - fprintf(f, "digraph G {\n"); - - /* add a node for this root. */ - fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n", - getid(root), style[iswritable(root)], getid(root)); - } - - printf("print_root(%Ld)\n", getid(root)); - - /* base case */ - if (height == 0) { - /* add a node and edge for each child root */ - node = (radix_tree_node) readblock(getid(root)); - if (node == NULL) - return; - - for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) { - if (node[i] != ZERO) { - fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n", - getid(node[i]), style[iswritable(node[i])], - getid(node[i])); - fprintf(f, " n%Ld -> n%Ld [label=\"%d\"]\n", getid(root), - getid(node[i]), i); - } - } - freeblock(node); - return; - } - - /* the root block may be smaller to ensure all leaves are full */ - height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT; - - if (getid(root) == ZERO) - return; - - node = (radix_tree_node) readblock(getid(root)); - if (node == NULL) - return; - - /* add a node and edge for each child root */ - for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) - if (node[i] != ZERO) { - fprintf(f, " n%Ld 
[%sshape=box,label=\"%Ld\"];\n", - getid(node[i]), style[iswritable(node[i])], - getid(node[i])); - - print_root(node[i], height-RADIX_TREE_MAP_SHIFT, f); - fprintf(f, " n%Ld -> n%Ld [label=\"%d\"]\n", getid(root), - getid(node[i]), i); - } - - freeblock(node); - - /* write graph postamble */ - if (dot_f == NULL) { - fprintf(f, "}\n"); - fclose(f); - } -} - -#ifdef RADIX_STANDALONE - -int main(int argc, char **argv) { - uint64_t key = ZERO, val = ZERO; - uint64_t root = writable(2ULL); - uint64_t p = ZERO, c = ZERO; - int v; - char buff[4096]; - - __init_blockstore(); - - memset(buff, 0, 4096); - /*fp = open("radix.dat", O_RDWR | O_CREAT, 0644); - - if (fp < 3) { - perror("open"); - return -1; - } - if (lseek(fp, 0, SEEK_END) == 0) { - write(fp, buff, 4096); - }*/ - - allocblock(buff); - - printf("Recognized commands:\n" - "Note: the LSB of a node number indicates if it is writable\n" - " root <node> set root to <node>\n" - " snapshot take a snapshot of the root\n" - " set <key> <val> set key=val\n" - " get <key> query key\n" - " c <proot> <croot> collapse\n" - " pr print tree to dot\n" - " pf <1=verbose> print freelist\n" - " quit\n" - "\nroot = %Ld\n", root); - for (;;) { - //print_root(root, 34, NULL); - //system("dot radix.dot -Tps -o radix.ps"); - - printf("> "); - fflush(stdout); - fgets(buff, 1024, stdin); - if (feof(stdin)) - break; - if (sscanf(buff, " root %Ld", &root) == 1) { - printf("root set to %Ld\n", root); - } else if (sscanf(buff, " set %Ld %Ld", &key, &val) == 2) { - root = update(34, root, key, val); - printf("root = %Ld\n", root); - } else if (sscanf(buff, " c %Ld %Ld", &p, &c) == 2) { - v = collapse(34, p, c); - printf("reclaimed %d blocks.\n", v); - } else if (sscanf(buff, " get %Ld", &key) == 1) { - val = lookup(34, root, key); - printf("value = %Ld\n", val); - } else if (!strcmp(buff, "quit\n")) { - break; - } else if (!strcmp(buff, "snapshot\n")) { - root = snapshot(root); - printf("new root = %Ld\n", root); - } else if (sscanf(buff, " pr 
%Ld", &root) == 1) { - print_root(root, 34, NULL); - } else if (sscanf(buff, " pf %d", &v) == 1) { - freelist_count(v); - } else if (!strcmp(buff, "pf\n")) { - freelist_count(0); - } else { - printf("command not recognized\n"); - } - } - return 0; -} - -#endif diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/radix.h --- a/tools/blktap/parallax/radix.h Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ -/* - * Radix tree for mapping (up to) 63-bit virtual block IDs to - * 63-bit global block IDs - * - * Pointers within the tree set aside the least significant bit to indicate - * whther or not the target block is writable from this node. - * - * The block with ID 0 is assumed to be an empty block of all zeros - */ - -#ifndef __RADIX_H__ -#define __RADIX_H__ - -/* I don't really like exposing these, but... */ -#define getid(x) (((x)>>1)&0x7fffffffffffffffLL) -#define putid(x) ((x)<<1) -#define writable(x) (((x)<<1)|1LL) -#define iswritable(x) ((x)&1LL) -#define ZERO 0LL -#define ONE 1LL -#define ONEMASK 0xffffffffffffffeLL - -#define RADIX_TREE_MAP_SHIFT 9 -#define RADIX_TREE_MAP_MASK 0x1ff -#define RADIX_TREE_MAP_ENTRIES 512 - -typedef uint64_t *radix_tree_node; - - -/* - * main api - * with these functions, the LSB of root always indicates - * whether or not the block is writable, including the return - * values of update and snapshot - */ -uint64_t lookup(int height, uint64_t root, uint64_t key); -uint64_t update(int height, uint64_t root, uint64_t key, uint64_t val); -uint64_t snapshot(uint64_t root); -int collapse(int height, uint64_t proot, uint64_t croot); -int isprivate(int height, uint64_t root, uint64_t key); - - -void __rcache_init(void); - -#endif /* __RADIX_H__ */ diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/requests-async.c --- a/tools/blktap/parallax/requests-async.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,762 +0,0 @@ -/* requests-async.c - * - * 
asynchronous request dispatcher for radix access in parallax. - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <ctype.h> -#include <assert.h> -#include <pthread.h> -#include <err.h> -#include <zlib.h> /* for crc32() */ -#include "requests-async.h" -#include "vdi.h" -#include "radix.h" - -#define L1_IDX(_a) (((_a) & 0x0000000007fc0000ULL) >> 18) -#define L2_IDX(_a) (((_a) & 0x000000000003fe00ULL) >> 9) -#define L3_IDX(_a) (((_a) & 0x00000000000001ffULL)) - - -#if 0 -#define DPRINTF(_f, _a...) printf ( _f , ## _a ) -#else -#define DPRINTF(_f, _a...) ((void)0) -#endif - -struct block_info { - uint32_t crc; - uint32_t unused; -}; - -struct io_req { - enum { IO_OP_READ, IO_OP_WRITE } op; - uint64_t root; - uint64_t vaddr; - int state; - io_cb_t cb; - void *param; - struct radix_lock *lock; - - /* internal stuff: */ - struct io_ret retval;/* holds the return while we unlock. */ - char *block; /* the block to write */ - radix_tree_node radix[3]; - uint64_t radix_addr[3]; - struct block_info bi; -}; - -void clear_w_bits(radix_tree_node node) -{ - int i; - for (i=0; i<RADIX_TREE_MAP_ENTRIES; i++) - node[i] = node[i] & ONEMASK; - return; -} - -void clear_L3_w_bits(radix_tree_node node) -{ - int i; - for (i=0; i<RADIX_TREE_MAP_ENTRIES; i+=2) - node[i] = node[i] & ONEMASK; - return; -} - -enum states { - /* both */ - READ_L1, - READ_L2, - READ_L3, - - /* read */ - READ_LOCKED, - READ_DATA, - READ_UNLOCKED, - RETURN_ZERO, - - /* write */ - WRITE_LOCKED, - WRITE_DATA, - WRITE_L3, - WRITE_UNLOCKED, - - /* L3 Zero Path */ - ALLOC_DATA_L3z, - WRITE_L3_L3z, - - /* L3 Fault Path */ - ALLOC_DATA_L3f, - WRITE_L3_L3f, - - /* L2 Zero Path */ - ALLOC_DATA_L2z, - WRITE_L2_L2z, - ALLOC_L3_L2z, - WRITE_L2_L3z, - - /* L2 Fault Path */ - READ_L3_L2f, - ALLOC_DATA_L2f, - WRITE_L2_L2f, - ALLOC_L3_L2f, - WRITE_L2_L3f, - - /* L1 Zero Path */ - ALLOC_DATA_L1z, - ALLOC_L3_L1z, - ALLOC_L2_L1z, - WRITE_L1_L1z, - - /* L1 Fault Path */ - READ_L2_L1f, - READ_L3_L1f, - 
ALLOC_DATA_L1f, - ALLOC_L3_L1f, - ALLOC_L2_L1f, - WRITE_L1_L1f, - -}; - -enum radix_offsets { - L1 = 0, - L2 = 1, - L3 = 2 -}; - - -static void read_cb(struct io_ret ret, void *param); -static void write_cb(struct io_ret ret, void *param); - -int vdi_read(vdi_t *vdi, uint64_t vaddr, io_cb_t cb, void *param) -{ - struct io_req *req; - - if (!VALID_VADDR(vaddr)) return ERR_BAD_VADDR; - /* Every second line in the bottom-level radix tree is used to */ - /* store crc32 values etc. We shift the vadder here to achied this. */ - vaddr <<= 1; - - req = (struct io_req *)malloc(sizeof (struct io_req)); - if (req == NULL) return ERR_NOMEM; - - req->radix[0] = req->radix[1] = req->radix[2] = NULL; - req->op = IO_OP_READ; - req->root = vdi->radix_root; - req->lock = vdi->radix_lock; - req->vaddr = vaddr; - req->cb = cb; - req->param = param; - req->state = READ_LOCKED; - - block_rlock(req->lock, L1_IDX(vaddr), read_cb, req); - - return 0; -} - - -int vdi_write(vdi_t *vdi, uint64_t vaddr, char *block, - io_cb_t cb, void *param) -{ - struct io_req *req; - - if (!VALID_VADDR(vaddr)) return ERR_BAD_VADDR; - /* Every second line in the bottom-level radix tree is used to */ - /* store crc32 values etc. We shift the vadder here to achied this. */ - vaddr <<= 1; - - req = (struct io_req *)malloc(sizeof (struct io_req)); - if (req == NULL) return ERR_NOMEM; - - req->radix[0] = req->radix[1] = req->radix[2] = NULL; - req->op = IO_OP_WRITE; - req->root = vdi->radix_root; - req->lock = vdi->radix_lock; - req->vaddr = vaddr; - req->block = block; - /* Todo: add a pseodoheader to the block to include some location */ - /* information in the CRC as well. 
*/ - req->bi.crc = (uint32_t) crc32(0L, Z_NULL, 0); - req->bi.crc = (uint32_t) crc32(req->bi.crc, block, BLOCK_SIZE); - req->bi.unused = 0xdeadbeef; - - req->cb = cb; - req->param = param; - req->radix_addr[L1] = getid(req->root); /* for consistency */ - req->state = WRITE_LOCKED; - - block_wlock(req->lock, L1_IDX(vaddr), write_cb, req); - - - return 0; -} - -static void read_cb(struct io_ret ret, void *param) -{ - struct io_req *req = (struct io_req *)param; - radix_tree_node node; - uint64_t idx; - char *block; - void *req_param; - - DPRINTF("read_cb\n"); - /* get record */ - switch(req->state) { - - case READ_LOCKED: - - DPRINTF("READ_LOCKED\n"); - req->state = READ_L1; - block_read(getid(req->root), read_cb, req); - break; - - case READ_L1: /* block is the radix root */ - - DPRINTF("READ_L1\n"); - block = IO_BLOCK(ret); - if (block == NULL) goto fail; - node = (radix_tree_node) block; - idx = getid( node[L1_IDX(req->vaddr)] ); - free(block); - if ( idx == ZERO ) { - req->state = RETURN_ZERO; - block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req); - } else { - req->state = READ_L2; - block_read(idx, read_cb, req); - } - break; - - case READ_L2: - - DPRINTF("READ_L2\n"); - block = IO_BLOCK(ret); - if (block == NULL) goto fail; - node = (radix_tree_node) block; - idx = getid( node[L2_IDX(req->vaddr)] ); - free(block); - if ( idx == ZERO ) { - req->state = RETURN_ZERO; - block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req); - } else { - req->state = READ_L3; - block_read(idx, read_cb, req); - } - break; - - case READ_L3: - { - struct block_info *bi; - - DPRINTF("READ_L3\n"); - block = IO_BLOCK(ret); - if (block == NULL) goto fail; - node = (radix_tree_node) block; - idx = getid( node[L3_IDX(req->vaddr)] ); - bi = (struct block_info *) &node[L3_IDX(req->vaddr) + 1]; - req->bi = *bi; - free(block); - if ( idx == ZERO ) { - req->state = RETURN_ZERO; - block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req); - } else { - req->state = READ_DATA; - 
block_read(idx, read_cb, req); - } - break; - } - case READ_DATA: - { - uint32_t crc; - - DPRINTF("READ_DATA\n"); - block = IO_BLOCK(ret); - if (block == NULL) goto fail; - - /* crc check */ - crc = (uint32_t) crc32(0L, Z_NULL, 0); - crc = (uint32_t) crc32(crc, block, BLOCK_SIZE); - if (crc != req->bi.crc) { - /* TODO: add a retry loop here. */ - /* Do this after the cache is added -- make sure to */ - /* invalidate the bad page before reissuing the read. */ - - warn("Bad CRC on vaddr (%Lu:%d)\n", req->vaddr, req->bi.unused); -#ifdef PRINT_BADCRC_PAGES - { - int j; - for (j=0; j<BLOCK_SIZE; j++) { - if isprint(block[j]) { - printf("%c", block[j]); - } else { - printf("."); - } - if ((j % 64) == 0) printf("\n"); - } - } -#endif /* PRINT_BADCRC_PAGES */ - - /* fast and loose for the moment. */ - /* goto fail; */ - } - - req->retval = ret; - req->state = READ_UNLOCKED; - block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req); - break; - } - case READ_UNLOCKED: - { - struct io_ret r; - io_cb_t cb; - DPRINTF("READ_UNLOCKED\n"); - req_param = req->param; - r = req->retval; - cb = req->cb; - free(req); - cb(r, req_param); - break; - } - - case RETURN_ZERO: - { - struct io_ret r; - io_cb_t cb; - DPRINTF("RETURN_ZERO\n"); - req_param = req->param; - cb = req->cb; - free(req); - r.type = IO_BLOCK_T; - r.u.b = newblock(); - cb(r, req_param); - break; - } - - default: - DPRINTF("*** Write: Bad state! 
(%d) ***\n", req->state); - goto fail; - } - - return; - - fail: - { - struct io_ret r; - io_cb_t cb; - DPRINTF("asyn_read had a read error.\n"); - req_param = req->param; - r = ret; - cb = req->cb; - free(req); - cb(r, req_param); - } - - -} - -static void write_cb(struct io_ret r, void *param) -{ - struct io_req *req = (struct io_req *)param; - radix_tree_node node; - uint64_t a, addr; - void *req_param; - struct block_info *bi; - - switch(req->state) { - - case WRITE_LOCKED: - - DPRINTF("WRITE_LOCKED (%llu)\n", L1_IDX(req->vaddr)); - req->state = READ_L1; - block_read(getid(req->root), write_cb, req); - break; - - case READ_L1: /* block is the radix root */ - - DPRINTF("READ_L1\n"); - node = (radix_tree_node) IO_BLOCK(r); - if (node == NULL) goto fail; - a = node[L1_IDX(req->vaddr)]; - addr = getid(a); - - req->radix_addr[L2] = addr; - req->radix[L1] = node; - - if ( addr == ZERO ) { - /* L1 empty subtree: */ - req->state = ALLOC_DATA_L1z; - block_alloc( req->block, write_cb, req ); - } else if ( !iswritable(a) ) { - /* L1 fault: */ - req->state = READ_L2_L1f; - block_read( addr, write_cb, req ); - } else { - req->state = READ_L2; - block_read( addr, write_cb, req ); - } - break; - - case READ_L2: - - DPRINTF("READ_L2\n"); - node = (radix_tree_node) IO_BLOCK(r); - if (node == NULL) goto fail; - a = node[L2_IDX(req->vaddr)]; - addr = getid(a); - - req->radix_addr[L3] = addr; - req->radix[L2] = node; - - if ( addr == ZERO ) { - /* L2 empty subtree: */ - req->state = ALLOC_DATA_L2z; - block_alloc( req->block, write_cb, req ); - } else if ( !iswritable(a) ) { - /* L2 fault: */ - req->state = READ_L3_L2f; - block_read( addr, write_cb, req ); - } else { - req->state = READ_L3; - block_read( addr, write_cb, req ); - } - break; - - case READ_L3: - - DPRINTF("READ_L3\n"); - node = (radix_tree_node) IO_BLOCK(r); - if (node == NULL) goto fail; - a = node[L3_IDX(req->vaddr)]; - addr = getid(a); - - req->radix[L3] = node; - - if ( addr == ZERO ) { - /* L3 fault: */ - 
req->state = ALLOC_DATA_L3z; - block_alloc( req->block, write_cb, req ); - } else if ( !iswritable(a) ) { - /* L3 fault: */ - req->state = ALLOC_DATA_L3f; - block_alloc( req->block, write_cb, req ); - } else { - req->state = WRITE_DATA; - block_write( addr, req->block, write_cb, req ); - } - break; - - case WRITE_DATA: - - DPRINTF("WRITE_DATA\n"); - /* The L3 radix points to the correct block, we just need to */ - /* update the crc. */ - if (IO_INT(r) < 0) goto fail; - bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1]; - req->bi.unused = 101; - *bi = req->bi; - req->state = WRITE_L3; - block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req); - break; - - /* L3 Zero Path: */ - - case ALLOC_DATA_L3z: - - DPRINTF("ALLOC_DATA_L3z\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L3][L3_IDX(req->vaddr)] = a; - bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1]; - req->bi.unused = 102; - *bi = req->bi; - req->state = WRITE_L3_L3z; - block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req); - break; - - /* L3 Fault Path: */ - - case ALLOC_DATA_L3f: - - DPRINTF("ALLOC_DATA_L3f\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L3][L3_IDX(req->vaddr)] = a; - bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1]; - req->bi.unused = 103; - *bi = req->bi; - req->state = WRITE_L3_L3f; - block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req); - break; - - /* L2 Zero Path: */ - - case ALLOC_DATA_L2z: - - DPRINTF("ALLOC_DATA_L2z\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L3] = newblock(); - req->radix[L3][L3_IDX(req->vaddr)] = a; - bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1]; - req->bi.unused = 104; - *bi = req->bi; - req->state = ALLOC_L3_L2z; - block_alloc( (char*)req->radix[L3], write_cb, req ); - break; - - case ALLOC_L3_L2z: - - DPRINTF("ALLOC_L3_L2z\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L2][L2_IDX(req->vaddr)] 
= a; - req->state = WRITE_L2_L2z; - block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req); - break; - - /* L2 Fault Path: */ - - case READ_L3_L2f: - - DPRINTF("READ_L3_L2f\n"); - node = (radix_tree_node) IO_BLOCK(r); - clear_L3_w_bits(node); - if (node == NULL) goto fail; - a = node[L2_IDX(req->vaddr)]; - addr = getid(a); - - req->radix[L3] = node; - req->state = ALLOC_DATA_L2f; - block_alloc( req->block, write_cb, req ); - break; - - case ALLOC_DATA_L2f: - - DPRINTF("ALLOC_DATA_L2f\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L3][L3_IDX(req->vaddr)] = a; - bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1]; - req->bi.unused = 105; - *bi = req->bi; - req->state = ALLOC_L3_L2f; - block_alloc( (char*)req->radix[L3], write_cb, req ); - break; - - case ALLOC_L3_L2f: - - DPRINTF("ALLOC_L3_L2f\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L2][L2_IDX(req->vaddr)] = a; - req->state = WRITE_L2_L2f; - block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req); - break; - - /* L1 Zero Path: */ - - case ALLOC_DATA_L1z: - - DPRINTF("ALLOC_DATA_L1z\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L3] = newblock(); - req->radix[L3][L3_IDX(req->vaddr)] = a; - bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1]; - req->bi.unused = 106; - *bi = req->bi; - req->state = ALLOC_L3_L1z; - block_alloc( (char*)req->radix[L3], write_cb, req ); - break; - - case ALLOC_L3_L1z: - - DPRINTF("ALLOC_L3_L1z\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L2] = newblock(); - req->radix[L2][L2_IDX(req->vaddr)] = a; - req->state = ALLOC_L2_L1z; - block_alloc( (char*)req->radix[L2], write_cb, req ); - break; - - case ALLOC_L2_L1z: - - DPRINTF("ALLOC_L2_L1z\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L1][L1_IDX(req->vaddr)] = a; - req->state = WRITE_L1_L1z; - block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req); - break; - - /* L1 Fault Path: */ - - case 
READ_L2_L1f: - - DPRINTF("READ_L2_L1f\n"); - node = (radix_tree_node) IO_BLOCK(r); - clear_w_bits(node); - if (node == NULL) goto fail; - a = node[L2_IDX(req->vaddr)]; - addr = getid(a); - - req->radix_addr[L3] = addr; - req->radix[L2] = node; - - if (addr == ZERO) { - /* nothing below L2, create an empty L3 and alloc data. */ - /* (So skip READ_L3_L1f.) */ - req->radix[L3] = newblock(); - req->state = ALLOC_DATA_L1f; - block_alloc( req->block, write_cb, req ); - } else { - req->state = READ_L3_L1f; - block_read( addr, write_cb, req ); - } - break; - - case READ_L3_L1f: - - DPRINTF("READ_L3_L1f\n"); - node = (radix_tree_node) IO_BLOCK(r); - clear_L3_w_bits(node); - if (node == NULL) goto fail; - a = node[L2_IDX(req->vaddr)]; - addr = getid(a); - - req->radix[L3] = node; - req->state = ALLOC_DATA_L1f; - block_alloc( req->block, write_cb, req ); - break; - - case ALLOC_DATA_L1f: - - DPRINTF("ALLOC_DATA_L1f\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L3][L3_IDX(req->vaddr)] = a; - bi = (struct block_info *) &req->radix[L3][L3_IDX(req->vaddr)+1]; - req->bi.unused = 107; - *bi = req->bi; - req->state = ALLOC_L3_L1f; - block_alloc( (char*)req->radix[L3], write_cb, req ); - break; - - case ALLOC_L3_L1f: - - DPRINTF("ALLOC_L3_L1f\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L2][L2_IDX(req->vaddr)] = a; - req->state = ALLOC_L2_L1f; - block_alloc( (char*)req->radix[L2], write_cb, req ); - break; - - case ALLOC_L2_L1f: - - DPRINTF("ALLOC_L2_L1f\n"); - addr = IO_ADDR(r); - a = writable(addr); - req->radix[L1][L1_IDX(req->vaddr)] = a; - req->state = WRITE_L1_L1f; - block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req); - break; - - case WRITE_L3: - case WRITE_L3_L3z: - case WRITE_L3_L3f: - case WRITE_L2_L2z: - case WRITE_L2_L2f: - case WRITE_L1_L1z: - case WRITE_L1_L1f: - { - int i; - DPRINTF("DONE\n"); - /* free any saved node vals. 
*/ - for (i=0; i<3; i++) - if (req->radix[i] != 0) free(req->radix[i]); - req->retval = r; - req->state = WRITE_UNLOCKED; - block_wunlock(req->lock, L1_IDX(req->vaddr), write_cb, req); - break; - } - case WRITE_UNLOCKED: - { - struct io_ret r; - io_cb_t cb; - DPRINTF("WRITE_UNLOCKED!\n"); - req_param = req->param; - r = req->retval; - cb = req->cb; - free(req); - cb(r, req_param); - break; - } - - default: - DPRINTF("*** Write: Bad state! (%d) ***\n", req->state); - goto fail; - } - - return; - - fail: - { - struct io_ret r; - io_cb_t cb; - int i; - - DPRINTF("asyn_write had a read error mid-way.\n"); - req_param = req->param; - cb = req->cb; - r.type = IO_INT_T; - r.u.i = -1; - /* free any saved node vals. */ - for (i=0; i<3; i++) - free(req->radix[i]); - free(req); - cb(r, req_param); - } -} - -char *vdi_read_s(vdi_t *vdi, uint64_t vaddr) -{ - pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; - char *block = NULL; - int ret; - - void reads_cb(struct io_ret r, void *param) - { - block = IO_BLOCK(r); - pthread_mutex_unlock((pthread_mutex_t *)param); - } - - pthread_mutex_lock(&m); - ret = vdi_read(vdi, vaddr, reads_cb, &m); - - if (ret == 0) pthread_mutex_lock(&m); - - return block; -} - - -int vdi_write_s(vdi_t *vdi, uint64_t vaddr, char *block) -{ - pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; - int ret, result; - - void writes_cb(struct io_ret r, void *param) - { - result = IO_INT(r); - pthread_mutex_unlock((pthread_mutex_t *)param); - } - - pthread_mutex_lock(&m); - ret = vdi_write(vdi, vaddr, block, writes_cb, &m); - - if (ret == 0) pthread_mutex_lock(&m); - - return result; -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/requests-async.h --- a/tools/blktap/parallax/requests-async.h Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ -#ifndef _REQUESTSASYNC_H_ -#define _REQUESTSASYNC_H_ - -#include "block-async.h" -#include "blockstore.h" /* for newblock etc. 
*/ - -/* -#define BLOCK_SIZE 4096 -#define ZERO 0ULL -#define getid(x) (((x)>>1)&0x7fffffffffffffffLLU) -#define iswritable(x) (((x) & 1LLU) != 0) -#define writable(x) (((x) << 1) | 1LLU) -#define readonly(x) ((uint64_t)((x) << 1)) -*/ - -#define VADDR_MASK 0x0000000003ffffffLLU /* 26-bits = 256Gig */ -#define VALID_VADDR(x) (((x) & VADDR_MASK) == (x)) - -int vdi_read (vdi_t *vdi, uint64_t vaddr, io_cb_t cb, void *param); -int vdi_write(vdi_t *vdi, uint64_t vaddr, char *block, io_cb_t cb, void *param); - -/* synchronous versions: */ -char *vdi_read_s (vdi_t *vdi, uint64_t vaddr); -int vdi_write_s(vdi_t *vdi, uint64_t vaddr, char *block); - -#define ERR_BAD_VADDR -1 -#define ERR_NOMEM -2 - -#endif //_REQUESTSASYNC_H_ diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/snaplog.c --- a/tools/blktap/parallax/snaplog.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,238 +0,0 @@ -/************************************************************************** - * - * snaplog.c - * - * Snapshot log on-disk data structure. - * - */ - - /* VDI histories are made from chains of snapshot logs. These logs record - * the (radix) root and timestamp of individual snapshots. - * - * creation of a new VDI involves 'forking' a snapshot log, by creating a - * new, empty log (in a new VDI) and parenting it off of a record in an - * existing snapshot log. - * - * snapshot log blocks have at most one writer. 
- */ - -#include <stdio.h> -#include <stdlib.h> -#include <sys/time.h> -#include "blockstore.h" -#include "snaplog.h" - - - -snap_block_t *snap_get_block(uint64_t block) -{ - snap_block_t *blk = (snap_block_t *)readblock(block); - - if ( blk == NULL) - return NULL; - if ( blk->hdr.magic != SNAP_MAGIC ) { - freeblock(blk); - return NULL; - } - - return blk; -} - -int snap_get_id(snap_id_t *id, snap_rec_t *target) -{ - snap_block_t *blk; - - if ( id == NULL ) - return -1; - - blk = snap_get_block(id->block); - - if ( blk == NULL ) - return -1; - - if ( id->index > blk->hdr.nr_entries ) { - freeblock(blk); - return -1; - } - - *target = blk->snaps[id->index]; - freeblock(blk); - return 0; -} - -int __snap_block_create(snap_id_t *parent_id, snap_id_t *fork_id, - snap_id_t *new_id) -{ - snap_rec_t parent_rec, fork_rec; - snap_block_t *blk, *pblk; - /* - if ( (parent_id != NULL) && (snap_get_id(parent_id, &parent_rec) != 0) ) - return -1; - - if ( (fork_id != NULL) && (snap_get_id(fork_id, &fork_rec) != 0) ) - return -1; -*/ - blk = (snap_block_t *)newblock(); - blk->hdr.magic = SNAP_MAGIC; - blk->hdr.nr_entries = 0; - blk->hdr.log_entries = 0; - blk->hdr.immutable = 0; - - if ( (parent_id != NULL) - && (parent_id->block != fork_id->block) - && (parent_id->block != 0)) { - - pblk = snap_get_block(parent_id->block); - blk->hdr.log_entries = pblk->hdr.log_entries; - freeblock(pblk); - } - - if (parent_id != NULL) { - blk->hdr.parent_block = *parent_id; - blk->hdr.fork_block = *fork_id; - } else { - blk->hdr.parent_block = null_snap_id; - blk->hdr.fork_block = null_snap_id; - } - - new_id->index = 0; - new_id->block = allocblock(blk); - freeblock(blk); - if (new_id->block == 0) - return -1; - - return 0; -} - -int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id) -{ - return __snap_block_create(parent_id, parent_id, new_id); -} - -int snap_append(snap_id_t *old_id, snap_rec_t *rec, snap_id_t *new_id) -{ - snap_id_t id = *old_id; - snap_block_t *blk = 
snap_get_block(id.block); - - if ( rec->deleted == 1 ) { - printf("Attempt to append a deleted snapshot!\n"); - return -1; - } - - if ( blk->hdr.immutable != 0 ) { - printf("Attempt to snap an immutable snap block!\n"); - return -1; - } - - new_id->block = id.block; - - if (blk->hdr.nr_entries == SNAPS_PER_BLOCK) { - int ret; - - id.index--; /* make id point to the last full record */ - - ret = __snap_block_create(&id, &blk->hdr.fork_block, new_id); - if ( ret != 0 ) { - freeblock(blk); - return -1; - } - - blk->hdr.immutable = 1; - writeblock(id.block, blk); - freeblock(blk); - blk = snap_get_block(new_id->block); - id = *new_id; - } - - blk->snaps[blk->hdr.nr_entries] = *rec; - blk->hdr.nr_entries++; - blk->hdr.log_entries++; - new_id->index = blk->hdr.nr_entries; - //printf("snap: %u %u\n", blk->hdr.nr_entries, blk->hdr.log_entries); - writeblock(id.block, blk); - freeblock(blk); - return 0; -} - -int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id) -{ - snap_block_t *p_blk, *c_blk, *blk; - snap_rec_t *p_rec, *c_rec; - int ret = -1; - - p_blk = snap_get_block(p_id->block); - - if (p_blk == NULL) return(-1); - - if (c_id->block == p_id->block) - { - c_blk = p_blk; - } else { - c_blk = snap_get_block(c_id->block); - } - - if (p_blk == NULL) { - freeblock(p_blk); - return(-1); - } - - /* parent and child must not be deleted. */ - p_rec = &p_blk->snaps[p_id->index]; - c_rec = &c_blk->snaps[c_id->index]; - /* - if ( (p_rec->deleted == 1) || (c_rec->deleted == 1) ) { - printf("One of those snaps is already deleted.\n"); - goto done; - } - */ - /* first non-deleted thing in the log before child must be parent. 
*/ - - /* XXX todo: text the range here for delete (and eventually fork) bits) */ - /* for now, snaps must be consecutive, on the same log page: */ - - if ((p_id->block != c_id->block) || (p_id->index != c_id->index-1)) - { - printf("Deleting non-consecutive snaps is not done yet.\n"); - goto done; - } - - /* mark parent as deleted XXX: may need to lock parent block here.*/ - p_rec->deleted = 1; - writeblock(p_id->block, p_blk); - - /* delete the parent */ - printf("collapse(%Ld, %Ld)\n", p_rec->radix_root, c_rec->radix_root); - ret = collapse(height, p_rec->radix_root, c_rec->radix_root); - - /* return the number of blocks reclaimed. */ - -done: - if (c_blk != p_blk) freeblock(c_blk); - freeblock(p_blk); - - return(ret); -} - -void snap_print_history(snap_id_t *snap_id) -{ - snap_id_t id = *snap_id; - unsigned int idx = id.index; - snap_block_t *new_blk, *blk = snap_get_block(id.block); - - while ( blk ) { - printf("[Snap block %Ld]:\n", id.block); - do { - printf(" %03u: root: %Ld ts: %ld.%ld\n", idx, - blk->snaps[idx].radix_root, - blk->snaps[idx].timestamp.tv_sec, - blk->snaps[idx].timestamp.tv_usec); - } while (idx-- != 0); - - id = blk->hdr.parent_block; - if (id.block != 0) { - new_blk = snap_get_block(id.block); - } - freeblock(blk); - blk = new_blk; - } -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/snaplog.h --- a/tools/blktap/parallax/snaplog.h Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -/************************************************************************** - * - * snaplog.h - * - * Snapshot log on-disk data structure. 
- * - */ - -#include "radix.h" -#include "blockstore.h" /* for BLOCK_SIZE */ - -#ifndef __SNAPLOG_H__ -#define __SNAPLOG_H__ - -typedef struct snap_id { - uint64_t block; - unsigned int index; -} snap_id_t; - -typedef struct snap_rec { - uint64_t radix_root; - struct timeval timestamp; - /* flags: */ - unsigned deleted:1; -} snap_rec_t; - - -int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id); -int snap_append(snap_id_t *id, snap_rec_t *rec, snap_id_t *new_id); -int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id); -void snap_print_history(snap_id_t *snap_id); -int snap_get_id(snap_id_t *id, snap_rec_t *target); - - -/* exported for vdi debugging */ -#define SNAP_MAGIC 0xff00ff0aa0ff00ffLL - -static const snap_id_t null_snap_id = { 0, 0 }; - -typedef struct snap_block_hdr { - uint64_t magic; - snap_id_t parent_block; /* parent block within this chain */ - snap_id_t fork_block; /* where this log was forked */ - unsigned log_entries; /* total entries since forking */ - unsigned short nr_entries; /* entries in snaps[] */ - unsigned short immutable; /* has this snap page become immutable? 
*/ -} snap_block_hdr_t; - - -#define SNAPS_PER_BLOCK \ - ((BLOCK_SIZE - sizeof(snap_block_hdr_t)) / sizeof(snap_rec_t)) - -typedef struct snap_block { - snap_block_hdr_t hdr; - snap_rec_t snaps[SNAPS_PER_BLOCK]; -} snap_block_t; - - -snap_block_t *snap_get_block(uint64_t block); - -#endif /* __SNAPLOG_H__ */ diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/vdi.c --- a/tools/blktap/parallax/vdi.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,367 +0,0 @@ -/************************************************************************** - * - * vdi.c - * - * Virtual Disk Image (VDI) Interfaces - * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <fcntl.h> -#include <string.h> -#include <sys/time.h> -#include <pthread.h> -#include "blockstore.h" -#include "block-async.h" -#include "requests-async.h" -#include "radix.h" -#include "vdi.h" - -#define VDI_REG_BLOCK 2LL -#define VDI_RADIX_ROOT writable(3) - -#if 0 -#define DPRINTF(_f, _a...) printf ( _f , ## _a ) -#else -#define DPRINTF(_f, _a...) ((void)0) -#endif - -/* I haven't decided about this registry stuff, so this is just a really - * quick lash-up so that there is some way to track VDIs. - * - * (Most vdi access should be with a direct handle to the block, so this - * registry is just for start-of-day lookup and other control operations.) - */ - -vdi_registry_t *create_vdi_registry(void) -{ - vdi_registry_t *reg = (vdi_registry_t *)newblock(); - - if (reg == NULL) - return NULL; - - /* zero-fill the vdi radix root while we have an empty block. 
*/ - writeblock(VDI_RADIX_ROOT, (void *)reg); - - - DPRINTF("[vdi.c] Creating VDI registry!\n"); - reg->magic = VDI_REG_MAGIC; - reg->nr_vdis = 0; - - writeblock(VDI_REG_BLOCK, (void *)reg); - - return reg; -} - -vdi_registry_t *get_vdi_registry(void) -{ - vdi_registry_t *vdi_reg = (vdi_registry_t *)readblock(VDI_REG_BLOCK); - - if ( vdi_reg == NULL ) - vdi_reg = create_vdi_registry(); - - if ( vdi_reg->magic != VDI_REG_MAGIC ) { - freeblock(vdi_reg); - return NULL; - } - - return vdi_reg; -} - - -vdi_t *vdi_create(snap_id_t *parent_snap, char *name) -{ - int ret; - vdi_t *vdi; - vdi_registry_t *vdi_reg; - snap_rec_t snap_rec; - - /* create a vdi struct */ - vdi = newblock(); - if (vdi == NULL) - return NULL; - - if ( snap_get_id(parent_snap, &snap_rec) == 0 ) { - vdi->radix_root = snapshot(snap_rec.radix_root); - } else { - vdi->radix_root = allocblock((void *)vdi); /* vdi is just zeros here */ - vdi->radix_root = writable(vdi->radix_root); /* grr. */ - } - - /* create a snapshot log, and add it to the vdi struct */ - - ret = snap_block_create(parent_snap, &vdi->snap); - if ( ret != 0 ) { - DPRINTF("Error getting snap block in vdi_create.\n"); - freeblock(vdi); - return NULL; - } - - /* append the vdi to the registry, fill block and id. */ - /* implicit allocation means we have to write the vdi twice here. 
*/ - vdi_reg = get_vdi_registry(); - if ( vdi_reg == NULL ) { - freeblock(vdi); - return NULL; - } - - vdi->block = allocblock((void *)vdi); - vdi->id = vdi_reg->nr_vdis++; - strncpy(vdi->name, name, VDI_NAME_SZ); - vdi->name[VDI_NAME_SZ] = '\0'; - vdi->radix_lock = NULL; /* for tidiness */ - writeblock(vdi->block, (void *)vdi); - - update(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi->id, vdi->block); - writeblock(VDI_REG_BLOCK, (void *)vdi_reg); - freeblock(vdi_reg); - - vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock)); - if (vdi->radix_lock == NULL) - { - perror("couldn't malloc radix_lock for new vdi!"); - freeblock(vdi); - return NULL; - } - radix_lock_init(vdi->radix_lock); - - return vdi; -} - -/* vdi_get and vdi_put currently act more like alloc/free -- they don't - * do refcount-based allocation. - */ -vdi_t *vdi_get(uint64_t vdi_id) -{ - uint64_t vdi_blk; - vdi_t *vdi; - - vdi_blk = lookup(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi_id); - - if ( vdi_blk == 0 ) - return NULL; - - vdi = (vdi_t *)readblock(vdi_blk); - - vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock)); - if (vdi->radix_lock == NULL) - { - perror("couldn't malloc radix_lock for new vdi!"); - freeblock(vdi); - return NULL; - } - radix_lock_init(vdi->radix_lock); - - return vdi; -} - -void vdi_put(vdi_t *vdi) -{ - free(vdi->radix_lock); - freeblock(vdi); -} - -void vdi_snapshot(vdi_t *vdi) -{ - snap_rec_t rec; - int ret; - - rec.radix_root = vdi->radix_root; - gettimeofday(&rec.timestamp, NULL); - rec.deleted = 0; - - vdi->radix_root = snapshot(vdi->radix_root); - ret = snap_append(&vdi->snap, &rec, &vdi->snap); - if ( ret != 0 ) { - printf("snap_append returned failure\n"); - return; - } - writeblock(vdi->block, vdi); -} - -int __init_vdi() -{ - /* sneak this in here for the moment. */ - __rcache_init(); - - /* force the registry to be created if it doesn't exist. 
*/ - vdi_registry_t *vdi_reg = get_vdi_registry(); - if (vdi_reg == NULL) { - printf("[vdi.c] Couldn't get/create a VDI registry!\n"); - return -1; - } - freeblock(vdi_reg); - - - return 0; -} - -#ifdef VDI_STANDALONE - -#define TEST_VDIS 50 -#define NR_ITERS 50000 -#define FORK_POINTS 200 -#define INIT_VDIS 3 -#define INIT_SNAPS 40 - -/* These must be of decreasing size: */ -#define NEW_FORK (RAND_MAX-(RAND_MAX/1000)) -#define NEW_ROOT_VDI (RAND_MAX-((RAND_MAX/1000)*2)) -#define NEW_FORK_VDI (RAND_MAX-((RAND_MAX/1000)*3)) - -#define GRAPH_DOT_FILE "vdi.dot" -#define GRAPH_PS_FILE "vdi.ps" - - -typedef struct sh_st { - snap_id_t id; - struct sh_st *next; -} sh_t; - -#define SNAP_HASHSZ 1024 -sh_t *node_hash[SNAP_HASHSZ]; -#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ) - -#define SNAPID_EQUAL(_a,_b) \ - (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index)) -int sh_check_and_add(snap_id_t *id) -{ - sh_t **s = &node_hash[SNAP_HASH(id)]; - - while (*s != NULL) { - if (SNAPID_EQUAL(&((*s)->id), id)) - return 1; - *s = (*s)->next; - } - - *s = (sh_t *)malloc(sizeof(sh_t)); - (*s)->id = *id; - (*s)->next = NULL; - - return 0; -} - -int main(int argc, char *argv[]) -{ - vdi_t *vdi_list[TEST_VDIS]; - snap_id_t id, fork_points[FORK_POINTS]; - int nr_vdis = 0, nr_forks = 0; - int i, j, r; - FILE *f; - char name[VDI_NAME_SZ]; - - __init_blockstore(); - __init_vdi(); - - printf("[o] Generating seed VDIs. (%d VDIs)\n", INIT_VDIS); - - for (i=0; i<INIT_VDIS; i++) { - r=rand(); - - sprintf(name, "VDI Number %d", nr_vdis); - vdi_list[i] = vdi_create(NULL, name); - for (j=0; j<(r%INIT_SNAPS); j++) - vdi_snapshot(vdi_list[i]); - fork_points[i] = vdi_list[i]->snap; - nr_vdis++; - nr_forks++; - } - - printf("[o] Running a random workload. 
(%d iterations)\n", NR_ITERS); - - for (i=0; i<NR_ITERS; i++) { - r = rand(); - - if ( r > NEW_FORK ) { - if ( nr_forks > FORK_POINTS ) - continue; - id = vdi_list[r%nr_vdis]->snap; - if ( ( id.block == 0 ) || ( id.index == 0 ) ) - continue; - id.index--; - fork_points[nr_forks++] = id; - - } else if ( r > NEW_ROOT_VDI ) { - - if ( nr_vdis == TEST_VDIS ) - continue; - - sprintf(name, "VDI Number %d.", nr_vdis); - vdi_list[nr_vdis++] = vdi_create(NULL, name); - - } else if ( r > NEW_FORK_VDI ) { - - if ( nr_vdis == TEST_VDIS ) - continue; - - sprintf(name, "VDI Number %d.", nr_vdis); - vdi_list[nr_vdis++] = vdi_create(&fork_points[r%nr_forks], name); - - } else /* SNAPSHOT */ { - - vdi_snapshot(vdi_list[r%nr_vdis]); - - } - } - - /* now dump it out to a dot file. */ - printf("[o] Dumping state to a dot graph. (%d VDIs)\n", nr_vdis); - - f = fopen(GRAPH_DOT_FILE, "w"); - - /* write graph preamble */ - fprintf(f, "digraph G {\n"); - fprintf(f, " rankdir=LR\n"); - - for (i=0; i<nr_vdis; i++) { - char oldnode[255]; - snap_block_t *blk; - snap_id_t id = vdi_list[i]->snap; - int nr_snaps, done=0; - - /* add a node for the id */ -printf("vdi: %d\n", i); - fprintf(f, " n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n", - id.block, id.index, vdi_list[i]->name, - id.block, id.index); - sprintf(oldnode, "n%Ld%d", id.block, id.index); - - while (id.block != 0) { - blk = snap_get_block(id.block); - nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index); - id = blk->hdr.fork_block; - - done = sh_check_and_add(&id); - - /* add a node for the fork_id */ - if (!done) { - fprintf(f, " n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n", - id.block, id.index, - id.block, id.index); - } - - /* add an edge between them */ - fprintf(f, " n%Ld%d -> %s [label=\"%u snapshots\"]\n", - id.block, id.index, oldnode, nr_snaps); - sprintf(oldnode, "n%Ld%d", id.block, id.index); - freeblock(blk); - - if (done) break; - } - } - - /* write graph postamble */ - fprintf(f, "}\n"); 
- fclose(f); - - printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE); - { - char cmd[255]; - sprintf(cmd, "dot %s -Tps -o %s", GRAPH_DOT_FILE, GRAPH_PS_FILE); - system(cmd); - } - return 0; -} - -#endif diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/vdi.h --- a/tools/blktap/parallax/vdi.h Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ -#ifndef _VDI_H_ -#define _VDI_H_ -/************************************************************************** - * - * vdi.h - * - * Virtual Disk Image (VDI) Interfaces - * - */ - -#ifndef __VDI_H__ -#define __VDI_H__ - -#include "blktaplib.h" -#include "snaplog.h" - -#define VDI_HEIGHT 27 /* Note that these are now hard-coded */ -#define VDI_REG_HEIGHT 27 /* in the async lookup code */ - -#define VDI_NAME_SZ 256 - - -typedef struct vdi { - uint64_t id; /* unique vdi id -- used by the registry */ - uint64_t block; /* block where this vdi lives (also unique)*/ - uint64_t radix_root; /* radix root node for block mappings */ - snap_id_t snap; /* next snapshot slot for this VDI */ - struct vdi *next; /* used to hash-chain in blkif. */ - blkif_vdev_t vdevice; /* currently mounted as... 
*/ - struct radix_lock *radix_lock;/* per-line L1 RW lock for parallel reqs */ - char name[VDI_NAME_SZ];/* human readable vdi name */ -} vdi_t; - -#define VDI_REG_MAGIC 0xff00ff0bb0ff00ffLL - -typedef struct vdi_registry { - uint64_t magic; - uint64_t nr_vdis; -} vdi_registry_t; - - -int __init_vdi(void); - -vdi_t *vdi_get(uint64_t vdi_id); -void vdi_put(vdi_t *vdi); -vdi_registry_t *get_vdi_registry(void); -vdi_t *vdi_create(snap_id_t *parent_snap, char *name); -uint64_t vdi_lookup_block(vdi_t *vdi, uint64_t vdi_block, int *writable); -void vdi_update_block(vdi_t *vdi, uint64_t vdi_block, uint64_t g_block); -void vdi_snapshot(vdi_t *vdi); - - -#endif /* __VDI_H__ */ - -#endif //_VDI_H_ diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/vdi_create.c --- a/tools/blktap/parallax/vdi_create.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,52 +0,0 @@ -/************************************************************************** - * - * vdi_create.c - * - * Create a new vdi. - * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/time.h> -#include "blockstore.h" -#include "radix.h" -#include "vdi.h" - -int main(int argc, char *argv[]) -{ - vdi_t *vdi; - char name[VDI_NAME_SZ] = ""; - snap_id_t id; - int from_snap = 0; - - __init_blockstore(); - __init_vdi(); - - if ( argc == 1 ) { - printf("usage: %s <VDI Name> [<snap block> <snap idx>]\n", argv[0]); - exit(-1); - } - - strncpy( name, argv[1], VDI_NAME_SZ); - name[VDI_NAME_SZ] = '\0'; - - if ( argc > 3 ) { - id.block = (uint64_t) atoll(argv[2]); - id.index = (unsigned int) atol (argv[3]); - from_snap = 1; - } - - vdi = vdi_create( from_snap ? 
&id : NULL, name); - - if ( vdi == NULL ) { - printf("Failed to create VDI!\n"); - freeblock(vdi); - exit(-1); - } - - freeblock(vdi); - - return (0); -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/vdi_fill.c --- a/tools/blktap/parallax/vdi_fill.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,81 +0,0 @@ -/************************************************************************** - * - * vdi_fill.c - * - * Hoover a file or device into a vdi. - * You must first create the vdi with vdi_create. - * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> -#include "blockstore.h" -#include "radix.h" -#include "requests-async.h" -#include "vdi.h" - -int main(int argc, char *argv[]) -{ - vdi_t *vdi; - uint64_t id; - int fd; - struct stat st; - uint64_t tot_size; - char spage[BLOCK_SIZE]; - char *dpage; - uint64_t vblock = 0, count=0; - - __init_blockstore(); - init_block_async(); - __init_vdi(); - - if ( argc < 3 ) { - printf("usage: %s <VDI id> <filename>\n", argv[0]); - exit(-1); - } - - id = (uint64_t) atoll(argv[1]); - - vdi = vdi_get( id ); - - if ( vdi == NULL ) { - printf("Failed to retreive VDI %Ld!\n", id); - exit(-1); - } - - fd = open(argv[2], O_RDONLY | O_LARGEFILE); - - if (fd < 0) { - printf("Couldn't open %s!\n", argv[2]); - exit(-1); - } - - if ( fstat(fd, &st) != 0 ) { - printf("Couldn't stat %s!\n", argv[2]); - exit(-1); - } - - tot_size = (uint64_t) st.st_size; - printf("Filling VDI %Ld with %Ld bytes.\n", id, tot_size); - - printf("%011Ld blocks total\n", tot_size / BLOCK_SIZE); - printf(" "); - while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) { - vdi_write_s(vdi, vblock, spage); - - vblock++; - if ((vblock % 512) == 0) - printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock); - fflush(stdout); - } - printf("\n"); - - freeblock(vdi); - - return (0); -} diff -r 533bad7c0883 -r 840f33e54054 
tools/blktap/parallax/vdi_list.c --- a/tools/blktap/parallax/vdi_list.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ -/************************************************************************** - * - * vdi_list.c - * - * Print a list of VDIs on the block store. - * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/time.h> -#include "blockstore.h" -#include "radix.h" -#include "vdi.h" - -int main(int argc, char *argv[]) -{ - vdi_registry_t *reg; - vdi_t *vdi; - int i; - - __init_blockstore(); - __init_vdi(); - - reg = get_vdi_registry(); - - if ( reg == NULL ) { - printf("couldn't get VDI registry.\n"); - exit(-1); - } - - for (i=0; i < reg->nr_vdis; i++) { - vdi = vdi_get(i); - - if ( vdi != NULL ) { - - printf("%10Ld %60s\n", vdi->id, vdi->name); - freeblock(vdi); - - } - } - - freeblock(reg); - - return 0; -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/vdi_snap.c --- a/tools/blktap/parallax/vdi_snap.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -/************************************************************************** - * - * vdi_snap.c - * - * Snapshot a vdi. 
- * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/time.h> -#include "blockstore.h" -#include "radix.h" -#include "vdi.h" - -int main(int argc, char *argv[]) -{ - vdi_t *vdi; - uint64_t id; - - __init_blockstore(); - __init_vdi(); - - if ( argc == 1 ) { - printf("usage: %s <VDI id>\n", argv[0]); - exit(-1); - } - - id = (uint64_t) atoll(argv[1]); - - vdi = vdi_get(id); - - if ( vdi == NULL ) { - printf("couldn't find the requested VDI.\n"); - freeblock(vdi); - exit(-1); - } - - vdi_snapshot(vdi); - - return 0; -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/vdi_snap_delete.c --- a/tools/blktap/parallax/vdi_snap_delete.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ -/************************************************************************** - * - * vdi_snap_delete.c - * - * Delete a snapshot. - * - * This is not finished: right now it takes a snap n and calls - * snap_collapse(n,n+1). - * - * TODO: support for non-consecutive, non-same-block snaps - * Avoid forking probs. 
- * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/time.h> -#include "blockstore.h" -#include "snaplog.h" -#include "radix.h" -#include "vdi.h" - -int main(int argc, char *argv[]) -{ - snap_id_t id, c_id; - int ret; - - __init_blockstore(); - __init_vdi(); - - if ( argc != 3 ) { - printf("usage: %s <snap block> <snap idx>\n", argv[0]); - exit(-1); - } - - id.block = (uint64_t) atoll(argv[1]); - id.index = (unsigned int) atol (argv[2]); - - c_id = id; - c_id.index++; - - ret = snap_collapse(VDI_HEIGHT, &id, &c_id); - - printf("Freed %d blocks.\n", ret); - - return 0; -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/vdi_snap_list.c --- a/tools/blktap/parallax/vdi_snap_list.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ -/************************************************************************** - * - * vdi_snap_list.c - * - * Print a list of snapshots for the specified vdi. - * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <time.h> -#include <sys/time.h> -#include "blockstore.h" -#include "radix.h" -#include "vdi.h" - -int main(int argc, char *argv[]) -{ - vdi_t *vdi; - uint64_t id; - int i, max_snaps = -1; - snap_block_t *blk; - snap_id_t sid; - char *t; - - __init_blockstore(); - __init_vdi(); - - if ( argc == 1 ) { - printf("usage: %s <VDI id> [max snaps]\n", argv[0]); - exit(-1); - } - - id = (uint64_t) atoll(argv[1]); - - if ( argc > 2 ) { - max_snaps = atoi(argv[2]); - } - - vdi = vdi_get(id); - - if ( vdi == NULL ) { - printf("couldn't find the requested VDI.\n"); - freeblock(vdi); - exit(-1); - } - - sid = vdi->snap; - sid.index--; - - //printf("%8s%4s%21s %12s %1s\n", "Block", "idx", "timestamp", - // "radix root", "d"); - printf("%8s%4s%37s %12s %1s\n", "Block", "idx", "timestamp", - "radix root", "d"); - - while (sid.block != 0) { - blk = snap_get_block(sid.block); - for (i = sid.index; i >= 0; i--) { - if ( max_snaps == 0 ) { 
- freeblock(blk); - goto done; - } - t = ctime(&blk->snaps[i].timestamp.tv_sec); - t[strlen(t)-1] = '\0'; - //printf("%8Ld%4u%14lu.%06lu %12Ld %1s\n", - printf("%8Ld%4u%30s %06lu %12Ld %1s\n", - sid.block, i, - //blk->snaps[i].timestamp.tv_sec, - t, - blk->snaps[i].timestamp.tv_usec, - blk->snaps[i].radix_root, - blk->snaps[i].deleted ? "*" : " "); - if ( max_snaps != -1 ) - max_snaps--; - } - sid = blk->hdr.parent_block; - freeblock(blk); - } -done: - return 0; -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/vdi_tree.c --- a/tools/blktap/parallax/vdi_tree.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,132 +0,0 @@ -/************************************************************************** - * - * vdi_tree.c - * - * Output current vdi tree to dot and postscript. - * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/time.h> -#include "blockstore.h" -#include "radix.h" -#include "vdi.h" - -#define GRAPH_DOT_FILE "vdi.dot" -#define GRAPH_PS_FILE "vdi.ps" - -typedef struct sh_st { - snap_id_t id; - struct sh_st *next; -} sh_t; - -#define SNAP_HASHSZ 1024 -sh_t *node_hash[SNAP_HASHSZ]; -#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ) - -#define SNAPID_EQUAL(_a,_b) \ - (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index)) -int sh_check_and_add(snap_id_t *id) -{ - sh_t **s = &node_hash[SNAP_HASH(id)]; - - while (*s != NULL) { - if (SNAPID_EQUAL(&((*s)->id), id)) - return 1; - *s = (*s)->next; - } - - *s = (sh_t *)malloc(sizeof(sh_t)); - (*s)->id = *id; - (*s)->next = NULL; - - return 0; -} - -int main(int argc, char *argv[]) -{ - FILE *f; - char dot_file[255] = GRAPH_DOT_FILE; - char ps_file[255] = GRAPH_PS_FILE; - int nr_vdis = 0, nr_forks = 0; - vdi_registry_t *reg; - vdi_t *vdi; - int i; - - __init_blockstore(); - __init_vdi(); - - reg = get_vdi_registry(); - - if ( reg == NULL ) { - printf("couldn't get VDI registry.\n"); - exit(-1); - } - - if ( argc > 1 ) 
{ - strncpy(ps_file, argv[1], 255); - ps_file[255] = '\0'; - } - - /* now dump it out to a dot file. */ - printf("[o] Dumping state to a dot graph. (%d VDIs)\n", nr_vdis); - - f = fopen(dot_file, "w"); - - /* write graph preamble */ - fprintf(f, "digraph G {\n"); - fprintf(f, " rankdir=LR\n"); - - for (i=0; i<reg->nr_vdis; i++) { - char oldnode[255]; - snap_block_t *blk; - snap_id_t id; - int nr_snaps, done=0; - - vdi = vdi_get(i); - id = vdi->snap; - /* add a node for the id */ -printf("vdi: %d\n", i); - fprintf(f, " n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n", - id.block, id.index, vdi->name, - id.block, id.index); - sprintf(oldnode, "n%Ld%d", id.block, id.index); - - while (id.block != 0) { - blk = snap_get_block(id.block); - nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index); - id = blk->hdr.fork_block; - - done = sh_check_and_add(&id); - - /* add a node for the fork_id */ - if (!done) { - fprintf(f, " n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n", - id.block, id.index, - id.block, id.index); - } - - /* add an edge between them */ - fprintf(f, " n%Ld%d -> %s [label=\"%u snapshots\"]\n", - id.block, id.index, oldnode, nr_snaps); - sprintf(oldnode, "n%Ld%d", id.block, id.index); - freeblock(blk); - - if (done) break; - } - } - - /* write graph postamble */ - fprintf(f, "}\n"); - fclose(f); - - printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE); - { - char cmd[255]; - sprintf(cmd, "dot %s -Tps -o %s", dot_file, ps_file); - system(cmd); - } - return 0; -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/vdi_unittest.c --- a/tools/blktap/parallax/vdi_unittest.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,184 +0,0 @@ -/************************************************************************** - * - * vdi_unittest.c - * - * Run a small test workload to ensure that data access through a vdi - * is (at least superficially) correct. 
- * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> -#include "requests-async.h" -#include "blockstore.h" -#include "radix.h" -#include "vdi.h" - -#define TEST_PAGES 32 -static char *zero_page; -static char pages[TEST_PAGES][BLOCK_SIZE]; -static int next_page = 0; - -void fill_test_pages(void) -{ - int i, j; - long *page; - - for (i=0; i< TEST_PAGES; i++) { - page = (unsigned long *)pages[i]; - for (j=0; j<(BLOCK_SIZE/4); j++) { - page[j] = random(); - } - } - - zero_page = newblock(); -} - -inline uint64_t make_vaddr(uint64_t L1, uint64_t L2, uint64_t L3) -{ - uint64_t ret = L1; - - ret = (ret << 9) | L2; - ret = (ret << 9) | L3; - - return ret; -} - -void touch_block(vdi_t *vdi, uint64_t L1, uint64_t L2, uint64_t L3) -{ - uint64_t vaddr; - char *page = pages[next_page++]; - char *rpage = NULL; - - printf("TOUCH (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3); - - vaddr = make_vaddr(L1, L2, L3); - vdi_write_s(vdi, vaddr, page); - rpage = vdi_read_s(vdi, vaddr); - - if (rpage == NULL) - { - printf( "read %Lu returned NULL\n", vaddr); - return; - } - - if (memcmp(page, rpage, BLOCK_SIZE) != 0) - { - printf( "read %Lu returned a different page\n", vaddr); - return; - } - - freeblock(rpage); -} - -void test_block(vdi_t *vdi, uint64_t L1, uint64_t L2, uint64_t L3, char *page) -{ - uint64_t vaddr; - char *rpage = NULL; - - printf("TEST (%3Lu, %3Lu, %3Lu)\n", L1, L2, L3); - - vaddr = make_vaddr(L1, L2, L3); - rpage = vdi_read_s(vdi, vaddr); - - if (rpage == NULL) - { - printf( "read %Lu returned NULL\n", vaddr); - return; - } - - if (memcmp(page, rpage, BLOCK_SIZE) != 0) - { - printf( "read %Lu returned a different page\n", vaddr); - return; - } - - freeblock(rpage); -} - -void coverage_test(vdi_t *vdi) -{ - uint64_t vaddr; - int i, j, k; - - /* Do a series of writes and reads to test all paths through the - * async radix code. 
The radix request code will dump CRC warnings - * if there are data problems here as well. - */ - - /* L1 Zero */ - touch_block(vdi, 0, 0, 0); - - /* L2 Zero */ - i = next_page; - touch_block(vdi, 0, 1, 0); - - /* L3 Zero */ - j = next_page; - touch_block(vdi, 0, 0, 1); - k = next_page; - touch_block(vdi, 0, 1, 1); - - /* Direct write */ - touch_block(vdi, 0, 0, 0); - - vdi_snapshot(vdi); - - /* L1 fault */ - touch_block(vdi, 0, 0, 0); - /* test the read-only branches that should have been copied over. */ - test_block(vdi, 0, 1, 0, pages[i]); - test_block(vdi, 0, 0, 1, pages[j]); - - /* L2 fault */ - touch_block(vdi, 0, 1, 0); - test_block(vdi, 0, 1, 1, pages[k]); - - /* L3 fault */ - touch_block(vdi, 0, 0, 1); - - /* read - L1 zero */ - test_block(vdi, 1, 0, 0, zero_page); - - /* read - L2 zero */ - test_block(vdi, 0, 2, 0, zero_page); - - /* read - L3 zero */ - test_block(vdi, 0, 0, 2, zero_page); -} - -int main(int argc, char *argv[]) -{ - vdi_t *vdi; - uint64_t id; - int fd; - struct stat st; - uint64_t tot_size; - char spage[BLOCK_SIZE]; - char *dpage; - uint64_t vblock = 0, count=0; - - __init_blockstore(); - init_block_async(); - __init_vdi(); - - vdi = vdi_create( NULL, "UNIT TEST VDI"); - - if ( vdi == NULL ) { - printf("Failed to create VDI!\n"); - freeblock(vdi); - exit(-1); - } - - fill_test_pages(); - coverage_test(vdi); - - freeblock(vdi); - - return (0); -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/parallax/vdi_validate.c --- a/tools/blktap/parallax/vdi_validate.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,97 +0,0 @@ -/************************************************************************** - * - * vdi_validate.c - * - * Intended to sanity-check vm_fill and the underlying vdi code. - * - * Block-by-block compare of a vdi with a file/device on the disk. 
- * - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> -#include "blockstore.h" -#include "radix.h" -#include "vdi.h" -#include "requests-async.h" - -int main(int argc, char *argv[]) -{ - vdi_t *vdi; - uint64_t id; - int fd; - struct stat st; - uint64_t tot_size; - char spage[BLOCK_SIZE], *dpage; - char *vpage; - uint64_t vblock = 0, count=0; - - __init_blockstore(); - init_block_async(); - __init_vdi(); - - if ( argc < 3 ) { - printf("usage: %s <VDI id> <filename>\n", argv[0]); - exit(-1); - } - - id = (uint64_t) atoll(argv[1]); - - vdi = vdi_get( id ); - - if ( vdi == NULL ) { - printf("Failed to retreive VDI %Ld!\n", id); - exit(-1); - } - - fd = open(argv[2], O_RDONLY | O_LARGEFILE); - - if (fd < 0) { - printf("Couldn't open %s!\n", argv[2]); - exit(-1); - } - - if ( fstat(fd, &st) != 0 ) { - printf("Couldn't stat %s!\n", argv[2]); - exit(-1); - } - - tot_size = (uint64_t) st.st_size; - printf("Testing VDI %Ld (%Ld bytes).\n", id, tot_size); - - printf(" "); - while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) { - - dpage = vdi_read_s(vdi, vblock); - - if (dpage == NULL) { - printf("\n\nfound an unmapped VDI block (%Ld)\n", vblock); - exit(0); - } - - if (memcmp(spage, dpage, BLOCK_SIZE) != 0) { - printf("\n\nblocks don't match! (%Ld)\n", vblock); - exit(0); - } - - freeblock(dpage); - - vblock++; - if ((vblock % 1024) == 0) { - printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock); - fflush(stdout); - } - } - printf("\n"); - - printf("VDI %Ld looks good!\n", id); - - freeblock(vdi); - - return (0); -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/ublkback/Makefile --- a/tools/blktap/ublkback/Makefile Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ - -XEN_ROOT = ../../.. -include $(XEN_ROOT)/tools/Rules.mk - -INCLUDES += -I.. 
- -INSTALL = install -INSTALL_PROG = $(INSTALL) -m0755 -IBIN = ublkback -INSTALL_DIR = /usr/sbin - -CFLAGS += -Werror -CFLAGS += -Wno-unused -CFLAGS += -fno-strict-aliasing -CFLAGS += -I $(XEN_LIBXC) -CFLAGS += $(INCLUDES) -I. -CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -# Get gcc to generate the dependencies for us. -CFLAGS += -Wp,-MD,.$(@F).d -DEPS = .*.d - -OBJS = $(patsubst %.c,%.o,$(SRCS)) - -.PHONY: all -all: $(IBIN) - -LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse) - -.PHONY: install -install: - $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INSTALL_DIR) - -.PHONY: clean -clean: - rm -rf *.o*~ $(DEPS) xen TAGS $(IBIN) - -ublkback: - $(CC) $(CFLAGS) -o ublkback -L$(XEN_LIBXC) -L. -L.. \ - -lblktap -laio ublkback.c ublkbacklib.c -pg - --include $(DEPS) diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/ublkback/ublkback.c --- a/tools/blktap/ublkback/ublkback.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,18 +0,0 @@ -/* ublkback.c - * - * libaio-based userlevel backend. - */ - -#include "blktaplib.h" -#include "ublkbacklib.h" - - -int main(int argc, char *argv[]) -{ - ublkback_init(); - - register_new_blkif_hook(ublkback_new_blkif); - blktap_listen(); - - return 0; -} diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/ublkback/ublkbacklib.c --- a/tools/blktap/ublkback/ublkbacklib.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,473 +0,0 @@ -/* ublkbacklib.c - * - * file/device image-backed block device -- using linux libaio. - * - * (c) 2004 Andrew Warfield. - * - * Xend has been modified to use an amorfs:[fsid] disk tag. - * This will show up as device type (maj:240,min:0) = 61440. - * - * The fsid is placed in the sec_start field of the disk extent. - * - * NOTE: This doesn't work. Grrr. 
- */ - -#define _GNU_SOURCE -#define __USE_LARGEFILE64 - -#include <stdio.h> -#include <stdlib.h> -#include <fcntl.h> -#include <string.h> -#include <db.h> -#include <sys/stat.h> -#include <sys/types.h> -#include <sys/poll.h> -#include <unistd.h> -#include <errno.h> -#include <libaio.h> -#include <pthread.h> -#include <time.h> -#include <err.h> -#include "blktaplib.h" - -/* XXXX: */ -/* Current code just mounts this file/device to any requests that come in. */ -//#define TMP_IMAGE_FILE_NAME "/dev/sda1" -#define TMP_IMAGE_FILE_NAME "fc3.image" - -#define MAX_REQUESTS 64 /* must be synced with the blkif drivers. */ -#define MAX_SEGMENTS_PER_REQ 11 -#define SECTOR_SHIFT 9 -#define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ) - -#if 0 -#define DPRINTF(_f, _a...) printf ( _f , ## _a ) -#else -#define DPRINTF(_f, _a...) ((void)0) -#endif - -#if 1 -#define ASSERT(_p) \ - if ( !(_p) ) { printf("Assertion '%s' failed, line %d, file %s", #_p , \ - __LINE__, __FILE__); *(int*)0=0; } -#else -#define ASSERT(_p) ((void)0) -#endif - -/* Note on pending_reqs: I assume all reqs are queued before they start to - * get filled. so count of 0 is an unused record. - */ -typedef struct { - blkif_request_t req; - blkif_t *blkif; - int count; -} pending_req_t; - -static pending_req_t pending_list[MAX_REQUESTS]; -static io_context_t ctx; -static struct iocb *iocb_free[MAX_AIO_REQS]; -static int iocb_free_count; - -/* ---[ Notification mecahnism ]--------------------------------------- */ - -enum { - READ = 0, - WRITE = 1 -}; - -static int aio_notify[2]; -static volatile int aio_listening = 0; -static pthread_mutex_t notifier_sem = PTHREAD_MUTEX_INITIALIZER; - -static struct io_event aio_events[MAX_AIO_REQS]; -static int aio_event_count = 0; - -/* this is commented out in libaio.h for some reason. 
*/ -extern int io_queue_wait(io_context_t ctx, struct timespec *timeout); - -static void *notifier_thread(void *arg) -{ - int ret; - int msg = 0x00feeb00; - - DPRINTF("Notifier thread started.\n"); - for (;;) { - pthread_mutex_lock(¬ifier_sem); - if ((ret = io_getevents(ctx, 1, MAX_AIO_REQS, aio_events, 0)) > 0) { - aio_event_count = ret; - write(aio_notify[WRITE], &msg, sizeof(msg)); - } else { - printf("[io_queue_wait error! %d]\n", errno); - pthread_mutex_unlock(¬ifier_sem); - } - } -} - -/* --- Talking to xenstore: ------------------------------------------- */ - -int ublkback_request(blkif_t *blkif, blkif_request_t *req, int batch_done); -int ublkback_response(blkif_t *blkif, blkif_response_t *rsp, int batch_done); - -typedef struct image { - /* These need to turn into an array/rbtree for multi-disk support. */ - int fd; - uint64_t fsid; - blkif_vdev_t vdevice; - long int size; - long int secsize; - long int info; -} image_t; - -long int ublkback_get_size(blkif_t *blkif) -{ - image_t *img = (image_t *)blkif->prv; - return img->size; -} - -long int ublkback_get_secsize(blkif_t *blkif) -{ - image_t *img = (image_t *)blkif->prv; - return img->secsize; -} - -unsigned ublkback_get_info(blkif_t *blkif) -{ - image_t *img = (image_t *)blkif->prv; - return img->info; -} - -static struct blkif_ops ublkback_ops = { - get_size: ublkback_get_size, - get_secsize: ublkback_get_secsize, - get_info: ublkback_get_info, -}; - -int ublkback_new_blkif(blkif_t *blkif) -{ - image_t *image; - struct stat stat; - int ret; - - image = (image_t *)malloc(sizeof(image_t)); - if (image == NULL) { - printf("error allocating image record.\n"); - return -ENOMEM; - } - - /* Open it. */ - image->fd = open(TMP_IMAGE_FILE_NAME, - O_RDWR | O_DIRECT | O_LARGEFILE); - - if ((image->fd < 0) && (errno == EINVAL)) { - /* Maybe O_DIRECT isn't supported. 
*/ - warn("open() failed on '%s', trying again without O_DIRECT", - TMP_IMAGE_FILE_NAME); - image->fd = open(TMP_IMAGE_FILE_NAME, O_RDWR | O_LARGEFILE); - } - - if (image->fd < 0) { - warn("Couldn't open image file!"); - free(image); - return -EINVAL; - } - - /* Size it. */ - ret = fstat(image->fd, &stat); - if (ret != 0) { - printf("Couldn't stat image in PROBE!"); - return -EINVAL; - } - - image->size = (stat.st_size >> SECTOR_SHIFT); - - /* TODO: IOCTL to get size of raw device. */ -/* - ret = ioctl(img->fd, BLKGETSIZE, &blksize); - if (ret != 0) { - printf("Couldn't ioctl image in PROBE!\n"); - goto err; - } -*/ - if (image->size == 0) - image->size =((uint64_t) 16836057); - image->secsize = 512; - image->info = 0; - - /* Register the hooks */ - blkif_register_request_hook(blkif, "Ublkback req.", ublkback_request); - blkif_register_response_hook(blkif, "Ublkback resp.", ublkback_response); - - - printf(">X<Created a new blkif! pdev was %ld, but you got %s\n", - blkif->pdev, TMP_IMAGE_FILE_NAME); - - blkif->ops = &ublkback_ops; - blkif->prv = (void *)image; - - return 0; -} - - -/* --- Moving the bits: ----------------------------------------------- */ - -static int batch_count = 0; -int ublkback_request(blkif_t *blkif, blkif_request_t *req, int batch_done) -{ - int fd; - uint64_t sector; - char *spage, *dpage; - int ret, i, idx; - blkif_response_t *rsp; - domid_t dom = ID_TO_DOM(req->id); - static struct iocb *ioq[MAX_SEGMENTS_PER_REQ*MAX_REQUESTS]; - static int io_idx = 0; - struct iocb *io; - image_t *img; - - img = (image_t *)blkif->prv; - fd = img->fd; - - switch (req->operation) - { - case BLKIF_OP_WRITE: - { - unsigned long size; - - batch_count++; - - idx = ID_TO_IDX(req->id); - ASSERT(pending_list[idx].count == 0); - memcpy(&pending_list[idx].req, req, sizeof(*req)); - pending_list[idx].count = req->nr_segments; - pending_list[idx].blkif = blkif; - - for (i = 0; i < req->nr_segments; i++) { - - sector = req->sector_number + (8*i); - - size = 
req->seg[i].last_sect - req->seg[i].first_sect + 1; - - if (req->seg[i].first_sect != 0) - DPRINTF("iWR: sec_nr: %10llu sec: %10llu (%1lu,%1lu) " - "pos: %15lu\n", - req->sector_number, sector, - req->seg[i].first_sect, req->seg[i].last_sect, - (long)(sector << SECTOR_SHIFT)); - - spage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); - spage += req->seg[i].first_sect << SECTOR_SHIFT; - - /*convert size and sector to byte offsets */ - size <<= SECTOR_SHIFT; - sector <<= SECTOR_SHIFT; - - io = iocb_free[--iocb_free_count]; - io_prep_pwrite(io, fd, spage, size, sector); - io->data = (void *)idx; - //ioq[i] = io; - ioq[io_idx++] = io; - } - - if (batch_done) { - ret = io_submit(ctx, io_idx, ioq); - batch_count = 0; - if (ret < 0) - printf("BADNESS: io_submit error! (%d)\n", errno); - io_idx = 0; - } - - return BLKTAP_STOLEN; - - } - case BLKIF_OP_READ: - { - unsigned long size; - - batch_count++; - idx = ID_TO_IDX(req->id); - ASSERT(pending_list[idx].count == 0); - memcpy(&pending_list[idx].req, req, sizeof(*req)); - pending_list[idx].count = req->nr_segments; - pending_list[idx].blkif = blkif; - - for (i = 0; i < req->nr_segments; i++) { - - sector = req->sector_number + (8*i); - - size = req->seg[i].last_sect - req->seg[i].first_sect + 1; - - dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); - dpage += req->seg[i].first_sect << SECTOR_SHIFT; - - if (req->seg[i].first_sect != 0) - DPRINTF("iRD : sec_nr: %10llu sec: %10llu (%1lu,%1lu) " - "pos: %15lu dpage: %p\n", - req->sector_number, sector, - req->seg[i].first_sect, req->seg[i].last_sect, - (long)(sector << SECTOR_SHIFT), dpage); - - /*convert size and sector to byte offsets */ - size <<= SECTOR_SHIFT; - sector <<= SECTOR_SHIFT; - - - /* - * NB: Looks like AIO now has non-page aligned support, this path - * can probably be removed... Only really used for hunting - * superblocks anyway... 
;) - */ - if ( ((unsigned long)dpage % PAGE_SIZE) != 0 ) { - /* AIO to raw devices must be page aligned, so do this read - * synchronously. The OS is probably just looking for - * a superblock or something, so this won't hurt performance. - */ - int ret; - - printf("Slow path block read.\n"); - /* Question: do in-progress aio ops modify the file cursor? */ - ret = lseek(fd, sector, SEEK_SET); - if (ret == (off_t)-1) - printf("lseek failed!\n"); - ret = read(fd, dpage, size); - if (ret < 0) - printf("read problem (%d)\n", ret); - printf("|\n|\n| read: %lld, %lu, %d\n|\n|\n", sector, size, ret); - - /* not an async request any more... */ - pending_list[idx].count--; - - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_READ; - rsp->status = BLKIF_RSP_OKAY; - return BLKTAP_RESPOND; - /* Doh -- need to flush aio if this is end-of-batch */ - } - - io = iocb_free[--iocb_free_count]; - - io_prep_pread(io, fd, dpage, size, sector); - io->data = (void *)idx; - - ioq[io_idx++] = io; - //ioq[i] = io; - } - - if (batch_done) { - ret = io_submit(ctx, io_idx, ioq); - batch_count = 0; - if (ret < 0) - printf("BADNESS: io_submit error! (%d)\n", errno); - io_idx = 0; - } - - return BLKTAP_STOLEN; - - } - } - - printf("Unknown block operation!\n"); -err: - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = req->operation; - rsp->status = BLKIF_RSP_ERROR; - return BLKTAP_RESPOND; -} - - -int ublkback_pollhook(int fd) -{ - struct io_event *ep; - int n, ret, idx; - blkif_request_t *req; - blkif_response_t *rsp; - int responses_queued = 0; - int pages=0; - - for (ep = aio_events; aio_event_count-- > 0; ep++) { - struct iocb *io = ep->obj; - idx = (int) ep->data; - - if ((idx > MAX_REQUESTS-1) || (pending_list[idx].count == 0)){ - printf("invalid index returned(%u)!\n", idx); - break; - } - - if ((int)ep->res < 0) - printf("***\n***aio request error! 
(%d,%d)\n***\n", - (int)ep->res, (int)ep->res2); - - pending_list[idx].count--; - iocb_free[iocb_free_count++] = io; - pages++; - - if (pending_list[idx].count == 0) { - blkif_request_t tmp = pending_list[idx].req; - rsp = (blkif_response_t *)&pending_list[idx].req; - rsp->id = tmp.id; - rsp->operation = tmp.operation; - rsp->status = BLKIF_RSP_OKAY; - blkif_inject_response(pending_list[idx].blkif, rsp); - responses_queued++; - } - } - - if (responses_queued) { - blktap_kick_responses(); - } - - read(aio_notify[READ], &idx, sizeof(idx)); - aio_listening = 1; - pthread_mutex_unlock(¬ifier_sem); - - return 0; -} - -/* the image library terminates the request stream. _resp is a noop. */ -int ublkback_response(blkif_t *blkif, blkif_response_t *rsp, int batch_done) -{ - return BLKTAP_PASS; -} - -void ublkback_init(void) -{ - int i, rc; - pthread_t p; - - for (i = 0; i < MAX_REQUESTS; i++) - pending_list[i].count = 0; - - memset(&ctx, 0, sizeof(ctx)); - rc = io_queue_init(MAX_AIO_REQS, &ctx); - if (rc != 0) { - printf("queue_init failed! (%d)\n", rc); - exit(0); - } - - for (i=0; i<MAX_AIO_REQS; i++) { - if (!(iocb_free[i] = (struct iocb *)malloc(sizeof(struct iocb)))) { - printf("error allocating iocb array\n"); - exit(0); - } - iocb_free_count = i; - } - - rc = pipe(aio_notify); - if (rc != 0) { - printf("pipe failed! (%d)\n", errno); - exit(0); - } - - rc = pthread_create(&p, NULL, notifier_thread, NULL); - if (rc != 0) { - printf("pthread_create failed! (%d)\n", errno); - exit(0); - } - - aio_listening = 1; - - blktap_attach_poll(aio_notify[READ], POLLIN, ublkback_pollhook); -} - diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/ublkback/ublkbacklib.h --- a/tools/blktap/ublkback/ublkbacklib.h Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,16 +0,0 @@ -/* blkaiolib.h - * - * aio image-backed block device. - * - * (c) 2004 Andrew Warfield. - * - * Xend has been modified to use an amorfs:[fsid] disk tag. 
- * This will show up as device type (maj:240,min:0) = 61440. - * - * The fsid is placed in the sec_start field of the disk extent. - */ - -int ublkback_request(blkif_request_t *req, int batch_done); -int ublkback_response(blkif_response_t *rsp); /* noop */ -int ublkback_new_blkif(blkif_t *blkif); -void ublkback_init(void); diff -r 533bad7c0883 -r 840f33e54054 tools/blktap/xenbus.c --- a/tools/blktap/xenbus.c Fri Jun 16 18:19:40 2006 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,568 +0,0 @@ -/* - * xenbus.c - * - * xenbus interface to the blocktap. - * - * this handles the top-half of integration with block devices through the - * store -- the tap driver negotiates the device channel etc, while the - * userland tap clinet needs to sort out the disk parameters etc. - * - * A. Warfield 2005 Based primarily on the blkback and xenbus driver code. - * Comments there apply here... - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <err.h> -#include <stdarg.h> -#include <errno.h> -#include <xs.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <poll.h> -#include "blktaplib.h" -#include "list.h" - -#if 0 -#define DPRINTF(_f, _a...) printf ( _f , ## _a ) -#else -#define DPRINTF(_f, _a...) ((void)0) -#endif - -/* --- Xenstore / Xenbus helpers ---------------------------------------- */ -/* - * These should all be pulled out into the xenstore API. I'm faulting commands - * in from the xenbus interface as i need them. - */ - - -/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */ -int xs_gather(struct xs_handle *xs, const char *dir, ...) 
-{ - va_list ap; - const char *name; - char *path; - int ret = 0; - - va_start(ap, dir); - while (ret == 0 && (name = va_arg(ap, char *)) != NULL) { - const char *fmt = va_arg(ap, char *); - void *result = va_arg(ap, void *); - char *p; - - if (asprintf(&path, "%s/%s", dir, name) == -1) - { - warn("allocation error in xs_gather!\n"); - ret = ENOMEM; - break; - } - p = xs_read(xs, path, NULL); - free(path); - if (p == NULL) { - ret = ENOENT; - break; - } - if (fmt) { - if (sscanf(p, fmt, result) == 0) - ret = EINVAL; - free(p); - } else - *(char **)result = p; - } - va_end(ap); - return ret; -} - -/* Single printf and write: returns -errno or 0. */ -int xs_printf(struct xs_handle *h, const char *dir, const char *node, - const char *fmt, ...) -{ - char *buf, *path; - va_list ap; - int ret; - - va_start(ap, fmt); - ret = vasprintf(&buf, fmt, ap); - va_end(ap); - - asprintf(&path, "%s/%s", dir, node); - - if ((path == NULL) || (buf == NULL)) - return 0; - - ret = xs_write(h, path, buf, strlen(buf)+1); - - free(buf); - free(path); - - return ret; -} - - -int xs_exists(struct xs_handle *h, const char *path) -{ - char **d; - int num; - - d = xs_directory(h, path, &num); - if (d == NULL) - return 0; - free(d); - return 1; -} - - - -/* This assumes that the domain name we are looking for is unique! */ -char *get_dom_domid(struct xs_handle *h, const char *name) -{ - char **e, *val, *domid = NULL; - int num, i, len; - char *path; - - e = xs_directory(h, "/local/domain", &num); - - i=0; - while (i < num) { - asprintf(&path, "/local/domain/%s/name", e[i]); - val = xs_read(h, path, &len); - free(path); - if (val == NULL) - continue; - if (strcmp(val, name) == 0) { - /* match! 
*/ - asprintf(&path, "/local/domain/%s/domid", e[i]); - domid = xs_read(h, path, &len); - free(val); - free(path); - break; - } - free(val); - i++; - } - - free(e); - return domid; -} - -static int strsep_len(const char *str, char c, unsigned int len) -{ - unsigned int i; - - for (i = 0; str[i]; i++) - if (str[i] == c) { - if (len == 0) - return i; - len--; - } - return (len == 0) ? i : -ERANGE; -} - - -/* xenbus watches: */ -/* Register callback to watch this node. */ -struct xenbus_watch -{ - struct list_head list; - char *node; - void (*callback)(struct xs_handle *h, - struct xenbus_watch *, - const char *node); -}; - -static LIST_HEAD(watches); - -/* A little paranoia: we don't just trust token. */ -static struct xenbus_watch *find_watch(const char *token) -{ - struct xenbus_watch *i, *cmp; - - cmp = (void *)strtoul(token, NULL, 16); - - list_for_each_entry(i, &watches, list) - if (i == cmp) - return i; - return NULL; -} - -/* Register callback to watch this node. like xs_watch, return 0 on failure */ -int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch) -{ - /* Pointer in ascii is the token. */ - char token[sizeof(watch) * 2 + 1]; - int er; - - sprintf(token, "%lX", (long)watch); - if (find_watch(token)) - { - warn("watch collision!"); - return -EINVAL; - } - - er = xs_watch(h, watch->node, token); - if (er != 0) { - list_add(&watch->list, &watches); - } - - return er; -} - -int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch) -{ - char token[sizeof(watch) * 2 + 1]; - int er; - - sprintf(token, "%lX", (long)watch); - if (!find_watch(token)) - { - warn("no such watch!"); - return -EINVAL; - } - - - er = xs_unwatch(h, watch->node, token); - list_del(&watch->list); - - if (er == 0) - warn("XENBUS Failed to release watch %s: %i", - watch->node, er); - return 0; -} - -/* Re-register callbacks to all watches. 
*/ -void reregister_xenbus_watches(struct xs_handle *h) -{ - struct xenbus_watch *watch; - char token[sizeof(watch) * 2 + 1]; - - list_for_each_entry(watch, &watches, list) { - sprintf(token, "%lX", (long)watch); - xs_watch(h, watch->node, token); - } -} - -/* based on watch_thread() */ -int xs_fire_next_watch(struct xs_handle *h) -{ - char **res; - char *token; - char *node = NULL; - struct xenbus_watch *w; - int er; - unsigned int num; - - res = xs_read_watch(h, &num); - if (res == NULL) - return -EAGAIN; /* in O_NONBLOCK, read_watch returns 0... */ - - node = res[XS_WATCH_PATH]; - token = res[XS_WATCH_TOKEN]; - - w = find_watch(token); - if (!w) - { - warn("unregistered watch fired"); - goto done; - } - w->callback(h, w, node); - - done: - free(res); - return 1; -} - - - - -/* ---------------------------------------------------------------------- */ - -struct backend_info -{ - /* our communications channel */ - blkif_t *blkif; - - long int frontend_id; - long int pdev; - long int readonly; - - /* watch back end for changes */ - struct xenbus_watch backend_watch; - char *backpath; - - /* watch front end for changes */ - struct xenbus_watch watch; - char *frontpath; - - struct list_head list; -}; - -static LIST_HEAD(belist); - -static struct backend_info *be_lookup_be(const char *bepath) -{ - struct backend_info *be; - - list_for_each_entry(be, &belist, list) - if (strcmp(bepath, be->backpath) == 0) - return be; - return (struct backend_info *)NULL; -} - -static int be_exists_be(const char *bepath) -{ - return ( be_lookup_be(bepath) != NULL ); -} - -static struct backend_info *be_lookup_fe(const char *fepath) -{ - struct backend_info *be; - - list_for_each_entry(be, &belist, list) - if (strcmp(fepath, be->frontpath) == 0) - return be; - return (struct backend_info *)NULL; -} - -static int backend_remove(struct xs_handle *h, struct backend_info *be) -{ - /* Turn off watches. 
*/ - if (be->watch.node) - unregister_xenbus_watch(h, &be->watch); - if (be->backend_watch.node) - unregister_xenbus_watch(h, &be->backend_watch); - - /* Unhook from be list. */ - list_del(&be->list); - - /* Free everything else. */ - if (be->blkif) - free_blkif(be->blkif); - free(be->frontpath); - free(be->backpath); - free(be); - return 0; -} - -static void frontend_changed(struct xs_handle *h, struct xenbus_watch *w, - const char *fepath_im) -{ - struct backend_info *be; - char *fepath = NULL; - int er; - - be = be_lookup_fe(w->node); - if (be == NULL) - { - warn("frontend changed called for nonexistent backend! (%s)", fepath); - goto fail; - } - - /* If other end is gone, delete ourself. */ - if (w->node && !xs_exists(h, be->frontpath)) { - DPRINTF("DELETING BE: %s\n", be->backpath); - backend_remove(h, be); - return; - } - - if (be->blkif == NULL || (be->blkif->state == CONNECTED)) - return; - - /* Supply the information about the device the frontend needs */ - er = xs_transaction_start(h, be->backpath); - if (er == 0) { - warn("starting transaction"); - goto fail; - } - - er = xs_printf(h, be->backpath, "sectors", "%lu", - be->blkif->ops->get_size(be->blkif)); - if (er == 0) { - warn("writing sectors"); - goto fail; - } - - er = xs_printf(h, be->backpath, "info", "%u", - be->blkif->ops->get_info(be->blkif)); - if (er == 0) { - warn("writing info"); - goto fail; - } - - er = xs_printf(h, be->backpath, "sector-size", "%lu", - be->blkif->ops->get_secsize(be->blkif)); - if (er == 0) { - warn("writing sector-size"); - goto fail; - } - - be->blkif->state = CONNECTED; - - xs_transaction_end(h, 0); - - return; - - fail: - free(fepath); -} - - -static void backend_changed(struct xs_handle *h, struct xenbus_watch *w, - const char *bepath_im) -{ - struct backend_info *be; - char *path = NULL, *p; - int len, er; - long int pdev = 0, handle; - - be = be_lookup_be(w->node); - if (be == NULL) - { - warn("backend changed called for nonexistent backend! 
(%s)", w->node); - goto fail; - } - - er = xs_gather(h, be->backpath, "physical-device", "%li", &pdev, NULL); - if (er != 0) - goto fail; - - if (be->pdev && be->pdev != pdev) { - warn("changing physical-device not supported"); - goto fail; - } - be->pdev = pdev; - - asprintf(&path, "%s/%s", w->node, "read-only"); - if (xs_exists(h, path)) - be->readonly = 1; - - if (be->blkif == NULL) { - /* Front end dir is a number, which is used as the handle. */ - p = strrchr(be->frontpath, '/') + 1; - handle = strtoul(p, NULL, 0); - - be->blkif = alloc_blkif(be->frontend_id); - if (be->blkif == NULL) - goto fail; - - er = blkif_init(be->blkif, handle, be->pdev, be->readonly); - if (er) - goto fail; - - DPRINTF("[BECHG]: ADDED A NEW BLKIF (%s)\n", w->node); - - /* Pass in NULL node to skip exist test. */ - frontend_changed(h, &be->watch, NULL); - } - - fail: - free(path); -} - -static void blkback_probe(struct xs_handle *h, struct xenbus_watch *w, - const char *bepath_im) -{ - struct backend_info *be = NULL; - char *frontend = NULL, *bepath = NULL; - int er, len; - - bepath = strdup(bepath_im); - if (!bepath) - return; - len = strsep_len(bepath, '/', 6); - if (len < 0) - goto free_be; - - bepath[len] = '\0'; /*truncate the passed-in string with predjudice. */ - - be = malloc(sizeof(*be)); - if (!be) { - warn("allocating backend structure"); - goto free_be; - } - memset(be, 0, sizeof(*be)); - - frontend = NULL; - er = xs_gather(h, bepath, - "frontend-id", "%li", &be->frontend_id, - "frontend", NULL, &frontend, - NULL); - if (er) - goto free_be; - - if (strlen(frontend) == 0 || !xs_exists(h, frontend)) { - /* If we can't get a frontend path and a frontend-id, - * then our bus-id is no longer valid and we need to - * destroy the backend device. - */ - DPRINTF("No frontend (%s)\n", frontend); - goto free_be; - } - - /* Are we already tracking this device? 
*/ - if (be_exists_be(bepath)) - goto free_be; - - be->backpath = bepath; - be->backend_watch.node = be->backpath; - be->backend_watch.callback = backend_changed; - er = register_xenbus_watch(h, &be->backend_watch); - if (er == 0) { - be->backend_watch.node = NULL; - warn("error adding backend watch on %s", bepath); - goto free_be; - } - - be->frontpath = frontend; - be->watch.node = be->frontpath; - be->watch.callback = frontend_changed; - er = register_xenbus_watch(h, &be->watch); - if (er == 0) { - be->watch.node = NULL; - warn("adding frontend watch on %s", be->frontpath); - goto free_be; - } - - list_add(&be->list, &belist); - - DPRINTF("[PROBE]: ADDED NEW DEVICE (%s)\n", bepath_im); - - backend_changed(h, &be->backend_watch, bepath); - return; - - free_be: - if (be && (be->backend_watch.node)) - unregister_xenbus_watch(h, &be->backend_watch); - free(frontend); - free(bepath); - free(be); - return; -} - - -int add_blockdevice_probe_watch(struct xs_handle *h, const char *domname) -{ - char *domid, *path; - struct xenbus_watch *vbd_watch; - int er; - - domid = get_dom_domid(h, domname); - - DPRINTF("%s: %s\n", domname, (domid != NULL) ? domid : "[ not found! ]"); - - asprintf(&path, "/local/domain/%s/backend/vbd", domid); - if (path == NULL) - return -ENOMEM; - - vbd_watch = (struct xenbus_watch *)malloc(sizeof(struct xenbus_watch)); - vbd_watch->node = path; - vbd_watch->callback = blkback_probe; - er = register_xenbus_watch(h, vbd_watch); - if (er == 0) { - warn("Error adding vbd probe watch %s", path); - return -EINVAL; - } - - return 0; -} _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |