[Xen-changelog] Updates to blktap driver and user code.
# HG changeset patch # User akw27@xxxxxxxxxxxxxxxxxxxxxx # Node ID f59e0163540ecca3a45ea9558126e84a8403408a # Parent 523078a332879ceb8234b9ed5924c30ba3114d7b Updates to blktap driver and user code. Mostly this makes the tap code work again with all of the changes that have happened to the block drivers recently. We now use a shared page per VBD (to the driver), and handle control information through the store. The taplib interfaces have changed to be based around per-vbd data structures that you can attach arbitrary handlers for. There is also initial code for a user-level blockback driver, which aims to get around the use of loopbacks for file-based vbds. Still plenty of work to do here -- this is a working incremental checkin and I'm away from this for the next four weeks. Signed-off-by: Andrew Warfield <andrew.warfield@xxxxxxxxxxxx> diff -r 523078a33287 -r f59e0163540e .hgignore --- a/.hgignore Sun Sep 4 15:08:16 2005 +++ b/.hgignore Sun Sep 4 21:19:44 2005 @@ -82,6 +82,7 @@ ^tools/blktap/parallax/vdi_validate$ ^tools/blktap/parallax/parallax$ ^tools/blktap/parallax/blockstored$ +^tools/blktap/ublkback/ublkback$ ^tools/blktap/xen/.*$ ^tools/check/\..*$ ^tools/cmdline/.*$ diff -r 523078a33287 -r f59e0163540e linux-2.6-xen-sparse/drivers/xen/blktap/Makefile --- a/linux-2.6-xen-sparse/drivers/xen/blktap/Makefile Sun Sep 4 15:08:16 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/Makefile Sun Sep 4 21:19:44 2005 @@ -1,3 +1,3 @@ -obj-y := blktap_userdev.o blktap_datapath.o blktap_controlmsg.o blktap.o +obj-y := xenbus.o interface.o blktap.o diff -r 523078a33287 -r f59e0163540e linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c --- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c Sun Sep 4 15:08:16 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c Sun Sep 4 21:19:44 2005 @@ -1,90 +1,916 @@ /****************************************************************************** - * blktap.c + * arch/xen/drivers/blkif/blktap/blktap.c * - * XenLinux virtual block-device tap. + * This is a modified version of the block backend driver that remaps requests + * to a user-space memory region. It is intended to be used to write + * application-level servers that provide block interfaces to client VMs. * - * Copyright (c) 2004, Andrew Warfield + */ + +#include <linux/kernel.h> +#include <linux/spinlock.h> +#include <asm-xen/balloon.h> +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/miscdevice.h> +#include <linux/errno.h> +#include <linux/major.h> +#include <linux/gfp.h> +#include <linux/poll.h> +#include <asm/tlbflush.h> +#include "common.h" + +/* Only one process may open /dev/xen/blktap at any time. */ +static unsigned long blktap_dev_inuse; +unsigned long blktap_ring_ok; /* make this ring->state */ + +/* Rings up to user space. */ +static blkif_front_ring_t blktap_ufe_ring; + +/* for poll: */ +static wait_queue_head_t blktap_wait; + +/* current switching mode */ +static unsigned long blktap_mode; + +/* local prototypes */ +static int blktap_read_ufe_ring(void); + + +/* /dev/xen/blktap resides at device number major=10, minor=200 */ +#define BLKTAP_MINOR 202 + +/* blktap IOCTLs: */ +#define BLKTAP_IOCTL_KICK_FE 1 +#define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */ +#define BLKTAP_IOCTL_SETMODE 3 +#define BLKTAP_IOCTL_PRINT_IDXS 100 + +/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */ +#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */ +#define BLKTAP_MODE_INTERCEPT_FE 0x00000001 +#define BLKTAP_MODE_INTERCEPT_BE 0x00000002 /* unimp. 
*/ +#define BLKTAP_MODE_COPY_FE 0x00000004 /* unimp. */ +#define BLKTAP_MODE_COPY_BE 0x00000008 /* unimp. */ +#define BLKTAP_MODE_COPY_FE_PAGES 0x00000010 /* unimp. */ +#define BLKTAP_MODE_COPY_BE_PAGES 0x00000020 /* unimp. */ + +#define BLKTAP_MODE_INTERPOSE \ + (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE) + +#define BLKTAP_MODE_COPY_BOTH \ + (BLKTAP_MODE_COPY_FE | BLKTAP_MODE_COPY_BE) + +#define BLKTAP_MODE_COPY_BOTH_PAGES \ + (BLKTAP_MODE_COPY_FE_PAGES | BLKTAP_MODE_COPY_BE_PAGES) + +static inline int BLKTAP_MODE_VALID(unsigned long arg) +{ + return ( + ( arg == BLKTAP_MODE_PASSTHROUGH ) || + ( arg == BLKTAP_MODE_INTERCEPT_FE ) || + ( arg == BLKTAP_MODE_INTERPOSE ) ); +/* + return ( + ( arg == BLKTAP_MODE_PASSTHROUGH ) || + ( arg == BLKTAP_MODE_INTERCEPT_FE ) || + ( arg == BLKTAP_MODE_INTERCEPT_BE ) || + ( arg == BLKTAP_MODE_INTERPOSE ) || + ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE ) || + ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE ) || + ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH ) + ); +*/ +} + + +/****************************************************************** + * MMAP REGION + */ + +/* + * We use a big chunk of address space to map in-flight requests into, + * and export this region up to user-space. See the comments in blkback + * about this -- the two must be kept in sync if the tap is used as a + * passthrough. + */ + +#define MAX_PENDING_REQS 64 +#define BATCH_PER_DOMAIN 16 + +/* immediately before the mmap area, we have a bunch of pages reserved + * for shared memory rings. + */ +#define RING_PAGES 1 /* Front */ + +/* Where things are inside the device mapping. */ +struct vm_area_struct *blktap_vma = NULL; +unsigned long mmap_vstart; /* Kernel pages for mapping in data. */ +unsigned long rings_vstart; /* start of mmaped vma */ +unsigned long user_vstart; /* start of user mappings */ + +#define MMAP_PAGES \ + (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) +#define MMAP_VADDR(_start, _req,_seg) \ + (_start + \ + ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * PAGE_SIZE)) + + + +/* + * Each outstanding request that we've passed to the lower device layers has a + * 'pending_req' allocated to it. Each buffer_head that completes decrements + * the pendcnt towards zero. When it hits zero, the specified domain has a + * response queued for it, with the saved 'id' passed back. + */ +typedef struct { + blkif_t *blkif; + unsigned long id; + int nr_pages; + atomic_t pendcnt; + unsigned short operation; + int status; +} pending_req_t; + +/* + * We can't allocate pending_req's in order, since they may complete out of + * order. We therefore maintain an allocation ring. This ring also indicates + * when enough work has been passed down -- at that point the allocation ring + * will be empty. + */ +static pending_req_t pending_reqs[MAX_PENDING_REQS]; +static unsigned char pending_ring[MAX_PENDING_REQS]; +static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED; +/* NB. We use a different index type to differentiate from shared blk rings. */ +typedef unsigned int PEND_RING_IDX; +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) +static PEND_RING_IDX pending_prod, pending_cons; +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) + +/* Requests passing through the tap to the backend hijack the id field + * in the request message. In it we put the AR index _AND_ the fe domid. + * the domid is used by the backend to map the pages properly. 
+ */ + +static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx) +{ + return ( (fe_dom << 16) | MASK_PEND_IDX(idx) ); +} + +extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id) +{ + return (PEND_RING_IDX)( id & 0x0000ffff ); +} + +extern inline domid_t ID_TO_DOM(unsigned long id) +{ + return (domid_t)(id >> 16); +} + + + +/****************************************************************** + * GRANT HANDLES + */ + +/* When using grant tables to map a frame for device access then the + * handle returned must be used to unmap the frame. This is needed to + * drop the ref count on the frame. + */ +struct grant_handle_pair +{ + u16 kernel; + u16 user; +}; +static struct grant_handle_pair pending_grant_handles[MMAP_PAGES]; +#define pending_handle(_idx, _i) \ + (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)]) +#define BLKTAP_INVALID_HANDLE(_g) \ + (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF)) +#define BLKTAP_INVALIDATE_HANDLE(_g) do { \ + (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \ + } while(0) + + +/****************************************************************** + * BLKTAP VM OPS + */ + +static struct page *blktap_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + /* + * if the page has not been mapped in by the driver then generate + * a SIGBUS to the domain. + */ + + force_sig(SIGBUS, current); + + return 0; +} + +struct vm_operations_struct blktap_vm_ops = { + nopage: blktap_nopage, +}; + +/****************************************************************** + * BLKTAP FILE OPS + */ + +static int blktap_open(struct inode *inode, struct file *filp) +{ + blkif_sring_t *sring; + + if ( test_and_set_bit(0, &blktap_dev_inuse) ) + return -EBUSY; + + /* Allocate the fe ring. */ + sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL); + if (sring == NULL) + goto fail_nomem; + + SetPageReserved(virt_to_page(sring)); + + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&blktap_ufe_ring, sring, PAGE_SIZE); + + return 0; + + fail_nomem: + return -ENOMEM; +} + +static int blktap_release(struct inode *inode, struct file *filp) +{ + blktap_dev_inuse = 0; + blktap_ring_ok = 0; + + /* Free the ring page. */ + ClearPageReserved(virt_to_page(blktap_ufe_ring.sring)); + free_page((unsigned long) blktap_ufe_ring.sring); + + /* Clear any active mappings and free foreign map table */ + if (blktap_vma != NULL) { + zap_page_range(blktap_vma, blktap_vma->vm_start, + blktap_vma->vm_end - blktap_vma->vm_start, NULL); + blktap_vma = NULL; + } + + return 0; +} + + +/* Note on mmap: + * We need to map pages to user space in a way that will allow the block + * subsystem set up direct IO to them. This couldn't be done before, because + * there isn't really a sane way to translate a user virtual address down to a + * physical address when the page belongs to another domain. * - * Based on the original split block driver: - * Copyright (c) 2003-2004, Keir Fraser & Steve Hand - * Modifications by Mark A. 
Williamson are (c) Intel Research Cambridge - * Copyright (c) 2004, Christian Limpach - * - * Note that unlike the split block driver code, this driver has been developed - * strictly for Linux 2.6 - */ - -#include "blktap.h" - -int __init xlblktap_init(void) -{ - ctrl_msg_t cmsg; - blkif_fe_driver_status_t fe_st; - blkif_be_driver_status_t be_st; - - printk(KERN_INFO "Initialising Xen block tap device\n"); -#ifdef CONFIG_XEN_BLKDEV_GRANT - printk(KERN_INFO "Block tap is using grant tables.\n"); -#endif - - DPRINTK(" tap - Backend connection init:\n"); - - - (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx, - CALLBACK_IN_BLOCKING_CONTEXT); - - /* Send a driver-UP notification to the domain controller. */ - cmsg.type = CMSG_BLKIF_FE; - cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS; - cmsg.length = sizeof(blkif_fe_driver_status_t); - fe_st.status = BLKIF_DRIVER_STATUS_UP; - memcpy(cmsg.msg, &fe_st, sizeof(fe_st)); - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); - - DPRINTK(" tap - Frontend connection init:\n"); + * My first approach was to map the page in to kernel memory, add an entry + * for it in the physical frame list (using alloc_lomem_region as in blkback) + * and then attempt to map that page up to user space. This is disallowed + * by xen though, which realizes that we don't really own the machine frame + * underlying the physical page. + * + * The new approach is to provide explicit support for this in xen linux. + * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages + * mapped from other vms. vma->vm_private_data is set up as a mapping + * from pages to actual page structs. There is a new clause in get_user_pages + * that does the right thing for this sort of mapping. + */ +static int blktap_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int size; + struct page **map; + int i; + + DPRINTK(KERN_ALERT "blktap mmap (%lx, %lx)\n", + vma->vm_start, vma->vm_end); + + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &blktap_vm_ops; + + size = vma->vm_end - vma->vm_start; + if ( size != ( (MMAP_PAGES + RING_PAGES) << PAGE_SHIFT ) ) { + printk(KERN_INFO + "blktap: you _must_ map exactly %d pages!\n", + MMAP_PAGES + RING_PAGES); + return -EAGAIN; + } + + size >>= PAGE_SHIFT; + DPRINTK(KERN_INFO "blktap: 2 rings + %d pages.\n", size-1); - active_reqs_init(); + rings_vstart = vma->vm_start; + user_vstart = rings_vstart + (RING_PAGES << PAGE_SHIFT); + + /* Map the ring pages to the start of the region and reserve it. */ + + /* not sure if I really need to do this... */ + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (remap_pfn_range(vma, vma->vm_start, + __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT, + PAGE_SIZE, vma->vm_page_prot)) + { + WPRINTK("Mapping user ring failed!\n"); + goto fail; + } + + /* Mark this VM as containing foreign pages, and set up mappings. */ + map = kmalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + * sizeof(struct page_struct*), + GFP_KERNEL); + if (map == NULL) + { + WPRINTK("Couldn't alloc VM_FOREIGH map.\n"); + goto fail; + } + + for (i=0; i<((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++) + map[i] = NULL; + + vma->vm_private_data = map; + vma->vm_flags |= VM_FOREIGN; + + blktap_vma = vma; + blktap_ring_ok = 1; + + return 0; + fail: + /* Clear any active mappings. 
*/ + zap_page_range(vma, vma->vm_start, + vma->vm_end - vma->vm_start, NULL); + + return -ENOMEM; +} + +static int blktap_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + switch(cmd) { + case BLKTAP_IOCTL_KICK_FE: /* There are fe messages to process. */ + return blktap_read_ufe_ring(); + + case BLKTAP_IOCTL_SETMODE: + if (BLKTAP_MODE_VALID(arg)) { + blktap_mode = arg; + /* XXX: may need to flush rings here. */ + printk(KERN_INFO "blktap: set mode to %lx\n", arg); + return 0; + } + case BLKTAP_IOCTL_PRINT_IDXS: + { + //print_fe_ring_idxs(); + WPRINTK("User Rings: \n-----------\n"); + WPRINTK("UF: rsp_cons: %2d, req_prod_prv: %2d " + "| req_prod: %2d, rsp_prod: %2d\n", + blktap_ufe_ring.rsp_cons, + blktap_ufe_ring.req_prod_pvt, + blktap_ufe_ring.sring->req_prod, + blktap_ufe_ring.sring->rsp_prod); + + } + } + return -ENOIOCTLCMD; +} + +static unsigned int blktap_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &blktap_wait, wait); + if ( RING_HAS_UNPUSHED_REQUESTS(&blktap_ufe_ring) ) + { + flush_tlb_all(); + + RING_PUSH_REQUESTS(&blktap_ufe_ring); + return POLLIN | POLLRDNORM; + } + + return 0; +} + +void blktap_kick_user(void) +{ + /* blktap_ring->req_prod = blktap_req_prod; */ + wake_up_interruptible(&blktap_wait); +} + +static struct file_operations blktap_fops = { + owner: THIS_MODULE, + poll: blktap_poll, + ioctl: blktap_ioctl, + open: blktap_open, + release: blktap_release, + mmap: blktap_mmap, +}; + + + +static int do_block_io_op(blkif_t *blkif, int max_to_do); +static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req); +static void make_response(blkif_t *blkif, unsigned long id, + unsigned short op, int st); + + +static void fast_flush_area(int idx, int nr_pages) +{ + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; + unsigned int i, op = 0; + struct grant_handle_pair *handle; + unsigned long ptep; + + for (i=0; i<nr_pages; i++) + { + handle = &pending_handle(idx, i); + if (!BLKTAP_INVALID_HANDLE(handle)) + { + + unmap[op].host_addr = MMAP_VADDR(mmap_vstart, idx, i); + unmap[op].dev_bus_addr = 0; + unmap[op].handle = handle->kernel; + op++; + + if (create_lookup_pte_addr(blktap_vma->vm_mm, + MMAP_VADDR(user_vstart, idx, i), + &ptep) !=0) { + DPRINTK("Couldn't get a pte addr!\n"); + return; + } + unmap[op].host_addr = ptep; + unmap[op].dev_bus_addr = 0; + unmap[op].handle = handle->user; + op++; + + BLKTAP_INVALIDATE_HANDLE(handle); + } + } + if ( unlikely(HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, unmap, op))) + BUG(); + + if (blktap_vma != NULL) + zap_page_range(blktap_vma, + MMAP_VADDR(user_vstart, idx, 0), + nr_pages << PAGE_SHIFT, NULL); +} + +/****************************************************************** + * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE + */ + +static struct list_head blkio_schedule_list; +static spinlock_t blkio_schedule_list_lock; + +static int __on_blkdev_list(blkif_t *blkif) +{ + return blkif->blkdev_list.next != NULL; +} + +static void remove_from_blkdev_list(blkif_t *blkif) +{ + unsigned long flags; + if ( !__on_blkdev_list(blkif) ) return; + spin_lock_irqsave(&blkio_schedule_list_lock, flags); + if ( __on_blkdev_list(blkif) ) + { + list_del(&blkif->blkdev_list); + blkif->blkdev_list.next = NULL; + blkif_put(blkif); + } + spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); +} + +static void add_to_blkdev_list_tail(blkif_t *blkif) +{ + unsigned long flags; + if ( __on_blkdev_list(blkif) ) return; + spin_lock_irqsave(&blkio_schedule_list_lock, flags); + 
if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) ) + { + list_add_tail(&blkif->blkdev_list, &blkio_schedule_list); + blkif_get(blkif); + } + spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); +} + + +/****************************************************************** + * SCHEDULER FUNCTIONS + */ + +static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait); + +static int blkio_schedule(void *arg) +{ + DECLARE_WAITQUEUE(wq, current); + + blkif_t *blkif; + struct list_head *ent; + + daemonize("xenblkd"); + + for ( ; ; ) + { + /* Wait for work to do. */ + add_wait_queue(&blkio_schedule_wait, &wq); + set_current_state(TASK_INTERRUPTIBLE); + if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || + list_empty(&blkio_schedule_list) ) + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&blkio_schedule_wait, &wq); + + /* Queue up a batch of requests. */ + while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && + !list_empty(&blkio_schedule_list) ) + { + ent = blkio_schedule_list.next; + blkif = list_entry(ent, blkif_t, blkdev_list); + blkif_get(blkif); + remove_from_blkdev_list(blkif); + if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) ) + add_to_blkdev_list_tail(blkif); + blkif_put(blkif); + } + } +} + +static void maybe_trigger_blkio_schedule(void) +{ + /* + * Needed so that two processes, who together make the following predicate + * true, don't both read stale values and evaluate the predicate + * incorrectly. Incredibly unlikely to stall the scheduler on x86, but... + */ + smp_mb(); + + if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && + !list_empty(&blkio_schedule_list) ) + wake_up(&blkio_schedule_wait); +} + + + +/****************************************************************** + * COMPLETION CALLBACK -- Called as bh->b_end_io() + */ + + +static int blktap_read_ufe_ring(void) +{ + /* This is called to read responses from the UFE ring. */ + + RING_IDX i, j, rp; + blkif_response_t *resp; + blkif_t *blkif; + int pending_idx; + pending_req_t *pending_req; + unsigned long flags; + + /* if we are forwarding from UFERring to FERing */ + if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) { + + /* for each outstanding message on the UFEring */ + rp = blktap_ufe_ring.sring->rsp_prod; + rmb(); + + for ( i = blktap_ufe_ring.rsp_cons; i != rp; i++ ) + { + resp = RING_GET_RESPONSE(&blktap_ufe_ring, i); + pending_idx = MASK_PEND_IDX(ID_TO_IDX(resp->id)); + pending_req = &pending_reqs[pending_idx]; + + blkif = pending_req->blkif; + for (j = 0; j < pending_req->nr_pages; j++) { + unsigned long vaddr; + struct page **map = blktap_vma->vm_private_data; + int offset; + + vaddr = MMAP_VADDR(user_vstart, pending_idx, j); + offset = (vaddr - blktap_vma->vm_start) >> PAGE_SHIFT; + + //ClearPageReserved(virt_to_page(vaddr)); + ClearPageReserved((struct page *)map[offset]); + map[offset] = NULL; + } + + fast_flush_area(pending_idx, pending_req->nr_pages); + make_response(blkif, pending_req->id, resp->operation, + resp->status); + blkif_put(pending_req->blkif); + spin_lock_irqsave(&pend_prod_lock, flags); + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + spin_unlock_irqrestore(&pend_prod_lock, flags); + } + blktap_ufe_ring.rsp_cons = i; + maybe_trigger_blkio_schedule(); + } + return 0; +} + + +/****************************************************************************** + * NOTIFICATION FROM GUEST OS. 
+ */ + +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + blkif_t *blkif = dev_id; + add_to_blkdev_list_tail(blkif); + maybe_trigger_blkio_schedule(); + return IRQ_HANDLED; +} + + + +/****************************************************************** + * DOWNWARD CALLS -- These interface with the block-device layer proper. + */ + +static int do_block_io_op(blkif_t *blkif, int max_to_do) +{ + blkif_back_ring_t *blk_ring = &blkif->blk_ring; + blkif_request_t *req; + RING_IDX i, rp; + int more_to_do = 0; + + rp = blk_ring->sring->req_prod; + rmb(); /* Ensure we see queued requests up to 'rp'. */ + + for ( i = blk_ring->req_cons; + (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i); + i++ ) + { + if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) ) + { + more_to_do = 1; + break; + } + + req = RING_GET_REQUEST(blk_ring, i); + switch ( req->operation ) + { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + dispatch_rw_block_io(blkif, req); + break; + + default: + DPRINTK("error: unknown block io operation [%d]\n", + req->operation); + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); + break; + } + } + + blk_ring->req_cons = i; + blktap_kick_user(); + + return more_to_do; +} + +static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) +{ + blkif_request_t *target; + int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + pending_req_t *pending_req; + struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; + int op, ret; + unsigned int nseg; + + /* Check that number of segments is sane. */ + nseg = req->nr_segments; + if ( unlikely(nseg == 0) || + unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) + { + DPRINTK("Bad number of segments in request (%d)\n", nseg); + goto bad_descriptor; + } + + /* Make sure userspace is ready. */ + if (!blktap_ring_ok) { + DPRINTK("blktap: ring not ready for requests!\n"); + goto bad_descriptor; + } + + + if ( RING_FULL(&blktap_ufe_ring) ) { + WPRINTK("blktap: fe_ring is full, can't add (very broken!).\n"); + goto bad_descriptor; + } + + flush_cache_all(); /* a noop on intel... */ + + /* Map the foreign pages directly in to the application */ + op = 0; + for (i=0; i<req->nr_segments; i++) { + + unsigned long uvaddr; + unsigned long kvaddr; + unsigned long ptep; + + uvaddr = MMAP_VADDR(user_vstart, pending_idx, i); + kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i); + + /* Map the remote page to kernel. */ + map[op].host_addr = kvaddr; + map[op].dom = blkif->domid; + map[op].ref = blkif_gref_from_fas(req->frame_and_sects[i]); + map[op].flags = GNTMAP_host_map; + /* This needs a bit more thought in terms of interposition: + * If we want to be able to modify pages during write using + * grant table mappings, the guest will either need to allow + * it, or we'll need to incur a copy. Bit of an fbufs moment. ;) */ + if (req->operation == BLKIF_OP_WRITE) + map[op].flags |= GNTMAP_readonly; + op++; + + /* Now map it to user. */ + ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep); + if (ret) + { + DPRINTK("Couldn't get a pte addr!\n"); + fast_flush_area(pending_idx, req->nr_segments); + goto bad_descriptor; + } + + map[op].host_addr = ptep; + map[op].dom = blkif->domid; + map[op].ref = blkif_gref_from_fas(req->frame_and_sects[i]); + map[op].flags = GNTMAP_host_map | GNTMAP_application_map + | GNTMAP_contains_pte; + /* Above interposition comment applies here as well. 
*/ + if (req->operation == BLKIF_OP_WRITE) + map[op].flags |= GNTMAP_readonly; + op++; + } + + if ( unlikely(HYPERVISOR_grant_table_op( + GNTTABOP_map_grant_ref, map, op))) + BUG(); + + op = 0; + for (i=0; i<(req->nr_segments*2); i+=2) { + unsigned long uvaddr; + unsigned long kvaddr; + unsigned long offset; + int cancel = 0; + + uvaddr = MMAP_VADDR(user_vstart, pending_idx, i/2); + kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i/2); + + if ( unlikely(map[i].handle < 0) ) + { + DPRINTK("Error on kernel grant mapping (%d)\n", map[i].handle); + ret = map[i].handle; + cancel = 1; + } + + if ( unlikely(map[i+1].handle < 0) ) + { + DPRINTK("Error on user grant mapping (%d)\n", map[i+1].handle); + ret = map[i+1].handle; + cancel = 1; + } + + if (cancel) + { + fast_flush_area(pending_idx, req->nr_segments); + goto bad_descriptor; + } + + /* Set the necessary mappings in p2m and in the VM_FOREIGN + * vm_area_struct to allow user vaddr -> struct page lookups + * to work. This is needed for direct IO to foreign pages. */ + phys_to_machine_mapping[__pa(kvaddr) >> PAGE_SHIFT] = + FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT); + + offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT; + ((struct page **)blktap_vma->vm_private_data)[offset] = + pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); + + /* Save handles for unmapping later. */ + pending_handle(pending_idx, i/2).kernel = map[i].handle; + pending_handle(pending_idx, i/2).user = map[i+1].handle; + } + + /* Mark mapped pages as reserved: */ + for ( i = 0; i < req->nr_segments; i++ ) + { + unsigned long kvaddr; + + kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i); + SetPageReserved(pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT)); + } + + pending_req = &pending_reqs[pending_idx]; + pending_req->blkif = blkif; + pending_req->id = req->id; + pending_req->operation = req->operation; + pending_req->status = BLKIF_RSP_OKAY; + pending_req->nr_pages = nseg; + req->id = MAKE_ID(blkif->domid, pending_idx); + //atomic_set(&pending_req->pendcnt, nbio); + pending_cons++; + blkif_get(blkif); + + /* Finally, write the request message to the user ring. */ + target = RING_GET_REQUEST(&blktap_ufe_ring, blktap_ufe_ring.req_prod_pvt); + memcpy(target, req, sizeof(*req)); + blktap_ufe_ring.req_prod_pvt++; + return; + + bad_descriptor: + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); +} + + + +/****************************************************************** + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING + */ + + +static void make_response(blkif_t *blkif, unsigned long id, + unsigned short op, int st) +{ + blkif_response_t *resp; + unsigned long flags; + blkif_back_ring_t *blk_ring = &blkif->blk_ring; + + /* Place on the response ring for the relevant domain. */ + spin_lock_irqsave(&blkif->blk_ring_lock, flags); + resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt); + resp->id = id; + resp->operation = op; + resp->status = st; + wmb(); /* Ensure other side can see the response fields. */ + blk_ring->rsp_prod_pvt++; + RING_PUSH_RESPONSES(blk_ring); + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + + /* Kick the relevant domain. 
*/ + notify_via_evtchn(blkif->evtchn); +} + +static struct miscdevice blktap_miscdev = { + .minor = BLKTAP_MINOR, + .name = "blktap", + .fops = &blktap_fops, + .devfs_name = "misc/blktap", +}; + +void blkif_deschedule(blkif_t *blkif) +{ + remove_from_blkdev_list(blkif); +} + +static int __init blkif_init(void) +{ + int i, j, err; + struct page *page; +/* + if ( !(xen_start_info.flags & SIF_INITDOMAIN) && + !(xen_start_info.flags & SIF_BLK_BE_DOMAIN) ) + return 0; +*/ blkif_interface_init(); - blkdev_schedule_init(); + + page = balloon_alloc_empty_page_range(MMAP_PAGES); + BUG_ON(page == NULL); + mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + + pending_cons = 0; + pending_prod = MAX_PENDING_REQS; + memset(pending_reqs, 0, sizeof(pending_reqs)); + for ( i = 0; i < MAX_PENDING_REQS; i++ ) + pending_ring[i] = i; - (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, - CALLBACK_IN_BLOCKING_CONTEXT); - - /* Send a driver-UP notification to the domain controller. */ - cmsg.type = CMSG_BLKIF_BE; - cmsg.subtype = CMSG_BLKIF_BE_DRIVER_STATUS; - cmsg.length = sizeof(blkif_be_driver_status_t); - be_st.status = BLKIF_DRIVER_STATUS_UP; - memcpy(cmsg.msg, &be_st, sizeof(be_st)); - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); - - DPRINTK(" tap - Userland channel init:\n"); - - blktap_init(); - - DPRINTK("Blkif tap device initialized.\n"); + spin_lock_init(&blkio_schedule_list_lock); + INIT_LIST_HEAD(&blkio_schedule_list); + + if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 ) + BUG(); + + blkif_xenbus_init(); + + for (i=0; i<MAX_PENDING_REQS ; i++) + for (j=0; j<BLKIF_MAX_SEGMENTS_PER_REQUEST; j++) + BLKTAP_INVALIDATE_HANDLE(&pending_handle(i, j)); + + err = misc_register(&blktap_miscdev); + if ( err != 0 ) + { + printk(KERN_ALERT "Couldn't register /dev/misc/blktap (%d)\n", err); + return err; + } + + init_waitqueue_head(&blktap_wait); return 0; } -#if 0 /* tap doesn't handle suspend/resume */ -void blkdev_suspend(void) -{ -} - -void blkdev_resume(void) -{ - ctrl_msg_t cmsg; - blkif_fe_driver_status_t st; - - /* Send a driver-UP notification to the domain controller. */ - cmsg.type = CMSG_BLKIF_FE; - cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS; - cmsg.length = sizeof(blkif_fe_driver_status_t); - st.status = BLKIF_DRIVER_STATUS_UP; - memcpy(cmsg.msg, &st, sizeof(st)); - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} -#endif - -__initcall(xlblktap_init); +__initcall(blkif_init); diff -r 523078a33287 -r f59e0163540e linux-2.6-xen-sparse/mm/memory.c --- a/linux-2.6-xen-sparse/mm/memory.c Sun Sep 4 15:08:16 2005 +++ b/linux-2.6-xen-sparse/mm/memory.c Sun Sep 4 21:19:44 2005 @@ -954,10 +954,8 @@ i++; start += PAGE_SIZE; len--; -printk(KERN_ALERT "HIT 0x%lx\n", start); continue; } -else printk(KERN_ALERT "MISS 0x%lx\n", start); } if (!vma || (vma->vm_flags & VM_IO) diff -r 523078a33287 -r f59e0163540e tools/blktap/Makefile --- a/tools/blktap/Makefile Sun Sep 4 15:08:16 2005 +++ b/tools/blktap/Makefile Sun Sep 4 21:19:44 2005 @@ -6,7 +6,8 @@ include $(XEN_ROOT)/tools/Rules.mk SUBDIRS := -SUBDIRS += parallax +SUBDIRS += ublkback +#SUBDIRS += parallax BLKTAP_INSTALL_DIR = /usr/sbin @@ -14,12 +15,12 @@ INSTALL_PROG = $(INSTALL) -m0755 INSTALL_DIR = $(INSTALL) -d -m0755 -INCLUDES += -I. -I $(XEN_LIBXC) +INCLUDES += -I. 
-I $(XEN_LIBXC) -I $(XEN_XENSTORE) LIBS := -lpthread -lz SRCS := -SRCS += blktaplib.c +SRCS += blktaplib.c xenbus.c blkif.c CFLAGS += -Wall CFLAGS += -Werror @@ -28,17 +29,20 @@ CFLAGS += -g3 CFLAGS += -fno-strict-aliasing CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE +# get asprintf(): +CFLAGS += -D _GNU_SOURCE # Get gcc to generate the dependencies for us. CFLAGS += -Wp,-MD,.$(@F).d CFLAGS += $(INCLUDES) DEPS = .*.d OBJS = $(patsubst %.c,%.o,$(SRCS)) -IBINS = blkdump +IBINS := +#IBINS += blkdump LIB = libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR) -all: mk-symlinks libblktap.so blkdump +all: mk-symlinks libblktap.so #blkdump @set -e; for subdir in $(SUBDIRS); do \ $(MAKE) -C $$subdir $@; \ done @@ -59,7 +63,7 @@ $(INSTALL_DIR) -p $(DESTDIR)/usr/include $(INSTALL_PROG) $(LIB) $(DESTDIR)/usr/$(LIBDIR) $(INSTALL_PROG) blktaplib.h $(DESTDIR)/usr/include - $(INSTALL_PROG) $(IBINS) $(DESTDIR)$(BLKTAP_INSTALL_DIR) + #$(INSTALL_PROG) $(IBINS) $(DESTDIR)$(BLKTAP_INSTALL_DIR) @set -e; for subdir in $(SUBDIRS); do \ $(MAKE) -C $$subdir $@; \ done @@ -79,14 +83,16 @@ mv staging/i386/*.rpm . rm -rf staging -libblktap.so: $(OBJS) - $(CC) $(CFLAGS) -Wl,-soname -Wl,$(SONAME) -shared -o \ - libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS) +libblktap.so: $(OBJS) + $(CC) $(CFLAGS) -Wl,-soname -Wl,$(SONAME) -shared \ + -L$(XEN_XENSTORE) -l xenstore \ + -o libblktap.so.$(MAJOR).$(MINOR) $^ $(LIBS) ln -sf libblktap.so.$(MAJOR).$(MINOR) libblktap.so.$(MAJOR) ln -sf libblktap.so.$(MAJOR) $@ blkdump: libblktap.so - $(CC) $(CFLAGS) -o blkdump -L$(XEN_LIBXC) -L. -l blktap blkdump.c + $(CC) $(CFLAGS) -o blkdump -L$(XEN_LIBXC) -L. \ + -l blktap blkdump.c .PHONY: TAGS clean install mk-symlinks rpm diff -r 523078a33287 -r f59e0163540e tools/blktap/blkdump.c --- a/tools/blktap/blkdump.c Sun Sep 4 15:08:16 2005 +++ b/tools/blktap/blkdump.c Sun Sep 4 21:19:44 2005 @@ -8,85 +8,18 @@ #include <stdio.h> #include "blktaplib.h" -int control_print(control_msg_t *msg) -{ - if (msg->type != CMSG_BLKIF_BE) - { - printf("***\nUNEXPECTED CTRL MSG MAJOR TYPE(%d)\n***\n", msg->type); - return 0; - } - - switch(msg->subtype) - { - case CMSG_BLKIF_BE_CREATE: - if ( msg->length != sizeof(blkif_be_create_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_CREATE(d:%d,h:%d)\n", - ((blkif_be_create_t *)msg->msg)->domid, - ((blkif_be_create_t *)msg->msg)->blkif_handle); - break; - case CMSG_BLKIF_BE_DESTROY: - if ( msg->length != sizeof(blkif_be_destroy_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_DESTROY(d:%d,h:%d)\n", - ((blkif_be_destroy_t *)msg->msg)->domid, - ((blkif_be_destroy_t *)msg->msg)->blkif_handle); - break; - case CMSG_BLKIF_BE_CONNECT: - if ( msg->length != sizeof(blkif_be_connect_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_CONNECT(d:%d,h:%d)\n", - ((blkif_be_connect_t *)msg->msg)->domid, - ((blkif_be_connect_t *)msg->msg)->blkif_handle); - break; - case CMSG_BLKIF_BE_DISCONNECT: - if ( msg->length != sizeof(blkif_be_disconnect_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_DISCONNECT(d:%d,h:%d)\n", - ((blkif_be_disconnect_t *)msg->msg)->domid, - ((blkif_be_disconnect_t *)msg->msg)->blkif_handle); - break; - case CMSG_BLKIF_BE_VBD_CREATE: - if ( msg->length != sizeof(blkif_be_vbd_create_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_CREATE(d:%d,h:%d,v:%d)\n", - ((blkif_be_vbd_create_t *)msg->msg)->domid, - ((blkif_be_vbd_create_t *)msg->msg)->blkif_handle, - ((blkif_be_vbd_create_t 
*)msg->msg)->vdevice); - break; - case CMSG_BLKIF_BE_VBD_DESTROY: - if ( msg->length != sizeof(blkif_be_vbd_destroy_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_DESTROY(d:%d,h:%d,v:%d)\n", - ((blkif_be_vbd_destroy_t *)msg->msg)->domid, - ((blkif_be_vbd_destroy_t *)msg->msg)->blkif_handle, - ((blkif_be_vbd_destroy_t *)msg->msg)->vdevice); - break; - default: - goto parse_error; - } - - return 0; - -parse_error: - printf("[CONTROL_MSG] Bad message type or length!\n"); - return 0; -} - int request_print(blkif_request_t *req) { int i; unsigned long fas; - if ( req->operation == BLKIF_OP_PROBE ) { - printf("[%2u:%2u<%s]\n", ID_TO_DOM(req->id), ID_TO_IDX(req->id), - blkif_op_name[req->operation]); - return BLKTAP_PASS; - } else { + if ( (req->operation == BLKIF_OP_READ) || + (req->operation == BLKIF_OP_WRITE) ) + { printf("[%2u:%2u<%5s] (nr_segs: %03u, dev: %03u, %010llu)\n", ID_TO_DOM(req->id), ID_TO_IDX(req->id), blkif_op_name[req->operation], - req->nr_segments, req->device, + req->nr_segments, req->handle, req->sector_number); @@ -99,6 +32,8 @@ ); } + } else { + printf("Unknown request message type.\n"); } return BLKTAP_PASS; @@ -106,23 +41,22 @@ int response_print(blkif_response_t *rsp) { - if ( rsp->operation == BLKIF_OP_PROBE ) { - printf("[%2u:%2u>%s]\n", ID_TO_DOM(rsp->id), ID_TO_IDX(rsp->id), - blkif_op_name[rsp->operation]); - return BLKTAP_PASS; - } else { + if ( (rsp->operation == BLKIF_OP_READ) || + (rsp->operation == BLKIF_OP_WRITE) ) + { printf("[%2u:%2u>%5s] (status: %d)\n", ID_TO_DOM(rsp->id), ID_TO_IDX(rsp->id), blkif_op_name[rsp->operation], rsp->status); + } else { + printf("Unknown request message type.\n"); } return BLKTAP_PASS; } int main(int argc, char *argv[]) { - blktap_register_ctrl_hook("control_print", control_print); blktap_register_request_hook("request_print", request_print); blktap_register_response_hook("response_print", response_print); blktap_listen(); diff -r 523078a33287 -r f59e0163540e tools/blktap/blktaplib.c --- a/tools/blktap/blktaplib.c Sun Sep 4 15:08:16 2005 +++ b/tools/blktap/blktaplib.c Sun Sep 4 21:19:44 2005 @@ -24,7 +24,7 @@ #include <string.h> #include <unistd.h> #include <pthread.h> - +#include <xs.h> #define __COMPILING_BLKTAP_LIB #include "blktaplib.h" @@ -34,11 +34,12 @@ #else #define DPRINTF(_f, _a...) ((void)0) #endif -#define DEBUG_RING_IDXS 1 +#define DEBUG_RING_IDXS 0 #define POLLRDNORM 0x040 #define BLKTAP_IOCTL_KICK 1 + void got_sig_bus(); void got_sig_int(); @@ -46,17 +47,13 @@ /* in kernel these are opposite, but we are a consumer now. 
*/ blkif_back_ring_t fe_ring; /* slightly counterintuitive ;) */ blkif_front_ring_t be_ring; -ctrl_back_ring_t ctrl_ring; unsigned long mmap_vstart = 0; char *blktap_mem; int fd = 0; -#define BLKTAP_RING_PAGES 3 /* Ctrl, Back, Front */ -/*#define BLKTAP_MMAP_PAGES ((11 + 1) * 64)*/ -#define BLKTAP_MMAP_PAGES \ - ((BLKIF_MAX_SEGMENTS_PER_REQUEST + 1) * BLKIF_RING_SIZE) -#define BLKTAP_MMAP_REGION_SIZE (BLKTAP_RING_PAGES + BLKTAP_MMAP_PAGES) +#define BLKTAP_RING_PAGES 1 /* Front */ +#define BLKTAP_MMAP_REGION_SIZE (BLKTAP_RING_PAGES + MMAP_PAGES) int bad_count = 0; void bad(void) @@ -79,126 +76,13 @@ } inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); } -/* + static int (*request_hook)(blkif_request_t *req) = NULL; static int (*response_hook)(blkif_response_t *req) = NULL; -*/ - -/*-----[ Request/Response hook chains.]----------------------------------*/ - -#define HOOK_NAME_MAX 50 - -typedef struct ctrl_hook_st { - char name[HOOK_NAME_MAX]; - int (*func)(control_msg_t *); - struct ctrl_hook_st *next; -} ctrl_hook_t; - -typedef struct request_hook_st { - char name[HOOK_NAME_MAX]; - int (*func)(blkif_request_t *); - struct request_hook_st *next; -} request_hook_t; - -typedef struct response_hook_st { - char name[HOOK_NAME_MAX]; - int (*func)(blkif_response_t *); - struct response_hook_st *next; -} response_hook_t; - -static ctrl_hook_t *ctrl_hook_chain = NULL; -static request_hook_t *request_hook_chain = NULL; -static response_hook_t *response_hook_chain = NULL; - -void blktap_register_ctrl_hook(char *name, int (*ch)(control_msg_t *)) -{ - ctrl_hook_t *ch_ent, **c; - - ch_ent = (ctrl_hook_t *)malloc(sizeof(ctrl_hook_t)); - if (!ch_ent) { printf("couldn't allocate a new hook\n"); exit(-1); } - - ch_ent->func = ch; - ch_ent->next = NULL; - strncpy(ch_ent->name, name, HOOK_NAME_MAX); - ch_ent->name[HOOK_NAME_MAX-1] = '\0'; - - c = &ctrl_hook_chain; - while (*c != NULL) { - c = &(*c)->next; - } - *c = ch_ent; -} - -void blktap_register_request_hook(char *name, int (*rh)(blkif_request_t *)) -{ - request_hook_t *rh_ent, **c; - - rh_ent = (request_hook_t *)malloc(sizeof(request_hook_t)); - if (!rh_ent) { printf("couldn't allocate a new hook\n"); exit(-1); } - - rh_ent->func = rh; - rh_ent->next = NULL; - strncpy(rh_ent->name, name, HOOK_NAME_MAX); - - c = &request_hook_chain; - while (*c != NULL) { - c = &(*c)->next; - } - *c = rh_ent; -} - -void blktap_register_response_hook(char *name, int (*rh)(blkif_response_t *)) -{ - response_hook_t *rh_ent, **c; - - rh_ent = (response_hook_t *)malloc(sizeof(response_hook_t)); - if (!rh_ent) { printf("couldn't allocate a new hook\n"); exit(-1); } - - rh_ent->func = rh; - rh_ent->next = NULL; - strncpy(rh_ent->name, name, HOOK_NAME_MAX); - - c = &response_hook_chain; - while (*c != NULL) { - c = &(*c)->next; - } - *c = rh_ent; -} - -void print_hooks(void) -{ - request_hook_t *req_hook; - response_hook_t *rsp_hook; - ctrl_hook_t *ctrl_hook; - - DPRINTF("Control Hooks:\n"); - ctrl_hook = ctrl_hook_chain; - while (ctrl_hook != NULL) - { - DPRINTF(" [0x%p] %s\n", ctrl_hook->func, ctrl_hook->name); - ctrl_hook = ctrl_hook->next; - } - - DPRINTF("Request Hooks:\n"); - req_hook = request_hook_chain; - while (req_hook != NULL) - { - DPRINTF(" [0x%p] %s\n", req_hook->func, req_hook->name); - req_hook = req_hook->next; - } - - DPRINTF("Response Hooks:\n"); - rsp_hook = response_hook_chain; - while (rsp_hook != NULL) - { - DPRINTF(" [0x%p] %s\n", rsp_hook->func, rsp_hook->name); - rsp_hook = rsp_hook->next; - } -} /*-----[ Data to/from Backend (server) VM 
]------------------------------*/ - +/* inline int write_req_to_be_ring(blkif_request_t *req) { @@ -214,6 +98,7 @@ return 0; } +*/ inline int write_rsp_to_fe_ring(blkif_response_t *rsp) { @@ -230,14 +115,14 @@ return 0; } -static void apply_rsp_hooks(blkif_response_t *rsp) +static void apply_rsp_hooks(blkif_t *blkif, blkif_response_t *rsp) { response_hook_t *rsp_hook; - rsp_hook = response_hook_chain; + rsp_hook = blkif->response_hook_chain; while (rsp_hook != NULL) { - switch(rsp_hook->func(rsp)) + switch(rsp_hook->func(blkif, rsp, 1)) { case BLKTAP_PASS: break; @@ -248,15 +133,19 @@ } } + static pthread_mutex_t push_mutex = PTHREAD_MUTEX_INITIALIZER; -void blktap_inject_response(blkif_response_t *rsp) -{ - - apply_rsp_hooks(rsp); - +void blkif_inject_response(blkif_t *blkif, blkif_response_t *rsp) +{ + + apply_rsp_hooks(blkif, rsp); + write_rsp_to_fe_ring(rsp); - +} + +void blktap_kick_responses(void) +{ pthread_mutex_lock(&push_mutex); RING_PUSH_RESPONSES(&fe_ring); @@ -277,7 +166,7 @@ int active; } pollhook_t; -static struct pollfd pfd[MAX_POLLFDS+1]; +static struct pollfd pfd[MAX_POLLFDS+2]; /* tap and store are extra */ static pollhook_t pollhooks[MAX_POLLFDS]; static unsigned int ph_freelist[MAX_POLLFDS]; static unsigned int ph_cons, ph_prod; @@ -344,65 +233,65 @@ int blktap_listen(void) { - int notify_be, notify_fe, tap_pfd; - + int notify_be, notify_fe, tap_pfd, store_pfd, xs_fd, ret; + struct xs_handle *h; + blkif_t *blkif; + /* comms rings: */ blkif_request_t *req; blkif_response_t *rsp; - control_msg_t *msg; blkif_sring_t *sring; - ctrl_sring_t *csring; RING_IDX rp, i, pfd_count; /* pending rings */ blkif_request_t req_pending[BLKIF_RING_SIZE]; - blkif_response_t rsp_pending[BLKIF_RING_SIZE]; + /* blkif_response_t rsp_pending[BLKIF_RING_SIZE] */; /* handler hooks: */ request_hook_t *req_hook; response_hook_t *rsp_hook; - ctrl_hook_t *ctrl_hook; signal (SIGBUS, got_sig_bus); signal (SIGINT, got_sig_int); - print_hooks(); - + __init_blkif(); + fd = open("/dev/blktap", O_RDWR); - if (fd == -1) { - printf("open failed! (%d)\n", errno); - goto open_failed; - } + if (fd == -1) + err(-1, "open failed!"); blktap_mem = mmap(0, PAGE_SIZE * BLKTAP_MMAP_REGION_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if ((int)blktap_mem == -1) { - printf("mmap failed! (%d)\n", errno); - goto mmap_failed; - } + if ((int)blktap_mem == -1) + err(-1, "mmap failed!"); /* assign the rings to the mapped memory */ - csring = (ctrl_sring_t *)blktap_mem; - BACK_RING_INIT(&ctrl_ring, csring, PAGE_SIZE); - +/* sring = (blkif_sring_t *)((unsigned long)blktap_mem + PAGE_SIZE); FRONT_RING_INIT(&be_ring, sring, PAGE_SIZE); - - sring = (blkif_sring_t *)((unsigned long)blktap_mem + (2 *PAGE_SIZE)); +*/ + sring = (blkif_sring_t *)((unsigned long)blktap_mem); BACK_RING_INIT(&fe_ring, sring, PAGE_SIZE); mmap_vstart = (unsigned long)blktap_mem +(BLKTAP_RING_PAGES << PAGE_SHIFT); + + /* Set up store connection and watch. 
*/ + h = xs_daemon_open(); + if (h == NULL) + err(-1, "xs_daemon_open"); + + ret = add_blockdevice_probe_watch(h, "Domain-0"); + if (ret != 0) + err(0, "adding device probewatch"); + ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE ); while(1) { int ret; /* build the poll list */ - - DPRINTF("Building poll list.\n"); - pfd_count = 0; for ( i=0; i < MAX_POLLFDS; i++ ) { pollhook_t *ph = &pollhooks[i]; @@ -415,49 +304,31 @@ } } - tap_pfd = pfd_count; + tap_pfd = pfd_count++; pfd[tap_pfd].fd = fd; pfd[tap_pfd].events = POLLIN; - DPRINTF("poll() %d fds.\n", pfd_count); + store_pfd = pfd_count++; + pfd[store_pfd].fd = xs_fileno(h); + pfd[store_pfd].events = POLLIN; - if ( (ret = (poll(pfd, pfd_count+1, 10000)) == 0) ) { + if ( (ret = (poll(pfd, pfd_count, 10000)) == 0) ) { if (DEBUG_RING_IDXS) ioctl(fd, BLKTAP_IOCTL_PRINT_IDXS); continue; } - DPRINTF("poll returned %d\n", ret); - for (i=0; i < MAX_POLLFDS; i++) { if ( (pollhooks[i].active ) && (pollhooks[i].pfd->revents ) ) pollhooks[i].func(pollhooks[i].pfd->fd); } - if (pfd[tap_pfd].revents) { - - /* empty the control ring */ - rp = ctrl_ring.sring->req_prod; - rmb(); - for (i = ctrl_ring.req_cons; i < rp; i++) - { - msg = RING_GET_REQUEST(&ctrl_ring, i); - - ctrl_hook = ctrl_hook_chain; - while (ctrl_hook != NULL) - { - DPRINTF("CTRL_HOOK: %s\n", ctrl_hook->name); - /* We currently don't respond to ctrl messages. */ - ctrl_hook->func(msg); - ctrl_hook = ctrl_hook->next; - } - } - /* Using this as a unidirectional ring. */ - ctrl_ring.req_cons = ctrl_ring.rsp_prod_pvt = i; -pthread_mutex_lock(&push_mutex); - RING_PUSH_RESPONSES(&ctrl_ring); -pthread_mutex_unlock(&push_mutex); - + if (pfd[store_pfd].revents) { + ret = xs_fire_next_watch(h); + } + + if (pfd[tap_pfd].revents) + { /* empty the fe_ring */ notify_fe = 0; notify_be = RING_HAS_UNCONSUMED_REQUESTS(&fe_ring); @@ -465,44 +336,62 @@ rmb(); for (i = fe_ring.req_cons; i != rp; i++) { - int done = 0; /* stop forwarding this request */ + int done = 0; req = RING_GET_REQUEST(&fe_ring, i); memcpy(&req_pending[ID_TO_IDX(req->id)], req, sizeof(*req)); req = &req_pending[ID_TO_IDX(req->id)]; - DPRINTF("copying an fe request\n"); - - req_hook = request_hook_chain; - while (req_hook != NULL) + blkif = blkif_find_by_handle(ID_TO_DOM(req->id), req->handle); + + if (blkif != NULL) { - DPRINTF("REQ_HOOK: %s\n", req_hook->name); - switch(req_hook->func(req)) + req_hook = blkif->request_hook_chain; + while (req_hook != NULL) { - case BLKTAP_RESPOND: - apply_rsp_hooks((blkif_response_t *)req); - write_rsp_to_fe_ring((blkif_response_t *)req); - notify_fe = 1; - done = 1; - break; - case BLKTAP_STOLEN: - done = 1; - break; - case BLKTAP_PASS: - break; - default: - printf("Unknown request hook return value!\n"); + switch(req_hook->func(blkif, req, ((i+1) == rp))) + { + case BLKTAP_RESPOND: + apply_rsp_hooks(blkif, (blkif_response_t *)req); + write_rsp_to_fe_ring((blkif_response_t *)req); + notify_fe = 1; + done = 1; + break; + case BLKTAP_STOLEN: + done = 1; + break; + case BLKTAP_PASS: + break; + default: + printf("Unknown request hook return value!\n"); + } + if (done) break; + req_hook = req_hook->next; } - if (done) break; - req_hook = req_hook->next; } - if (done == 0) write_req_to_be_ring(req); + if (done == 0) + { + /* this was: */ + /* write_req_to_be_ring(req); */ + + unsigned long id = req->id; + unsigned short operation = req->operation; + printf("Unterminated request!\n"); + rsp = (blkif_response_t *)req; + rsp->id = id; + rsp->operation = operation; + rsp->status = BLKIF_RSP_ERROR; + 
write_rsp_to_fe_ring(rsp); + notify_fe = 1; + done = 1; + } } fe_ring.req_cons = i; /* empty the be_ring */ +/* notify_fe |= RING_HAS_UNCONSUMED_RESPONSES(&be_ring); rp = be_ring.sring->rsp_prod; rmb(); @@ -519,9 +408,9 @@ write_rsp_to_fe_ring(rsp); } be_ring.rsp_cons = i; - +*/ /* notify the domains */ - +/* if (notify_be) { DPRINTF("notifying be\n"); pthread_mutex_lock(&push_mutex); @@ -529,13 +418,13 @@ ioctl(fd, BLKTAP_IOCTL_KICK_BE); pthread_mutex_unlock(&push_mutex); } - +*/ if (notify_fe) { DPRINTF("notifying fe\n"); -pthread_mutex_lock(&push_mutex); + pthread_mutex_lock(&push_mutex); RING_PUSH_RESPONSES(&fe_ring); ioctl(fd, BLKTAP_IOCTL_KICK_FE); -pthread_mutex_unlock(&push_mutex); + pthread_mutex_unlock(&push_mutex); } } } diff -r 523078a33287 -r f59e0163540e tools/blktap/blktaplib.h --- a/tools/blktap/blktaplib.h Sun Sep 4 15:08:16 2005 +++ b/tools/blktap/blktaplib.h Sun Sep 4 21:19:44 2005 @@ -2,6 +2,9 @@ * * userland accessors to the block tap. * + * Sept 2/05 -- I'm scaling this back to only support block remappings + * to user in a backend domain. Passthrough and interposition can be readded + * once transitive grants are available. */ #ifndef __BLKTAPLIB_H__ @@ -13,6 +16,7 @@ #include <xen/io/blkif.h> #include <xen/io/ring.h> #include <xen/io/domain_controller.h> +#include <xs.h> /* /dev/xen/blktap resides at device number major=10, minor=202 */ #define BLKTAP_MINOR 202 @@ -49,12 +53,18 @@ return ( ( arg == BLKTAP_MODE_PASSTHROUGH ) || ( arg == BLKTAP_MODE_INTERCEPT_FE ) || + ( arg == BLKTAP_MODE_INTERPOSE ) ); +/* + return ( + ( arg == BLKTAP_MODE_PASSTHROUGH ) || + ( arg == BLKTAP_MODE_INTERCEPT_FE ) || ( arg == BLKTAP_MODE_INTERCEPT_BE ) || ( arg == BLKTAP_MODE_INTERPOSE ) || ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE ) || ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE ) || ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH ) ); +*/ } /* Return values for handling messages in hooks. */ @@ -62,29 +72,88 @@ #define BLKTAP_RESPOND 1 /* Request is now a reply. Return it. */ #define BLKTAP_STOLEN 2 /* Hook has stolen request. 
*/ -#define domid_t unsigned short +//#define domid_t unsigned short inline unsigned int ID_TO_IDX(unsigned long id); inline domid_t ID_TO_DOM(unsigned long id); -void blktap_register_ctrl_hook(char *name, int (*ch)(control_msg_t *)); -void blktap_register_request_hook(char *name, int (*rh)(blkif_request_t *)); -void blktap_register_response_hook(char *name, int (*rh)(blkif_response_t *)); -void blktap_inject_response(blkif_response_t *); int blktap_attach_poll(int fd, short events, int (*func)(int)); void blktap_detach_poll(int fd); int blktap_listen(void); +struct blkif; + +typedef struct request_hook_st { + char *name; + int (*func)(struct blkif *, blkif_request_t *, int); + struct request_hook_st *next; +} request_hook_t; + +typedef struct response_hook_st { + char *name; + int (*func)(struct blkif *, blkif_response_t *, int); + struct response_hook_st *next; +} response_hook_t; + +struct blkif_ops { + long int (*get_size)(struct blkif *blkif); + long int (*get_secsize)(struct blkif *blkif); + unsigned (*get_info)(struct blkif *blkif); +}; + +typedef struct blkif { + domid_t domid; + long int handle; + + long int pdev; + long int readonly; + + enum { DISCONNECTED, CONNECTED } state; + + struct blkif_ops *ops; + request_hook_t *request_hook_chain; + response_hook_t *response_hook_chain; + + struct blkif *hash_next; + + void *prv; /* device-specific data */ +} blkif_t; + +void register_new_blkif_hook(int (*fn)(blkif_t *blkif)); +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle); +blkif_t *alloc_blkif(domid_t domid); +int blkif_init(blkif_t *blkif, long int handle, long int pdev, + long int readonly); +void free_blkif(blkif_t *blkif); +void __init_blkif(void); + + +/* xenstore/xenbus: */ +extern int add_blockdevice_probe_watch(struct xs_handle *h, + const char *domname); +int xs_fire_next_watch(struct xs_handle *h); + + +void blkif_print_hooks(blkif_t *blkif); +void blkif_register_request_hook(blkif_t *blkif, char *name, + int (*rh)(blkif_t *, blkif_request_t *, int)); +void blkif_register_response_hook(blkif_t *blkif, char *name, + int (*rh)(blkif_t *, blkif_response_t *, int)); +void blkif_inject_response(blkif_t *blkif, blkif_response_t *); +void blktap_kick_responses(void); + +/* this must match the underlying driver... 
*/ +#define MAX_PENDING_REQS 64 + /* Accessing attached data page mappings */ -#define MMAP_PAGES_PER_REQUEST \ - (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1) -#define MMAP_VADDR(_req,_seg) \ - (mmap_vstart + \ - ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \ +#define MMAP_PAGES \ + (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) +#define MMAP_VADDR(_req,_seg) \ + (mmap_vstart + \ + ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ ((_seg) * PAGE_SIZE)) extern unsigned long mmap_vstart; - /* Defines that are only used by library clients */ @@ -93,7 +162,6 @@ static char *blkif_op_name[] = { [BLKIF_OP_READ] = "READ", [BLKIF_OP_WRITE] = "WRITE", - [BLKIF_OP_PROBE] = "PROBE", }; #endif /* __COMPILING_BLKTAP_LIB */ diff -r 523078a33287 -r f59e0163540e linux-2.6-xen-sparse/drivers/xen/blktap/common.h --- /dev/null Sun Sep 4 15:08:16 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/common.h Sun Sep 4 21:19:44 2005 @@ -0,0 +1,112 @@ + +#ifndef __BLKIF__BACKEND__COMMON_H__ +#define __BLKIF__BACKEND__COMMON_H__ + +#include <linux/config.h> +#include <linux/version.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/vmalloc.h> +#include <asm/io.h> +#include <asm/setup.h> +#include <asm/pgalloc.h> +#include <asm-xen/evtchn.h> +#include <asm-xen/hypervisor.h> +#include <asm-xen/xen-public/io/blkif.h> +#include <asm-xen/xen-public/io/ring.h> +#include <asm-xen/gnttab.h> + +#if 0 +#define ASSERT(_p) \ + if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ + __LINE__, __FILE__); *(int*)0=0; } +#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) +#else +#define ASSERT(_p) ((void)0) +#define DPRINTK(_f, _a...) ((void)0) +#endif + +#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args) + +struct vbd { + blkif_vdev_t handle; /* what the domain refers to this vbd as */ + unsigned char readonly; /* Non-zero -> read-only */ + unsigned char type; /* VDISK_xxx */ + blkif_pdev_t pdevice; /* phys device that this vbd maps to */ + struct block_device *bdev; +}; + +typedef struct blkif_st { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Physical parameters of the comms window. */ + unsigned long shmem_frame; + unsigned int evtchn; + unsigned int remote_evtchn; + /* Comms information. */ + blkif_back_ring_t blk_ring; + /* VBDs attached to this interface. */ + struct vbd vbd; + /* Private fields. */ + enum { DISCONNECTED, CONNECTED } status; +#ifdef CONFIG_XEN_BLKDEV_TAP_BE + /* Is this a blktap frontend */ + unsigned int is_blktap; +#endif + struct list_head blkdev_list; + spinlock_t blk_ring_lock; + atomic_t refcnt; + + struct work_struct free_work; + u16 shmem_handle; + unsigned long shmem_vaddr; + grant_ref_t shmem_ref; +} blkif_t; + +void blkif_create(blkif_be_create_t *create); +void blkif_destroy(blkif_be_destroy_t *destroy); +void blkif_connect(blkif_be_connect_t *connect); +int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id); +void blkif_disconnect_complete(blkif_t *blkif); +blkif_t *alloc_blkif(domid_t domid); +void free_blkif_callback(blkif_t *blkif); +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn); + +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define blkif_put(_b) \ + do { \ + if ( atomic_dec_and_test(&(_b)->refcnt) ) \ + free_blkif_callback(_b); \ + } while (0) + +/* Create a vbd. 
*/ +int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, blkif_pdev_t pdevice, + int readonly); +void vbd_free(struct vbd *vbd); + +unsigned long vbd_size(struct vbd *vbd); +unsigned int vbd_info(struct vbd *vbd); +unsigned long vbd_secsize(struct vbd *vbd); + +struct phys_req { + unsigned short dev; + unsigned short nr_sects; + struct block_device *bdev; + blkif_sector_t sector_number; +}; + +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); + +void blkif_interface_init(void); + +void blkif_deschedule(blkif_t *blkif); + +void blkif_xenbus_init(void); + +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); + +#endif /* __BLKIF__BACKEND__COMMON_H__ */ diff -r 523078a33287 -r f59e0163540e linux-2.6-xen-sparse/drivers/xen/blktap/interface.c --- /dev/null Sun Sep 4 15:08:16 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/interface.c Sun Sep 4 21:19:44 2005 @@ -0,0 +1,141 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/interface.c + * + * Block-device interface management. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" +#include <asm-xen/evtchn.h> + +static kmem_cache_t *blkif_cachep; + +blkif_t *alloc_blkif(domid_t domid) +{ + blkif_t *blkif; + + blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); + if (!blkif) + return ERR_PTR(-ENOMEM); + + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + blkif->status = DISCONNECTED; + spin_lock_init(&blkif->blk_ring_lock); + atomic_set(&blkif->refcnt, 1); + + return blkif; +} + +static int map_frontend_page(blkif_t *blkif, unsigned long localaddr, + unsigned long shared_page) +{ + struct gnttab_map_grant_ref op; + op.host_addr = localaddr; + op.flags = GNTMAP_host_map; + op.ref = shared_page; + op.dom = blkif->domid; + + BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) ); + + if (op.handle < 0) { + DPRINTK(" Grant table operation failure !\n"); + return op.handle; + } + + blkif->shmem_ref = shared_page; + blkif->shmem_handle = op.handle; + blkif->shmem_vaddr = localaddr; + return 0; +} + +static void unmap_frontend_page(blkif_t *blkif) +{ + struct gnttab_unmap_grant_ref op; + + op.host_addr = blkif->shmem_vaddr; + op.handle = blkif->shmem_handle; + op.dev_bus_addr = 0; + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); +} + +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn) +{ + struct vm_struct *vma; + blkif_sring_t *sring; + evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain }; + int err; + + BUG_ON(blkif->remote_evtchn); + + if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL ) + return -ENOMEM; + + err = map_frontend_page(blkif, (unsigned long)vma->addr, shared_page); + if (err) { + vfree(vma->addr); + return err; + } + + op.u.bind_interdomain.dom1 = DOMID_SELF; + op.u.bind_interdomain.dom2 = blkif->domid; + op.u.bind_interdomain.port1 = 0; + op.u.bind_interdomain.port2 = evtchn; + err = HYPERVISOR_event_channel_op(&op); + if (err) { + unmap_frontend_page(blkif); + vfree(vma->addr); + return err; + } + + blkif->evtchn = op.u.bind_interdomain.port1; + blkif->remote_evtchn = evtchn; + + sring = (blkif_sring_t *)vma->addr; + SHARED_RING_INIT(sring); + BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE); + + bind_evtchn_to_irqhandler(blkif->evtchn, blkif_be_int, 0, "blkif-backend", + blkif); + blkif->status = CONNECTED; + blkif->shmem_frame = shared_page; + + return 0; +} + +static void free_blkif(void *arg) +{ + evtchn_op_t op = { .cmd = 
EVTCHNOP_close }; + blkif_t *blkif = (blkif_t *)arg; + + op.u.close.port = blkif->evtchn; + op.u.close.dom = DOMID_SELF; + HYPERVISOR_event_channel_op(&op); + op.u.close.port = blkif->remote_evtchn; + op.u.close.dom = blkif->domid; + HYPERVISOR_event_channel_op(&op); + + if (blkif->evtchn) + unbind_evtchn_from_irqhandler(blkif->evtchn, blkif); + + if (blkif->blk_ring.sring) { + unmap_frontend_page(blkif); + vfree(blkif->blk_ring.sring); + blkif->blk_ring.sring = NULL; + } + + kmem_cache_free(blkif_cachep, blkif); +} + +void free_blkif_callback(blkif_t *blkif) +{ + INIT_WORK(&blkif->free_work, free_blkif, (void *)blkif); + schedule_work(&blkif->free_work); +} + +void __init blkif_interface_init(void) +{ + blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), + 0, 0, NULL, NULL); +} diff -r 523078a33287 -r f59e0163540e linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c --- /dev/null Sun Sep 4 15:08:16 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/xenbus.c Sun Sep 4 21:19:44 2005 @@ -0,0 +1,225 @@ +/* Xenbus code for blkif tap + + A Warfield. + + Hastily modified from the oroginal backend code: + + Copyright (C) 2005 Rusty Russell <rusty@xxxxxxxxxxxxxxx> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include <stdarg.h> +#include <linux/module.h> +#include <asm-xen/xenbus.h> +#include "common.h" + +struct backend_info +{ + struct xenbus_device *dev; + + /* our communications channel */ + blkif_t *blkif; + + long int frontend_id; + + /* watch back end for changes */ + struct xenbus_watch backend_watch; + + /* watch front end for changes */ + struct xenbus_watch watch; + char *frontpath; +}; + +static int blkback_remove(struct xenbus_device *dev) +{ + struct backend_info *be = dev->data; + + if (be->watch.node) + unregister_xenbus_watch(&be->watch); + unregister_xenbus_watch(&be->backend_watch); + if (be->blkif) + blkif_put(be->blkif); + if (be->frontpath) + kfree(be->frontpath); + kfree(be); + return 0; +} + +/* Front end tells us frame. */ +static void frontend_changed(struct xenbus_watch *watch, const char *node) +{ + unsigned long ring_ref; + unsigned int evtchn; + int err; + struct backend_info *be + = container_of(watch, struct backend_info, watch); + + /* If other end is gone, delete ourself. */ + if (node && !xenbus_exists(be->frontpath, "")) { + xenbus_rm(be->dev->nodename, ""); + device_unregister(&be->dev->dev); + return; + } + if (be->blkif == NULL || be->blkif->status == CONNECTED) + return; + + err = xenbus_gather(be->frontpath, "ring-ref", "%lu", &ring_ref, + "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_error(be->dev, err, + "reading %s/ring-ref and event-channel", + be->frontpath); + return; + } + + /* Map the shared frame, irq etc. 
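blkif_map() (interface.c) grant-maps the frontend's ring page using ring-ref, binds an interdomain event channel to evtchn, and initialises the back ring, leaving the interface CONNECTED.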
*/ + err = blkif_map(be->blkif, ring_ref, evtchn); + if (err) { + xenbus_dev_error(be->dev, err, "mapping ring-ref %lu port %u", + ring_ref, evtchn); + goto abort; + } + + xenbus_dev_ok(be->dev); + + return; + +abort: + xenbus_transaction_end(1); +} + +/* + Setup supplies physical device. + We provide event channel and device details to front end. + Frontend supplies shared frame and event channel. + */ +static void backend_changed(struct xenbus_watch *watch, const char *node) +{ + int err; + char *p; + long int handle; + struct backend_info *be + = container_of(watch, struct backend_info, backend_watch); + struct xenbus_device *dev = be->dev; + + if (be->blkif == NULL) { + /* Front end dir is a number, which is used as the handle. */ + p = strrchr(be->frontpath, '/') + 1; + handle = simple_strtoul(p, NULL, 0); + + be->blkif = alloc_blkif(be->frontend_id); + if (IS_ERR(be->blkif)) { + err = PTR_ERR(be->blkif); + be->blkif = NULL; + xenbus_dev_error(dev, err, "creating block interface"); + return; + } + + /* Pass in NULL node to skip exist test. */ + frontend_changed(&be->watch, NULL); + } +} + +static int blkback_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + struct backend_info *be; + char *frontend; + int err; + + be = kmalloc(sizeof(*be), GFP_KERNEL); + if (!be) { + xenbus_dev_error(dev, -ENOMEM, "allocating backend structure"); + return -ENOMEM; + } + memset(be, 0, sizeof(*be)); + + frontend = NULL; + err = xenbus_gather(dev->nodename, + "frontend-id", "%li", &be->frontend_id, + "frontend", NULL, &frontend, + NULL); + if (XENBUS_EXIST_ERR(err)) + goto free_be; + if (err < 0) { + xenbus_dev_error(dev, err, + "reading %s/frontend or frontend-id", + dev->nodename); + goto free_be; + } + if (strlen(frontend) == 0 || !xenbus_exists(frontend, "")) { + /* If we can't get a frontend path and a frontend-id, + * then our bus-id is no longer valid and we need to + * destroy the backend device. + */ + err = -ENOENT; + goto free_be; + } + + be->dev = dev; + be->backend_watch.node = dev->nodename; + be->backend_watch.callback = backend_changed; + err = register_xenbus_watch(&be->backend_watch); + if (err) { + be->backend_watch.node = NULL; + xenbus_dev_error(dev, err, "adding backend watch on %s", + dev->nodename); + goto free_be; + } + + be->frontpath = frontend; + be->watch.node = be->frontpath; + be->watch.callback = frontend_changed; + err = register_xenbus_watch(&be->watch); + if (err) { + be->watch.node = NULL; + xenbus_dev_error(dev, err, + "adding frontend watch on %s", + be->frontpath); + goto free_be; + } + + dev->data = be; + + backend_changed(&be->backend_watch, dev->nodename); + return 0; + + free_be: + if (be->backend_watch.node) + unregister_xenbus_watch(&be->backend_watch); + if (frontend) + kfree(frontend); + kfree(be); + return err; +} + +static struct xenbus_device_id blkback_ids[] = { + { "vbd" }, + { "" } +}; + +static struct xenbus_driver blkback = { + .name = "vbd", + .owner = THIS_MODULE, + .ids = blkback_ids, + .probe = blkback_probe, + .remove = blkback_remove, +}; + +void blkif_xenbus_init(void) +{ + xenbus_register_backend(&blkback); +} diff -r 523078a33287 -r f59e0163540e tools/blktap/README.sept05 --- /dev/null Sun Sep 4 15:08:16 2005 +++ b/tools/blktap/README.sept05 Sun Sep 4 21:19:44 2005 @@ -0,0 +1,33 @@ +The blktap has been rewritten substantially based on the current +blkback driver. I've removed passthrough support, as this is broken +by the move to grant tables and the lack of transitive grants. 
A +blktap VM is now only capable of terminating block requests in +userspace. + +ublkback/ contains a _very_ initial cut at a user-level version of the block +backend driver. It gives a working example of how the current tap +interfaces are used, in particular w.r.t. the vbd directories in +xenstore. + +parallax/ contains fairly recent parallax code. This does not run on +the changed blktap interface, but should only be a couple of hours +work to get going again. + +All of the tricky bits are done, but there is plenty of cleaning to +do, and the top-level functionality is not here yet. At the moment, +the daemon ignores the pdev requested by the tools and opens the file +or device specified by TMP_IMAGE_FILE_NAME in ublkback.c. + +TODO: +1. Fix to allow pdev in the store to specify the device to open. +2. Add support (to tools as well) to mount arbitrary files... + just write the filename to mount into the store, instead of pdev. +3. Reeximine blkif refcounting, it is almost certainly broken at the moment. + - creating a blkif should take a reference. + - each inflight request should take a reference on dequeue in blktaplib + - sending responses should drop refs. + - blkif should be implicitly freed when refcounts fall to 0. +4. Modify the parallax req/rsp code as per ublkback to use the new tap + interfaces. +5. Write a front end that allows parallax and normal mounts to coexist +6. Allow blkback and blktap to run at the same time. diff -r 523078a33287 -r f59e0163540e tools/blktap/blkif.c --- /dev/null Sun Sep 4 15:08:16 2005 +++ b/tools/blktap/blkif.c Sun Sep 4 21:19:44 2005 @@ -0,0 +1,213 @@ +/* + * blkif.c + * + * The blkif interface for blktap. A blkif describes an in-use virtual disk. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <string.h> +#include <err.h> + +#include "blktaplib.h" + +#if 1 +#define DPRINTF(_f, _a...) printf ( _f , ## _a ) +#else +#define DPRINTF(_f, _a...) ((void)0) +#endif + +#define BLKIF_HASHSZ 1024 +#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1)) + +static blkif_t *blkif_hash[BLKIF_HASHSZ]; + +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle) +{ + blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)]; + while ( (blkif != NULL) && + ((blkif->domid != domid) || (blkif->handle != handle)) ) + blkif = blkif->hash_next; + return blkif; +} + +blkif_t *alloc_blkif(domid_t domid) +{ + blkif_t *blkif; + + blkif = (blkif_t *)malloc(sizeof(blkif_t)); + if (!blkif) + return NULL; + + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + + return blkif; +} + +static int (*new_blkif_hook)(blkif_t *blkif) = NULL; +void register_new_blkif_hook(int (*fn)(blkif_t *blkif)) +{ + new_blkif_hook = fn; +} + +int blkif_init(blkif_t *blkif, long int handle, long int pdev, + long int readonly) +{ + domid_t domid; + blkif_t **pblkif; + + if (blkif == NULL) + return -EINVAL; + + domid = blkif->domid; + blkif->handle = handle; + blkif->pdev = pdev; + blkif->readonly = readonly; + + /* + * Call out to the new_blkif_hook. The tap application should define this, + * and it should return having set blkif->ops + * + */ + if (new_blkif_hook == NULL) + { + warn("Probe detected a new blkif, but no new_blkif_hook!"); + return -1; + } + new_blkif_hook(blkif); + + /* Now wire it in. 
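The blkif is inserted into blkif_hash, keyed on (domid, handle) via BLKIF_HASH, so that blkif_find_by_handle() can locate it later; a duplicate (domid, handle) pair is rejected.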
*/ + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( *pblkif != NULL ) + { + if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) ) + { + DPRINTF("Could not create blkif: already exists\n"); + return -1; + } + pblkif = &(*pblkif)->hash_next; + } + blkif->hash_next = NULL; + *pblkif = blkif; + + return 0; +} + +void free_blkif(blkif_t *blkif) +{ + blkif_t **pblkif, *curs; + + pblkif = &blkif_hash[BLKIF_HASH(blkif->domid, blkif->handle)]; + while ( (curs = *pblkif) != NULL ) + { + if ( blkif == curs ) + { + *pblkif = curs->hash_next; + } + pblkif = &curs->hash_next; + } + if (blkif != NULL) + free(blkif); +} + +void blkif_register_request_hook(blkif_t *blkif, char *name, + int (*rh)(blkif_t *, blkif_request_t *, int)) +{ + request_hook_t *rh_ent, **c; + + rh_ent = (request_hook_t *)malloc(sizeof(request_hook_t)); + if (!rh_ent) + { + warn("couldn't allocate a new hook"); + return; + } + + rh_ent->func = rh; + rh_ent->next = NULL; + if (asprintf(&rh_ent->name, "%s", name) == -1) + { + free(rh_ent); + warn("couldn't allocate a new hook name"); + return; + } + + c = &blkif->request_hook_chain; + while (*c != NULL) { + c = &(*c)->next; + } + *c = rh_ent; +} + +void blkif_register_response_hook(blkif_t *blkif, char *name, + int (*rh)(blkif_t *, blkif_response_t *, int)) +{ + response_hook_t *rh_ent, **c; + + rh_ent = (response_hook_t *)malloc(sizeof(response_hook_t)); + if (!rh_ent) + { + warn("couldn't allocate a new hook"); + return; + } + + rh_ent->func = rh; + rh_ent->next = NULL; + if (asprintf(&rh_ent->name, "%s", name) == -1) + { + free(rh_ent); + warn("couldn't allocate a new hook name"); + return; + } + + c = &blkif->response_hook_chain; + while (*c != NULL) { + c = &(*c)->next; + } + *c = rh_ent; +} + +void blkif_print_hooks(blkif_t *blkif) +{ + request_hook_t *req_hook; + response_hook_t *rsp_hook; + + DPRINTF("Request Hooks:\n"); + req_hook = blkif->request_hook_chain; + while (req_hook != NULL) + { + DPRINTF(" [0x%p] %s\n", req_hook->func, req_hook->name); + req_hook = req_hook->next; + } + + DPRINTF("Response Hooks:\n"); + rsp_hook = blkif->response_hook_chain; + while (rsp_hook != NULL) + { + DPRINTF(" [0x%p] %s\n", rsp_hook->func, rsp_hook->name); + rsp_hook = rsp_hook->next; + } +} + + +long int vbd_size(blkif_t *blkif) +{ + return 1000000000; +} + +long int vbd_secsize(blkif_t *blkif) +{ + return 512; +} + +unsigned vbd_info(blkif_t *blkif) +{ + return 0; +} + + +void __init_blkif(void) +{ + memset(blkif_hash, 0, sizeof(blkif_hash)); +} diff -r 523078a33287 -r f59e0163540e tools/blktap/list.h --- /dev/null Sun Sep 4 15:08:16 2005 +++ b/tools/blktap/list.h Sun Sep 4 21:19:44 2005 @@ -0,0 +1,55 @@ +/* + * list.h + * + * This is a subset of linux's list.h intended to be used in user-space. 
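Typical use, as in xenbus.c below: declare a list with LIST_HEAD(watches), add entries with list_add(&watch->list, &watches), and walk them with list_for_each_entry(watch, &watches, list).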
+ * + */ + +#ifndef __LIST_H__ +#define __LIST_H__ + +#define LIST_POISON1 ((void *) 0x00100100) +#define LIST_POISON2 ((void *) 0x00200200) + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; +} +#define list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#endif /* __LIST_H__ */ diff -r 523078a33287 -r f59e0163540e tools/blktap/ublkback/Makefile --- /dev/null Sun Sep 4 15:08:16 2005 +++ b/tools/blktap/ublkback/Makefile Sun Sep 4 21:19:44 2005 @@ -0,0 +1,42 @@ + +XEN_ROOT = ../../.. +include $(XEN_ROOT)/tools/Rules.mk + +INCLUDES += -I.. + +INSTALL = install +INSTALL_PROG = $(INSTALL) -m0755 +IBIN = ublkback +INSTALL_DIR = /usr/sbin + +CFLAGS += -Wall +CFLAGS += -Werror +CFLAGS += -Wno-unused +#CFLAGS += -O3 +CFLAGS += -g3 +CFLAGS += -fno-strict-aliasing +CFLAGS += -I $(XEN_LIBXC) +CFLAGS += $(INCLUDES) -I. +CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE +# Get gcc to generate the dependencies for us. +CFLAGS += -Wp,-MD,.$(@F).d +DEPS = .*.d + +OBJS = $(patsubst %.c,%.o,$(SRCS)) + +all: $(IBIN) + +LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse) + +install: + $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INSTALL_DIR) +clean: + rm -rf *.o*~ $(DEPS) xen TAGS $(IBIN) + +ublkback: + $(CC) $(CFLAGS) -o ublkback -L$(XEN_LIBXC) -L. -L.. \ + -lblktap -laio ublkback.c ublkbacklib.c -pg + +.PHONY: clean install + +-include $(DEPS) diff -r 523078a33287 -r f59e0163540e tools/blktap/ublkback/ublkback.c --- /dev/null Sun Sep 4 15:08:16 2005 +++ b/tools/blktap/ublkback/ublkback.c Sun Sep 4 21:19:44 2005 @@ -0,0 +1,18 @@ +/* ublkback.c + * + * libaio-based userlevel backend. + */ + +#include "blktaplib.h" +#include "ublkbacklib.h" + + +int main(int argc, char *argv[]) +{ + ublkback_init(); + + register_new_blkif_hook(ublkback_new_blkif); + blktap_listen(); + + return 0; +} diff -r 523078a33287 -r f59e0163540e tools/blktap/ublkback/ublkbacklib.c --- /dev/null Sun Sep 4 15:08:16 2005 +++ b/tools/blktap/ublkback/ublkbacklib.c Sun Sep 4 21:19:44 2005 @@ -0,0 +1,477 @@ +/* ublkbacklib.c + * + * file/device image-backed block device -- using linux libaio. + * + * (c) 2004 Andrew Warfield. + * + * Xend has been modified to use an amorfs:[fsid] disk tag. + * This will show up as device type (maj:240,min:0) = 61440. + * + * The fsid is placed in the sec_start field of the disk extent. + * + * NOTE: This doesn't work. Grrr. 
+ */ + +#define _GNU_SOURCE +#define __USE_LARGEFILE64 + +#include <stdio.h> +#include <stdlib.h> +#include <fcntl.h> +#include <string.h> +#include <db.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/poll.h> +#include <unistd.h> +#include <errno.h> +#include <libaio.h> +#include <pthread.h> +#include <time.h> +#include <err.h> +#include "blktaplib.h" + +/* XXXX: */ +/* Current code just mounts this file/device to any requests that come in. */ +//#define TMP_IMAGE_FILE_NAME "/dev/sda1" +#define TMP_IMAGE_FILE_NAME "fc3.image" + +#define MAX_REQUESTS 64 /* must be synced with the blkif drivers. */ +#define MAX_SEGMENTS_PER_REQ 11 +#define SECTOR_SHIFT 9 +#define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ) + +#if 0 +#define DPRINTF(_f, _a...) printf ( _f , ## _a ) +#else +#define DPRINTF(_f, _a...) ((void)0) +#endif + +#if 1 +#define ASSERT(_p) \ + if ( !(_p) ) { printf("Assertion '%s' failed, line %d, file %s", #_p , \ + __LINE__, __FILE__); *(int*)0=0; } +#else +#define ASSERT(_p) ((void)0) +#endif + +/* Note on pending_reqs: I assume all reqs are queued before they start to + * get filled. so count of 0 is an unused record. + */ +typedef struct { + blkif_request_t req; + blkif_t *blkif; + int count; +} pending_req_t; + +static pending_req_t pending_list[MAX_REQUESTS]; +static io_context_t ctx; +static struct iocb *iocb_free[MAX_AIO_REQS]; +static int iocb_free_count; + +/* ---[ Notification mecahnism ]--------------------------------------- */ + +enum { + READ = 0, + WRITE = 1 +}; + +static int aio_notify[2]; +static volatile int aio_listening = 0; +static pthread_mutex_t notifier_sem = PTHREAD_MUTEX_INITIALIZER; + +static struct io_event aio_events[MAX_AIO_REQS]; +static int aio_event_count = 0; + +/* this is commented out in libaio.h for some reason. */ +extern int io_queue_wait(io_context_t ctx, struct timespec *timeout); + +static void *notifier_thread(void *arg) +{ + int ret; + int msg = 0x00feeb00; + + DPRINTF("Notifier thread started.\n"); + for (;;) { + pthread_mutex_lock(¬ifier_sem); + if ((ret = io_getevents(ctx, 1, MAX_AIO_REQS, aio_events, 0)) > 0) { + aio_event_count = ret; + write(aio_notify[WRITE], &msg, sizeof(msg)); + } else { + printf("[io_queue_wait error! %d]\n", errno); + pthread_mutex_unlock(¬ifier_sem); + } + } +} + +/* --- Talking to xenstore: ------------------------------------------- */ + +int ublkback_request(blkif_t *blkif, blkif_request_t *req, int batch_done); +int ublkback_response(blkif_t *blkif, blkif_response_t *rsp, int batch_done); + +typedef struct image { + /* These need to turn into an array/rbtree for multi-disk support. */ + int fd; + u64 fsid; + blkif_vdev_t vdevice; + long int size; + long int secsize; + long int info; +} image_t; + +long int ublkback_get_size(blkif_t *blkif) +{ + image_t *img = (image_t *)blkif->prv; + return img->size; +} + +long int ublkback_get_secsize(blkif_t *blkif) +{ + image_t *img = (image_t *)blkif->prv; + return img->secsize; +} + +unsigned ublkback_get_info(blkif_t *blkif) +{ + image_t *img = (image_t *)blkif->prv; + return img->info; +} + +static struct blkif_ops ublkback_ops = { + get_size: ublkback_get_size, + get_secsize: ublkback_get_secsize, + get_info: ublkback_get_info, +}; + +int ublkback_new_blkif(blkif_t *blkif) +{ + image_t *image; + struct stat stat; + int ret; + + image = (image_t *)malloc(sizeof(image_t)); + if (image == NULL) { + printf("error allocating image record.\n"); + return -ENOMEM; + } + + /* Open it. 
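O_DIRECT is attempted first so that the libaio requests bypass the page cache; if the underlying file system refuses it (EINVAL), the open is retried without the flag.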
*/ + image->fd = open(TMP_IMAGE_FILE_NAME, + O_RDWR | O_DIRECT | O_LARGEFILE); + + if ((image->fd < 0) && (errno == EINVAL)) { + /* Maybe O_DIRECT isn't supported. */ + warn("open() failed on '%s', trying again without O_DIRECT", + TMP_IMAGE_FILE_NAME); + image->fd = open(TMP_IMAGE_FILE_NAME, O_RDWR | O_LARGEFILE); + } + + if (image->fd < 0) { + warn("Couldn't open image file!"); + free(image); + return -EINVAL; + } + + /* Size it. */ + ret = fstat(image->fd, &stat); + if (ret != 0) { + printf("Couldn't stat image in PROBE!"); + return -EINVAL; + } + + image->size = (stat.st_size >> SECTOR_SHIFT); + + /* TODO: IOCTL to get size of raw device. */ +/* + ret = ioctl(img->fd, BLKGETSIZE, &blksize); + if (ret != 0) { + printf("Couldn't ioctl image in PROBE!\n"); + goto err; + } +*/ + if (image->size == 0) + image->size =((u64) 16836057); + image->secsize = 512; + image->info = 0; + + /* Register the hooks */ + blkif_register_request_hook(blkif, "Ublkback req.", ublkback_request); + blkif_register_response_hook(blkif, "Ublkback resp.", ublkback_response); + + + printf(">X<Created a new blkif! pdev was %ld, but you got %s\n", + blkif->pdev, TMP_IMAGE_FILE_NAME); + + blkif->ops = &ublkback_ops; + blkif->prv = (void *)image; + + return 0; +} + + +/* --- Moving the bits: ----------------------------------------------- */ + +static int batch_count = 0; +int ublkback_request(blkif_t *blkif, blkif_request_t *req, int batch_done) +{ + int fd; + u64 sector; + char *spage, *dpage; + int ret, i, idx; + blkif_response_t *rsp; + domid_t dom = ID_TO_DOM(req->id); + static struct iocb *ioq[MAX_SEGMENTS_PER_REQ*MAX_REQUESTS]; + static int io_idx = 0; + struct iocb *io; + image_t *img; + + img = (image_t *)blkif->prv; + fd = img->fd; + + switch (req->operation) + { + case BLKIF_OP_WRITE: + { + unsigned long size; + + + batch_count++; + + idx = ID_TO_IDX(req->id); + ASSERT(pending_list[idx].count == 0); + memcpy(&pending_list[idx].req, req, sizeof(*req)); + pending_list[idx].count = req->nr_segments; + pending_list[idx].blkif = blkif; + + for (i = 0; i < req->nr_segments; i++) { + + sector = req->sector_number + (8*i); + + size = blkif_last_sect (req->frame_and_sects[i]) - + blkif_first_sect(req->frame_and_sects[i]) + 1; + + if (blkif_first_sect(req->frame_and_sects[i]) != 0) + DPRINTF("iWR: sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n", + req->sector_number, sector, + blkif_first_sect(req->frame_and_sects[i]), + blkif_last_sect (req->frame_and_sects[i]), + (long)(sector << SECTOR_SHIFT)); + + spage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); + spage += blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT; + + /*convert size and sector to byte offsets */ + size <<= SECTOR_SHIFT; + sector <<= SECTOR_SHIFT; + + io = iocb_free[--iocb_free_count]; + io_prep_pwrite(io, fd, spage, size, sector); + io->data = (void *)idx; + //ioq[i] = io; + ioq[io_idx++] = io; + } + + if (batch_done) { + ret = io_submit(ctx, io_idx, ioq); + batch_count = 0; + if (ret < 0) + printf("BADNESS: io_submit error! 
(%d)\n", errno); + io_idx = 0; + } + + return BLKTAP_STOLEN; + + } + case BLKIF_OP_READ: + { + unsigned long size; + + batch_count++; + idx = ID_TO_IDX(req->id); + ASSERT(pending_list[idx].count == 0); + memcpy(&pending_list[idx].req, req, sizeof(*req)); + pending_list[idx].count = req->nr_segments; + pending_list[idx].blkif = blkif; + + for (i = 0; i < req->nr_segments; i++) { + + sector = req->sector_number + (8*i); + + size = blkif_last_sect (req->frame_and_sects[i]) - + blkif_first_sect(req->frame_and_sects[i]) + 1; + + dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); + dpage += blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT; + + if (blkif_first_sect(req->frame_and_sects[i]) != 0) + DPRINTF("iRD : sec_nr: %10llu sec: %10llu (%1lu,%1lu) " + "pos: %15lu dpage: %p\n", + req->sector_number, sector, + blkif_first_sect(req->frame_and_sects[i]), + blkif_last_sect (req->frame_and_sects[i]), + (long)(sector << SECTOR_SHIFT), dpage); + + /*convert size and sector to byte offsets */ + size <<= SECTOR_SHIFT; + sector <<= SECTOR_SHIFT; + + + /* + * NB: Looks like AIO now has non-page aligned support, this path + * can probably be removed... Only really used for hunting + * superblocks anyway... ;) + */ + if ( ((unsigned long)dpage % PAGE_SIZE) != 0 ) { + /* AIO to raw devices must be page aligned, so do this read + * synchronously. The OS is probably just looking for + * a superblock or something, so this won't hurt performance. + */ + int ret; + + printf("Slow path block read.\n"); + /* Question: do in-progress aio ops modify the file cursor? */ + ret = lseek(fd, sector, SEEK_SET); + if (ret == (off_t)-1) + printf("lseek failed!\n"); + ret = read(fd, dpage, size); + if (ret < 0) + printf("read problem (%d)\n", ret); + printf("|\n|\n| read: %lld, %lu, %d\n|\n|\n", sector, size, ret); + + /* not an async request any more... */ + pending_list[idx].count--; + + rsp = (blkif_response_t *)req; + rsp->id = req->id; + rsp->operation = BLKIF_OP_READ; + rsp->status = BLKIF_RSP_OKAY; + return BLKTAP_RESPOND; + /* Doh -- need to flush aio if this is end-of-batch */ + } + + io = iocb_free[--iocb_free_count]; + + io_prep_pread(io, fd, dpage, size, sector); + io->data = (void *)idx; + + ioq[io_idx++] = io; + //ioq[i] = io; + } + + if (batch_done) { + ret = io_submit(ctx, io_idx, ioq); + batch_count = 0; + if (ret < 0) + printf("BADNESS: io_submit error! (%d)\n", errno); + io_idx = 0; + } + + return BLKTAP_STOLEN; + + } + } + + printf("Unknown block operation!\n"); +err: + rsp = (blkif_response_t *)req; + rsp->id = req->id; + rsp->operation = req->operation; + rsp->status = BLKIF_RSP_ERROR; + return BLKTAP_RESPOND; +} + + +int ublkback_pollhook(int fd) +{ + struct io_event *ep; + int n, ret, idx; + blkif_request_t *req; + blkif_response_t *rsp; + int responses_queued = 0; + int pages=0; + + for (ep = aio_events; aio_event_count-- > 0; ep++) { + struct iocb *io = ep->obj; + idx = (int) ep->data; + + if ((idx > MAX_REQUESTS-1) || (pending_list[idx].count == 0)){ + printf("invalid index returned(%u)!\n", idx); + break; + } + + if ((int)ep->res < 0) + printf("***\n***aio request error! 
(%d,%d)\n***\n", + (int)ep->res, (int)ep->res2); + + pending_list[idx].count--; + iocb_free[iocb_free_count++] = io; + pages++; + + if (pending_list[idx].count == 0) { + blkif_request_t tmp = pending_list[idx].req; + rsp = (blkif_response_t *)&pending_list[idx].req; + rsp->id = tmp.id; + rsp->operation = tmp.operation; + rsp->status = BLKIF_RSP_OKAY; + blkif_inject_response(pending_list[idx].blkif, rsp); + responses_queued++; + } + } + + if (responses_queued) { + blktap_kick_responses(); + } + + read(aio_notify[READ], &idx, sizeof(idx)); + aio_listening = 1; + pthread_mutex_unlock(¬ifier_sem); + + return 0; +} + +/* the image library terminates the request stream. _resp is a noop. */ +int ublkback_response(blkif_t *blkif, blkif_response_t *rsp, int batch_done) +{ + return BLKTAP_PASS; +} + +void ublkback_init(void) +{ + int i, rc; + pthread_t p; + + for (i = 0; i < MAX_REQUESTS; i++) + pending_list[i].count = 0; + + memset(&ctx, 0, sizeof(ctx)); + rc = io_queue_init(MAX_AIO_REQS, &ctx); + if (rc != 0) { + printf("queue_init failed! (%d)\n", rc); + exit(0); + } + + for (i=0; i<MAX_AIO_REQS; i++) { + if (!(iocb_free[i] = (struct iocb *)malloc(sizeof(struct iocb)))) { + printf("error allocating iocb array\n"); + exit(0); + } + iocb_free_count = i; + } + + rc = pipe(aio_notify); + if (rc != 0) { + printf("pipe failed! (%d)\n", errno); + exit(0); + } + + rc = pthread_create(&p, NULL, notifier_thread, NULL); + if (rc != 0) { + printf("pthread_create failed! (%d)\n", errno); + exit(0); + } + + aio_listening = 1; + + blktap_attach_poll(aio_notify[READ], POLLIN, ublkback_pollhook); +} + diff -r 523078a33287 -r f59e0163540e tools/blktap/ublkback/ublkbacklib.h --- /dev/null Sun Sep 4 15:08:16 2005 +++ b/tools/blktap/ublkback/ublkbacklib.h Sun Sep 4 21:19:44 2005 @@ -0,0 +1,16 @@ +/* blkaiolib.h + * + * aio image-backed block device. + * + * (c) 2004 Andrew Warfield. + * + * Xend has been modified to use an amorfs:[fsid] disk tag. + * This will show up as device type (maj:240,min:0) = 61440. + * + * The fsid is placed in the sec_start field of the disk extent. + */ + +int ublkback_request(blkif_request_t *req, int batch_done); +int ublkback_response(blkif_response_t *rsp); /* noop */ +int ublkback_new_blkif(blkif_t *blkif); +void ublkback_init(void); diff -r 523078a33287 -r f59e0163540e tools/blktap/xenbus.c --- /dev/null Sun Sep 4 15:08:16 2005 +++ b/tools/blktap/xenbus.c Sun Sep 4 21:19:44 2005 @@ -0,0 +1,578 @@ +/* + * xenbus.c + * + * xenbus interface to the blocktap. + * + * this handles the top-half of integration with block devices through the + * store -- the tap driver negotiates the device channel etc, while the + * userland tap clinet needs to sort out the disk parameters etc. + * + * A. Warfield 2005 Based primarily on the blkback and xenbus driver code. + * Comments there apply here... + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <err.h> +#include <stdarg.h> +#include <errno.h> +#include <xs.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <poll.h> +#include "blktaplib.h" +#include "list.h" + +#if 0 +#define DPRINTF(_f, _a...) printf ( _f , ## _a ) +#else +#define DPRINTF(_f, _a...) ((void)0) +#endif + +/* --- Xenstore / Xenbus helpers ---------------------------------------- */ +/* + * These should all be pulled out into the xenstore API. I'm faulting commands + * in from the xenbus interface as i need them. + */ + + +/* Takes tuples of names, scanf-style args, and void **, NULL terminated. 
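For example, blkback_probe() below calls xs_gather(h, bepath, "frontend-id", "%li", &be->frontend_id, "frontend", NULL, &frontend, NULL); a NULL format returns the raw string, which the caller must free.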
*/ +int xs_gather(struct xs_handle *xs, const char *dir, ...) +{ + va_list ap; + const char *name; + char *path; + int ret = 0; + + va_start(ap, dir); + while (ret == 0 && (name = va_arg(ap, char *)) != NULL) { + const char *fmt = va_arg(ap, char *); + void *result = va_arg(ap, void *); + char *p; + + if (asprintf(&path, "%s/%s", dir, name) == -1) + { + warn("allocation error in xs_gather!\n"); + ret = ENOMEM; + break; + } + p = xs_read(xs, path, NULL); + free(path); + if (p == NULL) { + ret = ENOENT; + break; + } + if (fmt) { + if (sscanf(p, fmt, result) == 0) + ret = EINVAL; + free(p); + } else + *(char **)result = p; + } + va_end(ap); + return ret; +} + +/* Single printf and write: returns -errno or 0. */ +int xs_printf(struct xs_handle *h, const char *dir, const char *node, + const char *fmt, ...) +{ + char *buf, *path; + va_list ap; + int ret; + + va_start(ap, fmt); + ret = vasprintf(&buf, fmt, ap); + va_end(ap); + + asprintf(&path, "%s/%s", dir, node); + + if ((path == NULL) || (buf == NULL)) + return 0; + + ret = xs_write(h, path, buf, strlen(buf)+1, O_CREAT); + + free(buf); + free(path); + + return ret; +} + + +int xs_exists(struct xs_handle *h, const char *path) +{ + char **d; + int num; + + d = xs_directory(h, path, &num); + if (d == NULL) + return 0; + free(d); + return 1; +} + + + +/* This assumes that the domain name we are looking for is unique! */ +char *get_dom_uuid(struct xs_handle *h, const char *name) +{ + char **e, *val, *uuid = NULL; + int num, i, len; + char *path; + + e = xs_directory(h, "/domain", &num); + + i=0; + while (i < num) { + asprintf(&path, "/domain/%s/name", e[i]); + val = xs_read(h, path, &len); + free(path); + if (val == NULL) + continue; + if (strcmp(val, name) == 0) { + /* match! */ + asprintf(&path, "/domain/%s/uuid", e[i]); + uuid = xs_read(h, path, &len); + free(val); + free(path); + break; + } + free(val); + i++; + } + + free(e); + return uuid; +} + +static int strsep_len(const char *str, char c, unsigned int len) +{ + unsigned int i; + + for (i = 0; str[i]; i++) + if (str[i] == c) { + if (len == 0) + return i; + len--; + } + return (len == 0) ? i : -ERANGE; +} + + +/* xenbus watches: */ +/* Register callback to watch this node. */ +struct xenbus_watch +{ + struct list_head list; + char *node; + void (*callback)(struct xs_handle *h, + struct xenbus_watch *, + const char *node); +}; + +static LIST_HEAD(watches); + +/* A little paranoia: we don't just trust token. */ +static struct xenbus_watch *find_watch(const char *token) +{ + struct xenbus_watch *i, *cmp; + + cmp = (void *)strtoul(token, NULL, 16); + + list_for_each_entry(i, &watches, list) + if (i == cmp) + return i; + return NULL; +} + +/* Register callback to watch this node. like xs_watch, return 0 on failure */ +int register_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch) +{ + /* Pointer in ascii is the token. 
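The watch struct's address, printed in hex, is used as the xenstore token, so that find_watch() can map a fired token back to its struct.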
*/ + char token[sizeof(watch) * 2 + 1]; + int er; + + sprintf(token, "%lX", (long)watch); + if (find_watch(token)) + { + warn("watch collision!"); + return -EINVAL; + } + + er = xs_watch(h, watch->node, token); + if (er != 0) { + list_add(&watch->list, &watches); + } + + return er; +} + +int unregister_xenbus_watch(struct xs_handle *h, struct xenbus_watch *watch) +{ + char token[sizeof(watch) * 2 + 1]; + int er; + + sprintf(token, "%lX", (long)watch); + if (!find_watch(token)) + { + warn("no such watch!"); + return -EINVAL; + } + + + er = xs_unwatch(h, watch->node, token); + list_del(&watch->list); + + if (er == 0) + warn("XENBUS Failed to release watch %s: %i", + watch->node, er); + return 0; +} + +/* Re-register callbacks to all watches. */ +void reregister_xenbus_watches(struct xs_handle *h) +{ + struct xenbus_watch *watch; + char token[sizeof(watch) * 2 + 1]; + + list_for_each_entry(watch, &watches, list) { + sprintf(token, "%lX", (long)watch); + xs_watch(h, watch->node, token); + } +} + +/* based on watch_thread() */ +int xs_fire_next_watch(struct xs_handle *h) +{ + char **res; + char *token; + char *node = NULL; + struct xenbus_watch *w; + int er; + + res = xs_read_watch(h); + if (res == NULL) + return -EAGAIN; /* in O_NONBLOCK, read_watch returns 0... */ + + node = res[0]; + token = res[1]; + + er = xs_acknowledge_watch(h, token); + if (er == 0) + warn("Couldn't acknowledge watch (%s)", token); + + w = find_watch(token); + if (!w) + { + warn("unregistered watch fired"); + goto done; + } + w->callback(h, w, node); + + done: + free(res); + return 1; +} + + + + +/* ---------------------------------------------------------------------- */ + +struct backend_info +{ + /* our communications channel */ + blkif_t *blkif; + + long int frontend_id; + long int pdev; + long int readonly; + + /* watch back end for changes */ + struct xenbus_watch backend_watch; + char *backpath; + + /* watch front end for changes */ + struct xenbus_watch watch; + char *frontpath; + + struct list_head list; +}; + +static LIST_HEAD(belist); + +static struct backend_info *be_lookup_be(const char *bepath) +{ + struct backend_info *be; + + list_for_each_entry(be, &belist, list) + if (strcmp(bepath, be->backpath) == 0) + return be; + return (struct backend_info *)NULL; +} + +static int be_exists_be(const char *bepath) +{ + return ( be_lookup_be(bepath) != NULL ); +} + +static struct backend_info *be_lookup_fe(const char *fepath) +{ + struct backend_info *be; + + list_for_each_entry(be, &belist, list) + if (strcmp(fepath, be->frontpath) == 0) + return be; + return (struct backend_info *)NULL; +} + +static int backend_remove(struct xs_handle *h, struct backend_info *be) +{ + /* Turn off watches. */ + if (be->watch.node) + unregister_xenbus_watch(h, &be->watch); + if (be->backend_watch.node) + unregister_xenbus_watch(h, &be->backend_watch); + + /* Unhook from be list. */ + list_del(&be->list); + + /* Free everything else. */ + if (be->blkif) + free_blkif(be->blkif); + if (be->frontpath) + free(be->frontpath); + if (be->backpath) + free(be->backpath); + free(be); + return 0; +} + +static void frontend_changed(struct xs_handle *h, struct xenbus_watch *w, + const char *fepath_im) +{ + struct backend_info *be; + char *fepath = NULL; + int er; + + be = be_lookup_fe(w->node); + if (be == NULL) + { + warn("frontend changed called for nonexistent backend! (%s)", fepath); + goto fail; + } + + /* If other end is gone, delete ourself. 
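Otherwise, once a blkif exists and is not yet CONNECTED, the device details the frontend needs (sectors, info, sector-size) are written into the store inside a transaction.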
*/ + if (w->node && !xs_exists(h, be->frontpath)) { + DPRINTF("DELETING BE: %s\n", be->backpath); + backend_remove(h, be); + return; + } + + if (be->blkif == NULL || (be->blkif->state == CONNECTED)) + return; + + /* Supply the information about the device the frontend needs */ + er = xs_transaction_start(h, be->backpath); + if (er == 0) { + warn("starting transaction"); + goto fail; + } + + er = xs_printf(h, be->backpath, "sectors", "%lu", + be->blkif->ops->get_size(be->blkif)); + if (er == 0) { + warn("writing sectors"); + goto fail; + } + + er = xs_printf(h, be->backpath, "info", "%u", + be->blkif->ops->get_info(be->blkif)); + if (er == 0) { + warn("writing info"); + goto fail; + } + + er = xs_printf(h, be->backpath, "sector-size", "%lu", + be->blkif->ops->get_secsize(be->blkif)); + if (er == 0) { + warn("writing sector-size"); + goto fail; + } + + be->blkif->state = CONNECTED; + + xs_transaction_end(h, 0); + + return; + + fail: + if (fepath) + free(fepath); +} + + +static void backend_changed(struct xs_handle *h, struct xenbus_watch *w, + const char *bepath_im) +{ + struct backend_info *be; + char *path = NULL, *p; + int len, er; + long int pdev = 0, handle; + + be = be_lookup_be(w->node); + if (be == NULL) + { + warn("backend changed called for nonexistent backend! (%s)", w->node); + goto fail; + } + + er = xs_gather(h, be->backpath, "physical-device", "%li", &pdev, NULL); + if (er != 0) + goto fail; + + if (be->pdev && be->pdev != pdev) { + warn("changing physical-device not supported"); + goto fail; + } + be->pdev = pdev; + + asprintf(&path, "%s/%s", w->node, "read-only"); + if (xs_exists(h, path)) + be->readonly = 1; + + if (be->blkif == NULL) { + /* Front end dir is a number, which is used as the handle. */ + p = strrchr(be->frontpath, '/') + 1; + handle = strtoul(p, NULL, 0); + + be->blkif = alloc_blkif(be->frontend_id); + if (be->blkif == NULL) + goto fail; + + er = blkif_init(be->blkif, handle, be->pdev, be->readonly); + if (er) + goto fail; + + DPRINTF("[BECHG]: ADDED A NEW BLKIF (%s)\n", w->node); + + /* Pass in NULL node to skip exist test. */ + frontend_changed(h, &be->watch, NULL); + } + + fail: + if (path) + free(path); + +} + +static void blkback_probe(struct xs_handle *h, struct xenbus_watch *w, + const char *bepath_im) +{ + struct backend_info *be = NULL; + char *frontend = NULL, *bepath = NULL; + int er, len; + + bepath = strdup(bepath_im); + if (!bepath) + return; + len = strsep_len(bepath, '/', 6); + if (len < 0) + goto free_be; + + bepath[len] = '\0'; /*truncate the passed-in string with predjudice. */ + + be = malloc(sizeof(*be)); + if (!be) { + warn("allocating backend structure"); + goto free_be; + } + memset(be, 0, sizeof(*be)); + + frontend = NULL; + er = xs_gather(h, bepath, + "frontend-id", "%li", &be->frontend_id, + "frontend", NULL, &frontend, + NULL); + if (er) + goto free_be; + + if (strlen(frontend) == 0 || !xs_exists(h, frontend)) { + /* If we can't get a frontend path and a frontend-id, + * then our bus-id is no longer valid and we need to + * destroy the backend device. + */ + DPRINTF("No frontend (%s)\n", frontend); + goto free_be; + } + + /* Are we already tracking this device? 
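The probe watch may fire more than once for the same backend directory, so bail out if a backend_info for this path is already on belist.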
*/ + if (be_exists_be(bepath)) + goto free_be; + + be->backpath = bepath; + be->backend_watch.node = be->backpath; + be->backend_watch.callback = backend_changed; + er = register_xenbus_watch(h, &be->backend_watch); + if (er == 0) { + be->backend_watch.node = NULL; + warn("error adding backend watch on %s", bepath); + goto free_be; + } + + be->frontpath = frontend; + be->watch.node = be->frontpath; + be->watch.callback = frontend_changed; + er = register_xenbus_watch(h, &be->watch); + if (er == 0) { + be->watch.node = NULL; + warn("adding frontend watch on %s", be->frontpath); + goto free_be; + } + + list_add(&be->list, &belist); + + DPRINTF("[PROBE]: ADDED NEW DEVICE (%s)\n", bepath_im); + + backend_changed(h, &be->backend_watch, bepath); + return; + + free_be: + if ((be) && (be->backend_watch.node)) + unregister_xenbus_watch(h, &be->backend_watch); + if (frontend) + free(frontend); + if (bepath) + free(bepath); + free(be); + return; +} + + +int add_blockdevice_probe_watch(struct xs_handle *h, const char *domname) +{ + char *uuid, *path; + struct xenbus_watch *vbd_watch; + int er; + + uuid = get_dom_uuid(h, domname); + + DPRINTF("%s: %s\n", domname, (uuid != NULL) ? uuid : "[ not found! ]"); + + asprintf(&path, "/domain/%s/backend/vbd", uuid); + if (path == NULL) + return -ENOMEM; + + vbd_watch = (struct xenbus_watch *)malloc(sizeof(struct xenbus_watch)); + vbd_watch->node = path; + vbd_watch->callback = blkback_probe; + er = register_xenbus_watch(h, vbd_watch); + if (er == 0) { + warn("Error adding vbd probe watch %s", path); + return -EINVAL; + } + + return 0; +} diff -r 523078a33287 -r f59e0163540e linux-2.6-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c --- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c Sun Sep 4 15:08:16 2005 +++ /dev/null Sun Sep 4 21:19:44 2005 @@ -1,573 +0,0 @@ -/****************************************************************************** - * blktap_controlmsg.c - * - * XenLinux virtual block-device tap. - * Control interfaces to the frontend and backend drivers. 
- * - * Copyright (c) 2004, Andrew Warfield - * - */ - -#include "blktap.h" -#include <asm-xen/evtchn.h> - -static char *blkif_state_name[] = { - [BLKIF_STATE_CLOSED] = "closed", - [BLKIF_STATE_DISCONNECTED] = "disconnected", - [BLKIF_STATE_CONNECTED] = "connected", -}; - -static char *blkif_status_name[] = { - [BLKIF_INTERFACE_STATUS_CLOSED] = "closed", - [BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected", - [BLKIF_INTERFACE_STATUS_CONNECTED] = "connected", - [BLKIF_INTERFACE_STATUS_CHANGED] = "changed", -}; - -unsigned int blktap_be_state = BLKIF_STATE_CLOSED; -unsigned int blktap_be_evtchn; - -/*-----[ Control Messages to/from Frontend VMs ]--------------------------*/ - -#define BLKIF_HASHSZ 1024 -#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1)) - -static kmem_cache_t *blkif_cachep; -static blkif_t *blkif_hash[BLKIF_HASHSZ]; - -blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle) -{ - blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)]; - while ( (blkif != NULL) && - ((blkif->domid != domid) || (blkif->handle != handle)) ) - blkif = blkif->hash_next; - return blkif; -} - -static void __blkif_disconnect_complete(void *arg) -{ - blkif_t *blkif = (blkif_t *)arg; - ctrl_msg_t cmsg; - blkif_be_disconnect_t disc; -#ifdef CONFIG_XEN_BLKDEV_GRANT - struct gnttab_unmap_grant_ref op; -#endif - - /* - * These can't be done in blkif_disconnect() because at that point there - * may be outstanding requests at the disc whose asynchronous responses - * must still be notified to the remote driver. - */ -#ifdef CONFIG_XEN_BLKDEV_GRANT - op.host_addr = blkif->shmem_vaddr; - op.handle = blkif->shmem_handle; - op.dev_bus_addr = 0; - BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); -#endif - vfree(blkif->blk_ring.sring); - - /* Construct the deferred response message. */ - cmsg.type = CMSG_BLKIF_BE; - cmsg.subtype = CMSG_BLKIF_BE_DISCONNECT; - cmsg.id = blkif->disconnect_rspid; - cmsg.length = sizeof(blkif_be_disconnect_t); - disc.domid = blkif->domid; - disc.blkif_handle = blkif->handle; - disc.status = BLKIF_BE_STATUS_OKAY; - memcpy(cmsg.msg, &disc, sizeof(disc)); - - /* - * Make sure message is constructed /before/ status change, because - * after the status change the 'blkif' structure could be deallocated at - * any time. Also make sure we send the response /after/ status change, - * as otherwise a subsequent CONNECT request could spuriously fail if - * another CPU doesn't see the status change yet. - */ - mb(); - if ( blkif->status != DISCONNECTING ) - BUG(); - blkif->status = DISCONNECTED; - mb(); - - /* Send the successful response. */ - ctrl_if_send_response(&cmsg); -} - -void blkif_disconnect_complete(blkif_t *blkif) -{ - INIT_WORK(&blkif->work, __blkif_disconnect_complete, (void *)blkif); - schedule_work(&blkif->work); -} - -void blkif_ptfe_create(blkif_be_create_t *create) -{ - blkif_t *blkif, **pblkif; - domid_t domid = create->domid; - unsigned int handle = create->blkif_handle; - - - /* May want to store info on the connecting domain here. 
*/ - - DPRINTK("PT got BE_CREATE\n"); - - if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL ) - { - WPRINTK("Could not create blkif: out of memory\n"); - create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; - return; - } - - /* blkif struct init code from blkback.c */ - memset(blkif, 0, sizeof(*blkif)); - blkif->domid = domid; - blkif->handle = handle; - blkif->status = DISCONNECTED; - spin_lock_init(&blkif->blk_ring_lock); - atomic_set(&blkif->refcnt, 0); - - pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; - while ( *pblkif != NULL ) - { - if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) ) - { - WPRINTK("Could not create blkif: already exists\n"); - create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS; - kmem_cache_free(blkif_cachep, blkif); - return; - } - pblkif = &(*pblkif)->hash_next; - } - - blkif->hash_next = *pblkif; - *pblkif = blkif; - - create->status = BLKIF_BE_STATUS_OKAY; -} - - -void blkif_ptfe_destroy(blkif_be_destroy_t *destroy) -{ - /* Clear anything that we initialized above. */ - - domid_t domid = destroy->domid; - unsigned int handle = destroy->blkif_handle; - blkif_t **pblkif, *blkif; - - DPRINTK("PT got BE_DESTROY\n"); - - pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; - while ( (blkif = *pblkif) != NULL ) - { - if ( (blkif->domid == domid) && (blkif->handle == handle) ) - { - if ( blkif->status != DISCONNECTED ) - goto still_connected; - goto destroy; - } - pblkif = &blkif->hash_next; - } - - destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - - still_connected: - destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; - return; - - destroy: - *pblkif = blkif->hash_next; - kmem_cache_free(blkif_cachep, blkif); - destroy->status = BLKIF_BE_STATUS_OKAY; -} - -void blkif_ptfe_connect(blkif_be_connect_t *connect) -{ - domid_t domid = connect->domid; - unsigned int handle = connect->blkif_handle; - unsigned int evtchn = connect->evtchn; - unsigned long shmem_frame = connect->shmem_frame; - struct vm_struct *vma; -#ifdef CONFIG_XEN_BLKDEV_GRANT - int ref = connect->shmem_ref; -#else - pgprot_t prot; - int error; -#endif - blkif_t *blkif; - blkif_sring_t *sring; - - DPRINTK("PT got BE_CONNECT\n"); - - blkif = blkif_find_by_handle(domid, handle); - if ( unlikely(blkif == NULL) ) - { - WPRINTK("blkif_connect attempted for non-existent blkif (%u,%u)\n", - connect->domid, connect->blkif_handle); - connect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - } - - if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL ) - { - connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; - return; - } - -#ifndef CONFIG_XEN_BLKDEV_GRANT - prot = __pgprot(_KERNPG_TABLE); - error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr), - shmem_frame<<PAGE_SHIFT, PAGE_SIZE, - prot, domid); - if ( error != 0 ) - { - if ( error == -ENOMEM ) - connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; - else if ( error == -EFAULT ) - connect->status = BLKIF_BE_STATUS_MAPPING_ERROR; - else - connect->status = BLKIF_BE_STATUS_ERROR; - vfree(vma->addr); - return; - } -#else - { /* Map: Use the Grant table reference */ - struct gnttab_map_grant_ref op; - op.host_addr = VMALLOC_VMADDR(vma->addr); - op.flags = GNTMAP_host_map; - op.ref = ref; - op.dom = domid; - - BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) ); - - handle = op.handle; - - if (op.handle < 0) { - DPRINTK(" Grant table operation failure !\n"); - connect->status = BLKIF_BE_STATUS_MAPPING_ERROR; - vfree(vma->addr); - return; - } - - blkif->shmem_ref = ref; - blkif->shmem_handle 
= handle; - blkif->shmem_vaddr = VMALLOC_VMADDR(vma->addr); - } -#endif - - if ( blkif->status != DISCONNECTED ) - { - connect->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; - vfree(vma->addr); - return; - } - - sring = (blkif_sring_t *)vma->addr; - SHARED_RING_INIT(sring); - BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE); - - blkif->evtchn = evtchn; - blkif->shmem_frame = shmem_frame; - blkif->status = CONNECTED; - blkif_get(blkif); - - bind_evtchn_to_irqhandler( - evtchn, blkif_ptfe_int, 0, "blkif-pt-backend", blkif); - - connect->status = BLKIF_BE_STATUS_OKAY; -} - -int blkif_ptfe_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id) -{ - domid_t domid = disconnect->domid; - unsigned int handle = disconnect->blkif_handle; - blkif_t *blkif; - - DPRINTK("PT got BE_DISCONNECT\n"); - - blkif = blkif_find_by_handle(domid, handle); - if ( unlikely(blkif == NULL) ) - { - WPRINTK("blkif_disconnect attempted for non-existent blkif" - " (%u,%u)\n", disconnect->domid, disconnect->blkif_handle); - disconnect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; - return 1; /* Caller will send response error message. */ - } - - if ( blkif->status == CONNECTED ) - { - blkif->status = DISCONNECTING; - blkif->disconnect_rspid = rsp_id; - wmb(); /* Let other CPUs see the status change. */ - unbind_evtchn_from_irqhandler(blkif->evtchn, blkif); - blkif_deschedule(blkif); - blkif_put(blkif); - return 0; /* Caller should not send response message. */ - } - - disconnect->status = BLKIF_BE_STATUS_OKAY; - return 1; -} - -/*-----[ Control Messages to/from Backend VM ]----------------------------*/ - -/* Tell the controller to bring up the interface. */ -static void blkif_ptbe_send_interface_connect(void) -{ - ctrl_msg_t cmsg = { - .type = CMSG_BLKIF_FE, - .subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT, - .length = sizeof(blkif_fe_interface_connect_t), - }; - blkif_fe_interface_connect_t *msg = (void*)cmsg.msg; - msg->handle = 0; - msg->shmem_frame = virt_to_mfn(blktap_be_ring.sring); - - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} - -static void blkif_ptbe_close(void) -{ -} - -/* Move from CLOSED to DISCONNECTED state. 
*/ -static void blkif_ptbe_disconnect(void) -{ - blkif_sring_t *sring; - - sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL); - SHARED_RING_INIT(sring); - FRONT_RING_INIT(&blktap_be_ring, sring, PAGE_SIZE); - blktap_be_state = BLKIF_STATE_DISCONNECTED; - DPRINTK("Blkif-Passthrough-BE is now DISCONNECTED.\n"); - blkif_ptbe_send_interface_connect(); -} - -static void blkif_ptbe_connect(blkif_fe_interface_status_t *status) -{ - int err = 0; - - blktap_be_evtchn = status->evtchn; - - err = bind_evtchn_to_irqhandler( - blktap_be_evtchn, blkif_ptbe_int, SA_SAMPLE_RANDOM, "blkif", NULL); - if ( err ) { - WPRINTK("blkfront bind_evtchn_to_irqhandler failed (%d)\n", err); - return; - } else { - /* transtion to connected in case we need to do a - a partion probe on a whole disk */ - blktap_be_state = BLKIF_STATE_CONNECTED; - } -} - -static void unexpected(blkif_fe_interface_status_t *status) -{ - WPRINTK(" TAP: Unexpected blkif status %s in state %s\n", - blkif_status_name[status->status], - blkif_state_name[blktap_be_state]); -} - -static void blkif_ptbe_status( - blkif_fe_interface_status_t *status) -{ - if ( status->handle != 0 ) - { - DPRINTK("Status change on unsupported blkif %d\n", - status->handle); - return; - } - - DPRINTK("ptbe_status: got %s\n", blkif_status_name[status->status]); - - switch ( status->status ) - { - case BLKIF_INTERFACE_STATUS_CLOSED: - switch ( blktap_be_state ) - { - case BLKIF_STATE_CLOSED: - unexpected(status); - break; - case BLKIF_STATE_DISCONNECTED: - case BLKIF_STATE_CONNECTED: - unexpected(status); - blkif_ptbe_close(); - break; - } - break; - - case BLKIF_INTERFACE_STATUS_DISCONNECTED: - switch ( blktap_be_state ) - { - case BLKIF_STATE_CLOSED: - blkif_ptbe_disconnect(); - break; - case BLKIF_STATE_DISCONNECTED: - case BLKIF_STATE_CONNECTED: - printk(KERN_ALERT "*** add recovery code to the tap driver. ***\n"); - unexpected(status); - break; - } - break; - - case BLKIF_INTERFACE_STATUS_CONNECTED: - switch ( blktap_be_state ) - { - case BLKIF_STATE_CLOSED: - unexpected(status); - blkif_ptbe_disconnect(); - blkif_ptbe_connect(status); - break; - case BLKIF_STATE_DISCONNECTED: - blkif_ptbe_connect(status); - break; - case BLKIF_STATE_CONNECTED: - unexpected(status); - blkif_ptbe_connect(status); - break; - } - break; - - case BLKIF_INTERFACE_STATUS_CHANGED: - switch ( blktap_be_state ) - { - case BLKIF_STATE_CLOSED: - case BLKIF_STATE_DISCONNECTED: - unexpected(status); - break; - case BLKIF_STATE_CONNECTED: - /* vbd_update(); */ - /* tap doesn't really get state changes... 
*/ - unexpected(status); - break; - } - break; - - default: - DPRINTK("Status change to unknown value %d\n", status->status); - break; - } -} - -/*-----[ All control messages enter here: ]-------------------------------*/ - -void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) -{ - switch ( msg->type ) - { - case CMSG_BLKIF_FE: - - switch ( msg->subtype ) - { - case CMSG_BLKIF_FE_INTERFACE_STATUS: - blkif_ptbe_status((blkif_fe_interface_status_t *) &msg->msg[0]); - break; - - default: - goto parse_error; - } - - break; - - case CMSG_BLKIF_BE: - - /* send a copy of the message to user if wanted */ - - if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) || - (blktap_mode & BLKTAP_MODE_COPY_FE) ) { - - blktap_write_ctrl_ring(msg); - blktap_kick_user(); - } - - switch ( msg->subtype ) - { - case CMSG_BLKIF_BE_CREATE: - blkif_ptfe_create((blkif_be_create_t *)&msg->msg[0]); - break; - case CMSG_BLKIF_BE_DESTROY: - blkif_ptfe_destroy((blkif_be_destroy_t *)&msg->msg[0]); - break; - case CMSG_BLKIF_BE_CONNECT: - blkif_ptfe_connect((blkif_be_connect_t *)&msg->msg[0]); - break; - case CMSG_BLKIF_BE_DISCONNECT: - if ( !blkif_ptfe_disconnect((blkif_be_disconnect_t *)&msg->msg[0], - msg->id) ) - return; - break; - - /* We just ignore anything to do with vbds for now. */ - - case CMSG_BLKIF_BE_VBD_CREATE: - DPRINTK("PT got VBD_CREATE\n"); - ((blkif_be_vbd_create_t *)&msg->msg[0])->status - = BLKIF_BE_STATUS_OKAY; - break; - case CMSG_BLKIF_BE_VBD_DESTROY: - DPRINTK("PT got VBD_DESTROY\n"); - ((blkif_be_vbd_destroy_t *)&msg->msg[0])->status - = BLKIF_BE_STATUS_OKAY; - break; - default: - goto parse_error; - } - - break; - } - - ctrl_if_send_response(msg); - return; - - parse_error: - msg->length = 0; - ctrl_if_send_response(msg); -} - -/*-----[ Initialization ]-------------------------------------------------*/ - -void __init blkif_interface_init(void) -{ - blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), - 0, 0, NULL, NULL); - memset(blkif_hash, 0, sizeof(blkif_hash)); - - blktap_be_ring.sring = NULL; -} - - - -/* Debug : print the current ring indices. */ - -void print_fe_ring_idxs(void) -{ - int i; - blkif_t *blkif; - - WPRINTK("FE Rings: \n---------\n"); - for ( i = 0; i < BLKIF_HASHSZ; i++) { - blkif = blkif_hash[i]; - while (blkif != NULL) { - if (blkif->status == DISCONNECTED) { - WPRINTK("(%2d,%2d) DISCONNECTED\n", - blkif->domid, blkif->handle); - } else if (blkif->status == DISCONNECTING) { - WPRINTK("(%2d,%2d) DISCONNECTING\n", - blkif->domid, blkif->handle); - } else if (blkif->blk_ring.sring == NULL) { - WPRINTK("(%2d,%2d) CONNECTED, but null sring!\n", - blkif->domid, blkif->handle); - } else { - blkif_get(blkif); - WPRINTK("(%2d,%2d): req_cons: %2d, rsp_prod_prv: %2d " - "| req_prod: %2d, rsp_prod: %2d\n", - blkif->domid, blkif->handle, - blkif->blk_ring.req_cons, - blkif->blk_ring.rsp_prod_pvt, - blkif->blk_ring.sring->req_prod, - blkif->blk_ring.sring->rsp_prod); - blkif_put(blkif); - } - blkif = blkif->hash_next; - } - } -} diff -r 523078a33287 -r f59e0163540e linux-2.6-xen-sparse/drivers/xen/blktap/blktap_datapath.c --- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_datapath.c Sun Sep 4 15:08:16 2005 +++ /dev/null Sun Sep 4 21:19:44 2005 @@ -1,449 +0,0 @@ -/****************************************************************************** - * blktap_datapath.c - * - * XenLinux virtual block-device tap. - * Block request routing data path. 
- * - * Copyright (c) 2004, Andrew Warfield - * -- see full header in blktap.c - */ - -#include "blktap.h" -#include <asm-xen/evtchn.h> - -/*-----[ The data paths ]-------------------------------------------------*/ - -/* Connection to a single backend domain. */ -blkif_front_ring_t blktap_be_ring; - -/*-----[ Tracking active requests ]---------------------------------------*/ - -/* this must be the same as MAX_PENDING_REQS in blkback.c */ -#define MAX_ACTIVE_REQS ((ACTIVE_RING_IDX)64U) - -active_req_t active_reqs[MAX_ACTIVE_REQS]; -ACTIVE_RING_IDX active_req_ring[MAX_ACTIVE_REQS]; -spinlock_t active_req_lock = SPIN_LOCK_UNLOCKED; -ACTIVE_RING_IDX active_prod, active_cons; -#define MASK_ACTIVE_IDX(_i) ((_i)&(MAX_ACTIVE_REQS-1)) -#define ACTIVE_IDX(_ar) (_ar - active_reqs) -#define NR_ACTIVE_REQS (MAX_ACTIVE_REQS - active_prod + active_cons) - -inline active_req_t *get_active_req(void) -{ - ACTIVE_RING_IDX idx; - active_req_t *ar; - unsigned long flags; - - ASSERT(active_cons != active_prod); - - spin_lock_irqsave(&active_req_lock, flags); - idx = active_req_ring[MASK_ACTIVE_IDX(active_cons++)]; - ar = &active_reqs[idx]; - spin_unlock_irqrestore(&active_req_lock, flags); - - return ar; -} - -inline void free_active_req(active_req_t *ar) -{ - unsigned long flags; - - spin_lock_irqsave(&active_req_lock, flags); - active_req_ring[MASK_ACTIVE_IDX(active_prod++)] = ACTIVE_IDX(ar); - spin_unlock_irqrestore(&active_req_lock, flags); -} - -active_req_t *lookup_active_req(ACTIVE_RING_IDX idx) -{ - return &active_reqs[idx]; -} - -void active_reqs_init(void) -{ - ACTIVE_RING_IDX i; - - active_cons = 0; - active_prod = MAX_ACTIVE_REQS; - memset(active_reqs, 0, sizeof(active_reqs)); - for ( i = 0; i < MAX_ACTIVE_REQS; i++ ) - active_req_ring[i] = i; -} - -/* Requests passing through the tap to the backend hijack the id field - * in the request message. In it we put the AR index _AND_ the fe domid. - * the domid is used by the backend to map the pages properly. - */ - -static inline unsigned long MAKE_ID(domid_t fe_dom, ACTIVE_RING_IDX idx) -{ - return ( (fe_dom << 16) | MASK_ACTIVE_IDX(idx) ); -} - -/*-----[ Ring helpers ]---------------------------------------------------*/ - -static void maybe_trigger_blktap_schedule(void); - -inline int write_resp_to_fe_ring(blkif_t *blkif, blkif_response_t *rsp) -{ - blkif_response_t *resp_d; - active_req_t *ar; - - ar = &active_reqs[ID_TO_IDX(rsp->id)]; - rsp->id = ar->id; - - resp_d = RING_GET_RESPONSE(&blkif->blk_ring, - blkif->blk_ring.rsp_prod_pvt); - memcpy(resp_d, rsp, sizeof(blkif_response_t)); - wmb(); - blkif->blk_ring.rsp_prod_pvt++; - - blkif_put(ar->blkif); - free_active_req(ar); - - return 0; -} - -inline int write_req_to_be_ring(blkif_request_t *req) -{ - blkif_request_t *req_d; - - if ( blktap_be_state != BLKIF_STATE_CONNECTED ) { - WPRINTK("Tap trying to access an unconnected backend!\n"); - return 0; - } - - req_d = RING_GET_REQUEST(&blktap_be_ring, - blktap_be_ring.req_prod_pvt); - memcpy(req_d, req, sizeof(blkif_request_t)); - wmb(); - blktap_be_ring.req_prod_pvt++; - - return 0; -} - -void kick_fe_domain(blkif_t *blkif) -{ - RING_PUSH_RESPONSES(&blkif->blk_ring); - notify_via_evtchn(blkif->evtchn); - DPRINTK("notified FE(dom %u)\n", blkif->domid); - - /* We just feed up a batch of request slots... */ - maybe_trigger_blktap_schedule(); - -} - -void kick_be_domain(void) -{ - if ( blktap_be_state != BLKIF_STATE_CONNECTED ) - return; - - wmb(); /* Ensure that the frontend can see the requests. 
*/ - RING_PUSH_REQUESTS(&blktap_be_ring); - notify_via_evtchn(blktap_be_evtchn); - DPRINTK("notified BE\n"); -} - -/*-----[ Data to/from Frontend (client) VMs ]-----------------------------*/ - -/*-----[ Scheduler list maint -from blkback ]--- */ - -static struct list_head blkio_schedule_list; -static spinlock_t blkio_schedule_list_lock; - -static int __on_blkdev_list(blkif_t *blkif) -{ - return blkif->blkdev_list.next != NULL; -} - -static void remove_from_blkdev_list(blkif_t *blkif) -{ - unsigned long flags; - if ( !__on_blkdev_list(blkif) ) return; - spin_lock_irqsave(&blkio_schedule_list_lock, flags); - if ( __on_blkdev_list(blkif) ) - { - list_del(&blkif->blkdev_list); - blkif->blkdev_list.next = NULL; - blkif_put(blkif); - } - spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); -} - -static void add_to_blkdev_list_tail(blkif_t *blkif) -{ - unsigned long flags; - if ( __on_blkdev_list(blkif) ) return; - spin_lock_irqsave(&blkio_schedule_list_lock, flags); - if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) ) - { - list_add_tail(&blkif->blkdev_list, &blkio_schedule_list); - blkif_get(blkif); - } - spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); -} - - -/*-----[ Scheduler functions - from blkback ]--- */ - -static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait); - -static int do_block_io_op(blkif_t *blkif, int max_to_do); - -static int blkio_schedule(void *arg) -{ - DECLARE_WAITQUEUE(wq, current); - - blkif_t *blkif; - struct list_head *ent; - - daemonize( - "xentapd" - ); - - for ( ; ; ) - { - /* Wait for work to do. */ - add_wait_queue(&blkio_schedule_wait, &wq); - set_current_state(TASK_INTERRUPTIBLE); - if ( (NR_ACTIVE_REQS == MAX_ACTIVE_REQS) || - list_empty(&blkio_schedule_list) ) - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&blkio_schedule_wait, &wq); - - /* Queue up a batch of requests. */ - while ( (NR_ACTIVE_REQS < MAX_ACTIVE_REQS) && - !list_empty(&blkio_schedule_list) ) - { - ent = blkio_schedule_list.next; - blkif = list_entry(ent, blkif_t, blkdev_list); - blkif_get(blkif); - remove_from_blkdev_list(blkif); - if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) ) - add_to_blkdev_list_tail(blkif); - blkif_put(blkif); - } - } -} - -static void maybe_trigger_blktap_schedule(void) -{ - /* - * Needed so that two processes, who together make the following predicate - * true, don't both read stale values and evaluate the predicate - * incorrectly. Incredibly unlikely to stall the scheduler on x86, but... - */ - smp_mb(); - - if ( (NR_ACTIVE_REQS < (MAX_ACTIVE_REQS/2)) && - !list_empty(&blkio_schedule_list) ) - wake_up(&blkio_schedule_wait); -} - -void blkif_deschedule(blkif_t *blkif) -{ - remove_from_blkdev_list(blkif); -} - -void __init blkdev_schedule_init(void) -{ - spin_lock_init(&blkio_schedule_list_lock); - INIT_LIST_HEAD(&blkio_schedule_list); - - if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 ) - BUG(); -} - -/*-----[ Interrupt entry from a frontend ]------ */ - -irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs) -{ - blkif_t *blkif = dev_id; - - add_to_blkdev_list_tail(blkif); - maybe_trigger_blktap_schedule(); - return IRQ_HANDLED; -} - -/*-----[ Other Frontend Ring functions ]-------- */ - -/* irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs)*/ -static int do_block_io_op(blkif_t *blkif, int max_to_do) -{ - /* we have pending messages from the real frontend. 
*/ - - blkif_request_t *req_s; - RING_IDX i, rp; - unsigned long flags; - active_req_t *ar; - int more_to_do = 0; - int notify_be = 0, notify_user = 0; - - /* lock both rings */ - spin_lock_irqsave(&blkif_io_lock, flags); - - rp = blkif->blk_ring.sring->req_prod; - rmb(); - - for ( i = blkif->blk_ring.req_cons; - (i != rp) && - !RING_REQUEST_CONS_OVERFLOW(&blkif->blk_ring, i); - i++ ) - { - - if ((--max_to_do == 0) || (NR_ACTIVE_REQS == MAX_ACTIVE_REQS)) - { - more_to_do = 1; - break; - } - - req_s = RING_GET_REQUEST(&blkif->blk_ring, i); - /* This is a new request: - * Assign an active request record, and remap the id. - */ - ar = get_active_req(); - ar->id = req_s->id; - ar->nr_pages = req_s->nr_segments; - blkif_get(blkif); - ar->blkif = blkif; - req_s->id = MAKE_ID(blkif->domid, ACTIVE_IDX(ar)); - /* WPRINTK("%3u < %3lu\n", ID_TO_IDX(req_s->id), ar->id); */ - - /* FE -> BE interposition point is here. */ - - /* ------------------------------------------------------------- */ - /* BLKIF_OP_PROBE_HACK: */ - /* Signal to the backend that we are a tap domain. */ - - if (req_s->operation == BLKIF_OP_PROBE) { - DPRINTK("Adding BLKTAP_COOKIE to PROBE request.\n"); - req_s->frame_and_sects[1] = BLKTAP_COOKIE; - } - - /* ------------------------------------------------------------- */ - - /* If we are in MODE_INTERCEPT_FE or MODE_COPY_FE: */ - if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) || - (blktap_mode & BLKTAP_MODE_COPY_FE) ) { - - /* Copy the response message to UFERing */ - /* In MODE_INTERCEPT_FE, map attached pages into the app vma */ - /* In MODE_COPY_FE_PAGES, copy attached pages into the app vma */ - - DPRINTK("req->UFERing\n"); - blktap_write_fe_ring(req_s); - notify_user = 1; - } - - /* If we are not in MODE_INTERCEPT_FE or MODE_INTERCEPT_BE: */ - if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_FE) || - (blktap_mode & BLKTAP_MODE_INTERCEPT_BE)) ) { - - /* be included to prevent noise from the fe when its off */ - /* copy the request message to the BERing */ - - DPRINTK("blktap: FERing[%u] -> BERing[%u]\n", - (unsigned)i & (RING_SIZE(&blktap_be_ring)-1), - (unsigned)blktap_be_ring.req_prod_pvt & - (RING_SIZE((&blktap_be_ring)-1))); - - write_req_to_be_ring(req_s); - notify_be = 1; - } - } - - blkif->blk_ring.req_cons = i; - - /* unlock rings */ - spin_unlock_irqrestore(&blkif_io_lock, flags); - - if (notify_user) - blktap_kick_user(); - if (notify_be) - kick_be_domain(); - - return more_to_do; -} - -/*-----[ Data to/from Backend (server) VM ]------------------------------*/ - - -irqreturn_t blkif_ptbe_int(int irq, void *dev_id, - struct pt_regs *ptregs) -{ - blkif_response_t *resp_s; - blkif_t *blkif; - RING_IDX rp, i; - unsigned long flags; - - DPRINTK("PT got BE interrupt.\n"); - - /* lock both rings */ - spin_lock_irqsave(&blkif_io_lock, flags); - - rp = blktap_be_ring.sring->rsp_prod; - rmb(); - - for ( i = blktap_be_ring.rsp_cons; i != rp; i++) - { - resp_s = RING_GET_RESPONSE(&blktap_be_ring, i); - - /* BE -> FE interposition point is here. 
*/ - - blkif = active_reqs[ID_TO_IDX(resp_s->id)].blkif; - - /* If we are in MODE_INTERCEPT_BE or MODE_COPY_BE: */ - if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) || - (blktap_mode & BLKTAP_MODE_COPY_BE) ) { - - /* Copy the response message to UBERing */ - /* In MODE_INTERCEPT_BE, map attached pages into the app vma */ - /* In MODE_COPY_BE_PAGES, copy attached pages into the app vma */ - - DPRINTK("rsp->UBERing\n"); - blktap_write_be_ring(resp_s); - blktap_kick_user(); - - } - - /* If we are NOT in MODE_INTERCEPT_BE or MODE_INTERCEPT_FE: */ - if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_BE) || - (blktap_mode & BLKTAP_MODE_INTERCEPT_FE)) ) { - - /* (fe included to prevent random interference from the BE) */ - /* Copy the response message to FERing */ - - DPRINTK("blktap: BERing[%u] -> FERing[%u]\n", - (unsigned)i & (RING_SIZE(&blkif->blk_ring)-1), - (unsigned)blkif->blk_ring.rsp_prod_pvt & - (RING_SIZE((&blkif->blk_ring)-1))); - - write_resp_to_fe_ring(blkif, resp_s); - kick_fe_domain(blkif); - - } - } - - blktap_be_ring.rsp_cons = i; - - - spin_unlock_irqrestore(&blkif_io_lock, flags); - - return IRQ_HANDLED; -} - -/* Debug : print the current ring indices. */ - -void print_be_ring_idxs(void) -{ - if (blktap_be_ring.sring != NULL) { - WPRINTK("BE Ring: \n--------\n"); - WPRINTK("BE: rsp_cons: %2d, req_prod_prv: %2d " - "| req_prod: %2d, rsp_prod: %2d\n", - blktap_be_ring.rsp_cons, - blktap_be_ring.req_prod_pvt, - blktap_be_ring.sring->req_prod, - blktap_be_ring.sring->rsp_prod); - } -} diff -r 523078a33287 -r f59e0163540e linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c --- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c Sun Sep 4 15:08:16 2005 +++ /dev/null Sun Sep 4 21:19:44 2005 @@ -1,801 +0,0 @@ -/****************************************************************************** - * blktap_userdev.c - * - * XenLinux virtual block-device tap. - * Control interface between the driver and a character device. - * - * Copyright (c) 2004, Andrew Warfield - */ - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/miscdevice.h> -#include <linux/errno.h> -#include <linux/major.h> -#include <linux/gfp.h> -#include <linux/poll.h> -#include <asm/pgalloc.h> -#include <asm/tlbflush.h> -#include <asm-xen/xen-public/io/blkif.h> /* for control ring. */ -#ifdef CONFIG_XEN_BLKDEV_GRANT -#include <asm-xen/xen-public/grant_table.h> -#endif - -#include "blktap.h" - - -unsigned long blktap_mode = BLKTAP_MODE_PASSTHROUGH; - -/* Only one process may open /dev/xen/blktap at any time. */ -static unsigned long blktap_dev_inuse; -unsigned long blktap_ring_ok; /* make this ring->state */ - -/* for poll: */ -static wait_queue_head_t blktap_wait; - -/* Rings up to user space. */ -static blkif_front_ring_t blktap_ufe_ring; -static blkif_back_ring_t blktap_ube_ring; -static ctrl_front_ring_t blktap_uctrl_ring; - -/* local prototypes */ -static int blktap_read_fe_ring(void); -static int blktap_read_be_ring(void); - - -/* -------[ mmap region ]--------------------------------------------- */ -/* - * We use a big chunk of address space to map in-flight requests into, - * and export this region up to user-space. See the comments in blkback - * about this -- the two must be kept in sync if the tap is used as a - * passthrough. - */ - -#define MAX_PENDING_REQS 64 - -/* immediately before the mmap area, we have a bunch of pages reserved - * for shared memory rings. 
- */ -#define RING_PAGES 3 /* Ctrl, Front, and Back */ - -/* Where things are inside the device mapping. */ -struct vm_area_struct *blktap_vma = NULL; -unsigned long mmap_vstart; /* Kernel pages for mapping in data. */ -unsigned long rings_vstart; /* start of mmaped vma */ -unsigned long user_vstart; /* start of user mappings */ - -#define MMAP_PAGES_PER_REQUEST \ - (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1) -#define MMAP_PAGES \ - (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST) -#define MMAP_VADDR(_start, _req,_seg) \ - ( _start + \ - ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \ - ((_seg) * PAGE_SIZE)) - -/* -------[ grant handles ]------------------------------------------- */ - -#ifdef CONFIG_XEN_BLKDEV_GRANT -/* When using grant tables to map a frame for device access then the - * handle returned must be used to unmap the frame. This is needed to - * drop the ref count on the frame. - */ -struct grant_handle_pair -{ - u16 kernel; - u16 user; -}; -static struct grant_handle_pair pending_grant_handles[MMAP_PAGES]; -#define pending_handle(_idx, _i) \ - (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)]) -#define BLKTAP_INVALID_HANDLE(_g) \ - (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF)) -#define BLKTAP_INVALIDATE_HANDLE(_g) do { \ - (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \ - } while(0) - -#endif - - -/* -------[ blktap vm ops ]------------------------------------------- */ - -static struct page *blktap_nopage(struct vm_area_struct *vma, - unsigned long address, - int *type) -{ - /* - * if the page has not been mapped in by the driver then generate - * a SIGBUS to the domain. - */ - - force_sig(SIGBUS, current); - - return 0; -} - -struct vm_operations_struct blktap_vm_ops = { - nopage: blktap_nopage, -}; - -/* -------[ blktap file ops ]----------------------------------------- */ - -static int blktap_open(struct inode *inode, struct file *filp) -{ - blkif_sring_t *sring; - ctrl_sring_t *csring; - - if ( test_and_set_bit(0, &blktap_dev_inuse) ) - return -EBUSY; - - /* Allocate the ctrl ring. */ - csring = (ctrl_sring_t *)get_zeroed_page(GFP_KERNEL); - if (csring == NULL) - goto fail_nomem; - - SetPageReserved(virt_to_page(csring)); - - SHARED_RING_INIT(csring); - FRONT_RING_INIT(&blktap_uctrl_ring, csring, PAGE_SIZE); - - /* Allocate the fe ring. */ - sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL); - if (sring == NULL) - goto fail_free_ctrl; - - SetPageReserved(virt_to_page(sring)); - - SHARED_RING_INIT(sring); - FRONT_RING_INIT(&blktap_ufe_ring, sring, PAGE_SIZE); - - /* Allocate the be ring. */ - sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL); - if (sring == NULL) - goto fail_free_fe; - - SetPageReserved(virt_to_page(sring)); - - SHARED_RING_INIT(sring); - BACK_RING_INIT(&blktap_ube_ring, sring, PAGE_SIZE); - - DPRINTK(KERN_ALERT "blktap open.\n"); - - return 0; - - fail_free_ctrl: - free_page( (unsigned long) blktap_uctrl_ring.sring); - - fail_free_fe: - free_page( (unsigned long) blktap_ufe_ring.sring); - - fail_nomem: - return -ENOMEM; -} - -static int blktap_release(struct inode *inode, struct file *filp) -{ - blktap_dev_inuse = 0; - blktap_ring_ok = 0; - - DPRINTK(KERN_ALERT "blktap closed.\n"); - - /* Free the ring page. 
*/ - ClearPageReserved(virt_to_page(blktap_uctrl_ring.sring)); - free_page((unsigned long) blktap_uctrl_ring.sring); - - ClearPageReserved(virt_to_page(blktap_ufe_ring.sring)); - free_page((unsigned long) blktap_ufe_ring.sring); - - ClearPageReserved(virt_to_page(blktap_ube_ring.sring)); - free_page((unsigned long) blktap_ube_ring.sring); - - /* Clear any active mappings and free foreign map table */ - if (blktap_vma != NULL) { - zap_page_range(blktap_vma, blktap_vma->vm_start, - blktap_vma->vm_end - blktap_vma->vm_start, NULL); - blktap_vma = NULL; - } - - return 0; -} - -/* Note on mmap: - * We need to map pages to user space in a way that will allow the block - * subsystem set up direct IO to them. This couldn't be done before, because - * there isn't really a sane way to make a user virtual address down to a - * physical address when the page belongs to another domain. - * - * My first approach was to map the page in to kernel memory, add an entry - * for it in the physical frame list (using alloc_lomem_region as in blkback) - * and then attempt to map that page up to user space. This is disallowed - * by xen though, which realizes that we don't really own the machine frame - * underlying the physical page. - * - * The new approach is to provide explicit support for this in xen linux. - * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages - * mapped from other vms. vma->vm_private_data is set up as a mapping - * from pages to actual page structs. There is a new clause in get_user_pages - * that does the right thing for this sort of mapping. - * - * blktap_mmap sets up this mapping. Most of the real work is done in - * blktap_write_fe_ring below. - */ -static int blktap_mmap(struct file *filp, struct vm_area_struct *vma) -{ - int size; - struct page **map; - int i; - - DPRINTK(KERN_ALERT "blktap mmap (%lx, %lx)\n", - vma->vm_start, vma->vm_end); - - vma->vm_flags |= VM_RESERVED; - vma->vm_ops = &blktap_vm_ops; - - size = vma->vm_end - vma->vm_start; - if ( size != ( (MMAP_PAGES + RING_PAGES) << PAGE_SHIFT ) ) { - printk(KERN_INFO - "blktap: you _must_ map exactly %d pages!\n", - MMAP_PAGES + RING_PAGES); - return -EAGAIN; - } - - size >>= PAGE_SHIFT; - DPRINTK(KERN_INFO "blktap: 2 rings + %d pages.\n", size-1); - - rings_vstart = vma->vm_start; - user_vstart = rings_vstart + (RING_PAGES << PAGE_SHIFT); - - /* Map the ring pages to the start of the region and reserve it. */ - - /* not sure if I really need to do this... */ - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - DPRINTK("Mapping ctrl_ring page %lx.\n", __pa(blktap_uctrl_ring.sring)); - if (remap_pfn_range(vma, vma->vm_start, - __pa(blktap_uctrl_ring.sring) >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot)) - goto fail; - - - DPRINTK("Mapping be_ring page %lx.\n", __pa(blktap_ube_ring.sring)); - if (remap_pfn_range(vma, vma->vm_start + PAGE_SIZE, - __pa(blktap_ube_ring.sring) >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot)) - goto fail; - - DPRINTK("Mapping fe_ring page %lx.\n", __pa(blktap_ufe_ring.sring)); - if (remap_pfn_range(vma, vma->vm_start + ( 2 * PAGE_SIZE ), - __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot)) - goto fail; - - /* Mark this VM as containing foreign pages, and set up mappings. 
*/ - map = kmalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - * sizeof(struct page_struct*), - GFP_KERNEL); - if (map == NULL) goto fail; - - for (i=0; i<((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++) - map[i] = NULL; - - vma->vm_private_data = map; - vma->vm_flags |= VM_FOREIGN; - - blktap_vma = vma; - blktap_ring_ok = 1; - - return 0; - fail: - /* Clear any active mappings. */ - zap_page_range(vma, vma->vm_start, - vma->vm_end - vma->vm_start, NULL); - - return -ENOMEM; -} - -static int blktap_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - switch(cmd) { - case BLKTAP_IOCTL_KICK_FE: /* There are fe messages to process. */ - return blktap_read_fe_ring(); - - case BLKTAP_IOCTL_KICK_BE: /* There are be messages to process. */ - return blktap_read_be_ring(); - - case BLKTAP_IOCTL_SETMODE: - if (BLKTAP_MODE_VALID(arg)) { - blktap_mode = arg; - /* XXX: may need to flush rings here. */ - printk(KERN_INFO "blktap: set mode to %lx\n", arg); - return 0; - } - case BLKTAP_IOCTL_PRINT_IDXS: - { - print_be_ring_idxs(); - print_fe_ring_idxs(); - WPRINTK("User Rings: \n-----------\n"); - WPRINTK("UF: rsp_cons: %2d, req_prod_prv: %2d " - "| req_prod: %2d, rsp_prod: %2d\n", - blktap_ufe_ring.rsp_cons, - blktap_ufe_ring.req_prod_pvt, - blktap_ufe_ring.sring->req_prod, - blktap_ufe_ring.sring->rsp_prod); - WPRINTK("UB: req_cons: %2d, rsp_prod_prv: %2d " - "| req_prod: %2d, rsp_prod: %2d\n", - blktap_ube_ring.req_cons, - blktap_ube_ring.rsp_prod_pvt, - blktap_ube_ring.sring->req_prod, - blktap_ube_ring.sring->rsp_prod); - - } - } - return -ENOIOCTLCMD; -} - -static unsigned int blktap_poll(struct file *file, poll_table *wait) -{ - poll_wait(file, &blktap_wait, wait); - - if ( RING_HAS_UNPUSHED_REQUESTS(&blktap_uctrl_ring) || - RING_HAS_UNPUSHED_REQUESTS(&blktap_ufe_ring) || - RING_HAS_UNPUSHED_RESPONSES(&blktap_ube_ring) ) { - - flush_tlb_all(); - - RING_PUSH_REQUESTS(&blktap_uctrl_ring); - RING_PUSH_REQUESTS(&blktap_ufe_ring); - RING_PUSH_RESPONSES(&blktap_ube_ring); - return POLLIN | POLLRDNORM; - } - - return 0; -} - -void blktap_kick_user(void) -{ - /* blktap_ring->req_prod = blktap_req_prod; */ - wake_up_interruptible(&blktap_wait); -} - -static struct file_operations blktap_fops = { - owner: THIS_MODULE, - poll: blktap_poll, - ioctl: blktap_ioctl, - open: blktap_open, - release: blktap_release, - mmap: blktap_mmap, -}; - -/*-----[ Data to/from user space ]----------------------------------------*/ - -static void fast_flush_area(int idx, int nr_pages) -{ -#ifdef CONFIG_XEN_BLKDEV_GRANT - struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; - unsigned int i, op = 0; - struct grant_handle_pair *handle; - unsigned long ptep; - - for (i=0; i<nr_pages; i++) - { - handle = &pending_handle(idx, i); - if (!BLKTAP_INVALID_HANDLE(handle)) - { - - unmap[op].host_addr = MMAP_VADDR(mmap_vstart, idx, i); - unmap[op].dev_bus_addr = 0; - unmap[op].handle = handle->kernel; - op++; - - if (create_lookup_pte_addr(blktap_vma->vm_mm, - MMAP_VADDR(user_vstart, idx, i), - &ptep) !=0) { - DPRINTK("Couldn't get a pte addr!\n"); - return; - } - unmap[op].host_addr = ptep; - unmap[op].dev_bus_addr = 0; - unmap[op].handle = handle->user; - op++; - - BLKTAP_INVALIDATE_HANDLE(handle); - } - } - if ( unlikely(HYPERVISOR_grant_table_op( - GNTTABOP_unmap_grant_ref, unmap, op))) - BUG(); -#else - multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - int i; - - for ( i = 0; i < nr_pages; i++ ) - { - MULTI_update_va_mapping(mcl+i, MMAP_VADDR(mmap_vstart, idx, i), - 
__pte(0), 0); - } - - mcl[nr_pages-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; - if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) ) - BUG(); -#endif -} - - -int blktap_write_fe_ring(blkif_request_t *req) -{ - blkif_request_t *target; - int i, ret = 0; -#ifdef CONFIG_XEN_BLKDEV_GRANT - struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; - int op; -#else - unsigned long remap_prot; - multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST+1]; - mmu_update_t mmu[BLKIF_MAX_SEGMENTS_PER_REQUEST]; -#endif - - /* - * This is called to pass a request from the real frontend domain's - * blkif ring to the character device. - */ - - if ( ! blktap_ring_ok ) { - DPRINTK("blktap: ufe_ring not ready for a request!\n"); - return 0; - } - - if ( RING_FULL(&blktap_ufe_ring) ) { - PRINTK("blktap: fe_ring is full, can't add.\n"); - return 0; - } - - flush_cache_all(); /* a noop on intel... */ - - target = RING_GET_REQUEST(&blktap_ufe_ring, blktap_ufe_ring.req_prod_pvt); - memcpy(target, req, sizeof(*req)); - - /* Map the foreign pages directly in to the application */ -#ifdef CONFIG_XEN_BLKDEV_GRANT - op = 0; - for (i=0; i<target->nr_segments; i++) { - - unsigned long uvaddr; - unsigned long kvaddr; - unsigned long ptep; - - uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i); - kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i); - - /* Map the remote page to kernel. */ - map[op].host_addr = kvaddr; - map[op].dom = ID_TO_DOM(req->id); - map[op].ref = blkif_gref_from_fas(target->frame_and_sects[i]); - map[op].flags = GNTMAP_host_map; - /* This needs a bit more thought in terms of interposition: - * If we want to be able to modify pages during write using - * grant table mappings, the guest will either need to allow - * it, or we'll need to incur a copy. */ - if (req->operation == BLKIF_OP_WRITE) - map[op].flags |= GNTMAP_readonly; - op++; - - /* Now map it to user. */ - ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep); - if (ret) - { - DPRINTK("Couldn't get a pte addr!\n"); - goto fail; - } - - map[op].host_addr = ptep; - map[op].dom = ID_TO_DOM(req->id); - map[op].ref = blkif_gref_from_fas(target->frame_and_sects[i]); - map[op].flags = GNTMAP_host_map | GNTMAP_application_map - | GNTMAP_contains_pte; - /* Above interposition comment applies here as well. */ - if (req->operation == BLKIF_OP_WRITE) - map[op].flags |= GNTMAP_readonly; - op++; - } - - if ( unlikely(HYPERVISOR_grant_table_op( - GNTTABOP_map_grant_ref, map, op))) - BUG(); - - op = 0; - for (i=0; i<(target->nr_segments*2); i+=2) { - unsigned long uvaddr; - unsigned long kvaddr; - unsigned long offset; - int cancel = 0; - - uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i/2); - kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i/2); - - if ( unlikely(map[i].handle < 0) ) { - DPRINTK("Error on kernel grant mapping (%d)\n", map[i].handle); - ret = map[i].handle; - cancel = 1; - } - - if ( unlikely(map[i+1].handle < 0) ) { - DPRINTK("Error on user grant mapping (%d)\n", map[i+1].handle); - ret = map[i+1].handle; - cancel = 1; - } - - if (cancel) - goto fail; - - /* Set the necessary mappings in p2m and in the VM_FOREIGN - * vm_area_struct to allow user vaddr -> struct page lookups - * to work. This is needed for direct IO to foreign pages. 
*/ - phys_to_machine_mapping[__pa(kvaddr) >> PAGE_SHIFT] = - FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT); - - offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT; - ((struct page **)blktap_vma->vm_private_data)[offset] = - pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); - - /* Save handles for unmapping later. */ - pending_handle(ID_TO_IDX(req->id), i/2).kernel = map[i].handle; - pending_handle(ID_TO_IDX(req->id), i/2).user = map[i+1].handle; - } - -#else - - remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW; - - for (i=0; i<target->nr_segments; i++) { - unsigned long buf; - unsigned long uvaddr; - unsigned long kvaddr; - unsigned long offset; - unsigned long ptep; - - buf = target->frame_and_sects[i] & PAGE_MASK; - uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i); - kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i); - - MULTI_update_va_mapping_otherdomain( - mcl+i, - kvaddr, - pfn_pte_ma(buf >> PAGE_SHIFT, __pgprot(remap_prot)), - 0, - ID_TO_DOM(req->id)); - - phys_to_machine_mapping[__pa(kvaddr)>>PAGE_SHIFT] = - FOREIGN_FRAME(buf >> PAGE_SHIFT); - - ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep); - if (ret) - { - DPRINTK("error getting pte\n"); - goto fail; - } - - mmu[i].ptr = ptep; - mmu[i].val = (target->frame_and_sects[i] & PAGE_MASK) - | pgprot_val(blktap_vma->vm_page_prot); - - offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT; - ((struct page **)blktap_vma->vm_private_data)[offset] = - pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); - } - - /* Add the mmu_update call. */ - mcl[i].op = __HYPERVISOR_mmu_update; - mcl[i].args[0] = (unsigned long)mmu; - mcl[i].args[1] = target->nr_segments; - mcl[i].args[2] = 0; - mcl[i].args[3] = ID_TO_DOM(req->id); - - BUG_ON(HYPERVISOR_multicall(mcl, target->nr_segments+1) != 0); - - /* Make sure it all worked. */ - for ( i = 0; i < target->nr_segments; i++ ) - { - if ( unlikely(mcl[i].result != 0) ) - { - DPRINTK("invalid buffer -- could not remap it\n"); - ret = mcl[i].result; - goto fail; - } - } - if ( unlikely(mcl[i].result != 0) ) - { - DPRINTK("direct remapping of pages to /dev/blktap failed.\n"); - ret = mcl[i].result; - goto fail; - } -#endif /* CONFIG_XEN_BLKDEV_GRANT */ - - /* Mark mapped pages as reserved: */ - for ( i = 0; i < target->nr_segments; i++ ) - { - unsigned long kvaddr; - - kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i); - SetPageReserved(pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT)); - } - - - blktap_ufe_ring.req_prod_pvt++; - - return 0; - - fail: - fast_flush_area(ID_TO_IDX(req->id), target->nr_segments); - return ret; -} - -int blktap_write_be_ring(blkif_response_t *rsp) -{ - blkif_response_t *target; - - /* - * This is called to pass a request from the real backend domain's - * blkif ring to the character device. - */ - - if ( ! blktap_ring_ok ) { - DPRINTK("blktap: be_ring not ready for a request!\n"); - return 0; - } - - /* No test for fullness in the response direction. */ - - target = RING_GET_RESPONSE(&blktap_ube_ring, - blktap_ube_ring.rsp_prod_pvt); - memcpy(target, rsp, sizeof(*rsp)); - - /* no mapping -- pages were mapped in blktap_write_fe_ring() */ - - blktap_ube_ring.rsp_prod_pvt++; - - return 0; -} - -static int blktap_read_fe_ring(void) -{ - /* This is called to read responses from the UFE ring. 
*/ - - RING_IDX i, j, rp; - blkif_response_t *resp_s; - blkif_t *blkif; - active_req_t *ar; - - DPRINTK("blktap_read_fe_ring()\n"); - - /* if we are forwarding from UFERring to FERing */ - if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) { - - /* for each outstanding message on the UFEring */ - rp = blktap_ufe_ring.sring->rsp_prod; - rmb(); - - for ( i = blktap_ufe_ring.rsp_cons; i != rp; i++ ) - { - resp_s = RING_GET_RESPONSE(&blktap_ufe_ring, i); - - DPRINTK("resp->fe_ring\n"); - ar = lookup_active_req(ID_TO_IDX(resp_s->id)); - blkif = ar->blkif; - for (j = 0; j < ar->nr_pages; j++) { - unsigned long vaddr; - struct page **map = blktap_vma->vm_private_data; - int offset; - - vaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(resp_s->id), j); - offset = (vaddr - blktap_vma->vm_start) >> PAGE_SHIFT; - - ClearPageReserved(virt_to_page(vaddr)); - map[offset] = NULL; - } - - fast_flush_area(ID_TO_IDX(resp_s->id), ar->nr_pages); - zap_page_range(blktap_vma, - MMAP_VADDR(user_vstart, ID_TO_IDX(resp_s->id), 0), - ar->nr_pages << PAGE_SHIFT, NULL); - write_resp_to_fe_ring(blkif, resp_s); - blktap_ufe_ring.rsp_cons = i + 1; - kick_fe_domain(blkif); - } - } - return 0; -} - -static int blktap_read_be_ring(void) -{ - /* This is called to read requests from the UBE ring. */ - - RING_IDX i, rp; - blkif_request_t *req_s; - - DPRINTK("blktap_read_be_ring()\n"); - - /* if we are forwarding from UFERring to FERing */ - if (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) { - - /* for each outstanding message on the UFEring */ - rp = blktap_ube_ring.sring->req_prod; - rmb(); - for ( i = blktap_ube_ring.req_cons; i != rp; i++ ) - { - req_s = RING_GET_REQUEST(&blktap_ube_ring, i); - - DPRINTK("req->be_ring\n"); - write_req_to_be_ring(req_s); - kick_be_domain(); - } - - blktap_ube_ring.req_cons = i; - } - - return 0; -} - -int blktap_write_ctrl_ring(ctrl_msg_t *msg) -{ - ctrl_msg_t *target; - - if ( ! blktap_ring_ok ) { - DPRINTK("blktap: be_ring not ready for a request!\n"); - return 0; - } - - /* No test for fullness in the response direction. */ - - target = RING_GET_REQUEST(&blktap_uctrl_ring, - blktap_uctrl_ring.req_prod_pvt); - memcpy(target, msg, sizeof(*msg)); - - blktap_uctrl_ring.req_prod_pvt++; - - /* currently treat the ring as unidirectional. */ - blktap_uctrl_ring.rsp_cons = blktap_uctrl_ring.sring->rsp_prod; - - return 0; - -} - -/* -------[ blktap module setup ]------------------------------------- */ - -static struct miscdevice blktap_miscdev = { - .minor = BLKTAP_MINOR, - .name = "blktap", - .fops = &blktap_fops, - .devfs_name = "misc/blktap", -}; - -int blktap_init(void) -{ - int err, i, j; - struct page *page; - - page = balloon_alloc_empty_page_range(MMAP_PAGES); - BUG_ON(page == NULL); - mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); - -#ifdef CONFIG_XEN_BLKDEV_GRANT - for (i=0; i<MAX_PENDING_REQS ; i++) - for (j=0; j<BLKIF_MAX_SEGMENTS_PER_REQUEST; j++) - BLKTAP_INVALIDATE_HANDLE(&pending_handle(i, j)); -#endif - - err = misc_register(&blktap_miscdev); - if ( err != 0 ) - { - printk(KERN_ALERT "Couldn't register /dev/misc/blktap (%d)\n", err); - return err; - } - - init_waitqueue_head(&blktap_wait); - - - return 0; -} _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog