diff -r e410857fd83c drivers/xen/Makefile
--- a/drivers/xen/Makefile	Wed Oct 22 14:55:29 2008 +0100
+++ b/drivers/xen/Makefile	Wed Oct 29 21:19:50 2008 -0700
@@ -8,6 +8,7 @@
 obj-$(CONFIG_XEN_BALLOON) += balloon/
 obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
 obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
+obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap2/
 obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
 obj-$(CONFIG_XEN_TPMDEV_BACKEND) += tpmback/
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += blkfront/
diff -r e410857fd83c drivers/xen/blkback/Makefile
--- a/drivers/xen/blkback/Makefile	Wed Oct 22 14:55:29 2008 +0100
+++ b/drivers/xen/blkback/Makefile	Wed Oct 29 21:19:50 2008 -0700
@@ -1,3 +1,3 @@
 obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o

-blkbk-y := blkback.o xenbus.o interface.o vbd.o
+blkbk-y := blkback.o xenbus.o interface.o vbd.o blkback-pagemap.o
diff -r e410857fd83c drivers/xen/blkback/blkback-pagemap.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/xen/blkback/blkback-pagemap.c	Wed Oct 29 21:19:50 2008 -0700
@@ -0,0 +1,91 @@
+#include "common.h"
+#include "blkback-pagemap.h"
+
+static int blkback_pagemap_size;
+static struct blkback_pagemap *blkback_pagemap;
+
+static inline int
+blkback_pagemap_entry_clear(struct blkback_pagemap *map)
+{
+        static struct blkback_pagemap zero;
+        return !memcmp(map, &zero, sizeof(zero));
+}
+
+int
+blkback_pagemap_init(int pages)
+{
+        blkback_pagemap = kzalloc(pages * sizeof(struct blkback_pagemap),
+                                  GFP_KERNEL);
+        if (!blkback_pagemap)
+                return -ENOMEM;
+
+        blkback_pagemap_size = pages;
+        return 0;
+}
+
+void
+blkback_pagemap_set(int idx, struct page *page,
+                    domid_t domid, busid_t busid, grant_ref_t gref)
+{
+        struct blkback_pagemap *entry;
+
+        BUG_ON(!blkback_pagemap);
+        BUG_ON(idx >= blkback_pagemap_size);
+
+        SetPageBlkback(page);
+        set_page_private(page, idx);
+
+        entry = blkback_pagemap + idx;
+        if (!blkback_pagemap_entry_clear(entry)) {
+                printk("overwriting pagemap %d: d %u b %u g %u\n",
+                       idx, entry->domid, entry->busid, entry->gref);
+                BUG();
+        }
+
+        entry->domid = domid;
+        entry->busid = busid;
+        entry->gref = gref;
+}
+
+void
+blkback_pagemap_clear(struct page *page)
+{
+        int idx;
+        struct blkback_pagemap *entry;
+
+        idx = (int)page_private(page);
+
+        BUG_ON(!blkback_pagemap);
+        BUG_ON(!PageBlkback(page));
+        BUG_ON(idx >= blkback_pagemap_size);
+
+        entry = blkback_pagemap + idx;
+        if (blkback_pagemap_entry_clear(entry)) {
+                printk("clearing empty pagemap %d\n", idx);
+                BUG();
+        }
+
+        memset(entry, 0, sizeof(*entry));
+}
+
+struct blkback_pagemap
+blkback_pagemap_read(struct page *page)
+{
+        int idx;
+        struct blkback_pagemap *entry;
+
+        idx = (int)page_private(page);
+
+        BUG_ON(!blkback_pagemap);
+        BUG_ON(!PageBlkback(page));
+        BUG_ON(idx >= blkback_pagemap_size);
+
+        entry = blkback_pagemap + idx;
+        if (blkback_pagemap_entry_clear(entry)) {
+                printk("reading empty pagemap %d\n", idx);
+                BUG();
+        }
+
+        return *entry;
+}
+EXPORT_SYMBOL(blkback_pagemap_read);
diff -r e410857fd83c drivers/xen/blkback/blkback-pagemap.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/xen/blkback/blkback-pagemap.h	Wed Oct 29 21:19:50 2008 -0700
@@ -0,0 +1,17 @@
+#ifndef _BLKBACK_PAGEMAP_H_
+#define _BLKBACK_PAGEMAP_H_
+
+#include
+#include
+
+typedef unsigned int busid_t;
+
+struct blkback_pagemap {
+        domid_t domid;
+        busid_t busid;
+        grant_ref_t gref;
+};
+
+struct blkback_pagemap blkback_pagemap_read(struct page *);
+
+#endif
diff -r e410857fd83c drivers/xen/blkback/blkback.c
--- a/drivers/xen/blkback/blkback.c	Wed Oct 22 14:55:29 2008 +0100
+++ 
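The blkback-pagemap table above exists so that a driver further down the I/O chain (blktap2, added later in this patch) can recover the original (domid, busid, gref) of a page that blkback has already granted in from a guest, and grant the same page onward instead of copying the data. A minimal consumer-side sketch, modelled on blktap_prep_foreign() later in this series; remap_foreign_page() and its kvaddr/write_request parameters are illustrative names, not part of the patch:

static int remap_foreign_page(struct page *page, unsigned long kvaddr,
                              int write_request,
                              struct gnttab_map_grant_ref *op)
{
        struct blkback_pagemap map;
        uint32_t flags;

        /* Only pages tagged by blkback_pagemap_set() carry a mapping. */
        if (!PageBlkback(page))
                return -EINVAL;

        map = blkback_pagemap_read(page);  /* {domid, busid, gref} */

        /* A write request only needs to read the guest's buffer. */
        flags = GNTMAP_host_map | (write_request ? GNTMAP_readonly : 0);
        gnttab_set_map_op(op, kvaddr, flags, map.gref, map.domid);

        /* The caller batches ops and issues GNTTABOP_map_grant_ref. */
        return 0;
}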
b/drivers/xen/blkback/blkback.c Wed Oct 29 21:19:50 2008 -0700 @@ -173,6 +173,7 @@ handle = pending_handle(req, i); if (handle == BLKBACK_INVALID_HANDLE) continue; + blkback_pagemap_clear(virt_to_page(vaddr(req, i))); gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), GNTMAP_host_map, handle); pending_handle(req, i) = BLKBACK_INVALID_HANDLE; @@ -464,6 +465,10 @@ FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); seg[i].buf = map[i].dev_bus_addr | (req->seg[i].first_sect << 9); + blkback_pagemap_set(vaddr_pagenr(pending_req, i), + virt_to_page(vaddr(pending_req, i)), + blkif->domid, req->handle, + req->seg[i].gref); } if (ret) @@ -615,6 +620,9 @@ mmap_pages, GFP_KERNEL); pending_pages = alloc_empty_pages_and_pagevec(mmap_pages); + if (blkback_pagemap_init(mmap_pages)) + goto out_of_memory; + if (!pending_reqs || !pending_grant_handles || !pending_pages) goto out_of_memory; diff -r e410857fd83c drivers/xen/blkback/common.h --- a/drivers/xen/blkback/common.h Wed Oct 22 14:55:29 2008 +0100 +++ b/drivers/xen/blkback/common.h Wed Oct 29 21:19:50 2008 -0700 @@ -43,6 +43,8 @@ #include #include #include +#include "blkback-pagemap.h" + #define DPRINTK(_f, _a...) \ pr_debug("(file=%s, line=%d) " _f, \ @@ -136,4 +138,8 @@ int blkback_barrier(struct xenbus_transaction xbt, struct backend_info *be, int state); +int blkback_pagemap_init(int); +void blkback_pagemap_set(int, struct page *, domid_t, busid_t, grant_ref_t); +void blkback_pagemap_clear(struct page *); + #endif /* __BLKIF__BACKEND__COMMON_H__ */ diff -r e410857fd83c drivers/xen/blktap/blktap.c --- a/drivers/xen/blktap/blktap.c Wed Oct 22 14:55:29 2008 +0100 +++ b/drivers/xen/blktap/blktap.c Wed Oct 29 21:19:50 2008 -0700 @@ -116,6 +116,7 @@ [req id, idx] tuple */ blkif_t *blkif; /*Associate blkif with tapdev */ struct domid_translate_ext trans; /*Translation from domid to bus. */ + struct vm_foreign_map foreign_map; } tap_blkif_t; static struct tap_blkif *tapfds[MAX_TAP_DEV]; @@ -330,7 +331,7 @@ ptep, is_fullmm); info = vma->vm_file->private_data; - map = vma->vm_private_data; + map = info->foreign_map.map; /* TODO Should these be changed to if statements? 
*/ BUG_ON(!info); @@ -615,7 +616,8 @@ info->vma, info->vma->vm_start, info->vma->vm_end - info->vma->vm_start, NULL); - kfree(info->vma->vm_private_data); + kfree(info->foreign_map.map); + info->foreign_map.map = NULL; info->vma = NULL; } @@ -707,7 +709,8 @@ for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++) map[i] = NULL; - vma->vm_private_data = map; + info->foreign_map.map = map; + vma->vm_private_data = &info->foreign_map; vma->vm_flags |= VM_FOREIGN; vma->vm_flags |= VM_DONTCOPY; @@ -1173,7 +1176,7 @@ for (j = 0; j < pending_req->nr_pages; j++) { unsigned long kvaddr, uvaddr; - struct page **map = info->vma->vm_private_data; + struct page **map = info->foreign_map.map; struct page *pg; int offset; @@ -1467,8 +1470,7 @@ >> PAGE_SHIFT)); offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT; pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); - ((struct page **)info->vma->vm_private_data)[offset] = - pg; + info->foreign_map.map[offset] = pg; } } else { for (i = 0; i < nseg; i++) { @@ -1495,8 +1497,7 @@ offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT; pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); - ((struct page **)info->vma->vm_private_data)[offset] = - pg; + info->foreign_map.map[offset] = pg; } } diff -r e410857fd83c drivers/xen/blktap2/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/drivers/xen/blktap2/Makefile Wed Oct 29 21:19:50 2008 -0700 @@ -0,0 +1,3 @@ +obj-y := blktap.o + +blktap-objs := control.o ring.o wait_queue.o device.o request.o sysfs.o diff -r e410857fd83c drivers/xen/blktap2/blktap.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/drivers/xen/blktap2/blktap.h Wed Oct 29 21:19:50 2008 -0700 @@ -0,0 +1,241 @@ +#ifndef _BLKTAP_H_ +#define _BLKTAP_H_ + +#include +#include +#include +#include +#include + +//#define ENABLE_PASSTHROUGH + +extern int blktap_debug_level; + +#define BTPRINTK(level, tag, force, _f, _a...) \ + do { \ + if (blktap_debug_level > level && \ + (force || printk_ratelimit())) \ + printk(tag "%s: " _f, __func__, ##_a); \ + } while (0) + +#define BTDBG(_f, _a...) BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a) +#define BTINFO(_f, _a...) BTPRINTK(0, KERN_INFO, 0, _f, ##_a) +#define BTWARN(_f, _a...) BTPRINTK(0, KERN_WARNING, 0, _f, ##_a) +#define BTERR(_f, _a...) BTPRINTK(0, KERN_ERR, 0, _f, ##_a) + +#define MAX_BLKTAP_DEVICE 256 + +#define BLKTAP_CONTROL 1 +#define BLKTAP_RING_FD 2 +#define BLKTAP_RING_VMA 3 +#define BLKTAP_DEVICE 4 +#define BLKTAP_SYSFS 5 +#define BLKTAP_PAUSE_REQUESTED 6 +#define BLKTAP_PAUSED 7 +#define BLKTAP_SHUTDOWN_REQUESTED 8 +#define BLKTAP_PASSTHROUGH 9 +#define BLKTAP_DEFERRED 10 + +/* blktap IOCTLs: */ +#define BLKTAP2_IOCTL_KICK_FE 1 +#define BLKTAP2_IOCTL_ALLOC_TAP 200 +#define BLKTAP2_IOCTL_FREE_TAP 201 +#define BLKTAP2_IOCTL_CREATE_DEVICE 202 +#define BLKTAP2_IOCTL_SET_PARAMS 203 +#define BLKTAP2_IOCTL_PAUSE 204 +#define BLKTAP2_IOCTL_REOPEN 205 +#define BLKTAP2_IOCTL_RESUME 206 + +#define BLKTAP2_MAX_MESSAGE_LEN 256 + +#define BLKTAP2_RING_MESSAGE_PAUSE 1 +#define BLKTAP2_RING_MESSAGE_RESUME 2 +#define BLKTAP2_RING_MESSAGE_CLOSE 3 + +/* + * The maximum number of requests that can be outstanding at any time + * is determined by + * + * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] + * + * where mmap_alloc < MAX_DYNAMIC_MEM. + * + * TODO: + * mmap_alloc is initialised to 2 and should be adjustable on the fly via + * sysfs. 
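 *
 * For a rough sense of scale (illustration, assuming the usual 4 KiB
 * page size): __RING_SIZE() then yields a 32-entry blkif ring and
 * BLKIF_MAX_SEGMENTS_PER_REQUEST is 11, so the defines below give
 * MMAP_PAGES = 32 * 11 = 352 data pages per ring, and
 * MMAP_VADDR(start, req, seg) = start + (req * 11 + seg) * PAGE_SIZE.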
+ */ +#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE) +#define MAX_DYNAMIC_MEM BLK_RING_SIZE +#define MAX_PENDING_REQS BLK_RING_SIZE +#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) +#define MMAP_VADDR(_start, _req, _seg) \ + (_start + \ + ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * PAGE_SIZE)) + +#define blktap_get(_b) (atomic_inc(&(_b)->refcnt)) +#define blktap_put(_b) \ + do { \ + if (atomic_dec_and_test(&(_b)->refcnt)) \ + wake_up(&(_b)->wq); \ + } while (0) + +struct blktap; + +struct grant_handle_pair { + grant_handle_t kernel; + grant_handle_t user; +}; +#define INVALID_GRANT_HANDLE 0xFFFF + +struct blktap_handle { + unsigned int ring; + unsigned int device; + unsigned int minor; +}; + +struct blktap_params { + char name[BLKTAP2_MAX_MESSAGE_LEN]; + unsigned long long capacity; + unsigned long sector_size; +}; + +struct blktap_device { + int users; + spinlock_t lock; + struct gendisk *gd; + +#ifdef ENABLE_PASSTHROUGH + struct block_device *bdev; +#endif +}; + +struct blktap_ring { + struct vm_area_struct *vma; + blkif_front_ring_t ring; + struct vm_foreign_map foreign_map; + unsigned long ring_vstart; + unsigned long user_vstart; + + int response; + + wait_queue_head_t poll_wait; + + dev_t devno; + struct class_device *dev; + atomic_t sysfs_refcnt; + struct mutex sysfs_mutex; +}; + +struct blktap_statistics { + unsigned long st_print; + int st_rd_req; + int st_wr_req; + int st_oo_req; + int st_rd_sect; + int st_wr_sect; + s64 st_rd_cnt; + s64 st_rd_sum_usecs; + s64 st_rd_max_usecs; + s64 st_wr_cnt; + s64 st_wr_sum_usecs; + s64 st_wr_max_usecs; +}; + +struct blktap_request { + uint64_t id; + uint16_t usr_idx; + + uint8_t status; + atomic_t pendcnt; + uint8_t nr_pages; + unsigned short operation; + + struct timeval time; + struct grant_handle_pair handles[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct list_head free_list; +}; + +struct blktap { + int minor; + pid_t pid; + atomic_t refcnt; + unsigned long dev_inuse; + + struct blktap_params params; + + struct rw_semaphore tap_sem; + + struct blktap_ring ring; + struct blktap_device device; + + int pending_cnt; + struct blktap_request *pending_requests[MAX_PENDING_REQS]; + + wait_queue_head_t wq; + struct list_head deferred_queue; + + struct blktap_statistics stats; +}; + +extern struct blktap *blktaps[MAX_BLKTAP_DEVICE]; + +static inline int +blktap_active(struct blktap *tap) +{ + return test_bit(BLKTAP_RING_VMA, &tap->dev_inuse); +} + +static inline int +blktap_validate_params(struct blktap *tap, struct blktap_params *params) +{ + /* TODO: sanity check */ + params->name[sizeof(params->name) - 1] = '\0'; + BTINFO("%s: capacity: %llu, sector-size: %lu\n", + params->name, params->capacity, params->sector_size); + return 0; +} + +int blktap_control_destroy_device(struct blktap *); + +int blktap_ring_init(int *); +int blktap_ring_free(void); +int blktap_ring_create(struct blktap *); +int blktap_ring_destroy(struct blktap *); +int blktap_ring_pause(struct blktap *); +int blktap_ring_resume(struct blktap *); +void blktap_ring_kick_user(struct blktap *); + +int blktap_sysfs_init(void); +void blktap_sysfs_free(void); +int blktap_sysfs_create(struct blktap *); +int blktap_sysfs_destroy(struct blktap *); + +int blktap_device_init(int *); +void blktap_device_free(void); +int blktap_device_create(struct blktap *); +int blktap_device_destroy(struct blktap *); +int blktap_device_pause(struct blktap *); +int blktap_device_resume(struct blktap *); +void blktap_device_restart(struct blktap *); 
+void blktap_device_finish_request(struct blktap *, + blkif_response_t *, + struct blktap_request *); +void blktap_device_fail_pending_requests(struct blktap *); +#ifdef ENABLE_PASSTHROUGH +int blktap_device_enable_passthrough(struct blktap *, + unsigned, unsigned); +#endif + +void blktap_defer(struct blktap *); +void blktap_run_deferred(void); + +int blktap_request_pool_init(void); +void blktap_request_pool_free(void); +int blktap_request_pool_grow(void); +int blktap_request_pool_shrink(void); +struct blktap_request *blktap_request_allocate(struct blktap *); +void blktap_request_free(struct blktap *, struct blktap_request *); +unsigned long request_to_kaddr(struct blktap_request *, int); + +#endif diff -r e410857fd83c drivers/xen/blktap2/control.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/drivers/xen/blktap2/control.c Wed Oct 29 21:19:50 2008 -0700 @@ -0,0 +1,277 @@ +#include +#include + +#include "blktap.h" + +static DEFINE_SPINLOCK(blktap_control_lock); +struct blktap *blktaps[MAX_BLKTAP_DEVICE]; + +static int ring_major; +static int device_major; +static int blktap_control_registered; + +static void +blktap_control_initialize_tap(struct blktap *tap) +{ + int minor = tap->minor; + + memset(tap, 0, sizeof(*tap)); + set_bit(BLKTAP_CONTROL, &tap->dev_inuse); + init_rwsem(&tap->tap_sem); + init_waitqueue_head(&tap->wq); + atomic_set(&tap->refcnt, 0); + + tap->minor = minor; +} + +static struct blktap * +blktap_control_create_tap(void) +{ + int minor; + struct blktap *tap; + + tap = kmalloc(sizeof(*tap), GFP_KERNEL); + if (unlikely(!tap)) + return NULL; + + blktap_control_initialize_tap(tap); + + spin_lock_irq(&blktap_control_lock); + for (minor = 0; minor < MAX_BLKTAP_DEVICE; minor++) + if (!blktaps[minor]) + break; + + if (minor == MAX_BLKTAP_DEVICE) { + kfree(tap); + tap = NULL; + goto out; + } + + tap->minor = minor; + blktaps[minor] = tap; + +out: + spin_unlock_irq(&blktap_control_lock); + return tap; +} + +static struct blktap * +blktap_control_allocate_tap(void) +{ + int err, minor; + struct blktap *tap; + + /* + * This is called only from the ioctl, which + * means we should always have interrupts enabled. 
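 * (If they were not, the spin_lock_irq()/spin_unlock_irq() pair below
 * would silently re-enable interrupts behind the caller's back; hence
 * the BUG_ON.)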
+ */ + BUG_ON(irqs_disabled()); + + spin_lock_irq(&blktap_control_lock); + + for (minor = 0; minor < MAX_BLKTAP_DEVICE; minor++) { + tap = blktaps[minor]; + if (!tap) + goto found; + + if (!tap->dev_inuse) { + blktap_control_initialize_tap(tap); + goto found; + } + } + + tap = NULL; + +found: + spin_unlock_irq(&blktap_control_lock); + + if (!tap) { + tap = blktap_control_create_tap(); + if (!tap) + return NULL; + } + + err = blktap_ring_create(tap); + if (err) { + BTERR("ring creation failed: %d\n", err); + clear_bit(BLKTAP_CONTROL, &tap->dev_inuse); + return NULL; + } + + BTINFO("allocated tap %p\n", tap); + return tap; +} + +static int +blktap_control_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + unsigned long dev; + struct blktap *tap; + + switch (cmd) { + case BLKTAP2_IOCTL_ALLOC_TAP: { + struct blktap_handle h; + + tap = blktap_control_allocate_tap(); + if (!tap) { + BTERR("error allocating device\n"); + return -ENOMEM; + } + + h.ring = ring_major; + h.device = device_major; + h.minor = tap->minor; + + if (copy_to_user((struct blktap_handle __user *)arg, + &h, sizeof(h))) { + blktap_control_destroy_device(tap); + return -EFAULT; + } + + return 0; + } + + case BLKTAP2_IOCTL_FREE_TAP: + dev = arg; + + if (dev > MAX_BLKTAP_DEVICE || !blktaps[dev]) + return -EINVAL; + + blktap_control_destroy_device(blktaps[dev]); + return 0; + } + + return -ENOIOCTLCMD; +} + +static struct file_operations blktap_control_file_operations = { + .owner = THIS_MODULE, + .ioctl = blktap_control_ioctl, +}; + +static struct miscdevice blktap_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "blktap-control", + .fops = &blktap_control_file_operations, +}; + +int +blktap_control_destroy_device(struct blktap *tap) +{ + int err; + unsigned long inuse; + + if (!tap) + return 0; + + set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse); + + for (;;) { + inuse = tap->dev_inuse; + err = blktap_device_destroy(tap); + if (err) + goto wait; + + inuse = tap->dev_inuse; + err = blktap_ring_destroy(tap); + if (err) + goto wait; + + inuse = tap->dev_inuse; + err = blktap_sysfs_destroy(tap); + if (err) + goto wait; + + break; + + wait: + BTDBG("inuse: 0x%lx, dev_inuse: 0x%lx\n", + inuse, tap->dev_inuse); + if (wait_event_interruptible(tap->wq, tap->dev_inuse != inuse)) + break; + } + + clear_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse); + + if (tap->dev_inuse == (1UL << BLKTAP_CONTROL)) { + err = 0; + clear_bit(BLKTAP_CONTROL, &tap->dev_inuse); + } + + return err; +} + +static int +blktap_control_init(void) +{ + int err; + + err = misc_register(&blktap_misc); + if (err) { + BTERR("misc_register failed for control device"); + return err; + } + + blktap_control_registered = 1; + return 0; +} + +static void +blktap_control_free(void) +{ + int i; + + for (i = 0; i < MAX_BLKTAP_DEVICE; i++) + blktap_control_destroy_device(blktaps[i]); + + if (blktap_control_registered) + if (misc_deregister(&blktap_misc) < 0) + BTERR("misc_deregister failed for control device"); +} + +static void +blktap_exit(void) +{ + blktap_control_free(); + blktap_ring_free(); + blktap_sysfs_free(); + blktap_device_free(); + blktap_request_pool_free(); +} + +static int __init +blktap_init(void) +{ + int err; + + err = blktap_request_pool_init(); + if (err) + return err; + + err = blktap_device_init(&device_major); + if (err) + goto fail; + + err = blktap_ring_init(&ring_major); + if (err) + goto fail; + + err = blktap_sysfs_init(); + if (err) + goto fail; + + err = blktap_control_init(); + if (err) + goto fail; + + 
return 0; + +fail: + blktap_exit(); + return err; +} + +module_init(blktap_init); +module_exit(blktap_exit); +MODULE_LICENSE("Dual BSD/GPL"); diff -r e410857fd83c drivers/xen/blktap2/device.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/drivers/xen/blktap2/device.c Wed Oct 29 21:19:50 2008 -0700 @@ -0,0 +1,1134 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "blktap.h" + +#ifdef CONFIG_XEN_BLKDEV_BACKEND +#include "../blkback/blkback-pagemap.h" +#else +struct blkback_pagemap { }; +#define blkback_pagemap_read(page) BUG(); +#endif + +#if 0 +#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a) +#else +#define DPRINTK_IOCTL(_f, _a...) ((void)0) +#endif + +struct blktap_grant_table { + int cnt; + struct gnttab_map_grant_ref grants[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2]; +}; + +static int blktap_device_major; + +static inline struct blktap * +dev_to_blktap(struct blktap_device *dev) +{ + return container_of(dev, struct blktap, device); +} + +static int +blktap_device_open(struct inode *inode, struct file *filep) +{ + struct blktap *tap; + struct blktap_device *dev = inode->i_bdev->bd_disk->private_data; + + if (!dev) + return -ENOENT; + + tap = dev_to_blktap(dev); + if (!blktap_active(tap) || + test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) + return -ENOENT; + + dev->users++; + + return 0; +} + +static int +blktap_device_release(struct inode *inode, struct file *filep) +{ + struct blktap_device *dev = inode->i_bdev->bd_disk->private_data; + struct blktap *tap = dev_to_blktap(dev); + + dev->users--; + if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) + blktap_device_destroy(tap); + + return 0; +} + +static int +blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg) +{ + /* We don't have real geometry info, but let's at least return + values consistent with the size of the device */ + sector_t nsect = get_capacity(bd->bd_disk); + sector_t cylinders = nsect; + + hg->heads = 0xff; + hg->sectors = 0x3f; + sector_div(cylinders, hg->heads * hg->sectors); + hg->cylinders = cylinders; + if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect) + hg->cylinders = 0xffff; + return 0; +} + +static int +blktap_device_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument) +{ + int i; + + DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", + command, (long)argument, inode->i_rdev); + + switch (command) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) + case HDIO_GETGEO: { + struct block_device *bd = inode->i_bdev; + struct hd_geometry geo; + int ret; + + if (!argument) + return -EINVAL; + + geo.start = get_start_sect(bd); + ret = blktap_device_getgeo(bd, &geo); + if (ret) + return ret; + + if (copy_to_user((struct hd_geometry __user *)argument, &geo, + sizeof(geo))) + return -EFAULT; + + return 0; + } +#endif + case CDROMMULTISESSION: + BTDBG("FIXME: support multisession CDs later\n"); + for (i = 0; i < sizeof(struct cdrom_multisession); i++) + if (put_user(0, (char __user *)(argument + i))) + return -EFAULT; + return 0; + + case SCSI_IOCTL_GET_IDLUN: + if (!access_ok(VERIFY_WRITE, argument, + sizeof(struct scsi_idlun))) + return -EFAULT; + + /* return 0 for now. 
*/ + __put_user(0, &((struct scsi_idlun __user *)argument)->dev_id); + __put_user(0, + &((struct scsi_idlun __user *)argument)->host_unique_id); + return 0; + + default: + /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", + command);*/ + return -EINVAL; /* same return as native Linux */ + } + + return 0; +} + +static struct block_device_operations blktap_device_file_operations = { + .owner = THIS_MODULE, + .open = blktap_device_open, + .release = blktap_device_release, + .ioctl = blktap_device_ioctl, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) + .getgeo = blktap_device_getgeo +#endif +}; + +static int +blktap_map_uaddr_fn(pte_t *ptep, struct page *pmd_page, + unsigned long addr, void *data) +{ + pte_t *pte = (pte_t *)data; + + BTDBG("ptep %p -> %012llx\n", ptep, pte_val(*pte)); + set_pte(ptep, *pte); + xen_invlpg(addr); + return 0; +} + +static int +blktap_map_uaddr(struct mm_struct *mm, unsigned long address, pte_t pte) +{ + return apply_to_page_range(mm, address, + PAGE_SIZE, blktap_map_uaddr_fn, &pte); +} + +static int +blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page, + unsigned long addr, void *data) +{ + struct mm_struct *mm = (struct mm_struct *)data; + + BTDBG("ptep %p\n", ptep); + pte_clear(mm, addr, ptep); + xen_invlpg(addr); + return 0; +} + +static int +blktap_umap_uaddr(struct mm_struct *mm, unsigned long address) +{ + return apply_to_page_range(mm, address, + PAGE_SIZE, blktap_umap_uaddr_fn, mm); +} + +static void +blktap_device_end_dequeued_request(struct blktap_device *dev, + struct request *req, int uptodate) +{ + int ret; + + ret = end_that_request_first(req, uptodate, req->hard_nr_sectors); + BUG_ON(ret); + + spin_lock_irq(&dev->lock); + end_that_request_last(req, uptodate); + spin_unlock_irq(&dev->lock); +} + +/* + * tap->tap_sem held on entry + */ +static void +blktap_device_fast_flush(struct blktap *tap, struct blktap_request *request) +{ + uint64_t ptep; + int ret, usr_idx; + unsigned int i, cnt; + struct page **map, *page; + struct blktap_ring *ring; + struct grant_handle_pair *khandle; + unsigned long kvaddr, uvaddr, offset; + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2]; + + cnt = 0; + ring = &tap->ring; + usr_idx = request->usr_idx; + map = ring->foreign_map.map; + + if (!ring->vma) + return; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + zap_page_range(ring->vma, + MMAP_VADDR(ring->user_vstart, usr_idx, 0), + request->nr_pages << PAGE_SHIFT, NULL); + + for (i = 0; i < request->nr_pages; i++) { + kvaddr = request_to_kaddr(request, i); + uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i); + + khandle = request->handles + i; + + if (khandle->kernel != INVALID_GRANT_HANDLE) { + gnttab_set_unmap_op(&unmap[cnt], kvaddr, + GNTMAP_host_map, khandle->kernel); + cnt++; + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, + INVALID_P2M_ENTRY); + } + + if (khandle->user != INVALID_GRANT_HANDLE) { + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); + if (create_lookup_pte_addr(ring->vma->vm_mm, + uvaddr, &ptep) != 0) { + BTERR("Couldn't get a pte addr!\n"); + return; + } + + gnttab_set_unmap_op(&unmap[cnt], ptep, + GNTMAP_host_map + | GNTMAP_application_map + | GNTMAP_contains_pte, + khandle->user); + cnt++; + } + + offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT; + + BTDBG("offset: 0x%08lx, page: %p, request: %p, usr_idx: %d, " + "seg: %d, kvaddr: 0x%08lx, khandle: %u, uvaddr: " + "0x%08lx, handle: %u\n", offset, map[offset], request, + usr_idx, i, kvaddr, khandle->kernel, uvaddr, + khandle->user); + + 
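/*
 * Any grant handles this segment still held have been queued for
 * unmapping above.  The lines below drop the page from the ring's
 * VM_FOREIGN map and clear its Blkback tag (set by blkback or by
 * blktap_prep_foreign()) so the slot can be reused.
 */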
page = map[offset]; + if (page) { + ClearPageReserved(map[offset]); + if (PageBlkback(page)) { + ClearPageBlkback(page); + set_page_private(page, 0); + } + } + map[offset] = NULL; + + khandle->kernel = INVALID_GRANT_HANDLE; + khandle->user = INVALID_GRANT_HANDLE; + } + + if (cnt) { + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + unmap, cnt); + BUG_ON(ret); + } + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + zap_page_range(ring->vma, + MMAP_VADDR(ring->user_vstart, usr_idx, 0), + request->nr_pages << PAGE_SHIFT, NULL); +} + +/* + * tap->tap_sem held on entry + */ +static void +blktap_unmap(struct blktap *tap, struct blktap_request *request) +{ + int i, usr_idx; + unsigned long kvaddr; + + usr_idx = request->usr_idx; + down_write(&tap->ring.vma->vm_mm->mmap_sem); + + for (i = 0; i < request->nr_pages; i++) { + BTDBG("request: %p, seg: %d, kvaddr: 0x%08lx, khandle: %u, " + "uvaddr: 0x%08lx, uhandle: %u\n", request, i, + request_to_kaddr(request, i), + request->handles[i].kernel, + MMAP_VADDR(tap->ring.user_vstart, usr_idx, i), + request->handles[i].user); + + if (request->handles[i].kernel == INVALID_GRANT_HANDLE) { + kvaddr = request_to_kaddr(request, i); + blktap_umap_uaddr(&init_mm, kvaddr); + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, + INVALID_P2M_ENTRY); + } + } + + blktap_device_fast_flush(tap, request); + up_write(&tap->ring.vma->vm_mm->mmap_sem); +} + +/* + * called if the tapdisk process dies unexpectedly. + * fail and release any pending requests and disable queue. + */ +void +blktap_device_fail_pending_requests(struct blktap *tap) +{ + int usr_idx; + struct request *req; + struct blktap_device *dev; + struct blktap_request *request; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) + return; + + down_write(&tap->tap_sem); + + dev = &tap->device; + for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) { + request = tap->pending_requests[usr_idx]; + if (!request) + continue; + + BTERR("%u:%u: failing pending %s of %d pages\n", + blktap_device_major, tap->minor, + (request->operation == BLKIF_OP_READ ? 
+ "read" : "write"), request->nr_pages); + + blktap_unmap(tap, request); + req = (struct request *)(unsigned long)request->id; + blktap_device_end_dequeued_request(dev, req, 0); + blktap_request_free(tap, request); + } + + up_write(&tap->tap_sem); + + spin_lock_irq(&dev->lock); + + /* fail any future requests */ + dev->gd->queue->queuedata = NULL; + blk_start_queue(dev->gd->queue); + + spin_unlock_irq(&dev->lock); +} + +/* + * tap->tap_sem held on entry + */ +void +blktap_device_finish_request(struct blktap *tap, + blkif_response_t *res, + struct blktap_request *request) +{ + int uptodate; + struct request *req; + struct blktap_device *dev; + + dev = &tap->device; + + blktap_unmap(tap, request); + + req = (struct request *)(unsigned long)request->id; + uptodate = (res->status == BLKIF_RSP_OKAY); + + BTDBG("req %p res status %d operation %d/%d id %lld\n", req, + res->status, res->operation, request->operation, res->id); + + switch (request->operation) { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + if (unlikely(res->status != BLKIF_RSP_OKAY)) + BTERR("Bad return from device data " + "request: %x\n", res->status); + blktap_device_end_dequeued_request(dev, req, uptodate); + break; + default: + BUG(); + } + + blktap_request_free(tap, request); +} + +static int +blktap_prep_foreign(struct blktap *tap, + struct blktap_request *request, + blkif_request_t *blkif_req, + unsigned int seg, struct page *page, + struct blktap_grant_table *table) +{ + uint64_t ptep; + uint32_t flags; + struct page *tap_page; + struct blktap_ring *ring; + struct blkback_pagemap map; + unsigned long uvaddr, kvaddr; + + ring = &tap->ring; + map = blkback_pagemap_read(page); + blkif_req->seg[seg].gref = map.gref; + + uvaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg); + kvaddr = request_to_kaddr(request, seg); + flags = GNTMAP_host_map | + (request->operation == BLKIF_OP_WRITE ? 
GNTMAP_readonly : 0); + + gnttab_set_map_op(&table->grants[table->cnt], + kvaddr, flags, map.gref, map.domid); + table->cnt++; + + /* enable chained tap devices */ + tap_page = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); + set_page_private(tap_page, page_private(page)); + SetPageBlkback(tap_page); + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (create_lookup_pte_addr(ring->vma->vm_mm, uvaddr, &ptep)) { + BTERR("couldn't get a pte addr!\n"); + return -1; + } + + flags |= GNTMAP_application_map | GNTMAP_contains_pte; + gnttab_set_map_op(&table->grants[table->cnt], + ptep, flags, map.gref, map.domid); + table->cnt++; + + return 0; +} + +static int +blktap_map_foreign(struct blktap *tap, + struct blktap_request *request, + blkif_request_t *blkif_req, + struct blktap_grant_table *table) +{ + struct page *page; + int i, grant, err, usr_idx; + struct blktap_ring *ring; + unsigned long uvaddr, kvaddr, foreign_mfn; + + if (!table->cnt) + return 0; + + err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + table->grants, table->cnt); + BUG_ON(err); + + grant = 0; + usr_idx = request->usr_idx; + ring = &tap->ring; + + for (i = 0; i < request->nr_pages; i++) { + if (!blkif_req->seg[i].gref) + continue; + + uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i); + kvaddr = request_to_kaddr(request, i); + + if (unlikely(table->grants[grant].status)) { + BTERR("invalid kernel buffer: could not remap it\n"); + err |= 1; + table->grants[grant].handle = INVALID_GRANT_HANDLE; + } + + request->handles[i].kernel = table->grants[grant].handle; + foreign_mfn = table->grants[grant].dev_bus_addr >> PAGE_SHIFT; + grant++; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + goto done; + + if (unlikely(table->grants[grant].status)) { + BTERR("invalid user buffer: could not remap it\n"); + err |= 1; + table->grants[grant].handle = INVALID_GRANT_HANDLE; + } + + request->handles[i].user = table->grants[grant].handle; + grant++; + + done: + if (err) + continue; + + page = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, + FOREIGN_FRAME(foreign_mfn)); + else if (vm_insert_page(ring->vma, uvaddr, page)) + err |= 1; + + BTDBG("pending_req: %p, seg: %d, page: %p, " + "kvaddr: 0x%08lx, khandle: %u, uvaddr: 0x%08lx, " + "uhandle: %u\n", request, i, page, + kvaddr, request->handles[i].kernel, + uvaddr, request->handles[i].user); + } + + return err; +} + +static void +blktap_map(struct blktap *tap, + struct blktap_request *request, + unsigned int seg, struct page *page) +{ + pte_t pte; + int usr_idx; + struct blktap_ring *ring; + unsigned long uvaddr, kvaddr; + + ring = &tap->ring; + usr_idx = request->usr_idx; + uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, seg); + kvaddr = request_to_kaddr(request, seg); + + pte = mk_pte(page, ring->vma->vm_page_prot); + blktap_map_uaddr(ring->vma->vm_mm, uvaddr, pte_mkwrite(pte)); + blktap_map_uaddr(&init_mm, kvaddr, mk_pte(page, PAGE_KERNEL)); + + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte)); + request->handles[seg].kernel = INVALID_GRANT_HANDLE; + request->handles[seg].user = INVALID_GRANT_HANDLE; + + BTDBG("pending_req: %p, seg: %d, page: %p, kvaddr: 0x%08lx, " + "uvaddr: 0x%08lx\n", request, seg, page, kvaddr, + uvaddr); +} + +static int +blktap_device_process_request(struct blktap *tap, + struct blktap_request *request, + struct request *req) +{ + struct bio *bio; + struct page *page; + struct bio_vec *bvec; + int idx, usr_idx, err; + struct 
blktap_ring *ring; + struct blktap_grant_table table; + unsigned int fsect, lsect, nr_sects; + unsigned long offset, uvaddr, kvaddr; + struct blkif_request blkif_req, *target; + + err = -1; + memset(&table, 0, sizeof(table)); + + down_read(&tap->tap_sem); + + if (!blktap_active(tap)) + goto out; + + ring = &tap->ring; + usr_idx = request->usr_idx; + blkif_req.id = usr_idx; + blkif_req.sector_number = (blkif_sector_t)req->sector; + blkif_req.handle = 0; + blkif_req.operation = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + + request->id = (unsigned long)req; + request->operation = blkif_req.operation; + request->status = BLKIF_RSP_OKAY; + do_gettimeofday(&request->time); + + nr_sects = 0; + request->nr_pages = 0; + blkif_req.nr_segments = 0; + rq_for_each_bio(bio, req) { + bio_for_each_segment(bvec, bio, idx) { + BUG_ON(blkif_req.nr_segments == + BLKIF_MAX_SEGMENTS_PER_REQUEST); + + fsect = bvec->bv_offset >> 9; + lsect = fsect + (bvec->bv_len >> 9) - 1; + nr_sects += bvec->bv_len >> 9; + + blkif_req.seg[blkif_req.nr_segments] = + (struct blkif_request_segment) { + .gref = 0, + .first_sect = fsect, + .last_sect = lsect }; + + if (PageBlkback(bvec->bv_page)) { + /* foreign page -- use xen */ + if (blktap_prep_foreign(tap, + request, + &blkif_req, + blkif_req.nr_segments, + bvec->bv_page, + &table)) + goto out; + } else { + /* do it the old fashioned way */ + blktap_map(tap, + request, + blkif_req.nr_segments, + bvec->bv_page); + } + + uvaddr = MMAP_VADDR(ring->user_vstart, + usr_idx, blkif_req.nr_segments); + kvaddr = request_to_kaddr(request, + blkif_req.nr_segments); + offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT; + page = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); + ring->foreign_map.map[offset] = page; + SetPageReserved(page); + + BTDBG("mapped uaddr %08lx to page %p pfn 0x%lx\n", + uvaddr, page, __pa(kvaddr) >> PAGE_SHIFT); + BTDBG("offset: 0x%08lx, pending_req: %p, seg: %d, " + "page: %p, kvaddr: 0x%08lx, uvaddr: 0x%08lx\n", + offset, request, blkif_req.nr_segments, + page, kvaddr, uvaddr); + + blkif_req.nr_segments++; + request->nr_pages++; + } + } + + if (blktap_map_foreign(tap, request, &blkif_req, &table)) + goto out; + + /* Finally, write the request message to the user ring. 
*/ + target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt); + memcpy(target, &blkif_req, sizeof(blkif_req)); + target->id = request->usr_idx; + wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */ + ring->ring.req_prod_pvt++; + + if (rq_data_dir(req)) { + tap->stats.st_wr_sect += nr_sects; + tap->stats.st_wr_req++; + } else { + tap->stats.st_rd_sect += nr_sects; + tap->stats.st_rd_req++; + } + + err = 0; + +out: + if (err) + blktap_device_fast_flush(tap, request); + + up_read(&tap->tap_sem); + return err; +} + +#ifdef ENABLE_PASSTHROUGH +#define rq_for_each_bio_safe(_bio, _tmp, _req) \ + if ((_req)->bio) \ + for (_bio = (_req)->bio; \ + _bio && ((_tmp = _bio->bi_next) || 1); \ + _bio = _tmp) + +static void +blktap_device_forward_request(struct blktap *tap, struct request *req) +{ + struct bio *bio, *tmp; + struct blktap_device *dev; + + dev = &tap->device; + + rq_for_each_bio_safe(bio, tmp, req) { + bio->bi_bdev = dev->bdev; + submit_bio(bio->bi_rw, bio); + } +} + +static void +blktap_device_close_bdev(struct blktap *tap) +{ + struct blktap_device *dev; + + dev = &tap->device; + + if (dev->bdev) + blkdev_put(dev->bdev); + + dev->bdev = NULL; + clear_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse); +} + +static int +blktap_device_open_bdev(struct blktap *tap, u32 pdev) +{ + struct block_device *bdev; + struct blktap_device *dev; + + dev = &tap->device; + + bdev = open_by_devnum(pdev, FMODE_WRITE); + if (IS_ERR(bdev)) { + BTERR("opening device %x:%x failed: %ld\n", + MAJOR(pdev), MINOR(pdev), PTR_ERR(bdev)); + return PTR_ERR(bdev); + } + + if (!bdev->bd_disk) { + BTERR("device %x:%x doesn't exist\n", + MAJOR(pdev), MINOR(pdev)); + blkdev_put(dev->bdev); + return -ENOENT; + } + + dev->bdev = bdev; + set_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse); + + /* TODO: readjust queue parameters */ + + BTINFO("set device %d to passthrough on %x:%x\n", + tap->minor, MAJOR(pdev), MINOR(pdev)); + + return 0; +} + +int +blktap_device_enable_passthrough(struct blktap *tap, + unsigned major, unsigned minor) +{ + u32 pdev; + struct blktap_device *dev; + + dev = &tap->device; + pdev = MKDEV(major, minor); + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EINVAL; + + if (dev->bdev) { + if (pdev) + return -EINVAL; + blktap_device_close_bdev(tap); + return 0; + } + + return blktap_device_open_bdev(tap, pdev); +} +#endif + +/* + * dev->lock held on entry + */ +static void +blktap_device_run_queue(struct blktap *tap) +{ + int queued, err; + request_queue_t *rq; + struct request *req; + struct blktap_ring *ring; + struct blktap_device *dev; + struct blktap_request *request; + + queued = 0; + ring = &tap->ring; + dev = &tap->device; + rq = dev->gd->queue; + + BTDBG("running queue for %d\n", tap->minor); + + while ((req = elv_next_request(rq)) != NULL) { + if (!blk_fs_request(req)) { + end_request(req, 0); + continue; + } + + if (blk_barrier_rq(req)) { + end_request(req, 0); + continue; + } + +#ifdef ENABLE_PASSTHROUGH + if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) { + blkdev_dequeue_request(req); + blktap_device_forward_request(tap, req); + continue; + } +#endif + + if (RING_FULL(&ring->ring)) { + wait: + /* Avoid pointless unplugs. 
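 * The request has not been dequeued yet at this point (nor when the
 * allocation failure below jumps back here), so stopping the queue
 * and deferring simply leaves it to be retried once
 * blktap_device_restart() re-starts the queue. */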
*/ + blk_stop_queue(rq); + blktap_defer(tap); + break; + } + + request = blktap_request_allocate(tap); + if (!request) { + tap->stats.st_oo_req++; + goto wait; + } + + BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%lx) " + "buffer:%p [%s], pending: %p\n", req, tap->minor, + req->cmd, req->sector, req->current_nr_sectors, + req->nr_sectors, req->buffer, + rq_data_dir(req) ? "write" : "read", request); + + blkdev_dequeue_request(req); + + spin_unlock_irq(&dev->lock); + + err = blktap_device_process_request(tap, request, req); + if (!err) + queued++; + else { + blktap_device_end_dequeued_request(dev, req, 0); + blktap_request_free(tap, request); + } + + spin_lock_irq(&dev->lock); + } + + if (queued) + blktap_ring_kick_user(tap); +} + +/* + * dev->lock held on entry + */ +static void +blktap_device_do_request(request_queue_t *rq) +{ + struct request *req; + struct blktap *tap; + struct blktap_device *dev; + + dev = rq->queuedata; + if (!dev) + goto fail; + + tap = dev_to_blktap(dev); + if (!blktap_active(tap)) + goto fail; + + if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) || + test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) { + blktap_defer(tap); + return; + } + + blktap_device_run_queue(tap); + return; + +fail: + while ((req = elv_next_request(rq))) { + BTERR("device closed: failing secs %llu - %llu\n", + req->sector, req->sector + req->nr_sectors); + end_request(req, 0); + } +} + +void +blktap_device_restart(struct blktap *tap) +{ + struct blktap_device *dev; + + dev = &tap->device; + if (!dev->gd || !dev->gd->queue) + return; + + if (blktap_active(tap) && RING_FULL(&tap->ring.ring)) { + blktap_defer(tap); + return; + } + + if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) || + test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) { + blktap_defer(tap); + return; + } + + spin_lock_irq(&dev->lock); + + /* Re-enable calldowns. */ + if (blk_queue_stopped(dev->gd->queue)) + blk_start_queue(dev->gd->queue); + + /* Kick things off immediately. */ + blktap_device_do_request(dev->gd->queue); + + spin_unlock_irq(&dev->lock); +} + +static void +blktap_device_configure(struct blktap *tap) +{ + struct request_queue *rq; + struct blktap_device *dev = &tap->device; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !dev->gd) + return; + + dev = &tap->device; + rq = dev->gd->queue; + + spin_lock_irq(&dev->lock); + + set_capacity(dev->gd, tap->params.capacity); + + /* Hard sector size and max sectors impersonate the equiv. hardware. */ + blk_queue_hardsect_size(rq, tap->params.sector_size); + blk_queue_max_sectors(rq, 512); + + /* Each segment in a request is up to an aligned page in size. */ + blk_queue_segment_boundary(rq, PAGE_SIZE - 1); + blk_queue_max_segment_size(rq, PAGE_SIZE); + + /* Ensure a merged request will fit in a single I/O ring slot. */ + blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + + /* Make sure buffer addresses are sector-aligned. 
*/ + blk_queue_dma_alignment(rq, 511); + + spin_unlock_irq(&dev->lock); +} + +int +blktap_device_resume(struct blktap *tap) +{ + int err; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap)) + return -ENODEV; + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return 0; + + err = blktap_ring_resume(tap); + if (err) + return err; + + /* device size may have changed */ + blktap_device_configure(tap); + + BTDBG("restarting device\n"); + blktap_device_restart(tap); + + return 0; +} + +int +blktap_device_pause(struct blktap *tap) +{ + unsigned long flags; + struct blktap_device *dev = &tap->device; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap)) + return -ENODEV; + + if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return 0; + + spin_lock_irqsave(&dev->lock, flags); + + blk_stop_queue(dev->gd->queue); + set_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse); + + spin_unlock_irqrestore(&dev->lock, flags); + + return blktap_ring_pause(tap); +} + +int +blktap_device_destroy(struct blktap *tap) +{ + struct blktap_device *dev = &tap->device; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) + return 0; + + BTINFO("destroy device %d users %d\n", tap->minor, dev->users); + + if (dev->users) + return -EBUSY; + + spin_lock_irq(&dev->lock); + /* No more blktap_device_do_request(). */ + blk_stop_queue(dev->gd->queue); + clear_bit(BLKTAP_DEVICE, &tap->dev_inuse); + spin_unlock_irq(&dev->lock); + +#ifdef ENABLE_PASSTHROUGH + if (dev->bdev) + blktap_device_close_bdev(tap); +#endif + + del_gendisk(dev->gd); + put_disk(dev->gd); + blk_cleanup_queue(dev->gd->queue); + + dev->gd = NULL; + + wake_up(&tap->wq); + + return 0; +} + +int +blktap_device_create(struct blktap *tap) +{ + int minor, err; + struct gendisk *gd; + struct request_queue *rq; + struct blktap_device *dev; + + gd = NULL; + rq = NULL; + dev = &tap->device; + minor = tap->minor; + + if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) + return -EEXIST; + + if (blktap_validate_params(tap, &tap->params)) + return -EINVAL; + + BTINFO("minor %d sectors %Lu sector-size %lu\n", + minor, tap->params.capacity, tap->params.sector_size); + + err = -ENODEV; + + gd = alloc_disk(1); + if (!gd) + goto error; + + if (minor < 26) + sprintf(gd->disk_name, "tapdev%c", 'a' + minor); + else + sprintf(gd->disk_name, "tapdev%c%c", + 'a' + ((minor / 26) - 1), 'a' + (minor % 26)); + + gd->major = blktap_device_major; + gd->first_minor = minor; + gd->fops = &blktap_device_file_operations; + gd->private_data = dev; + + dev->lock = SPIN_LOCK_UNLOCKED; + rq = blk_init_queue(blktap_device_do_request, &dev->lock); + if (!rq) + goto error; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) + elevator_init(rq, "noop"); +#else + elevator_init(rq, &elevator_noop); +#endif + + gd->queue = rq; + rq->queuedata = dev; + dev->gd = gd; + + set_bit(BLKTAP_DEVICE, &tap->dev_inuse); + blktap_device_configure(tap); + + add_disk(gd); + + err = 0; + goto out; + + error: + if (gd) + del_gendisk(gd); + if (rq) + blk_cleanup_queue(rq); + + out: + BTINFO("creation of %u:%u: %d\n", blktap_device_major, tap->minor, err); + return err; +} + +int +blktap_device_init(int *maj) +{ + int major; + + /* Dynamically allocate a major for this device */ + major = register_blkdev(0, "tapdev"); + if (major < 0) { + BTERR("Couldn't register blktap device\n"); + return -ENOMEM; + } + + blktap_device_major = *maj = major; + BTINFO("blktap device major %d\n", major); + + return 0; +} + +void +blktap_device_free(void) +{ + if (blktap_device_major) + if 
(unregister_blkdev(blktap_device_major, "tapdev")) + BTERR("blktap device unregister failed\n"); +} diff -r e410857fd83c drivers/xen/blktap2/request.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/drivers/xen/blktap2/request.c Wed Oct 29 21:19:50 2008 -0700 @@ -0,0 +1,296 @@ +#include +#include + +#include "blktap.h" + +#define MAX_BUCKETS 8 +#define BUCKET_SIZE MAX_PENDING_REQS + +#define BLKTAP_POOL_CLOSING 1 + +struct blktap_request_bucket; + +struct blktap_request_handle { + int slot; + uint8_t inuse; + struct blktap_request request; + struct blktap_request_bucket *bucket; +}; + +struct blktap_request_bucket { + atomic_t reqs_in_use; + struct blktap_request_handle handles[BUCKET_SIZE]; + struct page **foreign_pages; +}; + +struct blktap_request_pool { + spinlock_t lock; + uint8_t status; + struct list_head free_list; + atomic_t reqs_in_use; + wait_queue_head_t wait_queue; + struct blktap_request_bucket *buckets[MAX_BUCKETS]; +}; + +static struct blktap_request_pool pool; + +static inline struct blktap_request_handle * +blktap_request_to_handle(struct blktap_request *req) +{ + return container_of(req, struct blktap_request_handle, request); +} + +static void +blktap_request_pool_init_request(struct blktap_request *request) +{ + int i; + + request->usr_idx = -1; + request->nr_pages = 0; + INIT_LIST_HEAD(&request->free_list); + for (i = 0; i < ARRAY_SIZE(request->handles); i++) { + request->handles[i].user = INVALID_GRANT_HANDLE; + request->handles[i].kernel = INVALID_GRANT_HANDLE; + } +} + +static int +blktap_request_pool_allocate_bucket(void) +{ + int i, idx; + unsigned long flags; + struct blktap_request *request; + struct blktap_request_handle *handle; + struct blktap_request_bucket *bucket; + + bucket = kzalloc(sizeof(struct blktap_request_bucket), GFP_KERNEL); + if (!bucket) + goto fail; + + bucket->foreign_pages = alloc_empty_pages_and_pagevec(MMAP_PAGES); + if (!bucket->foreign_pages) + goto fail; + + spin_lock_irqsave(&pool.lock, flags); + + idx = -1; + for (i = 0; i < MAX_BUCKETS; i++) { + if (!pool.buckets[i]) { + idx = i; + pool.buckets[idx] = bucket; + break; + } + } + + if (idx == -1) { + spin_unlock_irqrestore(&pool.lock, flags); + goto fail; + } + + for (i = 0; i < BUCKET_SIZE; i++) { + handle = bucket->handles + i; + request = &handle->request; + + handle->slot = i; + handle->inuse = 0; + handle->bucket = bucket; + + blktap_request_pool_init_request(request); + list_add_tail(&request->free_list, &pool.free_list); + } + + spin_unlock_irqrestore(&pool.lock, flags); + + return 0; + +fail: + if (bucket && bucket->foreign_pages) + free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES); + kfree(bucket); + return -ENOMEM; +} + +static void +blktap_request_pool_free_bucket(struct blktap_request_bucket *bucket) +{ + if (!bucket) + return; + + BTDBG("freeing bucket %p\n", bucket); + + free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES); + kfree(bucket); +} + +unsigned long +request_to_kaddr(struct blktap_request *req, int seg) +{ + struct blktap_request_handle *handle = blktap_request_to_handle(req); + int idx = handle->slot * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; + unsigned long pfn = page_to_pfn(handle->bucket->foreign_pages[idx]); + return (unsigned long)pfn_to_kaddr(pfn); +} + +int +blktap_request_pool_shrink(void) +{ + int i, err; + unsigned long flags; + struct blktap_request_bucket *bucket; + + err = -EAGAIN; + + spin_lock_irqsave(&pool.lock, flags); + + /* always keep at least one bucket */ + for (i = 1; i < MAX_BUCKETS; i++) { + bucket = 
pool.buckets[i]; + if (!bucket) + continue; + + if (atomic_read(&bucket->reqs_in_use)) + continue; + + blktap_request_pool_free_bucket(bucket); + pool.buckets[i] = NULL; + err = 0; + break; + } + + spin_unlock_irqrestore(&pool.lock, flags); + + return err; +} + +int +blktap_request_pool_grow(void) +{ + return blktap_request_pool_allocate_bucket(); +} + +struct blktap_request * +blktap_request_allocate(struct blktap *tap) +{ + int i; + uint16_t usr_idx; + unsigned long flags; + struct blktap_request *request; + + usr_idx = -1; + request = NULL; + + spin_lock_irqsave(&pool.lock, flags); + + if (pool.status == BLKTAP_POOL_CLOSING) + goto out; + + for (i = 0; i < ARRAY_SIZE(tap->pending_requests); i++) + if (!tap->pending_requests[i]) { + usr_idx = i; + break; + } + + if (usr_idx == (uint16_t)-1) + goto out; + + if (!list_empty(&pool.free_list)) { + request = list_entry(pool.free_list.next, + struct blktap_request, free_list); + list_del(&request->free_list); + } + + if (request) { + struct blktap_request_handle *handle; + + atomic_inc(&pool.reqs_in_use); + + handle = blktap_request_to_handle(request); + atomic_inc(&handle->bucket->reqs_in_use); + handle->inuse = 1; + + request->usr_idx = usr_idx; + + tap->pending_requests[usr_idx] = request; + tap->pending_cnt++; + } + +out: + spin_unlock_irqrestore(&pool.lock, flags); + return request; +} + +void +blktap_request_free(struct blktap *tap, struct blktap_request *request) +{ + int free; + unsigned long flags; + struct blktap_request_handle *handle; + + BUG_ON(request->usr_idx >= ARRAY_SIZE(tap->pending_requests)); + handle = blktap_request_to_handle(request); + + spin_lock_irqsave(&pool.lock, flags); + + handle->inuse = 0; + tap->pending_requests[request->usr_idx] = NULL; + blktap_request_pool_init_request(request); + list_add(&request->free_list, &pool.free_list); + atomic_dec(&handle->bucket->reqs_in_use); + free = atomic_dec_and_test(&pool.reqs_in_use); + + spin_unlock_irqrestore(&pool.lock, flags); + + if (--tap->pending_cnt == 0) + wake_up_interruptible(&tap->wq); + + if (free) + wake_up(&pool.wait_queue); +} + +void +blktap_request_pool_free(void) +{ + int i; + unsigned long flags; + + spin_lock_irqsave(&pool.lock, flags); + + pool.status = BLKTAP_POOL_CLOSING; + while (atomic_read(&pool.reqs_in_use)) { + spin_unlock_irqrestore(&pool.lock, flags); + wait_event(pool.wait_queue, !atomic_read(&pool.reqs_in_use)); + spin_lock_irqsave(&pool.lock, flags); + } + + for (i = 0; i < MAX_BUCKETS; i++) { + blktap_request_pool_free_bucket(pool.buckets[i]); + pool.buckets[i] = NULL; + } + + spin_unlock_irqrestore(&pool.lock, flags); +} + +int +blktap_request_pool_init(void) +{ + int i, err; + + memset(&pool, 0, sizeof(pool)); + + spin_lock_init(&pool.lock); + INIT_LIST_HEAD(&pool.free_list); + atomic_set(&pool.reqs_in_use, 0); + init_waitqueue_head(&pool.wait_queue); + + for (i = 0; i < 2; i++) { + err = blktap_request_pool_allocate_bucket(); + if (err) + goto fail; + } + + return 0; + +fail: + blktap_request_pool_free(); + return err; +} diff -r e410857fd83c drivers/xen/blktap2/ring.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/drivers/xen/blktap2/ring.c Wed Oct 29 21:19:50 2008 -0700 @@ -0,0 +1,613 @@ +#include +#include + +#include "blktap.h" + +static int blktap_ring_major; + +static inline struct blktap * +vma_to_blktap(struct vm_area_struct *vma) +{ + struct vm_foreign_map *m = vma->vm_private_data; + struct blktap_ring *r = container_of(m, struct blktap_ring, foreign_map); + return container_of(r, struct blktap, ring); +} + + /* + * BLKTAP 
- immediately before the mmap area, + * we have a bunch of pages reserved for shared memory rings. + */ +#define RING_PAGES 1 + +static int +blktap_read_ring(struct blktap *tap) +{ + /* This is called to read responses from the ring. */ + int usr_idx; + RING_IDX rc, rp; + blkif_response_t res; + struct blktap_ring *ring; + struct blktap_request *request; + + down_read(&tap->tap_sem); + + ring = &tap->ring; + if (!ring->vma) { + up_read(&tap->tap_sem); + return 0; + } + + /* for each outstanding message on the ring */ + rp = ring->ring.sring->rsp_prod; + rmb(); + + for (rc = ring->ring.rsp_cons; rc != rp; rc++) { + memcpy(&res, RING_GET_RESPONSE(&ring->ring, rc), sizeof(res)); + mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */ + ++ring->ring.rsp_cons; + + usr_idx = (int)res.id; + if (usr_idx >= MAX_PENDING_REQS || + !tap->pending_requests[usr_idx]) { + BTWARN("Request %d/%d invalid [%x], tapdisk %d%p\n", + rc, rp, usr_idx, tap->pid, ring->vma); + continue; + } + + request = tap->pending_requests[usr_idx]; + BTDBG("request %p response #%d id %x\n", request, rc, usr_idx); + blktap_device_finish_request(tap, &res, request); + } + + up_read(&tap->tap_sem); + + blktap_run_deferred(); + + return 0; +} + +static struct page * +blktap_ring_nopage(struct vm_area_struct *vma, + unsigned long address, int *type) +{ + /* + * if the page has not been mapped in by the driver then return + * NOPAGE_SIGBUS to the domain. + */ + + return NOPAGE_SIGBUS; +} + +static pte_t +blktap_ring_clear_pte(struct vm_area_struct *vma, + unsigned long uvaddr, + pte_t *ptep, int is_fullmm) +{ + pte_t copy; + struct blktap *tap; + unsigned long kvaddr; + struct page **map, *page; + struct blktap_ring *ring; + struct blktap_request *request; + struct grant_handle_pair *khandle; + struct gnttab_unmap_grant_ref unmap[2]; + int offset, seg, usr_idx, count = 0; + + tap = vma_to_blktap(vma); + ring = &tap->ring; + map = ring->foreign_map.map; + BUG_ON(!map); /* TODO Should this be changed to if statement? */ + + /* + * Zap entry if the address is before the start of the grant + * mapped region. 
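 * (Everything below user_vstart is just the shared ring page, an
 * ordinary kernel page inserted at mmap time, so a plain PTE clear is
 * enough; only the data area holds grant mappings.)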
+ */ + if (uvaddr < ring->user_vstart) + return ptep_get_and_clear_full(vma->vm_mm, uvaddr, + ptep, is_fullmm); + + offset = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT); + usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST; + seg = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST; + + offset = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT); + page = map[offset]; + if (page) { + ClearPageReserved(page); + if (PageBlkback(page)) { + ClearPageBlkback(page); + set_page_private(page, 0); + } + } + map[offset] = NULL; + + request = tap->pending_requests[usr_idx]; + kvaddr = request_to_kaddr(request, seg); + khandle = request->handles + seg; + + if (khandle->kernel != INVALID_GRANT_HANDLE) { + gnttab_set_unmap_op(&unmap[count], kvaddr, + GNTMAP_host_map, khandle->kernel); + count++; + + set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, + INVALID_P2M_ENTRY); + } + + + if (khandle->user != INVALID_GRANT_HANDLE) { + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); + + copy = *ptep; + gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep), + GNTMAP_host_map + | GNTMAP_application_map + | GNTMAP_contains_pte, + khandle->user); + count++; + } else + copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep, + is_fullmm); + + if (count) + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + unmap, count)) + BUG(); + + khandle->kernel = INVALID_GRANT_HANDLE; + khandle->user = INVALID_GRANT_HANDLE; + + return copy; +} + +static void +blktap_ring_vm_unmap(struct vm_area_struct *vma) +{ + struct blktap *tap = vma_to_blktap(vma); + + down_write(&tap->tap_sem); + clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse); + clear_bit(BLKTAP_PAUSED, &tap->dev_inuse); + clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse); + up_write(&tap->tap_sem); +} + +static void +blktap_ring_vm_close(struct vm_area_struct *vma) +{ + struct blktap *tap = vma_to_blktap(vma); + struct blktap_ring *ring = &tap->ring; + + blktap_ring_vm_unmap(vma); /* fail future requests */ + blktap_device_fail_pending_requests(tap); /* fail pending requests */ + blktap_device_restart(tap); /* fail deferred requests */ + + down_write(&tap->tap_sem); + + zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); + + kfree(ring->foreign_map.map); + ring->foreign_map.map = NULL; + + /* Free the ring page. 
*/ + ClearPageReserved(virt_to_page(ring->ring.sring)); + free_page((unsigned long)ring->ring.sring); + + BTINFO("unmapping ring %d\n", tap->minor); + ring->ring.sring = NULL; + ring->vma = NULL; + + up_write(&tap->tap_sem); + + wake_up(&tap->wq); +} + +static struct vm_operations_struct blktap_ring_vm_operations = { + .close = blktap_ring_vm_close, + .unmap = blktap_ring_vm_unmap, + .nopage = blktap_ring_nopage, + .zap_pte = blktap_ring_clear_pte, +}; + +static int +blktap_ring_open(struct inode *inode, struct file *filp) +{ + int idx; + struct blktap *tap; + + idx = iminor(inode); + if (idx < 0 || idx > MAX_BLKTAP_DEVICE || blktaps[idx] == NULL) { + BTERR("unable to open device blktap%d\n", idx); + return -ENODEV; + } + + tap = blktaps[idx]; + + BTINFO("opening device blktap%d\n", idx); + + if (!test_bit(BLKTAP_CONTROL, &tap->dev_inuse)) + return -ENODEV; + + /* Only one process can access ring at a time */ + if (test_and_set_bit(BLKTAP_RING_FD, &tap->dev_inuse)) + return -EBUSY; + + filp->private_data = tap; + BTINFO("opened device %d\n", tap->minor); + + return 0; +} + +static int +blktap_ring_release(struct inode *inode, struct file *filp) +{ + struct blktap *tap = filp->private_data; + + BTINFO("freeing device %d\n", tap->minor); + clear_bit(BLKTAP_RING_FD, &tap->dev_inuse); + filp->private_data = NULL; + wake_up(&tap->wq); + return 0; +} + +/* Note on mmap: + * We need to map pages to user space in a way that will allow the block + * subsystem set up direct IO to them. This couldn't be done before, because + * there isn't really a sane way to translate a user virtual address down to a + * physical address when the page belongs to another domain. + * + * My first approach was to map the page in to kernel memory, add an entry + * for it in the physical frame list (using alloc_lomem_region as in blkback) + * and then attempt to map that page up to user space. This is disallowed + * by xen though, which realizes that we don't really own the machine frame + * underlying the physical page. + * + * The new approach is to provide explicit support for this in xen linux. + * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages + * mapped from other vms. vma->vm_private_data is set up as a mapping + * from pages to actual page structs. There is a new clause in get_user_pages + * that does the right thing for this sort of mapping. + */ +static int +blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int size, err; + struct page **map; + struct blktap *tap; + blkif_sring_t *sring; + struct blktap_ring *ring; + + tap = filp->private_data; + ring = &tap->ring; + map = NULL; + sring = NULL; + + if (!tap || test_and_set_bit(BLKTAP_RING_VMA, &tap->dev_inuse)) + return -ENOMEM; + + size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + if (size != (MMAP_PAGES + RING_PAGES)) { + BTERR("you _must_ map exactly %lu pages!\n", + MMAP_PAGES + RING_PAGES); + return -EAGAIN; + } + + /* Allocate the fe ring. */ + sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL); + if (!sring) { + BTERR("Couldn't alloc sring.\n"); + goto fail_mem; + } + + map = kzalloc(size * sizeof(struct page *), GFP_KERNEL); + if (!map) { + BTERR("Couldn't alloc VM_FOREIGN map.\n"); + goto fail_mem; + } + + SetPageReserved(virt_to_page(sring)); + + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE); + + ring->ring_vstart = vma->vm_start; + ring->user_vstart = ring->ring_vstart + (RING_PAGES << PAGE_SHIFT); + + /* Map the ring pages to the start of the region and reserve it. 
*/ + if (xen_feature(XENFEAT_auto_translated_physmap)) + err = vm_insert_page(vma, vma->vm_start, + virt_to_page(ring->ring.sring)); + else + err = remap_pfn_range(vma, vma->vm_start, + __pa(ring->ring.sring) >> PAGE_SHIFT, + PAGE_SIZE, vma->vm_page_prot); + if (err) { + BTERR("Mapping user ring failed: %d\n", err); + goto fail; + } + + /* Mark this VM as containing foreign pages, and set up mappings. */ + ring->foreign_map.map = map; + vma->vm_private_data = &ring->foreign_map; + vma->vm_flags |= VM_FOREIGN; + vma->vm_flags |= VM_DONTCOPY; + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &blktap_ring_vm_operations; + +#ifdef CONFIG_X86 + vma->vm_mm->context.has_foreign_mappings = 1; +#endif + + tap->pid = current->pid; + BTINFO("blktap: mapping pid is %d\n", tap->pid); + + ring->vma = vma; + return 0; + + fail: + /* Clear any active mappings. */ + zap_page_range(vma, vma->vm_start, + vma->vm_end - vma->vm_start, NULL); + ClearPageReserved(virt_to_page(sring)); + fail_mem: + free_page((unsigned long)sring); + kfree(map); + + return -ENOMEM; +} + +static inline void +blktap_ring_set_message(struct blktap *tap, int msg) +{ + struct blktap_ring *ring = &tap->ring; + + down_read(&tap->tap_sem); + if (ring->ring.sring) + ring->ring.sring->pad[0] = msg; + up_read(&tap->tap_sem); +} + +static int +blktap_ring_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct blktap_params params; + struct blktap *tap = filp->private_data; + + BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg); + + switch(cmd) { + case BLKTAP2_IOCTL_KICK_FE: + /* There are fe messages to process. */ + return blktap_read_ring(tap); + + case BLKTAP2_IOCTL_CREATE_DEVICE: + if (!arg) + return -EINVAL; + + if (copy_from_user(&params, (struct blktap_params __user *)arg, + sizeof(params))) { + BTERR("failed to get params\n"); + return -EFAULT; + } + + if (blktap_validate_params(tap, &params)) { + BTERR("invalid params\n"); + return -EINVAL; + } + + tap->params = params; + return blktap_device_create(tap); + + case BLKTAP2_IOCTL_SET_PARAMS: + if (!arg) + return -EINVAL; + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EINVAL; + + if (copy_from_user(&params, (struct blktap_params __user *)arg, + sizeof(params))) { + BTERR("failed to get params\n"); + return -EFAULT; + } + + if (blktap_validate_params(tap, &params)) { + BTERR("invalid params\n"); + return -EINVAL; + } + + tap->params = params; + return 0; + + case BLKTAP2_IOCTL_PAUSE: + if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) + return -EINVAL; + + set_bit(BLKTAP_PAUSED, &tap->dev_inuse); + clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse); + + blktap_ring_set_message(tap, 0); + wake_up_interruptible(&tap->wq); + + return 0; + + + case BLKTAP2_IOCTL_REOPEN: + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EINVAL; + + if (!arg) + return -EINVAL; + + if (copy_to_user((char __user *)arg, + tap->params.name, + strlen(tap->params.name) + 1)) + return -EFAULT; + + blktap_ring_set_message(tap, 0); + wake_up_interruptible(&tap->wq); + + return 0; + + case BLKTAP2_IOCTL_RESUME: + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EINVAL; + + tap->ring.response = (int)arg; + if (!tap->ring.response) + clear_bit(BLKTAP_PAUSED, &tap->dev_inuse); + + blktap_ring_set_message(tap, 0); + wake_up_interruptible(&tap->wq); + + return 0; + } + + return -ENOIOCTLCMD; +} + +static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait) +{ + struct blktap *tap = filp->private_data; + struct blktap_ring *ring = &tap->ring; +
poll_wait(filp, &ring->poll_wait, wait); + if (ring->ring.sring->pad[0] != 0 || + ring->ring.req_prod_pvt != ring->ring.sring->req_prod) { + RING_PUSH_REQUESTS(&ring->ring); + return POLLIN | POLLRDNORM; + } + + return 0; +} + +static struct file_operations blktap_ring_file_operations = { + .owner = THIS_MODULE, + .open = blktap_ring_open, + .release = blktap_ring_release, + .ioctl = blktap_ring_ioctl, + .mmap = blktap_ring_mmap, + .poll = blktap_ring_poll, +}; + +void +blktap_ring_kick_user(struct blktap *tap) +{ + wake_up_interruptible(&tap->ring.poll_wait); +} + +int +blktap_ring_resume(struct blktap *tap) +{ + int err; + struct blktap_ring *ring = &tap->ring; + + if (!blktap_active(tap)) + return -ENODEV; + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EINVAL; + + /* set shared flag for resume */ + ring->response = 0; + + blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_RESUME); + blktap_ring_kick_user(tap); + + wait_event_interruptible(tap->wq, ring->response || + !test_bit(BLKTAP_PAUSED, &tap->dev_inuse)); + + err = ring->response; + ring->response = 0; + + BTDBG("err: %d\n", err); + + if (err) + return err; + + if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EAGAIN; + + return 0; +} + +int +blktap_ring_pause(struct blktap *tap) +{ + if (!blktap_active(tap)) + return -ENODEV; + + if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) + return -EINVAL; + + BTDBG("draining queue\n"); + wait_event_interruptible(tap->wq, !tap->pending_cnt); + if (tap->pending_cnt) + return -EAGAIN; + + blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_PAUSE); + blktap_ring_kick_user(tap); + + BTDBG("waiting for tapdisk response\n"); + wait_event_interruptible(tap->wq, test_bit(BLKTAP_PAUSED, &tap->dev_inuse)); + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) + return -EAGAIN; + + return 0; +} + +int +blktap_ring_destroy(struct blktap *tap) +{ + if (!test_bit(BLKTAP_RING_FD, &tap->dev_inuse) && + !test_bit(BLKTAP_RING_VMA, &tap->dev_inuse)) + return 0; + + BTDBG("sending tapdisk close message\n"); + blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_CLOSE); + blktap_ring_kick_user(tap); + + return -EAGAIN; +} + +static void +blktap_ring_initialize(struct blktap_ring *ring, int minor) +{ + memset(ring, 0, sizeof(*ring)); + init_waitqueue_head(&ring->poll_wait); + ring->devno = MKDEV(blktap_ring_major, minor); +} + +int +blktap_ring_create(struct blktap *tap) +{ + struct blktap_ring *ring = &tap->ring; + blktap_ring_initialize(ring, tap->minor); + return blktap_sysfs_create(tap); +} + +int +blktap_ring_init(int *major) +{ + int err; + + err = register_chrdev(0, "blktap2", &blktap_ring_file_operations); + if (err < 0) { + BTERR("error registering blktap ring device: %d\n", err); + return err; + } + + blktap_ring_major = *major = err; + BTINFO("blktap ring major: %d\n", blktap_ring_major); + return 0; +} + +int +blktap_ring_free(void) +{ + if (blktap_ring_major) + unregister_chrdev(blktap_ring_major, "blktap2"); + + return 0; +} diff -r e410857fd83c drivers/xen/blktap2/sysfs.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/drivers/xen/blktap2/sysfs.c Wed Oct 29 21:19:50 2008 -0700 @@ -0,0 +1,425 @@ +#include +#include +#include + +#include "blktap.h" + +int blktap_debug_level = 1; + +static struct class *class; +static DECLARE_WAIT_QUEUE_HEAD(sysfs_wq); + +static inline void +blktap_sysfs_get(struct blktap *tap) +{ + atomic_inc(&tap->ring.sysfs_refcnt); +} + +static inline void +blktap_sysfs_put(struct blktap *tap) +{ + if (atomic_dec_and_test(&tap->ring.sysfs_refcnt)) + 
wake_up(&sysfs_wq); +} + +static inline void +blktap_sysfs_enter(struct blktap *tap) +{ + blktap_sysfs_get(tap); /* pin sysfs device */ + mutex_lock(&tap->ring.sysfs_mutex); /* serialize sysfs operations */ +} + +static inline void +blktap_sysfs_exit(struct blktap *tap) +{ + mutex_unlock(&tap->ring.sysfs_mutex); + blktap_sysfs_put(tap); +} + +static ssize_t blktap_sysfs_pause_device(struct class_device *, const char *, size_t); +CLASS_DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device); +static ssize_t blktap_sysfs_resume_device(struct class_device *, const char *, size_t); +CLASS_DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device); + +static ssize_t +blktap_sysfs_set_name(struct class_device *dev, const char *buf, size_t size) +{ + int err; + struct blktap *tap = (struct blktap *)dev->class_data; + + blktap_sysfs_enter(tap); + + if (!tap->ring.dev || + test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) { + err = -ENODEV; + goto out; + } + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) { + err = -EPERM; + goto out; + } + + if (size > BLKTAP2_MAX_MESSAGE_LEN) { + err = -ENAMETOOLONG; + goto out; + } + + if (strnlen(buf, BLKTAP2_MAX_MESSAGE_LEN) >= BLKTAP2_MAX_MESSAGE_LEN) { + err = -EINVAL; + goto out; + } + + snprintf(tap->params.name, sizeof(tap->params.name) - 1, "%s", buf); + err = size; + +out: + blktap_sysfs_exit(tap); + return err; +} + +static ssize_t +blktap_sysfs_get_name(struct class_device *dev, char *buf) +{ + ssize_t size; + struct blktap *tap = (struct blktap *)dev->class_data; + + blktap_sysfs_enter(tap); + + if (!tap->ring.dev) + size = -ENODEV; + else if (tap->params.name[0]) + size = sprintf(buf, "%s\n", tap->params.name); + else + size = sprintf(buf, "%d\n", tap->minor); + + blktap_sysfs_exit(tap); + + return size; +} +CLASS_DEVICE_ATTR(name, S_IRUSR | S_IWUSR, + blktap_sysfs_get_name, blktap_sysfs_set_name); + +static ssize_t +blktap_sysfs_remove_device(struct class_device *dev, + const char *buf, size_t size) +{ + int err; + struct blktap *tap = (struct blktap *)dev->class_data; + + if (!tap->ring.dev) + return size; + + if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) + return -EBUSY; + + err = blktap_control_destroy_device(tap); + + return (err ? : size); +} +CLASS_DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device); + +static ssize_t +blktap_sysfs_pause_device(struct class_device *dev, + const char *buf, size_t size) +{ + int err; + struct blktap *tap = (struct blktap *)dev->class_data; + + blktap_sysfs_enter(tap); + + BTDBG("pausing %u:%u: dev_inuse: %lu\n", + MAJOR(tap->ring.devno), MINOR(tap->ring.devno), tap->dev_inuse); + + if (!tap->ring.dev || + test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) { + err = -ENODEV; + goto out; + } + + if (test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) { + err = -EBUSY; + goto out; + } + + if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) { + err = 0; + goto out; + } + + err = blktap_device_pause(tap); + if (!err) { + class_device_remove_file(dev, &class_device_attr_pause); + class_device_create_file(dev, &class_device_attr_resume); + } + +out: + blktap_sysfs_exit(tap); + + return (err ? 
err : size); +} + +static ssize_t +blktap_sysfs_resume_device(struct class_device *dev, + const char *buf, size_t size) +{ + int err; + struct blktap *tap = (struct blktap *)dev->class_data; + + blktap_sysfs_enter(tap); + + if (!tap->ring.dev || + test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) { + err = -ENODEV; + goto out; + } + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) { + err = -EINVAL; + goto out; + } + + err = blktap_device_resume(tap); + if (!err) { + class_device_remove_file(dev, &class_device_attr_resume); + class_device_create_file(dev, &class_device_attr_pause); + } + +out: + blktap_sysfs_exit(tap); + + BTDBG("returning %d\n", (err ? err : size)); + return (err ? err : size); +} + +#ifdef ENABLE_PASSTHROUGH +static ssize_t +blktap_sysfs_enable_passthrough(struct class_device *dev, + const char *buf, size_t size) +{ + int err; + unsigned major, minor; + struct blktap *tap = (struct blktap *)dev->class_data; + + BTINFO("passthrough request enabled\n"); + + blktap_sysfs_enter(tap); + + if (!tap->ring.dev || + test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) { + err = -ENODEV; + goto out; + } + + if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) { + err = -EINVAL; + goto out; + } + + if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) { + err = -EINVAL; + goto out; + } + + err = sscanf(buf, "%x:%x", &major, &minor); + if (err != 2) { + err = -EINVAL; + goto out; + } + + err = blktap_device_enable_passthrough(tap, major, minor); + +out: + blktap_sysfs_exit(tap); + BTDBG("returning %d\n", (err ? err : size)); + return (err ? err : size); +} +#endif + +static ssize_t +blktap_sysfs_debug_device(struct class_device *dev, char *buf) +{ + char *tmp; + int i, ret; + struct blktap *tap = (struct blktap *)dev->class_data; + + tmp = buf; + blktap_sysfs_get(tap); + + if (!tap->ring.dev) { + ret = sprintf(tmp, "no device\n"); + goto out; + } + + tmp += sprintf(tmp, "%s (%u:%u), refcnt: %d, dev_inuse: 0x%08lx\n", + tap->params.name, MAJOR(tap->ring.devno), + MINOR(tap->ring.devno), atomic_read(&tap->refcnt), + tap->dev_inuse); + tmp += sprintf(tmp, "capacity: 0x%llx, sector size: 0x%lx, " + "device users: %d\n", tap->params.capacity, + tap->params.sector_size, tap->device.users); + + down_read(&tap->tap_sem); + + tmp += sprintf(tmp, "pending requests: %d\n", tap->pending_cnt); + for (i = 0; i < MAX_PENDING_REQS; i++) { + struct blktap_request *req = tap->pending_requests[i]; + if (!req) + continue; + + tmp += sprintf(tmp, "req %d: id: %llu, usr_idx: %d, " + "status: 0x%02x, pendcnt: %d, " + "nr_pages: %u, op: %d, time: %lu:%lu\n", + i, req->id, req->usr_idx, + req->status, atomic_read(&req->pendcnt), + req->nr_pages, req->operation, req->time.tv_sec, + req->time.tv_usec); + } + + up_read(&tap->tap_sem); + ret = (tmp - buf) + 1; + +out: + blktap_sysfs_put(tap); + BTDBG("%s\n", buf); + + return ret; +} +CLASS_DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL); + +int +blktap_sysfs_create(struct blktap *tap) +{ + struct blktap_ring *ring; + struct class_device *dev; + + if (!class) + return -ENODEV; + + ring = &tap->ring; + + dev = class_device_create(class, NULL, ring->devno, + NULL, "blktap%d", tap->minor); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + ring->dev = dev; + dev->class_data = tap; + + mutex_init(&ring->sysfs_mutex); + atomic_set(&ring->sysfs_refcnt, 0); + set_bit(BLKTAP_SYSFS, &tap->dev_inuse); + + class_device_create_file(dev, &class_device_attr_name); + class_device_create_file(dev, &class_device_attr_remove); + class_device_create_file(dev, 
&class_device_attr_pause); + class_device_create_file(dev, &class_device_attr_debug); + + return 0; +} + +int +blktap_sysfs_destroy(struct blktap *tap) +{ + struct blktap_ring *ring; + struct class_device *dev; + + ring = &tap->ring; + dev = ring->dev; + if (!class || !dev) + return 0; + + ring->dev = NULL; + if (wait_event_interruptible(sysfs_wq, + !atomic_read(&tap->ring.sysfs_refcnt))) + return -EAGAIN; + + /* XXX: is it safe to remove the class from a sysfs attribute? */ + class_device_remove_file(dev, &class_device_attr_name); + class_device_remove_file(dev, &class_device_attr_remove); + class_device_remove_file(dev, &class_device_attr_pause); + class_device_remove_file(dev, &class_device_attr_resume); + class_device_remove_file(dev, &class_device_attr_debug); + class_device_destroy(class, ring->devno); + + clear_bit(BLKTAP_SYSFS, &tap->dev_inuse); + + return 0; +} + +static ssize_t +blktap_sysfs_show_verbosity(struct class *class, char *buf) +{ + return sprintf(buf, "%d\n", blktap_debug_level); +} + +static ssize_t +blktap_sysfs_set_verbosity(struct class *class, const char *buf, size_t size) +{ + int level; + + if (sscanf(buf, "%d", &level) == 1) { + blktap_debug_level = level; + return size; + } + + return -EINVAL; +} +CLASS_ATTR(verbosity, S_IRUSR | S_IWUSR, + blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity); + +static ssize_t +blktap_sysfs_show_devices(struct class *class, char *buf) +{ + int i, ret; + struct blktap *tap; + + ret = 0; + for (i = 0; i < MAX_BLKTAP_DEVICE; i++) { + tap = blktaps[i]; + if (!tap) + continue; + + if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) + continue; + + ret += sprintf(buf + ret, "%d ", tap->minor); + ret += snprintf(buf + ret, sizeof(tap->params.name) - 1, + tap->params.name); + ret += sprintf(buf + ret, "\n"); + } + + return ret; +} +CLASS_ATTR(devices, S_IRUSR, blktap_sysfs_show_devices, NULL); + +void +blktap_sysfs_free(void) +{ + if (!class) + return; + + class_remove_file(class, &class_attr_verbosity); + class_remove_file(class, &class_attr_devices); + + class_destroy(class); +} + +int +blktap_sysfs_init(void) +{ + struct class *cls; + + if (class) + return -EEXIST; + + cls = class_create(THIS_MODULE, "blktap2"); + if (IS_ERR(cls)) + return PTR_ERR(cls); + + class_create_file(cls, &class_attr_verbosity); + class_create_file(cls, &class_attr_devices); + + class = cls; + return 0; +} diff -r e410857fd83c drivers/xen/blktap2/wait_queue.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/drivers/xen/blktap2/wait_queue.c Wed Oct 29 21:19:50 2008 -0700 @@ -0,0 +1,40 @@ +#include +#include + +#include "blktap.h" + +static LIST_HEAD(deferred_work_queue); +static DEFINE_SPINLOCK(deferred_work_lock); + +void +blktap_run_deferred(void) +{ + LIST_HEAD(queue); + struct blktap *tap; + unsigned long flags; + + spin_lock_irqsave(&deferred_work_lock, flags); + list_splice_init(&deferred_work_queue, &queue); + list_for_each_entry(tap, &queue, deferred_queue) + clear_bit(BLKTAP_DEFERRED, &tap->dev_inuse); + spin_unlock_irqrestore(&deferred_work_lock, flags); + + while (!list_empty(&queue)) { + tap = list_entry(queue.next, struct blktap, deferred_queue); + list_del_init(&tap->deferred_queue); + blktap_device_restart(tap); + } +} + +void +blktap_defer(struct blktap *tap) +{ + unsigned long flags; + + spin_lock_irqsave(&deferred_work_lock, flags); + if (!test_bit(BLKTAP_DEFERRED, &tap->dev_inuse)) { + set_bit(BLKTAP_DEFERRED, &tap->dev_inuse); + list_add_tail(&tap->deferred_queue, &deferred_work_queue); + } + spin_unlock_irqrestore(&deferred_work_lock, 
flags); +} diff -r e410857fd83c include/linux/mm.h --- a/include/linux/mm.h Wed Oct 22 14:55:29 2008 +0100 +++ b/include/linux/mm.h Wed Oct 29 21:19:50 2008 -0700 @@ -166,6 +166,9 @@ #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #ifdef CONFIG_XEN #define VM_FOREIGN 0x04000000 /* Has pages belonging to another VM */ +struct vm_foreign_map { + struct page **map; +}; #endif #define VM_ALWAYSDUMP 0x08000000 /* Always include in core dumps */ @@ -210,6 +213,10 @@ * original value of @ptep. */ pte_t (*zap_pte)(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, int is_fullmm); + + /* called before close() to indicate no more pages should be mapped */ + void (*unmap)(struct vm_area_struct *area); + #ifdef CONFIG_NUMA int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); struct mempolicy *(*get_policy)(struct vm_area_struct *vma, diff -r e410857fd83c include/linux/page-flags.h --- a/include/linux/page-flags.h Wed Oct 22 14:55:29 2008 +0100 +++ b/include/linux/page-flags.h Wed Oct 29 21:19:50 2008 -0700 @@ -99,6 +99,16 @@ #endif #define PG_foreign 20 /* Page is owned by foreign allocator. */ + +#define PG_netback 21 /* Page is owned by netback */ +#define PageNetback(page) test_bit(PG_netback, &(page)->flags) +#define SetPageNetback(page) set_bit(PG_netback, &(page)->flags) +#define ClearPageNetback(page) clear_bit(PG_netback, &(page)->flags) + +#define PG_blkback 22 /* Page is owned by blkback */ +#define PageBlkback(page) test_bit(PG_blkback, &(page)->flags) +#define SetPageBlkback(page) set_bit(PG_blkback, &(page)->flags) +#define ClearPageBlkback(page) clear_bit(PG_blkback, &(page)->flags) /* * Manipulation of page state flags diff -r e410857fd83c mm/memory.c --- a/mm/memory.c Wed Oct 22 14:55:29 2008 +0100 +++ b/mm/memory.c Wed Oct 29 21:19:50 2008 -0700 @@ -1045,7 +1045,9 @@ #ifdef CONFIG_XEN if (vma && (vma->vm_flags & VM_FOREIGN)) { - struct page **map = vma->vm_private_data; + struct vm_foreign_map *foreign_map = + vma->vm_private_data; + struct page **map = foreign_map->map; int offset = (start - vma->vm_start) >> PAGE_SHIFT; if (map[offset] != NULL) { if (pages) { diff -r e410857fd83c mm/mmap.c --- a/mm/mmap.c Wed Oct 22 14:55:29 2008 +0100 +++ b/mm/mmap.c Wed Oct 29 21:19:50 2008 -0700 @@ -1687,6 +1687,12 @@ tlb_finish_mmu(tlb, start, end); } +static inline void unmap_vma(struct vm_area_struct *vma) +{ + if (unlikely(vma->vm_ops && vma->vm_ops->unmap)) + vma->vm_ops->unmap(vma); +} + /* * Create a list of vma's touched by the unmap, removing them from the mm's * vma list as we go.. @@ -1702,6 +1708,7 @@ insertion_point = (prev ? &prev->vm_next : &mm->mmap); do { rb_erase(&vma->vm_rb, &mm->mm_rb); + unmap_vma(vma); mm->map_count--; tail_vma = vma; vma = vma->vm_next; @@ -1959,13 +1966,16 @@ void exit_mmap(struct mm_struct *mm) { struct mmu_gather *tlb; - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma_tmp, *vma = mm->mmap; unsigned long nr_accounted = 0; unsigned long end; #ifdef arch_exit_mmap arch_exit_mmap(mm); #endif + + for (vma_tmp = mm->mmap; vma_tmp; vma_tmp = vma_tmp->vm_next) + unmap_vma(vma_tmp); lru_add_drain(); flush_cache_mm(mm);
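
The include/linux/mm.h and mm/mmap.c hunks above introduce a vm_operations_struct ->unmap callback that is invoked from unmap_vma() and exit_mmap() before the page tables are torn down, i.e. earlier than ->close. Below is a minimal sketch of how a driver would use the hook; blktap_ring_vm_unmap() above is the real user, and everything named example_* here is made up for illustration.

#include <linux/mm.h>
#include <linux/slab.h>

struct example_state {
	int shutting_down;
};

/* ->unmap: called while the vma is being removed from the mm, before
 * zap_page_range(); stop handing out new mappings from this point on. */
static void example_vm_unmap(struct vm_area_struct *vma)
{
	struct example_state *st = vma->vm_private_data;

	st->shutting_down = 1;
}

/* ->close: ordinary teardown, still runs after the unmap pass. */
static void example_vm_close(struct vm_area_struct *vma)
{
	kfree(vma->vm_private_data);
}

static struct vm_operations_struct example_vm_ops = {
	.unmap = example_vm_unmap,
	.close = example_vm_close,
};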
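
The mm/memory.c change relies on the same vm_foreign_map table that blktap_ring_mmap() installs in vma->vm_private_data: with VM_FOREIGN set, get_user_pages() resolves a user address by indexing that table instead of walking the page tables. A kernel-free sketch of that translation, with stand-in types and an assumed 4K page size, is:

#include <stddef.h>

struct page;				/* opaque, as in the kernel */

struct vm_foreign_map {
	struct page **map;		/* one entry per page of the vma */
};

struct foreign_vma {			/* stand-in for vm_area_struct */
	unsigned long vm_start;
	struct vm_foreign_map *foreign_map;
};

#define FOREIGN_PAGE_SHIFT 12		/* assumption: 4K pages */

/* Mirror of the VM_FOREIGN branch in get_user_pages(): return the page
 * backing uvaddr, or NULL if nothing is currently mapped at that slot. */
static struct page *
foreign_lookup(struct foreign_vma *vma, unsigned long uvaddr)
{
	int offset = (int)((uvaddr - vma->vm_start) >> FOREIGN_PAGE_SHIFT);

	return vma->foreign_map->map[offset];
}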
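
From user space, the ring character device registered by blktap_ring_init() is driven by mmap(), poll() and BLKTAP2_IOCTL_KICK_FE, and the mapping must cover RING_PAGES plus MMAP_PAGES in a single call, as blktap_ring_mmap() enforces. The following is a rough sketch of a tapdisk-style consumer loop; the device node name, page counts and ioctl value are placeholders, not values taken from this patch.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#define RING_PAGES		1		/* placeholder */
#define MMAP_PAGES		(32 * 11)	/* placeholder */
#define BLKTAP2_IOCTL_KICK_FE	2		/* placeholder: real value comes from the blktap2 headers */

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t len = (size_t)(RING_PAGES + MMAP_PAGES) * psz;
	void *ring;
	struct pollfd pfd;

	int fd = open("/dev/blktap2-0", O_RDWR);	/* placeholder node name */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* blktap_ring_mmap() rejects anything but the full region. */
	ring = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	pfd.fd = fd;
	pfd.events = POLLIN;
	while (poll(&pfd, 1, -1) > 0) {
		/* consume requests from the shared ring, queue responses... */
		ioctl(fd, BLKTAP2_IOCTL_KICK_FE, 0);	/* let the kernel read them */
	}

	munmap(ring, len);
	close(fd);
	return 0;
}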
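
Pause, resume and close requests travel from the kernel to the tapdisk process through the otherwise unused pad[0] byte of the shared ring (blktap_ring_set_message()), and the process answers with the BLKTAP2_IOCTL_PAUSE/RESUME ioctls that flip the BLKTAP_PAUSED bit. A hedged sketch of the user-space side of that handshake, with placeholder constants, looks like:

#include <stdint.h>
#include <sys/ioctl.h>

#define BLKTAP2_RING_MESSAGE_PAUSE	1	/* placeholder */
#define BLKTAP2_RING_MESSAGE_RESUME	2	/* placeholder */
#define BLKTAP2_RING_MESSAGE_CLOSE	3	/* placeholder */
#define BLKTAP2_IOCTL_PAUSE		4	/* placeholder */
#define BLKTAP2_IOCTL_RESUME		5	/* placeholder */

/* msg is the value read from sring->pad[0] after poll() woke us up;
 * fd is the ring character device.  Returns the ioctl result. */
static int handle_ring_message(int fd, uint8_t msg)
{
	switch (msg) {
	case BLKTAP2_RING_MESSAGE_PAUSE:
		/* quiesce I/O, then ack: the PAUSE ioctl sets BLKTAP_PAUSED
		 * and clears BLKTAP_PAUSE_REQUESTED on the kernel side */
		return ioctl(fd, BLKTAP2_IOCTL_PAUSE, 0);

	case BLKTAP2_RING_MESSAGE_RESUME:
		/* reopen the image; the argument reports 0 on success or an
		 * errno, which blktap_ring_resume() picks up as ring->response */
		return ioctl(fd, BLKTAP2_IOCTL_RESUME, 0);

	case BLKTAP2_RING_MESSAGE_CLOSE:
		/* unmap the ring; that drives blktap_ring_vm_close() above */
		return 0;

	default:
		return 0;
	}
}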