
[Xen-changelog] Blktap updates: request batching, O_DIRECT/AIO support.



# HG changeset patch
# User akw27@xxxxxxxxxxxxxxxxxxxxxx
# Node ID 9f0eff879d8913a824280cf67658a530c80e8424
# Parent  1d240086de52f7ace9b0dc47b2db85a54696233a
Blktap updates: request batching, O_DIRECT/AIO support.

This patch changes the blktap code that maps request pages into user
space to be faster, and allows page lookups on foreign-mapped pages to
pass through Linux so that direct I/O can be issued against them.  An
AIO test driver built on top of this achieves performance comparable
to the in-kernel block backend.

Signed-off-by: andrew.warfield@xxxxxxxxxxxx
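
The AIO test driver mentioned above is not part of this changeset.  As
a rough illustration of the access pattern it relies on -- user-space
reads driven through Linux AIO with O_DIRECT -- here is a minimal,
self-contained sketch.  It is not code from the tree: the device path
/dev/xvda is a placeholder and the 4 KB transfer size is arbitrary.
Build with -laio.

#define _GNU_SOURCE             /* for O_DIRECT */
#include <fcntl.h>
#include <libaio.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    io_context_t ctx = 0;
    struct iocb cb, *cbs[1] = { &cb };
    struct io_event ev;
    void *buf;
    int fd;

    /* O_DIRECT requires aligned buffers; align to the 4 KB page size. */
    if (posix_memalign(&buf, 4096, 4096) != 0)
        exit(1);

    fd = open("/dev/xvda", O_RDONLY | O_DIRECT);
    if (fd < 0) { perror("open"); exit(1); }

    if (io_setup(32, &ctx) < 0) {
        fprintf(stderr, "io_setup failed\n"); exit(1);
    }

    /* Queue one 4 KB read at offset 0, then wait for it to complete. */
    io_prep_pread(&cb, fd, buf, 4096, 0);
    if (io_submit(ctx, 1, cbs) != 1) {
        fprintf(stderr, "io_submit failed\n"); exit(1);
    }
    if (io_getevents(ctx, 1, 1, &ev, NULL) != 1) {
        fprintf(stderr, "io_getevents failed\n"); exit(1);
    }

    printf("read returned %ld\n", (long)ev.res);

    io_destroy(ctx);
    close(fd);
    free(buf);
    return 0;
}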

diff -r 1d240086de52 -r 9f0eff879d89 linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h
--- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h  Thu Aug  4 15:02:09 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h  Thu Aug  4 16:35:35 2005
@@ -103,8 +103,6 @@
     blkif_t       *blkif;
     unsigned long  id;
     int            nr_pages;
-    unsigned long  mach_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-    unsigned long  virt_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST];
     int            next_free;
 } active_req_t;
 
@@ -172,32 +170,7 @@
 
 
 /* -------[ Mappings to User VMA ]------------------------------------ */
-#define MAX_PENDING_REQS 64
 #define BATCH_PER_DOMAIN 16
-extern struct vm_area_struct *blktap_vma;
-
-/* The following are from blkback.c and should probably be put in a
- * header and included from there.
- * The mmap area described here is where attached data pages will be mapped.
- */
- 
-extern unsigned long mmap_vstart;
-#define MMAP_PAGES_PER_REQUEST \
-    (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
-#define MMAP_PAGES             \
-    (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
-#define MMAP_VADDR(_req,_seg)                        \
-    (mmap_vstart +                                   \
-     ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
-     ((_seg) * PAGE_SIZE))
-
-/* immediately before the mmap area, we have a bunch of pages reserved
- * for shared memory rings.
- */
-
-#define RING_PAGES 3 /* Ctrl, Front, and Back */ 
-extern unsigned long rings_vstart;
-
 
 /* -------[ Here be globals ]----------------------------------------- */
 extern unsigned long blktap_mode;
diff -r 1d240086de52 -r 9f0eff879d89 linux-2.6-xen-sparse/drivers/xen/blktap/blktap_datapath.c
--- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_datapath.c Thu Aug  4 15:02:09 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_datapath.c Thu Aug  4 16:35:35 2005
@@ -280,8 +280,6 @@
     int more_to_do = 0;
     int notify_be = 0, notify_user = 0;
     
-    if (NR_ACTIVE_REQS == MAX_ACTIVE_REQS) return 1;
-    
     /* lock both rings */
     spin_lock_irqsave(&blkif_io_lock, flags);
 
diff -r 1d240086de52 -r 9f0eff879d89 linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c
--- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c  Thu Aug  4 15:02:09 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c  Thu Aug  4 16:35:35 2005
@@ -19,6 +19,7 @@
 #include <linux/gfp.h>
 #include <linux/poll.h>
 #include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
 #include <asm-xen/xen-public/io/blkif.h> /* for control ring. */
 
 #include "blktap.h"
@@ -32,11 +33,6 @@
 
 /* for poll: */
 static wait_queue_head_t blktap_wait;
-
-/* Where things are inside the device mapping. */
-struct vm_area_struct *blktap_vma = NULL;
-unsigned long mmap_vstart;
-unsigned long rings_vstart;
 
 /* Rings up to user space. */
 static blkif_front_ring_t blktap_ufe_ring;
@@ -47,6 +43,39 @@
 static int blktap_read_fe_ring(void);
 static int blktap_read_be_ring(void);
 
+/* -------[ mmap region ]--------------------------------------------- */
+/*
+ * We use a big chunk of address space to map in-flight requests into,
+ * and export this region up to user-space.  See the comments in blkback
+ * about this -- the two must be kept in sync if the tap is used as a 
+ * passthrough.
+ */
+
+#define MAX_PENDING_REQS 64
+
+/* immediately before the mmap area, we have a bunch of pages reserved
+ * for shared memory rings.
+ */
+#define RING_PAGES 3 /* Ctrl, Front, and Back */ 
+
+/* Where things are inside the device mapping. */
+struct vm_area_struct *blktap_vma = NULL;
+unsigned long mmap_vstart;  /* Kernel pages for mapping in data. */
+unsigned long rings_vstart; /* start of mmaped vma               */
+unsigned long user_vstart;  /* start of user mappings            */
+
+#define MMAP_PAGES_PER_REQUEST \
+    (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
+#define MMAP_PAGES             \
+    (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
+#define MMAP_VADDR(_start, _req,_seg)                \
+    ( _start +                                       \
+     ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
+     ((_seg) * PAGE_SIZE))
+
+
+
+
 /* -------[ blktap vm ops ]------------------------------------------- */
 
 static struct page *blktap_nopage(struct vm_area_struct *vma,
@@ -76,8 +105,6 @@
     
     if ( test_and_set_bit(0, &blktap_dev_inuse) )
         return -EBUSY;
-
-    printk(KERN_ALERT "blktap open.\n");
     
     /* Allocate the ctrl ring. */
     csring = (ctrl_sring_t *)get_zeroed_page(GFP_KERNEL);
@@ -128,7 +155,7 @@
     blktap_dev_inuse = 0;
     blktap_ring_ok = 0;
 
-    printk(KERN_ALERT "blktap closed.\n");
+    DPRINTK(KERN_ALERT "blktap closed.\n");
 
     /* Free the ring page. */
     ClearPageReserved(virt_to_page(blktap_uctrl_ring.sring));
@@ -140,7 +167,7 @@
     ClearPageReserved(virt_to_page(blktap_ube_ring.sring));
     free_page((unsigned long) blktap_ube_ring.sring);
 
-    /* Clear any active mappings. */
+    /* Clear any active mappings and free foreign map table */
     if (blktap_vma != NULL) {
         zap_page_range(blktap_vma, blktap_vma->vm_start, 
                        blktap_vma->vm_end - blktap_vma->vm_start, NULL);
@@ -151,21 +178,36 @@
 }
 
 /* Note on mmap:
- * remap_pfn_range sets VM_IO on vma->vm_flags.  In trying to make libaio
- * work to do direct page access from userspace, this ended up being a
- * problem.  The bigger issue seems to be that there is no way to map
- * a foreign page in to user space and have the virtual address of that 
- * page map sanely down to a mfn.
- * Removing the VM_IO flag results in a loop in get_user_pages, as 
- * pfn_valid() always fails on a foreign page.
+ * We need to map pages to user space in a way that will allow the block
+ * subsystem to set up direct IO to them.  This couldn't be done before,
+ * because there wasn't a sane way to map a user virtual address down to a
+ * physical address when the page belongs to another domain.
+ *
+ * My first approach was to map the page in to kernel memory, add an entry
+ * for it in the physical frame list (using alloc_lomem_region as in blkback)
+ * and then attempt to map that page up to user space.  This is disallowed
+ * by xen though, which realizes that we don't really own the machine frame
+ * underlying the physical page.
+ *
+ * The new approach is to provide explicit support for this in xen linux.
+ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
+ * mapped from other vms.  vma->vm_private_data is set up as a mapping 
+ * from pages to actual page structs.  There is a new clause in get_user_pages
+ * that does the right thing for this sort of mapping.
+ * 
+ * blktap_mmap sets up this mapping.  Most of the real work is done in
+ * blktap_write_fe_ring below.
  */
 static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
 {
     int size;
-
-    printk(KERN_ALERT "blktap mmap (%lx, %lx)\n",
+    struct page **map;
+    int i;
+
+    DPRINTK(KERN_ALERT "blktap mmap (%lx, %lx)\n",
            vma->vm_start, vma->vm_end);
 
+    vma->vm_flags |= VM_RESERVED;
     vma->vm_ops = &blktap_vm_ops;
 
     size = vma->vm_end - vma->vm_start;
@@ -177,10 +219,10 @@
     }
 
     size >>= PAGE_SHIFT;
-    printk(KERN_INFO "blktap: 2 rings + %d pages.\n", size-1);
+    DPRINTK(KERN_INFO "blktap: 2 rings + %d pages.\n", size-1);
     
     rings_vstart = vma->vm_start;
-    mmap_vstart  = rings_vstart + (RING_PAGES << PAGE_SHIFT);
+    user_vstart  = rings_vstart + (RING_PAGES << PAGE_SHIFT);
     
     /* Map the ring pages to the start of the region and reserve it. */
 
@@ -190,29 +232,44 @@
     DPRINTK("Mapping ctrl_ring page %lx.\n", __pa(blktap_uctrl_ring.sring));
     if (remap_pfn_range(vma, vma->vm_start, 
                          __pa(blktap_uctrl_ring.sring) >> PAGE_SHIFT, 
-                         PAGE_SIZE, vma->vm_page_prot)) {
-        WPRINTK("ctrl_ring: remap_pfn_range failure!\n");
-    }
+                         PAGE_SIZE, vma->vm_page_prot)) 
+        goto fail;
 
 
     DPRINTK("Mapping be_ring page %lx.\n", __pa(blktap_ube_ring.sring));
     if (remap_pfn_range(vma, vma->vm_start + PAGE_SIZE, 
                          __pa(blktap_ube_ring.sring) >> PAGE_SHIFT, 
-                         PAGE_SIZE, vma->vm_page_prot)) {
-        WPRINTK("be_ring: remap_pfn_range failure!\n");
-    }
+                         PAGE_SIZE, vma->vm_page_prot)) 
+        goto fail;
 
     DPRINTK("Mapping fe_ring page %lx.\n", __pa(blktap_ufe_ring.sring));
     if (remap_pfn_range(vma, vma->vm_start + ( 2 * PAGE_SIZE ), 
                          __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT, 
-                         PAGE_SIZE, vma->vm_page_prot)) {
-        WPRINTK("fe_ring: remap_pfn_range failure!\n");
-    }
-            
+                         PAGE_SIZE, vma->vm_page_prot)) 
+        goto fail;
+
+    /* Mark this VM as containing foreign pages, and set up mappings. */
+    map = kmalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
+                  * sizeof(struct page *),
+                  GFP_KERNEL);
+    if (map == NULL) goto fail;
+
+    for (i=0; i<((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
+        map[i] = NULL;
+    
+    vma->vm_private_data = map;
+    vma->vm_flags |= VM_FOREIGN;
+
     blktap_vma = vma;
     blktap_ring_ok = 1;
 
     return 0;
+ fail:
+    /* Clear any active mappings. */
+    zap_page_range(vma, vma->vm_start, 
+                   vma->vm_end - vma->vm_start, NULL);
+
+    return -ENOMEM;
 }
 
 static int blktap_ioctl(struct inode *inode, struct file *filp,
@@ -263,6 +320,8 @@
              RING_HAS_UNPUSHED_REQUESTS(&blktap_ufe_ring)   ||
              RING_HAS_UNPUSHED_RESPONSES(&blktap_ube_ring) ) {
 
+            flush_tlb_all();
+
             RING_PUSH_REQUESTS(&blktap_uctrl_ring);
             RING_PUSH_REQUESTS(&blktap_ufe_ring);
             RING_PUSH_RESPONSES(&blktap_ube_ring);
@@ -290,10 +349,35 @@
 /*-----[ Data to/from user space ]----------------------------------------*/
 
 
+static void fast_flush_area(int idx, int nr_pages)
+{
+    multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    int               i;
+
+    for ( i = 0; i < nr_pages; i++ )
+    {
+        MULTI_update_va_mapping(mcl+i, MMAP_VADDR(mmap_vstart, idx, i),
+                                __pte(0), 0);
+    }
+
+    mcl[nr_pages-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
+    if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) )
+        BUG();
+}
+
+
+extern int __direct_remap_area_pages(struct mm_struct *mm,
+                                     unsigned long address,
+                                     unsigned long size,
+                                     mmu_update_t *v);
+
 int blktap_write_fe_ring(blkif_request_t *req)
 {
     blkif_request_t *target;
-    int error, i;
+    int i;
+    unsigned long remap_prot;
+    multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST+1];
+    mmu_update_t mmu[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 
     /*
      * This is called to pass a request from the real frontend domain's
@@ -310,26 +394,81 @@
         return 0;
     }
 
-    target = RING_GET_REQUEST(&blktap_ufe_ring,
-            blktap_ufe_ring.req_prod_pvt);
+    remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
+    flush_cache_all(); /* a noop on intel... */
+
+    target = RING_GET_REQUEST(&blktap_ufe_ring, blktap_ufe_ring.req_prod_pvt);
     memcpy(target, req, sizeof(*req));
 
-    /* Attempt to map the foreign pages directly in to the application */
+    /* Map the foreign pages directly in to the application */
     for (i=0; i<target->nr_segments; i++) {
-
-        error = direct_remap_area_pages(blktap_vma->vm_mm, 
-                                        MMAP_VADDR(ID_TO_IDX(req->id), i), 
-                                        target->frame_and_sects[i] & PAGE_MASK,
-                                        PAGE_SIZE,
-                                        blktap_vma->vm_page_prot,
-                                        ID_TO_DOM(req->id));
-        if ( error != 0 ) {
-            printk(KERN_INFO "remapping attached page failed! (%d)\n", error);
-            /* the request is now dropped on the floor. */
-            return 0;
+        unsigned long buf;
+        unsigned long uvaddr;
+        unsigned long kvaddr;
+        unsigned long offset;
+
+        buf   = target->frame_and_sects[i] & PAGE_MASK;
+        uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i);
+        kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i);
+
+        MULTI_update_va_mapping_otherdomain(
+            mcl+i, 
+            kvaddr, 
+            pfn_pte_ma(buf >> PAGE_SHIFT, __pgprot(remap_prot)),
+            0,
+            ID_TO_DOM(req->id));
+
+        phys_to_machine_mapping[__pa(kvaddr)>>PAGE_SHIFT] =
+            FOREIGN_FRAME(buf >> PAGE_SHIFT);
+
+        __direct_remap_area_pages(blktap_vma->vm_mm,
+                                  uvaddr,
+                                  PAGE_SIZE,
+                                  &mmu[i]);
+        mmu[i].val = (target->frame_and_sects[i] & PAGE_MASK)
+            | pgprot_val(blktap_vma->vm_page_prot);
+
+        offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
+        ((struct page **)blktap_vma->vm_private_data)[offset] =
+            pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+    }
+    
+    /* Add the mmu_update call. */
+    mcl[i].op = __HYPERVISOR_mmu_update;
+    mcl[i].args[0] = (unsigned long)mmu;
+    mcl[i].args[1] = target->nr_segments;
+    mcl[i].args[2] = 0;
+    mcl[i].args[3] = ID_TO_DOM(req->id);
+
+    BUG_ON(HYPERVISOR_multicall(mcl, target->nr_segments+1) != 0);
+
+    /* Make sure it all worked. */
+    for ( i = 0; i < target->nr_segments; i++ )
+    {
+        if ( unlikely(mcl[i].result != 0) )
+        {
+            DPRINTK("invalid buffer -- could not remap it\n");
+            fast_flush_area(ID_TO_IDX(req->id), target->nr_segments);
+            return -1;
         }
     }
-    
+    if ( unlikely(mcl[i].result != 0) )
+    {
+        DPRINTK("direct remapping of pages to /dev/blktap failed.\n");
+        return -1;
+    }
+
+
+    /* Mark mapped pages as reserved: */
+    for ( i = 0; i < target->nr_segments; i++ )
+    {
+        unsigned long kvaddr;
+
+        kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i);
+        SetPageReserved(pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT));
+    }
+
+
     blktap_ufe_ring.req_prod_pvt++;
     
     return 0;
@@ -366,7 +505,7 @@
 {
     /* This is called to read responses from the UFE ring. */
 
-    RING_IDX i, rp;
+    RING_IDX i, j, rp;
     blkif_response_t *resp_s;
     blkif_t *blkif;
     active_req_t *ar;
@@ -387,8 +526,23 @@
             DPRINTK("resp->fe_ring\n");
             ar = lookup_active_req(ID_TO_IDX(resp_s->id));
             blkif = ar->blkif;
-            zap_page_range(blktap_vma, MMAP_VADDR(ID_TO_IDX(resp_s->id), 0), 
+            for (j = 0; j < ar->nr_pages; j++) {
+                unsigned long vaddr;
+                struct page **map = blktap_vma->vm_private_data;
+                int offset; 
+
+                vaddr  = MMAP_VADDR(user_vstart, ID_TO_IDX(resp_s->id), j);
+                offset = (vaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
+
+                ClearPageReserved(virt_to_page(vaddr));
+                map[offset] = NULL;
+            }
+
+
+            zap_page_range(blktap_vma, 
+                    MMAP_VADDR(user_vstart, ID_TO_IDX(resp_s->id), 0), 
                     ar->nr_pages << PAGE_SHIFT, NULL);
+            fast_flush_area(ID_TO_IDX(resp_s->id), ar->nr_pages);
             write_resp_to_fe_ring(blkif, resp_s);
             blktap_ufe_ring.rsp_cons = i + 1;
             kick_fe_domain(blkif);
@@ -464,6 +618,9 @@
 {
     int err;
 
+    if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
+        BUG();
+
     err = misc_register(&blktap_miscdev);
     if ( err != 0 )
     {
diff -r 1d240086de52 -r 9f0eff879d89 tools/blktap/blktaplib.c
--- a/tools/blktap/blktaplib.c  Thu Aug  4 15:02:09 2005
+++ b/tools/blktap/blktaplib.c  Thu Aug  4 16:35:35 2005
@@ -34,7 +34,7 @@
 #else
 #define DPRINTF(_f, _a...) ((void)0)
 #endif
-#define DEBUG_RING_IDXS 0
+#define DEBUG_RING_IDXS 1
 
 #define POLLRDNORM     0x040 
 
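
The "new clause in get_user_pages" referred to in the mmap note above
is not included in this diff.  Under the vm_private_data layout that
blktap_mmap sets up, that clause could look roughly like the sketch
below.  This is a reconstruction for illustration only, not the
verbatim xen-linux change; start, len, i, pages and vmas are
get_user_pages' own parameters and locals.

/* Sketch: inside get_user_pages' per-page loop.  For a VM_FOREIGN
 * vma, pfn_valid() fails on the foreign machine frame, so consult the
 * page array hung off vm_private_data instead of walking the page
 * tables. */
if (vma->vm_flags & VM_FOREIGN) {
    struct page **map = vma->vm_private_data;
    int offset = (start - vma->vm_start) >> PAGE_SHIFT;

    if (map[offset] != NULL) {
        if (pages) {
            pages[i] = map[offset];
            get_page(pages[i]);
        }
        if (vmas)
            vmas[i] = vma;
        i++;
        start += PAGE_SIZE;
        len--;
        continue;    /* move on to the next requested page */
    }
}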
