[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] Add grant table support to block tap.



# HG changeset patch
# User akw27@xxxxxxxxxxxxxxxxxxxxxx
# Node ID eaf498f1ffdef1d63ef9df03f1d8ea749227d183
# Parent  0237746ecf92423a1b948836902f857c4cc3ddd3
Add grant table support to block tap.

This patch adds grant table support to the block tap.  The AIO support
introduced in patch 9f0eff879d8913a824280cf67658a530c80e8424 still
works -- the tap code maps a granted page twice, once in kernel space
and once in user space.  The kernel page is patched into the p2m table,
and pages added to the user vm_area are mapped to the appropriate
underlying struct pages using the VM_FOREIGN hooks in get_user_pages().

Comparing block IO from dom0 to the existing block backend, and to the
tap managing the same partition as the BE from user space with AIO, I
get the following performance:

Version  1.03       ------Sequential Output------ --Sequential Input- --Random-
                    -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
Machine        Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
xen0             2G 31198  95 56818   8 20967   2 28415  77 59595   4 264.9   0
xenU-blkbe2cpuGT 2G 31157  96 54026  10 25585   4 30664  90 64919   7 292.7   0
xenU-blktp2cpuGT 2G 32313  97 54217   8 20950   3 28117  87 65924   4 191.8   0

Signed-off-by: andrew.warfield@xxxxxxxxxxxx

diff -r 0237746ecf92 -r eaf498f1ffde 
linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c
--- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c  Tue Aug 16 07:07:11 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c  Tue Aug 16 10:12:18 2005
@@ -23,6 +23,9 @@
     blkif_be_driver_status_t be_st;
 
     printk(KERN_INFO "Initialising Xen block tap device\n");
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+    printk(KERN_INFO "Block tap is using grant tables.\n");
+#endif
 
     DPRINTK("   tap - Backend connection init:\n");
 
diff -r 0237746ecf92 -r eaf498f1ffde 
linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h
--- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h  Tue Aug 16 07:07:11 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h  Tue Aug 16 10:12:18 2005
@@ -85,6 +85,11 @@
     spinlock_t          blk_ring_lock;
     atomic_t            refcnt;
     struct work_struct work;
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+    u16 shmem_handle;
+    memory_t shmem_vaddr;
+    grant_ref_t shmem_ref;
+#endif
 } blkif_t;
 
 blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle);
diff -r 0237746ecf92 -r eaf498f1ffde 
linux-2.6-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c
--- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c       Tue Aug 
16 07:07:11 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c       Tue Aug 
16 10:12:18 2005
@@ -9,6 +9,7 @@
  */
  
 #include "blktap.h"
+#include <asm-xen/evtchn.h>
 
 static char *blkif_state_name[] = {
     [BLKIF_STATE_CLOSED]       = "closed",
@@ -48,12 +49,21 @@
     blkif_t              *blkif = (blkif_t *)arg;
     ctrl_msg_t            cmsg;
     blkif_be_disconnect_t disc;
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+    struct gnttab_unmap_grant_ref op;
+#endif
 
     /*
      * These can't be done in blkif_disconnect() because at that point there
      * may be outstanding requests at the disc whose asynchronous responses
      * must still be notified to the remote driver.
      */
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+    op.host_addr = blkif->shmem_vaddr;
+    op.handle         = blkif->shmem_handle;
+    op.dev_bus_addr   = 0;
+    BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
+#endif
     vfree(blkif->blk_ring.sring);
 
     /* Construct the deferred response message. */
@@ -177,8 +187,12 @@
     unsigned int   evtchn = connect->evtchn;
     unsigned long  shmem_frame = connect->shmem_frame;
     struct vm_struct *vma;
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+    int ref = connect->shmem_ref;
+#else
     pgprot_t       prot;
     int            error;
+#endif
     blkif_t       *blkif;
     blkif_sring_t *sring;
 
@@ -199,24 +213,46 @@
         return;
     }
 
-    prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED);
+#ifndef CONFIG_XEN_BLKDEV_GRANT
+    prot = __pgprot(_KERNPG_TABLE);
     error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr),
                                     shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
                                     prot, domid);
     if ( error != 0 )
     {
-        WPRINTK("BE_CONNECT: error! (%d)\n", error);
         if ( error == -ENOMEM ) 
             connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
-        else if ( error == -EFAULT ) {
+        else if ( error == -EFAULT )
             connect->status = BLKIF_BE_STATUS_MAPPING_ERROR;
-            WPRINTK("BE_CONNECT: MAPPING error!\n");
-        }
         else
             connect->status = BLKIF_BE_STATUS_ERROR;
         vfree(vma->addr);
         return;
     }
+#else
+    { /* Map: Use the Grant table reference */
+        struct gnttab_map_grant_ref op;
+        op.host_addr = VMALLOC_VMADDR(vma->addr);
+        op.flags            = GNTMAP_host_map;
+        op.ref              = ref;
+        op.dom              = domid;
+       
+        BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) );
+       
+        handle = op.handle;
+       
+        if (op.handle < 0) {
+            DPRINTK(" Grant table operation failure !\n");
+            connect->status = BLKIF_BE_STATUS_MAPPING_ERROR;
+            vfree(vma->addr);
+            return;
+        }
+
+        blkif->shmem_ref = ref;
+        blkif->shmem_handle = handle;
+        blkif->shmem_vaddr = VMALLOC_VMADDR(vma->addr);
+    }
+#endif
 
     if ( blkif->status != DISCONNECTED )
     {
diff -r 0237746ecf92 -r eaf498f1ffde 
linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c
--- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c  Tue Aug 16 
07:07:11 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c  Tue Aug 16 
10:12:18 2005
@@ -21,6 +21,9 @@
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm-xen/xen-public/io/blkif.h> /* for control ring. */
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+#include <asm-xen/xen-public/grant_table.h>
+#endif
 
 #include "blktap.h"
 
@@ -42,6 +45,7 @@
 /* local prototypes */
 static int blktap_read_fe_ring(void);
 static int blktap_read_be_ring(void);
+
 
 /* -------[ mmap region ]--------------------------------------------- */
 /*
@@ -73,7 +77,28 @@
      ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
      ((_seg) * PAGE_SIZE))
 
-
+/* -------[ grant handles ]------------------------------------------- */
+
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+/* When using grant tables to map a frame for device access then the
+ * handle returned must be used to unmap the frame. This is needed to
+ * drop the ref count on the frame.
+ */
+struct grant_handle_pair
+{
+    u16  kernel;
+    u16  user;
+};
+static struct grant_handle_pair pending_grant_handles[MMAP_PAGES];
+#define pending_handle(_idx, _i) \
+    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
+#define BLKTAP_INVALID_HANDLE(_g) \
+    (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF))
+#define BLKTAP_INVALIDATE_HANDLE(_g) do {       \
+    (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \
+    } while(0)
+    
+#endif
 
 
 /* -------[ blktap vm ops ]------------------------------------------- */
@@ -348,9 +373,43 @@
     
 /*-----[ Data to/from user space ]----------------------------------------*/
 
-
 static void fast_flush_area(int idx, int nr_pages)
 {
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+    struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+    unsigned int i, op = 0;
+    struct grant_handle_pair *handle;
+    unsigned long ptep;
+
+    for (i=0; i<nr_pages; i++)
+    {
+        handle = &pending_handle(idx, i);
+        if (!BLKTAP_INVALID_HANDLE(handle))
+        {
+
+            unmap[op].host_addr = MMAP_VADDR(mmap_vstart, idx, i);
+            unmap[op].dev_bus_addr = 0;
+            unmap[op].handle = handle->kernel;
+            op++;
+
+            if (create_lookup_pte_addr(blktap_vma->vm_mm,
+                                       MMAP_VADDR(user_vstart, idx, i), 
+                                       &ptep) !=0) {
+                DPRINTK("Couldn't get a pte addr!\n");
+                return;
+            }
+            unmap[op].host_addr    = ptep;
+            unmap[op].dev_bus_addr = 0;
+            unmap[op].handle       = handle->user;
+            op++;
+            
+            BLKTAP_INVALIDATE_HANDLE(handle);
+        }
+    }
+    if ( unlikely(HYPERVISOR_grant_table_op(
+        GNTTABOP_unmap_grant_ref, unmap, op)))
+        BUG();
+#else
     multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST];
     int               i;
 
@@ -363,21 +422,22 @@
     mcl[nr_pages-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
     if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) )
         BUG();
-}
-
-
-extern int __direct_remap_area_pages(struct mm_struct *mm,
-                                     unsigned long address,
-                                     unsigned long size,
-                                     mmu_update_t *v);
+#endif
+}
+
 
 int blktap_write_fe_ring(blkif_request_t *req)
 {
     blkif_request_t *target;
-    int i;
+    int i, ret = 0;
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+    struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+    int op;
+#else
     unsigned long remap_prot;
     multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST+1];
     mmu_update_t mmu[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+#endif
 
     /*
      * This is called to pass a request from the real frontend domain's
@@ -394,18 +454,109 @@
         return 0;
     }
 
-    remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
     flush_cache_all(); /* a noop on intel... */
 
     target = RING_GET_REQUEST(&blktap_ufe_ring, blktap_ufe_ring.req_prod_pvt);
     memcpy(target, req, sizeof(*req));
 
     /* Map the foreign pages directly in to the application */
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+    op = 0;
+    for (i=0; i<target->nr_segments; i++) {
+
+        unsigned long uvaddr;
+        unsigned long kvaddr;
+        unsigned long ptep;
+
+        uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i);
+        kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i);
+
+        /* Map the remote page to kernel. */
+        map[op].host_addr = kvaddr;
+        map[op].dom   = ID_TO_DOM(req->id);
+        map[op].ref   = blkif_gref_from_fas(target->frame_and_sects[i]);
+        map[op].flags = GNTMAP_host_map;
+        /* This needs a bit more thought in terms of interposition: 
+         * If we want to be able to modify pages during write using 
+         * grant table mappings, the guest will either need to allow 
+         * it, or we'll need to incur a copy. */
+        if (req->operation == BLKIF_OP_WRITE)
+            map[op].flags |= GNTMAP_readonly;
+        op++;
+
+        /* Now map it to user. */
+        ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep);
+        if (ret)
+        {
+            DPRINTK("Couldn't get a pte addr!\n");
+            goto fail;
+        }
+
+        map[op].host_addr = ptep;
+        map[op].dom       = ID_TO_DOM(req->id);
+        map[op].ref       = blkif_gref_from_fas(target->frame_and_sects[i]);
+        map[op].flags     = GNTMAP_host_map | GNTMAP_application_map
+                            | GNTMAP_contains_pte;
+        /* Above interposition comment applies here as well. */
+        if (req->operation == BLKIF_OP_WRITE)
+            map[op].flags |= GNTMAP_readonly;
+        op++;
+    }
+
+    if ( unlikely(HYPERVISOR_grant_table_op(
+            GNTTABOP_map_grant_ref, map, op)))
+        BUG();
+
+    op = 0;
+    for (i=0; i<(target->nr_segments*2); i+=2) {
+        unsigned long uvaddr;
+        unsigned long kvaddr;
+        unsigned long offset;
+        int cancel = 0;
+
+        uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i/2);
+        kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i/2);
+
+        if ( unlikely(map[i].handle < 0) ) {
+            DPRINTK("Error on kernel grant mapping (%d)\n", map[i].handle);
+            ret = map[i].handle;
+            cancel = 1;
+        }
+
+        if ( unlikely(map[i+1].handle < 0) ) {
+            DPRINTK("Error on user grant mapping (%d)\n", map[i+1].handle);
+            ret = map[i+1].handle;
+            cancel = 1;
+        }
+
+        if (cancel) 
+            goto fail;
+
+        /* Set the necessary mappings in p2m and in the VM_FOREIGN 
+         * vm_area_struct to allow user vaddr -> struct page lookups
+         * to work.  This is needed for direct IO to foreign pages. */
+        phys_to_machine_mapping[__pa(kvaddr)>>PAGE_SHIFT] =
+            FOREIGN_FRAME(map[i].dev_bus_addr);
+
+        offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
+        ((struct page **)blktap_vma->vm_private_data)[offset] =
+            pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
+
+        /* Save handles for unmapping later. */
+        pending_handle(ID_TO_IDX(req->id), i/2).kernel = map[i].handle;
+        pending_handle(ID_TO_IDX(req->id), i/2).user   = map[i+1].handle;
+    }
+    
+#else
+
+    remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
+
     for (i=0; i<target->nr_segments; i++) {
         unsigned long buf;
         unsigned long uvaddr;
         unsigned long kvaddr;
         unsigned long offset;
+        unsigned long ptep;
 
         buf   = target->frame_and_sects[i] & PAGE_MASK;
         uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i);
@@ -421,10 +572,14 @@
         phys_to_machine_mapping[__pa(kvaddr)>>PAGE_SHIFT] =
             FOREIGN_FRAME(buf >> PAGE_SHIFT);
 
-        __direct_remap_area_pages(blktap_vma->vm_mm,
-                                  uvaddr,
-                                  PAGE_SIZE,
-                                  &mmu[i]);
+        ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep);
+        if (ret)
+        { 
+            DPRINTK("error getting pte\n");
+            goto fail;
+        }
+
+        mmu[i].ptr = ptep;
         mmu[i].val = (target->frame_and_sects[i] & PAGE_MASK)
             | pgprot_val(blktap_vma->vm_page_prot);
 
@@ -448,16 +603,17 @@
         if ( unlikely(mcl[i].result != 0) )
         {
             DPRINTK("invalid buffer -- could not remap it\n");
-            fast_flush_area(ID_TO_IDX(req->id), target->nr_segments);
-            return -1;
+            ret = mcl[i].result;
+            goto fail;
         }
     }
     if ( unlikely(mcl[i].result != 0) )
     {
         DPRINTK("direct remapping of pages to /dev/blktap failed.\n");
-        return -1;
-    }
-
+        ret = mcl[i].result;
+        goto fail;
+    }
+#endif /* CONFIG_XEN_BLKDEV_GRANT */
 
     /* Mark mapped pages as reserved: */
     for ( i = 0; i < target->nr_segments; i++ )
@@ -472,6 +628,10 @@
     blktap_ufe_ring.req_prod_pvt++;
     
     return 0;
+
+ fail:
+    fast_flush_area(ID_TO_IDX(req->id), target->nr_segments);
+    return ret;
 }
 
 int blktap_write_be_ring(blkif_response_t *rsp)
@@ -538,11 +698,10 @@
                 map[offset] = NULL;
             }
 
-
+            fast_flush_area(ID_TO_IDX(resp_s->id), ar->nr_pages);
             zap_page_range(blktap_vma, 
                     MMAP_VADDR(user_vstart, ID_TO_IDX(resp_s->id), 0), 
                     ar->nr_pages << PAGE_SHIFT, NULL);
-            fast_flush_area(ID_TO_IDX(resp_s->id), ar->nr_pages);
             write_resp_to_fe_ring(blkif, resp_s);
             blktap_ufe_ring.rsp_cons = i + 1;
             kick_fe_domain(blkif);
@@ -616,10 +775,16 @@
 
 int blktap_init(void)
 {
-    int err;
+    int err, i, j;
 
     if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
         BUG();
+
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+    for (i=0; i<MAX_PENDING_REQS ; i++)
+        for (j=0; j<BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
+            BLKTAP_INVALIDATE_HANDLE(&pending_handle(i, j));
+#endif
 
     err = misc_register(&blktap_miscdev);
     if ( err != 0 )

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, which monitors our
servers 24x7x365 and backs them with RackSpace's Fanatical Support®.