
[Xen-devel] [PATCH 1/9] Xen Share: Simplified I/O Mechanism



I think this is ready for experimental inclusion.  A number of people
have asked about it, and I think it would benefit from exposure to
bright minds.  It provides additional mechanisms, and need not replace
existing ones.

My main concern is that having the hypervisor decrement an arbitrary
address on notification is wonderful for driver authors, but it requires
the x86 hypervisor to keep mappings of those addresses: restricting them
to lie within the shared pages would be easier for the hypervisor...

Feedback encouraged!
Rusty.
---
Subject: Xen share core

This introduces a page "share" mechanism to Xen: an alternative to
both cross-domain binding of event channels and grant tables.

Dom0 can create sharable pages; the operation returns a handle.  Dom0
can then grant access permission to other domains.  Any domain which
has permission can request access using that handle, which binds an
event channel to that share and returns a unique peerid for that
domain: this is useful for arbitration on multi-way shared pages ("you
are user #3").

A watch & trigger mechanism creates a simple event mechanism: a watch
on an arbitrary "watch number" associated with the share causes the
hypervisor to decrement an address when a trigger is performed on that
watch number: if the location is decremented to zero, an event channel
is raised.
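
Guest-side usage might look roughly like this (same hypothetical
HYPERVISOR_share_op() wrapper as above, plus an assumed virt_to_maddr()
helper to obtain the machine address of the counter):

	static atomic_t pending;

	/* Watcher: arm a watch so that when any peer triggers 'watch_num',
	 * Xen decrements 'pending'; when it reaches zero, the event channel
	 * we bound with XEN_SHARE_get fires. */
	static int arm_watch(share_ref_t ref, u32 watch_num)
	{
		atomic_set(&pending, 1);
		return HYPERVISOR_share_op(XEN_SHARE_watch, ref, watch_num,
					   virt_to_maddr(&pending), 0);
	}

	/* Trigger side: returns the number of matching watches. */
	static int kick(share_ref_t ref, u32 watch_num)
	{
		return HYPERVISOR_share_op(XEN_SHARE_trigger, ref,
					   watch_num, 0, 0);
	}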

Finally, a scatter-gather list mechanism allows the domains to
associate their pages with arbitrary queue numbers in the shared
region, to transport bulk data (effectively by having the hypervisor
do "DMA" between domains).

The patch includes an abstraction layer so architectures which don't
want virtual or machine addresses from the kernel can change that, and
also so that architectures can allocate the sharable pages as they
wish.
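
Concretely, the per-architecture interface boils down to the two hooks
declared in the new asm-x86/share.h below; an architecture allocates
the pages however it likes and decides what a share_ref_t means (x86
uses the first page's frame number):

	struct page_info *arch_alloc_shared_pages(unsigned int order,
						  share_ref_t *ref);
	void arch_free_shared_pages(struct page_info *page, unsigned int order);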

diff -r d5f98d23427a xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile     Tue May 30 10:44:23 2006
+++ b/xen/arch/x86/Makefile     Wed May 31 17:39:54 2006
@@ -30,6 +30,7 @@
 obj-y += physdev.o
 obj-y += rwlock.o
 obj-y += setup.o
+obj-y += share.o
 obj-y += shutdown.o
 obj-y += smp.o
 obj-y += smpboot.o
diff -r d5f98d23427a xen/arch/x86/x86_32/entry.S
--- a/xen/arch/x86/x86_32/entry.S       Tue May 30 10:44:23 2006
+++ b/xen/arch/x86/x86_32/entry.S       Wed May 31 17:39:54 2006
@@ -648,6 +648,7 @@
         .long do_xenoprof_op
         .long do_event_channel_op
         .long do_physdev_op
+        .long do_share_op
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -687,6 +688,7 @@
         .byte 2 /* do_xenoprof_op       */
         .byte 2 /* do_event_channel_op  */
         .byte 2 /* do_physdev_op        */
+        .byte 5 /* do_share_op          */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
diff -r d5f98d23427a xen/common/Makefile
--- a/xen/common/Makefile       Tue May 30 10:44:23 2006
+++ b/xen/common/Makefile       Wed May 31 17:39:54 2006
@@ -16,6 +16,7 @@
 obj-y += sched_credit.o
 obj-y += sched_sedf.o
 obj-y += schedule.o
+obj-y += share.o
 obj-y += softirq.o
 obj-y += string.o
 obj-y += symbols.o
diff -r d5f98d23427a xen/common/dom0_ops.c
--- a/xen/common/dom0_ops.c     Tue May 30 10:44:23 2006
+++ b/xen/common/dom0_ops.c     Wed May 31 17:39:54 2006
@@ -16,6 +16,7 @@
 #include <xen/domain_page.h>
 #include <xen/trace.h>
 #include <xen/console.h>
+#include <xen/share.h>
 #include <xen/iocap.h>
 #include <xen/guest_access.h>
 #include <asm/current.h>
@@ -634,6 +635,27 @@
         }
     }
     break;
+    case DOM0_CREATESHAREDPAGES:
+    {
+        ret = create_shared_pages(op->u.createsharedpages.num);
+    }
+    break;
+    case DOM0_DESTROYSHAREDPAGES:
+    {
+        ret = destroy_shared_pages(op->u.destroysharedpages.share_ref);
+    }
+    break;
+    case DOM0_GRANTSHAREDPAGES:
+    {
+        struct domain *d; 
+        ret = -ESRCH;
+        d = find_domain_by_id(op->u.grantsharedpages.domain);
+        if ( d != NULL )
+        {
+            ret = grant_shared_pages(op->u.grantsharedpages.share_ref, d);
+        }
+    }
+    break;
 
     case DOM0_IRQ_PERMISSION:
     {
diff -r d5f98d23427a xen/common/domain.c
--- a/xen/common/domain.c       Tue May 30 10:44:23 2006
+++ b/xen/common/domain.c       Wed May 31 17:39:54 2006
@@ -16,6 +16,7 @@
 #include <xen/console.h>
 #include <xen/softirq.h>
 #include <xen/domain_page.h>
+#include <xen/share.h>
 #include <xen/rangeset.h>
 #include <xen/guest_access.h>
 #include <xen/hypercall.h>
@@ -304,6 +305,7 @@
     grant_table_destroy(d);
 
     arch_domain_destroy(d);
+    free_shares(d);
 
     free_domain(d);
 
diff -r d5f98d23427a xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Tue May 30 10:44:23 2006
+++ b/xen/include/asm-x86/mm.h  Wed May 31 17:39:54 2006
@@ -183,9 +183,53 @@
         free_domheap_page(page);
 }
 
+int try_shared_page(struct page_info *page, struct domain *domain);
 
 static inline int get_page(struct page_info *page,
                            struct domain *domain)
+{
+    u32 x, nx, y = page->count_info;
+    u32 d, nd = page->u.inuse._domain;
+    u32 _domain = pickle_domptr(domain);
+
+    do {
+        x  = y;
+        nx = x + 1;
+        d  = nd;
+        if ( unlikely((x & PGC_count_mask) == 0) ||  /* Not allocated? */
+             unlikely((nx & PGC_count_mask) == 0) ) /* Count overflow? */
+        {
+           if ( !_shadow_mode_refcounts(domain) )
+               DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%" PRtype_info "\n",
+                       page_to_pfn(page), domain, unpickle_domptr(d),
+                       x, page->u.inuse.type_info);
+           return 0;
+        }
+       if ( unlikely(d != _domain) )               /* Wrong owner? */
+           return try_shared_page(page, domain);
+        __asm__ __volatile__(
+            LOCK_PREFIX "cmpxchg8b %3"
+            : "=d" (nd), "=a" (y), "=c" (d),
+              "=m" (*(volatile u64 *)(&page->count_info))
+            : "0" (d), "1" (x), "c" (d), "b" (nx) );
+    }
+    while ( unlikely(nd != d) || unlikely(y != x) );
+
+    return 1;
+}
+
+static inline int arch_get_shared_page(struct page_info *page)
+{
+       /* Shared pages always under lock, so this is safe. */
+       if (unlikely(((page->count_info + 1) & PGC_count_mask) == 0))
+               return 0;
+       page->count_info++;
+       return 1;
+}
+
+/* Does not try to get shared pages. */
+static inline int get_unshared_page(struct page_info *page,
+                                   struct domain *domain)
 {
     u32 x, nx, y = page->count_info;
     u32 d, nd = page->u.inuse._domain;
diff -r d5f98d23427a xen/include/public/dom0_ops.h
--- a/xen/include/public/dom0_ops.h     Tue May 30 10:44:23 2006
+++ b/xen/include/public/dom0_ops.h     Wed May 31 17:39:54 2006
@@ -513,6 +513,28 @@
 };
 typedef struct dom0_hypercall_init dom0_hypercall_init_t;
 DEFINE_XEN_GUEST_HANDLE(dom0_hypercall_init_t);
+
+#define DOM0_CREATESHAREDPAGES 49
+struct dom0_createsharedpages {
+    uint32_t num;
+};
+typedef struct dom0_createsharedpages dom0_createsharedpages_t;
+DEFINE_XEN_GUEST_HANDLE(dom0_createsharedpages_t);
+
+#define DOM0_GRANTSHAREDPAGES 50
+struct dom0_grantsharedpages {
+    unsigned long share_ref;
+    domid_t domain;
+};
+typedef struct dom0_grantsharedpages dom0_grantsharedpages_t;
+DEFINE_XEN_GUEST_HANDLE(dom0_grantsharedpages_t);
+
+#define DOM0_DESTROYSHAREDPAGES 51
+struct dom0_destroysharedpages {
+    unsigned long share_ref;
+};
+typedef struct dom0_destroysharedpages dom0_destroysharedpages_t;
+DEFINE_XEN_GUEST_HANDLE(dom0_destroysharedpages_t);
 
 struct dom0_op {
     uint32_t cmd;
@@ -555,6 +577,9 @@
         struct dom0_irq_permission    irq_permission;
         struct dom0_iomem_permission  iomem_permission;
         struct dom0_hypercall_init    hypercall_init;
+        struct dom0_createsharedpages createsharedpages;
+        struct dom0_grantsharedpages  grantsharedpages;
+        struct dom0_destroysharedpages destroysharedpages;
         uint8_t                       pad[128];
     } u;
 };
diff -r d5f98d23427a xen/include/public/xen.h
--- a/xen/include/public/xen.h  Tue May 30 10:44:23 2006
+++ b/xen/include/public/xen.h  Wed May 31 17:39:54 2006
@@ -64,6 +64,7 @@
 #define __HYPERVISOR_xenoprof_op          31
 #define __HYPERVISOR_event_channel_op     32
 #define __HYPERVISOR_physdev_op           33
+#define __HYPERVISOR_share_op             34
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48
diff -r d5f98d23427a xen/arch/x86/share.c
--- /dev/null   Tue May 30 10:44:23 2006
+++ b/xen/arch/x86/share.c      Wed May 31 17:39:54 2006
@@ -0,0 +1,36 @@
+#include <xen/share.h>
+#include <xen/mm.h>
+#include <asm/share.h>
+
+struct page_info *arch_alloc_shared_pages(unsigned int order, share_ref_t *ref)
+{
+       struct page_info *page;
+       void *addr;
+       int i;
+
+       /* x86 uses normal xen heap pages to share. */
+       addr = alloc_xenheap_pages(order);
+       if (!addr)
+               return NULL;
+
+       for (i = 0; i < (1 << order); i++) {
+               clear_page(addr+i*PAGE_SIZE);
+               page = virt_to_page(addr+i*PAGE_SIZE);
+               page_set_owner(page, NULL);
+               /* Domain pointer must be visible before updating refcnt. */
+               wmb();
+               page->count_info = PGC_allocated|1;
+               page->u.inuse.type_info = PGT_writable_page|PGT_validated;
+               BUG_ON(page->u.inuse._domain);
+       }
+
+       /* x86 simply uses page frame numbers as share_refs. */
+       page = virt_to_page(addr);
+       *ref = page_to_mfn(page);
+       return page;
+}
+
+void arch_free_shared_pages(struct page_info *page, unsigned int order)
+{
+       free_xenheap_pages(page_to_virt(page), order);
+}
diff -r d5f98d23427a xen/common/share.c
--- /dev/null   Tue May 30 10:44:23 2006
+++ b/xen/common/share.c        Wed May 31 17:39:54 2006
@@ -0,0 +1,926 @@
+/* -*-  Mode:C; c-basic-offset:8; tab-width:8; indent-tabs-mode:t -*- */
+/******************************************************************************
+ * Page sharing and triggers for Xen.
+ *
+ * Copyright (C) 2005,2006 Rusty Russell IBM Corporation
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <public/share.h>
+#include <xen/share.h>
+#include <xen/list.h>
+#include <xen/sched.h>
+#include <xen/mm.h>
+#include <xen/event.h>
+#include <xen/kernel.h>
+#include <xen/domain_page.h>
+#include <asm/page.h>
+#include <asm/share.h>
+
+/* How many peers are we willing to share a page with? */
+#define MAX_PEERS      32
+
+struct watch
+{
+       struct list_head list;
+
+       /* Where am I watching? */
+       u32 trigger;
+
+       /* Where to decrement: have done map_domain_mem on this. */
+       atomic_t *decrement;
+
+       struct peer *owner;
+};
+
+struct sg_list
+{
+       struct list_head list;
+
+       /* Where to write size: have done map_domain_mem on this. */
+       u32 *len_pointer;
+
+       /* Which queue am I in? */
+       u32 queue;
+
+       struct peer *owner;
+
+       int direction;
+       unsigned int num_sg;
+       struct xen_sg sg[0];
+};
+
+/* Each domain accessing this share. */
+struct peer
+{
+       struct list_head list;
+
+       /* Peer ID; unique for this share. */
+       unsigned int id;
+
+       /* Share whose linked list we're in. */
+       struct share *share;
+
+       /* What domain & port to notify when something happens. */
+       /* FIXME: Fix up this when vcpu goes up or down. */
+       struct vcpu *vcpu;
+       int port;
+};
+
+struct share
+{
+       struct list_head list;
+
+       share_ref_t share_ref;
+
+       /* The page involved. */
+       struct page_info *page;
+
+       /* Which domains are sharing this. */
+       struct list_head peers;
+
+       /* Watches on this page */
+       struct list_head watches;
+
+       /* Scatter-gather lists on this page for this peer. */
+       struct list_head sgs;
+
+       /* Can this page be destroyed when last one unshares? */
+       int destroy;
+
+       /* Who can share this?  At least creator. */
+       unsigned int num_granted;
+       /* FIXME: Make this dynamic in the future */
+       struct domain *granted[MAX_PEERS];
+
+       /* How many pages requested and what order allocation was used */
+       unsigned int num_pages;
+       unsigned int order;
+};
+static LIST_HEAD(shares);
+static spinlock_t share_lock = SPIN_LOCK_UNLOCKED;
+
+static inline int is_sharing(const struct domain *domain,
+                            const struct share *share)
+{
+       struct peer *i;
+
+       list_for_each_entry(i, &share->peers, list)
+               if (i->vcpu->domain == domain)
+                       return 1;
+       return 0;
+}
+
+/* Has this domain been granted access to share? */
+static inline int allowed_to_share(const struct domain *domain,
+                                  const struct share *share)
+{
+       unsigned int i;
+
+       for (i = 0; i < share->num_granted; i++)
+               if (share->granted[i] == domain)
+                       return 1;
+       return 0;
+}
+
+int try_shared_page(struct page_info *page, struct domain *domain)
+{
+       struct share *i;
+
+       spin_lock(&share_lock);
+       list_for_each_entry(i, &shares, list) {
+               /* Does the pfn match the shared page or is the physical
+                * address in the range of allocated pages for this share_ref */
+               if (i->page == page
+                   || (page_to_maddr(i->page) <= page_to_maddr(page)
+                       && page_to_maddr(page) < 
+                          page_to_maddr(i->page) + i->num_pages*PAGE_SIZE)) {
+                       if (!is_sharing(domain, i))
+                               break;
+                       if (!arch_get_shared_page(page))
+                               break;
+                       spin_unlock(&share_lock);
+                       return 1;
+               }
+       }
+       spin_unlock(&share_lock);
+       return 0;
+}
+
+/* Like fds, guarantees lowest number available.  Keeps list ordered */
+static void insert_by_peer_id(struct share *share, struct peer *peer)
+{
+       unsigned int id = 0;
+       struct peer *i;
+
+       list_for_each_entry(i, &share->peers, list) {
+               if (i->id != id) {
+                       /* Put new peer in this hole. */
+                       list_add_tail(&peer->list, &i->list);
+                       peer->id = id;
+                       return;
+               }
+               id++;
+       }
+       list_add_tail(&peer->list, &share->peers);
+       peer->id = id;
+}
+
+static int add_domain_to_peers(struct share *share,
+                              struct vcpu *vcpu, int port)
+{
+       struct peer *peer = xmalloc(struct peer);
+
+       if (!peer)
+               return -ENOMEM;
+
+       peer->vcpu = vcpu;
+       peer->port = port;
+       peer->share = share;
+       insert_by_peer_id(share, peer);
+       return peer->id;
+}
+
+/* Returns share id of page(s). */
+share_ref_t create_shared_pages(unsigned int num)
+{
+       paddr_t ret;
+       struct share *share;
+
+       /* Only support up to 16 pages at the moment. */
+       if (num > 16)
+               return -EINVAL;
+
+       share = xmalloc(struct share);
+       if (!share)
+               return -ENOMEM;
+
+       share->num_granted = 1;
+       share->granted[0] = current->domain;
+       share->destroy = 0;
+       share->num_pages = num;
+       share->order = get_order_from_pages(num);
+
+       INIT_LIST_HEAD(&share->peers);
+       INIT_LIST_HEAD(&share->watches);
+       INIT_LIST_HEAD(&share->sgs);
+
+       share->page = arch_alloc_shared_pages(share->order, &share->share_ref);
+       if (!share->page) {
+               xfree(share);
+               return -ENOMEM;
+       }
+       
+       /* Grab first to avoid potential race with free. */
+       ret = share->share_ref;
+
+       spin_lock(&share_lock);
+       list_add(&share->list, &shares);
+       spin_unlock(&share_lock);
+       return ret;
+}
+
+static inline int add_grant(struct share *share, struct domain *domain)
+{
+       /* FIXME: Currently we statically allocate an array for peers */
+       if (share->num_granted == MAX_PEERS)
+               return -ENOSPC;
+
+       /* Domain must not already have access. */
+       if (allowed_to_share(domain, share))
+               return -EEXIST;
+
+       /* Add this domain to the end of the array, Lock already held */
+       share->granted[share->num_granted] = domain;
+       share->num_granted++;
+
+       return 0;
+}
+
+static struct share *find_share(share_ref_t share_ref)
+{
+       struct share *i;
+
+       list_for_each_entry(i, &shares, list)
+               if (i->share_ref == share_ref)
+                       return i;
+       return NULL;
+}
+
+static struct share *find_share_check(share_ref_t share_ref)
+{
+       struct share *share = find_share(share_ref);
+
+       if (!share || !is_sharing(current->domain, share))
+               return NULL;
+       return share;
+}
+
+int grant_shared_pages(share_ref_t share_ref, struct domain *domain)
+{
+       struct share *share;
+       int err;
+
+       spin_lock(&share_lock);
+       share = find_share(share_ref);
+       if (share)
+               err = add_grant(share, domain);
+       else
+               err = -ENOENT;
+       spin_unlock(&share_lock);
+       return err;
+}
+
+static void try_free_share(struct share *share)
+{
+       /* Last peer out frees page. */
+       if (list_empty(&share->peers) && share->destroy) {
+               list_del(&share->list);
+               arch_free_shared_pages(share->page, share->order);
+               xfree(share);
+       }
+}
+
+int destroy_shared_pages(share_ref_t share_ref)
+{
+       struct share *share;
+       int ret;
+
+       spin_lock(&share_lock);
+       share = find_share(share_ref);
+       if (!share)
+               ret = -ENOENT;
+       else if (share->destroy)
+               ret = -EINVAL;
+       else {
+               share->destroy = 1;
+               try_free_share(share);
+               ret = 0;
+       }
+       spin_unlock(&share_lock);
+       return ret;
+}
+
+static int share_get(share_ref_t share_ref, int port)
+{
+       struct share *share;
+       int err;
+
+       printk("Getting share share_ref %#lx port %i\n", share_ref, port);
+       spin_lock(&share_lock);
+       share = find_share(share_ref);
+       if (share) {
+#if 0
+               if (!allowed_to_share(current->domain, share)) {
+                       err = -EPERM;
+               } else
+#endif
+                       err = add_domain_to_peers(share, current, port);
+       } else {
+               printk("No such share!\n");
+               err = -ENOENT;
+       }
+       spin_unlock(&share_lock);
+       return err;
+}
+
+static void free_peer(struct peer *peer)
+{
+       list_del(&peer->list);
+       try_free_share(peer->share);
+       xfree(peer);
+}
+
+static struct peer *find_self_as_peer(share_ref_t share_ref)
+{
+       struct peer *i;
+       struct share *share;
+
+       share = find_share(share_ref);
+       if (!share)
+               return NULL;
+
+       list_for_each_entry(i, &share->peers, list) {
+               if (i->vcpu->domain == current->domain)
+                       return i;
+       }
+       return NULL;
+}
+
+static int still_in_use(struct peer *peer)
+{
+       const struct watch *w;
+       const struct sg_list *s;
+
+       list_for_each_entry(w, &peer->share->watches, list)
+               if (w->owner == peer)
+                       return 1;
+
+       list_for_each_entry(s, &peer->share->sgs, list)
+               if (s->owner == peer)
+                       return 1;
+
+       return 0;
+}
+
+static int share_drop(share_ref_t share_ref)
+{
+       int err;
+       struct peer *peer;
+
+       spin_lock(&share_lock);
+       peer = find_self_as_peer(share_ref);
+       if (peer) {
+               /* FIXME: could automatically close these */
+               if (still_in_use(peer))
+                       err = -EBUSY;
+               else {
+                       free_peer(peer);
+                       err = 0;
+               }
+       } else
+               err = -ENOENT;
+       spin_unlock(&share_lock);
+       return err;
+}
+
+/* Maps a user address.  Use unmap_domain_page_global() on result to free. */
+static int map_user_address(paddr_t uaddr, void **p)
+{
+       unsigned int pageoff;
+
+       /* Check addr is sane. */
+        if ((uaddr % __alignof__(int)) != 0)
+               return -EINVAL;
+
+       /* Hold reference to the page, check it's valid. */
+       if (!mfn_valid(paddr_to_pfn(uaddr))
+           || !get_page(maddr_to_page(uaddr), current->domain)) {
+               return -EFAULT;
+       }
+
+       pageoff = uaddr % PAGE_SIZE;
+       *p = map_domain_page_global(paddr_to_pfn(uaddr)) + pageoff;
+       return 0;
+}
+
+static int add_watch(struct peer *peer, u32 trigger, paddr_t decaddr)
+{
+       struct watch *watch;
+       int err;
+
+       /* FIXME: Limit */
+       watch = xmalloc(struct watch);
+       if (!watch)
+               return -ENOMEM;
+
+       err = map_user_address(decaddr, (void **)&watch->decrement);
+       if (err) {
+               xfree(watch);
+               return err;
+       }
+
+       watch->trigger = trigger;
+       watch->owner = peer;
+       list_add(&watch->list, &peer->share->watches);
+       return 0;
+}
+
+static int share_watch(share_ref_t share_ref, u32 trigger, paddr_t decaddr)
+{
+       struct peer *peer;
+       int ret;
+
+       spin_lock(&share_lock);
+       peer = find_self_as_peer(share_ref);
+       if (!peer)
+               ret = -ENOENT;
+       else
+               ret = add_watch(peer, trigger, decaddr);
+       spin_unlock(&share_lock);
+
+       return ret;
+}
+
+static void free_watch(struct watch *watch)
+{
+       unmap_domain_page_global(watch->decrement);
+       list_del(&watch->list);
+       xfree(watch);
+}
+
+static int share_unwatch(share_ref_t share_ref, u32 trigger)
+{
+       struct peer *peer;
+
+       spin_lock(&share_lock);
+       peer = find_self_as_peer(share_ref);
+       if (peer) {
+               struct watch *i;
+               list_for_each_entry(i, &peer->share->watches, list) {
+                       if (i->owner == peer && i->trigger == trigger) {
+                               free_watch(i);
+                               spin_unlock(&share_lock);
+                               return 0;
+                       }
+               }
+       }
+       spin_unlock(&share_lock);
+       return -ENOENT;
+}
+
+static unsigned int do_trigger(struct share *share, u32 trigger)
+{
+       struct watch *i;
+       unsigned int count = 0;
+
+       list_for_each_entry(i, &share->watches, list) {
+               if (i->trigger != trigger)
+                       continue;
+               count++;
+               if (atomic_dec_and_test(i->decrement))
+                       evtchn_set_pending(i->owner->vcpu, i->owner->port);
+       }
+       return count;
+}
+
+static int share_trigger(share_ref_t share_ref, u32 trigger)
+{
+       struct share *share;
+       int ret;
+
+       spin_lock(&share_lock);
+       share = find_share_check(share_ref);
+       if (share)
+               ret = do_trigger(share, trigger);
+       else
+               ret = -ENOENT;
+       spin_unlock(&share_lock);
+       return ret;
+}
+
+/* Check that this domain has access to all this memory. */
+static int get_sg_list(const struct sg_list *sg)
+{
+       int i;
+
+       for (i = 0; i < sg->num_sg; i++) {
+               struct page_info *page;
+
+               page = maddr_to_page(sg->sg[i].addr);
+
+               /* FIXME: What a hack!  Must be same page for now. */
+               if (page != maddr_to_page(sg->sg[i].addr + sg->sg[i].len - 1)) {
+                       printk("Over a page 0x%08lx + %li\n",
+                              sg->sg[i].addr, sg->sg[i].len);
+                       goto fail;
+               }
+
+               if (!mfn_valid(paddr_to_pfn(sg->sg[i].addr))
+                   || !get_unshared_page(page, current->domain)) {
+                       printk("pfn %s\n",
+                              mfn_valid(paddr_to_pfn(sg->sg[i].addr))
+                              ? "valid": "INVALID");
+                       goto fail;
+               }
+       }
+       return 1;
+
+fail:
+       /* Put all the pages. */
+       while (--i >= 0)
+               put_page(maddr_to_page(sg->sg[i].addr));
+       return 0;
+}
+
+static void put_sg_list(const struct sg_list *sg)
+{
+       unsigned int i;
+
+       for (i = 0; i < sg->num_sg; i++)
+               put_page(maddr_to_page(sg->sg[i].addr));
+}
+
+/* Caller must free this if it is used. */
+static struct sg_list *next_sg_list(struct share *share, u32 queue)
+{
+       struct sg_list *i;
+
+       list_for_each_entry(i, &share->sgs, list)
+               if (i->queue == queue)
+                       return i;
+       return NULL;
+}
+
+static int sg_register(share_ref_t share_ref, u32 queue, 
+                       unsigned int num_sgs, int dir,
+                      struct xen_sg *usgs, paddr_t ulenaddr)
+{
+       struct sg_list *sg;
+       struct peer *me;
+       int ret;
+
+       if (num_sgs == 0 || num_sgs > XEN_SG_MAX) {
+               printk("%i sgs bad\n", num_sgs);
+               return -EINVAL;
+       }
+
+       if (!(dir & XEN_SG_DIR)) {
+               printk("dir %i bad\n", dir);
+               return -EINVAL;
+       }
+
+       sg = xmalloc_bytes(sizeof(*sg) + num_sgs * sizeof(sg->sg[0]));
+       if (!sg) {
+               printk("Could not allocate %i sgs\n", num_sgs);
+               return -ENOMEM;
+       }
+
+       ret = map_user_address(ulenaddr, (void **)&sg->len_pointer);
+       if (ret < 0)
+               goto free_sg;
+
+       spin_lock(&share_lock);
+       me = find_self_as_peer(share_ref);
+       if (!me) {
+               ret = -ENOENT;
+               goto unlock_free;
+       }
+
+       if (copy_from_user(sg->sg, usgs, num_sgs * sizeof(sg->sg[0])) != 0) {
+               printk("Faulted copying sgs from %p\n", (void *)usgs);
+               ret = -EFAULT;
+               goto unlock_free;
+       }
+
+       sg->num_sg = num_sgs;
+       sg->direction = dir;
+       sg->queue = queue;
+       sg->owner = me;
+
+       if (!get_sg_list(sg)) {
+               ret = -EFAULT;
+               goto unlock_free;
+       }
+
+       /* We always activate trigger 0 if we were completely out of sgs.
+        * FIXME: don't trigger self?
+        */
+       if (!next_sg_list(me->share, queue))
+               do_trigger(me->share, 0);
+
+       list_add(&sg->list, &me->share->sgs);
+       ret = 0;
+       spin_unlock(&share_lock);
+       return ret;
+
+unlock_free:
+       spin_unlock(&share_lock);
+       unmap_domain_page_global(sg->len_pointer);
+free_sg:
+       xfree(sg);
+       return ret;
+}
+
+static void free_sg_list(struct sg_list *sg_list)
+{
+       list_del(&sg_list->list);
+       unmap_domain_page_global(sg_list->len_pointer);
+       put_sg_list(sg_list);
+       xfree(sg_list);
+}
+
+static int sg_unregister(share_ref_t share_ref, paddr_t first_addr)
+{
+       struct sg_list *i;
+       struct peer *peer;
+       int err;
+
+       spin_lock(&share_lock);
+       peer = find_self_as_peer(share_ref);
+       if (!peer)
+               err = -ENOENT;
+       else {
+               err = -ENOENT;
+               list_for_each_entry(i, &peer->share->sgs, list) {
+                       if (i->owner == peer && i->sg[0].addr == first_addr) {
+                               free_sg_list(i);
+                               err = 0;
+                               break;
+                       }
+               }
+       }
+       spin_unlock(&share_lock);
+       return err;
+}
+
+static unsigned long from_user(paddr_t dst, paddr_t src, unsigned long len)
+{
+       void *dstp;
+
+       /* Only do within this page boundary. */
+       if ((dst % PAGE_SIZE) + len > PAGE_SIZE)
+               len = PAGE_SIZE - (dst % PAGE_SIZE);
+
+       dstp = map_domain_page(paddr_to_pfn(dst)) + (dst % PAGE_SIZE);
+       if (copy_from_user(dstp, (void *)src, len) != 0) {
+               printk("Copying %li bytes from %p faulted!\n", len,
+                      (void *)src);
+               len = 0;
+       }
+       unmap_domain_page(dstp);
+       return len;
+}
+
+static unsigned long to_user(paddr_t dst, paddr_t src, unsigned long len)
+{
+       void *srcp;
+
+       /* Only do within this page boundary. */
+       if ((src % PAGE_SIZE) + len > PAGE_SIZE)
+               len = PAGE_SIZE - (src % PAGE_SIZE);
+
+       srcp = map_domain_page(paddr_to_pfn(src)) + (src % PAGE_SIZE);
+       if (copy_to_user((void *)dst, srcp, len) != 0)
+               len = 0;
+       unmap_domain_page(srcp);
+       return len;
+}
+
+/* Copy from src to dst, return amount copied. */
+static int do_copy(const struct sg_list *sgdst, const struct sg_list *sgsrc,
+                  unsigned long (*copy)(paddr_t, paddr_t, unsigned long))
+{
+       unsigned long totlen, src, dst, srcoff, dstoff;
+       int ret = 0;
+
+       totlen = 0;
+       src = dst = 0;
+       srcoff = dstoff = 0;
+       while (src < sgsrc->num_sg) {
+               unsigned long len;
+               len = min(sgsrc->sg[src].len - srcoff,
+                         sgdst->sg[dst].len - dstoff);
+
+               len = copy(sgdst->sg[dst].addr+dstoff,
+                          sgsrc->sg[src].addr+srcoff,
+                          len);
+               if (len == 0) {
+                       printk("Copying from uaddr 0x%08lx faulted\n",
+                              sgsrc->sg[src].addr+srcoff);
+                       return -EFAULT;
+               }
+
+               totlen += len;
+               srcoff += len;
+               dstoff += len;
+               ret += len;
+               if (srcoff == sgsrc->sg[src].len) {
+                       src++;
+                       srcoff = 0;
+               }
+               if (dstoff == sgdst->sg[dst].len) {
+                       dst++;
+                       dstoff = 0;
+                       if (dst == sgdst->num_sg)
+                               break;
+               }
+       }
+       return ret;
+}
+
+static int sg_xfer(share_ref_t share_ref, unsigned int num_sgs, int dir,
+                  u32 queue, struct xen_sg *usgs)
+{
+       int ret;
+       struct share *share;
+       struct sg_list *sg;
+       struct {
+               struct sg_list sglist;
+               struct xen_sg sg[XEN_SG_MAX];
+       } tmp;
+
+       if (dir != XEN_SG_IN && dir != XEN_SG_OUT)
+               return -EINVAL;
+
+       if (num_sgs == 0 || num_sgs > XEN_SG_MAX)
+               return -EINVAL;
+
+       spin_lock(&share_lock);
+       share = find_share_check(share_ref);
+       if (!share) {
+               ret = -ENOENT;
+               goto out;
+       }
+       sg = next_sg_list(share, queue);
+       if (!sg) {
+               ret = -ENOSPC;
+               goto out;
+       }
+       if (copy_from_user(tmp.sg, usgs, num_sgs*sizeof(usgs[0])) != 0) {
+               printk("Copying %i sgs from uaddr %p faulted\n",
+                      num_sgs, usgs);
+               ret = -EFAULT;
+               goto out;
+       }
+       tmp.sglist.num_sg = num_sgs;
+       /* If XEN_SG_IN, it must let us XEN_SG_OUT, and vice-versa. */
+       if (!(sg->direction & (dir ^ XEN_SG_DIR))) {
+               ret = -EPERM;
+               goto out;
+       }
+
+       if (dir == XEN_SG_IN)
+               ret = do_copy(&tmp.sglist, sg, to_user);
+       else
+               ret = do_copy(sg, &tmp.sglist, from_user);
+
+       if (ret > 0) {
+               *sg->len_pointer = ret;
+               evtchn_set_pending(sg->owner->vcpu, sg->owner->port);
+               free_sg_list(sg);
+       }
+
+out:
+       spin_unlock(&share_lock);
+       return ret;
+}
+
+static void free_peer_users(struct peer *peer)
+{
+       struct watch *w, *wtmp;
+       struct sg_list *s, *stmp;
+
+       list_for_each_entry_safe(w, wtmp, &peer->share->watches, list)
+               if (w->owner == peer)
+                       free_watch(w);
+
+       list_for_each_entry_safe(s, stmp, &peer->share->sgs, list)
+               if (s->owner == peer)
+                       free_sg_list(s);
+}
+
+void free_shares(struct domain *domain)
+{
+       struct share *i, *tmp;
+
+       spin_lock(&share_lock);
+       list_for_each_entry_safe(i, tmp, &shares, list) {
+               struct peer *s;
+               list_for_each_entry(s, &i->peers, list) {
+                       if (s->vcpu->domain == domain) {
+                               free_peer_users(s);
+                               free_peer(s);
+                               break;
+                       }
+               }
+       }
+       spin_unlock(&share_lock);
+}
+
+int share_dump(void)
+{
+#if 0
+       struct share *s;
+
+       spin_lock(&share_lock);
+       list_for_each_entry(s, &shares, list) {
+               int i;
+               struct peer *p;
+               struct watch *w;
+
+               printk("%i: Dumping share def for %#lx(destroy==%i)[%p]\n", 
+                               current->domain->domain_id,
+                               s->share_ref, s->destroy,
+                               page_to_virt(s->page));
+
+               for(i=0; i < s->num_granted; i++)
+                       printk("\tGranted to Domain %i\n", 
+                                       s->granted[i]->domain_id);
+
+               list_for_each_entry(p, &s->peers, list) {
+                       struct sg_list *sg;
+
+                       printk("\tHas peer %i(share %s match)\n", p->id,
+                                       (p->share==s?"does":"doesn't"));
+
+                       list_for_each_entry(sg, &p->sgs, list) {
+                               printk("\t\tRegistered sg [len_ptr==%#lx, "
+                                      "direction==%i, num_sg==%i]\n",
+                                               (unsigned long)sg->len_pointer,
+                                               sg->direction, sg->num_sg);
+                       }
+               }
+
+               list_for_each_entry(w, &s->watches, list) {
+                       printk("\tHas watch [trigger==%u, decrement==%i]\n",
+                                       w->trigger, atomic_read(w->decrement));
+                       printk("\tOwner is peer %i\n", w->owner->id);
+
+               }
+       }
+       spin_unlock(&share_lock);
+#endif
+       return 0;
+}
+
+static inline int xen_share_sg_arg_count(unsigned long arg)
+{
+       return (arg & 0xFFFF) >> 2;
+}
+
+static inline int xen_share_sg_arg_dir(unsigned long arg)
+{
+       return arg & XEN_SG_DIR;
+}
+
+static inline int xen_share_sg_arg_queue(unsigned long arg)
+{
+       return arg >> 16;
+}
+
+long do_share_op(unsigned int cmd,
+                unsigned long arg1, unsigned long arg2, unsigned long arg3,
+                unsigned long arg4)
+{
+       switch (cmd) {
+       case XEN_SHARE_get:
+               return share_get(arg1, arg2);
+       case XEN_SHARE_drop:
+               return share_drop(arg1);
+       case XEN_SHARE_watch:
+               return share_watch(arg1, arg2, arg3);
+       case XEN_SHARE_unwatch:
+               return share_unwatch(arg1, arg2);
+       case XEN_SHARE_trigger:
+               return share_trigger(arg1, arg2);
+       case XEN_SHARE_sg_register:
+               return sg_register(arg1, xen_share_sg_arg_queue(arg2),
+                                  xen_share_sg_arg_count(arg2), 
+                                  xen_share_sg_arg_dir(arg2), 
+                                  (struct xen_sg *)arg3, arg4);
+       case XEN_SHARE_sg_unregister:
+               return sg_unregister(arg1, arg2);
+       case XEN_SHARE_sg_xfer:
+               return sg_xfer(arg1, xen_share_sg_arg_count(arg2), 
+                              xen_share_sg_arg_dir(arg2),
+                              xen_share_sg_arg_queue(arg2),
+                              (struct xen_sg *)arg3); 
+       case XEN_SHARE_dump:
+               return share_dump();
+       default:
+               return -ENOSYS;
+       }
+}
diff -r d5f98d23427a xen/include/asm-x86/share.h
--- /dev/null   Tue May 30 10:44:23 2006
+++ b/xen/include/asm-x86/share.h       Wed May 31 17:39:54 2006
@@ -0,0 +1,7 @@
+#ifndef __XEN_ASM_SHARE_H
+#define __XEN_ASM_SHARE_H
+
+struct page_info *arch_alloc_shared_pages(unsigned int order, share_ref_t *ref);
+void arch_free_shared_pages(struct page_info *page, unsigned int order);
+
+#endif /* __XEN_ASM_SHARE_H */
diff -r d5f98d23427a xen/include/public/share.h
--- /dev/null   Tue May 30 10:44:23 2006
+++ b/xen/include/public/share.h        Wed May 31 17:39:54 2006
@@ -0,0 +1,48 @@
+/* Simple share page ops for Xen. */
+#ifndef __XEN_PUBLIC_SHARE_H__
+#define __XEN_PUBLIC_SHARE_H__
+
+/* Operations to share/unshare memory. */
+/* The share reference */
+typedef unsigned long share_ref_t;
+
+/* int get(share_ref, port).  Returns unique peer id. */
+#define XEN_SHARE_get          0
+/* void drop(share_ref, peerid) */
+#define XEN_SHARE_drop         1
+
+/* Watch and trigger operations */
+/* irq_t watch(share_ref, u32 triggernum, physaddr_t decaddr) */
+#define XEN_SHARE_watch                2
+/* void unwatch(share_ref, u32 triggernum) */
+#define XEN_SHARE_unwatch      3
+/* int trigger(share_ref, u32 triggernum) */
+#define XEN_SHARE_trigger      4
+
+/* Scatter-gather operations. */
+#define XEN_SG_IN      0x01
+#define XEN_SG_OUT     0x02
+#define XEN_SG_DIR     (XEN_SG_IN|XEN_SG_OUT)
+
+/* Maximum number of sg elements. */
+#define XEN_SG_MAX     16
+
+struct xen_sg
+{
+       unsigned long addr, len;
+};
+
+/* We combine the count, queue and direction: bits 0-1 are the direction
+ * (XEN_SG_IN/OUT), bits 2-15 the count, and the top 16 bits the queue number. */
+#define xen_share_sg_arg(queue, count, dir) ((queue) << 16 | ((count) << 2) | (dir))
+
+/* int sg_register(share_ref, queue_count_dir, struct xen_sg *sgs, physaddr_t len_addr). */
+#define XEN_SHARE_sg_register  5
+/* int sg_unregister(share_ref, memory_t first_sg_addr). */
+#define XEN_SHARE_sg_unregister        6
+/* int sg_xfer(share_ref, queue_count_dir, struct xen_sg *sgs) */
+#define XEN_SHARE_sg_xfer      7
+/* int share_dump(void) */
+#define XEN_SHARE_dump         8
+
+#endif /* __XEN_PUBLIC_SHARE_H__ */
diff -r d5f98d23427a xen/include/xen/share.h
--- /dev/null   Tue May 30 10:44:23 2006
+++ b/xen/include/xen/share.h   Wed May 31 17:39:54 2006
@@ -0,0 +1,13 @@
+#ifndef _XEN_SHARE_H
+#define _XEN_SHARE_H
+#include <public/share.h>
+
+struct domain;
+/* DOM0 ops */
+share_ref_t create_shared_pages(unsigned int num);
+int grant_shared_pages(share_ref_t share_ref, struct domain *d);
+int destroy_shared_pages(share_ref_t share_ref);
+
+/* Domain is dying, release shares */
+void free_shares(struct domain *domain);
+#endif

-- 
 ccontrol: http://ccontrol.ozlabs.org

