
[Xen-devel] [PATCH 2/3] xen-sockback: add support of the xen-sockback driver

This driver allows server applications running in the dom0/domD
domain to use xensock sockets.
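
For illustration only, a server in the backend domain could use such a
socket roughly as sketched below. This is a sketch, not part of the
patch: the AF_XENSOCK value and the struct sockaddr_xensock layout are
placeholders, since the real address family and address format are
provided by the xensock protocol support elsewhere in this series.

  /* Hypothetical userspace sketch; AF_XENSOCK and sockaddr_xensock are
   * placeholders for whatever the xensock protocol patches define. */
  #include <string.h>
  #include <unistd.h>
  #include <sys/socket.h>

  #ifndef AF_XENSOCK
  #define AF_XENSOCK AF_MAX               /* placeholder value only */
  #endif

  struct sockaddr_xensock {               /* placeholder address layout */
          sa_family_t sxs_family;         /* AF_XENSOCK (assumed) */
          unsigned int sxs_port;          /* hypothetical port/id field */
  };

  static int run_echo_server(void)
  {
          struct sockaddr_xensock addr;
          char buf[128];
          ssize_t n;
          int lsock, csock;

          lsock = socket(AF_XENSOCK, SOCK_STREAM, 0);
          if (lsock < 0)
                  return -1;

          memset(&addr, 0, sizeof(addr));
          addr.sxs_family = AF_XENSOCK;
          /* fill in the remaining xensock-specific address fields here */

          if (bind(lsock, (struct sockaddr *)&addr, sizeof(addr)) ||
              listen(lsock, 1)) {
                  close(lsock);
                  return -1;
          }

          /* A guest domain connects through the xen-sockfront driver. */
          csock = accept(lsock, NULL, NULL);
          if (csock >= 0) {
                  /* Echo whatever the frontend sends back to it. */
                  while ((n = read(csock, buf, sizeof(buf))) > 0)
                          if (write(csock, buf, n) != n)
                                  break;
                  close(csock);
          }

          close(lsock);
          return 0;
  }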

Signed-off-by: Oleksandr Dmytryshyn <oleksandr.dmytryshyn@xxxxxxxxxxxxxxx>
---
 drivers/net/Kconfig                  |   19 +
 drivers/net/Makefile                 |    1 +
 drivers/net/xen-sockback/Makefile    |    3 +
 drivers/net/xen-sockback/common.h    |  150 ++++
 drivers/net/xen-sockback/interface.c |  245 ++++++
 drivers/net/xen-sockback/sockback.c  | 1527 ++++++++++++++++++++++++++++++++++
 drivers/net/xen-sockback/xenbus.c    |  348 ++++++++
 include/xen/interface/io/sockif.h    |   74 ++
 8 files changed, 2367 insertions(+)
 create mode 100644 drivers/net/xen-sockback/Makefile
 create mode 100644 drivers/net/xen-sockback/common.h
 create mode 100644 drivers/net/xen-sockback/interface.c
 create mode 100644 drivers/net/xen-sockback/sockback.c
 create mode 100644 drivers/net/xen-sockback/xenbus.c
 create mode 100644 include/xen/interface/io/sockif.h
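
The shared-ring types used by the code below (xen_sockif_tx_back_ring,
xen_sockif_rx_back_ring and the __CONST_RING_SIZE() sizing in common.h)
are generated by the new include/xen/interface/io/sockif.h, which is not
quoted in this excerpt. A rough, non-authoritative sketch of what such a
header would contain, with the request/response fields inferred from how
sockback.c uses them, is:

  /* Sketch only: the real layout is whatever sockif.h in this patch
   * defines.  Field names are inferred from sockback.c; the flag and
   * response-status constants are omitted. */
  #include <xen/interface/io/ring.h>
  #include <xen/interface/grant_table.h>

  struct xen_sockif_tx_request {
          grant_ref_t gref;       /* granted page holding the payload   */
          uint16_t offset;        /* offset of the data within the page */
          uint16_t flags;         /* XEN_SOCKTXF_ flags                 */
          uint16_t id;            /* echoed in the matching response    */
          uint16_t size;          /* payload size in bytes              */
  };

  struct xen_sockif_tx_response {
          uint16_t id;
          int16_t  status;        /* XEN_SOCKIF_RSP_ status             */
  };

  struct xen_sockif_rx_request {
          uint16_t id;
          grant_ref_t gref;       /* page the backend copies into       */
  };

  struct xen_sockif_rx_response {
          uint16_t id;
          uint16_t offset;
          uint16_t flags;         /* XEN_SOCKRXF_ flags                 */
          int16_t  status;        /* byte count, or an error code       */
  };

  /* Generates xen_sockif_tx_sring, xen_sockif_tx_back_ring, etc., as
   * used by common.h and sockback.c. */
  DEFINE_RING_TYPES(xen_sockif_tx,
                    struct xen_sockif_tx_request,
                    struct xen_sockif_tx_response);
  DEFINE_RING_TYPES(xen_sockif_rx,
                    struct xen_sockif_rx_request,
                    struct xen_sockif_rx_response);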

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 420981a..e643f42 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -348,9 +348,28 @@ config XEN_NETDEV_BACKEND
          compile this driver as a module, chose M here: the module
          will be called xen-netback.
 
+config XEN_SOCKDEV_BACKEND
+       tristate "Xen backend socket device"
+       depends on XEN_BACKEND
+       select XEN_SOCKDEV_PROTO
+       help
+         This driver allows the kernel to act as a Xen socket driver
+         domain which exports paravirtual socket devices to other
+         Xen domains. These devices can be accessed by any operating
+         system that implements a compatible front end.
+
+         The corresponding Linux frontend driver is enabled by the
+         CONFIG_XEN_SOCKDEV_FRONTEND configuration option.
+
+         If you are compiling a kernel to run in a Xen socket driver
+         domain (often this is domain 0) you should say Y here. To
+         compile this driver as a module, choose M here: the module
+         will be called xen-sockback.
+
 config XEN_SOCKDEV_PROTO
        bool
        default n
+       depends on XEN_SOCKDEV_BACKEND
 
 config VMXNET3
        tristate "VMware VMXNET3 ethernet driver"
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 43bf910..96c6c97 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
 obj-$(CONFIG_XEN_NETDEV_BACKEND) += xen-netback/
 
 obj-$(CONFIG_XEN_SOCKDEV_PROTO) += xensock/
+obj-$(CONFIG_XEN_SOCKDEV_BACKEND) += xen-sockback/
 
 obj-$(CONFIG_USB_CATC)          += usb/
 obj-$(CONFIG_USB_KAWETH)        += usb/
diff --git a/drivers/net/xen-sockback/Makefile b/drivers/net/xen-sockback/Makefile
new file mode 100644
index 0000000..c255867
--- /dev/null
+++ b/drivers/net/xen-sockback/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_XEN_SOCKDEV_BACKEND) := xen-sockback.o
+
+xen-sockback-y := sockback.o xenbus.o interface.o
diff --git a/drivers/net/xen-sockback/common.h b/drivers/net/xen-sockback/common.h
new file mode 100644
index 0000000..b5b09bb
--- /dev/null
+++ b/drivers/net/xen-sockback/common.h
@@ -0,0 +1,150 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_SOCKBACK__COMMON_H__
+#define __XEN_SOCKBACK__COMMON_H__
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/io.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+
+#include <xen/interface/io/sockif.h>
+#include <xen/interface/grant_table.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+
+#include <net/xensock.h>
+#include <net/af_xensock.h>
+
+struct xen_sockbk;
+
+struct xenvsock {
+       /* Unique identifier for this interface. */
+       domid_t          domid;
+
+       /* Reference to xensock processing backend. */
+       struct xen_sockbk *sockbk;
+
+       /* Physical parameters of the comms window. */
+       unsigned int     irq;
+
+       /* Back pointer to the xenbus_device. */
+       struct xenbus_device            *xbdev;
+
+       /* List of frontends to notify after a batch of frames sent. */
+       struct list_head notify_list;
+
+       /* The shared rings and indexes. */
+       struct xen_sockif_tx_back_ring tx;
+       struct xen_sockif_rx_back_ring rx;
+
+       /* Internal feature information. */
+       u8 can_queue:1;     /* can queue packets for receiver? */
+
+       /* Allow xenvsock_start_xmit() to peek ahead in the rx request
+        * ring.  This is a prediction of what rx_req_cons will be
+        * once all queued skbs are put on the ring.
+        */
+       RING_IDX rx_req_cons_peek;
+
+       /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
+       unsigned long   credit_bytes;
+       unsigned long   credit_usec;
+       unsigned long   remaining_credit;
+       struct timer_list credit_timeout;
+
+       /* Statistics */
+       unsigned long rx_gso_checksum_fixup;
+
+       /* Miscellaneous private stuff. */
+       struct list_head schedule_list;
+       atomic_t         refcnt;
+       struct xen_sock_dev *dev;
+
+       wait_queue_head_t waiting_to_free;
+};
+
+#define XEN_SOCKIF_TX_RING_SIZE __CONST_RING_SIZE(xen_sockif_tx, PAGE_SIZE)
+#define XEN_SOCKIF_RX_RING_SIZE __CONST_RING_SIZE(xen_sockif_rx, PAGE_SIZE)
+
+struct xenvsock *xenvsock_alloc(struct device *parent,
+                               domid_t domid,
+                               unsigned int handle);
+
+int xenvsock_connect(struct xenvsock *vsock, unsigned long tx_ring_ref,
+                    unsigned long rx_ring_ref, unsigned int evtchn);
+void xenvsock_disconnect(struct xenvsock *vsock);
+
+void xenvsock_get(struct xenvsock *vsock);
+void xenvsock_put(struct xenvsock *vsock);
+
+int xenvsock_xenbus_init(void);
+
+int xenvsock_schedulable(struct xenvsock *vsock);
+
+int xen_sockbk_rx_ring_full(struct xenvsock *vsock);
+
+int xen_sockbk_must_stop_queue(struct xenvsock *vsock);
+
+/* (Un)Map communication rings. */
+void xen_sockbk_unmap_frontend_rings(struct xenvsock *vsock);
+int xen_sockbk_map_frontend_rings(struct xenvsock *vsock,
+                                 grant_ref_t tx_ring_ref,
+                                 grant_ref_t rx_ring_ref);
+
+/* (De)Register a xenvsock with the xensock backend. */
+void xen_sockbk_add_xenvsock(struct xenvsock *vsock);
+void xen_sockbk_remove_xenvsock(struct xenvsock *vsock);
+
+/* (De)Schedule backend processing for a xenvsock */
+void xen_sockbk_schedule_xenvsock(struct xenvsock *vsock);
+void xen_sockbk_deschedule_xenvsock(struct xenvsock *vsock);
+
+/* Check for SKBs from frontend and schedule backend processing */
+void xen_sockbk_check_rx_xenvsock(struct xenvsock *vsock);
+/* Receive an SKB from the frontend */
+void xenvsock_receive_skb(struct xenvsock *vsock, struct sk_buff *skb);
+
+/* Queue an SKB for transmission to the frontend */
+void xen_sockbk_queue_tx_skb(struct xenvsock *vsock, struct sk_buff *skb);
+/* Notify xenvsock that ring now has space to send an skb to the frontend */
+void xenvsock_notify_tx_completion(struct xenvsock *vsock);
+
+/* Prevent the device from generating any further traffic. */
+void xenvsock_carrier_off(struct xenvsock *vsock);
+
+/* Returns number of ring slots required to send an skb to the frontend */
+unsigned int xen_sockbk_count_skb_slots(struct xenvsock *vsock,
+                                       struct sk_buff *skb);
+
+#endif /* __XEN_SOCKBACK__COMMON_H__ */
diff --git a/drivers/net/xen-sockback/interface.c b/drivers/net/xen-sockback/interface.c
new file mode 100644
index 0000000..995fb22
--- /dev/null
+++ b/drivers/net/xen-sockback/interface.c
@@ -0,0 +1,245 @@
+/*
+ * Xenvsock-device interface management.
+ *
+ * Copyright (c) 2004-2005, Keir Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+
+#include <xen/events.h>
+#include <asm/xen/hypercall.h>
+
+#define XENVSOCK_QUEUE_LENGTH 32
+
+void xenvsock_get(struct xenvsock *vsock)
+{
+       atomic_inc(&vsock->refcnt);
+}
+
+void xenvsock_put(struct xenvsock *vsock)
+{
+       if (atomic_dec_and_test(&vsock->refcnt))
+               wake_up(&vsock->waiting_to_free);
+}
+
+int xenvsock_schedulable(struct xenvsock *vsock)
+{
+       return sockif_carrier_ok(vsock->dev);
+}
+
+static int xenvsock_rx_schedulable(struct xenvsock *vsock)
+{
+       return xenvsock_schedulable(vsock) && !xen_sockbk_rx_ring_full(vsock);
+}
+
+static irqreturn_t xenvsock_interrupt(int irq, void *dev_id)
+{
+       struct xenvsock *vsock = dev_id;
+
+       if (vsock->sockbk == NULL)
+               return IRQ_NONE;
+
+       xen_sockbk_schedule_xenvsock(vsock);
+
+       if (xenvsock_rx_schedulable(vsock))
+               sockif_wake_queue(vsock->dev);
+
+       return IRQ_HANDLED;
+}
+
+static int xenvsock_start_xmit(struct sk_buff *skb, struct xen_sock_dev *dev)
+{
+       struct xenvsock *vsock = xensock_dev_priv(dev);
+
+       if (vsock->sockbk == NULL)
+               goto drop;
+
+       /* Drop the packet if the target domain has no receive buffers. */
+       if (!xenvsock_rx_schedulable(vsock))
+               goto drop;
+
+       skb->dev = (struct net_device *)dev;
+
+       /* Reserve ring slots for the worst-case number of fragments. */
+       vsock->rx_req_cons_peek += xen_sockbk_count_skb_slots(vsock, skb);
+       xenvsock_get(vsock);
+
+       if (vsock->can_queue && xen_sockbk_must_stop_queue(vsock))
+               sockif_stop_queue(dev);
+
+       xen_sockbk_queue_tx_skb(vsock, skb);
+
+       return 0;
+
+ drop:
+       dev_kfree_skb(skb);
+       return 0;
+}
+
+void xenvsock_receive_skb(struct xenvsock *vsock, struct sk_buff *skb)
+{
+       xensock_dev_queue_rx_skb(skb, vsock->dev);
+}
+
+void xenvsock_notify_tx_completion(struct xenvsock *vsock)
+{
+       if (sockif_queue_stopped(vsock->dev) && xenvsock_rx_schedulable(vsock))
+               sockif_wake_queue(vsock->dev);
+}
+
+static void xenvsock_up(struct xenvsock *vsock)
+{
+       xen_sockbk_add_xenvsock(vsock);
+       enable_irq(vsock->irq);
+       xen_sockbk_check_rx_xenvsock(vsock);
+}
+
+static void xenvsock_down(struct xenvsock *vsock)
+{
+       disable_irq(vsock->irq);
+       del_timer_sync(&vsock->credit_timeout);
+       xen_sockbk_deschedule_xenvsock(vsock);
+       xen_sockbk_remove_xenvsock(vsock);
+}
+
+static const struct xenvsock_stat {
+       char name[ETH_GSTRING_LEN];
+       u16 offset;
+} xenvsock_stats[] = {
+       {
+               "rx_gso_checksum_fixup",
+               offsetof(struct xenvsock, rx_gso_checksum_fixup)
+       },
+};
+
+struct xenvsock *xenvsock_alloc(struct device *parent, domid_t domid,
+                               unsigned int handle)
+{
+       int err;
+       struct xen_sock_dev *dev;
+       struct xenvsock *vsock;
+       char name[IFNAMSIZ] = {};
+
+       snprintf(name, IFNAMSIZ - 1, "vsock%u.%u", domid, handle);
+       dev = alloc_xen_sock_dev(sizeof(struct xenvsock), name);
+       if (dev == NULL) {
+               pr_warn("Could not allocate sockdev\n");
+               return ERR_PTR(-ENOMEM);
+       }
+
+       vsock = xensock_dev_priv(dev);
+       vsock->domid  = domid;
+       vsock->sockbk  = NULL;
+       atomic_set(&vsock->refcnt, 1);
+       init_waitqueue_head(&vsock->waiting_to_free);
+       vsock->dev = dev;
+       INIT_LIST_HEAD(&vsock->schedule_list);
+       INIT_LIST_HEAD(&vsock->notify_list);
+
+       vsock->credit_bytes = ~0UL;
+       vsock->remaining_credit = ~0UL;
+       vsock->credit_usec  = 0UL;
+       init_timer(&vsock->credit_timeout);
+       /* Initialize 'expires' now: it's used to track the credit window. */
+       vsock->credit_timeout.expires = jiffies;
+
+       dev->start_xmit = xenvsock_start_xmit;
+
+       dev->tx_queue_len = XENVSOCK_QUEUE_LENGTH;
+
+       sockif_carrier_off(dev);
+
+       err = xensock_register_dev(dev);
+       if (err) {
+               pr_warn("Could not register device: err=%d\n", err);
+               free_xen_sock_dev(dev);
+               return ERR_PTR(err);
+       }
+
+       pr_warn("Successfully created xenvsock\n");
+       return vsock;
+}
+
+int xenvsock_connect(struct xenvsock *vsock, unsigned long tx_ring_ref,
+                    unsigned long rx_ring_ref, unsigned int evtchn)
+{
+       int err = -ENOMEM;
+
+       /* Already connected through? */
+       if (vsock->irq)
+               return 0;
+
+       err = xen_sockbk_map_frontend_rings(vsock, tx_ring_ref, rx_ring_ref);
+       if (err < 0)
+               goto err;
+
+       err = bind_interdomain_evtchn_to_irqhandler(
+               vsock->domid, evtchn, xenvsock_interrupt, 0,
+               vsock->dev->name, vsock);
+       if (err < 0)
+               goto err_unmap;
+       vsock->irq = err;
+       disable_irq(vsock->irq);
+
+       xenvsock_get(vsock);
+
+       sockif_carrier_on(vsock->dev);
+       xenvsock_up(vsock);
+
+       return 0;
+err_unmap:
+       xen_sockbk_unmap_frontend_rings(vsock);
+err:
+       return err;
+}
+
+void xenvsock_carrier_off(struct xenvsock *vsock)
+{
+       struct xen_sock_dev *dev = vsock->dev;
+
+       sockif_carrier_off(dev); /* discard queued packets */
+       xenvsock_down(vsock);
+       xenvsock_put(vsock);
+}
+
+void xenvsock_disconnect(struct xenvsock *vsock)
+{
+       if (sockif_carrier_ok(vsock->dev))
+               xenvsock_carrier_off(vsock);
+
+       atomic_dec(&vsock->refcnt);
+       wait_event(vsock->waiting_to_free, atomic_read(&vsock->refcnt) == 0);
+
+       if (vsock->irq)
+               unbind_from_irqhandler(vsock->irq, vsock);
+
+       xensock_unregister_dev(vsock->dev);
+
+       xen_sockbk_unmap_frontend_rings(vsock);
+
+       free_xen_sock_dev(vsock->dev);
+}
diff --git a/drivers/net/xen-sockback/sockback.c b/drivers/net/xen-sockback/sockback.c
new file mode 100644
index 0000000..84f5848
--- /dev/null
+++ b/drivers/net/xen-sockback/sockback.c
@@ -0,0 +1,1527 @@
+/*
+ * Back-end of the driver for xensock devices. A
+ * reference front-end implementation can be found in:
+ *  drivers/net/xen-sockfront.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+
+#include <linux/kthread.h>
+#include <linux/if_vlan.h>
+
+#include <net/tcp.h>
+
+#include <xen/xen.h>
+#include <xen/events.h>
+#include <xen/interface/memory.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+struct pending_tx_info {
+       struct xen_sockif_tx_request req;
+       struct xenvsock *vsock;
+};
+typedef unsigned int pending_ring_idx_t;
+
+struct sockbk_rx_meta {
+       int id;
+       int size;
+};
+
+#define SOCK_DEV_SKB(skb)      ((struct xen_sock_dev *)((skb)->dev))
+
+#define MAX_PENDING_REQS 256
+
+/* Discriminate from any valid pending_idx value. */
+#define INVALID_PENDING_IDX 0xFFFF
+
+#define MAX_BUFFER_OFFSET PAGE_SIZE
+
+/* extra field used in struct page */
+union page_ext {
+       struct {
+#if BITS_PER_LONG < 64
+#define IDX_WIDTH   8
+#define GROUP_WIDTH (BITS_PER_LONG - IDX_WIDTH)
+               unsigned int group:GROUP_WIDTH;
+               unsigned int idx:IDX_WIDTH;
+#else
+               unsigned int group, idx;
+#endif
+       } e;
+       void *mapping;
+};
+
+struct xen_sockbk {
+       wait_queue_head_t wq;
+       struct task_struct *task;
+
+       struct sk_buff_head rx_queue;
+       struct sk_buff_head tx_queue;
+
+       struct timer_list sock_timer;
+
+       struct page *mmap_pages[MAX_PENDING_REQS];
+
+       pending_ring_idx_t pending_prod;
+       pending_ring_idx_t pending_cons;
+       struct list_head sock_schedule_list;
+
+       /* Protect the sock_schedule_list in sockbk. */
+       spinlock_t sock_schedule_list_lock;
+
+       atomic_t sockfront_count;
+
+       struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
+       struct gnttab_copy tx_copy_ops[MAX_PENDING_REQS];
+
+       u16 pending_ring[MAX_PENDING_REQS];
+
+       /* Given MAX_BUFFER_OFFSET of 4096 the worst case is that each
+        * head/fragment page uses 2 copy operations because it
+        * straddles two buffers in the frontend.
+        */
+       struct gnttab_copy grant_copy_op[2*XEN_SOCKIF_RX_RING_SIZE];
+       struct sockbk_rx_meta meta[2*XEN_SOCKIF_RX_RING_SIZE];
+};
+
+static struct xen_sockbk *xen_sockbk;
+static int xen_sockbk_group_nr;
+
+void xen_sockbk_add_xenvsock(struct xenvsock *vsock)
+{
+       int i;
+       int min_sockfront_count;
+       int min_group = 0;
+       struct xen_sockbk *sockbk;
+
+       min_sockfront_count = atomic_read(&xen_sockbk[0].sockfront_count);
+       for (i = 0; i < xen_sockbk_group_nr; i++) {
+               int sockfront_count;
+
+               sockfront_count = atomic_read(&xen_sockbk[i].sockfront_count);
+               if (sockfront_count < min_sockfront_count) {
+                       min_group = i;
+                       min_sockfront_count = sockfront_count;
+               }
+       }
+
+       sockbk = &xen_sockbk[min_group];
+
+       vsock->sockbk = sockbk;
+       atomic_inc(&sockbk->sockfront_count);
+}
+
+void xen_sockbk_remove_xenvsock(struct xenvsock *vsock)
+{
+       struct xen_sockbk *sockbk = vsock->sockbk;
+
+       vsock->sockbk = NULL;
+       atomic_dec(&sockbk->sockfront_count);
+}
+
+static void xen_sockbk_idx_release(struct xen_sockbk *sockbk, u16 pending_idx,
+                                  u8 status);
+static void make_tx_response(struct xenvsock *vsock,
+                            struct xen_sockif_tx_request *txp,
+                            s8       st);
+static struct xen_sockif_rx_response *make_rx_response(struct xenvsock *vsock,
+                                                      u16 id, s8 st,
+                                                      u16 offset,
+                                                      u16 size,
+                                                      u16 flags);
+
+static inline unsigned long idx_to_pfn(struct xen_sockbk *sockbk,
+                                      u16 idx)
+{
+       return page_to_pfn(sockbk->mmap_pages[idx]);
+}
+
+static inline unsigned long idx_to_kaddr(struct xen_sockbk *sockbk,
+                                        u16 idx)
+{
+       return (unsigned long)pfn_to_kaddr(idx_to_pfn(sockbk, idx));
+}
+
+/* extra field used in struct page */
+static inline void set_page_ext(struct page *pg, struct xen_sockbk *sockbk,
+                               unsigned int idx)
+{
+       unsigned int group = sockbk - xen_sockbk;
+       union page_ext ext = { .e = { .group = group + 1, .idx = idx } };
+
+       BUILD_BUG_ON(sizeof(ext) > sizeof(ext.mapping));
+       pg->mapping = ext.mapping;
+}
+
+static int get_page_ext(struct page *pg,
+                       unsigned int *pgroup, unsigned int *pidx)
+{
+       union page_ext ext = { .mapping = pg->mapping };
+       struct xen_sockbk *sockbk;
+       unsigned int group, idx;
+
+       group = ext.e.group - 1;
+
+       if (group < 0 || group >= xen_sockbk_group_nr)
+               return 0;
+
+       sockbk = &xen_sockbk[group];
+
+       idx = ext.e.idx;
+
+       if ((idx < 0) || (idx >= MAX_PENDING_REQS))
+               return 0;
+
+       if (sockbk->mmap_pages[idx] != pg)
+               return 0;
+
+       *pgroup = group;
+       *pidx = idx;
+
+       return 1;
+}
+
+/* This is the amount of packet we copy rather than map, so that the
+ * guest can't fiddle with the contents of the headers while we do
+ * packet processing on them.
+ */
+#define PKT_PROT_LEN    (ETH_HLEN + \
+                        VLAN_HLEN + \
+                        sizeof(struct iphdr) + MAX_IPOPTLEN + \
+                        sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE)
+
+static u16 frag_get_pending_idx(skb_frag_t *frag)
+{
+       return (u16)frag->page_offset;
+}
+
+static void frag_set_pending_idx(skb_frag_t *frag, u16 pending_idx)
+{
+       frag->page_offset = pending_idx;
+}
+
+static inline pending_ring_idx_t pending_index(unsigned i)
+{
+       return i & (MAX_PENDING_REQS-1);
+}
+
+static inline pending_ring_idx_t nr_pending_reqs(struct xen_sockbk *sockbk)
+{
+       return MAX_PENDING_REQS -
+               sockbk->pending_prod + sockbk->pending_cons;
+}
+
+static void xen_sockbk_kick_thread(struct xen_sockbk *sockbk)
+{
+       wake_up(&sockbk->wq);
+}
+
+static int max_required_rx_slots(struct xenvsock *vsock)
+{
+       int max = DIV_ROUND_UP(XENSOCK_MTU, PAGE_SIZE);
+       return max;
+}
+
+int xen_sockbk_rx_ring_full(struct xenvsock *vsock)
+{
+       RING_IDX peek   = vsock->rx_req_cons_peek;
+       RING_IDX needed = max_required_rx_slots(vsock);
+       RING_IDX rsp_prod = vsock->rx.rsp_prod_pvt + XEN_SOCKIF_RX_RING_SIZE;
+
+       return ((vsock->rx.sring->req_prod - peek) < needed) ||
+              ((rsp_prod - peek) < needed);
+}
+
+int xen_sockbk_must_stop_queue(struct xenvsock *vsock)
+{
+       if (!xen_sockbk_rx_ring_full(vsock))
+               return 0;
+
+       vsock->rx.sring->req_event = vsock->rx_req_cons_peek +
+               max_required_rx_slots(vsock);
+       mb(); /* request notification /then/ check the queue */
+
+       return xen_sockbk_rx_ring_full(vsock);
+}
+
+/* Returns true if we should start a new receive buffer instead of
+ * adding 'size' bytes to a buffer which currently contains 'offset'
+ * bytes.
+ */
+static bool start_new_rx_buffer(int offset, unsigned long size, int head)
+{
+       /* simple case: we have completely filled the current buffer. */
+       if (offset == MAX_BUFFER_OFFSET)
+               return true;
+
+       /* complex case: start a fresh buffer if the current frag
+        * would overflow the current buffer but only if:
+        *     (i)   this frag would fit completely in the next buffer
+        * and (ii)  there is already some data in the current buffer
+        * and (iii) this is not the head buffer.
+        *
+        * Where:
+        * - (i) stops us splitting a frag into two copies
+        *   unless the frag is too large for a single buffer.
+        * - (ii) stops us from leaving a buffer pointlessly empty.
+        * - (iii) stops us leaving the first buffer
+        *   empty. Strictly speaking this is already covered
+        *   by (ii) but is explicitly checked because
+        *   sock front relies on the first buffer being
+        *   non-empty and can crash otherwise.
+        *
+        * This means we will effectively linearise small
+        * frags but do not needlessly split large buffers
+        * into multiple copies, tending to give large frags
+        * their own buffers as before.
+        */
+       if ((offset + size > MAX_BUFFER_OFFSET) &&
+           (size <= MAX_BUFFER_OFFSET) && offset && !head)
+               return true;
+
+       return false;
+}
+
+/* Figure out how many ring slots we're going to need to send @skb to
+ * the guest. This function is essentially a dry run of
+ * sockbk_gop_frag_copy.
+ */
+unsigned int xen_sockbk_count_skb_slots(struct xenvsock *vsock,
+                                       struct sk_buff *skb)
+{
+       unsigned int count;
+       int i, copy_off;
+
+       count = DIV_ROUND_UP(skb_headlen(skb), PAGE_SIZE);
+
+       copy_off = skb_headlen(skb) % PAGE_SIZE;
+
+       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+               unsigned long size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+               unsigned long offset = skb_shinfo(skb)->frags[i].page_offset;
+               unsigned long bytes;
+
+               offset &= ~PAGE_MASK;
+
+               while (size > 0) {
+                       BUG_ON(offset >= PAGE_SIZE);
+                       BUG_ON(copy_off > MAX_BUFFER_OFFSET);
+
+                       bytes = PAGE_SIZE - offset;
+
+                       if (bytes > size)
+                               bytes = size;
+
+                       if (start_new_rx_buffer(copy_off, bytes, 0)) {
+                               count++;
+                               copy_off = 0;
+                       }
+
+                       if (copy_off + bytes > MAX_BUFFER_OFFSET)
+                               bytes = MAX_BUFFER_OFFSET - copy_off;
+
+                       copy_off += bytes;
+
+                       offset += bytes;
+                       size -= bytes;
+
+                       if (offset == PAGE_SIZE)
+                               offset = 0;
+               }
+       }
+       return count;
+}
+
+struct sockrx_pending_operations {
+       unsigned copy_prod, copy_cons;
+       unsigned meta_prod, meta_cons;
+       struct gnttab_copy *copy;
+       struct sockbk_rx_meta *meta;
+       int copy_off;
+       grant_ref_t copy_gref;
+};
+
+static
+struct sockbk_rx_meta *get_next_rx_buf(struct xenvsock *vsock,
+                                      struct sockrx_pending_operations *spo)
+{
+       struct sockbk_rx_meta *meta;
+       struct xen_sockif_rx_request *req;
+
+       req = RING_GET_REQUEST(&vsock->rx, vsock->rx.req_cons++);
+
+       meta = spo->meta + spo->meta_prod++;
+       meta->size = 0;
+       meta->id = req->id;
+
+       spo->copy_off = 0;
+       spo->copy_gref = req->gref;
+
+       return meta;
+}
+
+/* Set up the grant operations for this fragment. If it's a flipping
+ * interface, we also set up the unmap request from here.
+ */
+static void sockbk_gop_frag_copy(struct xenvsock *vsock, struct sk_buff *skb,
+                                struct sockrx_pending_operations *spo,
+                                struct page *page, unsigned long size,
+                                unsigned long offset, int *head)
+{
+       struct gnttab_copy *copy_gop;
+       struct sockbk_rx_meta *meta;
+       /* These variables are used iff get_page_ext returns true,
+        * in which case they are guaranteed to be initialized.
+        */
+       unsigned int uninitialized_var(group), uninitialized_var(idx);
+       int foreign = get_page_ext(page, &group, &idx);
+       unsigned long bytes;
+
+       /* Data must not cross a page boundary. */
+       BUG_ON(size + offset > PAGE_SIZE<<compound_order(page));
+
+       meta = spo->meta + spo->meta_prod - 1;
+
+       /* Skip unused frames from start of page */
+       page += offset >> PAGE_SHIFT;
+       offset &= ~PAGE_MASK;
+
+       while (size > 0) {
+               BUG_ON(offset >= PAGE_SIZE);
+               BUG_ON(spo->copy_off > MAX_BUFFER_OFFSET);
+
+               bytes = PAGE_SIZE - offset;
+
+               if (bytes > size)
+                       bytes = size;
+
+               if (start_new_rx_buffer(spo->copy_off, bytes, *head)) {
+                       /* Sockfront requires there to be some data in the head
+                        * buffer.
+                        */
+                       BUG_ON(*head);
+
+                       meta = get_next_rx_buf(vsock, spo);
+               }
+
+               if (spo->copy_off + bytes > MAX_BUFFER_OFFSET)
+                       bytes = MAX_BUFFER_OFFSET - spo->copy_off;
+
+               copy_gop = spo->copy + spo->copy_prod++;
+               copy_gop->flags = GNTCOPY_dest_gref;
+               if (foreign) {
+                       struct xen_sockbk *sockbk = &xen_sockbk[group];
+                       struct pending_tx_info *src_pend;
+
+                       src_pend = &sockbk->pending_tx_info[idx];
+
+                       copy_gop->source.domid = src_pend->vsock->domid;
+                       copy_gop->source.u.ref = src_pend->req.gref;
+                       copy_gop->flags |= GNTCOPY_source_gref;
+               } else {
+                       void *vaddr = page_address(page);
+
+                       copy_gop->source.domid = DOMID_SELF;
+                       copy_gop->source.u.gmfn = virt_to_mfn(vaddr);
+               }
+               copy_gop->source.offset = offset;
+               copy_gop->dest.domid = vsock->domid;
+
+               copy_gop->dest.offset = spo->copy_off;
+               copy_gop->dest.u.ref = spo->copy_gref;
+               copy_gop->len = bytes;
+
+               spo->copy_off += bytes;
+               meta->size += bytes;
+
+               offset += bytes;
+               size -= bytes;
+
+               /* Next frame */
+               if (offset == PAGE_SIZE && size) {
+                       BUG_ON(!PageCompound(page));
+                       page++;
+                       offset = 0;
+               }
+
+               *head = 0; /* There must be something in this buffer now. */
+       }
+}
+
+/* Prepare an SKB to be transmitted to the frontend.
+ *
+ * This function is responsible for allocating grant operations, meta
+ * structures, etc.
+ *
+ * It returns the number of meta structures consumed. The number of
+ * ring slots used is always equal to the number of meta slots used
+ * plus the number of GSO descriptors used. Currently, we use either
+ * zero GSO descriptors (for non-GSO packets) or one descriptor (for
+ * frontend-side LRO).
+ */
+static int sockbk_gop_skb(struct sk_buff *skb,
+                         struct sockrx_pending_operations *spo)
+{
+       struct xenvsock *vsock = xensock_dev_priv(SOCK_DEV_SKB(skb));
+       int nr_frags = skb_shinfo(skb)->nr_frags;
+       int i;
+       struct xen_sockif_rx_request *req;
+       struct sockbk_rx_meta *meta;
+       unsigned char *data;
+       int head = 1;
+       int old_meta_prod;
+
+       old_meta_prod = spo->meta_prod;
+
+       req = RING_GET_REQUEST(&vsock->rx, vsock->rx.req_cons++);
+       meta = spo->meta + spo->meta_prod++;
+
+       meta->size = 0;
+       meta->id = req->id;
+       spo->copy_off = 0;
+       spo->copy_gref = req->gref;
+
+       data = skb->data;
+       while (data < skb_tail_pointer(skb)) {
+               unsigned int offset = offset_in_page(data);
+               unsigned int len = PAGE_SIZE - offset;
+
+               if (data + len > skb_tail_pointer(skb))
+                       len = skb_tail_pointer(skb) - data;
+
+               sockbk_gop_frag_copy(vsock, skb, spo,
+                                    virt_to_page(data), len, offset, &head);
+               data += len;
+       }
+
+       for (i = 0; i < nr_frags; i++) {
+               sockbk_gop_frag_copy(vsock, skb, spo,
+                                    skb_frag_page(&skb_shinfo(skb)->frags[i]),
+                                    skb_frag_size(&skb_shinfo(skb)->frags[i]),
+                                    skb_shinfo(skb)->frags[i].page_offset,
+                                    &head);
+       }
+
+       return spo->meta_prod - old_meta_prod;
+}
+
+/* This is a twin to sockbk_gop_skb.  Assume that sockbk_gop_skb was
+ * used to set up the operations on the top of
+ * sockrx_pending_operations, which have since been done.  Check that
+ * they didn't give any errors and advance over them.
+ */
+static int sockbk_check_gop(struct xenvsock *vsock, int nr_meta_slots,
+                           struct sockrx_pending_operations *spo)
+{
+       struct gnttab_copy     *copy_op;
+       int status = XEN_SOCKIF_RSP_OKAY;
+       int i;
+
+       for (i = 0; i < nr_meta_slots; i++) {
+               copy_op = spo->copy + spo->copy_cons++;
+               if (copy_op->status != GNTST_okay) {
+                       dev_dbg(&vsock->xbdev->dev,
+                               "Bad status %d from copy to DOM%d.\n",
+                               copy_op->status, vsock->domid);
+                       status = XEN_SOCKIF_RSP_ERROR;
+               }
+       }
+
+       return status;
+}
+
+static void sockbk_add_frag_responses(struct xenvsock *vsock, int status,
+                                     struct sockbk_rx_meta *meta,
+                                     int nr_meta_slots)
+{
+       int i;
+       unsigned long offset;
+
+       /* No fragments used */
+       if (nr_meta_slots <= 1)
+               return;
+
+       nr_meta_slots--;
+
+       for (i = 0; i < nr_meta_slots; i++) {
+               int flags;
+
+               if (i == nr_meta_slots - 1)
+                       flags = 0;
+               else
+                       flags = XEN_SOCKRXF_more_data;
+
+               offset = 0;
+               make_rx_response(vsock, meta[i].id, status, offset,
+                                meta[i].size, flags);
+       }
+}
+
+struct skb_cb_overlay {
+       int meta_slots_used;
+};
+
+static void xen_sockbk_rx_action(struct xen_sockbk *sockbk)
+{
+       struct xenvsock *vsock = NULL, *tmp;
+       s8 status;
+       u16 irq, flags;
+       struct xen_sockif_rx_response *resp;
+       struct sk_buff_head rxq;
+       struct sk_buff *skb;
+       LIST_HEAD(notify);
+       int ret;
+       int nr_frags;
+       int count;
+       unsigned long offset;
+       struct skb_cb_overlay *sco;
+
+       struct sockrx_pending_operations spo = {
+               .copy  = sockbk->grant_copy_op,
+               .meta  = sockbk->meta,
+       };
+
+       skb_queue_head_init(&rxq);
+
+       count = 0;
+
+       while ((skb = skb_dequeue(&sockbk->rx_queue)) != NULL) {
+               vsock = xensock_dev_priv(SOCK_DEV_SKB(skb));
+
+               nr_frags = skb_shinfo(skb)->nr_frags;
+
+               sco = (struct skb_cb_overlay *)skb->cb;
+               sco->meta_slots_used = sockbk_gop_skb(skb, &spo);
+
+               count += nr_frags + 1;
+
+               __skb_queue_tail(&rxq, skb);
+
+               /* Filled the batch queue? */
+               if (count + MAX_SKB_FRAGS >= XEN_SOCKIF_RX_RING_SIZE)
+                       break;
+       }
+
+       BUG_ON(spo.meta_prod > ARRAY_SIZE(sockbk->meta));
+
+       if (!spo.copy_prod)
+               return;
+
+       BUG_ON(spo.copy_prod > ARRAY_SIZE(sockbk->grant_copy_op));
+       gnttab_batch_copy(sockbk->grant_copy_op, spo.copy_prod);
+
+       while ((skb = __skb_dequeue(&rxq)) != NULL) {
+               sco = (struct skb_cb_overlay *)skb->cb;
+
+               vsock = xensock_dev_priv(SOCK_DEV_SKB(skb));
+
+               status = sockbk_check_gop(vsock, sco->meta_slots_used, &spo);
+
+               if (sco->meta_slots_used == 1)
+                       flags = 0;
+               else
+                       flags = XEN_SOCKRXF_more_data;
+
+               offset = 0;
+               resp = make_rx_response(vsock, sockbk->meta[spo.meta_cons].id,
+                                       status, offset,
+                                       sockbk->meta[spo.meta_cons].size,
+                                       flags);
+
+               sockbk_add_frag_responses(vsock, status,
+                                         sockbk->meta + spo.meta_cons + 1,
+                                         sco->meta_slots_used);
+
+               RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&vsock->rx, ret);
+               irq = vsock->irq;
+               if (ret && list_empty(&vsock->notify_list))
+                       list_add_tail(&vsock->notify_list, &notify);
+
+               xenvsock_notify_tx_completion(vsock);
+
+               xenvsock_put(vsock);
+               spo.meta_cons += sco->meta_slots_used;
+               dev_kfree_skb(skb);
+       }
+
+       list_for_each_entry_safe(vsock, tmp, &notify, notify_list) {
+               notify_remote_via_irq(vsock->irq);
+               list_del_init(&vsock->notify_list);
+       }
+
+       /* More work to do? */
+       if (!skb_queue_empty(&sockbk->rx_queue) &&
+           !timer_pending(&sockbk->sock_timer))
+               xen_sockbk_kick_thread(sockbk);
+}
+
+void xen_sockbk_queue_tx_skb(struct xenvsock *vsock, struct sk_buff *skb)
+{
+       struct xen_sockbk *sockbk = vsock->sockbk;
+
+       skb_queue_tail(&sockbk->rx_queue, skb);
+       xen_sockbk_kick_thread(sockbk);
+}
+
+static void xen_sockbk_alarm(unsigned long data)
+{
+       struct xen_sockbk *sockbk = (struct xen_sockbk *)data;
+
+       xen_sockbk_kick_thread(sockbk);
+}
+
+static int __on_sock_schedule_list(struct xenvsock *vsock)
+{
+       return !list_empty(&vsock->schedule_list);
+}
+
+/* Must be called with sock_schedule_list_lock held */
+static void remove_from_sock_schedule_list(struct xenvsock *vsock)
+{
+       if (likely(__on_sock_schedule_list(vsock))) {
+               list_del_init(&vsock->schedule_list);
+               xenvsock_put(vsock);
+       }
+}
+
+static struct xenvsock *poll_sock_schedule_list(struct xen_sockbk *sockbk)
+{
+       struct xenvsock *vsock = NULL;
+
+       spin_lock_irq(&sockbk->sock_schedule_list_lock);
+       if (list_empty(&sockbk->sock_schedule_list))
+               goto out;
+
+       vsock = list_first_entry(&sockbk->sock_schedule_list,
+                                struct xenvsock, schedule_list);
+       if (!vsock)
+               goto out;
+
+       xenvsock_get(vsock);
+
+       remove_from_sock_schedule_list(vsock);
+out:
+       spin_unlock_irq(&sockbk->sock_schedule_list_lock);
+       return vsock;
+}
+
+void xen_sockbk_schedule_xenvsock(struct xenvsock *vsock)
+{
+       unsigned long flags;
+       struct xen_sockbk *sockbk = vsock->sockbk;
+       struct list_head *sockbk_schedule_list = &sockbk->sock_schedule_list;
+
+       if (__on_sock_schedule_list(vsock))
+               goto kick;
+
+       spin_lock_irqsave(&sockbk->sock_schedule_list_lock, flags);
+       if (!__on_sock_schedule_list(vsock) &&
+           likely(xenvsock_schedulable(vsock))) {
+               list_add_tail(&vsock->schedule_list, sockbk_schedule_list);
+               xenvsock_get(vsock);
+       }
+       spin_unlock_irqrestore(&sockbk->sock_schedule_list_lock, flags);
+
+kick:
+       smp_mb(); /* Ensure that an item is added to the schedule list */
+       if ((nr_pending_reqs(sockbk) < (MAX_PENDING_REQS/2)) &&
+           !list_empty(sockbk_schedule_list))
+               xen_sockbk_kick_thread(sockbk);
+}
+
+void xen_sockbk_deschedule_xenvsock(struct xenvsock *vsock)
+{
+       struct xen_sockbk *sockbk = vsock->sockbk;
+
+       spin_lock_irq(&sockbk->sock_schedule_list_lock);
+       remove_from_sock_schedule_list(vsock);
+       spin_unlock_irq(&sockbk->sock_schedule_list_lock);
+}
+
+void xen_sockbk_check_rx_xenvsock(struct xenvsock *vsock)
+{
+       int more_to_do;
+
+       RING_FINAL_CHECK_FOR_REQUESTS(&vsock->tx, more_to_do);
+
+       if (more_to_do)
+               xen_sockbk_schedule_xenvsock(vsock);
+}
+
+static void tx_add_credit(struct xenvsock *vsock)
+{
+       unsigned long max_burst, max_credit;
+
+       /* Allow a burst big enough to transmit a jumbo packet of up to 128kB.
+        * Otherwise the interface can seize up due to insufficient credit.
+        */
+       max_burst = RING_GET_REQUEST(&vsock->tx, vsock->tx.req_cons)->size;
+       max_burst = min(max_burst, 131072UL);
+       max_burst = max(max_burst, vsock->credit_bytes);
+
+       /* Take care that adding a new chunk of credit doesn't wrap to zero. */
+       max_credit = vsock->remaining_credit + vsock->credit_bytes;
+       if (max_credit < vsock->remaining_credit)
+               max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
+
+       vsock->remaining_credit = min(max_credit, max_burst);
+}
+
+static void tx_credit_callback(unsigned long data)
+{
+       struct xenvsock *vsock = (struct xenvsock *)data;
+
+       tx_add_credit(vsock);
+       xen_sockbk_check_rx_xenvsock(vsock);
+}
+
+static void sockbk_tx_err(struct xenvsock *vsock,
+                         struct xen_sockif_tx_request *txp, RING_IDX end)
+{
+       RING_IDX cons = vsock->tx.req_cons;
+
+       do {
+               make_tx_response(vsock, txp, XEN_SOCKIF_RSP_ERROR);
+               if (cons == end)
+                       break;
+               txp = RING_GET_REQUEST(&vsock->tx, cons++);
+       } while (1);
+       vsock->tx.req_cons = cons;
+       xen_sockbk_check_rx_xenvsock(vsock);
+       xenvsock_put(vsock);
+}
+
+static void sockbk_fatal_tx_err(struct xenvsock *vsock)
+{
+       dev_err(&vsock->xbdev->dev, "fatal error; disabling device\n");
+       xenvsock_carrier_off(vsock);
+       xenvsock_put(vsock);
+}
+
+static int sockbk_count_requests(struct xenvsock *vsock,
+                                struct xen_sockif_tx_request *first,
+                                struct xen_sockif_tx_request *txp,
+                                int work_to_do)
+{
+       RING_IDX cons = vsock->tx.req_cons;
+       int frags = 0;
+
+       if (!(first->flags & XEN_SOCKTXF_more_data))
+               return 0;
+
+       do {
+               if (frags >= work_to_do) {
+                       dev_err(&vsock->xbdev->dev, "Need more frags\n");
+                       sockbk_fatal_tx_err(vsock);
+                       return -ENODATA;
+               }
+
+               if (unlikely(frags >= MAX_SKB_FRAGS)) {
+                       dev_err(&vsock->xbdev->dev, "Too many frags\n");
+                       sockbk_fatal_tx_err(vsock);
+                       return -E2BIG;
+               }
+
+               memcpy(txp, RING_GET_REQUEST(&vsock->tx, cons + frags),
+                      sizeof(*txp));
+               if (txp->size > first->size) {
+                       dev_err(&vsock->xbdev->dev, "Frag is bigger than frame.\n");
+                       sockbk_fatal_tx_err(vsock);
+                       return -EIO;
+               }
+
+               first->size -= txp->size;
+               frags++;
+
+               if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
+                       dev_err(&vsock->xbdev->dev, "txp->offset: %x, size: %u\n",
+                               txp->offset, txp->size);
+                       sockbk_fatal_tx_err(vsock);
+                       return -EINVAL;
+               }
+       } while ((txp++)->flags & XEN_SOCKTXF_more_data);
+       return frags;
+}
+
+static struct page *xen_sockbk_alloc_page(struct xen_sockbk *sockbk,
+                                         struct sk_buff *skb,
+                                         u16 pending_idx)
+{
+       struct page *page;
+
+       page = alloc_page(GFP_KERNEL|__GFP_COLD);
+       if (!page)
+               return NULL;
+       set_page_ext(page, sockbk, pending_idx);
+       sockbk->mmap_pages[pending_idx] = page;
+       return page;
+}
+
+static
+struct gnttab_copy *xen_sockbk_get_requests(struct xen_sockbk *sockbk,
+                                           struct xenvsock *vsock,
+                                           struct sk_buff *skb,
+                                           struct xen_sockif_tx_request *txp,
+                                           struct gnttab_copy *gop)
+{
+       struct skb_shared_info *shinfo = skb_shinfo(skb);
+       skb_frag_t *frags = shinfo->frags;
+       u16 pending_idx = *((u16 *)skb->data);
+       int i, start;
+
+       /* Skip first skb fragment if it is on same page as header fragment. */
+       start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
+
+       for (i = start; i < shinfo->nr_frags; i++, txp++) {
+               struct page *page;
+               pending_ring_idx_t index;
+               struct pending_tx_info *pending_tx_info =
+                       sockbk->pending_tx_info;
+
+               index = pending_index(sockbk->pending_cons++);
+               pending_idx = sockbk->pending_ring[index];
+               page = xen_sockbk_alloc_page(sockbk, skb, pending_idx);
+               if (!page)
+                       goto err;
+
+               gop->source.u.ref = txp->gref;
+               gop->source.domid = vsock->domid;
+               gop->source.offset = txp->offset;
+
+               gop->dest.u.gmfn = virt_to_mfn(page_address(page));
+               gop->dest.domid = DOMID_SELF;
+               gop->dest.offset = txp->offset;
+
+               gop->len = txp->size;
+               gop->flags = GNTCOPY_source_gref;
+
+               gop++;
+
+               memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
+               xenvsock_get(vsock);
+               pending_tx_info[pending_idx].vsock = vsock;
+               frag_set_pending_idx(&frags[i], pending_idx);
+       }
+
+       return gop;
+err:
+       /* Unwind, freeing all pages and sending error responses. */
+       while (i-- > start) {
+               xen_sockbk_idx_release(sockbk, frag_get_pending_idx(&frags[i]),
+                                      XEN_SOCKIF_RSP_ERROR);
+       }
+       /* The head too, if necessary. */
+       if (start)
+               xen_sockbk_idx_release(sockbk, pending_idx,
+                                      XEN_SOCKIF_RSP_ERROR);
+
+       return NULL;
+}
+
+static int xen_sockbk_tx_check_gop(struct xen_sockbk *sockbk,
+                                  struct sk_buff *skb,
+                                  struct gnttab_copy **gopp)
+{
+       struct gnttab_copy *gop = *gopp;
+       u16 pending_idx = *((u16 *)skb->data);
+       struct skb_shared_info *shinfo = skb_shinfo(skb);
+       int nr_frags = shinfo->nr_frags;
+       int i, err, start;
+
+       /* Check status of header. */
+       err = gop->status;
+       if (unlikely(err))
+               xen_sockbk_idx_release(sockbk, pending_idx,
+                                      XEN_SOCKIF_RSP_ERROR);
+
+       /* Skip first skb fragment if it is on same page as header fragment. */
+       start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
+
+       for (i = start; i < nr_frags; i++) {
+               int j, newerr;
+
+               pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
+
+               /* Check error status: if okay then remember grant handle. */
+               newerr = (++gop)->status;
+               if (likely(!newerr)) {
+                       /* Had a previous error? Invalidate this fragment. */
+                       if (unlikely(err))
+                               xen_sockbk_idx_release(sockbk, pending_idx,
+                                                      XEN_SOCKIF_RSP_OKAY);
+                       continue;
+               }
+
+               /* Error on this fragment: respond to client with an error. */
+               xen_sockbk_idx_release(sockbk, pending_idx,
+                                      XEN_SOCKIF_RSP_ERROR);
+
+               /* Not the first error? Preceding frags already invalidated. */
+               if (err)
+                       continue;
+
+               /* First error: invalidate header and preceding fragments. */
+               pending_idx = *((u16 *)skb->data);
+               xen_sockbk_idx_release(sockbk, pending_idx,
+                                      XEN_SOCKIF_RSP_OKAY);
+               for (j = start; j < i; j++) {
+                       pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
+                       xen_sockbk_idx_release(sockbk, pending_idx,
+                                              XEN_SOCKIF_RSP_OKAY);
+               }
+
+               /* Remember the error: invalidate all subsequent fragments. */
+               err = newerr;
+       }
+
+       *gopp = gop + 1;
+       return err;
+}
+
+static void xen_sockbk_fill_frags(struct xen_sockbk *sockbk,
+                                 struct sk_buff *skb)
+{
+       struct skb_shared_info *shinfo = skb_shinfo(skb);
+       int nr_frags = shinfo->nr_frags;
+       int i;
+
+       for (i = 0; i < nr_frags; i++) {
+               skb_frag_t *frag = shinfo->frags + i;
+               struct xen_sockif_tx_request *txp;
+               struct page *page;
+               u16 pending_idx;
+
+               pending_idx = frag_get_pending_idx(frag);
+
+               txp = &sockbk->pending_tx_info[pending_idx].req;
+               page = virt_to_page(idx_to_kaddr(sockbk, pending_idx));
+               __skb_fill_page_desc(skb, i, page, txp->offset, txp->size);
+               skb->len += txp->size;
+               skb->data_len += txp->size;
+               skb->truesize += txp->size;
+
+               /* Take an extra reference to offset xen_sockbk_idx_release */
+               get_page(sockbk->mmap_pages[pending_idx]);
+               xen_sockbk_idx_release(sockbk, pending_idx,
+                                      XEN_SOCKIF_RSP_OKAY);
+       }
+}
+
+static bool tx_credit_exceeded(struct xenvsock *vsock, unsigned size)
+{
+       unsigned long now = jiffies;
+       unsigned long next_credit =
+               vsock->credit_timeout.expires +
+               msecs_to_jiffies(vsock->credit_usec / 1000);
+
+       /* Timer could already be pending in rare cases. */
+       if (timer_pending(&vsock->credit_timeout))
+               return true;
+
+       /* Passed the point where we can replenish credit? */
+       if (time_after_eq(now, next_credit)) {
+               vsock->credit_timeout.expires = now;
+               tx_add_credit(vsock);
+       }
+
+       /* Still too big to send right now? Set a callback. */
+       if (size > vsock->remaining_credit) {
+               vsock->credit_timeout.data     =
+                       (unsigned long)vsock;
+               vsock->credit_timeout.function =
+                       tx_credit_callback;
+               mod_timer(&vsock->credit_timeout,
+                         next_credit);
+
+               return true;
+       }
+
+       return false;
+}
+
+static unsigned xen_sockbk_tx_build_gops(struct xen_sockbk *sockbk)
+{
+       struct gnttab_copy *gop = sockbk->tx_copy_ops, *request_gop;
+       struct sk_buff *skb;
+       int ret;
+
+       while (((nr_pending_reqs(sockbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+              !list_empty(&sockbk->sock_schedule_list)) {
+               struct xenvsock *vsock;
+               struct xen_sockif_tx_request txreq;
+               struct xen_sockif_tx_request txfrags[MAX_SKB_FRAGS];
+               struct page *page;
+               u16 pending_idx;
+               RING_IDX idx;
+               int work_to_do;
+               unsigned int data_len;
+               pending_ring_idx_t index;
+
+               /* Get a vsock from the list with work to do. */
+               vsock = poll_sock_schedule_list(sockbk);
+               /* This can sometimes happen because the test of
+                * list_empty(sock_schedule_list) at the top of the
+                * loop is unlocked.  Just go back and have another
+                * look.
+                */
+               if (!vsock)
+                       continue;
+
+               if (vsock->tx.sring->req_prod - vsock->tx.req_cons >
+                   XEN_SOCKIF_TX_RING_SIZE) {
+                       dev_err(&vsock->xbdev->dev,
+                               "Impossible number of requests. "
+                               "req_prod %d, req_cons %d, size %ld\n",
+                               vsock->tx.sring->req_prod,
+                               vsock->tx.req_cons,
+                               XEN_SOCKIF_TX_RING_SIZE);
+                       sockbk_fatal_tx_err(vsock);
+                       continue;
+               }
+
+               RING_FINAL_CHECK_FOR_REQUESTS(&vsock->tx, work_to_do);
+               if (!work_to_do) {
+                       xenvsock_put(vsock);
+                       continue;
+               }
+
+               idx = vsock->tx.req_cons;
+               rmb(); /* Ensure that we see the request before we copy it. */
+               memcpy(&txreq, RING_GET_REQUEST(&vsock->tx, idx),
+                      sizeof(txreq));
+
+               /* Credit-based scheduling. */
+               if (txreq.size > vsock->remaining_credit &&
+                   tx_credit_exceeded(vsock, txreq.size)) {
+                       xenvsock_put(vsock);
+                       continue;
+               }
+
+               vsock->remaining_credit -= txreq.size;
+
+               work_to_do--;
+               vsock->tx.req_cons = ++idx;
+
+               ret = sockbk_count_requests(vsock, &txreq, txfrags, work_to_do);
+               if (unlikely(ret < 0))
+                       continue;
+
+               idx += ret;
+
+               if (unlikely(txreq.size == 0)) {
+                       dev_dbg(&vsock->xbdev->dev,
+                               "Bad packet size: %d\n", txreq.size);
+                       sockbk_tx_err(vsock, &txreq, idx);
+                       continue;
+               }
+
+               /* No crossing a page as the payload mustn't fragment. */
+               if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
+                       dev_err(&vsock->xbdev->dev,
+                               "txreq.offset: %x, size: %u, end: %lu\n",
+                               txreq.offset, txreq.size,
+                               (txreq.offset&~PAGE_MASK) + txreq.size);
+                       sockbk_fatal_tx_err(vsock);
+                       continue;
+               }
+
+               index = pending_index(sockbk->pending_cons);
+               pending_idx = sockbk->pending_ring[index];
+
+               data_len = (txreq.size > PKT_PROT_LEN &&
+                           ret < MAX_SKB_FRAGS) ?
+                       PKT_PROT_LEN : txreq.size;
+
+               skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
+                               GFP_ATOMIC | __GFP_NOWARN);
+               if (unlikely(skb == NULL)) {
+                       dev_dbg(&vsock->xbdev->dev,
+                               "Can't allocate a skb in tx_build_gops.\n");
+                       sockbk_tx_err(vsock, &txreq, idx);
+                       break;
+               }
+
+               /* Packets passed to sockif_rx() must have some headroom. */
+               skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+
+               /* XXX could copy straight to head */
+               page = xen_sockbk_alloc_page(sockbk, skb, pending_idx);
+               if (!page) {
+                       kfree_skb(skb);
+                       sockbk_tx_err(vsock, &txreq, idx);
+                       continue;
+               }
+
+               gop->source.u.ref = txreq.gref;
+               gop->source.domid = vsock->domid;
+               gop->source.offset = txreq.offset;
+
+               gop->dest.u.gmfn = virt_to_mfn(page_address(page));
+               gop->dest.domid = DOMID_SELF;
+               gop->dest.offset = txreq.offset;
+
+               gop->len = txreq.size;
+               gop->flags = GNTCOPY_source_gref;
+
+               gop++;
+
+               memcpy(&sockbk->pending_tx_info[pending_idx].req,
+                      &txreq, sizeof(txreq));
+               sockbk->pending_tx_info[pending_idx].vsock = vsock;
+               *((u16 *)skb->data) = pending_idx;
+
+               __skb_put(skb, data_len);
+
+               skb_shinfo(skb)->nr_frags = ret;
+               if (data_len < txreq.size) {
+                       skb_shinfo(skb)->nr_frags++;
+                       frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
+                                            pending_idx);
+               } else {
+                       frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
+                                            INVALID_PENDING_IDX);
+               }
+
+               sockbk->pending_cons++;
+
+               request_gop = xen_sockbk_get_requests(sockbk, vsock,
+                                                     skb, txfrags, gop);
+               if (request_gop == NULL) {
+                       kfree_skb(skb);
+                       sockbk_tx_err(vsock, &txreq, idx);
+                       continue;
+               }
+               gop = request_gop;
+
+               __skb_queue_tail(&sockbk->tx_queue, skb);
+
+               vsock->tx.req_cons = idx;
+               xen_sockbk_check_rx_xenvsock(vsock);
+
+               if (gop - sockbk->tx_copy_ops >=
+                   ARRAY_SIZE(sockbk->tx_copy_ops))
+                       break;
+       }
+
+       return gop - sockbk->tx_copy_ops;
+}
+
+static void xen_sockbk_tx_submit(struct xen_sockbk *sockbk)
+{
+       struct gnttab_copy *gop = sockbk->tx_copy_ops;
+       struct sk_buff *skb;
+
+       while ((skb = __skb_dequeue(&sockbk->tx_queue)) != NULL) {
+               struct xen_sockif_tx_request *txp;
+               struct xenvsock *vsock;
+               u16 pending_idx;
+               unsigned data_len;
+
+               pending_idx = *((u16 *)skb->data);
+               vsock = sockbk->pending_tx_info[pending_idx].vsock;
+               txp = &sockbk->pending_tx_info[pending_idx].req;
+
+               /* Check the remap error code. */
+               if (unlikely(xen_sockbk_tx_check_gop(sockbk, skb, &gop))) {
+                       dev_dbg(&vsock->xbdev->dev, "sockback grant failed.\n");
+                       skb_shinfo(skb)->nr_frags = 0;
+                       kfree_skb(skb);
+                       continue;
+               }
+
+               data_len = skb->len;
+               memcpy(skb->data,
+                      (void *)(idx_to_kaddr(sockbk, pending_idx)|txp->offset),
+                      data_len);
+               if (data_len < txp->size) {
+                       /* Append the packet payload as a fragment. */
+                       txp->offset += data_len;
+                       txp->size -= data_len;
+               } else {
+                       /* Schedule a response immediately. */
+                       xen_sockbk_idx_release(sockbk, pending_idx,
+                                              XEN_SOCKIF_RSP_OKAY);
+               }
+
+               xen_sockbk_fill_frags(sockbk, skb);
+
+               /* If the initial fragment was < PKT_PROT_LEN then
+                * pull through some bytes from the other fragments to
+                * increase the linear region to PKT_PROT_LEN bytes.
+                */
+               if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) {
+                       int target = min_t(int, skb->len, PKT_PROT_LEN);
+
+                       __pskb_pull_tail(skb, target - skb_headlen(skb));
+               }
+
+               xenvsock_receive_skb(vsock, skb);
+       }
+}
+
+/* Called after sockfront has transmitted */
+static void xen_sockbk_tx_action(struct xen_sockbk *sockbk)
+{
+       unsigned nr_gops;
+
+       nr_gops = xen_sockbk_tx_build_gops(sockbk);
+
+       if (nr_gops == 0)
+               return;
+
+       gnttab_batch_copy(sockbk->tx_copy_ops, nr_gops);
+
+       xen_sockbk_tx_submit(sockbk);
+}
+
+static void xen_sockbk_idx_release(struct xen_sockbk *sockbk, u16 pending_idx,
+                                  u8 status)
+{
+       struct xenvsock *vsock;
+       struct pending_tx_info *pending_tx_info;
+       pending_ring_idx_t index;
+
+       /* Already complete? */
+       if (sockbk->mmap_pages[pending_idx] == NULL)
+               return;
+
+       pending_tx_info = &sockbk->pending_tx_info[pending_idx];
+
+       vsock = pending_tx_info->vsock;
+
+       make_tx_response(vsock, &pending_tx_info->req, status);
+
+       index = pending_index(sockbk->pending_prod++);
+       sockbk->pending_ring[index] = pending_idx;
+
+       xenvsock_put(vsock);
+
+       sockbk->mmap_pages[pending_idx]->mapping = 0;
+       put_page(sockbk->mmap_pages[pending_idx]);
+       sockbk->mmap_pages[pending_idx] = NULL;
+}
+
+static void make_tx_response(struct xenvsock *vsock,
+                            struct xen_sockif_tx_request *txp,
+                            s8 st)
+{
+       RING_IDX i = vsock->tx.rsp_prod_pvt;
+       struct xen_sockif_tx_response *resp;
+       int notify;
+
+       resp = RING_GET_RESPONSE(&vsock->tx, i);
+       resp->id     = txp->id;
+       resp->status = st;
+
+       vsock->tx.rsp_prod_pvt = ++i;
+       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&vsock->tx, notify);
+       if (notify)
+               notify_remote_via_irq(vsock->irq);
+}
+
+static struct xen_sockif_rx_response *make_rx_response(struct xenvsock *vsock,
+                                                      u16 id, s8 st,
+                                                      u16 offset, u16 size,
+                                                      u16 flags)
+{
+       RING_IDX i = vsock->rx.rsp_prod_pvt;
+       struct xen_sockif_rx_response *resp;
+
+       resp = RING_GET_RESPONSE(&vsock->rx, i);
+       resp->offset     = offset;
+       resp->flags      = flags;
+       resp->id         = id;
+       resp->status     = (s16)size;
+       if (st < 0)
+               resp->status = (s16)st;
+
+       vsock->rx.rsp_prod_pvt = ++i;
+
+       return resp;
+}
+
+static inline int rx_work_todo(struct xen_sockbk *sockbk)
+{
+       return !skb_queue_empty(&sockbk->rx_queue);
+}
+
+static inline int tx_work_todo(struct xen_sockbk *sockbk)
+{
+       if (((nr_pending_reqs(sockbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+           !list_empty(&sockbk->sock_schedule_list))
+               return 1;
+
+       return 0;
+}
+
+static int xen_sockbk_kthread(void *data)
+{
+       struct xen_sockbk *sockbk = data;
+
+       while (!kthread_should_stop()) {
+               wait_event_interruptible(sockbk->wq,
+                                        rx_work_todo(sockbk) ||
+                                        tx_work_todo(sockbk) ||
+                                        kthread_should_stop());
+               cond_resched();
+
+               if (kthread_should_stop())
+                       break;
+
+               if (rx_work_todo(sockbk))
+                       xen_sockbk_rx_action(sockbk);
+
+               if (tx_work_todo(sockbk))
+                       xen_sockbk_tx_action(sockbk);
+       }
+
+       return 0;
+}
+
+void xen_sockbk_unmap_frontend_rings(struct xenvsock *vsock)
+{
+       if (vsock->tx.sring)
+               xenbus_unmap_ring_vfree(vsock->xbdev, vsock->tx.sring);
+       if (vsock->rx.sring)
+               xenbus_unmap_ring_vfree(vsock->xbdev, vsock->rx.sring);
+}
+
+int xen_sockbk_map_frontend_rings(struct xenvsock *vsock,
+                                 grant_ref_t tx_ring_ref,
+                                 grant_ref_t rx_ring_ref)
+{
+       void *addr;
+       struct xen_sockif_tx_sring *txs;
+       struct xen_sockif_rx_sring *rxs;
+       int err;
+
+       err = xenbus_map_ring_valloc(vsock->xbdev, tx_ring_ref, &addr);
+       if (err)
+               goto err;
+
+       txs = (struct xen_sockif_tx_sring *)addr;
+       BACK_RING_INIT(&vsock->tx, txs, PAGE_SIZE);
+
+       err = xenbus_map_ring_valloc(vsock->xbdev, rx_ring_ref, &addr);
+       if (err)
+               goto err;
+
+       rxs = (struct xen_sockif_rx_sring *)addr;
+       BACK_RING_INIT(&vsock->rx, rxs, PAGE_SIZE);
+
+       vsock->rx_req_cons_peek = 0;
+
+       return 0;
+
+err:
+       xen_sockbk_unmap_frontend_rings(vsock);
+       return err;
+}
+
+static int __init sockback_init(void)
+{
+       int i;
+       int rc = 0;
+       int group;
+
+       if (!xen_domain())
+               return -ENODEV;
+
+       xen_sockbk_group_nr = num_online_cpus();
+       xen_sockbk = vzalloc(sizeof(*xen_sockbk) * xen_sockbk_group_nr);
+       if (!xen_sockbk)
+               return -ENOMEM;
+
+       for (group = 0; group < xen_sockbk_group_nr; group++) {
+               struct xen_sockbk *sockbk = &xen_sockbk[group];
+
+               skb_queue_head_init(&sockbk->rx_queue);
+               skb_queue_head_init(&sockbk->tx_queue);
+
+               init_timer(&sockbk->sock_timer);
+               sockbk->sock_timer.data = (unsigned long)sockbk;
+               sockbk->sock_timer.function = xen_sockbk_alarm;
+
+               sockbk->pending_cons = 0;
+               sockbk->pending_prod = MAX_PENDING_REQS;
+               for (i = 0; i < MAX_PENDING_REQS; i++)
+                       sockbk->pending_ring[i] = i;
+
+               init_waitqueue_head(&sockbk->wq);
+               sockbk->task = kthread_create(xen_sockbk_kthread,
+                                            (void *)sockbk,
+                                            "sockback/%u", group);
+
+               if (IS_ERR(sockbk->task)) {
+                       pr_alert("kthread_create() failed at sockback\n");
+                       del_timer(&sockbk->sock_timer);
+                       rc = PTR_ERR(sockbk->task);
+                       goto failed_init;
+               }
+
+               kthread_bind(sockbk->task, group);
+
+               INIT_LIST_HEAD(&sockbk->sock_schedule_list);
+
+               spin_lock_init(&sockbk->sock_schedule_list_lock);
+
+               atomic_set(&sockbk->sockfront_count, 0);
+
+               wake_up_process(sockbk->task);
+       }
+
+       rc = xenvsock_xenbus_init();
+       if (rc)
+               goto failed_init;
+
+       rc = xensock_proto_server_init();
+       if (rc)
+               goto failed_init;
+
+       return 0;
+
+failed_init:
+       while (--group >= 0) {
+               struct xen_sockbk *sockbk = &xen_sockbk[group];
+
+               for (i = 0; i < MAX_PENDING_REQS; i++) {
+                       if (sockbk->mmap_pages[i])
+                               __free_page(sockbk->mmap_pages[i]);
+               }
+               del_timer(&sockbk->sock_timer);
+               kthread_stop(sockbk->task);
+       }
+       vfree(xen_sockbk);
+       return rc;
+}
+
+module_init(sockback_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("xen-backend:vsock");
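
For reference, the free-slot bookkeeping used by xen_sockbk_tx_build_gops() and
xen_sockbk_idx_release() above: pending_ring[] holds free slot indices,
pending_cons hands slots out and pending_prod returns them. A minimal user-space
sketch follows; pending_index() and nr_pending_reqs() are not visible in this
hunk, so the definitions below follow the xen-netback convention and are
assumptions rather than the driver's actual code.

/*
 * Illustrative model of the pending-slot ring in sockback.c.  The helper
 * bodies below are assumed (xen-netback style), not taken from this patch.
 */
#include <stdio.h>

#define MAX_PENDING_REQS 256                    /* must be a power of two */

typedef unsigned int pending_ring_idx_t;

static unsigned short pending_ring[MAX_PENDING_REQS];
static pending_ring_idx_t pending_prod = MAX_PENDING_REQS;
static pending_ring_idx_t pending_cons;

static pending_ring_idx_t pending_index(pending_ring_idx_t i)
{
        return i & (MAX_PENDING_REQS - 1);      /* assumed helper */
}

static pending_ring_idx_t nr_pending_reqs(void)
{
        /* slots handed out but not yet released */
        return MAX_PENDING_REQS - pending_prod + pending_cons;
}

int main(void)
{
        unsigned int slot;
        int i;

        for (i = 0; i < MAX_PENDING_REQS; i++)  /* as in sockback_init() */
                pending_ring[i] = i;

        slot = pending_ring[pending_index(pending_cons++)];     /* take */
        printf("in flight: %u, got slot %u\n", nr_pending_reqs(), slot);

        pending_ring[pending_index(pending_prod++)] = slot;     /* release */
        printf("in flight: %u\n", nr_pending_reqs());
        return 0;
}
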
diff --git a/drivers/net/xen-sockback/xenbus.c b/drivers/net/xen-sockback/xenbus.c
new file mode 100644
index 0000000..7bd9a06
--- /dev/null
+++ b/drivers/net/xen-sockback/xenbus.c
@@ -0,0 +1,348 @@
+/*
+ * Xenbus code for sockif backend
+ *
+ * Copyright (C) 2005 Rusty Russell <rusty@xxxxxxxxxxxxxxx>
+ * Copyright (C) 2005 XenSource Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+*/
+#include "common.h"
+
+struct backend_info {
+       struct xenbus_device *dev;
+       struct xenvsock *vsock;
+       enum xenbus_state frontend_state;
+};
+
+static int connect_rings(struct backend_info *);
+static void connect(struct backend_info *);
+static int backend_create_xenvsock(struct backend_info *be);
+
+static int sockback_remove(struct xenbus_device *dev)
+{
+       struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+       if (be->vsock) {
+               xenvsock_disconnect(be->vsock);
+               be->vsock = NULL;
+       }
+       kfree(be);
+       dev_set_drvdata(&dev->dev, NULL);
+       return 0;
+}
+
+/**
+ * Entry point to this code when a new device is created.  Allocate the basic
+ * structures and switch to InitWait.
+ */
+static int sockback_probe(struct xenbus_device *dev,
+                         const struct xenbus_device_id *id)
+{
+       const char *message;
+       struct xenbus_transaction xbt;
+       int err;
+       struct backend_info *be = kzalloc(sizeof(*be), GFP_KERNEL);
+
+       if (!be) {
+               xenbus_dev_fatal(dev, -ENOMEM,
+                                "allocating backend structure");
+               return -ENOMEM;
+       }
+
+       be->dev = dev;
+       dev_set_drvdata(&dev->dev, be);
+
+       do {
+               err = xenbus_transaction_start(&xbt);
+               if (err) {
+                       xenbus_dev_fatal(dev, err, "starting transaction");
+                       goto fail;
+               }
+
+               /* We support rx-copy path. */
+               err = xenbus_printf(xbt, dev->nodename,
+                                   "feature-rx-copy", "%d", 1);
+               if (err) {
+                       message = "writing feature-rx-copy";
+                       goto abort_transaction;
+               }
+
+               /* We don't support rx-flip path (except old guests who don't
+                * grok this feature flag).
+                */
+               err = xenbus_printf(xbt, dev->nodename,
+                                   "feature-rx-flip", "%d", 0);
+               if (err) {
+                       message = "writing feature-rx-flip";
+                       goto abort_transaction;
+               }
+
+               err = xenbus_transaction_end(xbt, 0);
+       } while (err == -EAGAIN);
+
+       if (err) {
+               xenbus_dev_fatal(dev, err, "completing transaction");
+               goto fail;
+       }
+
+       err = xenbus_switch_state(dev, XenbusStateInitWait);
+       if (err)
+               goto fail;
+
+       return 0;
+
+abort_transaction:
+       xenbus_transaction_end(xbt, 1);
+       xenbus_dev_fatal(dev, err, "%s", message);
+fail:
+       pr_debug("failed\n");
+       sockback_remove(dev);
+       return err;
+}
+
+static int backend_create_xenvsock(struct backend_info *be)
+{
+       int err;
+       long handle;
+       struct xenbus_device *dev = be->dev;
+
+       if (be->vsock != NULL)
+               return 0;
+
+       err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
+       if (err != 1) {
+               err = -EINVAL;
+               xenbus_dev_fatal(dev, err, "reading handle");
+               goto fail;
+       }
+
+       be->vsock = xenvsock_alloc(&dev->dev, dev->otherend_id, handle);
+       if (IS_ERR(be->vsock)) {
+               err = PTR_ERR(be->vsock);
+               be->vsock = NULL;
+               xenbus_dev_fatal(dev, err, "creating interface");
+               goto fail;
+       }
+
+       /* setup back pointer */
+       be->vsock->xbdev = dev;
+
+       err = 0;
+fail:
+       return err;
+}
+
+static void disconnect_backend(struct xenbus_device *dev)
+{
+       struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+       if (be->vsock) {
+               xenvsock_disconnect(be->vsock);
+               be->vsock = NULL;
+       }
+}
+
+/**
+ * Callback received when the frontend's state changes.
+ */
+static void frontend_changed(struct xenbus_device *dev,
+                            enum xenbus_state frontend_state)
+{
+       struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+       pr_debug("frontend state %s\n", xenbus_strstate(frontend_state));
+
+       be->frontend_state = frontend_state;
+
+       switch (frontend_state) {
+       case XenbusStateInitialising:
+               if (dev->state == XenbusStateClosed) {
+                       pr_alert("%s: %s: prepare for reconnect\n",
+                                __func__, dev->nodename);
+                       xenbus_switch_state(dev, XenbusStateInitWait);
+               }
+               break;
+
+       case XenbusStateInitialised:
+       case XenbusStateConnected:
+               /* Ensure we connect even when two watches fire in
+                * close succession and we miss the intermediate value
+                * of frontend_state.
+                */
+               if (dev->state == XenbusStateConnected)
+                       break;
+
+               /* Enforce precondition before potential leak point.
+                * disconnect_backend() is idempotent.
+                */
+               disconnect_backend(dev);
+
+               if (backend_create_xenvsock(be))
+                       break;
+
+               connect(be);
+               break;
+
+       case XenbusStateClosing:
+               xenbus_switch_state(dev, XenbusStateClosing);
+               break;
+
+       case XenbusStateClosed:
+               disconnect_backend(dev);
+               xenbus_switch_state(dev, XenbusStateClosed);
+               if (xenbus_dev_is_online(dev))
+                       break;
+               /* fall through if not online */
+       case XenbusStateUnknown:
+               device_unregister(&dev->dev);
+               break;
+
+       default:
+               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+                                frontend_state);
+               break;
+       }
+}
+
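+/*
+ * The "rate" node parsed below is expected to hold "<bytes>,<usec>",
+ * e.g. (illustrative) "1000000,20000" gives credit_bytes = 1000000 and
+ * credit_usec = 20000; the credit accounting itself lives in sockback.c.
+ * A missing or malformed node leaves traffic unlimited.
+ */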
+static void xen_sock_read_rate(struct xenbus_device *dev,
+                              unsigned long *bytes, unsigned long *usec)
+{
+       char *s, *e;
+       unsigned long b, u;
+       char *ratestr;
+
+       /* Default to unlimited bandwidth. */
+       *bytes = ~0UL;
+       *usec = 0;
+
+       ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
+       if (IS_ERR(ratestr))
+               return;
+
+       s = ratestr;
+       e = strchr(s, ',');
+       if (e == NULL)
+               goto fail;
+
+       e[0] = '\0';
+
+       if (kstrtoul(s, 10, &b))
+               goto fail;
+
+       s = e + 1;
+       if (kstrtoul(s, 10, &u))
+               goto fail;
+
+       *bytes = b;
+       *usec = u;
+
+       kfree(ratestr);
+       return;
+
+ fail:
+       pr_warn("Failed to parse socket data rate limit. Traffic unlimited.\n");
+       kfree(ratestr);
+}
+
+static void connect(struct backend_info *be)
+{
+       int err;
+       struct xenbus_device *dev = be->dev;
+
+       err = connect_rings(be);
+       if (err)
+               return;
+
+       xen_sock_read_rate(dev, &be->vsock->credit_bytes,
+                          &be->vsock->credit_usec);
+       be->vsock->remaining_credit = be->vsock->credit_bytes;
+
+       sockif_wake_queue(be->vsock->dev);
+       xenbus_switch_state(dev, XenbusStateConnected);
+}
+
+static int connect_rings(struct backend_info *be)
+{
+       struct xenvsock *vsock = be->vsock;
+       struct xenbus_device *dev = be->dev;
+       unsigned long tx_ring_ref, rx_ring_ref;
+       unsigned int evtchn, rx_copy;
+       int err;
+       int val;
+
+       err = xenbus_gather(XBT_NIL, dev->otherend,
+                           "tx-ring-ref", "%lu", &tx_ring_ref,
+                           "rx-ring-ref", "%lu", &rx_ring_ref,
+                           "event-channel", "%u", &evtchn, NULL);
+       if (err) {
+               xenbus_dev_fatal(dev, err,
+                                "reading %s/ring-ref and event-channel",
+                                dev->otherend);
+               return err;
+       }
+
+       err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
+                          &rx_copy);
+       if (err == -ENOENT) {
+               err = 0;
+               rx_copy = 0;
+       }
+       if (err < 0) {
+               xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
+                                dev->otherend);
+               return err;
+       }
+       if (!rx_copy)
+               return -EOPNOTSUPP;
+
+       if (vsock->dev->tx_queue_len != 0) {
+               if (xenbus_scanf(XBT_NIL, dev->otherend,
+                                "feature-rx-notify", "%d", &val) < 0)
+                       val = 0;
+               if (val)
+                       vsock->can_queue = 1;
+               else
+                       /* Must be non-zero for pfifo_fast to work. */
+                       vsock->dev->tx_queue_len = 1;
+       }
+
+       /* Map the shared frame, irq etc. */
+       err = xenvsock_connect(vsock, tx_ring_ref, rx_ring_ref, evtchn);
+       if (err) {
+               xenbus_dev_fatal(dev, err,
+                                "mapping shared-frames %lu/%lu port %u",
+                                tx_ring_ref, rx_ring_ref, evtchn);
+               return err;
+       }
+       return 0;
+}
+
+/* ** Driver Registration ** */
+static const struct xenbus_device_id sockback_ids[] = {
+       { "vsock" },
+       { "" }
+};
+
+static DEFINE_XENBUS_DRIVER(sockback, ,
+       .probe = sockback_probe,
+       .remove = sockback_remove,
+       .otherend_changed = frontend_changed,
+);
+
+int xenvsock_xenbus_init(void)
+{
+       return xenbus_register_backend(&sockback_driver);
+}
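
For reference, the xenstore handshake expected by connect_rings() above: the
frontend publishes tx-ring-ref, rx-ring-ref, event-channel, request-rx-copy and
(optionally) feature-rx-notify under its own node. A hypothetical frontend-side
sketch follows; sockfront_publish() and its parameters are illustrative only
(the ring refs and event channel are assumed to come from
gnttab_grant_foreign_access() and xenbus_alloc_evtchn()), but the xenbus calls
are the standard kernel API.

/* Hypothetical frontend-side publication of the nodes read by connect_rings(). */
#include <xen/xenbus.h>
#include <xen/grant_table.h>

static int sockfront_publish(struct xenbus_device *dev,
                             grant_ref_t tx_ref, grant_ref_t rx_ref,
                             unsigned int evtchn)
{
        struct xenbus_transaction xbt;
        int err;

again:
        err = xenbus_transaction_start(&xbt);
        if (err)
                return err;

        err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref", "%u", tx_ref);
        if (!err)
                err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref", "%u",
                                    rx_ref);
        if (!err)
                err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
                                    evtchn);
        if (!err)
                err = xenbus_printf(xbt, dev->nodename, "request-rx-copy",
                                    "%u", 1);
        if (!err)
                err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify",
                                    "%d", 1);
        if (err) {
                xenbus_transaction_end(xbt, 1);
                return err;
        }

        err = xenbus_transaction_end(xbt, 0);
        if (err == -EAGAIN)
                goto again;

        return err;
}
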
diff --git a/include/xen/interface/io/sockif.h b/include/xen/interface/io/sockif.h
new file mode 100644
index 0000000..c12ce11
--- /dev/null
+++ b/include/xen/interface/io/sockif.h
@@ -0,0 +1,74 @@
+/******************************************************************************
+ * sockif.h
+ *
+ * Unified xensock I/O interface for Xen guest OSes.
+ *
+ */
+#ifndef __XEN_PUBLIC_IO_SOCKIF_H__
+#define __XEN_PUBLIC_IO_SOCKIF_H__
+
+#include <linux/types.h>
+#include <linux/spinlock.h>
+
+#include <net/sock.h>
+
+#include <xen/interface/io/ring.h>
+#include <xen/interface/grant_table.h>
+
+/*
+ * REQUEST CODES.
+ */
+#define SOCKIF_OP_SENDMSG              0
+#define SOCKIF_OP_RECVMSG              1
+
+#define SOCKIF_MAX_PAGES_PER_REQUEST   10
+
+#define SOCKIF_DEV_ID_CNT              5
+
+/* Packet continues in the next request descriptor. */
+#define _XEN_SOCKTXF_more_data         (2)
+#define  XEN_SOCKTXF_more_data         (1U<<_XEN_SOCKTXF_more_data)
+
+struct xen_sockif_tx_request {
+       grant_ref_t gref;      /* Reference to buffer page */
+       uint16_t offset;       /* Offset within buffer page */
+       uint16_t flags;        /* XEN_SOCKTXF_* */
+       uint16_t id;           /* Echoed in response message. */
+       uint16_t size;         /* Packet size in bytes.       */
+};
+
+struct xen_sockif_tx_response {
+       uint16_t id;
+       int16_t  status;       /* XEN_SOCKIF_RSP_* */
+};
+
+struct xen_sockif_rx_request {
+       uint16_t    id;        /* Echoed in response message.        */
+       grant_ref_t gref;      /* Reference to incoming granted frame */
+};
+
+/* Packet continues in the next request descriptor. */
+#define _XEN_SOCKRXF_more_data         (0)
+#define  XEN_SOCKRXF_more_data         (1U<<_XEN_SOCKRXF_more_data)
+
+struct xen_sockif_rx_response {
+       uint16_t id;
+       uint16_t offset;       /* Offset in page of start of received packet  */
+       uint16_t flags;        /* XEN_SOCKRXF_* */
+       int16_t  status;       /* -ve: XEN_SOCKIF_RSP_* ; +ve: Rx'ed pkt size. */
+};
+
+DEFINE_RING_TYPES(xen_sockif_tx,
+                 struct xen_sockif_tx_request,
+                 struct xen_sockif_tx_response);
+
+DEFINE_RING_TYPES(xen_sockif_rx,
+                 struct xen_sockif_rx_request,
+                 struct xen_sockif_rx_response);
+
+#define XEN_SOCKIF_RSP_DROPPED -2
+#define XEN_SOCKIF_RSP_ERROR   -1
+#define XEN_SOCKIF_RSP_OKAY     0
+
+#endif /* __XEN_PUBLIC_IO_SOCKIF_H__ */
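
For reference, the DEFINE_RING_TYPES() invocations above generate
struct xen_sockif_tx_sring / struct xen_sockif_tx_back_ring (and the rx
equivalents) together with the usual accessors from
<xen/interface/io/ring.h>. A condensed sketch of the backend
consume-and-respond pattern, mirroring what sockback.c does, is shown below;
batching, grant copies and error handling are omitted, and process_request()
is a hypothetical placeholder.

/*
 * Sketch only: assumes the ring has already been mapped and initialised
 * with BACK_RING_INIT(), as xen_sockbk_map_frontend_rings() does.
 */
static void consume_tx_ring(struct xen_sockif_tx_back_ring *ring,
                            unsigned int irq)
{
        int more_to_do, notify;

        RING_FINAL_CHECK_FOR_REQUESTS(ring, more_to_do);
        while (more_to_do) {
                struct xen_sockif_tx_request req;
                struct xen_sockif_tx_response *rsp;
                RING_IDX idx = ring->req_cons;

                rmb();  /* see the request before copying it */
                memcpy(&req, RING_GET_REQUEST(ring, idx), sizeof(req));
                ring->req_cons = ++idx;

                process_request(&req);          /* hypothetical helper */

                rsp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt);
                rsp->id = req.id;
                rsp->status = XEN_SOCKIF_RSP_OKAY;
                ring->rsp_prod_pvt++;

                RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(ring, notify);
                if (notify)
                        notify_remote_via_irq(irq);

                RING_FINAL_CHECK_FOR_REQUESTS(ring, more_to_do);
        }
}
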
-- 
1.8.2.rc2


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel