[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 8/9] Xen Share Net Device



Subject: Xen Share Net Device

This is a simple network device using the vdevice bus and the share
info.  It is primitive, in that it does not handle fragmented skbs
(for no particularly good reason).

The main feature of this device is that it demonstrates how an N:N
device like a virtual intra-domain LAN can be implemented.

diff -r c0c781af505d linux-2.6-xen-sparse/drivers/xen/Makefile
--- a/linux-2.6-xen-sparse/drivers/xen/Makefile Mon Jun  5 04:27:31 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/Makefile Mon Jun  5 16:40:49 2006
@@ -9,6 +9,7 @@
 obj-y  += privcmd/
 obj-y  += xenbus/
 obj-y  += vdevice/
+obj-m  += sharenet/
 obj-y  += xenshare.o
 
 obj-$(CONFIG_XEN_BLKDEV_BACKEND)       += blkback/
diff -r c0c781af505d tools/vdevice/vdevice.c
--- a/tools/vdevice/vdevice.c   Mon Jun  5 04:27:31 2006
+++ b/tools/vdevice/vdevice.c   Mon Jun  5 16:40:49 2006
@@ -193,6 +193,46 @@
        void (*list)(struct vdevice_type *, const struct vdevice_desc *vdesc);
 };
 
+/* --create handler for the "net" type.  Returns num args consumed, which
+ * is always 0 here because none of the parameters are used. */
+static int net_create(struct vdevice_type *type __unused,
+                     share_ref_t ref __unused, void *map __unused,
+                     int argc __unused, char *argv[] __unused)
+{
+       /* We don't need to do anything to the shared page, nor wait
+        * for "backend". */
+       return 0;
+}
+
+/* List info about this vdevice.
+ *
+ * Maps the device's shared pages and prints one " [slot]=MAC" entry for
+ * every slot that is not all-zero, followed by "(promisc)" when bit 0x01
+ * of the slot's flags is set.  Prints " *cannot map*" and returns if the
+ * pages cannot be mapped.  The type argument is not used. */
+static void net_list(struct vdevice_type *type __unused,
+                    const struct vdevice_desc *vdesc)
+{
+       unsigned int i, peer_id;
+
+       /* Local copy of the per-peer record; it has the same members as
+        * struct xensnet_receiver in the xensnet driver and presumably
+        * has to stay in step with it. */
+       struct xensnet_receiver
+       {
+               unsigned char mac[ETH_ALEN];
+               /* Currently stores a peer's promiscuity state */
+               unsigned char flags;
+       };
+       struct xensnet_receiver *r, empty;
+
+       r = map_pages(vdesc->shared_ref, vdesc->nr_pages, &peer_id);
+       if (!r) {
+               printf(" *cannot map*");
+               return;
+       }
+       /* An all-zero record is what an unclaimed slot looks like. */
+       memset(&empty, 0, sizeof(empty));
+       for (i = 0; i < (vdesc->nr_pages*getpagesize())/sizeof(*r); i++) {
+               if (memcmp(&empty, &r[i], sizeof(empty)) != 0)
+                       /* i is unsigned, so it is printed with %u. */
+                       printf(" [%u]=%02x:%02x:%02x:%02x:%02x:%02x%s",
+                              i, r[i].mac[0], r[i].mac[1], r[i].mac[2],
+                              r[i].mac[3], r[i].mac[4], r[i].mac[5],
+                              r[i].flags & 0x01 ? "(promisc)" : "");
+       }
+       unmap_pages(r);
+}
+
 /* Volatile is important: someone else changes it. */
 static uint32_t get_status(volatile struct vdevice_desc *vdevice)
 {
@@ -346,6 +386,13 @@
 }
 
 static struct vdevice_type types[] = {
+       /* Xen share network device.  The type/features pair is the same
+        * one listed in xensnet_ids[] of the xensnet driver (presumably
+        * how the kernel side is matched to it -- confirm). */
+       { .name = "net",
+         .type = 1,
+         .features = 1,
+         .num_pages = 1,
+         .create = net_create,
+         .list = net_list,
+       },
 };
 
 #define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0]))
diff -r c0c781af505d linux-2.6-xen-sparse/drivers/xen/sharenet/Makefile
--- /dev/null   Mon Jun  5 04:27:31 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/sharenet/Makefile        Mon Jun  5 16:40:49 2006
@@ -0,0 +1,1 @@
+obj-m  := xensnet.o
diff -r c0c781af505d linux-2.6-xen-sparse/drivers/xen/sharenet/xensnet.c
--- /dev/null   Mon Jun  5 04:27:31 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/sharenet/xensnet.c       Mon Jun  5 16:40:49 2006
@@ -0,0 +1,527 @@
+/* Simple Xen share network */
+// #define DEBUG
+#include <xen/interface/share.h>
+#include <xen/evtchn.h>
+#include <asm/io.h>
+#include <asm/share.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/module.h>
+#include <linux/vdevice.h>
+
+#define SHARED_SIZE            4096    /* not referenced in this file */
+#define DATA_SIZE              1500    /* payload bytes per frame; also the MTU */
+#define MAX_LANS               4       /* not referenced in this file */
+#define NUM_SKBS               32      /* receive slots per device */
+#define PROMISC_BIT            0x01    /* in xensnet_receiver.flags */
+
+/* One peer's record in the shared page; the array of these is indexed
+ * by peer id. */
+struct xensnet_receiver
+{
+       /* Peer's Ethernet address.  mac[0] == 0 marks the slot as unused
+        * (see unused_peer). */
+       unsigned char mac[ETH_ALEN];
+       /* Currently stores a peer's promiscuity state (PROMISC_BIT) */
+       unsigned char flags;
+};
+
+/* The skbs which are registered as sglists with the hypervisor.
+ * One of these per receive slot; see fill_slot. */
+struct xensnet_skb
+{
+       /* Buffer currently occupying the slot, or NULL if fill_slot failed. */
+       struct sk_buff *skb;
+       /* Set by Hypervisor when other end triggers.  fill_slot resets it
+        * to 0 and xensnet_handler treats 0 as "nothing received". */
+       u32 length;
+};
+
+/* Per-device private state, stored in net_device->priv. */
+struct xensnet_info
+{
+       /* The shared page: one xensnet_receiver per peer, indexed by
+        * peer id (our own entry is peers[vdev->share->peerid]). */
+       struct xensnet_receiver *peers;
+
+       /* vdev->private == netdevice. */
+       struct vdevice *vdev;
+
+       /* Counters handed out by xensnet_get_stats. */
+       struct net_device_stats stats;
+
+       /* Receive queue. */
+       struct xensnet_skb skbs[NUM_SKBS];
+
+       /* Single cached (failed) transmission, with lock.  out_skb is
+        * NULL when nothing is parked; out_peer is the peer it was
+        * destined for. */
+       spinlock_t out_lock;
+       struct sk_buff *out_skb;
+       unsigned int out_peer;
+
+       /* Callback hooked onto the share in setup_device. */
+       struct xen_share_handler handler;
+
+       /* Set to 0 when congestion relieved.  And later when peers
+        * join/unjoin.  The driver itself holds it at 1 between events. */
+       u32 change_watch;
+       /* Number of xensnet_receiver records that fit in the shared
+        * pages; computed in xensnet_probe. */
+       u32 max_partitions;
+};
+
+/* Number of bytes from data up to the end of the page that contains it
+ * (a full PAGE_SIZE when data is page-aligned). */
+static unsigned int rest_of_page(void *data)
+{
+       unsigned long in_page = (unsigned long)data & (PAGE_SIZE - 1);
+
+       return PAGE_SIZE - in_page;
+}
+
+/* Hand the linear data of skb to peer number peernum with xen_sg_xfer().
+ *
+ * Returns 0 when the whole packet went across, and also when the peer
+ * took only part of it (counted as tx_aborted_errors, never retried).
+ * Otherwise returns the negative value from xen_sg_xfer(), after
+ * counting it in tx_fifo_errors.  The skb is not freed here.  The skb
+ * must not be fragmented; both BUG_ONs enforce that. */
+static int transfer_packet(struct net_device *dev, struct sk_buff *skb,
+                          unsigned int peernum, struct xensnet_info *info)
+{
+       unsigned int nsg = 0, room;
+       struct xen_sg sg[2+MAX_SKB_FRAGS]; /* FIXME: Check overflow */
+       unsigned long off;
+       int sent;
+
+       BUG_ON(skb_headlen(skb) != skb->len);
+       /* FIXME: pages might not be contiguous, but if Xen did
+        * translation we wouldn't have to worry about it. */
+       /* One sg entry per page the data touches. */
+       for (off = 0; off < skb_headlen(skb); off += room) {
+               room = rest_of_page(skb->data + off);
+               sg[nsg].addr = virt_to_phys(skb->data + off);
+               sg[nsg].len = min((unsigned)(skb_headlen(skb) - off), room);
+               nsg++;
+       }
+
+       BUG_ON(skb_shinfo(skb)->nr_frags);
+
+       pr_debug("xfer length %04x (%u)\n", htons(skb->len), skb->len);
+       sent = xen_sg_xfer(info->vdev->share, peernum, XEN_SG_OUT, nsg, sg);
+
+       if (sent < 0) {
+               pr_debug("Can't xfer to peer %i: %i\n", peernum, sent);
+               info->stats.tx_fifo_errors++;
+               return sent;
+       }
+
+       if (sent != skb->len) {
+               info->stats.tx_aborted_errors++;
+               pr_debug("Short xfer to peer %i: %i of %i (sg %p/%li)\n",
+                        peernum, sent, skb->len,
+                        (void *)sg[0].addr, sg[0].len);
+               /* This is their problem, don't re-xmit. */
+               return 0;
+       }
+
+       pr_debug("xensnet: sent %u bytes in %i chunks\n",
+                skb->len, nsg);
+       info->stats.tx_bytes += skb->len;
+       info->stats.tx_packets++;
+       return 0;
+}
+
+/* Does mac equal the address stored in peer's record of the shared
+ * page?  Returns 1 if so, 0 otherwise. */
+static int mac_eq(const unsigned char mac[ETH_ALEN],
+                 struct xensnet_info *info, unsigned int peer)
+{
+       const unsigned char *theirs = info->peers[peer].mac;
+
+       return !memcmp(mac, theirs, ETH_ALEN);
+}
+
+/* A record is unclaimed when the first byte of its MAC is zero;
+ * setup_device forces that byte non-zero in every address it assigns. */
+static int unused_peer(struct xensnet_receiver *peers, unsigned int num)
+{
+       const struct xensnet_receiver *slot = peers + num;
+
+       return !slot->mac[0];
+}
+
+
+/* Is dest the Ethernet broadcast address ff:ff:ff:ff:ff:ff? */
+static int is_broadcast(const unsigned char dest[ETH_ALEN])
+{
+       /* Every byte is 0xFF exactly when the AND of all six is 0xFF. */
+       return (dest[0] & dest[1] & dest[2] & dest[3] & dest[4] & dest[5])
+               == 0xFF;
+}
+
+/* Non-zero if the given peer has PROMISC_BIT set in its record of the
+ * shared page. */
+static int promisc(struct xensnet_info *info, unsigned int peer)
+{
+       return info->peers[peer].flags & PROMISC_BIT;
+}
+
+/* set_multicast_list hook: mirror dev's IFF_PROMISC flag into the flags
+ * byte of our own record in the shared page.  Nothing else (in
+ * particular no multicast list) is handled. */
+static void xensnet_set_multicast(struct net_device *dev)
+{
+       struct xensnet_info *info = dev->priv;
+       struct xensnet_receiver *me = &info->peers[info->vdev->share->peerid];
+
+       if (!(dev->flags & IFF_PROMISC))
+               me->flags &= ~PROMISC_BIT;
+       else
+               me->flags |= PROMISC_BIT;
+}
+
+/* hard_start_xmit hook.
+ *
+ * Offers skb to every claimed peer other than ourselves that should see
+ * it: all of them for a broadcast, otherwise the promiscuous ones and
+ * the one whose MAC equals the destination.  If a non-broadcast transfer
+ * comes back with -ENOSPC, the skb is parked in info->out_skb for
+ * xensnet_handler to retry and the queue is stopped.  In every other
+ * case the skb is freed here.  Always returns 0. */
+static int xensnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+       struct xensnet_info *info = dev->priv;
+       const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
+       int broadcast = is_broadcast(dest) ? 1 : 0;
+       int transferred = 0;
+       unsigned int peer;
+
+       pr_debug("xensnet %s: xmit broadcast=%i\n", dev->name, broadcast);
+       pr_debug("dest: %02x:%02x:%02x:%02x:%02x:%02x\n",
+                dest[0], dest[1], dest[2], dest[3], dest[4], dest[5]);
+
+       for (peer = 0; peer < info->max_partitions; peer++) {
+               unsigned long flags;
+               int err;
+
+               /* Never to ourselves, never to an unclaimed record. */
+               if (peer == info->vdev->share->peerid)
+                       continue;
+               if (unused_peer(info->peers, peer))
+                       continue;
+
+               /* Not a recipient of this frame. */
+               if (!broadcast && !promisc(info, peer)
+                   && !mac_eq(dest, info, peer))
+                       continue;
+
+               pr_debug("xensnet %s: sending from %i to %i\n",
+                        dev->name, info->vdev->share->peerid, peer);
+               spin_lock_irqsave(&info->out_lock, flags);
+               err = transfer_packet(dev, skb, peer, info);
+               if (err == -ENOSPC && !broadcast) {
+                       /* Queue this packet, stop queue. */
+                       pr_debug("Queuing packet, stopping queue\n");
+                       BUG_ON(info->out_skb);
+                       info->out_skb = skb;
+                       info->out_peer = peer;
+                       netif_stop_queue(dev);
+                       spin_unlock_irqrestore(&info->out_lock, flags);
+                       return 0;
+               }
+               spin_unlock_irqrestore(&info->out_lock, flags);
+               transferred = 1;
+       }
+
+       /* Nobody was even tried: count it as a carrier error. */
+       if (!transferred) {
+               pr_debug("Can't xfer to %02x:%02x:%02x:%02x:%02x:%02x\n",
+                        dest[0], dest[1], dest[2], dest[3], dest[4], dest[5]);
+               info->stats.tx_carrier_errors++;
+       }
+
+       dev_kfree_skb(skb);
+       return 0;
+}
+
+/* Allocate a receive skb owned by dev, with room for one full frame
+ * (ETH_HLEN + DATA_SIZE) behind 16 bytes of reserved headroom.
+ * Returns NULL if alloc_skb() fails. */
+static struct sk_buff *xensnet_alloc_skb(struct net_device *dev, int gfpflags)
+{
+       struct sk_buff *skb = alloc_skb(16 + ETH_HLEN + DATA_SIZE, gfpflags);
+
+       if (skb) {
+               skb->dev = dev;
+               skb_reserve(skb, 16);
+       }
+       return skb;
+}
+
+/* Unregister scatter-gather with hypervisor.
+ *
+ * Withdraws the sg registration that fill_slot() made for
+ * info->skbs[slot]; the skb itself is not freed.  An empty slot is
+ * skipped: fill_slot() leaves the pointer NULL when it fails, and there
+ * is then nothing registered to undo. */
+static void release_skb(struct xensnet_info *info, int slot)
+{
+       struct sk_buff *skb = info->skbs[slot].skb;
+
+       if (!skb)
+               return;
+
+       /* The registration is identified by the buffer's physical address,
+        * the same value fill_slot() passed as sg[0].addr. */
+       xen_sg_unregister(info->vdev->share, virt_to_phys(skb->data));
+}
+
+/* Find a new skb to put in this slot in shared mem.
+ *
+ * Allocates a fresh receive skb for info->skbs[slot] and registers its
+ * data area with the hypervisor as an incoming sg list, passing the
+ * address of the slot's length word along with it.  Returns 0 on
+ * success, -ENOMEM if no skb could be allocated, or the error from
+ * xen_sg_register(); in both failure cases the slot's skb pointer is
+ * left NULL. */
+static int fill_slot(struct net_device *dev, unsigned int slot)
+{
+       struct xensnet_info *info = dev->priv;
+       /* Only sg[0] is used below. */
+       struct xen_sg sg[MAX_SKB_FRAGS+1];
+       int err;
+
+       /* Try to create and register a new one.  GFP_ATOMIC because this
+        * is also called from xensnet_handler (presumably atomic context
+        * -- confirm). */
+       info->skbs[slot].skb = xensnet_alloc_skb(dev, GFP_ATOMIC);
+       if (!info->skbs[slot].skb) {
+               printk("xensnet: could not fill slot %i\n", slot);
+               return -ENOMEM;
+       }
+
+       /* Mark the slot empty before it is offered for delivery. */
+       info->skbs[slot].length = 0;
+
+       /* One contiguous buffer, big enough for a full frame. */
+       sg[0].addr = virt_to_phys(info->skbs[slot].skb->data);
+       sg[0].len = ETH_HLEN + DATA_SIZE;
+
+       /* We queue up at our peerid, by convention. */
+       err = xen_sg_register(info->vdev->share, XEN_SG_IN, 
+                             info->vdev->share->peerid,
+                             &info->skbs[slot].length, 1, sg);
+       if (err) {
+               dev_kfree_skb_irq(info->skbs[slot].skb);
+               info->skbs[slot].skb = NULL;
+               printk("xensnet: could not register skb for slot %i\n", slot);
+               return err;
+       }
+
+       pr_debug("xensnet: %s populating slot %i with %p\n", dev->name, slot,
+                       info->skbs[slot].skb);
+
+       return 0;
+}
+
+/* Try once more to send the parked skb to peer.
+ *
+ * Returns 0 when there is no skb, or when the transfer again ended with
+ * -ENOSPC (the skb stays parked).  Returns 1 when the caller should
+ * dispose of the skb: the peer's record is now unclaimed, or the
+ * transfer ended with any result other than -ENOSPC. */
+static int try_retransmit(struct net_device *dev, struct xensnet_info *info,
+                          struct sk_buff *skb, unsigned int peer)
+{
+       int rc;
+
+       /* Nothing to re-xmit? */
+       if (!skb)
+               return 0;
+
+       if (!unused_peer(info->peers, peer)) {
+               rc = transfer_packet(dev, skb, peer, info);
+               pr_debug("Transferring queued packet %i\n", rc);
+               /* Any error other than "no buffers left". */
+               return rc == -ENOSPC ? 0 : 1;
+       }
+
+       /* Peer has gone away? */
+       printk("Peer %i no longer exists!\n", peer);
+       return 1;
+}
+
+/* Share event handler.  Each time it runs it does two things:
+ *  - if change_watch no longer reads 1, retry the single parked
+ *    transmit and, once that skb is disposed of, wake the queue;
+ *  - pass every receive slot with a non-zero length up the stack,
+ *    refilling the slot first.
+ * Buffers dropped here are released with dev_kfree_skb_irq(), the same
+ * call the other frees on this path use (below and in fill_slot). */
+static void xensnet_handler(struct xen_share_handler *handler)
+{
+       struct xensnet_info *info;
+       struct net_device *dev;
+       unsigned int i;
+       struct sk_buff *skb;
+
+       info = container_of(handler, struct xensnet_info, handler);
+       dev = info->vdev->private;
+
+       /* Something changed?  If we have packet queued, try re-xmit. */
+       if (info->change_watch != 1) {
+               unsigned long flags;
+
+               /* Re-arm the watch word before acting on the change. */
+               info->change_watch = 1;
+
+               pr_debug("%i: try_retransmit\n", info->vdev->share->peerid);
+               spin_lock_irqsave(&info->out_lock, flags);
+               if (try_retransmit(dev, info, info->out_skb, info->out_peer)) {
+                       dev_kfree_skb_irq(info->out_skb);
+                       info->out_skb = NULL;
+                       netif_wake_queue(dev);
+               } else
+                       pr_debug("%i: try_retransmit failed\n", info->vdev->share->peerid);
+               spin_unlock_irqrestore(&info->out_lock, flags);
+       }
+
+       for (i = 0; i < ARRAY_SIZE(info->skbs); i++) {
+               unsigned int length;
+
+               length = info->skbs[i].length;
+               if (length == 0)
+                       continue;
+
+               /* Take the filled skb out before fill_slot() overwrites
+                * the slot with a fresh one. */
+               skb = info->skbs[i].skb;
+               fill_slot(dev, i);
+
+               if (skb) {
+                       /* Needs at least an Ethernet header, and cannot
+                        * exceed what fill_slot() registered. */
+                       if (length < ETH_HLEN
+                           || length > ETH_HLEN + DATA_SIZE) {
+                               printk(KERN_WARNING
+                                      "xensnet: unbelievable skb len: %u\n",
+                                      length);
+                               dev_kfree_skb_irq(skb);
+                               continue;
+                       }
+                       skb_put(skb, length);
+                       skb->protocol = eth_type_trans(skb, dev);
+                       /* This is a reliable transport. */
+                       skb->ip_summed = CHECKSUM_UNNECESSARY;
+                       pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
+                                ntohs(skb->protocol), skb->len,skb->pkt_type);
+
+                       info->stats.rx_bytes += skb->len;
+                       info->stats.rx_packets++;
+                       netif_rx(skb);
+               }
+       }
+}
+
+/* Called on open: publish our MAC address and promiscuity in our own
+ * record of the shared page, then give every entry of info->skbs a
+ * registered receive buffer.
+ *
+ * Returns 0 on success.  If fill_slot() fails, everything done so far
+ * is undone and its error code is returned. */
+static int populate_page(struct net_device *dev)
+{
+       int i;
+       struct xensnet_info *info = dev->priv;
+       struct xensnet_receiver *me = &info->peers[info->vdev->share->peerid];
+       int retval;
+
+       pr_debug("xensnet: peer %i shared page %p me %p\n",
+                info->vdev->share->peerid, info->peers, me);
+       /* Save MAC address */
+       memcpy(me->mac, dev->dev_addr, ETH_ALEN);
+
+       me->flags = 0;
+       /* Turn on promisc mode if needed */
+       xensnet_set_multicast(dev);
+
+       for (i = 0; i < ARRAY_SIZE(info->skbs); i++) {
+               retval = fill_slot(dev, i);
+
+               if (retval)
+                       goto cleanup;
+       }
+       pr_debug("xensnet: allocated %i watches\n", i);
+
+       return 0;
+
+cleanup:
+       /* Stop advertising ourselves, as unpopulate_page() does: peers
+        * should not see a claimed record for a device that did not come
+        * up. */
+       memset(me, 0, sizeof(*me));
+       mb();
+
+       /* Slots below i were filled; slot i was left NULL by fill_slot(). */
+       while (--i >= 0) {
+               release_skb(info, i);
+               dev_kfree_skb(info->skbs[i].skb);
+               info->skbs[i].skb = NULL;
+       }
+
+       /* Report the real cause rather than a blanket -ENOMEM. */
+       return retval;
+}
+
+/* Called on close: withdraw our record from the shared page, then
+ * unregister and free every receive buffer.  Slots that hold no buffer
+ * are skipped, and each pointer is cleared once its skb is freed so
+ * nothing stale stays behind in info->skbs. */
+static void unpopulate_page(struct xensnet_info *info)
+{
+       unsigned int i;
+       struct xensnet_receiver *me = &info->peers[info->vdev->share->peerid];
+
+       /* Clear all trace: others might deliver packets, we'll ignore it. */
+       memset(me, 0, sizeof(*me));
+       mb();
+
+       /* Disclaim slot. */
+       me->mac[0] = 0;
+
+       /* Deregister sg lists, free up skbs, remove triggers. */
+       for (i = 0; i < ARRAY_SIZE(info->skbs); i++) {
+               /* fill_slot() leaves the pointer NULL when it fails. */
+               if (!info->skbs[i].skb)
+                       continue;
+
+               release_skb(info, i);
+               dev_kfree_skb(info->skbs[i].skb);
+               info->skbs[i].skb = NULL;
+       }
+}
+
+/* dev->open hook: claim our record in the shared page and set up the
+ * receive buffers.  Returns whatever populate_page() returns. */
+static int xensnet_open(struct net_device *dev)
+{
+       return populate_page(dev);
+}
+
+/* dev->stop hook: give up our record in the shared page and release the
+ * receive buffers.  Always returns 0. */
+static int xensnet_close(struct net_device *dev)
+{
+       struct xensnet_info *info = dev->priv;
+
+       unpopulate_page(info);
+       return 0;
+}
+
+/* dev->get_stats hook: the counters live in our private xensnet_info. */
+static struct net_device_stats *xensnet_get_stats(struct net_device *dev)
+{
+       return &((struct xensnet_info *)dev->priv)->stats;
+}
+
+/* Create and register the net device for vdev.
+ *
+ * Allocates the net_device, fills in its hooks and private state, adds
+ * the share handler, starts watching offset 0 of the share, and
+ * registers the device.  Returns the device on success.  On failure
+ * every step is undone, vdev->private is reset to NULL so it does not
+ * point at the freed device, and ERR_PTR(-errno) is returned. */
+static struct net_device *setup_device(struct vdevice *vdev)
+{
+       int err;
+       struct net_device *dev;
+       struct xensnet_info *info;
+
+       vdev->private = dev = alloc_etherdev(sizeof(struct xensnet_info));
+       if (!dev)
+               return ERR_PTR(-ENOMEM);
+
+       SET_MODULE_OWNER(dev);
+
+       /* Ethernet defaults with some changes */
+       ether_setup(dev);
+       dev->set_mac_address = NULL;
+       dev->mtu = DATA_SIZE;
+
+       /* FIXME: Base initial MAC address on domain id. */
+       random_ether_addr(dev->dev_addr);
+       /* Ensure top byte not zero: unused_peer() reads a zero first
+        * byte as "record not claimed". */
+       dev->dev_addr[0] |= 0x80;
+
+       dev->open = xensnet_open;
+       dev->stop = xensnet_close;
+       dev->hard_start_xmit = xensnet_start_xmit;
+       dev->get_stats = xensnet_get_stats;
+       /* Turning on/off promisc will call dev->set_multicast_list.
+        * We don't actually support multicast yet */
+       dev->set_multicast_list = xensnet_set_multicast;
+       /* Only true for x86 where share_ref == mfn, but gives indication */
+       dev->mem_start = vdev->share->share_ref << PAGE_SHIFT;
+       dev->mem_end = dev->mem_start + PAGE_SIZE;
+       dev->dma = 0;
+
+       info = dev->priv;
+       info->vdev = vdev;
+       info->out_skb = NULL;
+       /* 1 is the idle value; xensnet_handler acts when it reads
+        * anything else. */
+       info->change_watch = 1;
+       info->peers = vdev->share->addr;
+       spin_lock_init(&info->out_lock);
+
+       /* skbs allocated on open */
+       memset(info->skbs, 0, sizeof(info->skbs));
+
+       info->handler.handler = xensnet_handler;
+       xen_share_add_handler(vdev->share, &info->handler);
+
+       /* Watch offset 0 for changes. */
+       err = xen_share_watch(vdev->share, 0, &info->change_watch);
+       if (err) {
+               pr_debug("xensnet: watching 0x%lx %i failed\n",
+                        vdev->share->share_ref,
+                        vdev->share->peerid);
+               goto remove_handler;
+       }
+
+       err = register_netdev(dev);
+       if (err) {
+               pr_debug("xensnet: registering device failed\n");
+               goto free_watch;
+       }
+       pr_debug("xensnet: registered device %s\n", dev->name);
+
+       return dev;
+
+free_watch:
+       xen_share_unwatch(vdev->share, 0);
+remove_handler:
+       xen_share_remove_handler(vdev->share, &info->handler);
+       /* Don't leave a dangling pointer to the device we are freeing. */
+       vdev->private = NULL;
+       free_netdev(dev);
+       return ERR_PTR(err);
+}
+
+/* vdevice remove hook: undo setup_device() -- unregister the net device,
+ * drop the watch on offset 0 and the share handler, then free the
+ * device (which also frees our xensnet_info, allocated with it). */
+static void xensnet_remove(struct vdevice *vdev)
+{
+       struct net_device *dev = vdev->private;
+       struct xensnet_info *priv = dev->priv;
+
+       unregister_netdev(dev);
+
+       xen_share_unwatch(vdev->share, 0);
+       xen_share_remove_handler(vdev->share, &priv->handler);
+
+       free_netdev(dev);
+}
+
+/* vdevice probe hook: create and register the net device for vdev, then
+ * work out how many xensnet_receiver records fit in the shared pages.
+ * Returns 0, or the negative errno reported by setup_device(). */
+static int xensnet_probe(struct vdevice *vdev, const struct vdevice_id *ent)
+{
+       struct xensnet_info *info;
+       struct net_device *netdev = setup_device(vdev);
+
+       if (IS_ERR(netdev))
+               return PTR_ERR(netdev);
+
+       vdev->private = netdev;
+
+       info = netdev->priv;
+       info->max_partitions = vdev->share->num_pages * PAGE_SIZE
+                               / sizeof(struct xensnet_receiver);
+
+       printk(KERN_INFO
+              "xensnet: mapped lan %s at share_ref 0x%lx upto %i nodes\n",
+              netdev->name, vdev->share->share_ref, info->max_partitions);
+       return 0;
+}
+
+/* Device ids this driver is offered.  The all-zero entry presumably
+ * ends the list -- confirm against the vdevice bus matching code. */
+static struct vdevice_id xensnet_ids[] = {
+       { .type = 1, .features = 1 },
+       { .type = 0, .features = 0 },
+};
+/* Driver description handed to register_vdevice_driver().  Only probe
+ * and remove are implemented; stop and reconnect are left unset. */
+static struct vdevice_driver xensnet_drv = {
+       .name = "xensnet",
+       .owner = THIS_MODULE,
+       .ids = xensnet_ids,
+       .probe = xensnet_probe,
+       .remove = xensnet_remove,
+       .stop = NULL,
+       .reconnect = NULL,
+};
+
+/* Module entry point: register the driver with the vdevice bus and
+ * return the result.  NOTE(review): this file defines no module exit
+ * function and never unregisters the driver -- confirm that is
+ * intended. */
+static __init int xensnet_init(void)
+{
+       return register_vdevice_driver(&xensnet_drv);
+}
+
+module_init(xensnet_init);
+MODULE_LICENSE("GPL");

-- 
 ccontrol: http://ccontrol.ozlabs.org


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.