[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] Lguest implementation of virtio draft III



This is a bonus patch for those wondering how a virtio implementation
can look.  I have two; this is the more efficient one (it needs some
modification for inter-guest use, though: it assumes the other end does
all the accessing of our memory).  It's currently tacked on to the
existing lguest I/O mechanism as a demonstration, rather than replacing it.

It shows that it's possible to implement virtio without internal
locking.

Userspace server-side code isn't included.
===
This allows zero-copy from guest <-> host.  It uses a page of
descriptors, a page to say what descriptors to use, and a page to say
what's been used: one such set for inbufs and one for outbufs.

TODO:
1) More polishing
2) Get rid of old I/O
3) Inter-guest I/O implementation

Signed-off-by: Rusty Russell <rusty@xxxxxxxxxxxxxxx>
---
 drivers/lguest/Makefile         |    2 
 drivers/lguest/hypercalls.c     |    4 
 drivers/lguest/lguest_virtio.c  |  511 +++++++++++++++++++++++++++++++++++++++
 include/linux/lguest.h          |    3 
 include/linux/lguest_launcher.h |   24 +
 6 files changed, 948 insertions(+), 5 deletions(-)

--- a/drivers/lguest/Makefile
+++ b/drivers/lguest/Makefile
@@ -1,5 +1,5 @@
 # Guest requires the paravirt_ops replacement and the bus driver.
-obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o
+obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o lguest_virtio.o
 
 # Host requires the other files, which can be a module.
 obj-$(CONFIG_LGUEST)   += lg.o
===================================================================
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -86,6 +86,10 @@ static void do_hcall(struct lguest *lg, 
                break;
        case LHCALL_HALT:
                lg->halted = 1;
+               break;
+       case LHCALL_NOTIFY:
+               lg->pending_key = regs->edx << PAGE_SHIFT;
+               lg->dma_is_pending = 1;
                break;
        default:
                kill_guest(lg, "Bad hypercall %li\n", regs->eax);
===================================================================
--- /dev/null
+++ b/drivers/lguest/lguest_virtio.c
@@ -0,0 +1,511 @@
+/* Descriptor-based virtio backend using lguest. */
+
+/* FIXME: Put "running" in shared page so other side really doesn't
+ * send us interrupts.  Then we would never need to "fail" restart.
+ * If there are more buffers when we set "running", simply ping other
+ * side.  It would interrupt us back again.
+ */
+#define DEBUG
+#include <linux/lguest.h>
+#include <linux/lguest_bus.h>
+#include <linux/virtio.h>
+#include <linux/interrupt.h>
+#include <asm/io.h>
+
+#define NUM_DESCS (PAGE_SIZE / sizeof(struct lguest_desc))
+
+#ifdef DEBUG
+/* For development: crash whenever the other side is bad, and panic if a desc_info is entered twice (START_USE records __LINE__, END_USE clears it). */
+#define BAD_SIDE(lgv, fmt...)                  \
+       do { dev_err((lgv)->vdev.dev, fmt); BUG(); } while(0)
+#define START_USE(di) \
+       do { if ((di)->in_use) panic("in_use = %i\n", (di)->in_use); (di)->in_use = __LINE__; mb(); } while(0)
+#define END_USE(di) \
+       do { BUG_ON(!(di)->in_use); (di)->in_use = 0; mb(); } while(0)
+#else /* !DEBUG: a bad other side only marks the device broken; use tracking compiles away. */
+#define BAD_SIDE(lgv, fmt...)                  \
+       do { dev_err((lgv)->vdev.dev, fmt); (lgv)->broken = true; } while(0)
+#define START_USE(di)
+#define END_USE(di)
+#endif
+
+/* FIXME: make the device mem layout a struct, not a set of pointers */
+struct desc_info
+{
+       /* Page of descriptors; unused entries are chained through ->next. */
+       struct lguest_desc *desc;
+       /* How we tell other side what buffers are available. */
+       unsigned int *avail_idx;        /* only advanced by lguest_sync() */
+       unsigned int *available;        /* ring of chain heads, slot = index % NUM_DESCS */
+       /* How other side tells us what's used. */
+       unsigned int *used_idx;         /* compared against last_used_idx */
+       struct lguest_used *used;       /* ring of {id, len}, slot = index % NUM_DESCS */
+
+       /* Number of descriptors currently on the free list. */
+       unsigned int num_free;
+       /* Head of free buffer list. */
+       unsigned int free_head;
+       /* Number we've added since last sync. */
+       unsigned int num_added;
+
+       /* Last used index we've seen. */
+       unsigned int last_used_idx;
+
+       /* Interrupt handler calls the driver only while set; cleared when the callback returns false, set again by restart. */
+       bool running;
+
+#ifdef DEBUG
+       /* They're supposed to lock for us: __LINE__ of the active START_USE(), 0 when idle. */
+       unsigned int in_use;
+#endif
+
+       /* Tokens for callbacks, indexed by head descriptor; returned by get_buf(). */
+       void *data[NUM_DESCS];
+};
+
+/* FIXME: When doing this for real, vdev will go straight into lguest_device */
+struct lguest_virtio_device
+{
+       struct virtio_device vdev;      /* embedded; see vdev_to_lgv() */
+       struct lguest_device *lg;       /* lguest bus device passed to lg_new_virtio() */
+       void *priv;                     /* result of virtnet_probe() / virtblk_probe() */
+
+       /* Other side has made a mess, don't try any more. */
+       bool broken;
+
+       struct desc_info in, out;       /* one descriptor set per direction */
+};
+
+static inline struct lguest_virtio_device *
+vdev_to_lgv(struct virtio_device *vdev)
+{
+       return container_of(vdev, struct lguest_virtio_device, vdev);   /* vdev is embedded, not pointed to */
+}
+
+static unsigned long add_buf(struct desc_info *di,
+                            const struct scatterlist *sg,
+                            unsigned int num,
+                            void *data)
+{
+       unsigned int i, head, uninitialized_var(prev);
+
+       BUG_ON(data == NULL);   /* NULL from get_buf() means "nothing used" */
+       START_USE(di);
+       /* Chains num free descriptors for sg[]; returns the head index, or -ENOSPC if too few are free. */
+       if (di->num_free < num) {
+               pr_debug("Can't add buf len %i - avail = %i\n", num,
+                        di->num_free);
+               END_USE(di);
+               return -ENOSPC;
+       }
+
+       /* We're about to use some buffers from the free list. */
+       di->num_free -= num;
+       /* Walk the free list through ->next, one descriptor per sg entry. */
+       head = di->free_head;
+       for (i = di->free_head; num; i = di->desc[i].next, num--) {
+               di->desc[i].flags |= LGUEST_DESC_F_NEXT;
+               di->desc[i].pfn = page_to_pfn(sg[0].page);
+               di->desc[i].offset = sg[0].offset;
+               di->desc[i].len = sg[0].length;
+               prev = i;
+               sg++;
+       }
+       /* Last one doesn't continue. */
+       di->desc[prev].flags &= ~LGUEST_DESC_F_NEXT;
+
+       /* Update free pointer */
+       di->free_head = i;
+
+       di->data[head] = data;
+
+       /* Make sure it's all visible to other side before setting head. */
+       wmb();
+       di->desc[head].flags |= LGUEST_DESC_F_HEAD;
+
+       /* Stage it in the available ring; avail_idx itself only moves in lguest_sync(). */
+       di->available[(*di->avail_idx + di->num_added++) % NUM_DESCS] = head;
+
+       pr_debug("Added buffer head %i\n", head);
+       END_USE(di);
+       return head;
+}
+
+static unsigned long lguest_add_outbuf(struct virtio_device *vdev,
+                                      const struct scatterlist sg[],
+                                      unsigned int num,
+                                      void *data)
+{
+       struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+       BUG_ON(num > NUM_DESCS);        /* more entries than descriptors exist */
+       BUG_ON(num == 0);
+       /* Same path as inbufs; only the descriptor set differs. */
+       return add_buf(&lgv->out, sg, num, data);
+}
+
+static unsigned long lguest_add_inbuf(struct virtio_device *vdev,
+                                     struct scatterlist sg[],
+                                     unsigned int num,
+                                     void *data)
+{
+       struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+       BUG_ON(num > NUM_DESCS);        /* more entries than descriptors exist */
+       BUG_ON(num == 0);
+       /* Same path as outbufs; only the descriptor set differs. */
+       return add_buf(&lgv->in, sg, num, data);
+}
+
+static void lguest_sync(struct virtio_device *vdev, enum virtio_dir inout)
+{
+       struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+       if (inout & VIRTIO_IN)
+               START_USE(&lgv->in);
+       if (inout & VIRTIO_OUT)
+               START_USE(&lgv->out);
+       /* LGUEST_DESC_F_HEAD needs to be set before we say they're avail. */
+       wmb();
+       /* Publish everything add_buf() staged since the last sync. */
+       if (inout & VIRTIO_IN) {
+               *lgv->in.avail_idx += lgv->in.num_added;
+               lgv->in.num_added = 0;
+       }
+       if (inout & VIRTIO_OUT) {
+               *lgv->out.avail_idx += lgv->out.num_added;
+               lgv->out.num_added = 0;
+       }
+
+       /* Prod other side to tell it about changes, identifying the device by its pfn. */
+       hcall(LHCALL_NOTIFY, lguest_devices[lgv->lg->index].pfn, 0, 0);
+       if (inout & VIRTIO_IN)
+               END_USE(&lgv->in);
+       if (inout & VIRTIO_OUT)
+               END_USE(&lgv->out);
+}
+
+static void detach_buf(struct desc_info *di, int id)
+{
+       unsigned int i;
+
+       BUG_ON(id >= NUM_DESCS);
+       BUG_ON(!(di->desc[id].flags & LGUEST_DESC_F_HEAD));
+       /* Unmarks chain head id and returns every descriptor of the chain to the free list. */
+       di->desc[id].flags &= ~LGUEST_DESC_F_HEAD;
+       /* Make sure other side has seen that it's detached. */
+       wmb();
+
+       /* Put back on free list: walk to the chain's last descriptor, counting each one passed. */
+       for (i = id; di->desc[i].flags&LGUEST_DESC_F_NEXT; i=di->desc[i].next)
+               di->num_free++;
+       /* Splice the whole chain onto the front of the free list. */
+       di->desc[i].next = di->free_head;
+       di->free_head = id;
+       /* Plus final descriptor */
+       di->num_free++;
+}
+
+static void lguest_detach_outbuf(struct virtio_device *vdev, unsigned long id)
+{
+       struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+       START_USE(&lgv->out);
+       detach_buf(&lgv->out, id);      /* id is the head index add_buf() returned */
+       END_USE(&lgv->out);
+}
+
+static void lguest_detach_inbuf(struct virtio_device *vdev, unsigned long id)
+{
+       struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+       START_USE(&lgv->in);
+       detach_buf(&lgv->in, id);       /* id is the head index add_buf() returned */
+       END_USE(&lgv->in);
+}
+
+static bool more_used(struct desc_info *di)
+{
+       return di->last_used_idx != *di->used_idx;      /* used_idx is advanced by the other side */
+}
+
+static void *get_buf(struct desc_info *di, struct lguest_virtio_device *lgv,
+                    unsigned int *len)
+{
+       unsigned int id;
+
+       START_USE(di);
+       /* Returns the token of the oldest unconsumed used buffer (length via *len), or NULL if none / other side is bad. */
+       if (!more_used(di)) {
+               END_USE(di);
+               return NULL;
+       }
+       /* Don't let them make us do infinite work.  Compare the distance between
+        * the indices, not their raw values: both are free-running and may wrap. */
+       if (unlikely(*di->used_idx - di->last_used_idx > NUM_DESCS)) {
+               BAD_SIDE(lgv, "Too many descriptors\n");
+               return NULL;
+       }
+
+       id = di->used[di->last_used_idx%NUM_DESCS].id;
+       *len = di->used[di->last_used_idx%NUM_DESCS].len;
+
+       if (unlikely(id >= NUM_DESCS)) {
+               BAD_SIDE(lgv, "id %u out of range\n", id);
+               return NULL;
+       }
+       if (unlikely(!(di->desc[id].flags & LGUEST_DESC_F_HEAD))) {
+               BAD_SIDE(lgv, "id %u is not a head!\n", id);
+               return NULL;
+       }
+
+       detach_buf(di, id);
+       di->last_used_idx++;
+       BUG_ON(!di->data[id]);
+       END_USE(di);
+       return di->data[id];
+}
+
+static void *lguest_get_outbuf(struct virtio_device *vdev, unsigned int *len)
+{
+       struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+       return get_buf(&lgv->out, lgv, len);    /* NULL: nothing used yet, or other side is bad */
+}
+
+static void *lguest_get_inbuf(struct virtio_device *vdev, unsigned int *len)
+{
+       struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+       return get_buf(&lgv->in, lgv, len);     /* NULL: nothing used yet, or other side is bad */
+}
+
+static bool lguest_restart_in(struct virtio_device *vdev)
+{
+       struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+       START_USE(&lgv->in);
+       BUG_ON(lgv->in.running);
+       /* Resume only if no used inbufs are waiting (or we're broken, when the interrupt handler bails out anyway). */
+       if (likely(!more_used(&lgv->in)) || unlikely(lgv->broken))
+               lgv->in.running = true;
+
+       END_USE(&lgv->in);
+       return lgv->in.running;         /* false: caller still has used buffers to collect */
+}
+
+static bool lguest_restart_out(struct virtio_device *vdev)
+{
+       struct lguest_virtio_device *lgv = vdev_to_lgv(vdev);
+
+       START_USE(&lgv->out);
+       BUG_ON(lgv->out.running);
+       /* Resume only if no used outbufs are waiting (or we're broken, when the interrupt handler bails out anyway). */
+       if (likely(!more_used(&lgv->out)) || unlikely(lgv->broken))
+               lgv->out.running = true;
+
+       END_USE(&lgv->out);
+       return lgv->out.running;        /* false: caller still has used buffers to collect */
+}
+
+static irqreturn_t lguest_virtio_interrupt(int irq, void *_lgv)
+{
+       struct lguest_virtio_device *lgv = _lgv;
+
+       if (unlikely(lgv->broken))
+               return IRQ_HANDLED;
+       /* Call the driver only for directions still running; its return value becomes the new running state. */
+       if (lgv->out.running && more_used(&lgv->out))
+               lgv->out.running = lgv->vdev.driver_ops->out(&lgv->vdev);
+
+       if (lgv->in.running && more_used(&lgv->in))
+               lgv->in.running = lgv->vdev.driver_ops->in(&lgv->vdev);
+
+       return IRQ_HANDLED;
+}
+
+static struct virtio_ops lguest_virtio_ops = {        /* installed as vdev.ops by lg_new_virtio() */
+       .add_outbuf = lguest_add_outbuf,
+       .add_inbuf = lguest_add_inbuf,
+       .sync = lguest_sync,
+       .detach_outbuf = lguest_detach_outbuf,
+       .detach_inbuf = lguest_detach_inbuf,
+       .get_outbuf = lguest_get_outbuf,
+       .get_inbuf = lguest_get_inbuf,
+       .restart_in = lguest_restart_in,
+       .restart_out = lguest_restart_out,
+};
+
+static struct lguest_virtio_device *lg_new_virtio(struct lguest_device *lgdev)
+{
+       struct lguest_virtio_device *lgv;
+       void *mem;
+       unsigned int i;
+
+       /* Zeroed allocation: free_head and in_use are never set explicitly and rely on starting at 0. */
+       lgv = kzalloc(sizeof(*lgv), GFP_KERNEL);
+       if (!lgv)
+               return NULL;
+
+       /* Device mem is input pages followed by output pages */
+       mem = lguest_map(lguest_devices[lgdev->index].pfn<<PAGE_SHIFT, 6);
+       if (!mem)
+               goto free_lgv;
+
+       /* Link up only after the mapping worked, so a failed map can't leave lgdev->private pointing at freed memory. */
+       lgdev->private = lgv;
+       lgv->lg = lgdev;
+       lgv->in.desc = mem;
+       lgv->in.avail_idx = mem + PAGE_SIZE;
+       lgv->in.available = (void *)(lgv->in.avail_idx + 1);
+       lgv->in.used_idx = mem + PAGE_SIZE*2;
+       lgv->in.used = (void *)(lgv->in.used_idx + 1);
+       lgv->out.desc = mem + PAGE_SIZE*3;
+       lgv->out.avail_idx = mem + PAGE_SIZE*4;
+       lgv->out.available = (void *)(lgv->out.avail_idx + 1);
+       lgv->out.used_idx = mem + PAGE_SIZE*5;
+       lgv->out.used = (void *)(lgv->out.used_idx + 1);
+
+       lgv->in.last_used_idx = lgv->out.last_used_idx = 0;
+       lgv->in.num_added = lgv->out.num_added = 0;
+       lgv->in.running = lgv->out.running = true;
+
+       /* Put everything in free lists. */
+       lgv->in.num_free = lgv->out.num_free = NUM_DESCS;
+       for (i = 0; i < NUM_DESCS-1; i++) {
+               lgv->in.desc[i].next = i+1;
+               lgv->out.desc[i].next = i+1;
+       }
+
+       lgv->vdev.ops = &lguest_virtio_ops;
+       lgv->vdev.dev = &lgdev->dev;
+       lgv->broken = false;
+       return lgv;
+
+free_lgv:
+       kfree(lgv);
+       return NULL;
+}
+
+static void lg_destroy_virtio(struct lguest_virtio_device *lgv)
+{
+       lguest_unmap(lgv->in.desc);     /* in.desc is the base of the area lg_new_virtio() mapped */
+       kfree(lgv);
+}
+
+/* It's nice to have the name for the interrupt, so we do this separately
+ * from lg_new_virtio(). */
+static int lg_setup_interrupt(struct lguest_virtio_device *lgv,
+                             const char *name)
+{
+       int irqf;
+
+       if (lguest_devices[lgv->lg->index].features&LGUEST_DEVICE_F_RANDOMNESS)
+               irqf = IRQF_SAMPLE_RANDOM;
+       else
+               irqf = 0;
+       /* lgv is the cookie handed back to lguest_virtio_interrupt(); returns request_irq()'s result. */
+       return request_irq(lgdev_irq(lgv->lg), lguest_virtio_interrupt, irqf,
+                          name, lgv);
+}
+
+/* Example network driver code. */
+#include <linux/virtio_net.h>
+#include <linux/etherdevice.h>
+
+static int lguest_virtnet_probe(struct lguest_device *lgdev)
+{
+       struct lguest_virtio_device *lgv;
+       struct net_device *dev;
+       u8 mac[ETH_ALEN];
+       int err;
+
+       lgv = lg_new_virtio(lgdev);
+       if (!lgv)
+               return -ENOMEM;
+       /* Hand the virtio device to virtnet_probe() with a randomly generated MAC. */
+       random_ether_addr(mac);
+       lgv->priv = dev = virtnet_probe(&lgv->vdev, mac);
+       if (IS_ERR(lgv->priv)) {
+               err = PTR_ERR(lgv->priv);
+               goto destroy;
+       }
+       err = lg_setup_interrupt(lgv, dev->name);
+       if (err)
+               goto unprobe;
+       return 0;
+       /* Error unwinding, in reverse order of setup. */
+unprobe:
+       virtnet_remove(dev);
+destroy:
+       lg_destroy_virtio(lgv);
+       return err;
+}
+
+static struct lguest_driver lguest_virtnet_drv = {
+       .name = "lguestvirtnet",
+       .owner = THIS_MODULE,
+       .device_type = LGUEST_DEVICE_T_VIRTNET, /* new type this patch adds to lguest_launcher.h */
+       .probe = lguest_virtnet_probe,
+};
+
+static __init int lguest_virtnet_init(void)
+{
+       return register_lguest_driver(&lguest_virtnet_drv);     /* invoked via device_initcall() */
+}
+device_initcall(lguest_virtnet_init);
+
+/* Example block driver code. */
+#include <linux/virtio_blk.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+static int lguest_virtblk_probe(struct lguest_device *lgdev)
+{
+       struct lguest_virtio_device *lgv;
+       struct gendisk *disk;
+       unsigned long sectors;
+       int err;
+
+       lgv = lg_new_virtio(lgdev);
+       if (!lgv)
+               return -ENOMEM;
+
+       /* Page is initially used to pass capacity: read it from the start of the in-descriptor page, then clear it. */
+       sectors = *(unsigned long *)lgv->in.desc;
+       *(unsigned long *)lgv->in.desc = 0;
+
+       lgv->priv = disk = virtblk_probe(&lgv->vdev);
+       if (IS_ERR(lgv->priv)) {
+               err = PTR_ERR(lgv->priv);
+               goto destroy;
+       }
+       set_capacity(disk, sectors);
+       blk_queue_max_hw_segments(disk->queue, NUM_DESCS-1);
+
+       err = lg_setup_interrupt(lgv, disk->disk_name);
+       if (err)
+               goto unprobe;
+       add_disk(disk);         /* only after the interrupt is in place */
+       return 0;
+       /* Error unwinding, in reverse order of setup. */
+unprobe:
+       virtblk_remove(disk);
+destroy:
+       lg_destroy_virtio(lgv);
+       return err;
+}
+
+static struct lguest_driver lguest_virtblk_drv = {
+       .name = "lguestvirtblk",
+       .owner = THIS_MODULE,
+       .device_type = LGUEST_DEVICE_T_VIRTBLK, /* new type this patch adds to lguest_launcher.h */
+       .probe = lguest_virtblk_probe,
+};
+
+static __init int lguest_virtblk_init(void)
+{
+       return register_lguest_driver(&lguest_virtblk_drv);     /* invoked via device_initcall() */
+}
+device_initcall(lguest_virtblk_init);
+
+MODULE_LICENSE("GPL");
===================================================================
--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -23,6 +23,9 @@
 #define LHCALL_SET_PTE         14
 #define LHCALL_SET_PMD         15
 #define LHCALL_LOAD_TLS                16
+
+/* Experimental hcalls for new I/O */
+#define LHCALL_NOTIFY  100 /* pfn */
 
 #define LG_CLOCK_MIN_DELTA     100UL
 #define LG_CLOCK_MAX_DELTA     ULONG_MAX
===================================================================
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -44,6 +44,8 @@ struct lguest_device_desc {
 #define LGUEST_DEVICE_T_CONSOLE        1
 #define LGUEST_DEVICE_T_NET    2
 #define LGUEST_DEVICE_T_BLOCK  3
+#define LGUEST_DEVICE_T_VIRTNET        8
+#define LGUEST_DEVICE_T_VIRTBLK        9
 
        u16 features;
 #define LGUEST_NET_F_NOCSUM            0x4000 /* Don't bother checksumming */
@@ -70,4 +72,26 @@ enum lguest_req
        LHREQ_IRQ, /* + irq */
        LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
 };
+
+/* This marks a buffer as being the start (and active) */
+#define LGUEST_DESC_F_HEAD     1
+/* This marks a buffer as continuing via the next field. */
+#define LGUEST_DESC_F_NEXT     2
+
+/* Virtio descriptors */
+struct lguest_desc
+{
+       unsigned long pfn;      /* page frame number of the buffer's page */
+       unsigned long len;      /* length of this piece of the buffer */
+       u16 offset;             /* offset of the data within that page */
+       u16 flags;              /* LGUEST_DESC_F_HEAD / LGUEST_DESC_F_NEXT */
+       /* We chain unused descriptors via this, too */
+       u32 next;
+};
+
+struct lguest_used
+{
+       unsigned int id;        /* index of the used chain's head descriptor */
+       unsigned int len;       /* length reported back for that buffer */
+};
 #endif /* _ASM_LGUEST_USER */



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.