[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCHv2 2/2] xen/privcmd: add ioctls for locking/unlocking hypercall buffers



Using mlock() for hypercall buffers is not sufficient since mlocked
pages are still subject to compaction and page migration.  Page
migration can be prevented by taking additional references to the
pages.

Introduce two new ioctls: IOCTL_PRIVCMD_HCALL_BUF_LOCK and
IOCTL_PRIVCMD_HCALL_BUF_UNLOCK which get and put the necessary page
references.  The buffers do not need to be page aligned and they may
overlap with other buffers.  However, it is not possible to partially
unlock a buffer (i.e., the LOCK/UNLOCK must always be paired).  Any
locked buffers are automatically unlocked when the file descriptor is
closed.

An alternative approach would be to extend the driver with an ioctl to
populate a VMA with "special", non-migratable pages.  But the
LOCK/UNLOCK ioctls are more flexible as they allow any page to be used
for hypercalls (e.g., stack, mmap'd files, etc.).  This could be used
to minimize bouncing for performance critical hypercalls.

Signed-off-by: David Vrabel <david.vrabel@xxxxxxxxxx>
---
v2:
- Only take one reference per buffer.
- Iterate over the tree to unlock/free all buffers.
- Max 1GB size per buffer.
- Only use one memory allocation per buffer.
- Fix overflow issues in privcmd_hbuf_compare().
- Return -ENXIO when unlocking an unlocked buffer.
---
 drivers/xen/privcmd.c      | 216 +++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/xen/privcmd.h |  38 ++++++++
 2 files changed, 254 insertions(+)

diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index ac76bc4..5419b31 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -22,6 +22,7 @@
 #include <linux/pagemap.h>
 #include <linux/seq_file.h>
 #include <linux/miscdevice.h>
+#include <linux/rbtree.h>
 
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
@@ -43,6 +44,19 @@ MODULE_LICENSE("GPL");
 
 #define PRIV_VMA_LOCKED ((void *)1)
 
+/* Per-file-descriptor state: the set of currently locked hypercall buffers. */
+struct privcmd_data {
+       struct rb_root hbuf_root;       /* locked buffers, keyed by (start, len) */
+       struct mutex hbuf_mutex;        /* protects hbuf_root and lock counts */
+};
+
+/* One locked hypercall buffer and the page references that pin it. */
+struct privcmd_hbuf {
+       struct rb_node node;
+       struct privcmd_hcall_buf buf;   /* user (start, len); tree key */
+       unsigned int count;             /* lock count; freed when it hits 0 */
+       unsigned int nr_pages;          /* pages actually pinned in pages[] */
+       struct page *pages[];           /* C99 flexible array member, not [0] */
+};
+
 static int privcmd_vma_range_is_mapped(
                struct vm_area_struct *vma,
                unsigned long addr,
@@ -548,9 +562,175 @@ out_unlock:
        goto out;
 }
 
+/* Drop the reference held on each pinned page and free the tracking node. */
+static void privcmd_hbuf_free(struct privcmd_hbuf *hbuf)
+{
+       unsigned int i = hbuf->nr_pages;
+
+       while (i--)
+               put_page(hbuf->pages[i]);
+
+       kfree(hbuf);
+}
+
+/*
+ * Pin the user pages covering [buf->start, buf->start + buf->len) and
+ * return a tracking node holding one extra reference per page, which
+ * prevents swapping and page migration.  The caller bounds buf->len
+ * (<= 1 GiB); privcmd_hbuf_free() releases the references.
+ */
+static struct privcmd_hbuf *privcmd_hbuf_alloc(struct privcmd_hcall_buf *buf)
+{
+       struct privcmd_hbuf *hbuf;
+       unsigned long start, end, nr_pages;
+       int ret;
+
+       start = (unsigned long)buf->start & PAGE_MASK;
+       end = ALIGN((unsigned long)buf->start + buf->len, PAGE_SIZE);
+
+       /*
+        * Reject empty ranges and ranges whose end wraps the address
+        * space: a wrapped 'end' would make nr_pages enormous and
+        * overflow the allocation size computed below.
+        */
+       if (end <= start)
+               return ERR_PTR(-EINVAL);
+
+       nr_pages = (end - start) / PAGE_SIZE;
+
+       hbuf = kzalloc(sizeof(*hbuf) + nr_pages * sizeof(hbuf->pages[0]),
+                      GFP_KERNEL);
+       if (!hbuf)
+               return ERR_PTR(-ENOMEM);
+
+       /*
+        * Take a reference to each page, this will prevent swapping
+        * and page migration.
+        */
+       ret = get_user_pages_fast(start, nr_pages, 1, hbuf->pages);
+       if (ret < 0)
+               goto error;
+
+       hbuf->buf = *buf;
+       hbuf->count = 1;
+       hbuf->nr_pages = ret;
+
+       /* Partial pin: some pages could not be faulted in or pinned. */
+       if (hbuf->nr_pages != nr_pages) {
+               ret = -ENOMEM;
+               goto error;
+       }
+
+       return hbuf;
+
+error:
+       privcmd_hbuf_free(hbuf);
+       return ERR_PTR(ret);
+}
+
+/*
+ * Total order on buffers: by start address, then by length.  Explicit
+ * comparisons (rather than subtraction) cannot overflow on the wide
+ * pointer/size types.
+ */
+static int privcmd_hbuf_compare(struct privcmd_hcall_buf *a,
+                               struct privcmd_hcall_buf *b)
+{
+       if (a->start != b->start)
+               return b->start > a->start ? 1 : -1;
+       if (a->len != b->len)
+               return b->len > a->len ? 1 : -1;
+       return 0;
+}
+
+/*
+ * Lock a buffer: bump the count of an already-locked identical buffer,
+ * or pin its pages and insert a new node.  Caller holds hbuf_mutex.
+ */
+static int privcmd_hbuf_insert(struct privcmd_data *priv,
+                               struct privcmd_hcall_buf *buf)
+{
+       struct rb_node **new = &priv->hbuf_root.rb_node, *parent = NULL;
+       struct privcmd_hbuf *hbuf;
+
+       /* Figure out where to put new node */
+       while (*new) {
+               struct privcmd_hbuf *this =
+                       container_of(*new, struct privcmd_hbuf, node);
+               int result = privcmd_hbuf_compare(buf, &this->buf);
+
+               parent = *new;
+               if (result < 0)
+                       new = &(*new)->rb_left;
+               else if (result > 0)
+                       new = &(*new)->rb_right;
+               else {
+                       /* Same (start, len) already locked: one more count. */
+                       this->count++;
+                       return 0;
+               }
+       }
+
+       /* Allocate and insert a new node. */
+       hbuf = privcmd_hbuf_alloc(buf);
+       if (IS_ERR(hbuf))
+               return PTR_ERR(hbuf);
+
+       rb_link_node(&hbuf->node, parent, new);
+       rb_insert_color(&hbuf->node, &priv->hbuf_root);
+
+       return 0;
+}
+
+/*
+ * Unlock a buffer: find the node matching (start, len) exactly and drop
+ * one lock count, freeing the pinned pages when it reaches zero.
+ * Returns -ENXIO if no exactly matching locked buffer exists (partial
+ * unlocks are not supported).  Caller holds hbuf_mutex.
+ */
+static int privcmd_hbuf_remove(struct privcmd_data *priv,
+                              struct privcmd_hcall_buf *key)
+{
+       struct rb_node *node = priv->hbuf_root.rb_node;
+
+       while (node) {
+               struct privcmd_hbuf *hbuf =
+                       container_of(node, struct privcmd_hbuf, node);
+               int result;
+
+               result = privcmd_hbuf_compare(key, &hbuf->buf);
+
+               if (result < 0)
+                       node = node->rb_left;
+               else if (result > 0)
+                       node = node->rb_right;
+               else {
+                       if (--hbuf->count == 0) {
+                               rb_erase(&hbuf->node, &priv->hbuf_root);
+                               privcmd_hbuf_free(hbuf);
+                       }
+                       return 0;
+               }
+       }
+       return -ENXIO;
+}
+
+/* IOCTL_PRIVCMD_HCALL_BUF_LOCK handler: validate and lock a user buffer. */
+static int privcmd_ioctl_hcall_buf_lock(struct privcmd_data *priv,
+                                       void __user *udata)
+{
+       struct privcmd_hcall_buf buf;
+       int ret;
+
+       if (copy_from_user(&buf, udata, sizeof(buf)))
+               return -EFAULT;
+
+       /*
+        * Cap each buffer at 1 GiB to bound the page-array allocation
+        * and the number of pinned pages.  Reject empty buffers: they
+        * are meaningless for hypercalls but would still insert a node.
+        */
+       if (buf.len == 0 || buf.len > (1 << 30))
+               return -EINVAL;
+
+       mutex_lock(&priv->hbuf_mutex);
+       ret = privcmd_hbuf_insert(priv, &buf);
+       mutex_unlock(&priv->hbuf_mutex);
+
+       return ret;
+}
+
+/* IOCTL_PRIVCMD_HCALL_BUF_UNLOCK handler: drop one lock on a buffer. */
+static int privcmd_ioctl_hcall_buf_unlock(struct privcmd_data *priv,
+                                         void __user *udata)
+{
+       struct privcmd_hcall_buf buf;
+       int ret = -EFAULT;
+
+       if (copy_from_user(&buf, udata, sizeof(buf)))
+               return ret;
+
+       /* The (start, len) pair must match a prior LOCK exactly. */
+       mutex_lock(&priv->hbuf_mutex);
+       ret = privcmd_hbuf_remove(priv, &buf);
+       mutex_unlock(&priv->hbuf_mutex);
+
+       return ret;
+}
+
+/* Free every locked buffer; used when the file descriptor is closed. */
+static void privcmd_hbuf_unlock_all(struct privcmd_data *priv)
+{
+       struct privcmd_hbuf *hbuf, *n;
+
+       /*
+        * Post-order traversal lets us free nodes without rebalancing,
+        * but it leaves the tree contents invalid, so reset the root
+        * rather than leaving dangling pointers behind.
+        */
+       rbtree_postorder_for_each_entry_safe(hbuf, n, &priv->hbuf_root, node)
+               privcmd_hbuf_free(hbuf);
+
+       priv->hbuf_root = RB_ROOT;
+}
+
 static long privcmd_ioctl(struct file *file,
                          unsigned int cmd, unsigned long data)
 {
+       struct privcmd_data *priv = file->private_data;
        int ret = -ENOSYS;
        void __user *udata = (void __user *) data;
 
@@ -571,6 +751,14 @@ static long privcmd_ioctl(struct file *file,
                ret = privcmd_ioctl_mmap_batch(udata, 2);
                break;
 
+       case IOCTL_PRIVCMD_HCALL_BUF_LOCK:
+               ret = privcmd_ioctl_hcall_buf_lock(priv, udata);
+               break;
+
+       case IOCTL_PRIVCMD_HCALL_BUF_UNLOCK:
+               ret = privcmd_ioctl_hcall_buf_unlock(priv, udata);
+               break;
+
        default:
                ret = -ENOTTY;
                break;
@@ -644,8 +832,36 @@ static int privcmd_vma_range_is_mapped(
                                   is_mapped_fn, NULL) != 0;
 }
 
+/* Allocate the per-file-descriptor state tracking locked buffers. */
+static int privcmd_open(struct inode *inode, struct file *file)
+{
+       struct privcmd_data *priv;
+
+       priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       /* kzalloc() already zeroed the root; initialize explicitly anyway. */
+       priv->hbuf_root = RB_ROOT;
+       mutex_init(&priv->hbuf_mutex);
+
+       file->private_data = priv;
+
+       return 0;
+}
+
+/*
+ * Release handler: any buffers still locked when the file descriptor
+ * is closed are unlocked automatically so page references never leak.
+ */
+static int privcmd_release(struct inode *inode, struct file *file)
+{
+       struct privcmd_data *priv = file->private_data;
+
+       privcmd_hbuf_unlock_all(priv);
+
+       kfree(priv);
+       return 0;
+}
+
 const struct file_operations xen_privcmd_fops = {
        .owner = THIS_MODULE,
+       .open = privcmd_open,
+       .release = privcmd_release,
        .unlocked_ioctl = privcmd_ioctl,
        .mmap = privcmd_mmap,
 };
diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h
index 7ddeeda..d59d122 100644
--- a/include/uapi/xen/privcmd.h
+++ b/include/uapi/xen/privcmd.h
@@ -77,6 +77,11 @@ struct privcmd_mmapbatch_v2 {
        int __user *err;  /* array of error codes */
 };
 
+struct privcmd_hcall_buf {
+       void __user *start;  /* start of buffer; any alignment */
+       size_t len;          /* length in bytes; <= 1 GiB */
+};
+
 /*
  * @cmd: IOCTL_PRIVCMD_HYPERCALL
  * @arg: &privcmd_hypercall_t
@@ -99,4 +104,37 @@ struct privcmd_mmapbatch_v2 {
 #define IOCTL_PRIVCMD_MMAPBATCH_V2                             \
        _IOC(_IOC_NONE, 'P', 4, sizeof(struct privcmd_mmapbatch_v2))
 
+/*
+ * @cmd: IOCTL_PRIVCMD_HCALL_BUF_LOCK
+ * @arg: struct privcmd_hcall_buf *
+ * Return: 0 on success. On an error, -1 is returned and errno is set
+ * to EINVAL (buffer too large), ENOMEM, or EFAULT.
+ *
+ * Locks a memory buffer so it may be used in a hypercall.  This is
+ * similar to mlock(2) but also prevents compaction/page migration.
+ *
+ * The buffers may have any alignment and any size <= 1 GiB. They may
+ * overlap other buffers.
+ *
+ * Locked buffers are unlocked with IOCTL_PRIVCMD_HCALL_BUF_UNLOCK or
+ * by closing the file handle.
+ */
+#define IOCTL_PRIVCMD_HCALL_BUF_LOCK                           \
+       _IOC(_IOC_NONE, 'P', 5, sizeof(struct privcmd_hcall_buf))
+
+/*
+ * @cmd: IOCTL_PRIVCMD_HCALL_BUF_UNLOCK
+ * @arg: struct privcmd_hcall_buf *
+ * Return: 0 on success. On an error, -1 is returned and errno is set
+ * to ENXIO (buffer was not locked), or EFAULT.
+ *
+ * Unlocks a memory buffer previously locked with
+ * IOCTL_PRIVCMD_HCALL_BUF_LOCK.
+ *
+ * It is not possible to partially unlock a buffer.  i.e., the
+ * LOCK/UNLOCK must be exactly paired.
+ */
+#define IOCTL_PRIVCMD_HCALL_BUF_UNLOCK                         \
+       _IOC(_IOC_NONE, 'P', 6, sizeof(struct privcmd_hcall_buf))
+
 #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
-- 
2.1.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.