[Xen-devel] [PATCHv1 2/2] xen/privcmd: add ioctls for locking/unlocking hypercall buffers



Using mlock() for hypercall buffers is not sufficient since mlocked
pages are still subject to compaction and page migration.  Page
migration can be prevented by taking additional references to the
pages.

Introduce two new ioctls: IOCTL_PRIVCMD_HCALL_BUF_LOCK and
IOCTL_PRIVCMD_HCALL_BUF_UNLOCK, which get and put the necessary page
references.  The buffers do not need to be page-aligned and may
overlap with other buffers.  However, a buffer cannot be partially
unlocked; each LOCK must be paired with an UNLOCK of exactly the
same buffer.  Any
locked buffers are automatically unlocked when the file descriptor is
closed.
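
For illustration, a minimal userspace sketch of the intended usage
(a hedged example, not part of this patch; error handling is trimmed
and /dev/xen/privcmd is the usual privcmd device node):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <xen/privcmd.h>

    int lock_example(void)
    {
            char buf[256];  /* any memory works, e.g. a stack buffer */
            struct privcmd_hcall_buf hbuf = {
                    .start = buf,
                    .len   = sizeof(buf),
            };
            int fd = open("/dev/xen/privcmd", O_RDWR);

            if (fd < 0)
                    return -1;
            if (ioctl(fd, IOCTL_PRIVCMD_HCALL_BUF_LOCK, &hbuf) < 0)
                    return -1;

            /* ... use buf in IOCTL_PRIVCMD_HYPERCALL arguments ... */

            /* Unlock with the same (start, len) used to lock. */
            ioctl(fd, IOCTL_PRIVCMD_HCALL_BUF_UNLOCK, &hbuf);
            close(fd);
            return 0;
    }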

An alternative approach would be to extend the driver with an ioctl to
populate a VMA with "special", non-migratable pages.  But the
LOCK/UNLOCK ioctls are more flexible as they allow any page to be used
for hypercalls (e.g., the stack or mmap'd files).  This could be used
to minimize bouncing for performance-critical hypercalls.

Locked buffers are stored in an rbtree for fast lookup during UNLOCK,
and on a list so that all buffers can be unlocked when the file
descriptor is closed.
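
For example (a hedged sketch; the exact-match rule follows from the
(start, len) rbtree key), overlapping locks are tracked as distinct
entries and must each be unlocked with exactly the same arguments:

    /* Hypothetical helper: fd is an open /dev/xen/privcmd descriptor. */
    void overlap_example(int fd)
    {
            static char p[4096];
            struct privcmd_hcall_buf a = { .start = p,     .len = sizeof(p) };
            struct privcmd_hcall_buf b = { .start = p + 8, .len = 128 };

            ioctl(fd, IOCTL_PRIVCMD_HCALL_BUF_LOCK, &a);
            ioctl(fd, IOCTL_PRIVCMD_HCALL_BUF_LOCK, &b);    /* overlaps a; both held */
            ioctl(fd, IOCTL_PRIVCMD_HCALL_BUF_UNLOCK, &b);  /* key must match exactly */
            ioctl(fd, IOCTL_PRIVCMD_HCALL_BUF_UNLOCK, &a);
    }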

Signed-off-by: David Vrabel <david.vrabel@xxxxxxxxxx>
---
 drivers/xen/privcmd.c      | 208 +++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/xen/privcmd.h |  37 ++++++++
 2 files changed, 245 insertions(+)

diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index ac76bc4..7fc9db8 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -22,6 +22,7 @@
 #include <linux/pagemap.h>
 #include <linux/seq_file.h>
 #include <linux/miscdevice.h>
+#include <linux/rbtree.h>
 
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
@@ -43,6 +44,21 @@ MODULE_LICENSE("GPL");
 
 #define PRIV_VMA_LOCKED ((void *)1)
 
+struct privcmd_data {
+       struct rb_root hbuf_root;       /* locked buffers keyed by (start, len) */
+       struct list_head hbuf_list;     /* all locked buffers, for release */
+       struct mutex hbuf_mutex;        /* protects hbuf_root and hbuf_list */
+};
+
+struct privcmd_hbuf {
+       struct rb_node node;            /* entry in hbuf_root */
+       struct list_head release_node;  /* entry in hbuf_list */
+       struct privcmd_hcall_buf buf;   /* userspace (start, len) key */
+       unsigned int nr_pages;          /* pages covered by the buffer */
+       struct page **pages;            /* references pinning each page */
+       unsigned int count;             /* times this exact buffer was locked */
+};
+
 static int privcmd_vma_range_is_mapped(
                struct vm_area_struct *vma,
                unsigned long addr,
@@ -548,9 +564,164 @@ out_unlock:
        goto out;
 }
 
+static int privcmd_hbuf_compare(struct privcmd_hcall_buf *a,
+                               struct privcmd_hcall_buf *b)
+{
+       /* Avoid truncating pointer/size differences to an int. */
+       if (a->start != b->start)
+               return b->start > a->start ? 1 : -1;
+       return (b->len > a->len) - (b->len < a->len);
+}
+
+static void privcmd_hbuf_insert(struct privcmd_data *priv,
+                               struct privcmd_hbuf *hbuf)
+{
+       struct rb_node **new = &priv->hbuf_root.rb_node, *parent = NULL;
+
+       /* Figure out where to put new node */
+       while (*new) {
+               struct privcmd_hbuf *this = container_of(*new, struct privcmd_hbuf, node);
+               int result = privcmd_hbuf_compare(&hbuf->buf, &this->buf);
+
+               parent = *new;
+               if (result < 0)
+                       new = &(*new)->rb_left;
+               else if (result > 0)
+                       new = &(*new)->rb_right;
+               else {
+                       this->count++;
+                       kfree(hbuf->pages);
+                       kfree(hbuf);
+                       return;
+               }
+       }
+
+       /* Add new node and rebalance tree. */
+       rb_link_node(&hbuf->node, parent, new);
+       rb_insert_color(&hbuf->node, &priv->hbuf_root);
+
+       list_add_tail(&hbuf->release_node, &priv->hbuf_list);
+
+       hbuf->count = 1;
+}
+
+static struct privcmd_hbuf *privcmd_hbuf_lookup(struct privcmd_data *priv,
+                                               struct privcmd_hcall_buf *key)
+{
+       struct rb_node *node = priv->hbuf_root.rb_node;
+
+       while (node) {
+               struct privcmd_hbuf *hbuf = container_of(node, struct privcmd_hbuf, node);
+               int result;
+
+               result = privcmd_hbuf_compare(key, &hbuf->buf);
+
+               if (result < 0)
+                       node = node->rb_left;
+               else if (result > 0)
+                       node = node->rb_right;
+               else
+                       return hbuf;
+       }
+       return NULL;
+}
+
+static void privcmd_hbuf_unlock(struct privcmd_data *priv,
+                               struct privcmd_hbuf *hbuf)
+{
+       unsigned int i;
+
+       for (i = 0; i < hbuf->nr_pages; i++)
+               put_page(hbuf->pages[i]);
+
+       if (--hbuf->count == 0) {
+               rb_erase(&hbuf->node, &priv->hbuf_root);
+               list_del(&hbuf->release_node);
+               kfree(hbuf->pages);
+               kfree(hbuf);
+       }
+}
+
+static int privcmd_ioctl_hcall_buf_lock(struct privcmd_data *priv,
+                                       void __user *udata)
+{
+       struct privcmd_hbuf *hbuf;
+       unsigned long start, end;
+       int ret;
+
+       hbuf = kzalloc(sizeof(*hbuf), GFP_KERNEL);
+       if (!hbuf)
+               return -ENOMEM;
+
+       if (copy_from_user(&hbuf->buf, udata, sizeof(hbuf->buf))) {
+               ret = -EFAULT;
+               goto error;
+       }
+
+       start = (unsigned long)hbuf->buf.start & PAGE_MASK;
+       end = ALIGN((unsigned long)hbuf->buf.start + hbuf->buf.len, PAGE_SIZE);
+       hbuf->nr_pages = (end - start) / PAGE_SIZE;
+
+       hbuf->pages = kcalloc(hbuf->nr_pages, sizeof(*hbuf->pages), GFP_KERNEL);
+       if (!hbuf->pages) {
+               ret = -ENOMEM;
+               goto error;
+       }
+
+       /*
+        * Take a reference to each page; this prevents the pages
+        * from being swapped out or migrated.
+        */
+       ret = get_user_pages_fast(start, hbuf->nr_pages, 1, hbuf->pages);
+       if (ret != hbuf->nr_pages) {
+               /* Drop the references taken by a partial pin. */
+               while (ret > 0)
+                       put_page(hbuf->pages[--ret]);
+               if (ret == 0)
+                       ret = -EFAULT;
+               goto error;
+       }
+
+       mutex_lock(&priv->hbuf_mutex);
+       privcmd_hbuf_insert(priv, hbuf);
+       mutex_unlock(&priv->hbuf_mutex);
+
+       return 0;
+
+  error:
+       kfree(hbuf->pages);
+       kfree(hbuf);
+       return ret;
+}
+
+static int privcmd_ioctl_hcall_buf_unlock(struct privcmd_data *priv,
+                                         void __user *udata)
+{
+       struct privcmd_hcall_buf hcall_buf;
+       struct privcmd_hbuf *hbuf;
+
+       if (copy_from_user(&hcall_buf, udata, sizeof(hcall_buf)))
+               return -EFAULT;
+
+       mutex_lock(&priv->hbuf_mutex);
+
+       hbuf = privcmd_hbuf_lookup(priv, &hcall_buf);
+       if (hbuf)
+               privcmd_hbuf_unlock(priv, hbuf);
+
+       mutex_unlock(&priv->hbuf_mutex);
+
+       return 0;
+}
+
+static void privcmd_hbuf_erase_all(struct privcmd_data *priv)
+{
+       while (!list_empty(&priv->hbuf_list)) {
+               struct privcmd_hbuf *hbuf;
+
+               hbuf = list_first_entry(&priv->hbuf_list,
+                                       struct privcmd_hbuf, release_node);
+               privcmd_hbuf_unlock(priv, hbuf);
+       }
+}
+
 static long privcmd_ioctl(struct file *file,
                          unsigned int cmd, unsigned long data)
 {
+       struct privcmd_data *priv = file->private_data;
        int ret = -ENOSYS;
        void __user *udata = (void __user *) data;
 
@@ -571,6 +742,14 @@ static long privcmd_ioctl(struct file *file,
                ret = privcmd_ioctl_mmap_batch(udata, 2);
                break;
 
+       case IOCTL_PRIVCMD_HCALL_BUF_LOCK:
+               ret = privcmd_ioctl_hcall_buf_lock(priv, udata);
+               break;
+
+       case IOCTL_PRIVCMD_HCALL_BUF_UNLOCK:
+               ret = privcmd_ioctl_hcall_buf_unlock(priv, udata);
+               break;
+
        default:
                ret = -ENOTTY;
                break;
@@ -644,8 +823,37 @@ static int privcmd_vma_range_is_mapped(
                                   is_mapped_fn, NULL) != 0;
 }
 
+static int privcmd_open(struct inode *inode, struct file *file)
+{
+       struct privcmd_data *priv;
+
+       priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       priv->hbuf_root = RB_ROOT;
+       INIT_LIST_HEAD(&priv->hbuf_list);
+       mutex_init(&priv->hbuf_mutex);
+
+       file->private_data = priv;
+
+       return 0;
+}
+
+static int privcmd_release(struct inode *inode, struct file *file)
+{
+       struct privcmd_data *priv = file->private_data;
+
+       privcmd_hbuf_erase_all(priv);
+
+       kfree(priv);
+       return 0;
+}
+
 const struct file_operations xen_privcmd_fops = {
        .owner = THIS_MODULE,
+       .open = privcmd_open,
+       .release = privcmd_release,
        .unlocked_ioctl = privcmd_ioctl,
        .mmap = privcmd_mmap,
 };
diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h
index 7ddeeda..425bd02 100644
--- a/include/uapi/xen/privcmd.h
+++ b/include/uapi/xen/privcmd.h
@@ -77,6 +77,11 @@ struct privcmd_mmapbatch_v2 {
        int __user *err;  /* array of error codes */
 };
 
+struct privcmd_hcall_buf {
+       void *start;
+       size_t len;
+};
+
 /*
  * @cmd: IOCTL_PRIVCMD_HYPERCALL
  * @arg: &privcmd_hypercall_t
@@ -99,4 +104,36 @@ struct privcmd_mmapbatch_v2 {
 #define IOCTL_PRIVCMD_MMAPBATCH_V2                             \
        _IOC(_IOC_NONE, 'P', 4, sizeof(struct privcmd_mmapbatch_v2))
 
+/*
+ * @cmd: IOCTL_PRIVCMD_HCALL_BUF_LOCK
+ * @arg: struct privcmd_hcall_buf *
+ * Return: 0 on success. On an error, -1 is returned and errno is set
+ * to EINVAL, ENOMEM, or EFAULT.
+ *
+ * Locks a memory buffer so it may be used in a hypercall.  This is
+ * similar to mlock(2) but also prevents compaction/page migration.
+ *
+ * The buffers may have any alignment and size and may overlap other
+ * buffers.
+ *
+ * Locked buffers are unlocked with IOCTL_PRIVCMD_HCALL_BUF_UNLOCK or
+ * by closing the file handle.
+ */
+#define IOCTL_PRIVCMD_HCALL_BUF_LOCK                           \
+       _IOC(_IOC_NONE, 'P', 5, sizeof(struct privcmd_hcall_buf))
+
+/*
+ * @cmd: IOCTL_PRIVCMD_HCALL_BUF_UNLOCK
+ * @arg: struct privcmd_hcall_buf *
+ * Return: 0 on success (even if the buffer was not locked).  On an
+ * error, -1 is returned and errno is set to EFAULT.
+ *
+ * Unlocks a memory buffer previously locked with
+ * IOCTL_PRIVCMD_HCALL_BUF_LOCK.
+ *
+ * It is not possible to partially unlock a buffer; each LOCK must be
+ * paired with an UNLOCK of exactly the same buffer.
+ */
+#define IOCTL_PRIVCMD_HCALL_BUF_UNLOCK                         \
+       _IOC(_IOC_NONE, 'P', 6, sizeof(struct privcmd_hcall_buf))
+
 #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
-- 
2.1.4

