[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen-unstable] blktap: Add fallback code to blktap for missing poll-on-aio support.



# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxx
# Date 1182267148 -3600
# Node ID eeeb77195ac20e8e08c57a7248913bedc0d999bf
# Parent  865c4ae59be3b0aa7e375b929414244d1e5bdf74
blktap: Add fallback code to blktap for missing poll-on-aio support.

blktap requires a xen specific kernel AIO ABI which has been vetoed by
upstream in favour of another approach. Rather than include this ABI,
Fedora has been carrying a patch which makes tap:aio use a thread to
poll for aio events and notify the main thread via a pipe.

The upstream approach of allowing io_getevents() poll normal file
descriptors via epoll is still progressing:
 http://lkml.org/lkml/2007/1/3/16

but when that does make it upstream, blktap will require significant
re-working to use that approach.

In the meantime, here's a patch which uses the poll-in-a-thread
approach only if AIO poll support isn't available. It also hides the
details behind a simple abstraction and makes both tap:aio and
tap:qcow use it.

Signed-off-by: Mark McLoughlin <markmc@xxxxxxxxxx>
---
 tools/blktap/drivers/Makefile     |    1 
 tools/blktap/drivers/block-aio.c  |   49 ++++-------
 tools/blktap/drivers/block-qcow.c |   48 +++++------
 tools/blktap/drivers/tapaio.c     |  164 ++++++++++++++++++++++++++++++++++++++
 tools/blktap/drivers/tapaio.h     |   58 +++++++++++++
 5 files changed, 267 insertions(+), 53 deletions(-)

diff -r 865c4ae59be3 -r eeeb77195ac2 tools/blktap/drivers/Makefile
--- a/tools/blktap/drivers/Makefile     Tue Jun 19 16:29:22 2007 +0100
+++ b/tools/blktap/drivers/Makefile     Tue Jun 19 16:32:28 2007 +0100
@@ -35,6 +35,7 @@ BLK-OBJS  += block-ram.o
 BLK-OBJS  += block-ram.o
 BLK-OBJS  += block-qcow.o
 BLK-OBJS  += aes.o
+BLK-OBJS  += tapaio.o
 
 all: $(IBIN) qcow-util
 
diff -r 865c4ae59be3 -r eeeb77195ac2 tools/blktap/drivers/block-aio.c
--- a/tools/blktap/drivers/block-aio.c  Tue Jun 19 16:29:22 2007 +0100
+++ b/tools/blktap/drivers/block-aio.c  Tue Jun 19 16:32:28 2007 +0100
@@ -43,14 +43,7 @@
 #include <sys/ioctl.h>
 #include <linux/fs.h>
 #include "tapdisk.h"
-
-
-/**
- * We used a kernel patch to return an fd associated with the AIO context
- * so that we can concurrently poll on synchronous and async descriptors.
- * This is signalled by passing 1 as the io context to io_setup.
- */
-#define REQUEST_ASYNC_FD 1
+#include "tapaio.h"
 
 #define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ)
 
@@ -65,14 +58,13 @@ struct tdaio_state {
        int fd;
        
        /* libaio state */
-       io_context_t       aio_ctx;
+       tap_aio_context_t  aio_ctx;
        struct iocb        iocb_list  [MAX_AIO_REQS];
        struct iocb       *iocb_free  [MAX_AIO_REQS];
        struct pending_aio pending_aio[MAX_AIO_REQS];
        int                iocb_free_count;
        struct iocb       *iocb_queue[MAX_AIO_REQS];
        int                iocb_queued;
-       int                poll_fd; /* NB: we require aio_poll support */
        struct io_event    aio_events[MAX_AIO_REQS];
 };
 
@@ -148,7 +140,7 @@ static inline void init_fds(struct disk_
        for(i = 0; i < MAX_IOFD; i++) 
                dd->io_fd[i] = 0;
 
-       dd->io_fd[0] = prv->poll_fd;
+       dd->io_fd[0] = prv->aio_ctx.pollfd;
 }
 
 /* Open the disk file and initialize aio state. */
@@ -162,12 +154,9 @@ int tdaio_open (struct disk_driver *dd, 
        /* Initialize AIO */
        prv->iocb_free_count = MAX_AIO_REQS;
        prv->iocb_queued     = 0;
-       
-       prv->aio_ctx = (io_context_t) REQUEST_ASYNC_FD;
-       prv->poll_fd = io_setup(MAX_AIO_REQS, &prv->aio_ctx);
-
-       if (prv->poll_fd < 0) {
-               ret = prv->poll_fd;
+
+       ret = tap_aio_setup(&prv->aio_ctx, prv->aio_events, MAX_AIO_REQS);
+       if (ret < 0) {
                 if (ret == -EAGAIN) {
                         DPRINTF("Couldn't setup AIO context.  If you are "
                                 "trying to concurrently use a large number "
@@ -176,9 +165,7 @@ int tdaio_open (struct disk_driver *dd, 
                                 "(e.g. 'echo echo 1048576 > /proc/sys/fs/"
                                 "aio-max-nr')\n");
                 } else {
-                        DPRINTF("Couldn't get fd for AIO poll support.  This "
-                                "is probably because your kernel does not "
-                                "have the aio-poll patch applied.\n");
+                        DPRINTF("Couldn't setup AIO context.\n");
                 }
                goto done;
        }
@@ -286,7 +273,7 @@ int tdaio_submit(struct disk_driver *dd)
        if (!prv->iocb_queued)
                return 0;
 
-       ret = io_submit(prv->aio_ctx, prv->iocb_queued, prv->iocb_queue);
+       ret = io_submit(prv->aio_ctx.aio_ctx, prv->iocb_queued, 
prv->iocb_queue);
        
        /* XXX: TODO: Handle error conditions here. */
        
@@ -300,7 +287,7 @@ int tdaio_close(struct disk_driver *dd)
 {
        struct tdaio_state *prv = (struct tdaio_state *)dd->private;
        
-       io_destroy(prv->aio_ctx);
+       io_destroy(prv->aio_ctx.aio_ctx);
        close(prv->fd);
 
        return 0;
@@ -308,15 +295,13 @@ int tdaio_close(struct disk_driver *dd)
 
 int tdaio_do_callbacks(struct disk_driver *dd, int sid)
 {
-       int ret, i, rsp = 0;
+       int i, nr_events, rsp = 0;
        struct io_event *ep;
        struct tdaio_state *prv = (struct tdaio_state *)dd->private;
 
-       /* Non-blocking test for completed io. */
-       ret = io_getevents(prv->aio_ctx, 0, MAX_AIO_REQS, prv->aio_events,
-                          NULL);
-                       
-       for (ep=prv->aio_events,i=ret; i-->0; ep++) {
+       nr_events = tap_aio_get_events(&prv->aio_ctx);
+repeat:
+       for (ep = prv->aio_events, i = nr_events; i-- > 0; ep++) {
                struct iocb        *io  = ep->obj;
                struct pending_aio *pio;
                
@@ -327,6 +312,14 @@ int tdaio_do_callbacks(struct disk_drive
 
                prv->iocb_free[prv->iocb_free_count++] = io;
        }
+
+       if (nr_events) {
+               nr_events = tap_aio_more_events(&prv->aio_ctx);
+               goto repeat;
+       }
+
+       tap_aio_continue(&prv->aio_ctx);
+
        return rsp;
 }
 
diff -r 865c4ae59be3 -r eeeb77195ac2 tools/blktap/drivers/block-qcow.c
--- a/tools/blktap/drivers/block-qcow.c Tue Jun 19 16:29:22 2007 +0100
+++ b/tools/blktap/drivers/block-qcow.c Tue Jun 19 16:32:28 2007 +0100
@@ -38,6 +38,7 @@
 #include "bswap.h"
 #include "aes.h"
 #include "tapdisk.h"
+#include "tapaio.h"
 
 #if 1
 #define ASSERT(_p) \
@@ -52,9 +53,6 @@
     (uint64_t)( \
         (l + (s - 1)) - ((l + (s - 1)) % s)); \
 })
-
-/******AIO DEFINES******/
-#define REQUEST_ASYNC_FD 1
 
 struct pending_aio {
         td_callback_t cb;
@@ -145,7 +143,7 @@ struct tdqcow_state {
        AES_KEY aes_encrypt_key;       /*AES key*/
        AES_KEY aes_decrypt_key;       /*AES key*/
         /* libaio state */
-        io_context_t        aio_ctx;
+        tap_aio_context_t   aio_ctx;
         int                 max_aio_reqs;
         struct iocb        *iocb_list;
         struct iocb       **iocb_free;
@@ -153,7 +151,6 @@ struct tdqcow_state {
         int                 iocb_free_count;
         struct iocb       **iocb_queue;
         int                 iocb_queued;
-        int                 poll_fd;      /* NB: we require aio_poll support */
         struct io_event    *aio_events;
 };
 
@@ -179,7 +176,7 @@ static void free_aio_state(struct disk_d
 
 static int init_aio_state(struct disk_driver *dd)
 {
-        int i;
+       int i, ret;
        struct td_state     *bs = dd->td_state;
        struct tdqcow_state  *s = (struct tdqcow_state *)dd->private;
         long     ioidx;
@@ -216,12 +213,9 @@ static int init_aio_state(struct disk_dr
                 goto fail;
         }
 
-        /*Signal kernel to create Poll FD for Asyc completion events*/
-        s->aio_ctx = (io_context_t) REQUEST_ASYNC_FD;   
-        s->poll_fd = io_setup(s->max_aio_reqs, &s->aio_ctx);
-
-       if (s->poll_fd < 0) {
-                if (s->poll_fd == -EAGAIN) {
+       ret = tap_aio_setup(&s->aio_ctx, s->aio_events, s->max_aio_reqs);
+       if (ret < 0) {
+                if (ret == -EAGAIN) {
                         DPRINTF("Couldn't setup AIO context.  If you are "
                                 "trying to concurrently use a large number "
                                 "of blktap-based disks, you may need to "
@@ -229,9 +223,7 @@ static int init_aio_state(struct disk_dr
                                 "(e.g. 'echo echo 1048576 > /proc/sys/fs/"
                                 "aio-max-nr')\n");
                 } else {
-                        DPRINTF("Couldn't get fd for AIO poll support.  This "
-                                "is probably because your kernel does not "
-                                "have the aio-poll patch applied.\n");
+                        DPRINTF("Couldn't setup AIO context.\n");
                 }
                goto fail;
        }
@@ -845,7 +837,7 @@ static inline void init_fds(struct disk_
        for(i = 0; i < MAX_IOFD; i++) 
                dd->io_fd[i] = 0;
 
-       dd->io_fd[0] = s->poll_fd;
+       dd->io_fd[0] = s->aio_ctx.pollfd;
 }
 
 /* Open the disk file and initialize qcow state. */
@@ -1144,7 +1136,7 @@ int tdqcow_submit(struct disk_driver *dd
        if (!prv->iocb_queued)
                return 0;
 
-       ret = io_submit(prv->aio_ctx, prv->iocb_queued, prv->iocb_queue);
+       ret = io_submit(prv->aio_ctx.aio_ctx, prv->iocb_queued, 
prv->iocb_queue);
 
         /* XXX: TODO: Handle error conditions here. */
 
@@ -1172,7 +1164,7 @@ int tdqcow_close(struct disk_driver *dd)
                close(fd);
        }
 
-       io_destroy(s->aio_ctx);
+       io_destroy(s->aio_ctx.aio_ctx);
        free(s->name);
        free(s->l1_table);
        free(s->l2_cache);
@@ -1184,17 +1176,15 @@ int tdqcow_close(struct disk_driver *dd)
 
 int tdqcow_do_callbacks(struct disk_driver *dd, int sid)
 {
-        int ret, i, rsp = 0,*ptr;
+        int ret, i, nr_events, rsp = 0,*ptr;
         struct io_event *ep;
         struct tdqcow_state *prv = (struct tdqcow_state *)dd->private;
 
         if (sid > MAX_IOFD) return 1;
-       
-       /* Non-blocking test for completed io. */
-        ret = io_getevents(prv->aio_ctx, 0, prv->max_aio_reqs, prv->aio_events,
-                           NULL);
-
-        for (ep = prv->aio_events, i = ret; i-- > 0; ep++) {
+
+        nr_events = tap_aio_get_events(&prv->aio_ctx);
+repeat:
+        for (ep = prv->aio_events, i = nr_events; i-- > 0; ep++) {
                 struct iocb        *io  = ep->obj;
                 struct pending_aio *pio;
 
@@ -1215,6 +1205,14 @@ int tdqcow_do_callbacks(struct disk_driv
 
                 prv->iocb_free[prv->iocb_free_count++] = io;
         }
+
+        if (nr_events) {
+                nr_events = tap_aio_more_events(&prv->aio_ctx);
+                goto repeat;
+        }
+
+        tap_aio_continue(&prv->aio_ctx);
+
         return rsp;
 }
 
diff -r 865c4ae59be3 -r eeeb77195ac2 tools/blktap/drivers/tapaio.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap/drivers/tapaio.c     Tue Jun 19 16:32:28 2007 +0100
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2006 Andrew Warfield and Julian Chesterfield
+ * Copyright (c) 2007 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "tapaio.h"
+#include "tapdisk.h"
+#include <unistd.h>
+
+/**
+ * We used a kernel patch to return an fd associated with the AIO context
+ * so that we can concurrently poll on synchronous and async descriptors.
+ * This is signalled by passing 1 as the io context to io_setup.
+ */
+#define REQUEST_ASYNC_FD 1
+
+/*
+ * If we don't have any way to do epoll on aio events in a normal kernel,
+ * wait for aio events in a separate thread and return completion status
+ * that via a pipe that can be waited on normally.
+ *
+ * To keep locking problems between the completion thread and the submit
+ * thread to a minimum, there's a handshake which allows only one thread
+ * to be doing work on the completion queue at a time:
+ *
+ * 1) main thread sends completion thread a command via the command pipe;
+ * 2) completion thread waits for aio events and returns the number
+ *    received on the completion pipe
+ * 3) main thread processes the received ctx->aio_events events
+ * 4) loop back to 1) to let the completion thread refill the aio_events
+ *    buffer.
+ *
+ * This workaround needs to disappear once the kernel provides a single
+ * mechanism for waiting on both aio and normal fd wakeups.
+ */
+static void *
+tap_aio_completion_thread(void *arg)
+{
+       tap_aio_context_t *ctx = (tap_aio_context_t *) arg;
+       int command;
+       int nr_events;
+       int rc;
+
+       while (1) {
+               rc = read(ctx->command_fd[0], &command, sizeof(command));
+
+               do {
+                       rc = io_getevents(ctx->aio_ctx, 1,
+                                         ctx->max_aio_events, ctx->aio_events,
+                                         NULL);
+                       if (rc) {
+                               nr_events = rc;
+                               rc = write(ctx->completion_fd[1], &nr_events,
+                                          sizeof(nr_events));
+                       }
+               } while (!rc);
+       }
+}
+
+void
+tap_aio_continue(tap_aio_context_t *ctx)
+{
+        int cmd = 0;
+
+        if (!ctx->poll_in_thread)
+                return;
+
+        if (write(ctx->command_fd[1], &cmd, sizeof(cmd)) < 0)
+                DPRINTF("Cannot write to command pipe\n");
+}
+
+int
+tap_aio_setup(tap_aio_context_t *ctx,
+              struct io_event *aio_events,
+              int max_aio_events)
+{
+        int ret;
+
+        ctx->aio_events = aio_events;
+        ctx->max_aio_events = max_aio_events;
+        ctx->poll_in_thread = 0;
+
+        ctx->aio_ctx = (io_context_t) REQUEST_ASYNC_FD;
+        ret = io_setup(ctx->max_aio_events, &ctx->aio_ctx);
+        if (ret < 0 && ret != -EINVAL)
+                return ret;
+        else if (ret > 0) {
+                ctx->pollfd = ret;
+                return ctx->pollfd;
+        }
+
+        ctx->aio_ctx = (io_context_t) 0;
+        ret = io_setup(ctx->max_aio_events, &ctx->aio_ctx);
+        if (ret < 0)
+                return ret;
+
+        if ((ret = pipe(ctx->command_fd)) < 0) {
+                DPRINTF("Unable to create command pipe\n");
+                return -1;
+        }
+        if ((ret = pipe(ctx->completion_fd)) < 0) {
+                DPRINTF("Unable to create completion pipe\n");
+                return -1;
+        }
+
+        if ((ret = pthread_create(&ctx->aio_thread, NULL,
+                                  tap_aio_completion_thread, ctx)) != 0) {
+                DPRINTF("Unable to create completion thread\n");
+                return -1;
+        }
+
+        ctx->pollfd = ctx->completion_fd[0];
+        ctx->poll_in_thread = 1;
+
+        tap_aio_continue(ctx);
+
+        return 0;
+}
+
+int
+tap_aio_get_events(tap_aio_context_t *ctx)
+{
+        int nr_events = 0;
+
+        if (!ctx->poll_in_thread)
+                nr_events = io_getevents(ctx->aio_ctx, 1,
+                                         ctx->max_aio_events, ctx->aio_events, 
NULL);
+        else
+                read(ctx->completion_fd[0], &nr_events, sizeof(nr_events));
+
+        return nr_events;
+}
+
+int tap_aio_more_events(tap_aio_context_t *ctx)
+{
+        return io_getevents(ctx->aio_ctx, 0,
+                            ctx->max_aio_events, ctx->aio_events, NULL);
+}
+
+
diff -r 865c4ae59be3 -r eeeb77195ac2 tools/blktap/drivers/tapaio.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap/drivers/tapaio.h     Tue Jun 19 16:32:28 2007 +0100
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2006 Andrew Warfield and Julian Chesterfield
+ * Copyright (c) 2007 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __TAPAIO_H__
+#define __TAPAIO_H__
+
+#include <pthread.h>
+#include <libaio.h>
+
+struct tap_aio_context {
+        io_context_t     aio_ctx;
+
+        struct io_event *aio_events;
+        int              max_aio_events;
+
+        pthread_t        aio_thread;
+        int              command_fd[2];
+        int              completion_fd[2];
+        int              pollfd;
+        unsigned int     poll_in_thread : 1;
+};
+
+typedef struct tap_aio_context tap_aio_context_t;
+
+int  tap_aio_setup      (tap_aio_context_t *ctx,
+                         struct io_event *aio_events,
+                         int max_aio_events);
+void tap_aio_continue   (tap_aio_context_t *ctx);
+int  tap_aio_get_events (tap_aio_context_t *ctx);
+int  tap_aio_more_events(tap_aio_context_t *ctx);
+
+#endif /* __TAPAIO_H__ */

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.