
[Xen-devel] [PATCH] stubdom: add asynchronous disk flush support

stubdom: add asynchronous disk flush support
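
Move the per-device geometry and feature fields (sectors, sector_size,
mode, info, barrier, flush) into a new struct blkfront_info, which
init_blkfront() now fills in for the caller.  Add
blkfront_aio_push_operation(), which queues a zero-segment request such
as BLKIF_OP_WRITE_BARRIER or BLKIF_OP_FLUSH_DISKCACHE and reports
completion through the usual aiocb callback, and use it in the ioemu vbd
driver to implement bdrv_aio_flush.  blkfront_sync() keeps the
synchronous behaviour: it pushes a barrier and/or a cache flush when the
backend advertises the corresponding feature, then polls until the ring
drains.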

Signed-off-by: Samuel Thibault <samuel.thibault@xxxxxxxxxxxxx>
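
For illustration only (not part of the patch): a minimal sketch of how a
mini-os caller could drive the new asynchronous interface.  The helper
and callback names below are made up for the example, and the include
paths are abbreviated; only the blkfront calls, the aiocb fields and the
BLKIF_OP_* constants come from the code being patched.

/* Hypothetical usage sketch: issue an asynchronous cache flush and let
 * the completion callback clean up.  Assumes the usual mini-os
 * environment (printk, xmalloc, free, memset). */
#include <blkfront.h>
#include <xen/io/blkif.h>

static void flush_done(struct blkfront_aiocb *aiocb, int ret)
{
    /* blkfront_aio_poll() invokes this with 0 on success, -EIO on a
     * backend error; the callback owns aiocb and frees it itself. */
    if (ret)
        printk("disk flush failed: %d\n", ret);
    free(aiocb);
}

static void start_flush(struct blkfront_dev *dev, struct blkfront_info *info)
{
    struct blkfront_aiocb *aiocb;

    /* Only meaningful when the backend advertised feature-flush-cache. */
    if (info->flush != 1)
        return;

    aiocb = xmalloc(struct blkfront_aiocb);
    memset(aiocb, 0, sizeof(*aiocb));
    aiocb->aio_dev = dev;
    aiocb->aio_cb = flush_done;
    blkfront_aio_push_operation(aiocb, BLKIF_OP_FLUSH_DISKCACHE);
}

Note how vbd_aio_flush() below chains the two operations when the
backend supports both: the write barrier completes through vbd_nop_cb,
so the caller's callback fires only once, after the cache flush.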

diff -r 4558664bea4a extras/mini-os/blkfront.c
--- a/extras/mini-os/blkfront.c Fri Apr 04 16:07:44 2008 +0100
+++ b/extras/mini-os/blkfront.c Sat Apr 05 13:25:43 2008 +0100
@@ -48,11 +48,7 @@ struct blkfront_dev {
 
     char *nodename;
     char *backend;
-    unsigned sector_size;
-    unsigned sectors;
-    int mode;
-    int barrier;
-    int flush;
+    struct blkfront_info info;
 
 #ifdef HAVE_LIBC
     int fd;
@@ -70,7 +66,7 @@ void blkfront_handler(evtchn_port_t port
     wake_up(&blkfront_queue);
 }
 
-struct blkfront_dev *init_blkfront(char *nodename, uint64_t *sectors, unsigned *sector_size, int *mode, int *info)
+struct blkfront_dev *init_blkfront(char *nodename, struct blkfront_info *info)
 {
     xenbus_transaction_t xbt;
     char* err;
@@ -163,9 +159,9 @@ done:
             return NULL;
         }
         if (*c == 'w')
-            *mode = dev->mode = O_RDWR;
+            dev->info.mode = O_RDWR;
         else
-            *mode = dev->mode = O_RDONLY;
+            dev->info.mode = O_RDONLY;
         free(c);
 
         snprintf(path, sizeof(path), "%s/state", dev->backend);
@@ -177,24 +173,26 @@ done:
         xenbus_unwatch_path(XBT_NIL, path);
 
         snprintf(path, sizeof(path), "%s/info", dev->backend);
-        *info = xenbus_read_integer(path);
+        dev->info.info = xenbus_read_integer(path);
 
         snprintf(path, sizeof(path), "%s/sectors", dev->backend);
        // FIXME: read_integer returns an int, so disk size limited to 1TB for now
-        *sectors = dev->sectors = xenbus_read_integer(path);
+        dev->info.sectors = xenbus_read_integer(path);
 
         snprintf(path, sizeof(path), "%s/sector-size", dev->backend);
-        *sector_size = dev->sector_size = xenbus_read_integer(path);
+        dev->info.sector_size = xenbus_read_integer(path);
 
         snprintf(path, sizeof(path), "%s/feature-barrier", dev->backend);
-        dev->barrier = xenbus_read_integer(path);
+        dev->info.barrier = xenbus_read_integer(path);
 
         snprintf(path, sizeof(path), "%s/feature-flush-cache", dev->backend);
-        dev->flush = xenbus_read_integer(path);
+        dev->info.flush = xenbus_read_integer(path);
+
+        *info = dev->info;
     }
     unmask_evtchn(dev->evtchn);
 
-    printk("%u sectors of %u bytes\n", dev->sectors, dev->sector_size);
+    printk("%u sectors of %u bytes\n", dev->info.sectors, 
dev->info.sector_size);
     printk("**************************\n");
 
     return dev;
@@ -258,11 +256,11 @@ void blkfront_aio(struct blkfront_aiocb 
     uintptr_t start, end;
 
     // Can't io at non-sector-aligned location
-    ASSERT(!(aiocbp->aio_offset & (dev->sector_size-1)));
+    ASSERT(!(aiocbp->aio_offset & (dev->info.sector_size-1)));
     // Can't io non-sector-sized amounts
-    ASSERT(!(aiocbp->aio_nbytes & (dev->sector_size-1)));
+    ASSERT(!(aiocbp->aio_nbytes & (dev->info.sector_size-1)));
     // Can't io non-sector-aligned buffer
-    ASSERT(!((uintptr_t) aiocbp->aio_buf & (dev->sector_size-1)));
+    ASSERT(!((uintptr_t) aiocbp->aio_buf & (dev->info.sector_size-1)));
 
     start = (uintptr_t)aiocbp->aio_buf & PAGE_MASK;
     end = ((uintptr_t)aiocbp->aio_buf + aiocbp->aio_nbytes + PAGE_SIZE - 1) & PAGE_MASK;
@@ -280,7 +278,7 @@ void blkfront_aio(struct blkfront_aiocb 
     req->nr_segments = n;
     req->handle = dev->handle;
     req->id = (uintptr_t) aiocbp;
-    req->sector_number = aiocbp->aio_offset / dev->sector_size;
+    req->sector_number = aiocbp->aio_offset / dev->info.sector_size;
 
     for (j = 0; j < n; j++) {
        uintptr_t data = start + j * PAGE_SIZE;
@@ -292,10 +290,10 @@ void blkfront_aio(struct blkfront_aiocb 
        aiocbp->gref[j] = req->seg[j].gref =
             gnttab_grant_access(dev->dom, virtual_to_mfn(data), write);
        req->seg[j].first_sect = 0;
-       req->seg[j].last_sect = PAGE_SIZE / dev->sector_size - 1;
+       req->seg[j].last_sect = PAGE_SIZE / dev->info.sector_size - 1;
     }
-    req->seg[0].first_sect = ((uintptr_t)aiocbp->aio_buf & ~PAGE_MASK) / dev->sector_size;
-    req->seg[n-1].last_sect = (((uintptr_t)aiocbp->aio_buf + aiocbp->aio_nbytes - 1) & ~PAGE_MASK) / dev->sector_size;
+    req->seg[0].first_sect = ((uintptr_t)aiocbp->aio_buf & ~PAGE_MASK) / dev->info.sector_size;
+    req->seg[n-1].last_sect = (((uintptr_t)aiocbp->aio_buf + aiocbp->aio_nbytes - 1) & ~PAGE_MASK) / dev->info.sector_size;
 
     dev->ring.req_prod_pvt = i + 1;
 
@@ -313,6 +311,62 @@ void blkfront_aio_read(struct blkfront_a
 void blkfront_aio_read(struct blkfront_aiocb *aiocbp)
 {
     blkfront_aio(aiocbp, 0);
+}
+
+static void blkfront_push_operation(struct blkfront_dev *dev, uint8_t op, uint64_t id)
+{
+    int i;
+    struct blkif_request *req;
+    int notify;
+
+    blkfront_wait_slot(dev);
+    i = dev->ring.req_prod_pvt;
+    req = RING_GET_REQUEST(&dev->ring, i);
+    req->operation = op;
+    req->nr_segments = 0;
+    req->handle = dev->handle;
+    req->id = id;
+    /* Not needed anyway, but the backend will check it */
+    req->sector_number = 0;
+    dev->ring.req_prod_pvt = i + 1;
+    wmb();
+    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&dev->ring, notify);
+    if (notify) notify_remote_via_evtchn(dev->evtchn);
+}
+
+void blkfront_aio_push_operation(struct blkfront_aiocb *aiocbp, uint8_t op)
+{
+    struct blkfront_dev *dev = aiocbp->aio_dev;
+    blkfront_push_operation(dev, op, (uintptr_t) aiocbp);
+}
+
+void blkfront_sync(struct blkfront_dev *dev)
+{
+    unsigned long flags;
+
+    if (dev->info.mode == O_RDWR) {
+        if (dev->info.barrier == 1)
+            blkfront_push_operation(dev, BLKIF_OP_WRITE_BARRIER, 0);
+
+        if (dev->info.flush == 1)
+            blkfront_push_operation(dev, BLKIF_OP_FLUSH_DISKCACHE, 0);
+    }
+
+    /* Note: This won't finish if another thread enqueues requests.  */
+    local_irq_save(flags);
+    DEFINE_WAIT(w);
+    while (1) {
+       blkfront_aio_poll(dev);
+       if (RING_FREE_REQUESTS(&dev->ring) == RING_SIZE(&dev->ring))
+           break;
+
+       add_waiter(w, blkfront_queue);
+       local_irq_restore(flags);
+       schedule();
+       local_irq_save(flags);
+    }
+    remove_waiter(w);
+    local_irq_restore(flags);
 }
 
 int blkfront_aio_poll(struct blkfront_dev *dev)
@@ -337,93 +391,45 @@ moretodo:
        rsp = RING_GET_RESPONSE(&dev->ring, cons);
        nr_consumed++;
 
-        if (rsp->status != BLKIF_RSP_OKAY)
-            printk("block error %d for op %d\n", rsp->status, rsp->operation);
+        struct blkfront_aiocb *aiocbp = (void*) (uintptr_t) rsp->id;
+        int status = rsp->status;
+
+        if (status != BLKIF_RSP_OKAY)
+            printk("block error %d for op %d\n", status, rsp->operation);
 
         switch (rsp->operation) {
         case BLKIF_OP_READ:
         case BLKIF_OP_WRITE:
         {
-            struct blkfront_aiocb *aiocbp = (void*) (uintptr_t) rsp->id;
-            int status = rsp->status;
             int j;
 
             for (j = 0; j < aiocbp->n; j++)
                 gnttab_end_access(aiocbp->gref[j]);
 
-            dev->ring.rsp_cons = ++cons;
-            /* Nota: callback frees aiocbp itself */
-            aiocbp->aio_cb(aiocbp, status ? -EIO : 0);
-            if (dev->ring.rsp_cons != cons)
-                /* We reentered, we must not continue here */
-                goto out;
             break;
         }
+
+        case BLKIF_OP_WRITE_BARRIER:
+        case BLKIF_OP_FLUSH_DISKCACHE:
+            break;
+
         default:
             printk("unrecognized block operation %d response\n", 
rsp->operation);
-        case BLKIF_OP_WRITE_BARRIER:
-        case BLKIF_OP_FLUSH_DISKCACHE:
-            dev->ring.rsp_cons = ++cons;
+        }
+
+        dev->ring.rsp_cons = ++cons;
+        /* Nota: callback frees aiocbp itself */
+        if (aiocbp && aiocbp->aio_cb)
+            aiocbp->aio_cb(aiocbp, status ? -EIO : 0);
+        if (dev->ring.rsp_cons != cons)
+            /* We reentered, we must not continue here */
             break;
-        }
     }
 
-out:
     RING_FINAL_CHECK_FOR_RESPONSES(&dev->ring, more);
     if (more) goto moretodo;
 
     return nr_consumed;
-}
-
-static void blkfront_push_operation(struct blkfront_dev *dev, uint8_t op)
-{
-    int i;
-    struct blkif_request *req;
-    int notify;
-
-    blkfront_wait_slot(dev);
-    i = dev->ring.req_prod_pvt;
-    req = RING_GET_REQUEST(&dev->ring, i);
-    req->operation = op;
-    req->nr_segments = 0;
-    req->handle = dev->handle;
-    /* Not used */
-    req->id = 0;
-    /* Not needed anyway, but the backend will check it */
-    req->sector_number = 0;
-    dev->ring.req_prod_pvt = i + 1;
-    wmb();
-    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&dev->ring, notify);
-    if (notify) notify_remote_via_evtchn(dev->evtchn);
-}
-
-void blkfront_sync(struct blkfront_dev *dev)
-{
-    unsigned long flags;
-
-    if (dev->mode == O_RDWR) {
-        if (dev->barrier == 1)
-            blkfront_push_operation(dev, BLKIF_OP_WRITE_BARRIER);
-
-        if (dev->flush == 1)
-            blkfront_push_operation(dev, BLKIF_OP_FLUSH_DISKCACHE);
-    }
-
-    /* Note: This won't finish if another thread enqueues requests.  */
-    local_irq_save(flags);
-    DEFINE_WAIT(w);
-    while (1) {
-       blkfront_aio_poll(dev);
-       if (RING_FREE_REQUESTS(&dev->ring) == RING_SIZE(&dev->ring))
-           break;
-
-       add_waiter(w, blkfront_queue);
-       local_irq_restore(flags);
-       schedule();
-       local_irq_save(flags);
-    }
-    remove_waiter(w);
-    local_irq_restore(flags);
 }
 
 #ifdef HAVE_LIBC
diff -r 4558664bea4a extras/mini-os/include/blkfront.h
--- a/extras/mini-os/include/blkfront.h Fri Apr 04 16:07:44 2008 +0100
+++ b/extras/mini-os/include/blkfront.h Sat Apr 05 13:25:43 2008 +0100
@@ -15,13 +15,23 @@ struct blkfront_aiocb
 
     void (*aio_cb)(struct blkfront_aiocb *aiocb, int ret);
 };
-struct blkfront_dev *init_blkfront(char *nodename, uint64_t *sectors, unsigned *sector_size, int *mode, int *info);
+struct blkfront_info
+{
+    uint64_t sectors;
+    unsigned sector_size;
+    int mode;
+    int info;
+    int barrier;
+    int flush;
+};
+struct blkfront_dev *init_blkfront(char *nodename, struct blkfront_info *info);
 #ifdef HAVE_LIBC
 int blkfront_open(struct blkfront_dev *dev);
 #endif
 void blkfront_aio(struct blkfront_aiocb *aiocbp, int write);
 void blkfront_aio_read(struct blkfront_aiocb *aiocbp);
 void blkfront_aio_write(struct blkfront_aiocb *aiocbp);
+void blkfront_aio_push_operation(struct blkfront_aiocb *aiocbp, uint8_t op);
 int blkfront_aio_poll(struct blkfront_dev *dev);
 void blkfront_sync(struct blkfront_dev *dev);
 void shutdown_blkfront(struct blkfront_dev *dev);
diff -r 4558664bea4a extras/mini-os/kernel.c
--- a/extras/mini-os/kernel.c   Fri Apr 04 16:07:44 2008 +0100
+++ b/extras/mini-os/kernel.c   Sat Apr 05 13:25:43 2008 +0100
@@ -91,9 +91,7 @@ static void netfront_thread(void *p)
 }
 
 static struct blkfront_dev *blk_dev;
-static uint64_t blk_sectors;
-static unsigned blk_sector_size;
-static int blk_mode;
+static struct blkfront_info blk_info;
 static uint64_t blk_size_read;
 static uint64_t blk_size_write;
 
@@ -111,9 +109,9 @@ static struct blk_req *blk_alloc_req(uin
 {
     struct blk_req *req = xmalloc(struct blk_req);
     req->aiocb.aio_dev = blk_dev;
-    req->aiocb.aio_buf = _xmalloc(blk_sector_size, blk_sector_size);
-    req->aiocb.aio_nbytes = blk_sector_size;
-    req->aiocb.aio_offset = sector * blk_sector_size;
+    req->aiocb.aio_buf = _xmalloc(blk_info.sector_size, blk_info.sector_size);
+    req->aiocb.aio_nbytes = blk_info.sector_size;
+    req->aiocb.aio_offset = sector * blk_info.sector_size;
     req->aiocb.data = req;
     req->next = NULL;
     return req;
@@ -125,7 +123,7 @@ static void blk_read_completed(struct bl
     if (ret)
         printk("got error code %d when reading at offset %ld\n", ret, 
aiocb->aio_offset);
     else
-        blk_size_read += blk_sector_size;
+        blk_size_read += blk_info.sector_size;
     free(aiocb->aio_buf);
     free(req);
 }
@@ -154,10 +152,10 @@ static void blk_write_read_completed(str
         free(req);
         return;
     }
-    blk_size_read += blk_sector_size;
+    blk_size_read += blk_info.sector_size;
     buf = (int*) aiocb->aio_buf;
     rand_value = req->rand_value;
-    for (i = 0; i < blk_sector_size / sizeof(int); i++) {
+    for (i = 0; i < blk_info.sector_size / sizeof(int); i++) {
         if (buf[i] != rand_value) {
             printk("bogus data at offset %ld\n", aiocb->aio_offset + i);
             break;
@@ -177,7 +175,7 @@ static void blk_write_completed(struct b
         free(req);
         return;
     }
-    blk_size_write += blk_sector_size;
+    blk_size_write += blk_info.sector_size;
     /* Push write check */
     req->next = blk_to_read;
     blk_to_read = req;
@@ -195,7 +193,7 @@ static void blk_write_sector(uint64_t se
     req->rand_value = rand_value = rand();
 
     buf = (int*) req->aiocb.aio_buf;
-    for (i = 0; i < blk_sector_size / sizeof(int); i++) {
+    for (i = 0; i < blk_info.sector_size / sizeof(int); i++) {
         buf[i] = rand_value;
         rand_value *= RAND_MIX;
     }
@@ -207,35 +205,34 @@ static void blkfront_thread(void *p)
 static void blkfront_thread(void *p)
 {
     time_t lasttime = 0;
-    int blk_info;
 
-    blk_dev = init_blkfront(NULL, &blk_sectors, &blk_sector_size, &blk_mode, &blk_info);
+    blk_dev = init_blkfront(NULL, &blk_info);
     if (!blk_dev)
         return;
 
-    if (blk_info & VDISK_CDROM)
+    if (blk_info.info & VDISK_CDROM)
         printk("Block device is a CDROM\n");
-    if (blk_info & VDISK_REMOVABLE)
+    if (blk_info.info & VDISK_REMOVABLE)
         printk("Block device is removable\n");
-    if (blk_info & VDISK_READONLY)
+    if (blk_info.info & VDISK_READONLY)
         printk("Block device is read-only\n");
 
 #ifdef BLKTEST_WRITE
-    if (blk_mode == O_RDWR) {
+    if (blk_info.mode == O_RDWR) {
         blk_write_sector(0);
-        blk_write_sector(blk_sectors-1);
+        blk_write_sector(blk_info.sectors-1);
     } else
 #endif
     {
         blk_read_sector(0);
-        blk_read_sector(blk_sectors-1);
+        blk_read_sector(blk_info.sectors-1);
     }
 
     while (1) {
-        uint64_t sector = rand() % blk_sectors;
+        uint64_t sector = rand() % blk_info.sectors;
         struct timeval tv;
 #ifdef BLKTEST_WRITE
-        if (blk_mode == O_RDWR)
+        if (blk_info.mode == O_RDWR)
             blk_write_sector(sector);
         else
 #endif
diff -r 4558664bea4a tools/ioemu/block-vbd.c
--- a/tools/ioemu/block-vbd.c   Fri Apr 04 16:07:44 2008 +0100
+++ b/tools/ioemu/block-vbd.c   Sat Apr 05 13:25:43 2008 +0100
@@ -49,11 +49,7 @@ typedef struct BDRVVbdState {
 typedef struct BDRVVbdState {
     struct blkfront_dev *dev;
     int fd;
-    int type;
-    int mode;
-    int info;
-    uint64_t sectors;
-    unsigned sector_size;
+    struct blkfront_info info;
     QEMU_LIST_ENTRY(BDRVVbdState) list;
 } BDRVVbdState;
 
@@ -81,13 +77,13 @@ static int vbd_open(BlockDriverState *bs
     //handy to test posix access
     //return -EIO;
 
-    s->dev = init_blkfront((char *) filename, &s->sectors, &s->sector_size, &s->mode, &s->info);
+    s->dev = init_blkfront((char *) filename, &s->info);
 
     if (!s->dev)
        return -EIO;
 
-    if (SECTOR_SIZE % s->sector_size) {
-       printf("sector size is %d, we only support sector sizes that divide 
%d\n", s->sector_size, SECTOR_SIZE);
+    if (SECTOR_SIZE % s->info.sector_size) {
+       printf("sector size is %d, we only support sector sizes that divide 
%d\n", s->info.sector_size, SECTOR_SIZE);
        return -EIO;
     }
 
@@ -267,6 +263,32 @@ static void vbd_aio_cancel(BlockDriverAI
     // Try to cancel. If can't, wait for it, drop the callback and call qemu_aio_release(acb)
 }
 
+static void vbd_nop_cb(void *opaque, int ret)
+{
+}
+
+static BlockDriverAIOCB *vbd_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVVbdState *s = bs->opaque;
+    VbdAIOCB *acb = NULL;
+
+    if (s->info.barrier == 1) {
+        acb = vbd_aio_setup(bs, 0, NULL, 0,
+                s->info.flush == 1 ? vbd_nop_cb : cb, opaque);
+        if (!acb)
+            return NULL;
+        blkfront_aio_push_operation(&acb->aiocb, BLKIF_OP_WRITE_BARRIER);
+    }
+    if (s->info.flush == 1) {
+        acb = vbd_aio_setup(bs, 0, NULL, 0, cb, opaque);
+        if (!acb)
+            return NULL;
+        blkfront_aio_push_operation(&acb->aiocb, BLKIF_OP_FLUSH_DISKCACHE);
+    }
+    return &acb->common;
+}
+
 static void vbd_close(BlockDriverState *bs)
 {
     BDRVVbdState *s = bs->opaque;
@@ -282,13 +304,14 @@ static int64_t  vbd_getlength(BlockDrive
 static int64_t  vbd_getlength(BlockDriverState *bs)
 {
     BDRVVbdState *s = bs->opaque;
-    return s->sectors * s->sector_size;
+    return s->info.sectors * s->info.sector_size;
 }
 
-static void vbd_flush(BlockDriverState *bs)
+static int vbd_flush(BlockDriverState *bs)
 {
     BDRVVbdState *s = bs->opaque;
     blkfront_sync(s->dev);
+    return 0;
 }
 
 /***********************************************/
@@ -333,6 +356,7 @@ BlockDriver bdrv_vbd = {
     .bdrv_aio_read = vbd_aio_read,
     .bdrv_aio_write = vbd_aio_write,
     .bdrv_aio_cancel = vbd_aio_cancel,
+    .bdrv_aio_flush = vbd_aio_flush,
     .aiocb_size = sizeof(VbdAIOCB),
     .bdrv_read = vbd_read,
     .bdrv_write = vbd_write,
