[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH v2 2/2] xen/block: add multi-page ring support



On 03/31/2015 08:15 PM, Bob Liu wrote:
> Extend xen/block to support a multi-page ring, so that more requests can be
> issued by using more than one page as the request ring between blkfront and
> backend. As a result, performance can be improved significantly.
> 
> We got some impressive improvements on our high-end iSCSI storage cluster
> backend. When using 64 pages as the ring, the IOPS increased about 15 times
> for the throughput testing and more than doubled for the latency testing.
> 
> The reason is that the limit on outstanding requests is 32 when only a
> one-page ring is used, but in our case the iSCSI LUN was spread across about
> 100 physical drives, so 32 was really not enough to keep them busy.
> 
> Change in V2:
> * Rebased to 4.0-rc6
> * Add description on how this protocol works into io/blkif.h
> 

I found there were already some related descriptions in the file
xen/include/public/io/blkif.h. I'll send a new patch following the
protocol in that file, and leave the file include/xen/interface/io/blkif.h
unchanged.

190  * max-ring-pages
191  *      Values:         <uint32_t>
192  *      Default Value:  1
193  *      Notes:          DEPRECATED, 2, 3
194  *
195  *      The maximum supported size of the request ring buffer in units of
196  *      machine pages.  The value must be a power of 2.

302  * num-ring-pages
303  *      Values:         <uint32_t>
304  *      Default Value:  1
305  *      Maximum Value:  MAX(max-ring-pages,(0x1 << max-ring-page-order))
306  *      Notes:          DEPRECATED, 2, 3
307  *
308  *      The size of the frontend allocated request ring buffer in units of
309  *      machine pages.  The value must be a power of 2.
310  *

Regards,
-Bob

> Signed-off-by: Bob Liu <bob.liu@xxxxxxxxxx>
> ---
>  drivers/block/xen-blkback/blkback.c |   2 +-
>  drivers/block/xen-blkback/common.h  |   3 +-
>  drivers/block/xen-blkback/xenbus.c  |  83 ++++++++++++++++++++-------
>  drivers/block/xen-blkfront.c        | 111 
> +++++++++++++++++++++++++-----------
>  include/xen/interface/io/blkif.h    |  13 +++++
>  5 files changed, 156 insertions(+), 56 deletions(-)
> 
> diff --git a/drivers/block/xen-blkback/blkback.c 
> b/drivers/block/xen-blkback/blkback.c
> index 0c8da82..072d15b 100644
> --- a/drivers/block/xen-blkback/blkback.c
> +++ b/drivers/block/xen-blkback/blkback.c
> @@ -628,7 +628,7 @@ purge_gnt_list:
>               }
>  
>               /* Shrink if we have more than xen_blkif_max_buffer_pages */
> -             shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
> +             shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages * 
> blkif->nr_ring_pages);
>  
>               if (log_stats && time_after(jiffies, blkif->st_print))
>                       print_stats(blkif);
> diff --git a/drivers/block/xen-blkback/common.h 
> b/drivers/block/xen-blkback/common.h
> index 375d288..320ca35 100644
> --- a/drivers/block/xen-blkback/common.h
> +++ b/drivers/block/xen-blkback/common.h
> @@ -254,7 +254,7 @@ struct backend_info;
>  #define PERSISTENT_GNT_WAS_ACTIVE    1
>  
>  /* Number of requests that we can fit in a ring */
> -#define XEN_BLKIF_REQS                       32
> +#define XEN_BLKIF_REQS                       (32 * XENBUS_MAX_RING_PAGES)
>  
>  struct persistent_gnt {
>       struct page *page;
> @@ -326,6 +326,7 @@ struct xen_blkif {
>       struct work_struct      free_work;
>       /* Thread shutdown wait queue. */
>       wait_queue_head_t       shutdown_wq;
> +     int nr_ring_pages;
>  };
>  
>  struct seg_buf {
> diff --git a/drivers/block/xen-blkback/xenbus.c 
> b/drivers/block/xen-blkback/xenbus.c
> index ff30259..b75be66 100644
> --- a/drivers/block/xen-blkback/xenbus.c
> +++ b/drivers/block/xen-blkback/xenbus.c
> @@ -193,8 +193,8 @@ fail:
>       return ERR_PTR(-ENOMEM);
>  }
>  
> -static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
> -                      unsigned int evtchn)
> +static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
> +                      unsigned int nr_grefs, unsigned int evtchn)
>  {
>       int err;
>  
> @@ -202,7 +202,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, 
> grant_ref_t gref,
>       if (blkif->irq)
>               return 0;
>  
> -     err = xenbus_map_ring_valloc(blkif->be->dev, &gref, 1,
> +     err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
>                                    &blkif->blk_ring);
>       if (err < 0)
>               return err;
> @@ -212,21 +212,21 @@ static int xen_blkif_map(struct xen_blkif *blkif, 
> grant_ref_t gref,
>       {
>               struct blkif_sring *sring;
>               sring = (struct blkif_sring *)blkif->blk_ring;
> -             BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
> +             BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE * 
> nr_grefs);
>               break;
>       }
>       case BLKIF_PROTOCOL_X86_32:
>       {
>               struct blkif_x86_32_sring *sring_x86_32;
>               sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
> -             BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, 
> PAGE_SIZE);
> +             BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, 
> PAGE_SIZE * nr_grefs);
>               break;
>       }
>       case BLKIF_PROTOCOL_X86_64:
>       {
>               struct blkif_x86_64_sring *sring_x86_64;
>               sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
> -             BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, 
> PAGE_SIZE);
> +             BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, 
> PAGE_SIZE * nr_grefs);
>               break;
>       }
>       default:
> @@ -588,6 +588,10 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
>       if (err)
>               goto fail;
>  
> +     err = xenbus_printf(XBT_NIL, dev->nodename, "feature-multi-page-ring", 
> "%u", 1);
> +     if (err)
> +             goto fail;
> +
>       err = xenbus_switch_state(dev, XenbusStateInitWait);
>       if (err)
>               goto fail;
> @@ -852,22 +856,61 @@ again:
>  static int connect_ring(struct backend_info *be)
>  {
>       struct xenbus_device *dev = be->dev;
> -     unsigned long ring_ref;
> -     unsigned int evtchn;
> +     unsigned int ring_ref[XENBUS_MAX_RING_PAGES];
> +     unsigned int evtchn, nr_grefs;
>       unsigned int pers_grants;
>       char protocol[64] = "";
>       int err;
>  
>       DPRINTK("%s", dev->otherend);
> -
> -     err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
> -                         &ring_ref, "event-channel", "%u", &evtchn, NULL);
> -     if (err) {
> -             xenbus_dev_fatal(dev, err,
> -                              "reading %s/ring-ref and event-channel",
> +     err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
> +                       &evtchn);
> +     if (err != 1) {
> +             err = -EINVAL;
> +             xenbus_dev_fatal(dev, err, "reading %s/event-channel",
>                                dev->otherend);
>               return err;
>       }
> +     pr_info(DRV_PFX "event-channel %u\n", evtchn);
> +
> +     err = xenbus_scanf(XBT_NIL, dev->otherend, "max-ring-pages", "%u",
> +                       &nr_grefs);
> +     if (err != 1) {
> +             err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
> +                               "%u", &ring_ref[0]);
> +             if (err != 1) {
> +                     err = -EINVAL;
> +                     xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
> +                                      dev->otherend);
> +                     return err;
> +             }
> +             nr_grefs = 1;
> +             pr_info(DRV_PFX "%s:using single page: ring-ref %d\n",
> +                              dev->otherend, ring_ref[0]);
> +     } else {
> +             unsigned int i;
> +
> +             if (nr_grefs > XENBUS_MAX_RING_PAGES) {
> +                     err = -EINVAL;
> +                     xenbus_dev_fatal(dev, err, "%s/request %d ring pages 
> exceed max",
> +                                      dev->otherend, nr_grefs);
> +                     return err;
> +             }
> +             for (i = 0; i < nr_grefs; i++) {
> +                     char ring_ref_name[15];
> +
> +                     snprintf(ring_ref_name, sizeof(ring_ref_name), 
> "ring-ref%u", i);
> +                     err = xenbus_scanf(XBT_NIL, dev->otherend,
> +                                        ring_ref_name, "%d", &ring_ref[i]);
> +                     if (err != 1) {
> +                             err = -EINVAL;
> +                             xenbus_dev_fatal(dev, err, "reading %s/%s",
> +                                              dev->otherend, ring_ref_name);
> +                             return err;
> +                     }
> +                     pr_info(DRV_PFX "ring-ref%u: %d\n", i, ring_ref[i]);
> +             }
> +     }
>  
>       be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
>       err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
> @@ -892,16 +935,16 @@ static int connect_ring(struct backend_info *be)
>  
>       be->blkif->vbd.feature_gnt_persistent = pers_grants;
>       be->blkif->vbd.overflow_max_grants = 0;
> +     be->blkif->nr_ring_pages = nr_grefs;
>  
> -     pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s) %s\n",
> -             ring_ref, evtchn, be->blkif->blk_protocol, protocol,
> -             pers_grants ? "persistent grants" : "");
> +     pr_info(DRV_PFX "ring-pages:%d, event-channel %d, protocol %d (%s) 
> %s\n",
> +                     nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
> +                     pers_grants ? "persistent grants" : "");
>  
>       /* Map the shared frame, irq etc. */
> -     err = xen_blkif_map(be->blkif, ring_ref, evtchn);
> +     err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn);
>       if (err) {
> -             xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
> -                              ring_ref, evtchn);
> +             xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
>               return err;
>       }
>  
> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
> index 2c61cf8..528e4f6 100644
> --- a/drivers/block/xen-blkfront.c
> +++ b/drivers/block/xen-blkfront.c
> @@ -98,7 +98,12 @@ static unsigned int xen_blkif_max_segments = 32;
>  module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
>  MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests 
> (default is 32)");
>  
> -#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
> +static unsigned int xen_blkif_max_ring_pages = 1;
> +module_param_named(max_ring_pages, xen_blkif_max_ring_pages, int, 0644);
> +MODULE_PARM_DESC(max_ring_pages, "Maximum amount of pages to be used as the 
> ring");
> +
> +#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * 
> info->nr_ring_pages)
> +#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * 
> XENBUS_MAX_RING_PAGES)
>  
>  /*
>   * We have one of these per vbd, whether ide, scsi or 'other'.  They
> @@ -114,13 +119,14 @@ struct blkfront_info
>       int vdevice;
>       blkif_vdev_t handle;
>       enum blkif_state connected;
> -     int ring_ref;
> +     int ring_ref[XENBUS_MAX_RING_PAGES];
> +     int nr_ring_pages;
>       struct blkif_front_ring ring;
>       unsigned int evtchn, irq;
>       struct request_queue *rq;
>       struct work_struct work;
>       struct gnttab_free_callback callback;
> -     struct blk_shadow shadow[BLK_RING_SIZE];
> +     struct blk_shadow shadow[BLK_MAX_RING_SIZE];
>       struct list_head grants;
>       struct list_head indirect_pages;
>       unsigned int persistent_gnts_c;
> @@ -139,8 +145,6 @@ static unsigned int nr_minors;
>  static unsigned long *minors;
>  static DEFINE_SPINLOCK(minor_lock);
>  
> -#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
> -     (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
>  #define GRANT_INVALID_REF    0
>  
>  #define PARTS_PER_DISK               16
> @@ -1033,12 +1037,15 @@ free_shadow:
>       flush_work(&info->work);
>  
>       /* Free resources associated with old device channel. */
> -     if (info->ring_ref != GRANT_INVALID_REF) {
> -             gnttab_end_foreign_access(info->ring_ref, 0,
> -                                       (unsigned long)info->ring.sring);
> -             info->ring_ref = GRANT_INVALID_REF;
> -             info->ring.sring = NULL;
> +     for (i = 0; i < info->nr_ring_pages; i++) {
> +             if (info->ring_ref[i] != GRANT_INVALID_REF) {
> +                     gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
> +                     info->ring_ref[i] = GRANT_INVALID_REF;
> +             }
>       }
> +     free_pages((unsigned long)info->ring.sring, 
> get_order(info->nr_ring_pages * PAGE_SIZE));
> +     info->ring.sring = NULL;
> +
>       if (info->irq)
>               unbind_from_irqhandler(info->irq, info);
>       info->evtchn = info->irq = 0;
> @@ -1245,26 +1252,30 @@ static int setup_blkring(struct xenbus_device *dev,
>                        struct blkfront_info *info)
>  {
>       struct blkif_sring *sring;
> -     grant_ref_t gref;
> -     int err;
> +     int err, i;
> +     unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE;
> +     grant_ref_t gref[XENBUS_MAX_RING_PAGES];
>  
> -     info->ring_ref = GRANT_INVALID_REF;
> +     for (i = 0; i < XENBUS_MAX_RING_PAGES; i++)
> +             info->ring_ref[i] = GRANT_INVALID_REF;
>  
> -     sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
> +     sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
> +                                                    get_order(ring_size));
>       if (!sring) {
>               xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
>               return -ENOMEM;
>       }
>       SHARED_RING_INIT(sring);
> -     FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
> +     FRONT_RING_INIT(&info->ring, sring, ring_size);
>  
> -     err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref);
> +     err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, 
> gref);
>       if (err < 0) {
>               free_page((unsigned long)sring);
>               info->ring.sring = NULL;
>               goto fail;
>       }
> -     info->ring_ref = gref;
> +     for (i = 0; i < info->nr_ring_pages; i++)
> +             info->ring_ref[i] = gref[i];
>  
>       err = xenbus_alloc_evtchn(dev, &info->evtchn);
>       if (err)
> @@ -1292,7 +1303,14 @@ static int talk_to_blkback(struct xenbus_device *dev,
>  {
>       const char *message = NULL;
>       struct xenbus_transaction xbt;
> -     int err;
> +     int err, i, multi_ring_pages = 0;
> +
> +     err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
> +                        "feature-multi-page-ring", "%u", &multi_ring_pages);
> +     if (err != 1)
> +             info->nr_ring_pages = 1;
> +     else
> +             info->nr_ring_pages = xen_blkif_max_ring_pages;
>  
>       /* Create shared ring, alloc event channel. */
>       err = setup_blkring(dev, info);
> @@ -1306,11 +1324,31 @@ again:
>               goto destroy_blkring;
>       }
>  
> -     err = xenbus_printf(xbt, dev->nodename,
> -                         "ring-ref", "%u", info->ring_ref);
> -     if (err) {
> -             message = "writing ring-ref";
> -             goto abort_transaction;
> +     if (info->nr_ring_pages == 1) {
> +             err = xenbus_printf(xbt, dev->nodename,
> +                                 "ring-ref", "%u", info->ring_ref[0]);
> +             if (err) {
> +                     message = "writing ring-ref";
> +                     goto abort_transaction;
> +             }
> +     } else {
> +             err = xenbus_printf(xbt, dev->nodename,
> +                                 "max-ring-pages", "%u", 
> info->nr_ring_pages);
> +             if (err) {
> +                     message = "writing max-ring-pages";
> +                     goto abort_transaction;
> +             }
> +
> +             for (i = 0; i < info->nr_ring_pages; i++) {
> +                     char ring_ref_name[15];
> +                     sprintf(ring_ref_name, "ring-ref%d", i);
> +                     err = xenbus_printf(xbt, dev->nodename,
> +                                     ring_ref_name, "%d", info->ring_ref[i]);
> +                     if (err) {
> +                             message = "writing ring-ref";
> +                             goto abort_transaction;
> +                     }
> +             }
>       }
>       err = xenbus_printf(xbt, dev->nodename,
>                           "event-channel", "%u", info->evtchn);
> @@ -1338,6 +1376,7 @@ again:
>               goto destroy_blkring;
>       }
>  
> +     info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
>       xenbus_switch_state(dev, XenbusStateInitialised);
>  
>       return 0;
> @@ -1422,21 +1461,13 @@ static int blkfront_probe(struct xenbus_device *dev,
>       info->connected = BLKIF_STATE_DISCONNECTED;
>       INIT_WORK(&info->work, blkif_restart_queue);
>  
> -     for (i = 0; i < BLK_RING_SIZE; i++)
> +     for (i = 0; i < BLK_MAX_RING_SIZE; i++)
>               info->shadow[i].req.u.rw.id = i+1;
> -     info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
>  
>       /* Front end dir is a number, which is used as the id. */
>       info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
>       dev_set_drvdata(&dev->dev, info);
>  
> -     err = talk_to_blkback(dev, info);
> -     if (err) {
> -             kfree(info);
> -             dev_set_drvdata(&dev->dev, NULL);
> -             return err;
> -     }
> -
>       return 0;
>  }
>  
> @@ -1476,7 +1507,7 @@ static int blkif_recover(struct blkfront_info *info)
>  
>       /* Stage 2: Set up free list. */
>       memset(&info->shadow, 0, sizeof(info->shadow));
> -     for (i = 0; i < BLK_RING_SIZE; i++)
> +     for (i = 0; i < BLK_MAX_RING_SIZE; i++)
>               info->shadow[i].req.u.rw.id = i+1;
>       info->shadow_free = info->ring.req_prod_pvt;
>       info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
> @@ -1906,8 +1937,17 @@ static void blkback_changed(struct xenbus_device *dev,
>       dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", 
> backend_state);
>  
>       switch (backend_state) {
> -     case XenbusStateInitialising:
>       case XenbusStateInitWait:
> +             /*
> +              * Talk to blkback after the backend enters InitWait so we know
> +              * whether the backend supports feature-multi-page-ring.
> +              */
> +             if (talk_to_blkback(dev, info)) {
> +                     kfree(info);
> +                     dev_set_drvdata(&dev->dev, NULL);
> +             }
> +             break;
> +     case XenbusStateInitialising:
>       case XenbusStateInitialised:
>       case XenbusStateReconfiguring:
>       case XenbusStateReconfigured:
> @@ -2088,6 +2128,9 @@ static int __init xlblk_init(void)
>  {
>       int ret;
>  
> +     if (xen_blkif_max_ring_pages > XENBUS_MAX_RING_PAGES)
> +             return -EINVAL;
> +
>       if (!xen_domain())
>               return -ENODEV;
>  
> diff --git a/include/xen/interface/io/blkif.h 
> b/include/xen/interface/io/blkif.h
> index c33e1c4..9c2c9a7 100644
> --- a/include/xen/interface/io/blkif.h
> +++ b/include/xen/interface/io/blkif.h
> @@ -127,6 +127,19 @@ typedef uint64_t blkif_sector_t;
>  #define BLKIF_OP_INDIRECT          6
>  
>  /*
> + * "feature-multi-page-ring" is introduced to use more than one page as the
> + * request ring between blkfront and backend. As a result, more requests can
> + * be issued from blkfront and performance improves accordingly.
> + *
> + * If supported, the backend will write the key "feature-multi-page-ring"
> + * for that vbd.
> + * Frontends that are aware of this feature and wish to use it will write the
> + * key "max-ring-pages" to set the number of pages that they wish to be used
> + * as the ring. The 'xen_blkif_max_ring_pages' module parameter controls this
> + * number; it defaults to one page and its maximum is XENBUS_MAX_RING_PAGES.
> + */
> +
> +/*
>   * Maximum scatter/gather segments per request.
>   * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
>   * NB. This could be 12 if the ring indexes weren't stored in the same page.
>

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.