[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v3 13/20] block/export: rewrite vduse-blk drain code



Hi Stefan,

On Thu, Apr 20, 2023 at 7:39 PM Stefan Hajnoczi <stefanha@xxxxxxxxxx> wrote:
>
> vduse_blk_detach_ctx() waits for in-flight requests using
> AIO_WAIT_WHILE(). This is not allowed according to a comment in
> bdrv_set_aio_context_commit():
>
>   /*
>    * Take the old AioContex when detaching it from bs.
>    * At this point, new_context lock is already acquired, and we are now
>    * also taking old_context. This is safe as long as bdrv_detach_aio_context
>    * does not call AIO_POLL_WHILE().
>    */
>
> Use this opportunity to rewrite the drain code in vduse-blk:
>
> - Use the BlockExport refcount so that vduse_blk_exp_delete() is only
>   called when there are no more requests in flight.
>
> - Implement .drained_poll() so in-flight request coroutines are stopped
>   by the time .bdrv_detach_aio_context() is called.
>
> - Remove AIO_WAIT_WHILE() from vduse_blk_detach_ctx() to solve the
>   .bdrv_detach_aio_context() constraint violation. It's no longer
>   needed due to the previous changes.
>
> - Always handle the VDUSE file descriptor, even in drained sections. The
>   VDUSE file descriptor doesn't submit I/O, so it's safe to handle it in
>   drained sections. This ensures that the VDUSE kernel code gets a fast
>   response.
>
> - Suspend virtqueue fd handlers in .drained_begin() and resume them in
>   .drained_end(). This eliminates the need for the
>   aio_set_fd_handler(is_external=true) flag, which is being removed from
>   QEMU.
>
> This is a long list but splitting it into individual commits would
> probably lead to git bisect failures - the changes are all related.
>
> Signed-off-by: Stefan Hajnoczi <stefanha@xxxxxxxxxx>
> ---
>  block/export/vduse-blk.c | 132 +++++++++++++++++++++++++++------------
>  1 file changed, 93 insertions(+), 39 deletions(-)
>
> diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
> index f7ae44e3ce..35dc8fcf45 100644
> --- a/block/export/vduse-blk.c
> +++ b/block/export/vduse-blk.c
> @@ -31,7 +31,8 @@ typedef struct VduseBlkExport {
>      VduseDev *dev;
>      uint16_t num_queues;
>      char *recon_file;
> -    unsigned int inflight;
> +    unsigned int inflight; /* atomic */
> +    bool vqs_started;
>  } VduseBlkExport;
>
>  typedef struct VduseBlkReq {
> @@ -41,13 +42,24 @@ typedef struct VduseBlkReq {
>
>  static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp)
>  {
> -    vblk_exp->inflight++;
> +    if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) {

I wonder why we need to use atomic operations here.

> +        /* Prevent export from being deleted */
> +        aio_context_acquire(vblk_exp->export.ctx);
> +        blk_exp_ref(&vblk_exp->export);
> +        aio_context_release(vblk_exp->export.ctx);
> +    }
>  }
>
>  static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp)
>  {
> -    if (--vblk_exp->inflight == 0) {
> +    if (qatomic_fetch_dec(&vblk_exp->inflight) == 1) {
> +        /* Wake AIO_WAIT_WHILE() */
>          aio_wait_kick();
> +
> +        /* Now the export can be deleted */
> +        aio_context_acquire(vblk_exp->export.ctx);
> +        blk_exp_unref(&vblk_exp->export);
> +        aio_context_release(vblk_exp->export.ctx);
>      }
>  }
>
> @@ -124,8 +136,12 @@ static void vduse_blk_enable_queue(VduseDev *dev, 
> VduseVirtq *vq)
>  {
>      VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
>
> +    if (!vblk_exp->vqs_started) {
> +        return; /* vduse_blk_drained_end() will start vqs later */
> +    }
> +
>      aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
> -                       true, on_vduse_vq_kick, NULL, NULL, NULL, vq);
> +                       false, on_vduse_vq_kick, NULL, NULL, NULL, vq);
>      /* Make sure we don't miss any kick afer reconnecting */
>      eventfd_write(vduse_queue_get_fd(vq), 1);
>  }
> @@ -133,9 +149,14 @@ static void vduse_blk_enable_queue(VduseDev *dev, 
> VduseVirtq *vq)
>  static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq)
>  {
>      VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
> +    int fd = vduse_queue_get_fd(vq);
>
> -    aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
> -                       true, NULL, NULL, NULL, NULL, NULL);
> +    if (fd < 0) {
> +        return;
> +    }
> +
> +    aio_set_fd_handler(vblk_exp->export.ctx, fd, false,
> +                       NULL, NULL, NULL, NULL, NULL);
>  }
>
>  static const VduseOps vduse_blk_ops = {
> @@ -152,42 +173,19 @@ static void on_vduse_dev_kick(void *opaque)
>
>  static void vduse_blk_attach_ctx(VduseBlkExport *vblk_exp, AioContext *ctx)
>  {
> -    int i;
> -
>      aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
> -                       true, on_vduse_dev_kick, NULL, NULL, NULL,
> +                       false, on_vduse_dev_kick, NULL, NULL, NULL,
>                         vblk_exp->dev);
>
> -    for (i = 0; i < vblk_exp->num_queues; i++) {
> -        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
> -        int fd = vduse_queue_get_fd(vq);
> -
> -        if (fd < 0) {
> -            continue;
> -        }
> -        aio_set_fd_handler(vblk_exp->export.ctx, fd, true,
> -                           on_vduse_vq_kick, NULL, NULL, NULL, vq);
> -    }
> +    /* Virtqueues are handled by vduse_blk_drained_end() */
>  }
>
>  static void vduse_blk_detach_ctx(VduseBlkExport *vblk_exp)
>  {
> -    int i;
> -
> -    for (i = 0; i < vblk_exp->num_queues; i++) {
> -        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
> -        int fd = vduse_queue_get_fd(vq);
> -
> -        if (fd < 0) {
> -            continue;
> -        }
> -        aio_set_fd_handler(vblk_exp->export.ctx, fd,
> -                           true, NULL, NULL, NULL, NULL, NULL);
> -    }
>      aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
> -                       true, NULL, NULL, NULL, NULL, NULL);
> +                       false, NULL, NULL, NULL, NULL, NULL);
>
> -    AIO_WAIT_WHILE(vblk_exp->export.ctx, vblk_exp->inflight > 0);
> +    /* Virtqueues are handled by vduse_blk_drained_begin() */
>  }
>
>
> @@ -220,8 +218,55 @@ static void vduse_blk_resize(void *opaque)
>                              (char *)&config.capacity);
>  }
>
> +static void vduse_blk_stop_virtqueues(VduseBlkExport *vblk_exp)
> +{
> +    for (uint16_t i = 0; i < vblk_exp->num_queues; i++) {
> +        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
> +        vduse_blk_disable_queue(vblk_exp->dev, vq);
> +    }
> +
> +    vblk_exp->vqs_started = false;
> +}
> +
> +static void vduse_blk_start_virtqueues(VduseBlkExport *vblk_exp)
> +{
> +    vblk_exp->vqs_started = true;
> +
> +    for (uint16_t i = 0; i < vblk_exp->num_queues; i++) {
> +        VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
> +        vduse_blk_enable_queue(vblk_exp->dev, vq);
> +    }
> +}
> +
> +static void vduse_blk_drained_begin(void *opaque)
> +{
> +    BlockExport *exp = opaque;
> +    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
> +
> +    vduse_blk_stop_virtqueues(vblk_exp);
> +}
> +
> +static void vduse_blk_drained_end(void *opaque)
> +{
> +    BlockExport *exp = opaque;
> +    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
> +
> +    vduse_blk_start_virtqueues(vblk_exp);
> +}
> +
> +static bool vduse_blk_drained_poll(void *opaque)
> +{
> +    BlockExport *exp = opaque;
> +    VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
> +
> +    return qatomic_read(&vblk_exp->inflight) > 0;
> +}
> +
>  static const BlockDevOps vduse_block_ops = {
> -    .resize_cb = vduse_blk_resize,
> +    .resize_cb     = vduse_blk_resize,
> +    .drained_begin = vduse_blk_drained_begin,
> +    .drained_end   = vduse_blk_drained_end,
> +    .drained_poll  = vduse_blk_drained_poll,
>  };
>
>  static int vduse_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
> @@ -268,6 +313,7 @@ static int vduse_blk_exp_create(BlockExport *exp, 
> BlockExportOptions *opts,
>      vblk_exp->handler.serial = g_strdup(vblk_opts->serial ?: "");
>      vblk_exp->handler.logical_block_size = logical_block_size;
>      vblk_exp->handler.writable = opts->writable;
> +    vblk_exp->vqs_started = true;
>
>      config.capacity =
>              cpu_to_le64(blk_getlength(exp->blk) >> VIRTIO_BLK_SECTOR_BITS);
> @@ -322,14 +368,20 @@ static int vduse_blk_exp_create(BlockExport *exp, 
> BlockExportOptions *opts,
>          vduse_dev_setup_queue(vblk_exp->dev, i, queue_size);
>      }
>
> -    aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), true,
> +    aio_set_fd_handler(exp->ctx, vduse_dev_get_fd(vblk_exp->dev), false,
>                         on_vduse_dev_kick, NULL, NULL, NULL, vblk_exp->dev);
>
>      blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
>                                   vblk_exp);
> -
>      blk_set_dev_ops(exp->blk, &vduse_block_ops, exp);
>
> +    /*
> +     * We handle draining ourselves using an in-flight counter and by 
> disabling
> +     * virtqueue fd handlers. Do not queue BlockBackend requests, they need 
> to
> +     * complete so the in-flight counter reaches zero.
> +     */
> +    blk_set_disable_request_queuing(exp->blk, true);
> +
>      return 0;
>  err:
>      vduse_dev_destroy(vblk_exp->dev);
> @@ -344,6 +396,9 @@ static void vduse_blk_exp_delete(BlockExport *exp)
>      VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
>      int ret;
>
> +    assert(qatomic_read(&vblk_exp->inflight) == 0);
> +
> +    vduse_blk_detach_ctx(vblk_exp);
>      blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, 
> blk_aio_detach,
>                                      vblk_exp);
>      blk_set_dev_ops(exp->blk, NULL, NULL);
> @@ -355,13 +410,12 @@ static void vduse_blk_exp_delete(BlockExport *exp)
>      g_free(vblk_exp->handler.serial);
>  }
>
> +/* Called with exp->ctx acquired */
>  static void vduse_blk_exp_request_shutdown(BlockExport *exp)
>  {
>      VduseBlkExport *vblk_exp = container_of(exp, VduseBlkExport, export);
>
> -    aio_context_acquire(vblk_exp->export.ctx);
> -    vduse_blk_detach_ctx(vblk_exp);
> -    aio_context_acquire(vblk_exp->export.ctx);
> +    vduse_blk_stop_virtqueues(vblk_exp);

Can we add a AIO_WAIT_WHILE() here? Then we don't need to
increase/decrease the BlockExport refcount during I/O processing.

Thanks,
Yongji



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.