Xen project Mailing List

[Xen-changelog] [qemu-xen master] Merge remote-tracking branch 'remotes/cohuck/tags/s390x-20161031' into staging

Date: Sat, 07 Jan 2017 13:22:03 +0000

Delivery-date: Sat, 07 Jan 2017 13:22:13 +0000

List-id: "Change log for Mercurial $receive only$" <xen-changelog.lists.xen.org>

=== This changeset includes merge from high-traffic branch === Commits on that branch are not reported individually. commit 0bb1137930f51a89fb1bfeb0c46aa68af0395167 Merge: eab9e9629c8f3c0ce87f7bcad82176f55029a640 88ee13c7b656e5504613b527f3a51591e9afae69 Author: Peter Maydell <peter.maydell@xxxxxxxxxx> AuthorDate: Mon Oct 31 14:48:47 2016 +0000 Commit: Peter Maydell <peter.maydell@xxxxxxxxxx> CommitDate: Mon Oct 31 14:48:47 2016 +0000 Merge remote-tracking branch 'remotes/cohuck/tags/s390x-20161031' into staging Two PCI fixes/improvements for s390x. # gpg: Signature made Mon 31 Oct 2016 10:09:24 GMT # gpg: using RSA key 0xDECF6B93C6F02FAF # gpg: Good signature from "Cornelia Huck <huckc@xxxxxxxxxxxxxxxxxx>" # gpg: aka "Cornelia Huck <cornelia.huck@xxxxxxxxxx>" # Primary key fingerprint: C3D0 D66D C362 4FF6 A8C0 18CE DECF 6B93 C6F0 2FAF * remotes/cohuck/tags/s390x-20161031: s390x/pci: Check memory region dispatching callbacks s390x/pci: use generic interface to inject interrupt Signed-off-by: Peter Maydell <peter.maydell@xxxxxxxxxx> MAINTAINERS | 19 ++ Makefile | 2 +- accel.c | 1 - async.c | 29 +- block.c | 6 +- block/backup.c | 17 ++ block/block-backend.c | 30 +- block/commit.c | 2 +- block/io.c | 137 +++++----- block/mirror.c | 70 +++-- block/nfs.c | 55 ++-- block/qed-table.c | 16 +- block/qed.c | 16 +- block/replication.c | 27 +- block/sheepdog.c | 67 +++-- blockjob.c | 37 +-- configure | 11 + cputlb.c | 1 - docs/COLO-FT.txt | 189 +++++++++++++ docs/multiple-iothreads.txt | 40 +-- docs/qmp-commands.txt | 17 +- gdbstub.c | 1 - hmp-commands.hx | 15 + hmp.c | 16 ++ hmp.h | 1 + hw/arm/cubieboard.c | 1 + hw/arm/pxa2xx.c | 4 +- hw/arm/spitz.c | 13 +- hw/arm/tosa.c | 12 +- hw/arm/versatilepb.c | 9 + hw/arm/virt-acpi-build.c | 2 +- hw/arm/virt.c | 9 +- hw/block/nvme.c | 4 +- hw/block/xen_disk.c | 65 ++--- hw/char/cadence_uart.c | 3 +- hw/char/xen_console.c | 30 +- hw/display/milkymist-tmu2.c | 2 +- hw/display/xenfb.c | 127 +++++---- hw/gpio/imx_gpio.c | 2 +- 
hw/i386/acpi-build.c | 1 - hw/microblaze/boot.c | 1 - hw/mips/mips_malta.c | 1 - hw/misc/milkymist-pfpu.c | 2 +- hw/net/xen_nic.c | 36 ++- hw/nvram/fw_cfg.c | 1 - hw/pci-bridge/pci_expander_bridge.c | 1 - hw/ppc/ppc405_boards.c | 1 - hw/ppc/spapr.c | 1 - hw/s390x/s390-pci-bus.c | 10 +- hw/scsi/virtio-scsi-dataplane.c | 4 +- hw/timer/grlib_gptimer.c | 1 - hw/tpm/tpm_passthrough.c | 6 +- hw/tpm/tpm_tis.c | 1 - hw/unicore32/puv3.c | 1 - hw/usb/ccid-card-emulated.c | 3 +- hw/usb/ccid-card-passthru.c | 6 - hw/usb/ccid.h | 2 +- hw/usb/dev-mtp.c | 1 - hw/usb/dev-smartcard-reader.c | 11 +- hw/usb/xen-usb.c | 46 ++-- hw/xen/Makefile.objs | 2 +- hw/xen/xen_backend.c | 348 +++--------------------- hw/xen/xen_devconfig.c | 4 +- hw/xen/xen_pvdev.c | 316 +++++++++++++++++++++ include/block/aio.h | 24 +- include/block/block.h | 31 ++- include/block/block_int.h | 27 +- include/block/blockjob.h | 7 + include/hw/i386/pc.h | 1 - include/hw/xen/xen_backend.h | 72 +---- include/hw/xen/xen_pvdev.h | 78 ++++++ include/migration/colo.h | 38 +++ include/migration/failover.h | 26 ++ include/migration/migration.h | 8 + include/monitor/monitor.h | 2 +- include/qemu/rfifolock.h | 54 ---- include/qemu/thread-posix.h | 6 + include/qemu/thread-win32.h | 10 + include/qemu/thread.h | 3 + iothread.c | 33 ++- migration/Makefile.objs | 2 + migration/colo-comm.c | 72 +++++ migration/colo-failover.c | 83 ++++++ migration/colo.c | 529 ++++++++++++++++++++++++++++++++++++ migration/migration.c | 86 +++++- migration/ram.c | 37 ++- migration/trace-events | 6 + monitor.c | 4 +- net/colo-compare.c | 29 +- net/trace-events | 3 +- qapi-schema.json | 100 ++++++- qemu-ga.texi | 2 +- qemu-img.c | 6 + qemu-io-cmds.c | 7 +- qemu-options.hx | 12 +- qmp.c | 1 - scripts/clean-includes | 56 +++- scripts/hxtool | 20 +- scripts/tracetool.py | 2 +- stubs/Makefile.objs | 2 + stubs/iothread.c | 8 + stubs/migration-colo.c | 46 ++++ target-arm/cpu.c | 15 + target-arm/cpu.h | 1 + target-arm/cpu64.c | 2 + target-arm/kvm64.c | 17 
+- target-i386/machine.c | 3 - target-lm32/translate.c | 57 ++-- target-mips/machine.c | 1 - target-ppc/machine.c | 1 - target-ppc/mem_helper.c | 1 - target-sparc/machine.c | 3 - target-xtensa/translate.c | 1 - tests/.gitignore | 1 - tests/Makefile.include | 2 - tests/crypto-tls-x509-helpers.h | 3 - tests/test-aio.c | 22 +- tests/test-rfifolock.c | 91 ------- tests/vhost-user-test.c | 2 - util/Makefile.objs | 1 - util/oslib-posix.c | 1 - util/qemu-thread-posix.c | 14 + util/qemu-thread-win32.c | 25 ++ util/rfifolock.c | 78 ------ vl.c | 23 +- xen-common.c | 4 +- 126 files changed, 2617 insertions(+), 1197 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 82d4d00..3fecf45 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -63,6 +63,17 @@ W: http://wiki.qemu.org/SecurityProcess M: Michael S. Tsirkin <mst@xxxxxxxxxx> L: secalert@xxxxxxxxxx +Trivial patches +--------------- +Trivial patches +M: Michael Tokarev <mjt@xxxxxxxxxx> +M: Laurent Vivier <laurent@xxxxxxxxx> +S: Maintained +L: qemu-trivial@xxxxxxxxxx +K: ^Subject:.*(?i)trivial +T: git git://git.corpit.ru/qemu.git trivial-patches +T: git git://github.com/vivier/qemu.git trivial-patches + Guest CPU cores (TCG): ---------------------- Overall @@ -1415,6 +1426,14 @@ F: util/uuid.c F: include/qemu/uuid.h F: tests/test-uuid.c +COLO Framework +M: zhanghailiang <zhang.zhanghailiang@xxxxxxxxxx> +S: Maintained +F: migration/colo* +F: include/migration/colo.h +F: include/migration/failover.h +F: docs/COLO-FT.txt + COLO Proxy M: Zhang Chen <zhangchen.fnst@xxxxxxxxxxxxxx> M: Li Zhijian <lizhijian@xxxxxxxxxxxxxx> diff --git a/Makefile b/Makefile index 11f5154..474cc5e 100644 --- a/Makefile +++ b/Makefile @@ -695,7 +695,7 @@ help: @echo '' ifdef CONFIG_WIN32 @echo 'Windows targets:' - @echo ' installer - Build NSIS-based installer for qemu-ga' + @echo ' installer - Build NSIS-based installer for QEMU' ifdef QEMU_GA_MSI_ENABLED @echo ' msi - Build MSI-based installer for qemu-ga' endif diff --git a/accel.c b/accel.c index 
403eb5e..664bb88 100644 --- a/accel.c +++ b/accel.c @@ -33,7 +33,6 @@ #include "sysemu/qtest.h" #include "hw/xen/xen.h" #include "qom/object.h" -#include "hw/boards.h" int tcg_tb_size; static bool tcg_allowed = true; diff --git a/async.c b/async.c index f30d011..b2de360 100644 --- a/async.c +++ b/async.c @@ -61,6 +61,7 @@ void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque) smp_wmb(); ctx->first_bh = bh; qemu_mutex_unlock(&ctx->bh_lock); + aio_notify(ctx); } QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque) @@ -106,8 +107,8 @@ int aio_bh_poll(AioContext *ctx) * aio_notify again if necessary. */ if (atomic_xchg(&bh->scheduled, 0)) { - /* Idle BHs and the notify BH don't count as progress */ - if (!bh->idle && bh != ctx->notify_dummy_bh) { + /* Idle BHs don't count as progress */ + if (!bh->idle) { ret = 1; } bh->idle = 0; @@ -259,7 +260,6 @@ aio_ctx_finalize(GSource *source) { AioContext *ctx = (AioContext *) source; - qemu_bh_delete(ctx->notify_dummy_bh); thread_pool_free(ctx->thread_pool); #ifdef CONFIG_LINUX_AIO @@ -284,7 +284,7 @@ aio_ctx_finalize(GSource *source) aio_set_event_notifier(ctx, &ctx->notifier, false, NULL); event_notifier_cleanup(&ctx->notifier); - rfifolock_destroy(&ctx->lock); + qemu_rec_mutex_destroy(&ctx->lock); qemu_mutex_destroy(&ctx->bh_lock); timerlistgroup_deinit(&ctx->tlg); } @@ -345,19 +345,6 @@ static void aio_timerlist_notify(void *opaque) aio_notify(opaque); } -static void aio_rfifolock_cb(void *opaque) -{ - AioContext *ctx = opaque; - - /* Kick owner thread in case they are blocked in aio_poll() */ - qemu_bh_schedule(ctx->notify_dummy_bh); -} - -static void notify_dummy_bh(void *opaque) -{ - /* Do nothing, we were invoked just to force the event loop to iterate */ -} - static void event_notifier_dummy_cb(EventNotifier *e) { } @@ -385,11 +372,9 @@ AioContext *aio_context_new(Error **errp) #endif ctx->thread_pool = NULL; qemu_mutex_init(&ctx->bh_lock); - rfifolock_init(&ctx->lock, 
aio_rfifolock_cb, ctx); + qemu_rec_mutex_init(&ctx->lock); timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx); - ctx->notify_dummy_bh = aio_bh_new(ctx, notify_dummy_bh, NULL); - return ctx; fail: g_source_destroy(&ctx->source); @@ -408,10 +393,10 @@ void aio_context_unref(AioContext *ctx) void aio_context_acquire(AioContext *ctx) { - rfifolock_lock(&ctx->lock); + qemu_rec_mutex_lock(&ctx->lock); } void aio_context_release(AioContext *ctx) { - rfifolock_unlock(&ctx->lock); + qemu_rec_mutex_unlock(&ctx->lock); } diff --git a/block.c b/block.c index 7f3e7bc..a17baab 100644 --- a/block.c +++ b/block.c @@ -2082,7 +2082,7 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, * to all devices. * */ -int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp) +int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp) { int ret = -1; BlockReopenQueueEntry *bs_entry, *next; @@ -2090,7 +2090,9 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp) assert(bs_queue != NULL); + aio_context_release(ctx); bdrv_drain_all(); + aio_context_acquire(ctx); QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) { if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) { @@ -2131,7 +2133,7 @@ int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp) Error *local_err = NULL; BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags); - ret = bdrv_reopen_multiple(queue, &local_err); + ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, &local_err); if (local_err != NULL) { error_propagate(errp, local_err); } diff --git a/block/backup.c b/block/backup.c index 02dbe48..81d4042 100644 --- a/block/backup.c +++ b/block/backup.c @@ -300,6 +300,21 @@ void backup_cow_request_end(CowRequest *req) cow_request_end(req); } +static void backup_drain(BlockJob *job) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common); + + /* Need to keep a reference in case blk_drain triggers execution + * of 
backup_complete... + */ + if (s->target) { + BlockBackend *target = s->target; + blk_ref(target); + blk_drain(target); + blk_unref(target); + } +} + static const BlockJobDriver backup_job_driver = { .instance_size = sizeof(BackupBlockJob), .job_type = BLOCK_JOB_TYPE_BACKUP, @@ -307,6 +322,7 @@ static const BlockJobDriver backup_job_driver = { .commit = backup_commit, .abort = backup_abort, .attached_aio_context = backup_attached_aio_context, + .drain = backup_drain, }; static BlockErrorAction backup_error_action(BackupBlockJob *job, @@ -331,6 +347,7 @@ static void backup_complete(BlockJob *job, void *opaque) BackupCompleteData *data = opaque; blk_unref(s->target); + s->target = NULL; block_job_completed(job, data->ret); g_free(data); diff --git a/block/block-backend.c b/block/block-backend.c index c53ca30..27a7f6f 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -799,20 +799,25 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, BdrvRequestFlags flags) { int ret; + BlockDriverState *bs = blk_bs(blk); - trace_blk_co_preadv(blk, blk_bs(blk), offset, bytes, flags); + trace_blk_co_preadv(blk, bs, offset, bytes, flags); ret = blk_check_byte_request(blk, offset, bytes); if (ret < 0) { return ret; } + bdrv_inc_in_flight(bs); + /* throttling disk I/O */ if (blk->public.throttle_state) { throttle_group_co_io_limits_intercept(blk, bytes, false); } - return bdrv_co_preadv(blk->root, offset, bytes, qiov, flags); + ret = bdrv_co_preadv(blk->root, offset, bytes, qiov, flags); + bdrv_dec_in_flight(bs); + return ret; } int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, @@ -820,14 +825,17 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, BdrvRequestFlags flags) { int ret; + BlockDriverState *bs = blk_bs(blk); - trace_blk_co_pwritev(blk, blk_bs(blk), offset, bytes, flags); + trace_blk_co_pwritev(blk, bs, offset, bytes, flags); ret = blk_check_byte_request(blk, offset, bytes); if (ret < 0) { return ret; } + 
bdrv_inc_in_flight(bs); + /* throttling disk I/O */ if (blk->public.throttle_state) { throttle_group_co_io_limits_intercept(blk, bytes, true); @@ -837,7 +845,9 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, flags |= BDRV_REQ_FUA; } - return bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags); + ret = bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags); + bdrv_dec_in_flight(bs); + return ret; } typedef struct BlkRwCo { @@ -868,7 +878,6 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf, int64_t bytes, CoroutineEntry co_entry, BdrvRequestFlags flags) { - AioContext *aio_context; QEMUIOVector qiov; struct iovec iov; Coroutine *co; @@ -890,11 +899,7 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf, co = qemu_coroutine_create(co_entry, &rwco); qemu_coroutine_enter(co); - - aio_context = blk_get_aio_context(blk); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } + BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE); return rwco.ret; } @@ -930,6 +935,8 @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags) static void error_callback_bh(void *opaque) { struct BlockBackendAIOCB *acb = opaque; + + bdrv_dec_in_flight(acb->common.bs); acb->common.cb(acb->common.opaque, acb->ret); qemu_aio_unref(acb); } @@ -940,6 +947,7 @@ BlockAIOCB *blk_abort_aio_request(BlockBackend *blk, { struct BlockBackendAIOCB *acb; + bdrv_inc_in_flight(blk_bs(blk)); acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque); acb->blk = blk; acb->ret = ret; @@ -962,6 +970,7 @@ static const AIOCBInfo blk_aio_em_aiocb_info = { static void blk_aio_complete(BlkAioEmAIOCB *acb) { if (acb->has_returned) { + bdrv_dec_in_flight(acb->common.bs); acb->common.cb(acb->common.opaque, acb->rwco.ret); qemu_aio_unref(acb); } @@ -983,6 +992,7 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes, BlkAioEmAIOCB *acb; Coroutine *co; + bdrv_inc_in_flight(blk_bs(blk)); acb = 
blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); acb->rwco = (BlkRwCo) { .blk = blk, diff --git a/block/commit.c b/block/commit.c index 9f67a8b..499ecca 100644 --- a/block/commit.c +++ b/block/commit.c @@ -251,7 +251,7 @@ void commit_start(const char *job_id, BlockDriverState *bs, orig_overlay_flags | BDRV_O_RDWR); } if (reopen_queue) { - bdrv_reopen_multiple(reopen_queue, &local_err); + bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err); if (local_err != NULL) { error_propagate(errp, local_err); block_job_unref(&s->common); diff --git a/block/io.c b/block/io.c index 79cbbdf..be0d862 100644 --- a/block/io.c +++ b/block/io.c @@ -143,7 +143,7 @@ bool bdrv_requests_pending(BlockDriverState *bs) { BdrvChild *child; - if (!QLIST_EMPTY(&bs->tracked_requests)) { + if (atomic_read(&bs->in_flight)) { return true; } @@ -156,16 +156,22 @@ bool bdrv_requests_pending(BlockDriverState *bs) return false; } -static void bdrv_drain_recurse(BlockDriverState *bs) +static bool bdrv_drain_recurse(BlockDriverState *bs) { BdrvChild *child; + bool waited; + + waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0); if (bs->drv && bs->drv->bdrv_drain) { bs->drv->bdrv_drain(bs); } + QLIST_FOREACH(child, &bs->children, next) { - bdrv_drain_recurse(child->bs); + waited |= bdrv_drain_recurse(child->bs); } + + return waited; } typedef struct { @@ -174,23 +180,14 @@ typedef struct { bool done; } BdrvCoDrainData; -static void bdrv_drain_poll(BlockDriverState *bs) -{ - bool busy = true; - - while (busy) { - /* Keep iterating */ - busy = bdrv_requests_pending(bs); - busy |= aio_poll(bdrv_get_aio_context(bs), busy); - } -} - static void bdrv_co_drain_bh_cb(void *opaque) { BdrvCoDrainData *data = opaque; Coroutine *co = data->co; + BlockDriverState *bs = data->bs; - bdrv_drain_poll(data->bs); + bdrv_dec_in_flight(bs); + bdrv_drained_begin(bs); data->done = true; qemu_coroutine_enter(co); } @@ -209,6 +206,7 @@ static void coroutine_fn 
bdrv_co_yield_to_drain(BlockDriverState *bs) .bs = bs, .done = false, }; + bdrv_inc_in_flight(bs); aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), bdrv_co_drain_bh_cb, &data); @@ -220,6 +218,11 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs) void bdrv_drained_begin(BlockDriverState *bs) { + if (qemu_in_coroutine()) { + bdrv_co_yield_to_drain(bs); + return; + } + if (!bs->quiesce_counter++) { aio_disable_external(bdrv_get_aio_context(bs)); bdrv_parent_drained_begin(bs); @@ -227,11 +230,6 @@ void bdrv_drained_begin(BlockDriverState *bs) bdrv_io_unplugged_begin(bs); bdrv_drain_recurse(bs); - if (qemu_in_coroutine()) { - bdrv_co_yield_to_drain(bs); - } else { - bdrv_drain_poll(bs); - } bdrv_io_unplugged_end(bs); } @@ -279,7 +277,7 @@ void bdrv_drain(BlockDriverState *bs) void bdrv_drain_all(void) { /* Always run first iteration so any pending completion BHs run */ - bool busy = true; + bool waited = true; BlockDriverState *bs; BdrvNextIterator it; BlockJob *job = NULL; @@ -299,7 +297,6 @@ void bdrv_drain_all(void) aio_context_acquire(aio_context); bdrv_parent_drained_begin(bs); bdrv_io_unplugged_begin(bs); - bdrv_drain_recurse(bs); aio_context_release(aio_context); if (!g_slist_find(aio_ctxs, aio_context)) { @@ -313,8 +310,8 @@ void bdrv_drain_all(void) * request completion. Therefore we must keep looping until there was no * more activity rather than simply draining each device independently. 
*/ - while (busy) { - busy = false; + while (waited) { + waited = false; for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { AioContext *aio_context = ctx->data; @@ -322,13 +319,9 @@ void bdrv_drain_all(void) aio_context_acquire(aio_context); for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { if (aio_context == bdrv_get_aio_context(bs)) { - if (bdrv_requests_pending(bs)) { - busy = true; - aio_poll(aio_context, busy); - } + waited |= bdrv_drain_recurse(bs); } } - busy |= aio_poll(aio_context, false); aio_context_release(aio_context); } } @@ -476,6 +469,28 @@ static bool tracked_request_overlaps(BdrvTrackedRequest *req, return true; } +void bdrv_inc_in_flight(BlockDriverState *bs) +{ + atomic_inc(&bs->in_flight); +} + +static void dummy_bh_cb(void *opaque) +{ +} + +void bdrv_wakeup(BlockDriverState *bs) +{ + if (bs->wakeup) { + aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL); + } +} + +void bdrv_dec_in_flight(BlockDriverState *bs) +{ + atomic_dec(&bs->in_flight); + bdrv_wakeup(bs); +} + static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) { BlockDriverState *bs = self->bs; @@ -583,13 +598,9 @@ static int bdrv_prwv_co(BdrvChild *child, int64_t offset, /* Fast-path if already in coroutine context */ bdrv_rw_co_entry(&rwco); } else { - AioContext *aio_context = bdrv_get_aio_context(child->bs); - co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco); qemu_coroutine_enter(co); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } + BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE); } return rwco.ret; } @@ -1097,6 +1108,8 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child, return ret; } + bdrv_inc_in_flight(bs); + /* Don't do copy-on-read if we read data before write operation */ if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) { flags |= BDRV_REQ_COPY_ON_READ; @@ -1132,6 +1145,7 @@ int coroutine_fn bdrv_co_preadv(BdrvChild *child, use_local_qiov ? 
&local_qiov : qiov, flags); tracked_request_end(&req); + bdrv_dec_in_flight(bs); if (use_local_qiov) { qemu_iovec_destroy(&local_qiov); @@ -1480,6 +1494,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child, return ret; } + bdrv_inc_in_flight(bs); /* * Align write if necessary by performing a read-modify-write cycle. * Pad qiov with the read parts and be sure to have a tracked request not @@ -1581,6 +1596,7 @@ fail: qemu_vfree(tail_buf); out: tracked_request_end(&req); + bdrv_dec_in_flight(bs); return ret; } @@ -1705,17 +1721,19 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, } *file = NULL; + bdrv_inc_in_flight(bs); ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum, file); if (ret < 0) { *pnum = 0; - return ret; + goto out; } if (ret & BDRV_BLOCK_RAW) { assert(ret & BDRV_BLOCK_OFFSET_VALID); - return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, - *pnum, pnum, file); + ret = bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, + *pnum, pnum, file); + goto out; } if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { @@ -1757,6 +1775,8 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, } } +out: + bdrv_dec_in_flight(bs); return ret; } @@ -1822,14 +1842,10 @@ int64_t bdrv_get_block_status_above(BlockDriverState *bs, /* Fast-path if already in coroutine context */ bdrv_get_block_status_above_co_entry(&data); } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry, &data); qemu_coroutine_enter(co); - while (!data.done) { - aio_poll(aio_context, true); - } + BDRV_POLL_WHILE(bs, !data.done); } return data.ret; } @@ -2102,6 +2118,7 @@ static const AIOCBInfo bdrv_em_co_aiocb_info = { static void bdrv_co_complete(BlockAIOCBCoroutine *acb) { if (!acb->need_bh) { + bdrv_dec_in_flight(acb->common.bs); acb->common.cb(acb->common.opaque, acb->req.error); qemu_aio_unref(acb); } @@ -2152,6 +2169,9 @@ static 
BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child, Coroutine *co; BlockAIOCBCoroutine *acb; + /* Matched by bdrv_co_complete's bdrv_dec_in_flight. */ + bdrv_inc_in_flight(child->bs); + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, child->bs, cb, opaque); acb->child = child; acb->need_bh = true; @@ -2185,6 +2205,9 @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, Coroutine *co; BlockAIOCBCoroutine *acb; + /* Matched by bdrv_co_complete's bdrv_dec_in_flight. */ + bdrv_inc_in_flight(bs); + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); acb->need_bh = true; acb->req.error = -EINPROGRESS; @@ -2244,23 +2267,22 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque) int coroutine_fn bdrv_co_flush(BlockDriverState *bs) { int ret; - BdrvTrackedRequest req; if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || bdrv_is_sg(bs)) { return 0; } - tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH); + bdrv_inc_in_flight(bs); int current_gen = bs->write_gen; /* Wait until any previous flushes are completed */ - while (bs->active_flush_req != NULL) { + while (bs->active_flush_req) { qemu_co_queue_wait(&bs->flush_queue); } - bs->active_flush_req = &req; + bs->active_flush_req = true; /* Write back all layers by calling one driver function */ if (bs->drv->bdrv_co_flush) { @@ -2330,11 +2352,11 @@ flush_parent: out: /* Notify any pending flushes that we have completed */ bs->flushed_gen = current_gen; - bs->active_flush_req = NULL; + bs->active_flush_req = false; /* Return value is ignored - it's ok if wait queue is empty */ qemu_co_queue_next(&bs->flush_queue); - tracked_request_end(&req); + bdrv_dec_in_flight(bs); return ret; } @@ -2350,13 +2372,9 @@ int bdrv_flush(BlockDriverState *bs) /* Fast-path if already in coroutine context */ bdrv_flush_co_entry(&flush_co); } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co); qemu_coroutine_enter(co); - while (flush_co.ret == NOT_DONE) { - 
aio_poll(aio_context, true); - } + BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE); } return flush_co.ret; @@ -2417,6 +2435,7 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, return 0; } + bdrv_inc_in_flight(bs); tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD); ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); @@ -2463,6 +2482,7 @@ out: bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS, req.bytes >> BDRV_SECTOR_BITS); tracked_request_end(&req); + bdrv_dec_in_flight(bs); return ret; } @@ -2480,13 +2500,9 @@ int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count) /* Fast-path if already in coroutine context */ bdrv_pdiscard_co_entry(&rwco); } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco); qemu_coroutine_enter(co); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } + BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE); } return rwco.ret; @@ -2495,13 +2511,12 @@ int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count) int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) { BlockDriver *drv = bs->drv; - BdrvTrackedRequest tracked_req; CoroutineIOCompletion co = { .coroutine = qemu_coroutine_self(), }; BlockAIOCB *acb; - tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL); + bdrv_inc_in_flight(bs); if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) { co.ret = -ENOTSUP; goto out; @@ -2518,7 +2533,7 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) qemu_coroutine_yield(); } out: - tracked_request_end(&tracked_req); + bdrv_dec_in_flight(bs); return co.ret; } diff --git a/block/mirror.c b/block/mirror.c index a433e68..3a0788e 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -469,7 +469,11 @@ static void mirror_free_init(MirrorBlockJob *s) } } -static void mirror_drain(MirrorBlockJob *s) +/* This is also used for the .pause callback. 
There is no matching + * mirror_resume() because mirror_run() will begin iterating again + * when the job is resumed. + */ +static void mirror_wait_for_all_io(MirrorBlockJob *s) { while (s->in_flight > 0) { mirror_wait_for_io(s); @@ -528,6 +532,7 @@ static void mirror_exit(BlockJob *job, void *opaque) g_free(s->replaces); bdrv_op_unblock_all(target_bs, s->common.blocker); blk_unref(s->target); + s->target = NULL; block_job_completed(&s->common, data->ret); g_free(data); bdrv_drained_end(src); @@ -582,7 +587,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) sector_num += nb_sectors; } - mirror_drain(s); + mirror_wait_for_all_io(s); } /* First part, loop on the sectors and initialize the dirty bitmap. */ @@ -617,6 +622,7 @@ static void coroutine_fn mirror_run(void *opaque) MirrorExitData *data; BlockDriverState *bs = blk_bs(s->common.blk); BlockDriverState *target_bs = blk_bs(s->target); + bool need_drain = true; int64_t length; BlockDriverInfo bdi; char backing_filename[2]; /* we only need 2 characters because we are only @@ -752,11 +758,26 @@ static void coroutine_fn mirror_run(void *opaque) * source has dirty data to copy! * * Note that I/O can be submitted by the guest while - * mirror_populate runs. + * mirror_populate runs, so pause it now. Before deciding + * whether to switch to target check one last time if I/O has + * come in the meanwhile, and if not flush the data to disk. */ trace_mirror_before_drain(s, cnt); - bdrv_co_drain(bs); + + bdrv_drained_begin(bs); cnt = bdrv_get_dirty_count(s->dirty_bitmap); + if (cnt > 0) { + bdrv_drained_end(bs); + continue; + } + + /* The two disks are in sync. Exit and report successful + * completion. + */ + assert(QLIST_EMPTY(&bs->tracked_requests)); + s->common.cancelled = false; + need_drain = false; + break; } ret = 0; @@ -769,13 +790,6 @@ static void coroutine_fn mirror_run(void *opaque) } else if (!should_complete) { delay_ns = (s->in_flight == 0 && cnt == 0 ? 
SLICE_TIME : 0); block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); - } else if (cnt == 0) { - /* The two disks are in sync. Exit and report successful - * completion. - */ - assert(QLIST_EMPTY(&bs->tracked_requests)); - s->common.cancelled = false; - break; } s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); } @@ -787,7 +801,8 @@ immediate_exit: * the target is a copy of the source. */ assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common))); - mirror_drain(s); + assert(need_drain); + mirror_wait_for_all_io(s); } assert(s->in_flight == 0); @@ -799,9 +814,10 @@ immediate_exit: data = g_malloc(sizeof(*data)); data->ret = ret; - /* Before we switch to target in mirror_exit, make sure data doesn't - * change. */ - bdrv_drained_begin(bs); + + if (need_drain) { + bdrv_drained_begin(bs); + } block_job_defer_to_main_loop(&s->common, mirror_exit, data); } @@ -872,14 +888,11 @@ static void mirror_complete(BlockJob *job, Error **errp) block_job_enter(&s->common); } -/* There is no matching mirror_resume() because mirror_run() will begin - * iterating again when the job is resumed. - */ -static void coroutine_fn mirror_pause(BlockJob *job) +static void mirror_pause(BlockJob *job) { MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); - mirror_drain(s); + mirror_wait_for_all_io(s); } static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context) @@ -889,6 +902,21 @@ static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context) blk_set_aio_context(s->target, new_context); } +static void mirror_drain(BlockJob *job) +{ + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + + /* Need to keep a reference in case blk_drain triggers execution + * of mirror_complete... 
+ */ + if (s->target) { + BlockBackend *target = s->target; + blk_ref(target); + blk_drain(target); + blk_unref(target); + } +} + static const BlockJobDriver mirror_job_driver = { .instance_size = sizeof(MirrorBlockJob), .job_type = BLOCK_JOB_TYPE_MIRROR, @@ -896,6 +924,7 @@ static const BlockJobDriver mirror_job_driver = { .complete = mirror_complete, .pause = mirror_pause, .attached_aio_context = mirror_attached_aio_context, + .drain = mirror_drain, }; static const BlockJobDriver commit_active_job_driver = { @@ -905,6 +934,7 @@ static const BlockJobDriver commit_active_job_driver = { .complete = mirror_complete, .pause = mirror_pause, .attached_aio_context = mirror_attached_aio_context, + .drain = mirror_drain, }; static void mirror_start_job(const char *job_id, BlockDriverState *bs, diff --git a/block/nfs.c b/block/nfs.c index c3db2ec..88c60a9 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -52,6 +52,7 @@ typedef struct NFSClient { } NFSClient; typedef struct NFSRPC { + BlockDriverState *bs; int ret; int complete; QEMUIOVector *iov; @@ -90,11 +91,12 @@ static void nfs_process_write(void *arg) nfs_set_events(client); } -static void nfs_co_init_task(NFSClient *client, NFSRPC *task) +static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task) { *task = (NFSRPC) { .co = qemu_coroutine_self(), - .client = client, + .bs = bs, + .client = bs->opaque, }; } @@ -111,6 +113,7 @@ nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data, { NFSRPC *task = private_data; task->ret = ret; + assert(!task->st); if (task->ret > 0 && task->iov) { if (task->ret <= task->iov->size) { qemu_iovec_from_buf(task->iov, 0, data, task->ret); @@ -118,18 +121,11 @@ nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data, task->ret = -EIO; } } - if (task->ret == 0 && task->st) { - memcpy(task->st, data, sizeof(struct stat)); - } if (task->ret < 0) { error_report("NFS Error: %s", nfs_get_error(nfs)); } - if (task->co) { - aio_bh_schedule_oneshot(task->client->aio_context, - 
nfs_co_generic_bh_cb, task); - } else { - task->complete = 1; - } + aio_bh_schedule_oneshot(task->client->aio_context, + nfs_co_generic_bh_cb, task); } static int coroutine_fn nfs_co_readv(BlockDriverState *bs, @@ -139,7 +135,7 @@ static int coroutine_fn nfs_co_readv(BlockDriverState *bs, NFSClient *client = bs->opaque; NFSRPC task; - nfs_co_init_task(client, &task); + nfs_co_init_task(bs, &task); task.iov = iov; if (nfs_pread_async(client->context, client->fh, @@ -149,8 +145,8 @@ static int coroutine_fn nfs_co_readv(BlockDriverState *bs, return -ENOMEM; } + nfs_set_events(client); while (!task.complete) { - nfs_set_events(client); qemu_coroutine_yield(); } @@ -174,7 +170,7 @@ static int coroutine_fn nfs_co_writev(BlockDriverState *bs, NFSRPC task; char *buf = NULL; - nfs_co_init_task(client, &task); + nfs_co_init_task(bs, &task); buf = g_try_malloc(nb_sectors * BDRV_SECTOR_SIZE); if (nb_sectors && buf == NULL) { @@ -191,8 +187,8 @@ static int coroutine_fn nfs_co_writev(BlockDriverState *bs, return -ENOMEM; } + nfs_set_events(client); while (!task.complete) { - nfs_set_events(client); qemu_coroutine_yield(); } @@ -210,15 +206,15 @@ static int coroutine_fn nfs_co_flush(BlockDriverState *bs) NFSClient *client = bs->opaque; NFSRPC task; - nfs_co_init_task(client, &task); + nfs_co_init_task(bs, &task); if (nfs_fsync_async(client->context, client->fh, nfs_co_generic_cb, &task) != 0) { return -ENOMEM; } + nfs_set_events(client); while (!task.complete) { - nfs_set_events(client); qemu_coroutine_yield(); } @@ -496,6 +492,22 @@ static int nfs_has_zero_init(BlockDriverState *bs) return client->has_zero_init; } +static void +nfs_get_allocated_file_size_cb(int ret, struct nfs_context *nfs, void *data, + void *private_data) +{ + NFSRPC *task = private_data; + task->ret = ret; + if (task->ret == 0) { + memcpy(task->st, data, sizeof(struct stat)); + } + if (task->ret < 0) { + error_report("NFS Error: %s", nfs_get_error(nfs)); + } + task->complete = 1; + bdrv_wakeup(task->bs); +} 
+ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs) { NFSClient *client = bs->opaque; @@ -507,16 +519,15 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs) return client->st_blocks * 512; } + task.bs = bs; task.st = &st; - if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb, + if (nfs_fstat_async(client->context, client->fh, nfs_get_allocated_file_size_cb, &task) != 0) { return -ENOMEM; } - while (!task.complete) { - nfs_set_events(client); - aio_poll(client->aio_context, true); - } + nfs_set_events(client); + BDRV_POLL_WHILE(bs, !task.complete); return (task.ret < 0 ? task.ret : st.st_blocks * 512); } diff --git a/block/qed-table.c b/block/qed-table.c index 1a731df..ed443e2 100644 --- a/block/qed-table.c +++ b/block/qed-table.c @@ -174,9 +174,7 @@ int qed_read_l1_table_sync(BDRVQEDState *s) qed_read_table(s, s->header.l1_table_offset, s->l1_table, qed_sync_cb, &ret); - while (ret == -EINPROGRESS) { - aio_poll(bdrv_get_aio_context(s->bs), true); - } + BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS); return ret; } @@ -195,9 +193,7 @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, int ret = -EINPROGRESS; qed_write_l1_table(s, index, n, qed_sync_cb, &ret); - while (ret == -EINPROGRESS) { - aio_poll(bdrv_get_aio_context(s->bs), true); - } + BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS); return ret; } @@ -268,9 +264,7 @@ int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset int ret = -EINPROGRESS; qed_read_l2_table(s, request, offset, qed_sync_cb, &ret); - while (ret == -EINPROGRESS) { - aio_poll(bdrv_get_aio_context(s->bs), true); - } + BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS); return ret; } @@ -290,9 +284,7 @@ int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, int ret = -EINPROGRESS; qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret); - while (ret == -EINPROGRESS) { - aio_poll(bdrv_get_aio_context(s->bs), true); - } + BDRV_POLL_WHILE(s->bs, ret == 
-EINPROGRESS); return ret; } diff --git a/block/qed.c b/block/qed.c index 3ee879b..1a7ef0a 100644 --- a/block/qed.c +++ b/block/qed.c @@ -336,7 +336,7 @@ static void qed_need_check_timer_cb(void *opaque) qed_plug_allocating_write_reqs(s); /* Ensure writes are on disk before clearing flag */ - bdrv_aio_flush(s->bs, qed_clear_need_check, s); + bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s); } static void qed_start_need_check_timer(BDRVQEDState *s) @@ -378,6 +378,19 @@ static void bdrv_qed_attach_aio_context(BlockDriverState *bs, } } +static void bdrv_qed_drain(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + /* Fire the timer immediately in order to start doing I/O as soon as the + * header is flushed. + */ + if (s->need_check_timer && timer_pending(s->need_check_timer)) { + qed_cancel_need_check_timer(s); + qed_need_check_timer_cb(s); + } +} + static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -1668,6 +1681,7 @@ static BlockDriver bdrv_qed = { .bdrv_check = bdrv_qed_check, .bdrv_detach_aio_context = bdrv_qed_detach_aio_context, .bdrv_attach_aio_context = bdrv_qed_attach_aio_context, + .bdrv_drain = bdrv_qed_drain, }; static void bdrv_qed_init(void) diff --git a/block/replication.c b/block/replication.c index 8bbfc8f..02aeaaf 100644 --- a/block/replication.c +++ b/block/replication.c @@ -138,6 +138,9 @@ static void replication_close(BlockDriverState *bs) if (s->replication_state == BLOCK_REPLICATION_RUNNING) { replication_stop(s->rs, false, NULL); } + if (s->replication_state == BLOCK_REPLICATION_FAILOVER) { + block_job_cancel_sync(s->active_disk->bs->job); + } if (s->mode == REPLICATION_MODE_SECONDARY) { g_free(s->top_id); @@ -319,9 +322,10 @@ static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp) } } -static void reopen_backing_file(BDRVReplicationState *s, bool writable, +static void reopen_backing_file(BlockDriverState *bs, bool writable, Error **errp) { + BDRVReplicationState *s 
= bs->opaque; BlockReopenQueue *reopen_queue = NULL; int orig_hidden_flags, orig_secondary_flags; int new_hidden_flags, new_secondary_flags; @@ -356,13 +360,15 @@ static void reopen_backing_file(BDRVReplicationState *s, bool writable, } if (reopen_queue) { - bdrv_reopen_multiple(reopen_queue, &local_err); + bdrv_reopen_multiple(bdrv_get_aio_context(bs), + reopen_queue, &local_err); error_propagate(errp, local_err); } } -static void backup_job_cleanup(BDRVReplicationState *s) +static void backup_job_cleanup(BlockDriverState *bs) { + BDRVReplicationState *s = bs->opaque; BlockDriverState *top_bs; top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL); @@ -371,19 +377,20 @@ static void backup_job_cleanup(BDRVReplicationState *s) } bdrv_op_unblock_all(top_bs, s->blocker); error_free(s->blocker); - reopen_backing_file(s, false, NULL); + reopen_backing_file(bs, false, NULL); } static void backup_job_completed(void *opaque, int ret) { - BDRVReplicationState *s = opaque; + BlockDriverState *bs = opaque; + BDRVReplicationState *s = bs->opaque; if (s->replication_state != BLOCK_REPLICATION_FAILOVER) { /* The backup job is cancelled unexpectedly */ s->error = -EIO; } - backup_job_cleanup(s); + backup_job_cleanup(bs); } static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs) @@ -479,7 +486,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, } /* reopen the backing file in r/w mode */ - reopen_backing_file(s, true, &local_err); + reopen_backing_file(bs, true, &local_err); if (local_err) { error_propagate(errp, local_err); aio_context_release(aio_context); @@ -494,7 +501,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, if (!top_bs || !bdrv_is_root_node(top_bs) || !check_top_bs(top_bs, bs)) { error_setg(errp, "No top_bs or it is invalid"); - reopen_backing_file(s, false, NULL); + reopen_backing_file(bs, false, NULL); aio_context_release(aio_context); return; } @@ -504,10 +511,10 @@ static void 
replication_start(ReplicationState *rs, ReplicationMode mode, backup_start("replication-backup", s->secondary_disk->bs, s->hidden_disk->bs, 0, MIRROR_SYNC_MODE_NONE, NULL, false, BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT, - backup_job_completed, s, NULL, &local_err); + backup_job_completed, bs, NULL, &local_err); if (local_err) { error_propagate(errp, local_err); - backup_job_cleanup(s); + backup_job_cleanup(bs); aio_context_release(aio_context); return; } diff --git a/block/sheepdog.c b/block/sheepdog.c index ccbf7e1..1fb9173 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -641,6 +641,7 @@ static void restart_co_req(void *opaque) typedef struct SheepdogReqCo { int sockfd; + BlockDriverState *bs; AioContext *aio_context; SheepdogReq *hdr; void *data; @@ -701,6 +702,9 @@ out: srco->ret = ret; srco->finished = true; + if (srco->bs) { + bdrv_wakeup(srco->bs); + } } /* @@ -708,13 +712,14 @@ out: * * Return 0 on success, -errno in case of error. */ -static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr, +static int do_req(int sockfd, BlockDriverState *bs, SheepdogReq *hdr, void *data, unsigned int *wlen, unsigned int *rlen) { Coroutine *co; SheepdogReqCo srco = { .sockfd = sockfd, - .aio_context = aio_context, + .aio_context = bs ? 
bdrv_get_aio_context(bs) : qemu_get_aio_context(), + .bs = bs, .hdr = hdr, .data = data, .wlen = wlen, @@ -727,9 +732,14 @@ static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr, do_co_req(&srco); } else { co = qemu_coroutine_create(do_co_req, &srco); - qemu_coroutine_enter(co); - while (!srco.finished) { - aio_poll(aio_context, true); + if (bs) { + qemu_coroutine_enter(co); + BDRV_POLL_WHILE(bs, !srco.finished); + } else { + qemu_coroutine_enter(co); + while (!srco.finished) { + aio_poll(qemu_get_aio_context(), true); + } } } @@ -1125,7 +1135,7 @@ static int find_vdi_name(BDRVSheepdogState *s, const char *filename, hdr.snapid = snapid; hdr.flags = SD_FLAG_CMD_WRITE; - ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); + ret = do_req(fd, s->bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen); if (ret) { error_setg_errno(errp, -ret, "cannot get vdi info"); goto out; @@ -1240,7 +1250,7 @@ out: qemu_co_mutex_unlock(&s->lock); } -static int read_write_object(int fd, AioContext *aio_context, char *buf, +static int read_write_object(int fd, BlockDriverState *bs, char *buf, uint64_t oid, uint8_t copies, unsigned int datalen, uint64_t offset, bool write, bool create, uint32_t cache_flags) @@ -1274,7 +1284,7 @@ static int read_write_object(int fd, AioContext *aio_context, char *buf, hdr.offset = offset; hdr.copies = copies; - ret = do_req(fd, aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); + ret = do_req(fd, bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen); if (ret) { error_report("failed to send a request to the sheep"); return ret; @@ -1289,22 +1299,22 @@ static int read_write_object(int fd, AioContext *aio_context, char *buf, } } -static int read_object(int fd, AioContext *aio_context, char *buf, +static int read_object(int fd, BlockDriverState *bs, char *buf, uint64_t oid, uint8_t copies, unsigned int datalen, uint64_t offset, uint32_t cache_flags) { - return read_write_object(fd, aio_context, buf, oid, copies, + return 
read_write_object(fd, bs, buf, oid, copies, datalen, offset, false, false, cache_flags); } -static int write_object(int fd, AioContext *aio_context, char *buf, +static int write_object(int fd, BlockDriverState *bs, char *buf, uint64_t oid, uint8_t copies, unsigned int datalen, uint64_t offset, bool create, uint32_t cache_flags) { - return read_write_object(fd, aio_context, buf, oid, copies, + return read_write_object(fd, bs, buf, oid, copies, datalen, offset, true, create, cache_flags); } @@ -1331,7 +1341,7 @@ static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag) goto out; } - ret = read_object(fd, s->aio_context, (char *)inode, vid_to_vdi_oid(vid), + ret = read_object(fd, s->bs, (char *)inode, vid_to_vdi_oid(vid), s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0, s->cache_flags); if (ret < 0) { @@ -1489,7 +1499,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags, } buf = g_malloc(SD_INODE_SIZE); - ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid), + ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0, s->cache_flags); closesocket(fd); @@ -1618,7 +1628,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot, hdr.copies = s->inode.nr_copies; hdr.block_size_shift = s->inode.block_size_shift; - ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); + ret = do_req(fd, NULL, (SheepdogReq *)&hdr, buf, &wlen, &rlen); closesocket(fd); @@ -1886,7 +1896,7 @@ static int sd_create(const char *filename, QemuOpts *opts, hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT; hdr.proto_ver = SD_PROTO_VER; - ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, + ret = do_req(fd, NULL, (SheepdogReq *)&hdr, NULL, &wlen, &rlen); closesocket(fd); if (ret) { @@ -1951,7 +1961,7 @@ static void sd_close(BlockDriverState *bs) hdr.data_length = wlen; hdr.flags = SD_FLAG_CMD_WRITE; - ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, + ret = do_req(fd, s->bs, (SheepdogReq *)&hdr, 
s->name, &wlen, &rlen); closesocket(fd); @@ -2000,7 +2010,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset) /* we don't need to update entire object */ datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id); s->inode.vdi_size = offset; - ret = write_object(fd, s->aio_context, (char *)&s->inode, + ret = write_object(fd, s->bs, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies, datalen, 0, false, s->cache_flags); close(fd); @@ -2070,7 +2080,7 @@ static bool sd_delete(BDRVSheepdogState *s) return false; } - ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, + ret = do_req(fd, s->bs, (SheepdogReq *)&hdr, s->name, &wlen, &rlen); closesocket(fd); if (ret) { @@ -2126,7 +2136,7 @@ static int sd_create_branch(BDRVSheepdogState *s) goto out; } - ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid), + ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid), s->inode.nr_copies, SD_INODE_SIZE, 0, s->cache_flags); closesocket(fd); @@ -2411,7 +2421,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) goto cleanup; } - ret = write_object(fd, s->aio_context, (char *)&s->inode, + ret = write_object(fd, s->bs, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies, datalen, 0, false, s->cache_flags); if (ret < 0) { @@ -2426,7 +2436,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) goto cleanup; } - ret = read_object(fd, s->aio_context, (char *)inode, + ret = read_object(fd, s->bs, (char *)inode, vid_to_vdi_oid(new_vid), s->inode.nr_copies, datalen, 0, s->cache_flags); @@ -2528,7 +2538,7 @@ static bool remove_objects(BDRVSheepdogState *s) i++; } - ret = write_object(fd, s->aio_context, + ret = write_object(fd, s->bs, (char *)&inode->data_vdi_id[start_idx], vid_to_vdi_oid(s->inode.vdi_id), inode->nr_copies, (i - start_idx) * sizeof(uint32_t), @@ -2600,7 +2610,7 @@ static int sd_snapshot_delete(BlockDriverState *bs, return -1; } - ret = do_req(fd, 
s->aio_context, (SheepdogReq *)&hdr, + ret = do_req(fd, s->bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen); closesocket(fd); if (ret) { @@ -2652,8 +2662,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) req.opcode = SD_OP_READ_VDIS; req.data_length = max; - ret = do_req(fd, s->aio_context, &req, - vdi_inuse, &wlen, &rlen); + ret = do_req(fd, s->bs, &req, vdi_inuse, &wlen, &rlen); closesocket(fd); if (ret) { @@ -2679,7 +2688,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) } /* we don't need to read entire object */ - ret = read_object(fd, s->aio_context, (char *)&inode, + ret = read_object(fd, s->bs, (char *)&inode, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0, s->cache_flags); @@ -2745,11 +2754,11 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data, create = (offset == 0); if (load) { - ret = read_object(fd, s->aio_context, (char *)data, vmstate_oid, + ret = read_object(fd, s->bs, (char *)data, vmstate_oid, s->inode.nr_copies, data_len, offset, s->cache_flags); } else { - ret = write_object(fd, s->aio_context, (char *)data, vmstate_oid, + ret = write_object(fd, s->bs, (char *)data, vmstate_oid, s->inode.nr_copies, data_len, offset, create, s->cache_flags); } diff --git a/blockjob.c b/blockjob.c index 43fecbe..7c88b30 100644 --- a/blockjob.c +++ b/blockjob.c @@ -74,17 +74,6 @@ BlockJob *block_job_get(const char *id) return NULL; } -/* Normally the job runs in its BlockBackend's AioContext. The exception is - * block_job_defer_to_main_loop() where it runs in the QEMU main loop. Code - * that supports both cases uses this helper function. - */ -static AioContext *block_job_get_aio_context(BlockJob *job) -{ - return job->deferred_to_main_loop ? 
- qemu_get_aio_context() : - blk_get_aio_context(job->blk); -} - static void block_job_attached_aio_context(AioContext *new_context, void *opaque) { @@ -97,6 +86,17 @@ static void block_job_attached_aio_context(AioContext *new_context, block_job_resume(job); } +static void block_job_drain(BlockJob *job) +{ + /* If job is !job->busy this kicks it into the next pause point. */ + block_job_enter(job); + + blk_drain(job->blk); + if (job->driver->drain) { + job->driver->drain(job); + } +} + static void block_job_detach_aio_context(void *opaque) { BlockJob *job = opaque; @@ -106,12 +106,8 @@ static void block_job_detach_aio_context(void *opaque) block_job_pause(job); - if (!job->paused) { - /* If job is !job->busy this kicks it into the next pause point. */ - block_job_enter(job); - } while (!job->paused && !job->completed) { - aio_poll(block_job_get_aio_context(job), true); + block_job_drain(job); } block_job_unref(job); @@ -413,14 +409,21 @@ static int block_job_finish_sync(BlockJob *job, assert(blk_bs(job->blk)->job == job); block_job_ref(job); + finish(job, &local_err); if (local_err) { error_propagate(errp, local_err); block_job_unref(job); return -EBUSY; } + /* block_job_drain calls block_job_enter, and it should be enough to + * induce progress until the job completes or moves to the main thread. + */ + while (!job->deferred_to_main_loop && !job->completed) { + block_job_drain(job); + } while (!job->completed) { - aio_poll(block_job_get_aio_context(job), true); + aio_poll(qemu_get_aio_context(), true); } ret = (job->cancelled && job->ret == 0) ? 
-ECANCELED : job->ret; block_job_unref(job); diff --git a/configure b/configure index f83cdf8..6b7acb1 100755 --- a/configure +++ b/configure @@ -230,6 +230,7 @@ vhost_net="no" vhost_scsi="no" vhost_vsock="no" kvm="no" +colo="yes" rdma="" gprof="no" debug_tcg="no" @@ -918,6 +919,10 @@ for opt do ;; --enable-kvm) kvm="yes" ;; + --disable-colo) colo="no" + ;; + --enable-colo) colo="yes" + ;; --disable-tcg-interpreter) tcg_interpreter="no" ;; --enable-tcg-interpreter) tcg_interpreter="yes" @@ -1366,6 +1371,7 @@ disabled with --disable-FEATURE, default is enabled if available: fdt fdt device tree bluez bluez stack connectivity kvm KVM acceleration support + colo COarse-grain LOck-stepping VM for Non-stop Service rdma RDMA-based migration support vde support for vde network netmap support for netmap network @@ -5004,6 +5010,7 @@ echo "Linux AIO support $linux_aio" echo "ATTR/XATTR support $attr" echo "Install blobs $blobs" echo "KVM support $kvm" +echo "COLO support $colo" echo "RDMA support $rdma" echo "TCG interpreter $tcg_interpreter" echo "fdt support $fdt" @@ -5639,6 +5646,10 @@ if have_backend "syslog"; then fi echo "CONFIG_TRACE_FILE=$trace_file" >> $config_host_mak +if test "$colo" = "yes"; then + echo "CONFIG_COLO=y" >> $config_host_mak +fi + if test "$rdma" = "yes" ; then echo "CONFIG_RDMA=y" >> $config_host_mak fi diff --git a/cputlb.c b/cputlb.c index cc4da4d..813279f 100644 --- a/cputlb.c +++ b/cputlb.c @@ -26,7 +26,6 @@ #include "exec/cputlb.h" #include "exec/memory-internal.h" #include "exec/ram_addr.h" -#include "exec/exec-all.h" #include "tcg/tcg.h" #include "qemu/error-report.h" #include "exec/log.h" diff --git a/docs/COLO-FT.txt b/docs/COLO-FT.txt new file mode 100644 index 0000000..6282938 --- /dev/null +++ b/docs/COLO-FT.txt @@ -0,0 +1,189 @@ +COarse-grained LOck-stepping Virtual Machines for Non-stop Service +---------------------------------------- +Copyright (c) 2016 Intel Corporation +Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD. 
+Copyright (c) 2016 Fujitsu, Corp. + +This work is licensed under the terms of the GNU GPL, version 2 or later. +See the COPYING file in the top-level directory. + +This document gives an overview of COLO's design and how to use it. + +== Background == +Virtual machine (VM) replication is a well known technique for providing +application-agnostic software-implemented hardware fault tolerance, +also known as "non-stop service". + +COLO (COarse-grained LOck-stepping) is a high availability solution. +Both primary VM (PVM) and secondary VM (SVM) run in parallel. They receive the +same request from client, and generate response in parallel too. +If the response packets from PVM and SVM are identical, they are released +immediately. Otherwise, a VM checkpoint (on demand) is conducted. + +== Architecture == + +The architecture of COLO is shown in the diagram below. +It consists of a pair of networked physical nodes: +The primary node running the PVM, and the secondary node running the SVM +to maintain a valid replica of the PVM. +PVM and SVM execute in parallel and generate output of response packets for +client requests according to the application semantics. + +The incoming packets from the client or external network are received by the +primary node, and then forwarded to the secondary node, so that both the PVM +and the SVM are stimulated with the same requests. + +COLO receives the outbound packets from both the PVM and SVM and compares them +before allowing the output to be sent to clients. + +The SVM is qualified as a valid replica of the PVM, as long as it generates +identical responses to all client requests. Once the differences in the outputs +are detected between the PVM and SVM, COLO withholds transmission of the +outbound packets until it has successfully synchronized the PVM state to the SVM. 
+ + Primary Node Secondary Node + +------------+ +-----------------------+ +------------------------+ +------------+ + | | | HeartBeat |<----->| HeartBeat | | | + | Primary VM | +-----------|-----------+ +-----------|------------+ |Secondary VM| + | | | | | | + | | +-----------|-----------+ +-----------|------------+ | | + | | |QEMU +---v----+ | |QEMU +----v---+ | | | + | | | |Failover| | | |Failover| | | | + | | | +--------+ | | +--------+ | | | + | | | +---------------+ | | +---------------+ | | | + | | | | VM Checkpoint |-------------->| VM Checkpoint | | | | + | | | +---------------+ | | +---------------+ | | | + | | | | | | | | + |Requests<---------------------------^------------------------------------------>Requests| + |Responses----------------------\ /--|--------------\ /------------------------Responses| + | | | | | | | | | | | | | + | | | +-----------+ | | | | | | | +------------+ | | | + | | | | COLO disk | | | | | | | | | COLO disk | | | | + | | | | Manager |-|-|--|--------------|--|->| Manager | | | | + | | | +|----------+ | | | | | | | +-----------|+ | | | + | | | | | | | | | | | | | | | + +------------+ +--|------------|-|--|--+ +---|--|--------------|--+ +------------+ + | | | | | | | + +-------------+ | +----------v-v--|--+ +---|--v-----------+ | +-------------+ + | VM Monitor | | | COLO Proxy | | COLO Proxy | | | VM Monitor | + | | | |(compare packet) | | (adjust sequence)| | | | + +-------------+ | +----------|----^--+ +------------------+ | +-------------+ + | | | | + +------------------|------------|----|--+ +---------------------|------------------+ + | Kernel | | | | | Kernel | | + +------------------|------------|----|--+ +---------------------|------------------+ + | | | | + +--------------v+ +--------v----|--+ +------------------+ +v-------------+ + | Storage | |External Network| | External Network | | Storage | + +---------------+ +----------------+ +------------------+ +--------------+ + +== Components introduction == + +You can see 
there are several components in COLO's diagram of architecture. +Their functions are described below. + +HeartBeat: +Runs on both the primary and secondary nodes, to periodically check platform +availability. When the primary node suffers a hardware fail-stop failure, +the heartbeat stops responding, the secondary node will trigger a failover +as soon as it determines the absence. + +COLO disk Manager: +When primary VM writes data into image, the colo disk manager captures this data +and sends it to secondary VM, which makes sure the content of secondary VM's +image is consistent with the content of primary VM's image. +For more details, please refer to docs/block-replication.txt. + +Checkpoint/Failover Controller: +Modifications of save/restore flow to realize continuous migration, +to make sure the state of VM in Secondary side is always consistent with VM in +Primary side. + +COLO Proxy: +Delivers packets to Primary and Secondary, and then compares the responses from +both sides. Then decides whether to start a checkpoint according to some rules. +Please refer to docs/colo-proxy.txt for more information. + +Note: +HeartBeat has not been implemented yet, so you need to trigger failover process +by using 'x-colo-lost-heartbeat' command. + +== Test procedure == +1. 
Startup qemu +Primary: +# qemu-kvm -enable-kvm -m 2048 -smp 2 -qmp stdio -vnc :7 -name primary \ + -device piix3-usb-uhci \ + -device usb-tablet -netdev tap,id=hn0,vhost=off \ + -device virtio-net-pci,id=net-pci0,netdev=hn0 \ + -drive if=virtio,id=primary-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\ + children.0.file.filename=1.raw,\ + children.0.driver=raw -S +Secondary: +# qemu-kvm -enable-kvm -m 2048 -smp 2 -qmp stdio -vnc :7 -name secondary \ + -device piix3-usb-uhci \ + -device usb-tablet -netdev tap,id=hn0,vhost=off \ + -device virtio-net-pci,id=net-pci0,netdev=hn0 \ + -drive if=none,id=secondary-disk0,file.filename=1.raw,driver=raw,node-name=node0 \ + -drive if=virtio,id=active-disk0,driver=replication,mode=secondary,\ + file.driver=qcow2,top-id=active-disk0,\ + file.file.filename=/mnt/ramfs/active_disk.img,\ + file.backing.driver=qcow2,\ + file.backing.file.filename=/mnt/ramfs/hidden_disk.img,\ + file.backing.backing=secondary-disk0 \ + -incoming tcp:0:8888 + +2. On Secondary VM's QEMU monitor, issue command +{'execute':'qmp_capabilities'} +{ 'execute': 'nbd-server-start', + 'arguments': {'addr': {'type': 'inet', 'data': {'host': 'xx.xx.xx.xx', 'port': '8889'} } } +} +{'execute': 'nbd-server-add', 'arguments': {'device': 'secondary-disk0', 'writable': true } } + +Note: + a. The qmp command nbd-server-start and nbd-server-add must be run + before running the qmp command migrate on primary QEMU + b. Active disk, hidden disk and nbd target's length should be the + same. + c. It is better to put active disk and hidden disk in ramdisk. + +3. 
On Primary VM's QEMU monitor, issue command: +{'execute':'qmp_capabilities'} +{ 'execute': 'human-monitor-command', + 'arguments': {'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=xx.xx.xx.xx,file.port=8889,file.export=secondary-disk0,node-name=nbd_client0'}} +{ 'execute':'x-blockdev-change', 'arguments':{'parent': 'primary-disk0', 'node': 'nbd_client0' } } +{ 'execute': 'migrate-set-capabilities', + 'arguments': {'capabilities': [ {'capability': 'x-colo', 'state': true } ] } } +{ 'execute': 'migrate', 'arguments': {'uri': 'tcp:xx.xx.xx.xx:8888' } } + + Note: + a. There should be only one NBD Client for each primary disk. + b. xx.xx.xx.xx is the secondary physical machine's hostname or IP + c. The qmp command line must be run after running qmp command line in + secondary qemu. + +4. After the above steps, you will see, whenever you make changes to PVM, SVM will be synced. +You can issue command '{ "execute": "migrate-set-parameters" , "arguments":{ "x-checkpoint-delay": 2000 } }' +to change the checkpoint period time + +5. Failover test +You can kill Primary VM and run 'x_colo_lost_heartbeat' in Secondary VM's +monitor at the same time, then SVM will failover and client will not detect this +change. + +Before issuing '{ "execute": "x-colo-lost-heartbeat" }' command, we have to +issue block related command to stop block replication. +Primary: + Remove the nbd child from the quorum: + { 'execute': 'x-blockdev-change', 'arguments': {'parent': 'colo-disk0', 'child': 'children.1'}} + { 'execute': 'human-monitor-command','arguments': {'command-line': 'drive_del blk-buddy0'}} + Note: there is no qmp command to remove the blockdev now + +Secondary: + The primary host is down, so we should do the following thing: + { 'execute': 'nbd-server-stop' } + +== TODO == +1. Support continuous VM replication. +2. Support shared storage. +3. Develop the heartbeat part. +4. Reduce checkpoint VM's downtime while doing checkpoint. 
diff --git a/docs/multiple-iothreads.txt b/docs/multiple-iothreads.txt index 40b8419..0e7cdb2 100644 --- a/docs/multiple-iothreads.txt +++ b/docs/multiple-iothreads.txt @@ -105,13 +105,10 @@ a BH in the target AioContext beforehand and then call qemu_bh_schedule(). No acquire/release or locking is needed for the qemu_bh_schedule() call. But be sure to acquire the AioContext for aio_bh_new() if necessary. -The relationship between AioContext and the block layer -------------------------------------------------------- -The AioContext originates from the QEMU block layer because it provides a -scoped way of running event loop iterations until all work is done. This -feature is used to complete all in-flight block I/O requests (see -bdrv_drain_all()). Nowadays AioContext is a generic event loop that can be -used by any QEMU subsystem. +AioContext and the block layer +------------------------------ +The AioContext originates from the QEMU block layer, even though nowadays +AioContext is a generic event loop that can be used by any QEMU subsystem. The block layer has support for AioContext integrated. Each BlockDriverState is associated with an AioContext using bdrv_set_aio_context() and @@ -122,13 +119,22 @@ Block layer code must therefore expect to run in an IOThread and avoid using old APIs that implicitly use the main loop. See the "How to program for IOThreads" above for information on how to do that. -If main loop code such as a QMP function wishes to access a BlockDriverState it -must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure the -IOThread does not run in parallel. - -Long-running jobs (usually in the form of coroutines) are best scheduled in the -BlockDriverState's AioContext to avoid the need to acquire/release around each -bdrv_*() call. 
Be aware that there is currently no mechanism to get notified -when bdrv_set_aio_context() moves this BlockDriverState to a different -AioContext (see bdrv_detach_aio_context()/bdrv_attach_aio_context()), so you -may need to add this if you want to support long-running jobs. +If main loop code such as a QMP function wishes to access a BlockDriverState +it must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure +that callbacks in the IOThread do not run in parallel. + +Code running in the monitor typically needs to ensure that past +requests from the guest are completed. When a block device is running +in an IOThread, the IOThread can also process requests from the guest +(via ioeventfd). To achieve both objects, wrap the code between +bdrv_drained_begin() and bdrv_drained_end(), thus creating a "drained +section". The functions must be called between aio_context_acquire() +and aio_context_release(). You can freely release and re-acquire the +AioContext within a drained section. + +Long-running jobs (usually in the form of coroutines) are best scheduled in +the BlockDriverState's AioContext to avoid the need to acquire/release around +each bdrv_*() call. The functions bdrv_add/remove_aio_context_notifier, +or alternatively blk_add/remove_aio_context_notifier if you use BlockBackends, +can be used to get a notification whenever bdrv_set_aio_context() moves a +BlockDriverState to a different AioContext. diff --git a/docs/qmp-commands.txt b/docs/qmp-commands.txt index 284576d..a4732a5 100644 --- a/docs/qmp-commands.txt +++ b/docs/qmp-commands.txt @@ -554,6 +554,16 @@ Example: -> { "execute": "migrate_set_downtime", "arguments": { "value": 0.1 } } <- { "return": {} } +x-colo-lost-heartbeat +-------------------- + +Tell COLO that heartbeat is lost, a failover or takeover is needed. 
+ +Example: + +-> { "execute": "x-colo-lost-heartbeat" } +<- { "return": {} } + client_migrate_info ------------------- @@ -2861,6 +2871,7 @@ Enable/Disable migration capabilities - "compress": use multiple compression threads to accelerate live migration - "events": generate events for each migration state change - "postcopy-ram": postcopy mode for live migration +- "x-colo": COarse-Grain LOck Stepping (COLO) for Non-stop Service Arguments: @@ -2882,6 +2893,7 @@ Query current migration capabilities - "compress": Multiple compression threads state (json-bool) - "events": Migration state change event state (json-bool) - "postcopy-ram": postcopy ram state (json-bool) + - "x-colo": COarse-Grain LOck Stepping for Non-stop Service (json-bool) Arguments: @@ -2895,7 +2907,8 @@ Example: {"state": false, "capability": "zero-blocks"}, {"state": false, "capability": "compress"}, {"state": true, "capability": "events"}, - {"state": false, "capability": "postcopy-ram"} + {"state": false, "capability": "postcopy-ram"}, + {"state": false, "capability": "x-colo"} ]} migrate-set-parameters @@ -2913,6 +2926,8 @@ Set migration parameters - "max-bandwidth": set maximum speed for migrations (in bytes/sec) (json-int) - "downtime-limit": set maximum tolerated downtime (in milliseconds) for migrations (json-int) +- "x-checkpoint-delay": set the delay time for periodic checkpoint (json-int) + Arguments: Example: diff --git a/gdbstub.c b/gdbstub.c index b2e1b79..de62d26 100644 --- a/gdbstub.c +++ b/gdbstub.c @@ -31,7 +31,6 @@ #define MAX_PACKET_LENGTH 4096 -#include "cpu.h" #include "qemu/sockets.h" #include "sysemu/kvm.h" #include "exec/semihost.h" diff --git a/hmp-commands.hx b/hmp-commands.hx index 06bef47..8819281 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1040,6 +1040,21 @@ migration (or once already in postcopy). 
ETEXI { + .name = "x_colo_lost_heartbeat", + .args_type = "", + .params = "", + .help = "Tell COLO that heartbeat is lost,\n\t\t\t" + "a failover or takeover is needed.", + .cmd = hmp_x_colo_lost_heartbeat, + }, + +STEXI +@item x_colo_lost_heartbeat +@findex x_colo_lost_heartbeat +Tell COLO that heartbeat is lost, a failover or takeover is needed. +ETEXI + + { .name = "client_migrate_info", .args_type = "protocol:s,hostname:s,port:i?,tls-port:i?,cert-subject:s?", .params = "protocol hostname port tls-port cert-subject", diff --git a/hmp.c b/hmp.c index 3d60259..00af423 100644 --- a/hmp.c +++ b/hmp.c @@ -318,6 +318,9 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, " %s: %" PRId64 " milliseconds", MigrationParameter_lookup[MIGRATION_PARAMETER_DOWNTIME_LIMIT], params->downtime_limit); + monitor_printf(mon, " %s: %" PRId64, + MigrationParameter_lookup[MIGRATION_PARAMETER_X_CHECKPOINT_DELAY], + params->x_checkpoint_delay); monitor_printf(mon, "\n"); } @@ -1386,6 +1389,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p.has_downtime_limit = true; use_int_value = true; break; + case MIGRATION_PARAMETER_X_CHECKPOINT_DELAY: + p.has_x_checkpoint_delay = true; + use_int_value = true; + break; } if (use_int_value) { @@ -1402,6 +1409,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p.cpu_throttle_initial = valueint; p.cpu_throttle_increment = valueint; p.downtime_limit = valueint; + p.x_checkpoint_delay = valueint; } qmp_migrate_set_parameters(&p, &err); @@ -1443,6 +1451,14 @@ void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, &err); } +void hmp_x_colo_lost_heartbeat(Monitor *mon, const QDict *qdict) +{ + Error *err = NULL; + + qmp_x_colo_lost_heartbeat(&err); + hmp_handle_error(mon, &err); +} + void hmp_set_password(Monitor *mon, const QDict *qdict) { const char *protocol = qdict_get_str(qdict, "protocol"); diff --git a/hmp.h b/hmp.h index 184769c..05daf7c 
100644 --- a/hmp.h +++ b/hmp.h @@ -72,6 +72,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict); void hmp_migrate_set_cache_size(Monitor *mon, const QDict *qdict); void hmp_client_migrate_info(Monitor *mon, const QDict *qdict); void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict); +void hmp_x_colo_lost_heartbeat(Monitor *mon, const QDict *qdict); void hmp_set_password(Monitor *mon, const QDict *qdict); void hmp_expire_password(Monitor *mon, const QDict *qdict); void hmp_eject(Monitor *mon, const QDict *qdict); diff --git a/hw/arm/cubieboard.c b/hw/arm/cubieboard.c index fbd78ed..dd19ba3 100644 --- a/hw/arm/cubieboard.c +++ b/hw/arm/cubieboard.c @@ -74,6 +74,7 @@ static void cubieboard_init(MachineState *machine) cubieboard_binfo.ram_size = machine->ram_size; cubieboard_binfo.kernel_filename = machine->kernel_filename; cubieboard_binfo.kernel_cmdline = machine->kernel_cmdline; + cubieboard_binfo.initrd_filename = machine->initrd_filename; arm_load_kernel(&s->a10->cpu, &cubieboard_binfo); } diff --git a/hw/arm/pxa2xx.c b/hw/arm/pxa2xx.c index 42cdde0..21ea1d6 100644 --- a/hw/arm/pxa2xx.c +++ b/hw/arm/pxa2xx.c @@ -2267,7 +2267,9 @@ PXA2xxState *pxa255_init(MemoryRegion *address_space, unsigned int sdram_size) qdev_get_gpio_in(s->pic, PXA2XX_PIC_LCD)); s->cm_base = 0x41300000; - s->cm_regs[CCCR >> 2] = 0x02000210; /* 416.0 MHz */ + s->cm_regs[CCCR >> 2] = 0x00000121; /* from datasheet */ + s->cm_regs[CKEN >> 2] = 0x00017def; /* from datasheet */ + s->clkcfg = 0x00000009; /* Turbo mode active */ memory_region_init_io(&s->cm_iomem, NULL, &pxa2xx_cm_ops, s, "pxa2xx-cm", 0x1000); memory_region_add_subregion(address_space, s->cm_base, &s->cm_iomem); diff --git a/hw/arm/spitz.c b/hw/arm/spitz.c index 41cc2ee..949a15a 100644 --- a/hw/arm/spitz.c +++ b/hw/arm/spitz.c @@ -29,6 +29,7 @@ #include "sysemu/block-backend.h" #include "hw/sysbus.h" #include "exec/address-spaces.h" +#include "sysemu/sysemu.h" #undef REG_FMT #define REG_FMT "0x%02lx" @@ 
-844,9 +845,18 @@ static void spitz_lcd_hsync_handler(void *opaque, int line, int level) spitz_hsync ^= 1; } +static void spitz_reset(void *opaque, int line, int level) +{ + if (level) { + qemu_system_reset_request(); + } +} + static void spitz_gpio_setup(PXA2xxState *cpu, int slots) { qemu_irq lcd_hsync; + qemu_irq reset; + /* * Bad hack: We toggle the LCD hsync GPIO on every GPIO status * read to satisfy broken guests that poll-wait for hsync. @@ -867,7 +877,8 @@ static void spitz_gpio_setup(PXA2xxState *cpu, int slots) qemu_irq_raise(qdev_get_gpio_in(cpu->gpio, SPITZ_GPIO_BAT_COVER)); /* Handle reset */ - qdev_connect_gpio_out(cpu->gpio, SPITZ_GPIO_ON_RESET, cpu->reset); + reset = qemu_allocate_irq(spitz_reset, cpu, 0); + qdev_connect_gpio_out(cpu->gpio, SPITZ_GPIO_ON_RESET, reset); /* PCMCIA signals: card's IRQ and Card-Detect */ if (slots >= 1) diff --git a/hw/arm/tosa.c b/hw/arm/tosa.c index 2db6650..1ee12f4 100644 --- a/hw/arm/tosa.c +++ b/hw/arm/tosa.c @@ -25,6 +25,7 @@ #include "sysemu/block-backend.h" #include "hw/sysbus.h" #include "exec/address-spaces.h" +#include "sysemu/sysemu.h" #define TOSA_RAM 0x04000000 #define TOSA_ROM 0x00800000 @@ -86,6 +87,12 @@ static void tosa_out_switch(void *opaque, int line, int level) } } +static void tosa_reset(void *opaque, int line, int level) +{ + if (level) { + qemu_system_reset_request(); + } +} static void tosa_gpio_setup(PXA2xxState *cpu, DeviceState *scp0, @@ -93,13 +100,16 @@ static void tosa_gpio_setup(PXA2xxState *cpu, TC6393xbState *tmio) { qemu_irq *outsignals = qemu_allocate_irqs(tosa_out_switch, cpu, 4); + qemu_irq reset; + /* MMC/SD host */ pxa2xx_mmci_handlers(cpu->mmc, qdev_get_gpio_in(scp0, TOSA_GPIO_SD_WP), qemu_irq_invert(qdev_get_gpio_in(cpu->gpio, TOSA_GPIO_nSD_DETECT))); /* Handle reset */ - qdev_connect_gpio_out(cpu->gpio, TOSA_GPIO_ON_RESET, cpu->reset); + reset = qemu_allocate_irq(tosa_reset, cpu, 0); + qdev_connect_gpio_out(cpu->gpio, TOSA_GPIO_ON_RESET, reset); /* PCMCIA signals: card's IRQ 
and Card-Detect */ pxa2xx_pcmcia_set_irq_cb(cpu->pcmcia[0], diff --git a/hw/arm/versatilepb.c b/hw/arm/versatilepb.c index 8ae5392..7b5cb36 100644 --- a/hw/arm/versatilepb.c +++ b/hw/arm/versatilepb.c @@ -198,6 +198,15 @@ static void versatile_init(MachineState *machine, int board_id) int done_smc = 0; DriveInfo *dinfo; + if (machine->ram_size > 0x10000000) { + /* Device starting at address 0x10000000, + * and memory cannot overlap with devices. + * Refuse to run rather than behaving very confusingly. + */ + error_report("versatilepb: memory size must not exceed 256MB"); + exit(1); + } + if (!machine->cpu_model) { machine->cpu_model = "arm926"; } diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 5fc10df..f953610 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -594,7 +594,7 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) gicc->uid = i; gicc->flags = cpu_to_le32(ACPI_GICC_ENABLED); - if (armcpu->has_pmu) { + if (arm_feature(&armcpu->env, ARM_FEATURE_PMU)) { gicc->performance_interrupt = cpu_to_le32(PPI(VIRTUAL_PMU_IRQ)); } } diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 070bbf8..54a8b28 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -85,6 +85,7 @@ typedef struct { VirtBoardInfo *daughterboard; bool disallow_affinity_adjustment; bool no_its; + bool no_pmu; } VirtMachineClass; typedef struct { @@ -490,7 +491,7 @@ static void fdt_add_pmu_nodes(const VirtBoardInfo *vbi, int gictype) CPU_FOREACH(cpu) { armcpu = ARM_CPU(cpu); - if (!armcpu->has_pmu || + if (!arm_feature(&armcpu->env, ARM_FEATURE_PMU) || !kvm_arm_pmu_create(cpu, PPI(VIRTUAL_PMU_IRQ))) { return; } @@ -1353,6 +1354,10 @@ static void machvirt_init(MachineState *machine) } } + if (vmc->no_pmu && object_property_find(cpuobj, "pmu", NULL)) { + object_property_set_bool(cpuobj, false, "pmu", NULL); + } + if (object_property_find(cpuobj, "reset-cbar", NULL)) { object_property_set_int(cpuobj, vbi->memmap[VIRT_CPUPERIPHS].base, 
"reset-cbar", &error_abort); @@ -1592,5 +1597,7 @@ static void virt_machine_2_6_options(MachineClass *mc) virt_machine_2_7_options(mc); SET_MACHINE_COMPAT(mc, VIRT_COMPAT_2_6); vmc->disallow_affinity_adjustment = true; + /* Disable PMU for 2.6 as PMU support was first introduced in 2.7 */ + vmc->no_pmu = true; } DEFINE_VIRT_MACHINE(2, 6) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index b380142..d479fd2 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -375,7 +375,7 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd) if (!cqid || nvme_check_cqid(n, cqid)) { return NVME_INVALID_CQID | NVME_DNR; } - if (!sqid || (sqid && !nvme_check_sqid(n, sqid))) { + if (!sqid || !nvme_check_sqid(n, sqid)) { return NVME_INVALID_QID | NVME_DNR; } if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) { @@ -449,7 +449,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd) uint16_t qflags = le16_to_cpu(c->cq_flags); uint64_t prp1 = le64_to_cpu(c->prp1); - if (!cqid || (cqid && !nvme_check_cqid(n, cqid))) { + if (!cqid || !nvme_check_cqid(n, cqid)) { return NVME_INVALID_CQID | NVME_DNR; } if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) { diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c index 1292a4b..3a7dc19 100644 --- a/hw/block/xen_disk.c +++ b/hw/block/xen_disk.c @@ -167,12 +167,12 @@ static void destroy_grant(gpointer pgnt) xengnttab_handle *gnt = grant->blkdev->xendev.gnttabdev; if (xengnttab_unmap(gnt, grant->page, 1) != 0) { - xen_be_printf(&grant->blkdev->xendev, 0, + xen_pv_printf(&grant->blkdev->xendev, 0, "xengnttab_unmap failed: %s\n", strerror(errno)); } grant->blkdev->persistent_gnt_count--; - xen_be_printf(&grant->blkdev->xendev, 3, + xen_pv_printf(&grant->blkdev->xendev, 3, "unmapped grant %p\n", grant->page); g_free(grant); } @@ -184,11 +184,11 @@ static void remove_persistent_region(gpointer data, gpointer dev) xengnttab_handle *gnt = blkdev->xendev.gnttabdev; if (xengnttab_unmap(gnt, region->addr, region->num) != 0) { - 
xen_be_printf(&blkdev->xendev, 0, + xen_pv_printf(&blkdev->xendev, 0, "xengnttab_unmap region %p failed: %s\n", region->addr, strerror(errno)); } - xen_be_printf(&blkdev->xendev, 3, + xen_pv_printf(&blkdev->xendev, 3, "unmapped grant region %p with %d pages\n", region->addr, region->num); g_free(region); @@ -255,7 +255,7 @@ static int ioreq_parse(struct ioreq *ioreq) size_t len; int i; - xen_be_printf(&blkdev->xendev, 3, + xen_pv_printf(&blkdev->xendev, 3, "op %d, nr %d, handle %d, id %" PRId64 ", sector %" PRId64 "\n", ioreq->req.operation, ioreq->req.nr_segments, ioreq->req.handle, ioreq->req.id, ioreq->req.sector_number); @@ -275,28 +275,28 @@ static int ioreq_parse(struct ioreq *ioreq) case BLKIF_OP_DISCARD: return 0; default: - xen_be_printf(&blkdev->xendev, 0, "error: unknown operation (%d)\n", + xen_pv_printf(&blkdev->xendev, 0, "error: unknown operation (%d)\n", ioreq->req.operation); goto err; }; if (ioreq->req.operation != BLKIF_OP_READ && blkdev->mode[0] != 'w') { - xen_be_printf(&blkdev->xendev, 0, "error: write req for ro device\n"); + xen_pv_printf(&blkdev->xendev, 0, "error: write req for ro device\n"); goto err; } ioreq->start = ioreq->req.sector_number * blkdev->file_blk; for (i = 0; i < ioreq->req.nr_segments; i++) { if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) { - xen_be_printf(&blkdev->xendev, 0, "error: nr_segments too big\n"); + xen_pv_printf(&blkdev->xendev, 0, "error: nr_segments too big\n"); goto err; } if (ioreq->req.seg[i].first_sect > ioreq->req.seg[i].last_sect) { - xen_be_printf(&blkdev->xendev, 0, "error: first > last sector\n"); + xen_pv_printf(&blkdev->xendev, 0, "error: first > last sector\n"); goto err; } if (ioreq->req.seg[i].last_sect * BLOCK_SIZE >= XC_PAGE_SIZE) { - xen_be_printf(&blkdev->xendev, 0, "error: page crossing\n"); + xen_pv_printf(&blkdev->xendev, 0, "error: page crossing\n"); goto err; } @@ -308,7 +308,7 @@ static int ioreq_parse(struct ioreq *ioreq) qemu_iovec_add(&ioreq->v, (void*)mem, len); } if (ioreq->start + 
ioreq->v.size > blkdev->file_size) { - xen_be_printf(&blkdev->xendev, 0, "error: access beyond end of file\n"); + xen_pv_printf(&blkdev->xendev, 0, "error: access beyond end of file\n"); goto err; } return 0; @@ -331,7 +331,7 @@ static void ioreq_unmap(struct ioreq *ioreq) return; } if (xengnttab_unmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) { - xen_be_printf(&ioreq->blkdev->xendev, 0, + xen_pv_printf(&ioreq->blkdev->xendev, 0, "xengnttab_unmap failed: %s\n", strerror(errno)); } @@ -343,7 +343,7 @@ static void ioreq_unmap(struct ioreq *ioreq) continue; } if (xengnttab_unmap(gnt, ioreq->page[i], 1) != 0) { - xen_be_printf(&ioreq->blkdev->xendev, 0, + xen_pv_printf(&ioreq->blkdev->xendev, 0, "xengnttab_unmap failed: %s\n", strerror(errno)); } @@ -381,7 +381,7 @@ static int ioreq_map(struct ioreq *ioreq) if (grant != NULL) { page[i] = grant->page; - xen_be_printf(&ioreq->blkdev->xendev, 3, + xen_pv_printf(&ioreq->blkdev->xendev, 3, "using persistent-grant %" PRIu32 "\n", ioreq->refs[i]); } else { @@ -410,7 +410,7 @@ static int ioreq_map(struct ioreq *ioreq) ioreq->pages = xengnttab_map_grant_refs (gnt, new_maps, domids, refs, ioreq->prot); if (ioreq->pages == NULL) { - xen_be_printf(&ioreq->blkdev->xendev, 0, + xen_pv_printf(&ioreq->blkdev->xendev, 0, "can't map %d grant refs (%s, %d maps)\n", new_maps, strerror(errno), ioreq->blkdev->cnt_map); return -1; @@ -426,7 +426,7 @@ static int ioreq_map(struct ioreq *ioreq) ioreq->page[i] = xengnttab_map_grant_ref (gnt, domids[i], refs[i], ioreq->prot); if (ioreq->page[i] == NULL) { - xen_be_printf(&ioreq->blkdev->xendev, 0, + xen_pv_printf(&ioreq->blkdev->xendev, 0, "can't map grant ref %d (%s, %d maps)\n", refs[i], strerror(errno), ioreq->blkdev->cnt_map); ioreq->mapped = 1; @@ -474,7 +474,7 @@ static int ioreq_map(struct ioreq *ioreq) grant->page = ioreq->page[new_maps]; } grant->blkdev = ioreq->blkdev; - xen_be_printf(&ioreq->blkdev->xendev, 3, + xen_pv_printf(&ioreq->blkdev->xendev, 3, "adding grant %" PRIu32 " page: 
%p\n", refs[new_maps], grant->page); g_tree_insert(ioreq->blkdev->persistent_gnts, @@ -557,7 +557,7 @@ static int ioreq_grant_copy(struct ioreq *ioreq) rc = xengnttab_grant_copy(gnt, count, segs); if (rc) { - xen_be_printf(&ioreq->blkdev->xendev, 0, + xen_pv_printf(&ioreq->blkdev->xendev, 0, "failed to copy data %d\n", rc); ioreq->aio_errors++; return -1; @@ -565,7 +565,7 @@ static int ioreq_grant_copy(struct ioreq *ioreq) for (i = 0; i < count; i++) { if (segs[i].status != GNTST_okay) { - xen_be_printf(&ioreq->blkdev->xendev, 3, + xen_pv_printf(&ioreq->blkdev->xendev, 3, "failed to copy data %d for gref %d, domid %d\n", segs[i].status, ioreq->refs[i], ioreq->domids[i]); ioreq->aio_errors++; @@ -599,7 +599,7 @@ static void qemu_aio_complete(void *opaque, int ret) struct ioreq *ioreq = opaque; if (ret != 0) { - xen_be_printf(&ioreq->blkdev->xendev, 0, "%s I/O error\n", + xen_pv_printf(&ioreq->blkdev->xendev, 0, "%s I/O error\n", ioreq->req.operation == BLKIF_OP_READ ? "read" : "write"); ioreq->aio_errors++; } @@ -796,7 +796,7 @@ static void blk_send_response_all(struct XenBlkDev *blkdev) ioreq_release(ioreq, true); } if (send_notify) { - xen_be_send_notify(&blkdev->xendev); + xen_pv_send_notify(&blkdev->xendev); } } @@ -866,7 +866,7 @@ static void blk_handle_requests(struct XenBlkDev *blkdev) }; if (blk_send_response_one(ioreq)) { - xen_be_send_notify(&blkdev->xendev); + xen_pv_send_notify(&blkdev->xendev); } ioreq_release(ioreq, false); continue; @@ -910,7 +910,7 @@ static void blk_alloc(struct XenDevice *xendev) } if (xengnttab_set_max_grants(xendev->gnttabdev, MAX_GRANTS(max_requests, BLKIF_MAX_SEGMENTS_PER_REQUEST)) < 0) { - xen_be_printf(xendev, 0, "xengnttab_set_max_grants failed: %s\n", + xen_pv_printf(xendev, 0, "xengnttab_set_max_grants failed: %s\n", strerror(errno)); } } @@ -1056,11 +1056,11 @@ static int blk_connect(struct XenDevice *xendev) } /* setup via xenbus -> create new block driver instance */ - xen_be_printf(&blkdev->xendev, 2, "create new bdrv 
(xenbus setup)\n"); + xen_pv_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n"); blkdev->blk = blk_new_open(blkdev->filename, NULL, options, qflags, &local_err); if (!blkdev->blk) { - xen_be_printf(&blkdev->xendev, 0, "error: %s\n", + xen_pv_printf(&blkdev->xendev, 0, "error: %s\n", error_get_pretty(local_err)); error_free(local_err); return -1; @@ -1068,10 +1068,11 @@ static int blk_connect(struct XenDevice *xendev) blk_set_enable_write_cache(blkdev->blk, !writethrough); } else { /* setup via qemu cmdline -> already setup for us */ - xen_be_printf(&blkdev->xendev, 2, "get configured bdrv (cmdline setup)\n"); + xen_pv_printf(&blkdev->xendev, 2, + "get configured bdrv (cmdline setup)\n"); blkdev->blk = blk_by_legacy_dinfo(blkdev->dinfo); if (blk_is_read_only(blkdev->blk) && !readonly) { - xen_be_printf(&blkdev->xendev, 0, "Unexpected read-only drive"); + xen_pv_printf(&blkdev->xendev, 0, "Unexpected read-only drive"); blkdev->blk = NULL; return -1; } @@ -1084,13 +1085,13 @@ static int blk_connect(struct XenDevice *xendev) if (blkdev->file_size < 0) { BlockDriverState *bs = blk_bs(blkdev->blk); const char *drv_name = bs ? 
bdrv_get_format_name(bs) : NULL; - xen_be_printf(&blkdev->xendev, 1, "blk_getlength: %d (%s) | drv %s\n", + xen_pv_printf(&blkdev->xendev, 1, "blk_getlength: %d (%s) | drv %s\n", (int)blkdev->file_size, strerror(-blkdev->file_size), drv_name ?: "-"); blkdev->file_size = 0; } - xen_be_printf(xendev, 1, "type \"%s\", fileproto \"%s\", filename \"%s\"," + xen_pv_printf(xendev, 1, "type \"%s\", fileproto \"%s\", filename \"%s\"," " size %" PRId64 " (%" PRId64 " MB)\n", blkdev->type, blkdev->fileproto, blkdev->filename, blkdev->file_size, blkdev->file_size >> 20); @@ -1174,10 +1175,10 @@ static int blk_connect(struct XenDevice *xendev) blkdev->feature_grant_copy = (xengnttab_grant_copy(blkdev->xendev.gnttabdev, 0, NULL) == 0); - xen_be_printf(&blkdev->xendev, 3, "grant copy operation %s\n", + xen_pv_printf(&blkdev->xendev, 3, "grant copy operation %s\n", blkdev->feature_grant_copy ? "enabled" : "disabled"); - xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, " + xen_pv_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, " "remote port %d, local port %d\n", blkdev->xendev.protocol, blkdev->ring_ref, blkdev->xendev.remote_port, blkdev->xendev.local_port); @@ -1193,7 +1194,7 @@ static void blk_disconnect(struct XenDevice *xendev) blk_unref(blkdev->blk); blkdev->blk = NULL; } - xen_be_unbind_evtchn(&blkdev->xendev); + xen_pv_unbind_evtchn(&blkdev->xendev); if (blkdev->sring) { xengnttab_unmap(blkdev->xendev.gnttabdev, blkdev->sring, 1); diff --git a/hw/char/cadence_uart.c b/hw/char/cadence_uart.c index c2b9154..def34cd 100644 --- a/hw/char/cadence_uart.c +++ b/hw/char/cadence_uart.c @@ -450,7 +450,8 @@ static void cadence_uart_reset(DeviceState *dev) s->r[R_IMR] = 0; s->r[R_CISR] = 0; s->r[R_RTRIG] = 0x00000020; - s->r[R_BRGR] = 0x0000000F; + s->r[R_BRGR] = 0x0000028B; + s->r[R_BDIV] = 0x0000000F; s->r[R_TTRIG] = 0x00000020; uart_rx_reset(s); diff --git a/hw/char/xen_console.c b/hw/char/xen_console.c index 86cdc52..c01f410 100644 --- a/hw/char/xen_console.c 
+++ b/hw/char/xen_console.c @@ -74,7 +74,7 @@ static void buffer_append(struct XenConsole *con) xen_mb(); intf->out_cons = cons; - xen_be_send_notify(&con->xendev); + xen_pv_send_notify(&con->xendev); if (buffer->max_capacity && buffer->size > buffer->max_capacity) { @@ -142,7 +142,7 @@ static void xencons_receive(void *opaque, const uint8_t *buf, int len) } xen_wmb(); intf->in_prod = prod; - xen_be_send_notify(&con->xendev); + xen_pv_send_notify(&con->xendev); } static void xencons_send(struct XenConsole *con) @@ -158,16 +158,17 @@ static void xencons_send(struct XenConsole *con) len = size; } if (len < 1) { - if (!con->backlog) { - con->backlog = 1; - xen_be_printf(&con->xendev, 1, "backlog piling up, nobody listening?\n"); - } + if (!con->backlog) { + con->backlog = 1; + xen_pv_printf(&con->xendev, 1, + "backlog piling up, nobody listening?\n"); + } } else { - buffer_advance(&con->buffer, len); - if (con->backlog && len == size) { - con->backlog = 0; - xen_be_printf(&con->xendev, 1, "backlog is gone\n"); - } + buffer_advance(&con->buffer, len); + if (con->backlog && len == size) { + con->backlog = 0; + xen_pv_printf(&con->xendev, 1, "backlog is gone\n"); + } } } @@ -191,7 +192,7 @@ static int con_init(struct XenDevice *xendev) type = xenstore_read_str(con->console, "type"); if (!type || strcmp(type, "ioemu") != 0) { - xen_be_printf(xendev, 1, "not for me (type=%s)\n", type); + xen_pv_printf(xendev, 1, "not for me (type=%s)\n", type); ret = -1; goto out; } @@ -247,7 +248,8 @@ static int con_initialise(struct XenDevice *xendev) qemu_chr_fe_set_handlers(&con->chr, xencons_can_receive, xencons_receive, NULL, con, NULL, true); - xen_be_printf(xendev, 1, "ring mfn %d, remote port %d, local port %d, limit %zd\n", + xen_pv_printf(xendev, 1, + "ring mfn %d, remote port %d, local port %d, limit %zd\n", con->ring_ref, con->xendev.remote_port, con->xendev.local_port, @@ -260,7 +262,7 @@ static void con_disconnect(struct XenDevice *xendev) struct XenConsole *con = 
container_of(xendev, struct XenConsole, xendev); qemu_chr_fe_deinit(&con->chr); - xen_be_unbind_evtchn(&con->xendev); + xen_pv_unbind_evtchn(&con->xendev); if (con->sring) { if (!xendev->dev) { diff --git a/hw/display/milkymist-tmu2.c b/hw/display/milkymist-tmu2.c index 9c00184..5c666f9 100644 --- a/hw/display/milkymist-tmu2.c +++ b/hw/display/milkymist-tmu2.c @@ -213,7 +213,7 @@ static void tmu2_start(MilkymistTMU2State *s) /* Read the QEMU source framebuffer into an OpenGL texture */ glGenTextures(1, &texture); glBindTexture(GL_TEXTURE_2D, texture); - fb_len = 2*s->regs[R_TEXHRES]*s->regs[R_TEXVRES]; + fb_len = 2ULL * s->regs[R_TEXHRES] * s->regs[R_TEXVRES]; fb = cpu_physical_memory_map(s->regs[R_TEXFBUF], &fb_len, 0); if (fb == NULL) { glDeleteTextures(1, &texture); diff --git a/hw/display/xenfb.c b/hw/display/xenfb.c index 46b7d5e..7a8727a 100644 --- a/hw/display/xenfb.c +++ b/hw/display/xenfb.c @@ -90,28 +90,29 @@ static int common_bind(struct common *c) xen_pfn_t mfn; if (xenstore_read_fe_uint64(&c->xendev, "page-ref", &val) == -1) - return -1; + return -1; mfn = (xen_pfn_t)val; assert(val == mfn); if (xenstore_read_fe_int(&c->xendev, "event-channel", &c->xendev.remote_port) == -1) - return -1; + return -1; c->page = xenforeignmemory_map(xen_fmem, c->xendev.dom, PROT_READ | PROT_WRITE, 1, &mfn, NULL); if (c->page == NULL) - return -1; + return -1; xen_be_bind_evtchn(&c->xendev); - xen_be_printf(&c->xendev, 1, "ring mfn %"PRI_xen_pfn", remote-port %d, local-port %d\n", - mfn, c->xendev.remote_port, c->xendev.local_port); + xen_pv_printf(&c->xendev, 1, + "ring mfn %"PRI_xen_pfn", remote-port %d, local-port %d\n", + mfn, c->xendev.remote_port, c->xendev.local_port); return 0; } static void common_unbind(struct common *c) { - xen_be_unbind_evtchn(&c->xendev); + xen_pv_unbind_evtchn(&c->xendev); if (c->page) { xenforeignmemory_unmap(xen_fmem, c->page, 1); c->page = NULL; @@ -214,7 +215,7 @@ static int xenfb_kbd_event(struct XenInput *xenfb, 
XENKBD_IN_RING_REF(page, prod) = *event; xen_wmb(); /* ensure ring contents visible */ page->in_prod = prod + 1; - return xen_be_send_notify(&xenfb->c.xendev); + return xen_pv_send_notify(&xenfb->c.xendev); } /* Send a keyboard (or mouse button) event */ @@ -345,7 +346,7 @@ static int input_initialise(struct XenDevice *xendev) int rc; if (!in->c.con) { - xen_be_printf(xendev, 1, "ds not set (yet)\n"); + xen_pv_printf(xendev, 1, "ds not set (yet)\n"); return -1; } @@ -396,7 +397,7 @@ static void input_event(struct XenDevice *xendev) if (page->out_prod == page->out_cons) return; page->out_cons = page->out_prod; - xen_be_send_notify(&xenfb->c.xendev); + xen_pv_send_notify(&xenfb->c.xendev); } /* -------------------------------------------------------------------- */ @@ -500,8 +501,8 @@ out: } static int xenfb_configure_fb(struct XenFB *xenfb, size_t fb_len_lim, - int width, int height, int depth, - size_t fb_len, int offset, int row_stride) + int width, int height, int depth, + size_t fb_len, int offset, int row_stride) { size_t mfn_sz = sizeof(*((struct xenfb_page *)0)->pd); size_t pd_len = sizeof(((struct xenfb_page *)0)->pd) / mfn_sz; @@ -510,40 +511,47 @@ static int xenfb_configure_fb(struct XenFB *xenfb, size_t fb_len_lim, int max_width, max_height; if (fb_len_lim > fb_len_max) { - xen_be_printf(&xenfb->c.xendev, 0, "fb size limit %zu exceeds %zu, corrected\n", - fb_len_lim, fb_len_max); - fb_len_lim = fb_len_max; + xen_pv_printf(&xenfb->c.xendev, 0, + "fb size limit %zu exceeds %zu, corrected\n", + fb_len_lim, fb_len_max); + fb_len_lim = fb_len_max; } if (fb_len_lim && fb_len > fb_len_lim) { - xen_be_printf(&xenfb->c.xendev, 0, "frontend fb size %zu limited to %zu\n", - fb_len, fb_len_lim); - fb_len = fb_len_lim; + xen_pv_printf(&xenfb->c.xendev, 0, + "frontend fb size %zu limited to %zu\n", + fb_len, fb_len_lim); + fb_len = fb_len_lim; } if (depth != 8 && depth != 16 && depth != 24 && depth != 32) { - xen_be_printf(&xenfb->c.xendev, 0, "can't handle frontend fb 
depth %d\n", - depth); - return -1; + xen_pv_printf(&xenfb->c.xendev, 0, + "can't handle frontend fb depth %d\n", + depth); + return -1; } if (row_stride <= 0 || row_stride > fb_len) { - xen_be_printf(&xenfb->c.xendev, 0, "invalid frontend stride %d\n", row_stride); - return -1; + xen_pv_printf(&xenfb->c.xendev, 0, "invalid frontend stride %d\n", + row_stride); + return -1; } max_width = row_stride / (depth / 8); if (width < 0 || width > max_width) { - xen_be_printf(&xenfb->c.xendev, 0, "invalid frontend width %d limited to %d\n", - width, max_width); - width = max_width; + xen_pv_printf(&xenfb->c.xendev, 0, + "invalid frontend width %d limited to %d\n", + width, max_width); + width = max_width; } if (offset < 0 || offset >= fb_len) { - xen_be_printf(&xenfb->c.xendev, 0, "invalid frontend offset %d (max %zu)\n", - offset, fb_len - 1); - return -1; + xen_pv_printf(&xenfb->c.xendev, 0, + "invalid frontend offset %d (max %zu)\n", + offset, fb_len - 1); + return -1; } max_height = (fb_len - offset) / row_stride; if (height < 0 || height > max_height) { - xen_be_printf(&xenfb->c.xendev, 0, "invalid frontend height %d limited to %d\n", - height, max_height); - height = max_height; + xen_pv_printf(&xenfb->c.xendev, 0, + "invalid frontend height %d limited to %d\n", + height, max_height); + height = max_height; } xenfb->fb_len = fb_len; xenfb->row_stride = row_stride; @@ -553,8 +561,9 @@ static int xenfb_configure_fb(struct XenFB *xenfb, size_t fb_len_lim, xenfb->offset = offset; xenfb->up_fullscreen = 1; xenfb->do_resize = 1; - xen_be_printf(&xenfb->c.xendev, 1, "framebuffer %dx%dx%d offset %d stride %d\n", - width, height, depth, offset, row_stride); + xen_pv_printf(&xenfb->c.xendev, 1, + "framebuffer %dx%dx%d offset %d stride %d\n", + width, height, depth, offset, row_stride); return 0; } @@ -631,7 +640,7 @@ static void xenfb_guest_copy(struct XenFB *xenfb, int x, int y, int w, int h) } } if (oops) /* should not happen */ - xen_be_printf(&xenfb->c.xendev, 0, "%s: oops: 
convert %d -> %d bpp?\n", + xen_pv_printf(&xenfb->c.xendev, 0, "%s: oops: convert %d -> %d bpp?\n", __FUNCTION__, xenfb->depth, bpp); dpy_gfx_update(xenfb->c.con, x, y, w, h); @@ -663,7 +672,7 @@ static void xenfb_send_event(struct XenFB *xenfb, union xenfb_in_event *event) xen_wmb(); /* ensure ring contents visible */ page->in_prod = prod + 1; - xen_be_send_notify(&xenfb->c.xendev); + xen_pv_send_notify(&xenfb->c.xendev); } static void xenfb_send_refresh_period(struct XenFB *xenfb, int period) @@ -696,9 +705,9 @@ static void xenfb_update(void *opaque) return; if (!xenfb->feature_update) { - /* we don't get update notifications, thus use the - * sledge hammer approach ... */ - xenfb->up_fullscreen = 1; + /* we don't get update notifications, thus use the + * sledge hammer approach ... */ + xenfb->up_fullscreen = 1; } /* resize if needed */ @@ -721,7 +730,8 @@ static void xenfb_update(void *opaque) break; } dpy_gfx_replace_surface(xenfb->c.con, surface); - xen_be_printf(&xenfb->c.xendev, 1, "update: resizing: %dx%d @ %d bpp%s\n", + xen_pv_printf(&xenfb->c.xendev, 1, + "update: resizing: %dx%d @ %d bpp%s\n", xenfb->width, xenfb->height, xenfb->depth, is_buffer_shared(surface) ? " (shared)" : ""); xenfb->up_fullscreen = 1; @@ -729,18 +739,19 @@ static void xenfb_update(void *opaque) /* run queued updates */ if (xenfb->up_fullscreen) { - xen_be_printf(&xenfb->c.xendev, 3, "update: fullscreen\n"); - xenfb_guest_copy(xenfb, 0, 0, xenfb->width, xenfb->height); + xen_pv_printf(&xenfb->c.xendev, 3, "update: fullscreen\n"); + xenfb_guest_copy(xenfb, 0, 0, xenfb->width, xenfb->height); } else if (xenfb->up_count) { - xen_be_printf(&xenfb->c.xendev, 3, "update: %d rects\n", xenfb->up_count); - for (i = 0; i < xenfb->up_count; i++) - xenfb_guest_copy(xenfb, - xenfb->up_rects[i].x, - xenfb->up_rects[i].y, - xenfb->up_rects[i].w, - xenfb->up_rects[i].h);

_______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxx https://lists.xenproject.org/xen-changelog

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.