[Xen-changelog] [qemu-xen stable-4.11] Merge tag 'v2.11.2' into staging-4.11
=== This changeset includes merge from high-traffic branch === Commits on that branch are not reported individually. commit 20c76f9a5fbf16d58c6add2ace2ff0fabd785926 Merge: 43139135a8938de44f66333831d3a8655d07663a 0982a56a551556c704dc15752dabf57b4be1c640 Author: Anthony PERARD <anthony.perard@xxxxxxxxxx> AuthorDate: Mon Jul 9 14:42:32 2018 +0100 Commit: Anthony PERARD <anthony.perard@xxxxxxxxxx> CommitDate: Mon Jul 9 14:42:32 2018 +0100 Merge tag 'v2.11.2' into staging-4.11 2.11.2 MAINTAINERS | 6 + VERSION | 2 +- block.c | 17 +- block/file-posix.c | 5 +- block/gluster.c | 21 +- block/nbd-client.c | 3 - block/nbd.c | 2 + block/qcow2.c | 2 +- block/raw-format.c | 64 +++-- block/rbd.c | 3 +- block/ssh.c | 1 + block/throttle.c | 54 ++-- configure | 3 + cpus.c | 10 +- device_tree.c | 2 +- docs/interop/qcow2.txt | 16 +- exec.c | 92 ++++--- gdbstub.c | 3 +- hw/block/pflash_cfi01.c | 10 +- hw/block/pflash_cfi02.c | 9 +- hw/char/cmsdk-apb-uart.c | 1 + hw/core/loader.c | 20 +- hw/core/qdev.c | 24 ++ hw/display/qxl-render.c | 3 +- hw/display/vga.c | 2 + hw/i386/acpi-build.c | 3 +- hw/i386/intel_iommu.c | 489 +++++++++++++++++++++++++++---------- hw/i386/intel_iommu_internal.h | 43 ++-- hw/i386/multiboot.c | 85 ++++--- hw/i386/trace-events | 5 +- hw/ide/ahci.c | 13 +- hw/intc/arm_gicv3_common.c | 79 ++++++ hw/intc/arm_gicv3_cpuif.c | 12 +- hw/intc/arm_gicv3_kvm.c | 57 ++++- hw/intc/openpic_kvm.c | 4 - hw/net/virtio-net.c | 11 + hw/pci-bridge/i82801b11.c | 1 + hw/ppc/spapr.c | 161 +++++++----- hw/ppc/spapr_cpu_core.c | 9 +- hw/s390x/ccw-device.c | 8 + hw/s390x/css.c | 8 + hw/s390x/s390-virtio-ccw.c | 30 ++- hw/s390x/virtio-ccw.c | 54 ++-- hw/s390x/virtio-ccw.h | 3 +- hw/tpm/tpm_emulator.c | 4 +- hw/tpm/tpm_passthrough.c | 36 +-- hw/usb/dev-mtp.c | 6 +- hw/usb/dev-smartcard-reader.c | 4 +- hw/usb/redirect.c | 2 +- hw/vfio/ccw.c | 2 + hw/virtio/virtio-balloon.c | 1 + include/block/block.h | 1 + include/exec/cpu-all.h | 6 +- include/exec/cpu_ldst.h | 16 +- include/exec/memory-internal.h | 13 +- include/exec/memory.h | 45 ++-- include/hw/i386/intel_iommu.h | 26 +- include/hw/intc/arm_gicv3_common.h | 1 + include/hw/ppc/spapr.h | 3 +- include/hw/qdev-core.h | 14 +- include/net/net.h | 1 + include/qemu/iova-tree.h | 134 ++++++++++ linux-user/mmap.c | 26 +- linux-user/syscall.c | 13 +- memory.c | 30 --- migration/block.c | 5 +- nbd/client.c | 14 +- net/net.c | 1 - net/tap.c | 2 + net/vhost-user.c | 11 +- pc-bios/s390-ccw.img | Bin 26416 -> 26416 bytes pc-bios/s390-ccw/bootmap.c | 7 + pc-bios/s390-ccw/cio.h | 2 +- pc-bios/s390-ccw/iplb.h | 16 +- qemu-img.c | 29 ++- qemu-io.c | 4 +- scripts/qapi.py | 2 +- scsi/qemu-pr-helper.c | 7 +- target/arm/translate-a64.c | 6 +- target/arm/translate.c | 17 +- target/arm/translate.h | 2 +- target/i386/cpu.c | 4 +- target/i386/cpu.h | 3 + target/i386/kvm.c | 16 +- target/i386/machine.c | 20 ++ target/i386/translate.c | 2 +- target/lm32/op_helper.c | 4 + target/ppc/compat.c | 25 +- target/ppc/cpu.h | 2 +- target/ppc/machine.c | 5 + target/sparc/translate.c | 5 + target/xtensa/translate.c | 1 + tcg/arm/tcg-target.inc.c | 4 +- tcg/tcg-opc.h | 4 +- tcg/tcg.h | 10 + tests/boot-serial-test.c | 8 +- tests/migration-test.c | 4 +- tests/multiboot/.gitignore | 3 + tests/multiboot/Makefile | 22 +- tests/multiboot/aout_kludge.S | 138 +++++++++++ tests/multiboot/aout_kludge.out | 42 ++++ tests/multiboot/run_test.sh | 34 +-- tests/prom-env-test.c | 6 +- tests/pxe-test.c | 10 +- tests/qemu-iotests/024 | 82 ++++++- tests/qemu-iotests/024.out | 30 +++ tests/qemu-iotests/060 | 30 +++ 
tests/qemu-iotests/060.out | 14 ++ tests/qemu-iotests/106 | 24 ++ tests/qemu-iotests/106.out | 10 + tests/qemu-iotests/153 | 17 ++ tests/qemu-iotests/153.out | 16 ++ tests/qemu-iotests/221 | 60 +++++ tests/qemu-iotests/221.out | 16 ++ tests/qemu-iotests/group | 1 + tests/test-crypto-tlssession.c | 1 + tests/test-io-channel-tls.c | 1 + ui/console.c | 5 + util/Makefile.objs | 1 + util/iova-tree.c | 114 +++++++++ 120 files changed, 2141 insertions(+), 612 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 0255113470..a8e01de523 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1680,6 +1680,12 @@ F: include/sysemu/replay.h F: docs/replay.txt F: stubs/replay.c +IOVA Tree +M: Peter Xu <peterx@xxxxxxxxxx> +S: Maintained +F: include/qemu/iova-tree.h +F: util/iova-tree.c + Usermode Emulation ------------------ Overall diff --git a/VERSION b/VERSION index 6ceb272eec..9e5bb77a3b 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.11.1 +2.11.2 diff --git a/block.c b/block.c index f236431da1..24dd28d51d 100644 --- a/block.c +++ b/block.c @@ -1596,13 +1596,24 @@ static int bdrv_reopen_get_flags(BlockReopenQueue *q, BlockDriverState *bs) /* Returns whether the image file can be written to after the reopen queue @q * has been successfully applied, or right now if @q is NULL. */ -static bool bdrv_is_writable(BlockDriverState *bs, BlockReopenQueue *q) +static bool bdrv_is_writable_after_reopen(BlockDriverState *bs, + BlockReopenQueue *q) { int flags = bdrv_reopen_get_flags(q, bs); return (flags & (BDRV_O_RDWR | BDRV_O_INACTIVE)) == BDRV_O_RDWR; } +/* + * Return whether the BDS can be written to. This is not necessarily + * the same as !bdrv_is_read_only(bs), as inactivated images may not + * be written to but do not count as read-only images. + */ +bool bdrv_is_writable(BlockDriverState *bs) +{ + return bdrv_is_writable_after_reopen(bs, NULL); +} + static void bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs, BdrvChild *c, const BdrvChildRole *role, BlockReopenQueue *reopen_queue, @@ -1640,7 +1651,7 @@ static int bdrv_check_perm(BlockDriverState *bs, BlockReopenQueue *q, /* Write permissions never work with read-only images */ if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) && - !bdrv_is_writable(bs, q)) + !bdrv_is_writable_after_reopen(bs, q)) { error_setg(errp, "Block node is read-only"); return -EPERM; @@ -1930,7 +1941,7 @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c, &perm, &shared); /* Format drivers may touch metadata even if the guest doesn't write */ - if (bdrv_is_writable(bs, reopen_queue)) { + if (bdrv_is_writable_after_reopen(bs, reopen_queue)) { perm |= BLK_PERM_WRITE | BLK_PERM_RESIZE; } diff --git a/block/file-posix.c b/block/file-posix.c index 36ee89e940..275953fdc6 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1694,6 +1694,7 @@ static int raw_regular_truncate(int fd, int64_t offset, PreallocMode prealloc, case PREALLOC_MODE_FULL: { int64_t num = 0, left = offset - current_length; + off_t seek_result; /* * Knowing the final size from the beginning could allow the file @@ -1708,8 +1709,8 @@ static int raw_regular_truncate(int fd, int64_t offset, PreallocMode prealloc, buf = g_malloc0(65536); - result = lseek(fd, current_length, SEEK_SET); - if (result < 0) { + seek_result = lseek(fd, current_length, SEEK_SET); + if (seek_result < 0) { result = -errno; error_setg_errno(errp, -result, "Failed to seek to the old end of file"); diff --git a/block/gluster.c b/block/gluster.c index 0f4265a3a4..d09f4f2283 100644 --- 
a/block/gluster.c +++ b/block/gluster.c @@ -164,7 +164,12 @@ static QemuOptsList runtime_unix_opts = { { .name = GLUSTER_OPT_SOCKET, .type = QEMU_OPT_STRING, - .help = "socket file path)", + .help = "socket file path (legacy)", + }, + { + .name = GLUSTER_OPT_PATH, + .type = QEMU_OPT_STRING, + .help = "socket file path (QAPI)", }, { /* end of list */ } }, @@ -612,10 +617,18 @@ static int qemu_gluster_parse_json(BlockdevOptionsGluster *gconf, goto out; } - ptr = qemu_opt_get(opts, GLUSTER_OPT_SOCKET); + ptr = qemu_opt_get(opts, GLUSTER_OPT_PATH); + if (!ptr) { + ptr = qemu_opt_get(opts, GLUSTER_OPT_SOCKET); + } else if (qemu_opt_get(opts, GLUSTER_OPT_SOCKET)) { + error_setg(&local_err, + "Conflicting parameters 'path' and 'socket'"); + error_append_hint(&local_err, GERR_INDEX_HINT, i); + goto out; + } if (!ptr) { error_setg(&local_err, QERR_MISSING_PARAMETER, - GLUSTER_OPT_SOCKET); + GLUSTER_OPT_PATH); error_append_hint(&local_err, GERR_INDEX_HINT, i); goto out; } @@ -680,7 +693,7 @@ static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf, "file.server.0.host=1.2.3.4," "file.server.0.port=24007," "file.server.1.transport=unix," - "file.server.1.socket=/var/run/glusterd.socket ..." + "file.server.1.path=/var/run/glusterd.socket ..." "\n"); errno = -ret; return NULL; diff --git a/block/nbd-client.c b/block/nbd-client.c index 9206652e45..7b68499b76 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -846,9 +846,6 @@ int nbd_client_init(BlockDriverState *bs, if (client->info.flags & NBD_FLAG_SEND_WRITE_ZEROES) { bs->supported_zero_flags |= BDRV_REQ_MAY_UNMAP; } - if (client->info.min_block > bs->bl.request_alignment) { - bs->bl.request_alignment = client->info.min_block; - } qemu_co_mutex_init(&client->send_mutex); qemu_co_queue_init(&client->free_sema); diff --git a/block/nbd.c b/block/nbd.c index 8b8ba56cdd..c32ea9fd73 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -474,8 +474,10 @@ static int nbd_co_flush(BlockDriverState *bs) static void nbd_refresh_limits(BlockDriverState *bs, Error **errp) { NBDClientSession *s = nbd_get_client_session(bs); + uint32_t min = s->info.min_block; uint32_t max = MIN_NON_ZERO(NBD_MAX_BUFFER_SIZE, s->info.max_block); + bs->bl.request_alignment = min ? min : BDRV_SECTOR_SIZE; bs->bl.max_pdiscard = max; bs->bl.max_pwrite_zeroes = max; bs->bl.max_transfer = max; diff --git a/block/qcow2.c b/block/qcow2.c index 1914a940e5..28f2d91797 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -4235,7 +4235,7 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, char *message; va_list ap; - fatal = fatal && !bs->read_only; + fatal = fatal && bdrv_is_writable(bs); if (s->signaled_corruption && (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT))) diff --git a/block/raw-format.c b/block/raw-format.c index ab552c0954..c77290b93f 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -167,16 +167,37 @@ static void raw_reopen_abort(BDRVReopenState *state) state->opaque = NULL; } +/* Check and adjust the offset, against 'offset' and 'size' options. */ +static inline int raw_adjust_offset(BlockDriverState *bs, uint64_t *offset, + uint64_t bytes, bool is_write) +{ + BDRVRawState *s = bs->opaque; + + if (s->has_size && (*offset > s->size || bytes > (s->size - *offset))) { + /* There's not enough space for the write, or the read request is + * out-of-range. Don't read/write anything to prevent leaking out of + * the size specified in options. */ + return is_write ? 
-ENOSPC : -EINVAL;; + } + + if (*offset > INT64_MAX - s->offset) { + return -EINVAL; + } + *offset += s->offset; + + return 0; +} + static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { - BDRVRawState *s = bs->opaque; + int ret; - if (offset > UINT64_MAX - s->offset) { - return -EINVAL; + ret = raw_adjust_offset(bs, &offset, bytes, false); + if (ret) { + return ret; } - offset += s->offset; BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); @@ -186,23 +207,11 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { - BDRVRawState *s = bs->opaque; void *buf = NULL; BlockDriver *drv; QEMUIOVector local_qiov; int ret; - if (s->has_size && (offset > s->size || bytes > (s->size - offset))) { - /* There's not enough space for the data. Don't write anything and just - * fail to prevent leaking out of the size specified in options. */ - return -ENOSPC; - } - - if (offset > UINT64_MAX - s->offset) { - ret = -EINVAL; - goto fail; - } - if (bs->probed && offset < BLOCK_PROBE_BUF_SIZE && bytes) { /* Handling partial writes would be a pain - so we just * require that guests have 512-byte request alignment if @@ -237,7 +246,10 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, qiov = &local_qiov; } - offset += s->offset; + ret = raw_adjust_offset(bs, &offset, bytes, true); + if (ret) { + goto fail; + } BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); @@ -267,22 +279,24 @@ static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, BdrvRequestFlags flags) { - BDRVRawState *s = bs->opaque; - if (offset > UINT64_MAX - s->offset) { - return -EINVAL; + int ret; + + ret = raw_adjust_offset(bs, (uint64_t *)&offset, bytes, true); + if (ret) { + return ret; } - offset += s->offset; return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); } static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) { - BDRVRawState *s = bs->opaque; - if (offset > UINT64_MAX - s->offset) { - return -EINVAL; + int ret; + + ret = raw_adjust_offset(bs, (uint64_t *)&offset, bytes, true); + if (ret) { + return ret; } - offset += s->offset; return bdrv_co_pdiscard(bs->file->bs, offset, bytes); } diff --git a/block/rbd.c b/block/rbd.c index a76a5e8755..2de434dfdd 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -265,13 +265,14 @@ static int qemu_rbd_set_keypairs(rados_t cluster, const char *keypairs_json, key = qstring_get_str(name); ret = rados_conf_set(cluster, key, qstring_get_str(value)); - QDECREF(name); QDECREF(value); if (ret < 0) { error_setg_errno(errp, -ret, "invalid conf option %s", key); + QDECREF(name); ret = -EINVAL; break; } + QDECREF(name); } QDECREF(keypairs); diff --git a/block/ssh.c b/block/ssh.c index b049a16eb9..8890a0c4ba 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -556,6 +556,7 @@ static QemuOptsList ssh_runtime_opts = { .type = QEMU_OPT_STRING, .help = "Defines how and what to check the host key against", }, + { /* end of list */ } }, }; diff --git a/block/throttle.c b/block/throttle.c index 833175ac77..d5903784c0 100644 --- a/block/throttle.c +++ b/block/throttle.c @@ -35,9 +35,12 @@ static QemuOptsList throttle_opts = { }, }; -static int throttle_configure_tgm(BlockDriverState *bs, - ThrottleGroupMember *tgm, - QDict *options, Error **errp) +/* + * If this function 
succeeds then the throttle group name is stored in + * @group and must be freed by the caller. + * If there's an error then @group remains unmodified. + */ +static int throttle_parse_options(QDict *options, char **group, Error **errp) { int ret; const char *group_name; @@ -62,8 +65,7 @@ static int throttle_configure_tgm(BlockDriverState *bs, goto fin; } - /* Register membership to group with name group_name */ - throttle_group_register_tgm(tgm, group_name, bdrv_get_aio_context(bs)); + *group = g_strdup(group_name); ret = 0; fin: qemu_opts_del(opts); @@ -74,6 +76,8 @@ static int throttle_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { ThrottleGroupMember *tgm = bs->opaque; + char *group; + int ret; bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false, errp); @@ -83,7 +87,14 @@ static int throttle_open(BlockDriverState *bs, QDict *options, bs->supported_write_flags = bs->file->bs->supported_write_flags; bs->supported_zero_flags = bs->file->bs->supported_zero_flags; - return throttle_configure_tgm(bs, tgm, options, errp); + ret = throttle_parse_options(options, &group, errp); + if (ret == 0) { + /* Register membership to group with name group_name */ + throttle_group_register_tgm(tgm, group, bdrv_get_aio_context(bs)); + g_free(group); + } + + return ret; } static void throttle_close(BlockDriverState *bs) @@ -159,35 +170,36 @@ static void throttle_attach_aio_context(BlockDriverState *bs, static int throttle_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, Error **errp) { - ThrottleGroupMember *tgm; + int ret; + char *group = NULL; assert(reopen_state != NULL); assert(reopen_state->bs != NULL); - reopen_state->opaque = g_new0(ThrottleGroupMember, 1); - tgm = reopen_state->opaque; - - return throttle_configure_tgm(reopen_state->bs, tgm, reopen_state->options, - errp); + ret = throttle_parse_options(reopen_state->options, &group, errp); + reopen_state->opaque = group; + return ret; } static void throttle_reopen_commit(BDRVReopenState *reopen_state) { - ThrottleGroupMember *old_tgm = reopen_state->bs->opaque; - ThrottleGroupMember *new_tgm = reopen_state->opaque; + BlockDriverState *bs = reopen_state->bs; + ThrottleGroupMember *tgm = bs->opaque; + char *group = reopen_state->opaque; + + assert(group); - throttle_group_unregister_tgm(old_tgm); - g_free(old_tgm); - reopen_state->bs->opaque = new_tgm; + if (strcmp(group, throttle_group_get_name(tgm))) { + throttle_group_unregister_tgm(tgm); + throttle_group_register_tgm(tgm, group, bdrv_get_aio_context(bs)); + } + g_free(reopen_state->opaque); reopen_state->opaque = NULL; } static void throttle_reopen_abort(BDRVReopenState *reopen_state) { - ThrottleGroupMember *tgm = reopen_state->opaque; - - throttle_group_unregister_tgm(tgm); - g_free(tgm); + g_free(reopen_state->opaque); reopen_state->opaque = NULL; } diff --git a/configure b/configure index 087a82ac50..ceec276693 100755 --- a/configure +++ b/configure @@ -930,6 +930,8 @@ for opt do ;; --firmwarepath=*) firmwarepath="$optarg" ;; + --host=*|--build=*|\ + --disable-dependency-tracking|\ --sbindir=*|--sharedstatedir=*|\ --oldincludedir=*|--datarootdir=*|--infodir=*|--localedir=*|\ --htmldir=*|--dvidir=*|--pdfdir=*|--psdir=*) @@ -2788,6 +2790,7 @@ if test "$sdl" != "no" ; then int main( void ) { return SDL_Init (SDL_INIT_VIDEO); } EOF sdl_cflags=$($sdlconfig --cflags 2>/dev/null) + sdl_cflags="$sdl_cflags -Wno-undef" # workaround 2.0.8 bug if test "$static" = "yes" ; then if $pkg_config $sdlname --exists; then sdl_libs=$($pkg_config 
$sdlname --static --libs 2>/dev/null) diff --git a/cpus.c b/cpus.c index 114c29b6a0..96bb688d7b 100644 --- a/cpus.c +++ b/cpus.c @@ -843,11 +843,19 @@ void qemu_timer_notify_cb(void *opaque, QEMUClockType type) return; } - if (!qemu_in_vcpu_thread() && first_cpu) { + if (qemu_in_vcpu_thread()) { + /* A CPU is currently running; kick it back out to the + * tcg_cpu_exec() loop so it will recalculate its + * icount deadline immediately. + */ + qemu_cpu_kick(current_cpu); + } else if (first_cpu) { /* qemu_cpu_kick is not enough to kick a halted CPU out of * qemu_tcg_wait_io_event. async_run_on_cpu, instead, * causes cpu_thread_is_idle to return false. This way, * handle_icount_deadline can run. + * If we have no CPUs at all for some reason, we don't + * need to do anything. */ async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL); } diff --git a/device_tree.c b/device_tree.c index a24ddff02b..9eb5fae738 100644 --- a/device_tree.c +++ b/device_tree.c @@ -29,7 +29,7 @@ #include <libfdt.h> -#define FDT_MAX_SIZE 0x10000 +#define FDT_MAX_SIZE 0x100000 void *create_device_tree(int *sizep) { diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt index d7fdb1fee3..feb711fb6a 100644 --- a/docs/interop/qcow2.txt +++ b/docs/interop/qcow2.txt @@ -426,10 +426,20 @@ Standard Cluster Descriptor: Compressed Clusters Descriptor (x = 62 - (cluster_bits - 8)): - Bit 0 - x: Host cluster offset. This is usually _not_ aligned to a - cluster boundary! + Bit 0 - x-1: Host cluster offset. This is usually _not_ aligned to a + cluster or sector boundary! - x+1 - 61: Compressed size of the images in sectors of 512 bytes + x - 61: Number of additional 512-byte sectors used for the + compressed data, beyond the sector containing the offset + in the previous field. Some of these sectors may reside + in the next contiguous host cluster. + + Note that the compressed data does not necessarily occupy + all of the bytes in the final sector; rather, decompression + stops when it has produced a cluster of data. + + Another compressed cluster may map to the tail of the final + sector used by this compressed cluster. If a cluster is unallocated, read requests shall read the data from the backing file (except if bit 0 in the Standard Cluster Descriptor is set). If there is diff --git a/exec.c b/exec.c index 03238a3449..e60ad94a42 100644 --- a/exec.c +++ b/exec.c @@ -1455,6 +1455,7 @@ static int find_max_supported_pagesize(Object *obj, void *opaque) mem_path = object_property_get_str(obj, "mem-path", NULL); if (mem_path) { long hpsize = qemu_mempath_getpagesize(mem_path); + g_free(mem_path); if (hpsize < *hpsize_min) { *hpsize_min = hpsize; } @@ -2575,6 +2576,8 @@ static const MemoryRegionOps watch_mem_ops = { }, }; +static MemTxResult flatview_read(FlatView *fv, hwaddr addr, + MemTxAttrs attrs, uint8_t *buf, int len); static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs, const uint8_t *buf, int len); static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len, @@ -3005,6 +3008,7 @@ static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr, return result; } +/* Called from RCU critical section. 
*/ static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs, const uint8_t *buf, int len) { @@ -3013,25 +3017,14 @@ static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs, MemoryRegion *mr; MemTxResult result = MEMTX_OK; - if (len > 0) { - rcu_read_lock(); - l = len; - mr = flatview_translate(fv, addr, &addr1, &l, true); - result = flatview_write_continue(fv, addr, attrs, buf, len, - addr1, l, mr); - rcu_read_unlock(); - } + l = len; + mr = flatview_translate(fv, addr, &addr1, &l, true); + result = flatview_write_continue(fv, addr, attrs, buf, len, + addr1, l, mr); return result; } -MemTxResult address_space_write(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, - const uint8_t *buf, int len) -{ - return flatview_write(address_space_to_flatview(as), addr, attrs, buf, len); -} - /* Called within RCU critical section. */ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, MemTxAttrs attrs, uint8_t *buf, @@ -3102,42 +3095,61 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, return result; } -MemTxResult flatview_read_full(FlatView *fv, hwaddr addr, - MemTxAttrs attrs, uint8_t *buf, int len) +/* Called from RCU critical section. */ +static MemTxResult flatview_read(FlatView *fv, hwaddr addr, + MemTxAttrs attrs, uint8_t *buf, int len) { hwaddr l; hwaddr addr1; MemoryRegion *mr; + + l = len; + mr = flatview_translate(fv, addr, &addr1, &l, false); + return flatview_read_continue(fv, addr, attrs, buf, len, + addr1, l, mr); +} + +MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr, + MemTxAttrs attrs, uint8_t *buf, int len) +{ MemTxResult result = MEMTX_OK; + FlatView *fv; if (len > 0) { rcu_read_lock(); - l = len; - mr = flatview_translate(fv, addr, &addr1, &l, false); - result = flatview_read_continue(fv, addr, attrs, buf, len, - addr1, l, mr); + fv = address_space_to_flatview(as); + result = flatview_read(fv, addr, attrs, buf, len); rcu_read_unlock(); } return result; } -static MemTxResult flatview_rw(FlatView *fv, hwaddr addr, MemTxAttrs attrs, - uint8_t *buf, int len, bool is_write) +MemTxResult address_space_write(AddressSpace *as, hwaddr addr, + MemTxAttrs attrs, + const uint8_t *buf, int len) { - if (is_write) { - return flatview_write(fv, addr, attrs, (uint8_t *)buf, len); - } else { - return flatview_read(fv, addr, attrs, (uint8_t *)buf, len); + MemTxResult result = MEMTX_OK; + FlatView *fv; + + if (len > 0) { + rcu_read_lock(); + fv = address_space_to_flatview(as); + result = flatview_write(fv, addr, attrs, buf, len); + rcu_read_unlock(); } + + return result; } -MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, uint8_t *buf, - int len, bool is_write) +MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, + uint8_t *buf, int len, bool is_write) { - return flatview_rw(address_space_to_flatview(as), - addr, attrs, buf, len, is_write); + if (is_write) { + return address_space_write(as, addr, attrs, buf, len); + } else { + return address_space_read_full(as, addr, attrs, buf, len); + } } void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf, @@ -3303,14 +3315,12 @@ static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len, MemoryRegion *mr; hwaddr l, xlat; - rcu_read_lock(); while (len > 0) { l = len; mr = flatview_translate(fv, addr, &xlat, &l, is_write); if (!memory_access_is_direct(mr, is_write)) { l = memory_access_size(mr, l, addr); if (!memory_region_access_valid(mr, xlat, l, is_write)) { - rcu_read_unlock(); return false; } } @@ -3318,15 
+3328,20 @@ static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len, len -= l; addr += l; } - rcu_read_unlock(); return true; } bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_write) { - return flatview_access_valid(address_space_to_flatview(as), - addr, len, is_write); + FlatView *fv; + bool result; + + rcu_read_lock(); + fv = address_space_to_flatview(as); + result = flatview_access_valid(fv, addr, len, is_write); + rcu_read_unlock(); + return result; } static hwaddr @@ -3372,7 +3387,7 @@ void *address_space_map(AddressSpace *as, hwaddr l, xlat; MemoryRegion *mr; void *ptr; - FlatView *fv = address_space_to_flatview(as); + FlatView *fv; if (len == 0) { return NULL; @@ -3380,6 +3395,7 @@ void *address_space_map(AddressSpace *as, l = len; rcu_read_lock(); + fv = address_space_to_flatview(as); mr = flatview_translate(fv, addr, &xlat, &l, is_write); if (!memory_access_is_direct(mr, is_write)) { diff --git a/gdbstub.c b/gdbstub.c index 2a94030d3b..ca8433e1b6 100644 --- a/gdbstub.c +++ b/gdbstub.c @@ -515,6 +515,7 @@ static inline int tohex(int v) return v - 10 + 'a'; } +/* writes 2*len+1 bytes in buf */ static void memtohex(char *buf, const uint8_t *mem, int len) { int i, c; @@ -970,8 +971,8 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf) const char *p; uint32_t thread; int ch, reg_size, type, res; - char buf[MAX_PACKET_LENGTH]; uint8_t mem_buf[MAX_PACKET_LENGTH]; + char buf[sizeof(mem_buf) + 1 /* trailing NUL */]; uint8_t *registers; target_ulong addr, len; diff --git a/hw/block/pflash_cfi01.c b/hw/block/pflash_cfi01.c index 1113ab1ccf..2e8284001d 100644 --- a/hw/block/pflash_cfi01.c +++ b/hw/block/pflash_cfi01.c @@ -90,7 +90,6 @@ struct pflash_t { uint16_t ident1; uint16_t ident2; uint16_t ident3; - uint8_t cfi_len; uint8_t cfi_table[0x52]; uint64_t counter; unsigned int writeblock_size; @@ -153,7 +152,7 @@ static uint32_t pflash_cfi_query(pflash_t *pfl, hwaddr offset) boff = offset >> (ctz32(pfl->bank_width) + ctz32(pfl->max_device_width) - ctz32(pfl->device_width)); - if (boff > pfl->cfi_len) { + if (boff >= sizeof(pfl->cfi_table)) { return 0; } /* Now we will construct the CFI response generated by a single @@ -385,10 +384,10 @@ static uint32_t pflash_read (pflash_t *pfl, hwaddr offset, boff = boff >> 2; } - if (boff > pfl->cfi_len) { - ret = 0; - } else { + if (boff < sizeof(pfl->cfi_table)) { ret = pfl->cfi_table[boff]; + } else { + ret = 0; } } else { /* If we have a read larger than the bank_width, combine multiple @@ -791,7 +790,6 @@ static void pflash_cfi01_realize(DeviceState *dev, Error **errp) pfl->cmd = 0; pfl->status = 0; /* Hardcoded CFI table */ - pfl->cfi_len = 0x52; /* Standard "QRY" string */ pfl->cfi_table[0x10] = 'Q'; pfl->cfi_table[0x11] = 'R'; diff --git a/hw/block/pflash_cfi02.c b/hw/block/pflash_cfi02.c index c81ddd3a99..75d1ae1026 100644 --- a/hw/block/pflash_cfi02.c +++ b/hw/block/pflash_cfi02.c @@ -83,7 +83,6 @@ struct pflash_t { uint16_t ident3; uint16_t unlock_addr0; uint16_t unlock_addr1; - uint8_t cfi_len; uint8_t cfi_table[0x52]; QEMUTimer *timer; /* The device replicates the flash memory across its memory space. 
Emulate @@ -235,10 +234,11 @@ static uint32_t pflash_read (pflash_t *pfl, hwaddr offset, break; case 0x98: /* CFI query mode */ - if (boff > pfl->cfi_len) - ret = 0; - else + if (boff < sizeof(pfl->cfi_table)) { ret = pfl->cfi_table[boff]; + } else { + ret = 0; + } break; } @@ -663,7 +663,6 @@ static void pflash_cfi02_realize(DeviceState *dev, Error **errp) pfl->cmd = 0; pfl->status = 0; /* Hardcoded CFI table (mostly from SG29 Spansion flash) */ - pfl->cfi_len = 0x52; /* Standard "QRY" string */ pfl->cfi_table[0x10] = 'Q'; pfl->cfi_table[0x11] = 'R'; diff --git a/hw/char/cmsdk-apb-uart.c b/hw/char/cmsdk-apb-uart.c index 1ad1e14295..9c0929d8a2 100644 --- a/hw/char/cmsdk-apb-uart.c +++ b/hw/char/cmsdk-apb-uart.c @@ -274,6 +274,7 @@ static void uart_write(void *opaque, hwaddr offset, uint64_t value, * is then reflected into the intstatus value by the update function). */ s->state &= ~(value & (R_INTSTATUS_TXO_MASK | R_INTSTATUS_RXO_MASK)); + s->intstatus &= ~value; cmsdk_apb_uart_update(s); break; case A_BAUDDIV: diff --git a/hw/core/loader.c b/hw/core/loader.c index 91669d65aa..c08f130461 100644 --- a/hw/core/loader.c +++ b/hw/core/loader.c @@ -1104,20 +1104,22 @@ int rom_check_and_register_reset(void) if (rom->fw_file) { continue; } - if ((addr > rom->addr) && (as == rom->as)) { - fprintf(stderr, "rom: requested regions overlap " - "(rom %s. free=0x" TARGET_FMT_plx - ", addr=0x" TARGET_FMT_plx ")\n", - rom->name, addr, rom->addr); - return -1; + if (!rom->mr) { + if ((addr > rom->addr) && (as == rom->as)) { + fprintf(stderr, "rom: requested regions overlap " + "(rom %s. free=0x" TARGET_FMT_plx + ", addr=0x" TARGET_FMT_plx ")\n", + rom->name, addr, rom->addr); + return -1; + } + addr = rom->addr; + addr += rom->romsize; + as = rom->as; } - addr = rom->addr; - addr += rom->romsize; section = memory_region_find(rom->mr ? 
rom->mr : get_system_memory(), rom->addr, 1); rom->isrom = int128_nz(section.size) && memory_region_is_rom(section.mr); memory_region_unref(section.mr); - as = rom->as; } qemu_register_reset(rom_reset, NULL); roms_loaded = 1; diff --git a/hw/core/qdev.c b/hw/core/qdev.c index 11112951a5..a71cd264e2 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -1140,6 +1140,30 @@ static void device_class_init(ObjectClass *class, void *data) dc->user_creatable = true; } +void device_class_set_parent_reset(DeviceClass *dc, + DeviceReset dev_reset, + DeviceReset *parent_reset) +{ + *parent_reset = dc->reset; + dc->reset = dev_reset; +} + +void device_class_set_parent_realize(DeviceClass *dc, + DeviceRealize dev_realize, + DeviceRealize *parent_realize) +{ + *parent_realize = dc->realize; + dc->realize = dev_realize; +} + +void device_class_set_parent_unrealize(DeviceClass *dc, + DeviceUnrealize dev_unrealize, + DeviceUnrealize *parent_unrealize) +{ + *parent_unrealize = dc->unrealize; + dc->unrealize = dev_unrealize; +} + void device_reset(DeviceState *dev) { DeviceClass *klass = DEVICE_GET_CLASS(dev); diff --git a/hw/display/qxl-render.c b/hw/display/qxl-render.c index 90e0865618..9c1c44481f 100644 --- a/hw/display/qxl-render.c +++ b/hw/display/qxl-render.c @@ -169,7 +169,8 @@ void qxl_render_update(PCIQXLDevice *qxl) qemu_mutex_lock(&qxl->ssd.lock); - if (!runstate_is_running() || !qxl->guest_primary.commands) { + if (!runstate_is_running() || !qxl->guest_primary.commands || + qxl->mode == QXL_MODE_UNDEFINED) { qxl_render_update_area_unlocked(qxl); qemu_mutex_unlock(&qxl->ssd.lock); return; diff --git a/hw/display/vga.c b/hw/display/vga.c index d150a3a3eb..1fa66d597d 100644 --- a/hw/display/vga.c +++ b/hw/display/vga.c @@ -1489,6 +1489,8 @@ static void vga_draw_graphic(VGACommonState *s, int full_update) region_start = (s->start_addr * 4); region_end = region_start + (ram_addr_t)s->line_offset * height; + region_end += width * s->get_bpp(s) / 8; /* scanline length */ + region_end -= s->line_offset; if (region_end > s->vbe_size) { /* wraps around (can happen with cirrus vbe modes) */ region_start = 0; diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 73519ab3ac..537957c89a 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2460,6 +2460,7 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker) AcpiDmarDeviceScope *scope = NULL; /* Root complex IOAPIC use one path[0] only */ size_t ioapic_scope_size = sizeof(*scope) + sizeof(scope->path[0]); + IntelIOMMUState *intel_iommu = INTEL_IOMMU_DEVICE(iommu); assert(iommu); if (iommu->intr_supported) { @@ -2467,7 +2468,7 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker) } dmar = acpi_data_push(table_data, sizeof(*dmar)); - dmar->host_address_width = VTD_HOST_ADDRESS_WIDTH - 1; + dmar->host_address_width = intel_iommu->aw_bits - 1; dmar->flags = dmar_flags; /* DMAR Remapping Hardware Unit Definition structure */ diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 3a5bb0bc2e..8e0c03e35d 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -128,6 +128,22 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, return new_val; } +static inline void vtd_iommu_lock(IntelIOMMUState *s) +{ + qemu_mutex_lock(&s->iommu_lock); +} + +static inline void vtd_iommu_unlock(IntelIOMMUState *s) +{ + qemu_mutex_unlock(&s->iommu_lock); +} + +/* Whether the address space needs to notify new mappings */ +static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) +{ + return as->notifier_flags & 
IOMMU_NOTIFIER_MAP; +} + /* GHashTable functions */ static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) { @@ -172,9 +188,9 @@ static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, } /* Reset all the gen of VTDAddressSpace to zero and set the gen of - * IntelIOMMUState to 1. + * IntelIOMMUState to 1. Must be called with IOMMU lock held. */ -static void vtd_reset_context_cache(IntelIOMMUState *s) +static void vtd_reset_context_cache_locked(IntelIOMMUState *s) { VTDAddressSpace *vtd_as; VTDBus *vtd_bus; @@ -197,12 +213,20 @@ static void vtd_reset_context_cache(IntelIOMMUState *s) s->context_cache_gen = 1; } -static void vtd_reset_iotlb(IntelIOMMUState *s) +/* Must be called with IOMMU lock held. */ +static void vtd_reset_iotlb_locked(IntelIOMMUState *s) { assert(s->iotlb); g_hash_table_remove_all(s->iotlb); } +static void vtd_reset_iotlb(IntelIOMMUState *s) +{ + vtd_iommu_lock(s); + vtd_reset_iotlb_locked(s); + vtd_iommu_unlock(s); +} + static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, uint32_t level) { @@ -215,6 +239,7 @@ static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; } +/* Must be called with IOMMU lock held */ static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, hwaddr addr) { @@ -235,6 +260,7 @@ out: return entry; } +/* Must be with IOMMU lock held */ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, uint16_t domain_id, hwaddr addr, uint64_t slpte, uint8_t access_flags, uint32_t level) @@ -246,7 +272,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { trace_vtd_iotlb_reset("iotlb exceeds size limit"); - vtd_reset_iotlb(s); + vtd_reset_iotlb_locked(s); } entry->gfn = gfn; @@ -521,9 +547,9 @@ static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce) return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR; } -static inline uint64_t vtd_get_slpte_addr(uint64_t slpte) +static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw) { - return slpte & VTD_SL_PT_BASE_ADDR_MASK; + return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw); } /* Whether the pte indicates the address of the page frame */ @@ -608,35 +634,29 @@ static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu, return true; } -static inline uint64_t vtd_iova_limit(VTDContextEntry *ce) +static inline uint64_t vtd_iova_limit(VTDContextEntry *ce, uint8_t aw) { uint32_t ce_agaw = vtd_ce_get_agaw(ce); - return 1ULL << MIN(ce_agaw, VTD_MGAW); + return 1ULL << MIN(ce_agaw, aw); } /* Return true if IOVA passes range check, otherwise false. */ -static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce) +static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce, + uint8_t aw) { /* * Check if @iova is above 2^X-1, where X is the minimum of MGAW * in CAP_REG and AW in context-entry. 
*/ - return !(iova & ~(vtd_iova_limit(ce) - 1)); -} - -static const uint64_t vtd_paging_entry_rsvd_field[] = { - [0] = ~0ULL, - /* For not large page */ - [1] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM), - [2] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM), - [3] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM), - [4] = 0x880ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM), - /* For large page */ - [5] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM), - [6] = 0x1ff800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM), - [7] = 0x3ffff800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM), - [8] = 0x880ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM), -}; + return !(iova & ~(vtd_iova_limit(ce, aw) - 1)); +} + +/* + * Rsvd field masks for spte: + * Index [1] to [4] 4k pages + * Index [5] to [8] large pages + */ +static uint64_t vtd_paging_entry_rsvd_field[9]; static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) { @@ -676,7 +696,7 @@ static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) */ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, uint64_t *slptep, uint32_t *slpte_level, - bool *reads, bool *writes) + bool *reads, bool *writes, uint8_t aw_bits) { dma_addr_t addr = vtd_ce_get_slpt_base(ce); uint32_t level = vtd_ce_get_level(ce); @@ -684,7 +704,7 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, uint64_t slpte; uint64_t access_right_check; - if (!vtd_iova_range_check(iova, ce)) { + if (!vtd_iova_range_check(iova, ce, aw_bits)) { trace_vtd_err_dmar_iova_overflow(iova); return -VTD_FR_ADDR_BEYOND_MGAW; } @@ -721,7 +741,7 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, *slpte_level = level; return 0; } - addr = vtd_get_slpte_addr(slpte); + addr = vtd_get_slpte_addr(slpte, aw_bits); level--; } } @@ -729,21 +749,116 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); /** + * Constant information used during page walking + * + * @hook_fn: hook func to be called when detected page + * @private: private data to be passed into hook func + * @notify_unmap: whether we should notify invalid entries + * @as: VT-d address space of the device + * @aw: maximum address width + * @domain: domain ID of the page walk + */ +typedef struct { + VTDAddressSpace *as; + vtd_page_walk_hook hook_fn; + void *private; + bool notify_unmap; + uint8_t aw; + uint16_t domain_id; +} vtd_page_walk_info; + +static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) +{ + VTDAddressSpace *as = info->as; + vtd_page_walk_hook hook_fn = info->hook_fn; + void *private = info->private; + DMAMap target = { + .iova = entry->iova, + .size = entry->addr_mask, + .translated_addr = entry->translated_addr, + .perm = entry->perm, + }; + DMAMap *mapped = iova_tree_find(as->iova_tree, &target); + + if (entry->perm == IOMMU_NONE && !info->notify_unmap) { + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } + + assert(hook_fn); + + /* Update local IOVA mapped ranges */ + if (entry->perm) { + if (mapped) { + /* If it's exactly the same translation, skip */ + if (!memcmp(mapped, &target, sizeof(target))) { + trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask, + entry->translated_addr); + return 0; + } else { + /* + * Translation changed. Normally this should not + * happen, but it can happen when with buggy guest + * OSes. Note that there will be a small window that + * we don't have map at all. 
But that's the best + * effort we can do. The ideal way to emulate this is + * atomically modify the PTE to follow what has + * changed, but we can't. One example is that vfio + * driver only has VFIO_IOMMU_[UN]MAP_DMA but no + * interface to modify a mapping (meanwhile it seems + * meaningless to even provide one). Anyway, let's + * mark this as a TODO in case one day we'll have + * a better solution. + */ + IOMMUAccessFlags cache_perm = entry->perm; + int ret; + + /* Emulate an UNMAP */ + entry->perm = IOMMU_NONE; + trace_vtd_page_walk_one(info->domain_id, + entry->iova, + entry->translated_addr, + entry->addr_mask, + entry->perm); + ret = hook_fn(entry, private); + if (ret) { + return ret; + } + /* Drop any existing mapping */ + iova_tree_remove(as->iova_tree, &target); + /* Recover the correct permission */ + entry->perm = cache_perm; + } + } + iova_tree_insert(as->iova_tree, &target); + } else { + if (!mapped) { + /* Skip since we didn't map this range at all */ + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } + iova_tree_remove(as->iova_tree, &target); + } + + trace_vtd_page_walk_one(info->domain_id, entry->iova, + entry->translated_addr, entry->addr_mask, + entry->perm); + return hook_fn(entry, private); +} + +/** * vtd_page_walk_level - walk over specific level for IOVA range * * @addr: base GPA addr to start the walk * @start: IOVA range start address * @end: IOVA range end address (start <= addr < end) - * @hook_fn: hook func to be called when detected page - * @private: private data to be passed into hook func * @read: whether parent level has read permission * @write: whether parent level has write permission - * @notify_unmap: whether we should notify invalid entries + * @info: constant information for the page walk */ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, - uint64_t end, vtd_page_walk_hook hook_fn, - void *private, uint32_t level, - bool read, bool write, bool notify_unmap) + uint64_t end, uint32_t level, bool read, + bool write, vtd_page_walk_info *info) { bool read_cur, write_cur, entry_valid; uint32_t offset; @@ -786,37 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, */ entry_valid = read_cur | write_cur; - if (vtd_is_last_slpte(slpte, level)) { + if (!vtd_is_last_slpte(slpte, level) && entry_valid) { + /* + * This is a valid PDE (or even bigger than PDE). We need + * to walk one further level. + */ + ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), + iova, MIN(iova_next, end), level - 1, + read_cur, write_cur, info); + } else { + /* + * This means we are either: + * + * (1) the real page entry (either 4K page, or huge page) + * (2) the whole range is invalid + * + * In either case, we send an IOTLB notification down. 
+ */ entry.target_as = &address_space_memory; entry.iova = iova & subpage_mask; - /* NOTE: this is only meaningful if entry_valid == true */ - entry.translated_addr = vtd_get_slpte_addr(slpte); - entry.addr_mask = ~subpage_mask; entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); - if (!entry_valid && !notify_unmap) { - trace_vtd_page_walk_skip_perm(iova, iova_next); - goto next; - } - trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr, - entry.addr_mask, entry.perm); - if (hook_fn) { - ret = hook_fn(&entry, private); - if (ret < 0) { - return ret; - } - } - } else { - if (!entry_valid) { - trace_vtd_page_walk_skip_perm(iova, iova_next); - goto next; - } - ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte), iova, - MIN(iova_next, end), hook_fn, private, - level - 1, read_cur, write_cur, - notify_unmap); - if (ret < 0) { - return ret; - } + entry.addr_mask = ~subpage_mask; + /* NOTE: this is only meaningful if entry_valid == true */ + entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); + ret = vtd_page_walk_one(&entry, info); + } + + if (ret < 0) { + return ret; } next: @@ -832,27 +944,24 @@ next: * @ce: context entry to walk upon * @start: IOVA address to start the walk * @end: IOVA range end address (start <= addr < end) - * @hook_fn: the hook that to be called for each detected area - * @private: private data for the hook function + * @info: page walking information struct */ static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end, - vtd_page_walk_hook hook_fn, void *private, - bool notify_unmap) + vtd_page_walk_info *info) { dma_addr_t addr = vtd_ce_get_slpt_base(ce); uint32_t level = vtd_ce_get_level(ce); - if (!vtd_iova_range_check(start, ce)) { + if (!vtd_iova_range_check(start, ce, info->aw)) { return -VTD_FR_ADDR_BEYOND_MGAW; } - if (!vtd_iova_range_check(end, ce)) { + if (!vtd_iova_range_check(end, ce, info->aw)) { /* Fix end so that it reaches the maximum */ - end = vtd_iova_limit(ce); + end = vtd_iova_limit(ce, info->aw); } - return vtd_page_walk_level(addr, start, end, hook_fn, private, - level, true, true, notify_unmap); + return vtd_page_walk_level(addr, start, end, level, true, true, info); } /* Map a device to its corresponding domain (context-entry) */ @@ -874,7 +983,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, return -VTD_FR_ROOT_ENTRY_P; } - if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD)) { + if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD(s->aw_bits))) { trace_vtd_re_invalid(re.rsvd, re.val); return -VTD_FR_ROOT_ENTRY_RSVD; } @@ -891,7 +1000,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, } if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) || - (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) { + (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) { trace_vtd_ce_invalid(ce->hi, ce->lo); return -VTD_FR_CONTEXT_ENTRY_RSVD; } @@ -911,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, return 0; } +static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry, + void *private) +{ + memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); + return 0; +} + +/* If context entry is NULL, we'll try to fetch it on our own. 
*/ +static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, + VTDContextEntry *ce, + hwaddr addr, hwaddr size) +{ + IntelIOMMUState *s = vtd_as->iommu_state; + vtd_page_walk_info info = { + .hook_fn = vtd_sync_shadow_page_hook, + .private = (void *)&vtd_as->iommu, + .notify_unmap = true, + .aw = s->aw_bits, + .as = vtd_as, + }; + VTDContextEntry ce_cache; + int ret; + + if (ce) { + /* If the caller provided context entry, use it */ + ce_cache = *ce; + } else { + /* If the caller didn't provide ce, try to fetch */ + ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce_cache); + if (ret) { + /* + * This should not really happen, but in case it happens, + * we just skip the sync for this time. After all we even + * don't have the root table pointer! + */ + trace_vtd_err("Detected invalid context entry when " + "trying to sync shadow page table"); + return 0; + } + } + + info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi); + + return vtd_page_walk(&ce_cache, addr, addr + size, &info); +} + +static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) +{ + return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX); +} + /* * Fetch translation type for specific device. Returns <0 if error * happens, otherwise return the shifted type to check against @@ -1092,7 +1253,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, IntelIOMMUState *s = vtd_as->iommu_state; VTDContextEntry ce; uint8_t bus_num = pci_bus_num(bus); - VTDContextCacheEntry *cc_entry = &vtd_as->context_cache_entry; + VTDContextCacheEntry *cc_entry; uint64_t slpte, page_mask; uint32_t level; uint16_t source_id = vtd_make_source_id(bus_num, devfn); @@ -1109,6 +1270,10 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, */ assert(!vtd_is_interrupt_addr(addr)); + vtd_iommu_lock(s); + + cc_entry = &vtd_as->context_cache_entry; + /* Try to fetch slpte form IOTLB */ iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); if (iotlb_entry) { @@ -1168,12 +1333,12 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, * IOMMU region can be swapped back. 
*/ vtd_pt_enable_fast_path(s, source_id); - + vtd_iommu_unlock(s); return true; } ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level, - &reads, &writes); + &reads, &writes, s->aw_bits); if (ret_fr) { ret_fr = -ret_fr; if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { @@ -1189,13 +1354,15 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte, access_flags, level); out: + vtd_iommu_unlock(s); entry->iova = addr & page_mask; - entry->translated_addr = vtd_get_slpte_addr(slpte) & page_mask; + entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask; entry->addr_mask = ~page_mask; entry->perm = access_flags; return true; error: + vtd_iommu_unlock(s); entry->iova = 0; entry->translated_addr = 0; entry->addr_mask = 0; @@ -1207,7 +1374,7 @@ static void vtd_root_table_setup(IntelIOMMUState *s) { s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG); s->root_extended = s->root & VTD_RTADDR_RTT; - s->root &= VTD_RTADDR_ADDR_MASK; + s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits); trace_vtd_reg_dmar_root(s->root, s->root_extended); } @@ -1223,7 +1390,7 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) uint64_t value = 0; value = vtd_get_quad_raw(s, DMAR_IRTA_REG); s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1); - s->intr_root = value & VTD_IRTA_ADDR_MASK; + s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits); s->intr_eime = value & VTD_IRTA_EIME; /* Notify global invalidation */ @@ -1234,20 +1401,23 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) static void vtd_iommu_replay_all(IntelIOMMUState *s) { - IntelIOMMUNotifierNode *node; + VTDAddressSpace *vtd_as; - QLIST_FOREACH(node, &s->notifiers_list, next) { - memory_region_iommu_replay_all(&node->vtd_as->iommu); + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { + vtd_sync_shadow_page_table(vtd_as); } } static void vtd_context_global_invalidate(IntelIOMMUState *s) { trace_vtd_inv_desc_cc_global(); + /* Protects context cache */ + vtd_iommu_lock(s); s->context_cache_gen++; if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { - vtd_reset_context_cache(s); + vtd_reset_context_cache_locked(s); } + vtd_iommu_unlock(s); vtd_switch_address_space_all(s); /* * From VT-d spec 6.5.2.1, a global context entry invalidation @@ -1299,7 +1469,9 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), VTD_PCI_FUNC(devfn_it)); + vtd_iommu_lock(s); vtd_as->context_cache_entry.context_cache_gen = 0; + vtd_iommu_unlock(s); /* * Do switch address space when needed, in case if the * device passthrough bit is switched. @@ -1307,14 +1479,13 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, vtd_switch_address_space(vtd_as); /* * So a device is moving out of (or moving into) a - * domain, a replay() suites here to notify all the - * IOMMU_NOTIFIER_MAP registers about this change. + * domain, resync the shadow page table. * This won't bring bad even if we have no such * notifier registered - the IOMMU notification * framework will skip MAP notifications if that * happened. 
*/ - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } } @@ -1358,48 +1529,60 @@ static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) { - IntelIOMMUNotifierNode *node; VTDContextEntry ce; VTDAddressSpace *vtd_as; trace_vtd_inv_desc_iotlb_domain(domain_id); + vtd_iommu_lock(s); g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, &domain_id); + vtd_iommu_unlock(s); - QLIST_FOREACH(node, &s->notifiers_list, next) { - vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce) && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } } -static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry, - void *private) -{ - memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); - return 0; -} - static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, uint16_t domain_id, hwaddr addr, uint8_t am) { - IntelIOMMUNotifierNode *node; + VTDAddressSpace *vtd_as; VTDContextEntry ce; int ret; + hwaddr size = (1 << am) * VTD_PAGE_SIZE; - QLIST_FOREACH(node, &(s->notifiers_list), next) { - VTDAddressSpace *vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { - vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE, - vtd_page_invalidate_notify_hook, - (void *)&vtd_as->iommu, true); + if (vtd_as_has_map_notifier(vtd_as)) { + /* + * As long as we have MAP notifications registered in + * any of our IOMMU notifiers, we need to sync the + * shadow page table. + */ + vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); + } else { + /* + * For UNMAP-only notifiers, we don't need to walk the + * page tables. We just deliver the PSI down to + * invalidate caches. 
+ */ + IOMMUTLBEntry entry = { + .target_as = &address_space_memory, + .iova = addr, + .translated_addr = 0, + .addr_mask = size - 1, + .perm = IOMMU_NONE, + }; + memory_region_notify_iommu(&vtd_as->iommu, entry); + } } } } @@ -1415,7 +1598,9 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, info.domain_id = domain_id; info.addr = addr; info.mask = ~((1 << am) - 1); + vtd_iommu_lock(s); g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); + vtd_iommu_unlock(s); vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); } @@ -1479,7 +1664,7 @@ static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en) trace_vtd_inv_qi_enable(en); if (en) { - s->iq = iqa_val & VTD_IQA_IQA_MASK; + s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits); /* 2^(x+8) entries */ s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8); s->qi_enabled = true; @@ -2323,8 +2508,6 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, { VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); IntelIOMMUState *s = vtd_as->iommu_state; - IntelIOMMUNotifierNode *node = NULL; - IntelIOMMUNotifierNode *next_node = NULL; if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) { error_report("We need to set cache_mode=1 for intel-iommu to enable " @@ -2332,22 +2515,13 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, exit(1); } - if (old == IOMMU_NOTIFIER_NONE) { - node = g_malloc0(sizeof(*node)); - node->vtd_as = vtd_as; - QLIST_INSERT_HEAD(&s->notifiers_list, node, next); - return; - } + /* Update per-address-space notifier flags */ + vtd_as->notifier_flags = new; - /* update notifier node with new flags */ - QLIST_FOREACH_SAFE(node, &s->notifiers_list, next, next_node) { - if (node->vtd_as == vtd_as) { - if (new == IOMMU_NOTIFIER_NONE) { - QLIST_REMOVE(node, next); - g_free(node); - } - return; - } + if (old == IOMMU_NOTIFIER_NONE) { + QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); + } else if (new == IOMMU_NOTIFIER_NONE) { + QLIST_REMOVE(vtd_as, next); } } @@ -2410,6 +2584,8 @@ static Property vtd_properties[] = { DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim, ON_OFF_AUTO_AUTO), DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false), + DEFINE_PROP_UINT8("x-aw-bits", IntelIOMMUState, aw_bits, + VTD_HOST_ADDRESS_WIDTH), DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), DEFINE_PROP_END_OF_LIST(), }; @@ -2714,6 +2890,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) vtd_dev_as->devfn = (uint8_t)devfn; vtd_dev_as->iommu_state = s; vtd_dev_as->context_cache_entry.context_cache_gen = 0; + vtd_dev_as->iova_tree = iova_tree_new(); /* * Memory region relationships looks like (Address range shows @@ -2765,6 +2942,8 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) hwaddr size; hwaddr start = n->start; hwaddr end = n->end; + IntelIOMMUState *s = as->iommu_state; + DMAMap map; /* * Note: all the codes in this function has a assumption that IOVA @@ -2772,12 +2951,12 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) * VT-d spec), otherwise we need to consider overflow of 64 bits. 
*/ - if (end > VTD_ADDRESS_SIZE) { + if (end > VTD_ADDRESS_SIZE(s->aw_bits)) { /* * Don't need to unmap regions that is bigger than the whole * VT-d supported address space size */ - end = VTD_ADDRESS_SIZE; + end = VTD_ADDRESS_SIZE(s->aw_bits); } assert(start <= end); @@ -2789,9 +2968,9 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) * suite the minimum available mask. */ int n = 64 - clz64(size); - if (n > VTD_MGAW) { + if (n > s->aw_bits) { /* should not happen, but in case it happens, limit it */ - n = VTD_MGAW; + n = s->aw_bits; } size = 1ULL << n; } @@ -2809,17 +2988,19 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) VTD_PCI_FUNC(as->devfn), entry.iova, size); + map.iova = entry.iova; + map.size = entry.addr_mask; + iova_tree_remove(as->iova_tree, &map); + memory_region_notify_one(n, &entry); } static void vtd_address_space_unmap_all(IntelIOMMUState *s) { - IntelIOMMUNotifierNode *node; VTDAddressSpace *vtd_as; IOMMUNotifier *n; - QLIST_FOREACH(node, &s->notifiers_list, next) { - vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { vtd_address_space_unmap(vtd_as, n); } @@ -2851,7 +3032,19 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) PCI_FUNC(vtd_as->devfn), VTD_CONTEXT_ENTRY_DID(ce.hi), ce.hi, ce.lo); - vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false); + if (vtd_as_has_map_notifier(vtd_as)) { + /* This is required only for MAP typed notifiers */ + vtd_page_walk_info info = { + .hook_fn = vtd_replay_hook, + .private = (void *)n, + .notify_unmap = false, + .aw = s->aw_bits, + .as = vtd_as, + .domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi), + }; + + vtd_page_walk(&ce, 0, ~0ULL, &info); + } } else { trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), PCI_FUNC(vtd_as->devfn)); @@ -2882,10 +3075,27 @@ static void vtd_init(IntelIOMMUState *s) s->qi_enabled = false; s->iq_last_desc_type = VTD_INV_DESC_NONE; s->next_frcd_reg = 0; - s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW | - VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS; + s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | + VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | + VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits); + if (s->aw_bits == VTD_HOST_AW_48BIT) { + s->cap |= VTD_CAP_SAGAW_48bit; + } s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; + /* + * Rsvd field masks for spte + */ + vtd_paging_entry_rsvd_field[0] = ~0ULL; + vtd_paging_entry_rsvd_field[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits); + vtd_paging_entry_rsvd_field[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); + vtd_paging_entry_rsvd_field[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); + vtd_paging_entry_rsvd_field[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); + vtd_paging_entry_rsvd_field[5] = VTD_SPTE_LPAGE_L1_RSVD_MASK(s->aw_bits); + vtd_paging_entry_rsvd_field[6] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits); + vtd_paging_entry_rsvd_field[7] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits); + vtd_paging_entry_rsvd_field[8] = VTD_SPTE_LPAGE_L4_RSVD_MASK(s->aw_bits); + if (x86_iommu->intr_supported) { s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV; if (s->intr_eim == ON_OFF_AUTO_ON) { @@ -2906,8 +3116,10 @@ static void vtd_init(IntelIOMMUState *s) s->cap |= VTD_CAP_CM; } - vtd_reset_context_cache(s); - vtd_reset_iotlb(s); + vtd_iommu_lock(s); + vtd_reset_context_cache_locked(s); + vtd_reset_iotlb_locked(s); + vtd_iommu_unlock(s); /* Define registers with default values and bit semantics */ 
vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); @@ -3021,6 +3233,14 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) } } + /* Currently only address widths supported are 39 and 48 bits */ + if ((s->aw_bits != VTD_HOST_AW_39BIT) && + (s->aw_bits != VTD_HOST_AW_48BIT)) { + error_setg(errp, "Supported values for x-aw-bits are: %d, %d", + VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT); + return false; + } + return true; } @@ -3047,7 +3267,8 @@ static void vtd_realize(DeviceState *dev, Error **errp) return; } - QLIST_INIT(&s->notifiers_list); + QLIST_INIT(&s->vtd_as_with_notifiers); + qemu_mutex_init(&s->iommu_lock); memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, "intel_iommu", DMAR_REG_SIZE); diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 0e73a65bf2..d084099ed9 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -131,7 +131,7 @@ #define VTD_TLB_DID(val) (((val) >> 32) & VTD_DOMAIN_ID_MASK) /* IVA_REG */ -#define VTD_IVA_ADDR(val) ((val) & ~0xfffULL & ((1ULL << VTD_MGAW) - 1)) +#define VTD_IVA_ADDR(val) ((val) & ~0xfffULL) #define VTD_IVA_AM(val) ((val) & 0x3fULL) /* GCMD_REG */ @@ -172,10 +172,10 @@ /* RTADDR_REG */ #define VTD_RTADDR_RTT (1ULL << 11) -#define VTD_RTADDR_ADDR_MASK (VTD_HAW_MASK ^ 0xfffULL) +#define VTD_RTADDR_ADDR_MASK(aw) (VTD_HAW_MASK(aw) ^ 0xfffULL) /* IRTA_REG */ -#define VTD_IRTA_ADDR_MASK (VTD_HAW_MASK ^ 0xfffULL) +#define VTD_IRTA_ADDR_MASK(aw) (VTD_HAW_MASK(aw) ^ 0xfffULL) #define VTD_IRTA_EIME (1ULL << 11) #define VTD_IRTA_SIZE_MASK (0xfULL) @@ -197,9 +197,8 @@ #define VTD_DOMAIN_ID_SHIFT 16 /* 16-bit domain id for 64K domains */ #define VTD_DOMAIN_ID_MASK ((1UL << VTD_DOMAIN_ID_SHIFT) - 1) #define VTD_CAP_ND (((VTD_DOMAIN_ID_SHIFT - 4) / 2) & 7ULL) -#define VTD_MGAW 39 /* Maximum Guest Address Width */ -#define VTD_ADDRESS_SIZE (1ULL << VTD_MGAW) -#define VTD_CAP_MGAW (((VTD_MGAW - 1) & 0x3fULL) << 16) +#define VTD_ADDRESS_SIZE(aw) (1ULL << (aw)) +#define VTD_CAP_MGAW(aw) ((((aw) - 1) & 0x3fULL) << 16) #define VTD_MAMV 18ULL #define VTD_CAP_MAMV (VTD_MAMV << 48) #define VTD_CAP_PSI (1ULL << 39) @@ -213,13 +212,12 @@ #define VTD_CAP_SAGAW_39bit (0x2ULL << VTD_CAP_SAGAW_SHIFT) /* 48-bit AGAW, 4-level page-table */ #define VTD_CAP_SAGAW_48bit (0x4ULL << VTD_CAP_SAGAW_SHIFT) -#define VTD_CAP_SAGAW VTD_CAP_SAGAW_39bit /* IQT_REG */ #define VTD_IQT_QT(val) (((val) >> 4) & 0x7fffULL) /* IQA_REG */ -#define VTD_IQA_IQA_MASK (VTD_HAW_MASK ^ 0xfffULL) +#define VTD_IQA_IQA_MASK(aw) (VTD_HAW_MASK(aw) ^ 0xfffULL) #define VTD_IQA_QS 0x7ULL /* IQH_REG */ @@ -252,7 +250,7 @@ #define VTD_FRCD_SID_MASK 0xffffULL #define VTD_FRCD_SID(val) ((val) & VTD_FRCD_SID_MASK) /* For the low 64-bit of 128-bit */ -#define VTD_FRCD_FI(val) ((val) & (((1ULL << VTD_MGAW) - 1) ^ 0xfffULL)) +#define VTD_FRCD_FI(val) ((val) & ~0xfffULL) /* DMA Remapping Fault Conditions */ typedef enum VTDFaultReason { @@ -360,8 +358,7 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_IOTLB_DOMAIN (2ULL << 4) #define VTD_INV_DESC_IOTLB_PAGE (3ULL << 4) #define VTD_INV_DESC_IOTLB_DID(val) (((val) >> 16) & VTD_DOMAIN_ID_MASK) -#define VTD_INV_DESC_IOTLB_ADDR(val) ((val) & ~0xfffULL & \ - ((1ULL << VTD_MGAW) - 1)) +#define VTD_INV_DESC_IOTLB_ADDR(val) ((val) & ~0xfffULL) #define VTD_INV_DESC_IOTLB_AM(val) ((val) & 0x3fULL) #define VTD_INV_DESC_IOTLB_RSVD_LO 0xffffffff0000ff00ULL #define VTD_INV_DESC_IOTLB_RSVD_HI 0xf80ULL @@ -373,6 +370,24 @@ typedef union 
VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI 0xffeULL #define VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO 0xffff0000ffe0fff8 +/* Rsvd field masks for spte */ +#define VTD_SPTE_PAGE_L1_RSVD_MASK(aw) \ + (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) +#define VTD_SPTE_PAGE_L2_RSVD_MASK(aw) \ + (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) +#define VTD_SPTE_PAGE_L3_RSVD_MASK(aw) \ + (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) +#define VTD_SPTE_PAGE_L4_RSVD_MASK(aw) \ + (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) +#define VTD_SPTE_LPAGE_L1_RSVD_MASK(aw) \ + (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) +#define VTD_SPTE_LPAGE_L2_RSVD_MASK(aw) \ + (0x1ff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) +#define VTD_SPTE_LPAGE_L3_RSVD_MASK(aw) \ + (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) +#define VTD_SPTE_LPAGE_L4_RSVD_MASK(aw) \ + (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) + /* Information about page-selective IOTLB invalidate */ struct VTDIOTLBPageInvInfo { uint16_t domain_id; @@ -403,7 +418,7 @@ typedef struct VTDRootEntry VTDRootEntry; #define VTD_ROOT_ENTRY_CTP (~0xfffULL) #define VTD_ROOT_ENTRY_NR (VTD_PAGE_SIZE / sizeof(VTDRootEntry)) -#define VTD_ROOT_ENTRY_RSVD (0xffeULL | ~VTD_HAW_MASK) +#define VTD_ROOT_ENTRY_RSVD(aw) (0xffeULL | ~VTD_HAW_MASK(aw)) /* Masks for struct VTDContextEntry */ /* lo */ @@ -415,7 +430,7 @@ typedef struct VTDRootEntry VTDRootEntry; #define VTD_CONTEXT_TT_PASS_THROUGH (2ULL << 2) /* Second Level Page Translation Pointer*/ #define VTD_CONTEXT_ENTRY_SLPTPTR (~0xfffULL) -#define VTD_CONTEXT_ENTRY_RSVD_LO (0xff0ULL | ~VTD_HAW_MASK) +#define VTD_CONTEXT_ENTRY_RSVD_LO(aw) (0xff0ULL | ~VTD_HAW_MASK(aw)) /* hi */ #define VTD_CONTEXT_ENTRY_AW 7ULL /* Adjusted guest-address-width */ #define VTD_CONTEXT_ENTRY_DID(val) (((val) >> 8) & VTD_DOMAIN_ID_MASK) @@ -439,7 +454,7 @@ typedef struct VTDRootEntry VTDRootEntry; #define VTD_SL_RW_MASK 3ULL #define VTD_SL_R 1ULL #define VTD_SL_W (1ULL << 1) -#define VTD_SL_PT_BASE_ADDR_MASK (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK) +#define VTD_SL_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw)) #define VTD_SL_IGN_COM 0xbff0000000000000ULL #endif diff --git a/hw/i386/multiboot.c b/hw/i386/multiboot.c index c7b70c91d5..36b22832cd 100644 --- a/hw/i386/multiboot.c +++ b/hw/i386/multiboot.c @@ -31,12 +31,13 @@ #include "hw/loader.h" #include "elf.h" #include "sysemu/sysemu.h" +#include "qemu/error-report.h" /* Show multiboot debug output */ //#define DEBUG_MULTIBOOT #ifdef DEBUG_MULTIBOOT -#define mb_debug(a...) fprintf(stderr, ## a) +#define mb_debug(a...) error_report(a) #else #define mb_debug(a...) #endif @@ -137,7 +138,7 @@ static void mb_add_mod(MultibootState *s, stl_p(p + MB_MOD_END, end); stl_p(p + MB_MOD_CMDLINE, cmdline_phys); - mb_debug("mod%02d: "TARGET_FMT_plx" - "TARGET_FMT_plx"\n", + mb_debug("mod%02d: "TARGET_FMT_plx" - "TARGET_FMT_plx, s->mb_mods_count, start, end); s->mb_mods_count++; @@ -179,12 +180,12 @@ int load_multiboot(FWCfgState *fw_cfg, if (!is_multiboot) return 0; /* no multiboot */ - mb_debug("qemu: I believe we found a multiboot image!\n"); + mb_debug("qemu: I believe we found a multiboot image!"); memset(bootinfo, 0, sizeof(bootinfo)); memset(&mbs, 0, sizeof(mbs)); if (flags & 0x00000004) { /* MULTIBOOT_HEADER_HAS_VBE */ - fprintf(stderr, "qemu: multiboot knows VBE. we don't.\n"); + error_report("qemu: multiboot knows VBE. 
we don't."); } if (!(flags & 0x00010000)) { /* MULTIBOOT_HEADER_HAS_ADDR */ uint64_t elf_entry; @@ -193,7 +194,7 @@ int load_multiboot(FWCfgState *fw_cfg, fclose(f); if (((struct elf64_hdr*)header)->e_machine == EM_X86_64) { - fprintf(stderr, "Cannot load x86-64 image, give a 32bit one.\n"); + error_report("Cannot load x86-64 image, give a 32bit one."); exit(1); } @@ -201,7 +202,7 @@ int load_multiboot(FWCfgState *fw_cfg, &elf_low, &elf_high, 0, I386_ELF_MACHINE, 0, 0); if (kernel_size < 0) { - fprintf(stderr, "Error while loading elf kernel\n"); + error_report("Error while loading elf kernel"); exit(1); } mh_load_addr = elf_low; @@ -210,12 +211,13 @@ int load_multiboot(FWCfgState *fw_cfg, mbs.mb_buf = g_malloc(mb_kernel_size); if (rom_copy(mbs.mb_buf, mh_load_addr, mb_kernel_size) != mb_kernel_size) { - fprintf(stderr, "Error while fetching elf kernel from rom\n"); + error_report("Error while fetching elf kernel from rom"); exit(1); } - mb_debug("qemu: loading multiboot-elf kernel (%#x bytes) with entry %#zx\n", - mb_kernel_size, (size_t)mh_entry_addr); + mb_debug("qemu: loading multiboot-elf kernel " + "(%#x bytes) with entry %#zx", + mb_kernel_size, (size_t)mh_entry_addr); } else { /* Valid if mh_flags sets MULTIBOOT_HEADER_HAS_ADDR. */ uint32_t mh_header_addr = ldl_p(header+i+12); @@ -224,7 +226,11 @@ int load_multiboot(FWCfgState *fw_cfg, mh_load_addr = ldl_p(header+i+16); if (mh_header_addr < mh_load_addr) { - fprintf(stderr, "invalid mh_load_addr address\n"); + error_report("invalid load_addr address"); + exit(1); + } + if (mh_header_addr - mh_load_addr > i) { + error_report("invalid header_addr address"); exit(1); } @@ -233,43 +239,43 @@ int load_multiboot(FWCfgState *fw_cfg, mh_entry_addr = ldl_p(header+i+28); if (mh_load_end_addr) { - if (mh_bss_end_addr < mh_load_addr) { - fprintf(stderr, "invalid mh_bss_end_addr address\n"); - exit(1); - } - mb_kernel_size = mh_bss_end_addr - mh_load_addr; - if (mh_load_end_addr < mh_load_addr) { - fprintf(stderr, "invalid mh_load_end_addr address\n"); + error_report("invalid load_end_addr address"); exit(1); } mb_load_size = mh_load_end_addr - mh_load_addr; } else { if (kernel_file_size < mb_kernel_text_offset) { - fprintf(stderr, "invalid kernel_file_size\n"); + error_report("invalid kernel_file_size"); exit(1); } - mb_kernel_size = kernel_file_size - mb_kernel_text_offset; - mb_load_size = mb_kernel_size; + mb_load_size = kernel_file_size - mb_kernel_text_offset; + } + if (mb_load_size > UINT32_MAX - mh_load_addr) { + error_report("kernel does not fit in address space"); + exit(1); + } + if (mh_bss_end_addr) { + if (mh_bss_end_addr < (mh_load_addr + mb_load_size)) { + error_report("invalid bss_end_addr address"); + exit(1); + } + mb_kernel_size = mh_bss_end_addr - mh_load_addr; + } else { + mb_kernel_size = mb_load_size; } - /* Valid if mh_flags sets MULTIBOOT_HEADER_HAS_VBE. 
- uint32_t mh_mode_type = ldl_p(header+i+32); - uint32_t mh_width = ldl_p(header+i+36); - uint32_t mh_height = ldl_p(header+i+40); - uint32_t mh_depth = ldl_p(header+i+44); */ - - mb_debug("multiboot: mh_header_addr = %#x\n", mh_header_addr); - mb_debug("multiboot: mh_load_addr = %#x\n", mh_load_addr); - mb_debug("multiboot: mh_load_end_addr = %#x\n", mh_load_end_addr); - mb_debug("multiboot: mh_bss_end_addr = %#x\n", mh_bss_end_addr); - mb_debug("qemu: loading multiboot kernel (%#x bytes) at %#x\n", + mb_debug("multiboot: header_addr = %#x", mh_header_addr); + mb_debug("multiboot: load_addr = %#x", mh_load_addr); + mb_debug("multiboot: load_end_addr = %#x", mh_load_end_addr); + mb_debug("multiboot: bss_end_addr = %#x", mh_bss_end_addr); + mb_debug("qemu: loading multiboot kernel (%#x bytes) at %#x", mb_load_size, mh_load_addr); mbs.mb_buf = g_malloc(mb_kernel_size); fseek(f, mb_kernel_text_offset, SEEK_SET); if (fread(mbs.mb_buf, 1, mb_load_size, f) != mb_load_size) { - fprintf(stderr, "fread() failed\n"); + error_report("fread() failed"); exit(1); } memset(mbs.mb_buf + mb_load_size, 0, mb_kernel_size - mb_load_size); @@ -323,10 +329,10 @@ int load_multiboot(FWCfgState *fw_cfg, hwaddr c = mb_add_cmdline(&mbs, tmpbuf); if ((next_space = strchr(tmpbuf, ' '))) *next_space = '\0'; - mb_debug("multiboot loading module: %s\n", tmpbuf); + mb_debug("multiboot loading module: %s", tmpbuf); mb_mod_length = get_image_size(tmpbuf); if (mb_mod_length < 0) { - fprintf(stderr, "Failed to open file '%s'\n", tmpbuf); + error_report("Failed to open file '%s'", tmpbuf); exit(1); } @@ -337,7 +343,7 @@ int load_multiboot(FWCfgState *fw_cfg, mb_add_mod(&mbs, mbs.mb_buf_phys + offs, mbs.mb_buf_phys + offs + mb_mod_length, c); - mb_debug("mod_start: %p\nmod_end: %p\n cmdline: "TARGET_FMT_plx"\n", + mb_debug("mod_start: %p\nmod_end: %p\n cmdline: "TARGET_FMT_plx, (char *)mbs.mb_buf + offs, (char *)mbs.mb_buf + offs + mb_mod_length, c); initrd_filename = next_initrd+1; @@ -365,10 +371,11 @@ int load_multiboot(FWCfgState *fw_cfg, stl_p(bootinfo + MBI_BOOT_DEVICE, 0x8000ffff); /* XXX: use the -boot switch? 
*/ stl_p(bootinfo + MBI_MMAP_ADDR, ADDR_E820_MAP); - mb_debug("multiboot: mh_entry_addr = %#x\n", mh_entry_addr); - mb_debug(" mb_buf_phys = "TARGET_FMT_plx"\n", mbs.mb_buf_phys); - mb_debug(" mod_start = "TARGET_FMT_plx"\n", mbs.mb_buf_phys + mbs.offset_mods); - mb_debug(" mb_mods_count = %d\n", mbs.mb_mods_count); + mb_debug("multiboot: entry_addr = %#x", mh_entry_addr); + mb_debug(" mb_buf_phys = "TARGET_FMT_plx, mbs.mb_buf_phys); + mb_debug(" mod_start = "TARGET_FMT_plx, + mbs.mb_buf_phys + mbs.offset_mods); + mb_debug(" mb_mods_count = %d", mbs.mb_mods_count); /* save bootinfo off the stack */ mb_bootinfo_data = g_memdup(bootinfo, sizeof(bootinfo)); diff --git a/hw/i386/trace-events b/hw/i386/trace-events index d43b4b6cd3..9e5dcb6b2b 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -39,9 +39,10 @@ vtd_fault_disabled(void) "Fault processing disabled for context entry" vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64 -vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "detected page level 0x%"PRIx32" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64 +vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read" -vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty" vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64 diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index 373311f91a..0741f3405e 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -533,13 +533,6 @@ static void ahci_check_cmd_bh(void *opaque) qemu_bh_delete(ad->check_bh); ad->check_bh = NULL; - if ((ad->busy_slot != -1) && - !(ad->port.ifs[0].status & (BUSY_STAT|DRQ_STAT))) { - /* no longer busy */ - ad->port_regs.cmd_issue &= ~(1 << ad->busy_slot); - ad->busy_slot = -1; - } - check_cmd(ad->hba, ad->port_no); } @@ -1426,6 +1419,12 @@ static void ahci_cmd_done(IDEDMA *dma) trace_ahci_cmd_done(ad->hba, ad->port_no); + /* no longer busy */ + if (ad->busy_slot != -1) { + ad->port_regs.cmd_issue &= ~(1 << ad->busy_slot); + ad->busy_slot = -1; + } + /* update d2h status */ ahci_write_fis_d2h(ad); diff --git a/hw/intc/arm_gicv3_common.c b/hw/intc/arm_gicv3_common.c index 7b54d52376..864b7c6515 100644 --- a/hw/intc/arm_gicv3_common.c +++ b/hw/intc/arm_gicv3_common.c @@ -27,6 +27,7 
@@ #include "hw/intc/arm_gicv3_common.h" #include "gicv3_internal.h" #include "hw/arm/linux-boot-if.h" +#include "sysemu/kvm.h" static int gicv3_pre_save(void *opaque) { @@ -141,6 +142,79 @@ static const VMStateDescription vmstate_gicv3_cpu = { } }; +static int gicv3_gicd_no_migration_shift_bug_pre_load(void *opaque) +{ + GICv3State *cs = opaque; + + /* + * The gicd_no_migration_shift_bug flag is used for migration compatibility + * for old version QEMU which may have the GICD bmp shift bug under KVM mode. + * Strictly, what we want to know is whether the migration source is using + * KVM. Since we don't have any way to determine that, we look at whether the + * destination is using KVM; this is close enough because for the older QEMU + * versions with this bug KVM -> TCG migration didn't work anyway. If the + * source is a newer QEMU without this bug it will transmit the migration + * subsection which sets the flag to true; otherwise it will remain set to + * the value we select here. + */ + if (kvm_enabled()) { + cs->gicd_no_migration_shift_bug = false; + } + + return 0; +} + +static int gicv3_gicd_no_migration_shift_bug_post_load(void *opaque, + int version_id) +{ + GICv3State *cs = opaque; + + if (cs->gicd_no_migration_shift_bug) { + return 0; + } + + /* Older versions of QEMU had a bug in the handling of state save/restore + * to the KVM GICv3: they got the offset in the bitmap arrays wrong, + * so that instead of the data for external interrupts 32 and up + * starting at bit position 32 in the bitmap, it started at bit + * position 64. If we're receiving data from a QEMU with that bug, + * we must move the data down into the right place. + */ + memmove(cs->group, (uint8_t *)cs->group + GIC_INTERNAL / 8, + sizeof(cs->group) - GIC_INTERNAL / 8); + memmove(cs->grpmod, (uint8_t *)cs->grpmod + GIC_INTERNAL / 8, + sizeof(cs->grpmod) - GIC_INTERNAL / 8); + memmove(cs->enabled, (uint8_t *)cs->enabled + GIC_INTERNAL / 8, + sizeof(cs->enabled) - GIC_INTERNAL / 8); + memmove(cs->pending, (uint8_t *)cs->pending + GIC_INTERNAL / 8, + sizeof(cs->pending) - GIC_INTERNAL / 8); + memmove(cs->active, (uint8_t *)cs->active + GIC_INTERNAL / 8, + sizeof(cs->active) - GIC_INTERNAL / 8); + memmove(cs->edge_trigger, (uint8_t *)cs->edge_trigger + GIC_INTERNAL / 8, + sizeof(cs->edge_trigger) - GIC_INTERNAL / 8); + + /* + * While this new version QEMU doesn't have this kind of bug as we fix it, + * so it needs to set the flag to true to indicate that and it's necessary + * for next migration to work from this new version QEMU. 
+ */ + cs->gicd_no_migration_shift_bug = true; + + return 0; +} + +const VMStateDescription vmstate_gicv3_gicd_no_migration_shift_bug = { + .name = "arm_gicv3/gicd_no_migration_shift_bug", + .version_id = 1, + .minimum_version_id = 1, + .pre_load = gicv3_gicd_no_migration_shift_bug_pre_load, + .post_load = gicv3_gicd_no_migration_shift_bug_post_load, + .fields = (VMStateField[]) { + VMSTATE_BOOL(gicd_no_migration_shift_bug, GICv3State), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription vmstate_gicv3 = { .name = "arm_gicv3", .version_id = 1, @@ -165,6 +239,10 @@ static const VMStateDescription vmstate_gicv3 = { VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, GICv3State, num_cpu, vmstate_gicv3_cpu, GICv3CPUState), VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription * []) { + &vmstate_gicv3_gicd_no_migration_shift_bug, + NULL } }; @@ -364,6 +442,7 @@ static void arm_gicv3_common_reset(DeviceState *dev) gicv3_gicd_group_set(s, i); } } + s->gicd_no_migration_shift_bug = true; } static void arm_gic_common_linux_init(ARMLinuxBootIf *obj, diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c index 5cbafaf497..519d581bb6 100644 --- a/hw/intc/arm_gicv3_cpuif.c +++ b/hw/intc/arm_gicv3_cpuif.c @@ -431,7 +431,7 @@ static uint64_t icv_ap_read(CPUARMState *env, const ARMCPRegInfo *ri) { GICv3CPUState *cs = icc_cs_from_env(env); int regno = ri->opc2 & 3; - int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1NS; + int grp = (ri->crm & 1) ? GICV3_G1NS : GICV3_G0; uint64_t value = cs->ich_apr[grp][regno]; trace_gicv3_icv_ap_read(ri->crm & 1, regno, gicv3_redist_affid(cs), value); @@ -443,7 +443,7 @@ static void icv_ap_write(CPUARMState *env, const ARMCPRegInfo *ri, { GICv3CPUState *cs = icc_cs_from_env(env); int regno = ri->opc2 & 3; - int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1NS; + int grp = (ri->crm & 1) ? GICV3_G1NS : GICV3_G0; trace_gicv3_icv_ap_write(ri->crm & 1, regno, gicv3_redist_affid(cs), value); @@ -1465,7 +1465,7 @@ static uint64_t icc_ap_read(CPUARMState *env, const ARMCPRegInfo *ri) uint64_t value; int regno = ri->opc2 & 3; - int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1; + int grp = (ri->crm & 1) ? GICV3_G1 : GICV3_G0; if (icv_access(env, grp == GICV3_G0 ? HCR_FMO : HCR_IMO)) { return icv_ap_read(env, ri); @@ -1487,7 +1487,7 @@ static void icc_ap_write(CPUARMState *env, const ARMCPRegInfo *ri, GICv3CPUState *cs = icc_cs_from_env(env); int regno = ri->opc2 & 3; - int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1; + int grp = (ri->crm & 1) ? GICV3_G1 : GICV3_G0; if (icv_access(env, grp == GICV3_G0 ? HCR_FMO : HCR_IMO)) { icv_ap_write(env, ri, value); @@ -2296,7 +2296,7 @@ static uint64_t ich_ap_read(CPUARMState *env, const ARMCPRegInfo *ri) { GICv3CPUState *cs = icc_cs_from_env(env); int regno = ri->opc2 & 3; - int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1NS; + int grp = (ri->crm & 1) ? GICV3_G1NS : GICV3_G0; uint64_t value; value = cs->ich_apr[grp][regno]; @@ -2309,7 +2309,7 @@ static void ich_ap_write(CPUARMState *env, const ARMCPRegInfo *ri, { GICv3CPUState *cs = icc_cs_from_env(env); int regno = ri->opc2 & 3; - int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1NS; + int grp = (ri->crm & 1) ? 
GICV3_G1NS : GICV3_G0; trace_gicv3_ich_ap_write(ri->crm & 1, regno, gicv3_redist_affid(cs), value); diff --git a/hw/intc/arm_gicv3_kvm.c b/hw/intc/arm_gicv3_kvm.c index 481fe5405a..3fff4687ee 100644 --- a/hw/intc/arm_gicv3_kvm.c +++ b/hw/intc/arm_gicv3_kvm.c @@ -135,7 +135,14 @@ static void kvm_dist_get_priority(GICv3State *s, uint32_t offset, uint8_t *bmp) uint32_t reg, *field; int irq; - field = (uint32_t *)bmp; + /* For the KVM GICv3, affinity routing is always enabled, and the first 8 + * GICD_IPRIORITYR<n> registers are always RAZ/WI. The corresponding + * functionality is replaced by GICR_IPRIORITYR<n>. It doesn't need to + * sync them. So it needs to skip the field of GIC_INTERNAL irqs in bmp and + * offset. + */ + field = (uint32_t *)(bmp + GIC_INTERNAL); + offset += (GIC_INTERNAL * 8) / 8; for_each_dist_irq_reg(irq, s->num_irq, 8) { kvm_gicd_access(s, offset, ®, false); *field = reg; @@ -149,7 +156,14 @@ static void kvm_dist_put_priority(GICv3State *s, uint32_t offset, uint8_t *bmp) uint32_t reg, *field; int irq; - field = (uint32_t *)bmp; + /* For the KVM GICv3, affinity routing is always enabled, and the first 8 + * GICD_IPRIORITYR<n> registers are always RAZ/WI. The corresponding + * functionality is replaced by GICR_IPRIORITYR<n>. It doesn't need to + * sync them. So it needs to skip the field of GIC_INTERNAL irqs in bmp and + * offset. + */ + field = (uint32_t *)(bmp + GIC_INTERNAL); + offset += (GIC_INTERNAL * 8) / 8; for_each_dist_irq_reg(irq, s->num_irq, 8) { reg = *field; kvm_gicd_access(s, offset, ®, true); @@ -164,6 +178,14 @@ static void kvm_dist_get_edge_trigger(GICv3State *s, uint32_t offset, uint32_t reg; int irq; + /* For the KVM GICv3, affinity routing is always enabled, and the first 2 + * GICD_ICFGR<n> registers are always RAZ/WI. The corresponding + * functionality is replaced by GICR_ICFGR<n>. It doesn't need to sync + * them. So it should increase the offset to skip GIC_INTERNAL irqs. + * This matches the for_each_dist_irq_reg() macro which also skips the + * first GIC_INTERNAL irqs. + */ + offset += (GIC_INTERNAL * 2) / 8; for_each_dist_irq_reg(irq, s->num_irq, 2) { kvm_gicd_access(s, offset, ®, false); reg = half_unshuffle32(reg >> 1); @@ -181,6 +203,14 @@ static void kvm_dist_put_edge_trigger(GICv3State *s, uint32_t offset, uint32_t reg; int irq; + /* For the KVM GICv3, affinity routing is always enabled, and the first 2 + * GICD_ICFGR<n> registers are always RAZ/WI. The corresponding + * functionality is replaced by GICR_ICFGR<n>. It doesn't need to sync + * them. So it should increase the offset to skip GIC_INTERNAL irqs. + * This matches the for_each_dist_irq_reg() macro which also skips the + * first GIC_INTERNAL irqs. + */ + offset += (GIC_INTERNAL * 2) / 8; for_each_dist_irq_reg(irq, s->num_irq, 2) { reg = *gic_bmp_ptr32(bmp, irq); if (irq % 32 != 0) { @@ -222,6 +252,15 @@ static void kvm_dist_getbmp(GICv3State *s, uint32_t offset, uint32_t *bmp) uint32_t reg; int irq; + /* For the KVM GICv3, affinity routing is always enabled, and the + * GICD_IGROUPR0/GICD_IGRPMODR0/GICD_ISENABLER0/GICD_ISPENDR0/ + * GICD_ISACTIVER0 registers are always RAZ/WI. The corresponding + * functionality is replaced by the GICR registers. It doesn't need to sync + * them. So it should increase the offset to skip GIC_INTERNAL irqs. + * This matches the for_each_dist_irq_reg() macro which also skips the + * first GIC_INTERNAL irqs. 
+ */ + offset += (GIC_INTERNAL * 1) / 8; for_each_dist_irq_reg(irq, s->num_irq, 1) { kvm_gicd_access(s, offset, ®, false); *gic_bmp_ptr32(bmp, irq) = reg; @@ -235,6 +274,19 @@ static void kvm_dist_putbmp(GICv3State *s, uint32_t offset, uint32_t reg; int irq; + /* For the KVM GICv3, affinity routing is always enabled, and the + * GICD_IGROUPR0/GICD_IGRPMODR0/GICD_ISENABLER0/GICD_ISPENDR0/ + * GICD_ISACTIVER0 registers are always RAZ/WI. The corresponding + * functionality is replaced by the GICR registers. It doesn't need to sync + * them. So it should increase the offset and clroffset to skip GIC_INTERNAL + * irqs. This matches the for_each_dist_irq_reg() macro which also skips the + * first GIC_INTERNAL irqs. + */ + offset += (GIC_INTERNAL * 1) / 8; + if (clroffset != 0) { + clroffset += (GIC_INTERNAL * 1) / 8; + } + for_each_dist_irq_reg(irq, s->num_irq, 1) { /* If this bitmap is a set/clear register pair, first write to the * clear-reg to clear all bits before using the set-reg to write @@ -243,6 +295,7 @@ static void kvm_dist_putbmp(GICv3State *s, uint32_t offset, if (clroffset != 0) { reg = 0; kvm_gicd_access(s, clroffset, ®, true); + clroffset += 4; } reg = *gic_bmp_ptr32(bmp, irq); kvm_gicd_access(s, offset, ®, true); diff --git a/hw/intc/openpic_kvm.c b/hw/intc/openpic_kvm.c index fa83420254..39a6f369c5 100644 --- a/hw/intc/openpic_kvm.c +++ b/hw/intc/openpic_kvm.c @@ -124,10 +124,6 @@ static void kvm_openpic_region_add(MemoryListener *listener, uint64_t reg_base; int ret; - if (section->fv != address_space_to_flatview(&address_space_memory)) { - abort(); - } - /* Ignore events on regions that are not us */ if (section->mr != &opp->mem) { return; diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 38674b08aa..6bdef38ceb 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -422,6 +422,7 @@ static RxFilterInfo *virtio_net_query_rxfilter(NetClientState *nc) static void virtio_net_reset(VirtIODevice *vdev) { VirtIONet *n = VIRTIO_NET(vdev); + int i; /* Reset back to compatibility mode */ n->promisc = 1; @@ -445,6 +446,16 @@ static void virtio_net_reset(VirtIODevice *vdev) memcpy(&n->mac[0], &n->nic->conf->macaddr, sizeof(n->mac)); qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac); memset(n->vlans, 0, MAX_VLAN >> 3); + + /* Flush any async TX */ + for (i = 0; i < n->max_queues; i++) { + NetClientState *nc = qemu_get_subqueue(n->nic, i); + + if (nc->peer) { + qemu_flush_or_purge_queued_packets(nc->peer, true); + assert(!virtio_net_get_subqueue(nc)->async_tx.elem); + } + } } static void peer_test_vnet_hdr(VirtIONet *n) diff --git a/hw/pci-bridge/i82801b11.c b/hw/pci-bridge/i82801b11.c index cb522bf30c..ebf7f5f0e8 100644 --- a/hw/pci-bridge/i82801b11.c +++ b/hw/pci-bridge/i82801b11.c @@ -98,6 +98,7 @@ static void i82801b11_bridge_class_init(ObjectClass *klass, void *data) k->realize = i82801b11_bridge_realize; k->config_write = pci_bridge_write_config; dc->vmsd = &i82801b11_bridge_dev_vmstate; + dc->reset = pci_bridge_reset; set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); } diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index b57528baf4..a74eb2dc68 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -100,6 +100,21 @@ #define PHANDLE_XICP 0x00001111 +/* These two functions implement the VCPU id numbering: one to compute them + * all and one to identify thread 0 of a VCORE. Any change to the first one + * is likely to have an impact on the second one, so let's keep them close. 
+ */ +static int spapr_vcpu_id(sPAPRMachineState *spapr, int cpu_index) +{ + return + (cpu_index / smp_threads) * spapr->vsmt + cpu_index % smp_threads; +} +static bool spapr_is_thread0_in_vcore(sPAPRMachineState *spapr, + PowerPCCPU *cpu) +{ + return spapr_get_vcpu_id(cpu) % spapr->vsmt == 0; +} + static ICSState *spapr_ics_create(sPAPRMachineState *spapr, const char *type_ics, int nr_irqs, Error **errp) @@ -161,15 +176,14 @@ static void pre_2_10_vmstate_unregister_dummy_icp(int i) (void *)(uintptr_t) i); } -static inline int xics_max_server_number(void) +static int xics_max_server_number(sPAPRMachineState *spapr) { - return DIV_ROUND_UP(max_cpus * kvmppc_smt_threads(), smp_threads); + return DIV_ROUND_UP(max_cpus * spapr->vsmt, smp_threads); } static void xics_system_init(MachineState *machine, int nr_irqs, Error **errp) { sPAPRMachineState *spapr = SPAPR_MACHINE(machine); - sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); if (kvm_enabled()) { if (machine_kernel_irqchip_allowed(machine) && @@ -191,17 +205,6 @@ static void xics_system_init(MachineState *machine, int nr_irqs, Error **errp) return; } } - - if (smc->pre_2_10_has_unused_icps) { - int i; - - for (i = 0; i < xics_max_server_number(); i++) { - /* Dummy entries get deregistered when real ICPState objects - * are registered during CPU core hotplug. - */ - pre_2_10_vmstate_register_dummy_icp(i); - } - } } static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu, @@ -210,7 +213,7 @@ static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu, int i, ret = 0; uint32_t servers_prop[smt_threads]; uint32_t gservers_prop[smt_threads * 2]; - int index = spapr_vcpu_id(cpu); + int index = spapr_get_vcpu_id(cpu); if (cpu->compat_pvr) { ret = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->compat_pvr); @@ -239,7 +242,7 @@ static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu, static int spapr_fixup_cpu_numa_dt(void *fdt, int offset, PowerPCCPU *cpu) { - int index = spapr_vcpu_id(cpu); + int index = spapr_get_vcpu_id(cpu); uint32_t associativity[] = {cpu_to_be32(0x5), cpu_to_be32(0x0), cpu_to_be32(0x0), @@ -338,16 +341,15 @@ static int spapr_fixup_cpu_dt(void *fdt, sPAPRMachineState *spapr) int ret = 0, offset, cpus_offset; CPUState *cs; char cpu_model[32]; - int smt = kvmppc_smt_threads(); uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)}; CPU_FOREACH(cs) { PowerPCCPU *cpu = POWERPC_CPU(cs); DeviceClass *dc = DEVICE_GET_CLASS(cs); - int index = spapr_vcpu_id(cpu); - int compat_smt = MIN(smp_threads, ppc_compat_max_threads(cpu)); + int index = spapr_get_vcpu_id(cpu); + int compat_smt = MIN(smp_threads, ppc_compat_max_vthreads(cpu)); - if ((index % smt) != 0) { + if (!spapr_is_thread0_in_vcore(spapr, cpu)) { continue; } @@ -493,7 +495,7 @@ static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset, PowerPCCPU *cpu = POWERPC_CPU(cs); CPUPPCState *env = &cpu->env; PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs); - int index = spapr_vcpu_id(cpu); + int index = spapr_get_vcpu_id(cpu); uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40), 0xffffffff, 0xffffffff}; uint32_t tbfreq = kvm_enabled() ? 
kvmppc_get_tbfreq() @@ -503,7 +505,7 @@ static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset, size_t page_sizes_prop_size; uint32_t vcpus_per_socket = smp_threads * smp_cores; uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)}; - int compat_smt = MIN(smp_threads, ppc_compat_max_threads(cpu)); + int compat_smt = MIN(smp_threads, ppc_compat_max_vthreads(cpu)); sPAPRDRConnector *drc; int drc_index; uint32_t radix_AP_encodings[PPC_PAGE_SIZES_MAX_SZ]; @@ -614,7 +616,6 @@ static void spapr_populate_cpus_dt_node(void *fdt, sPAPRMachineState *spapr) CPUState *cs; int cpus_offset; char *nodename; - int smt = kvmppc_smt_threads(); cpus_offset = fdt_add_subnode(fdt, 0, "cpus"); _FDT(cpus_offset); @@ -628,11 +629,11 @@ static void spapr_populate_cpus_dt_node(void *fdt, sPAPRMachineState *spapr) */ CPU_FOREACH_REVERSE(cs) { PowerPCCPU *cpu = POWERPC_CPU(cs); - int index = spapr_vcpu_id(cpu); + int index = spapr_get_vcpu_id(cpu); DeviceClass *dc = DEVICE_GET_CLASS(cs); int offset; - if ((index % smt) != 0) { + if (!spapr_is_thread0_in_vcore(spapr, cpu)) { continue; } @@ -1105,7 +1106,7 @@ static void *spapr_build_fdt(sPAPRMachineState *spapr, _FDT(fdt_setprop_cell(fdt, 0, "#size-cells", 2)); /* /interrupt controller */ - spapr_dt_xics(xics_max_server_number(), fdt, PHANDLE_XICP); + spapr_dt_xics(xics_max_server_number(spapr), fdt, PHANDLE_XICP); ret = spapr_populate_memory(spapr, fdt); if (ret < 0) { @@ -2197,8 +2198,8 @@ static void spapr_init_cpus(sPAPRMachineState *spapr) { MachineState *machine = MACHINE(spapr); MachineClass *mc = MACHINE_GET_CLASS(machine); + sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); const char *type = spapr_get_cpu_core_type(machine->cpu_type); - int smt = kvmppc_smt_threads(); const CPUArchIdList *possible_cpus; int boot_cores_nr = smp_cpus / smp_threads; int i; @@ -2228,12 +2229,23 @@ static void spapr_init_cpus(sPAPRMachineState *spapr) boot_cores_nr = possible_cpus->len; } + if (smc->pre_2_10_has_unused_icps) { + int i; + + for (i = 0; i < xics_max_server_number(spapr); i++) { + /* Dummy entries get deregistered when real ICPState objects + * are registered during CPU core hotplug. + */ + pre_2_10_vmstate_register_dummy_icp(i); + } + } + for (i = 0; i < possible_cpus->len; i++) { int core_id = i * smp_threads; if (mc->has_hotpluggable_cpus) { spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_CPU, - (core_id / smp_threads) * smt); + spapr_vcpu_id(spapr, core_id)); } if (i < boot_cores_nr) { @@ -2282,26 +2294,43 @@ static void spapr_set_vsmt_mode(sPAPRMachineState *spapr, Error **errp) } /* In this case, spapr->vsmt has been set by the command line */ } else { - /* Choose a VSMT mode that may be higher than necessary but is - * likely to be compatible with hosts that don't have VSMT. */ - spapr->vsmt = MAX(kvm_smt, smp_threads); + /* + * Default VSMT value is tricky, because we need it to be as + * consistent as possible (for migration), but this requires + * changing it for at least some existing cases. We pick 8 as + * the value that we'd get with KVM on POWER8, the + * overwhelmingly common case in production systems. 
+ */ + spapr->vsmt = MAX(8, smp_threads); } /* KVM: If necessary, set the SMT mode: */ if (kvm_enabled() && (spapr->vsmt != kvm_smt)) { ret = kvmppc_set_smt_threads(spapr->vsmt); if (ret) { + /* Looks like KVM isn't able to change VSMT mode */ error_setg(&local_err, "Failed to set KVM's VSMT mode to %d (errno %d)", spapr->vsmt, ret); - if (!vsmt_user) { - error_append_hint(&local_err, "On PPC, a VM with %d threads/" - "core on a host with %d threads/core requires " - " the use of VSMT mode %d.\n", - smp_threads, kvm_smt, spapr->vsmt); + /* We can live with that if the default one is big enough + * for the number of threads, and a submultiple of the one + * we want. In this case we'll waste some vcpu ids, but + * behaviour will be correct */ + if ((kvm_smt >= smp_threads) && ((spapr->vsmt % kvm_smt) == 0)) { + warn_report_err(local_err); + local_err = NULL; + goto out; + } else { + if (!vsmt_user) { + error_append_hint(&local_err, + "On PPC, a VM with %d threads/core" + " on a host with %d threads/core" + " requires the use of VSMT mode %d.\n", + smp_threads, kvm_smt, spapr->vsmt); + } + kvmppc_hint_smt_possible(&local_err); + goto out; } - kvmppc_hint_smt_possible(&local_err); - goto out; } } /* else TCG: nothing to do currently */ @@ -2327,6 +2356,7 @@ static void ppc_spapr_init(MachineState *machine) long load_limit, fw_size; char *filename; Error *resize_hpt_err = NULL; + PowerPCCPU *first_ppc_cpu; msi_nonbroken = true; @@ -2419,11 +2449,6 @@ static void ppc_spapr_init(MachineState *machine) } spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY); - if (!kvm_enabled() || kvmppc_has_cap_mmu_radix()) { - /* KVM and TCG always allow GTSE with radix... */ - spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE); - } - /* ... but not with hash (currently). */ /* advertise support for dedicated HP event source to guests */ if (spapr->use_hotplug_event_source) { @@ -2440,6 +2465,15 @@ static void ppc_spapr_init(MachineState *machine) spapr_init_cpus(spapr); + first_ppc_cpu = POWERPC_CPU(first_cpu); + if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) && + ppc_check_compat(first_ppc_cpu, CPU_POWERPC_LOGICAL_3_00, 0, + spapr->max_compat_pvr)) { + /* KVM and TCG always allow GTSE with radix... */ + spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE); + } + /* ... but not with hash (currently). 
*/ + if (kvm_enabled()) { /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */ kvmppc_enable_logical_ci_hcalls(); @@ -3199,7 +3233,7 @@ static void *spapr_populate_hotplug_cpu_dt(CPUState *cs, int *fdt_offset, { PowerPCCPU *cpu = POWERPC_CPU(cs); DeviceClass *dc = DEVICE_GET_CLASS(cs); - int id = spapr_vcpu_id(cpu); + int id = spapr_get_vcpu_id(cpu); void *fdt; int offset, fdt_size; char *nodename; @@ -3245,10 +3279,10 @@ static void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { + sPAPRMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev)); int index; sPAPRDRConnector *drc; CPUCore *cc = CPU_CORE(dev); - int smt = kvmppc_smt_threads(); if (!spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index)) { error_setg(errp, "Unable to find CPU core with core-id: %d", @@ -3260,7 +3294,8 @@ void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev, return; } - drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU, index * smt); + drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU, + spapr_vcpu_id(spapr, cc->core_id)); g_assert(drc); spapr_drc_detach(drc); @@ -3279,7 +3314,6 @@ static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev, CPUState *cs = CPU(core->threads); sPAPRDRConnector *drc; Error *local_err = NULL; - int smt = kvmppc_smt_threads(); CPUArchId *core_slot; _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/xen-changelog