
Re: [Xen-devel] [PATCH v2 5/6] ioreq-server: add support for multiple servers



On Tue, Mar 4, 2014 at 11:40 AM, Paul Durrant <paul.durrant@xxxxxxxxxx> wrote:
> diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c
> index 1f6ce50..3116653 100644
> --- a/tools/libxc/xc_domain_restore.c
> +++ b/tools/libxc/xc_domain_restore.c
> @@ -746,6 +746,7 @@ typedef struct {
>      uint64_t acpi_ioport_location;
>      uint64_t viridian;
>      uint64_t vm_generationid_addr;
> +    uint64_t nr_ioreq_servers;
>
>      struct toolstack_data_t tdata;
>  } pagebuf_t;
> @@ -996,6 +997,16 @@ static int pagebuf_get_one(xc_interface *xch, struct restore_ctx *ctx,
>          DPRINTF("read generation id buffer address");
>          return pagebuf_get_one(xch, ctx, buf, fd, dom);
>
> +    case XC_SAVE_ID_HVM_NR_IOREQ_SERVERS:
> +        /* Skip padding 4 bytes then read the acpi ioport location. */

This comment might be confusing: it still refers to the acpi ioport
location, but what's being read here is the number of IOREQ servers. :-)
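
Presumably it should say something like:

  /* Skip padding 4 bytes then read the number of IOREQ servers. */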

> +        if ( RDEXACT(fd, &buf->nr_ioreq_servers, sizeof(uint32_t)) ||
> +             RDEXACT(fd, &buf->nr_ioreq_servers, sizeof(uint64_t)) )
> +        {
> +            PERROR("error reading the number of IOREQ servers");
> +            return -1;
> +        }
> +        return pagebuf_get_one(xch, ctx, buf, fd, dom);
> +
>      default:
>          if ( (count > MAX_BATCH_SIZE) || (count < 0) ) {
>              ERROR("Max batch size exceeded (%d). Giving up.", count);
>


> diff --git a/tools/libxc/xc_hvm_build_x86.c b/tools/libxc/xc_hvm_build_x86.c
> index b65e702..6d6328a 100644
> --- a/tools/libxc/xc_hvm_build_x86.c
> +++ b/tools/libxc/xc_hvm_build_x86.c
> @@ -45,7 +45,7 @@
>  #define SPECIALPAGE_IDENT_PT 4
>  #define SPECIALPAGE_CONSOLE  5
>  #define SPECIALPAGE_IOREQ    6
> -#define NR_SPECIAL_PAGES     SPECIALPAGE_IOREQ + 2 /* ioreq server needs 2 pages */
> +#define NR_SPECIAL_PAGES(n)  SPECIALPAGE_IOREQ + (2 * n) /* ioreq server needs 2 pages */

"each ioreq server needs 2 pages"?

>  #define special_pfn(x) (0xff000u - 1 - (x))
>
>  #define VGA_HOLE_SIZE (0x20)
>

[snip]

> @@ -515,7 +521,9 @@ static int setup_guest(xc_interface *xch,
>      xc_set_hvm_param(xch, dom, HVM_PARAM_IOREQ_PFN,
>                       special_pfn(SPECIALPAGE_IOREQ));
>      xc_set_hvm_param(xch, dom, HVM_PARAM_BUFIOREQ_PFN,
> -                     special_pfn(SPECIALPAGE_IOREQ) - 1);
> +                     special_pfn(SPECIALPAGE_IOREQ) - max_emulators);

Similarly, special_pfn(SPECIALPAGE_IOREQ+max_emulators)?
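
That is, something like this (just a sketch, using the names from the
patch):

  xc_set_hvm_param(xch, dom, HVM_PARAM_BUFIOREQ_PFN,
                   special_pfn(SPECIALPAGE_IOREQ + max_emulators));

which, since special_pfn() counts downwards, is the same pfn as
special_pfn(SPECIALPAGE_IOREQ) - max_emulators, but makes the layout
easier to follow.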

Although actually, are you planning to make it possible to add more
emulators (above "max_emulators") dynamically after the VM is created
-- maybe in a future series?

If not, and you're always going to be statically allocating a fixed
number of emulators at the beginning, there's not actually a reason to
change the direction that the special PFNs go at all.

> +    xc_set_hvm_param(xch, dom, HVM_PARAM_NR_IOREQ_SERVERS,
> +                     max_emulators);
>
>      /*
>       * Identity-map page table is required for running with CR0.PG=0 when

[snip]

> diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
> index 4fc46eb..cf9b67d 100644
> --- a/tools/libxl/xl_cmdimpl.c
> +++ b/tools/libxl/xl_cmdimpl.c
> @@ -1750,6 +1750,9 @@ skip_vfb:
>
>              b_info->u.hvm.vendor_device = d;
>          }
> +
> +        if (!xlu_cfg_get_long (config, "secondary_device_emulators", &l, 0))
> +            b_info->u.hvm.max_emulators = l + 1;

Do we want to give this a more structured naming convention?

device_model_secondary_max?  device_model_secondary_emulators?

Also, how are you planning on starting these secondary emulators?
Would it make sense for libxl to start them, in which case it should
be able to do its own counting?  Or are you envisioning starting /
destroying secondary emulators as the guest is running?

>      }
>
>      xlu_cfg_destroy(config);
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index fb2dd73..e8b73fa 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -357,14 +357,21 @@ static ioreq_t *get_ioreq(struct hvm_ioreq_server *s, int id)
>  bool_t hvm_io_pending(struct vcpu *v)
>  {
>      struct domain *d = v->domain;
> -    struct hvm_ioreq_server *s = d->arch.hvm_domain.ioreq_server;
> -    ioreq_t *p;
> +    struct list_head *entry;
>
> -    if ( !s )
> -        return 0;
> +    list_for_each ( entry, &d->arch.hvm_domain.ioreq_server_list )
> +    {
> +        struct hvm_ioreq_server *s = list_entry(entry,
> +                                                struct hvm_ioreq_server,
> +                                                list_entry);
> +        ioreq_t *p = get_ioreq(s, v->vcpu_id);
>
> -    p = get_ioreq(s, v->vcpu_id);
> -    return ( p->state != STATE_IOREQ_NONE );
> +        p = get_ioreq(s, v->vcpu_id);
> +        if ( p->state != STATE_IOREQ_NONE )
> +            return 1;

Redundant calls to get_ioreq().
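
Something like this would avoid it (just a sketch):

  struct hvm_ioreq_server *s;

  list_for_each_entry ( s, &d->arch.hvm_domain.ioreq_server_list,
                        list_entry )
  {
      ioreq_t *p = get_ioreq(s, v->vcpu_id);

      if ( p->state != STATE_IOREQ_NONE )
          return 1;
  }

  return 0;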

> +    }
> +
> +    return 0;
>  }
>
>  static void hvm_wait_on_io(struct domain *d, ioreq_t *p)

[snip]

> +static int hvm_access_cf8(
> +    int dir, uint32_t port, uint32_t bytes, uint32_t *val)

I take it this is part of virtualizing the pci space?

This wasn't mentioned in the commit message; it seems like it probably
should have been introduced in a separate patch.

> +{
> +    struct vcpu *curr = current;
> +    struct hvm_domain *hd = &curr->domain->arch.hvm_domain;
> +    int rc;
> +
> +    BUG_ON(port < 0xcf8);
> +    port -= 0xcf8;
> +
> +    spin_lock(&hd->pci_lock);
> +
> +    if ( dir == IOREQ_WRITE )
> +    {
> +        switch ( bytes )
> +        {
> +        case 4:
> +            hd->pci_cf8 = *val;
> +            break;
> +
> +        case 2:
> +        {
> +            uint32_t mask = 0xffff << (port * 8);
> +            uint32_t subval = *val << (port * 8);
> +
> +            hd->pci_cf8 = (hd->pci_cf8 & ~mask) |
> +                          (subval & mask);
> +            break;
> +        }
> +
> +        case 1:
> +        {
> +            uint32_t mask = 0xff << (port * 8);
> +            uint32_t subval = *val << (port * 8);
> +
> +            hd->pci_cf8 = (hd->pci_cf8 & ~mask) |
> +                          (subval & mask);
> +            break;
> +        }
> +
> +        default:
> +            break;
> +        }
> +
> +        /* We always need to fall through to the catch all emulator */
> +        rc = X86EMUL_UNHANDLEABLE;
> +    }
> +    else
> +    {
> +        switch ( bytes )
> +        {
> +        case 4:
> +            *val = hd->pci_cf8;
> +            rc = X86EMUL_OKAY;
> +            break;
> +
> +        case 2:
> +            *val = (hd->pci_cf8 >> (port * 8)) & 0xffff;
> +            rc = X86EMUL_OKAY;
> +            break;
> +
> +        case 1:
> +            *val = (hd->pci_cf8 >> (port * 8)) & 0xff;
> +            rc = X86EMUL_OKAY;
> +            break;
> +
> +        default:
> +            rc = X86EMUL_UNHANDLEABLE;
> +            break;
> +        }
> +    }
> +
> +    spin_unlock(&hd->pci_lock);
> +
> +    return rc;
> +}
> +
>  static int handle_pvh_io(
>      int dir, uint32_t port, uint32_t bytes, uint32_t *val)
>  {
> @@ -618,39 +704,53 @@ static void hvm_ioreq_server_remove_vcpu(struct hvm_ioreq_server *s, struct vcpu
>      }
>  }
>
> -static int hvm_create_ioreq_server(struct domain *d, domid_t domid)
> +static int hvm_create_ioreq_server(struct domain *d, ioservid_t id, domid_t domid)
>  {
>      struct hvm_ioreq_server *s;
>      unsigned long pfn;
>      struct vcpu *v;
>      int i, rc;
>
> +    if ( id >= d->arch.hvm_domain.params[HVM_PARAM_NR_IOREQ_SERVERS] )
> +        return -EINVAL;
> +
> +    spin_lock(&d->arch.hvm_domain.ioreq_server_lock);

Hmm, so for a few patches we're just completely lockless?

Regressions like that can wreak havoc on bisections.

[snip]

> @@ -1646,11 +2113,112 @@ void hvm_vcpu_down(struct vcpu *v)
>      }
>  }
>
> +static DEFINE_RCU_READ_LOCK(ioreq_server_rcu_lock);
> +
> +static struct hvm_ioreq_server *hvm_select_ioreq_server(struct vcpu *v, ioreq_t *p)
> +{
> +#define BDF(cf8) (((cf8) & 0x00ffff00) >> 8)
> +
> +    struct domain *d = v->domain;
> +    struct hvm_ioreq_server *s;
> +    uint8_t type;
> +    uint64_t addr;
> +
> +    if ( p->type == IOREQ_TYPE_PIO &&
> +         (p->addr & ~3) == 0xcfc )
> +    {
> +        /* PCI config data cycle */
> +        type = IOREQ_TYPE_PCI_CONFIG;
> +
> +        spin_lock(&d->arch.hvm_domain.pci_lock);
> +        addr = d->arch.hvm_domain.pci_cf8 + (p->addr & 3);
> +        spin_unlock(&d->arch.hvm_domain.pci_lock);
> +    }
> +    else
> +    {
> +        type = p->type;
> +        addr = p->addr;
> +    }
> +
> +    rcu_read_lock(&ioreq_server_rcu_lock);
> +
> +    switch ( type )
> +    {
> +    case IOREQ_TYPE_COPY:
> +    case IOREQ_TYPE_PIO:
> +    case IOREQ_TYPE_PCI_CONFIG:
> +        break;
> +    default:
> +        goto done;
> +    }
> +
> +    list_for_each_entry ( s,
> +                          &d->arch.hvm_domain.ioreq_server_list,
> +                          list_entry )
> +    {
> +        switch ( type )
> +        {
> +            case IOREQ_TYPE_COPY:
> +            case IOREQ_TYPE_PIO: {
> +                struct list_head *list;
> +                struct hvm_io_range *x;
> +
> +                list = ( type == IOREQ_TYPE_COPY ) ?
> +                    &s->mmio_range_list :
> +                    &s->portio_range_list;
> +
> +                list_for_each_entry ( x,
> +                                      list,
> +                                      list_entry )
> +                {
> +                    if ( (addr >= x->start) && (addr <= x->end) )
> +                        goto found;
> +                }
> +                break;
> +            }
> +            case IOREQ_TYPE_PCI_CONFIG: {
> +                struct hvm_pcidev *x;
> +
> +                list_for_each_entry ( x,
> +                                      &s->pcidev_list,
> +                                      list_entry )
> +                {
> +                    if ( BDF(addr) == x->bdf ) {
> +                        p->type = type;
> +                        p->addr = addr;
> +                        goto found;
> +                    }
> +                }
> +                break;
> +            }
> +        }
> +    }
> +
> +done:
> +    /* The catch-all server has id 0 */
> +    list_for_each_entry ( s,
> +                          &d->arch.hvm_domain.ioreq_server_list,
> +                          list_entry )
> +    {
> +        if ( s->id == 0 )
> +            goto found;
> +    }

This is an awful lot of code to go through for every single IO,
particularly if the common case is that there's only a single ioreq
server.  Have you done any performance tests with this on a workload
that has a high IO count?

I realize that the cost of going all the way to qemu and back will
still dominate the time, but I can't help but think this might add up,
and I wonder whether a fast path for max_emulators=1 would make sense
on some of these potentially hot paths.
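
For instance, something along these lines at the top of
hvm_select_ioreq_server() might do (just a sketch, and it glosses over
the RCU/locking details):

  struct list_head *head = &d->arch.hvm_domain.ioreq_server_list;

  /* Fast path: only the catch-all server (id 0) is registered. */
  if ( !list_empty(head) && (head->next->next == head) )
      return list_entry(head->next, struct hvm_ioreq_server, list_entry);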

> @@ -4466,9 +5295,9 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
>                  if ( a.value == DOMID_SELF )
>                      a.value = curr_d->domain_id;
>
> -                rc = hvm_create_ioreq_server(d, a.value);
> +                rc = hvm_create_ioreq_server(d, 0, a.value);
>                  if ( rc == -EEXIST )
> -                    rc = hvm_set_ioreq_server_domid(d, a.value);
> +                    rc = hvm_set_ioreq_server_domid(d, 0, a.value);
>                  break;

Is there a plan to deprecate this old way of creating ioreq_server 0
at some point, so we can get rid of these HVM params?

Obviously we'll need to handle incoming migration from domains one
release back, but after that we should be able to get rid of them,
right?

 -George
