[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH] Make the hypercall PHYSDEVOP_alloc_irq_vector hypercall dummy.



On 08/31/09 19:03, Zhang, Xiantao wrote:
> Hi, Keir
>      This patch tends to make the hypercall PHYSDEVOP_alloc_irq_vector
> dummy, and defer vector allocation to programe ioapic entries by dom0.
> Basically, dom0 shouldn't touch vector namespace which is only used by
> hypervisor for servicing real device's interrupts. And this patch also
> makes broken NetBSD dom0 work again.

This is an interesting approach, and it seems like a less intrusive way
to achieve what my functionally similar patch does.  All it needs is a
corresponding change to replace the paravirtualized IO APIC writes with
a hypercall to directly install the vector for a given GSI (ie, to make
use of Xen's GSI -> chip+pin tables rather than relying on dom0 to do
the discovery for itself).

> # HG changeset patch # User xiantao.zhang@xxxxxxxxx # Date 1251742441
> 14400 # Node ID 3c91a9ef5010ef6188728948d6145346b650414a # Parent
> dd71c509f3ff13326ddbb8fa39f638022aa814ff Dummy
> PHYSDEVOP_alloc_irq_vector hypercall. Make the hypercall
> PHYSDEVOP_alloc_irq_vector dummy, and defer vector allocation to
> programe ioapic entries by dom0. Basically, dom0 shouldn't touch
> vector namespace which is only used by hypervisor for servicing real
> device's interrupts. And this patch also make broken NetBSD dom0 work
> again. Signed-off-by: Xiantao Zhang <xiantao.zhang.intel.com> diff -r
> 3b7cbf32fee9 xen/arch/x86/io_apic.c --- a/xen/arch/x86/io_apic.c Mon
> Aug 31 10:54:32 2009 +0100 +++ b/xen/arch/x86/io_apic.c Tue Sep 01
> 05:58:48 2009 -0400 @@ -85,35 +85,7 @@ static struct irq_pin_list {
> unsigned int next; } *irq_2_pin; -static int *pin_irq_map; - static
> unsigned int irq_2_pin_free_entry; - -/* Use an arry to record
> pin_2_irq_mapping */ -static int get_irq_from_apic_pin(int apic, int
> pin) -{ - int i, pin_base = 0; - - ASSERT(apic < nr_ioapics); - - for
> (i = 0; i < apic; i++) - pin_base += nr_ioapic_registers[i]; - -
> return pin_irq_map[pin_base + pin]; -} - -static void
> set_irq_to_apic_pin(int apic, int pin, int irq) -{ - - int i, pin_base
> = 0; - - ASSERT(apic < nr_ioapics); - - for (i = 0; i < apic; i++) -
> pin_base += nr_ioapic_registers[i]; - - pin_irq_map[pin_base + pin] =
> irq; -} /* * The common case is 1:1 IRQ<->pin mappings. Sometimes
> there are @@ -141,42 +113,6 @@ static void add_pin_to_irq(unsigned int
> } entry->apic = apic; entry->pin = pin; - - set_irq_to_apic_pin(apic,
> pin, irq); -} - -static void remove_pin_at_irq(unsigned int irq, int
> apic, int pin) -{ - struct irq_pin_list *entry, *prev; - - for (entry
> = &irq_2_pin[irq]; ; entry = &irq_2_pin[entry->next]) { - if
> ((entry->apic == apic) && (entry->pin == pin)) - break; - if
> (!entry->next) - BUG(); - } - - entry->pin = entry->apic = -1; - - if
> (entry != &irq_2_pin[irq]) { - /* Removed entry is not at head of
> list. */ - prev = &irq_2_pin[irq]; - while (&irq_2_pin[prev->next] !=
> entry) - prev = &irq_2_pin[prev->next]; - prev->next = entry->next; -
> entry->next = irq_2_pin_free_entry; - irq_2_pin_free_entry = entry -
> irq_2_pin; - } else if (entry->next != 0) { - /* Removed entry is at
> head of multi-item list. */ - prev = entry; - entry =
> &irq_2_pin[entry->next]; - *prev = *entry; - entry->pin = entry->apic
> = -1; - entry->next = irq_2_pin_free_entry; - irq_2_pin_free_entry =
> entry - irq_2_pin; - } - - set_irq_to_apic_pin(apic, pin, -1); } /* @@
> -1138,15 +1074,11 @@ static void __init enable_IO_APIC(void) /*
> Initialise dynamic irq_2_pin free list. */ irq_2_pin =
> xmalloc_array(struct irq_pin_list, PIN_MAP_SIZE); memset(irq_2_pin, 0,
> PIN_MAP_SIZE * sizeof(*irq_2_pin)); - pin_irq_map = xmalloc_array(int,
> nr_irqs_gsi); - memset(pin_irq_map, 0, nr_irqs_gsi * sizeof(int)); for
> (i = 0; i < PIN_MAP_SIZE; i++) irq_2_pin[i].pin = -1; for (i =
> irq_2_pin_free_entry = nr_irqs_gsi; i < PIN_MAP_SIZE; i++)
> irq_2_pin[i].next = i + 1; - for (i = 0; i < nr_irqs_gsi; i++) -
> pin_irq_map[i] = -1; for(apic = 0; apic < nr_ioapics; apic++) { int
> pin; @@ -2165,6 +2097,24 @@ static int ioapic_physbase_to_id(unsigne
> return -EINVAL; } +unsigned apic_gsi_base(int apic); + +int
> apic_pin_2_gsi_irq(int apic, int pin) +{ + int idx, irq; + + if (apic
> < 0) + return -EINVAL; + + irq = apic_gsi_base(apic) + pin; + if (apic
> == 0) { + idx = find_irq_entry(apic, pin, mp_INT); + if (idx >= 0) +
> irq = pin_2_irq(idx, apic, pin); + } + return irq; +} + int
> ioapic_guest_read(unsigned long physbase, unsigned int reg, u32 *pval)
> { int apic; @@ -2182,17 +2132,19 @@ int ioapic_guest_read(unsigned
> long phys #define WARN_BOGUS_WRITE(f, a...) \ dprintk(XENLOG_INFO,
> "\n%s: " \ - "apic=%d, pin=%d, old_irq=%d, new_irq=%d\n" \ - "%s:
> old_entry=%08x, new_entry=%08x\n" \ - "%s: " f, __FUNCTION__, apic,
> pin, old_irq, new_irq, \ - __FUNCTION__, *(u32 *)&old_rte, *(u32
> *)&new_rte, \ + "apic=%d, pin=%d, irq=%d\n" \ + "%s: new_entry=%08x\n"
> \ + "%s: " f, __FUNCTION__, apic, pin, irq, \ + __FUNCTION__, *(u32
> *)&rte, \ __FUNCTION__ , ##a ) int ioapic_guest_write(unsigned long
> physbase, unsigned int reg, u32 val) { - int apic, pin, old_irq = -1,
> new_irq = -1; - struct IO_APIC_route_entry old_rte = { 0 }, new_rte =
> { 0 }; + int apic, pin, irq, ret, vector, pirq; + struct
> IO_APIC_route_entry rte = { 0 }; 

Is this just a gratuitious renaming, or does it serve some other purpose?

> unsigned long flags; + struct irq_cfg *cfg; + struct irq_desc *desc;
> if ( (apic = ioapic_physbase_to_id(physbase)) < 0 ) return apic; @@
> -2204,7 +2156,7 @@ int ioapic_guest_write(unsigned long phy pin = (reg
> - 0x10) >> 1; /* Write first half from guest; second half is target
> info. */ - *(u32 *)&new_rte = val; + *(u32 *)&rte = val; /* * What
> about weird destination types? @@ -2214,7 +2166,7 @@ int
> ioapic_guest_write(unsigned long phy * ExtINT: Ignore? Linux only
> asserts this at start of day. * For now, print a message and return an
> error. We can fix up on demand. */ - if ( new_rte.delivery_mode >
> dest_LowestPrio ) + if ( rte.delivery_mode > dest_LowestPrio ) {
> printk("ERROR: Attempt to write weird IOAPIC destination mode!\n");
> printk(" APIC=%d/%d, lo-reg=%x\n", apic, pin, val); @@ -2225,83
> +2177,65 @@ int ioapic_guest_write(unsigned long phy * The guest does
> not know physical APIC arrangement (flat vs. cluster). * Apply genapic
> conventions for this platform. */ - new_rte.delivery_mode =
> INT_DELIVERY_MODE; - new_rte.dest_mode = INT_DEST_MODE; +
> rte.delivery_mode = INT_DELIVERY_MODE; + rte.dest_mode =
> INT_DEST_MODE; + + irq = apic_pin_2_gsi_irq(apic, pin); + if ( irq < 0
> ) + return irq; + + desc = irq_to_desc(irq); + cfg = desc->chip_data;
> + + /* Since PHYSDEVOP_alloc_irq_vector is dummy, rte.vector is the
> pirq + which corresponds to this ioapic pin, retrieve it for building
> + pirq and irq mapping. + */ + pirq = rte.vector; + if(pirq < 0 ||
> pirq >= dom0->nr_pirqs) + return -EINVAL; + + if ( desc->action ) + {
> + WARN_BOGUS_WRITE("Attempt to modify IO-APIC pin for in-use IRQ!\n");
> + return 0; + } + + if ( cfg->vector <= 0 || cfg->vector >
> LAST_DYNAMIC_VECTOR ) { + + printk("allocated vector for irq:%d\n",
> irq); + + vector = assign_irq_vector(irq); + if ( vector < 0 ) +
> return vector; + + add_pin_to_irq(irq, apic, pin); 


If this is replacing or disabling a previously set "vector" will it
properly clean up the old state?  (I don't fully understand how all the
datastructures relate pirq/vector/apic/pin to each other.)

> + } + spin_lock(&pcidevs_lock); + spin_lock(&dom0->event_lock); + ret
> = map_domain_pirq(dom0, pirq, irq, + MAP_PIRQ_TYPE_GSI, NULL); +
> spin_unlock(&dom0->event_lock); + spin_unlock(&pcidevs_lock); + if (
> ret < 0 ) + return ret; spin_lock_irqsave(&ioapic_lock, flags); - - /*
> Read first (interesting) half of current routing entry. */ - *(u32
> *)&old_rte = io_apic_read(apic, 0x10 + 2 * pin); - - /* No change to
> the first half of the routing entry? Bail quietly. */ - if ( *(u32
> *)&old_rte == *(u32 *)&new_rte ) - { -
> spin_unlock_irqrestore(&ioapic_lock, flags); - return 0; - } - - /*
> Special delivery modes (SMI,NMI,INIT,ExtInt) should have no vector. */
> - if ( (old_rte.delivery_mode > dest_LowestPrio) && (old_rte.vector !=
> 0) ) - { - WARN_BOGUS_WRITE("Special delivery mode %d with non-zero
> vector " - "%02x\n", old_rte.delivery_mode, old_rte.vector); - /*
> Nobble the vector here as it does not relate to a valid irq. */ -
> old_rte.vector = 0; - } - - if ( old_rte.vector >=
> FIRST_DYNAMIC_VECTOR ) - old_irq = get_irq_from_apic_pin(apic, pin); -
> - /* FIXME: dirty hack to support per-cpu vector. */ - new_irq =
> new_rte.vector; - - if ( (old_irq != new_irq) && (old_irq >= 0) &&
> IO_APIC_IRQ(old_irq) ) - { - if ( irq_desc[old_irq].action ) - { -
> WARN_BOGUS_WRITE("Attempt to remove IO-APIC pin of in-use IRQ!\n"); -
> spin_unlock_irqrestore(&ioapic_lock, flags); - return 0; - } - -
> remove_pin_at_irq(old_irq, apic, pin); - } - - if ( (new_irq >= 0) &&
> IO_APIC_IRQ(new_irq) ) - { - if ( irq_desc[new_irq].action ) - { -
> WARN_BOGUS_WRITE("Attempt to %s IO-APIC pin for in-use IRQ!\n", -
> (old_irq != new_irq) ? "add" : "modify"); -
> spin_unlock_irqrestore(&ioapic_lock, flags); - return 0; - } - - /*
> Set the correct irq-handling type. */ - irq_desc[new_irq].handler =
> new_rte.trigger ? - &ioapic_level_type: &ioapic_edge_type; - - if (
> old_irq != new_irq ) - add_pin_to_irq(new_irq, apic, pin); - - /* Mask
> iff level triggered. */ - new_rte.mask = new_rte.trigger; - /* Set the
> vector field to the real vector! */ - new_rte.vector =
> irq_cfg[new_irq].vector; - } - else if ( !new_rte.mask ) - { - /* This
> pin leads nowhere but the guest has not masked it. */ -
> WARN_BOGUS_WRITE("Installing bogus unmasked IO-APIC entry!\n"); -
> new_rte.mask = 1; - } - - new_rte.dest.logical.logical_dest = -
> cpu_mask_to_apicid(irq_cfg[new_irq].domain); - - io_apic_write(apic,
> 0x10 + 2 * pin, *(((int *)&new_rte) + 0)); - io_apic_write(apic, 0x11
> + 2 * pin, *(((int *)&new_rte) + 1)); - + /* Set the correct
> irq-handling type. */ + desc->handler = rte.trigger ? +
> &ioapic_level_type: &ioapic_edge_type; + + /* Mask iff level
> triggered. */ + rte.mask = rte.trigger; + /* Set the vector field to
> the real vector! */ + rte.vector = cfg->vector; + +
> rte.dest.logical.logical_dest = + cpu_mask_to_apicid(cfg->domain); + +
> io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&rte) + 0)); +
> io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&rte) + 1)); +
> spin_unlock_irqrestore(&ioapic_lock, flags); return 0; diff -r
> 3b7cbf32fee9 xen/arch/x86/irq.c --- a/xen/arch/x86/irq.c Mon Aug 31
> 10:54:32 2009 +0100 +++ b/xen/arch/x86/irq.c Tue Sep 01 05:58:48 2009
> -0400 @@ -1293,9 +1293,9 @@ int map_domain_pirq( if ( (old_irq &&
> (old_irq != irq) ) || (old_pirq && (old_pirq != pirq)) ) { -
> dprintk(XENLOG_G_ERR, "dom%d: pirq %d or irq %d already mapped\n", +
> dprintk(XENLOG_G_WARNING, "dom%d: pirq %d or irq %d already mapped\n",
> d->domain_id, pirq, irq); - return -EINVAL; + return 0; } ret =
> irq_permit_access(d, pirq); diff -r 3b7cbf32fee9
> xen/arch/x86/mpparse.c --- a/xen/arch/x86/mpparse.c Mon Aug 31
> 10:54:32 2009 +0100 +++ b/xen/arch/x86/mpparse.c Tue Sep 01 05:58:48
> 2009 -0400 @@ -958,6 +958,10 @@ unsigned highest_gsi(void) return res;
> } +unsigned apic_gsi_base(int apic) +{ + return
> mp_ioapic_routing[apic].gsi_base; +} void __init
> mp_override_legacy_irq ( u8 bus_irq, diff -r 3b7cbf32fee9
> xen/arch/x86/physdev.c --- a/xen/arch/x86/physdev.c Mon Aug 31
> 10:54:32 2009 +0100 +++ b/xen/arch/x86/physdev.c Tue Sep 01 06:00:15
> 2009 -0400 @@ -329,7 +329,6 @@ ret_t do_physdev_op(int cmd,
> XEN_GUEST_H case PHYSDEVOP_alloc_irq_vector: { struct physdev_irq
> irq_op; - int vector; ret = -EFAULT; if ( copy_from_guest(&irq_op,
> arg, 1) != 0 ) @@ -343,26 +342,13 @@ ret_t do_physdev_op(int cmd,
> XEN_GUEST_H if ( ret ) break; - irq = irq_op.irq; - ret = -EINVAL; +
> /* Vector is only used by hypervisor, and dom0 shouldn't + touch it in
> its world, return irq_op.irq as the vecotr, + and make this hypercall
> dummy, and also defer the vector + allocation when dom0 tries to
> programe ioapic entry. */ + irq_op.vector = irq_op.irq; + ret = 0; -
> /* FIXME: Once dom0 breaks GSI IRQ limit, it is - a must to eliminate
> the limit here */ - BUG_ON(irq >= 256); - - vector =
> assign_irq_vector(irq); - if (vector >= FIRST_DYNAMIC_VECTOR) -
> irq_op.vector = irq; - else - irq_op.vector = -ENOSPC; - -
> spin_lock(&pcidevs_lock); - spin_lock(&dom0->event_lock); - ret =
> map_domain_pirq(dom0, irq_op.irq, irq, - MAP_PIRQ_TYPE_GSI, NULL); -
> spin_unlock(&dom0->event_lock); - spin_unlock(&pcidevs_lock); - if (
> copy_to_guest(arg, &irq_op, 1) != 0 ) ret = -EFAULT; break; diff -r
> 3b7cbf32fee9 xen/common/domain.c --- a/xen/common/domain.c Mon Aug 31
> 10:54:32 2009 +0100 +++ b/xen/common/domain.c Tue Sep 01 05:58:48 2009
> -0400 @@ -197,7 +197,7 @@ struct vcpu *alloc_idle_vcpu(unsigned in
> return alloc_vcpu(d, vcpu_id, cpu_id); } -static unsigned int
> extra_dom0_irqs, extra_domU_irqs = 8; +static unsigned int
> extra_dom0_irqs = 256, extra_domU_irqs = 32; static void __init
> parse_extra_guest_irqs(const char *s) { if ( isdigit(*s) ) @@ -253,9
> +253,11 @@ struct domain *domain_create( d->is_paused_by_controller =
> 1; atomic_inc(&d->pause_count); - d->nr_pirqs = (nr_irqs_gsi + -
> (domid ? extra_domU_irqs : - extra_dom0_irqs ?: nr_irqs_gsi)); + if (
> domid ) + d->nr_pirqs = nr_irqs_gsi + extra_domU_irqs; + else +
> d->nr_pirqs = nr_irqs_gsi + extra_dom0_irqs; 


Surely there's a better test for this?  We don't want to be hard-coding
"privileged domain == dom0" into Xen.  What version is this patch
against anyway?  I don't see this "extra_dom0_irqs" stuff in current
Xen-unstable.

> + d->pirq_to_evtchn = xmalloc_array(u16, d->nr_pirqs); d->pirq_mask =
> xmalloc_array( unsigned long, BITS_TO_LONGS(d->nr_pirqs)); 

Thanks,
        J


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.