With IRQs getting bound to the pCPU that the binding vCPU currently runs
on, quite a bit of extra cross-CPU traffic can result as soon as that
vCPU gets moved to a different pCPU; the IRQ's affinity should therefore
follow the vCPU. Likewise, when a domain re-binds an event channel
associated with a pIRQ to another vCPU, that IRQ's affinity should also
be adjusted. The open issue is how to break ties for interrupts shared
by multiple domains - currently, the last request made (at any point in
time) is honored. (An illustrative guest-side sketch of the rebind path
is appended after the diff.)

Signed-off-by: Jan Beulich

--- 2010-06-15.orig/xen/arch/ia64/xen/irq.c	2009-10-29 12:24:48.000000000 +0100
+++ 2010-06-15/xen/arch/ia64/xen/irq.c	2010-06-15 09:21:05.000000000 +0200
@@ -612,6 +612,11 @@ xen_debug_irq(unsigned long vector, stru
     }
 }
 
+void pirq_set_affinity(struct domain *d, int irq, const cpumask_t *mask)
+{
+    /* FIXME */
+}
+
 /*
  * Exit an interrupt context. Process softirqs if needed and possible:
  */
--- 2010-06-15.orig/xen/arch/x86/hvm/hvm.c	2010-06-11 11:41:35.000000000 +0200
+++ 2010-06-15/xen/arch/x86/hvm/hvm.c	2010-06-15 09:21:05.000000000 +0200
@@ -270,7 +270,7 @@ void hvm_migrate_pirqs(struct vcpu *v)
             continue;
         irq = desc - irq_desc;
         ASSERT(MSI_IRQ(irq));
-        irq_set_affinity(irq, *cpumask_of(v->processor));
+        irq_set_affinity(desc, cpumask_of(v->processor));
         spin_unlock_irq(&desc->lock);
     }
     spin_unlock(&d->event_lock);
--- 2010-06-15.orig/xen/arch/x86/irq.c	2010-06-11 11:41:35.000000000 +0200
+++ 2010-06-15/xen/arch/x86/irq.c	2010-06-15 09:21:05.000000000 +0200
@@ -501,16 +501,28 @@ void move_native_irq(int irq)
 }
 
 /* For re-setting irq interrupt affinity for specific irq */
-void irq_set_affinity(int irq, cpumask_t mask)
+void irq_set_affinity(struct irq_desc *desc, const cpumask_t *mask)
 {
-    struct irq_desc *desc = irq_to_desc(irq);
-
     if (!desc->handler->set_affinity)
         return;
 
     ASSERT(spin_is_locked(&desc->lock));
+    desc->status &= ~IRQ_MOVE_PENDING;
+    wmb();
+    cpus_copy(desc->pending_mask, *mask);
+    wmb();
     desc->status |= IRQ_MOVE_PENDING;
-    cpus_copy(desc->pending_mask, mask);
+}
+
+void pirq_set_affinity(struct domain *d, int pirq, const cpumask_t *mask)
+{
+    unsigned long flags;
+    struct irq_desc *desc = domain_spin_lock_irq_desc(d, pirq, &flags);
+
+    if ( !desc )
+        return;
+    irq_set_affinity(desc, mask);
+    spin_unlock_irqrestore(&desc->lock, flags);
 }
 
 DEFINE_PER_CPU(unsigned int, irq_count);
--- 2010-06-15.orig/xen/common/event_channel.c	2010-06-11 11:41:35.000000000 +0200
+++ 2010-06-15/xen/common/event_channel.c	2010-06-15 09:21:05.000000000 +0200
@@ -295,10 +295,36 @@ static long evtchn_bind_ipi(evtchn_bind_
 }
 
 
+static void link_pirq_port(int port, struct evtchn *chn, struct vcpu *v)
+{
+    chn->u.pirq.prev_port = 0;
+    chn->u.pirq.next_port = v->pirq_evtchn_head;
+    if ( v->pirq_evtchn_head )
+        evtchn_from_port(v->domain, v->pirq_evtchn_head)
+            ->u.pirq.prev_port = port;
+    v->pirq_evtchn_head = port;
+}
+
+static void unlink_pirq_port(struct evtchn *chn, struct vcpu *v)
+{
+    struct domain *d = v->domain;
+
+    if ( chn->u.pirq.prev_port )
+        evtchn_from_port(d, chn->u.pirq.prev_port)->u.pirq.next_port =
+            chn->u.pirq.next_port;
+    else
+        v->pirq_evtchn_head = chn->u.pirq.next_port;
+    if ( chn->u.pirq.next_port )
+        evtchn_from_port(d, chn->u.pirq.next_port)->u.pirq.prev_port =
+            chn->u.pirq.prev_port;
+}
+
+
 static long evtchn_bind_pirq(evtchn_bind_pirq_t *bind)
 {
     struct evtchn *chn;
     struct domain *d = current->domain;
+    struct vcpu   *v = d->vcpu[0];
     int            port, pirq = bind->pirq;
     long           rc;
 
@@ -319,7 +345,7 @@ static long evtchn_bind_pirq(evtchn_bind
     chn = evtchn_from_port(d, port);
 
     d->pirq_to_evtchn[pirq] = port;
-    rc = pirq_guest_bind(d->vcpu[0], pirq,
+    rc = pirq_guest_bind(v, pirq,
                          !!(bind->flags & BIND_PIRQ__WILL_SHARE));
     if ( rc != 0 )
     {
@@ -328,7 +354,8 @@ static long evtchn_bind_pirq(evtchn_bind
     }
 
     chn->state = ECS_PIRQ;
-    chn->u.pirq = pirq;
+    chn->u.pirq.irq = pirq;
+    link_pirq_port(port, chn, v);
 
     bind->port = port;
 
@@ -376,8 +403,9 @@ static long __evtchn_close(struct domain
         break;
 
     case ECS_PIRQ:
-        pirq_guest_unbind(d1, chn1->u.pirq);
-        d1->pirq_to_evtchn[chn1->u.pirq] = 0;
+        pirq_guest_unbind(d1, chn1->u.pirq.irq);
+        d1->pirq_to_evtchn[chn1->u.pirq.irq] = 0;
+        unlink_pirq_port(chn1, d1->vcpu[chn1->notify_vcpu_id]);
         break;
 
     case ECS_VIRQ:
@@ -688,7 +716,7 @@ static long evtchn_status(evtchn_status_
         break;
     case ECS_PIRQ:
         status->status = EVTCHNSTAT_pirq;
-        status->u.pirq = chn->u.pirq;
+        status->u.pirq = chn->u.pirq.irq;
         break;
     case ECS_VIRQ:
         status->status = EVTCHNSTAT_virq;
@@ -747,8 +775,16 @@ long evtchn_bind_vcpu(unsigned int port,
         break;
     case ECS_UNBOUND:
    case ECS_INTERDOMAIN:
+        chn->notify_vcpu_id = vcpu_id;
+        break;
     case ECS_PIRQ:
+        if ( chn->notify_vcpu_id == vcpu_id )
+            break;
+        unlink_pirq_port(chn, d->vcpu[chn->notify_vcpu_id]);
         chn->notify_vcpu_id = vcpu_id;
+        pirq_set_affinity(d, chn->u.pirq.irq,
+                          cpumask_of(d->vcpu[vcpu_id]->processor));
+        link_pirq_port(port, chn, d->vcpu[vcpu_id]);
         break;
     default:
         rc = -EINVAL;
@@ -1064,6 +1100,23 @@ void evtchn_destroy_final(struct domain
 }
 
 
+void evtchn_move_pirqs(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    const cpumask_t *mask = cpumask_of(v->processor);
+    unsigned int port;
+    struct evtchn *chn;
+
+    spin_lock(&d->event_lock);
+    for ( port = v->pirq_evtchn_head; port; port = chn->u.pirq.next_port )
+    {
+        chn = evtchn_from_port(d, port);
+        pirq_set_affinity(d, chn->u.pirq.irq, mask);
+    }
+    spin_unlock(&d->event_lock);
+}
+
+
 static void domain_dump_evtchn_info(struct domain *d)
 {
     unsigned int port;
@@ -1105,7 +1158,7 @@ static void domain_dump_evtchn_info(stru
                    chn->u.interdomain.remote_port);
             break;
         case ECS_PIRQ:
-            printk(" p=%d", chn->u.pirq);
+            printk(" p=%d", chn->u.pirq.irq);
             break;
         case ECS_VIRQ:
             printk(" v=%d", chn->u.virq);
--- 2010-06-15.orig/xen/common/schedule.c	2010-06-11 11:41:35.000000000 +0200
+++ 2010-06-15/xen/common/schedule.c	2010-06-15 09:21:05.000000000 +0200
@@ -272,6 +272,7 @@ int sched_move_domain(struct domain *d,
         cpus_setall(v->cpu_affinity);
         v->processor = new_p;
         v->sched_priv = vcpu_priv[v->vcpu_id];
+        evtchn_move_pirqs(v);
 
         new_p = cycle_cpu(new_p, c->cpu_valid);
     }
@@ -419,6 +420,9 @@ static void vcpu_migrate(struct vcpu *v)
     spin_unlock_irqrestore(
         per_cpu(schedule_data, old_cpu).schedule_lock, flags);
 
+    if ( old_cpu != new_cpu )
+        evtchn_move_pirqs(v);
+
     /* Wake on new CPU. */
     vcpu_wake(v);
 }
@@ -1094,6 +1098,9 @@ static void schedule(void)
 
     stop_timer(&prev->periodic_timer);
 
+    if ( next_slice.migrated )
+        evtchn_move_pirqs(next);
+
     /* Ensure that the domain has an up-to-date time base. */
     update_vcpu_system_time(next);
     vcpu_periodic_timer_work(next);
--- 2010-06-15.orig/xen/common/sched_credit.c	2010-05-20 09:59:27.000000000 +0200
+++ 2010-06-15/xen/common/sched_credit.c	2010-06-15 09:21:05.000000000 +0200
@@ -1168,7 +1168,7 @@ csched_runq_steal(int peer_cpu, int cpu,
 
 static struct csched_vcpu *
 csched_load_balance(struct csched_private *prv, int cpu,
-    struct csched_vcpu *snext)
+    struct csched_vcpu *snext, bool_t *stolen)
 {
     struct csched_vcpu *speer;
     cpumask_t workers;
@@ -1221,7 +1221,10 @@ csched_load_balance(struct csched_privat
         speer = csched_runq_steal(peer_cpu, cpu, snext->pri);
         spin_unlock(per_cpu(schedule_data, peer_cpu).schedule_lock);
         if ( speer != NULL )
+        {
+            *stolen = 1;
             return speer;
+        }
     }
 
  out:
@@ -1269,6 +1272,7 @@ csched_schedule(
     BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
 
     snext = __runq_elem(runq->next);
+    ret.migrated = 0;
 
     /* Tasklet work (which runs in idle VCPU context) overrides all else. */
     if ( tasklet_work_scheduled )
@@ -1288,7 +1292,7 @@ csched_schedule(
     if ( snext->pri > CSCHED_PRI_TS_OVER )
         __runq_remove(snext);
     else
-        snext = csched_load_balance(prv, cpu, snext);
+        snext = csched_load_balance(prv, cpu, snext, &ret.migrated);
 
     /*
      * Update idlers mask if necessary. When we're idling, other CPUs
--- 2010-06-15.orig/xen/common/sched_credit2.c	2010-05-20 09:59:27.000000000 +0200
+++ 2010-06-15/xen/common/sched_credit2.c	2010-06-15 09:22:13.000000000 +0200
@@ -991,10 +991,17 @@ csched_schedule(
     }
 #endif
 
+    ret.migrated = 0;
+
     if ( !is_idle_vcpu(snext->vcpu) )
     {
         snext->start_time = now;
-        snext->vcpu->processor = cpu; /* Safe because lock for old processor is held */
+        /* Safe because lock for old processor is held */
+        if ( snext->vcpu->processor != cpu )
+        {
+            snext->vcpu->processor = cpu;
+            ret.migrated = 1;
+        }
     }
 
     /*
--- 2010-06-15.orig/xen/common/sched_sedf.c	2010-05-20 09:59:27.000000000 +0200
+++ 2010-06-15/xen/common/sched_sedf.c	2010-06-15 09:21:05.000000000 +0200
@@ -875,6 +875,8 @@ static struct task_slice sedf_do_schedul
         ret.time = EXTRA_QUANTUM;
     }
 
+    ret.migrated = 0;
+
     EDOM_INFO(ret.task)->sched_start_abs = now;
     CHECK(ret.time > 0);
     ASSERT(sedf_runnable(ret.task));
--- 2010-06-15.orig/xen/include/asm-x86/irq.h	2009-12-16 09:14:13.000000000 +0100
+++ 2010-06-15/xen/include/asm-x86/irq.h	2010-06-15 09:21:05.000000000 +0200
@@ -143,7 +143,7 @@ void move_native_irq(int irq);
 
 void move_masked_irq(int irq);
 
-void irq_set_affinity(int irq, cpumask_t mask);
+void irq_set_affinity(struct irq_desc *, const cpumask_t *mask);
 
 #define domain_pirq_to_irq(d, pirq) ((d)->arch.pirq_irq[pirq])
 #define domain_irq_to_pirq(d, irq) ((d)->arch.irq_pirq[irq])
--- 2010-06-15.orig/xen/include/xen/cpumask.h	2010-05-17 08:45:28.000000000 +0200
+++ 2010-06-15/xen/include/xen/cpumask.h	2010-06-15 09:21:05.000000000 +0200
@@ -206,7 +206,7 @@ static inline int __cpus_weight(const cp
 }
 
 #define cpus_copy(dest, src) __cpus_copy(&(dest), &(src))
-static inline void __cpus_copy(cpumask_t *dstp, cpumask_t *srcp)
+static inline void __cpus_copy(cpumask_t *dstp, const cpumask_t *srcp)
 {
 	bitmap_copy(dstp->bits, srcp->bits, NR_CPUS);
 }
--- 2010-06-15.orig/xen/include/xen/event.h	2010-06-11 11:41:35.000000000 +0200
+++ 2010-06-15/xen/include/xen/event.h	2010-06-15 09:21:05.000000000 +0200
@@ -47,6 +47,9 @@ long evtchn_bind_vcpu(unsigned int port,
 /* Unmask a local event-channel port. */
 int evtchn_unmask(unsigned int port);
 
+/* Move all PIRQs after a vCPU was moved to another pCPU. */
+void evtchn_move_pirqs(struct vcpu *v);
+
 /* Allocate/free a Xen-attached event channel port. */
 int alloc_unbound_xen_event_channel(
     struct vcpu *local_vcpu, domid_t remote_domid);
--- 2010-06-15.orig/xen/include/xen/irq.h	2009-10-29 12:24:49.000000000 +0100
+++ 2010-06-15/xen/include/xen/irq.h	2010-06-15 09:21:05.000000000 +0200
@@ -138,6 +138,7 @@ extern int pirq_guest_eoi(struct domain
 extern int pirq_guest_unmask(struct domain *d);
 extern int pirq_guest_bind(struct vcpu *v, int irq, int will_share);
 extern void pirq_guest_unbind(struct domain *d, int irq);
+extern void pirq_set_affinity(struct domain *d, int irq, const cpumask_t *);
 
 extern irq_desc_t *domain_spin_lock_irq_desc(
     struct domain *d, int irq, unsigned long *pflags);
--- 2010-06-15.orig/xen/include/xen/sched.h	2010-06-14 08:49:36.000000000 +0200
+++ 2010-06-15/xen/include/xen/sched.h	2010-06-15 09:21:05.000000000 +0200
@@ -61,7 +61,11 @@ struct evtchn
             u16            remote_port;
             struct domain *remote_dom;
         } interdomain; /* state == ECS_INTERDOMAIN */
-        u16 pirq;      /* state == ECS_PIRQ */
+        struct {
+            u16 irq;
+            u16 next_port;
+            u16 prev_port;
+        } pirq;        /* state == ECS_PIRQ */
         u16 virq;      /* state == ECS_VIRQ */
     } u;
 #ifdef FLASK_ENABLE
@@ -142,6 +146,9 @@ struct vcpu
      */
     int              poll_evtchn;
 
+    /* (over-)protected by ->domain->event_lock */
+    int              pirq_evtchn_head;
+
     unsigned long    pause_flags;
     atomic_t         pause_count;
 
--- 2010-06-15.orig/xen/include/xen/sched-if.h	2010-05-20 09:59:27.000000000 +0200
+++ 2010-06-15/xen/include/xen/sched-if.h	2010-06-15 09:21:05.000000000 +0200
@@ -79,6 +79,7 @@ static inline void vcpu_schedule_unlock(
 struct task_slice {
     struct vcpu *task;
     s_time_t     time;
+    bool_t       migrated;
 };
 
 struct scheduler {
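
For illustration only (not part of the patch): a minimal guest-side sketch
of the rebind path this change is concerned with - binding a pIRQ to an
event channel (ECS_PIRQ ports always start out targeting vCPU0, i.e.
d->vcpu[0] above) and then retargeting the port to another vCPU, at which
point the hypervisor now also adjusts the physical IRQ's affinity via
pirq_set_affinity(). The EVTCHNOP_* commands and structures come from the
public event_channel.h interface; the HYPERVISOR_event_channel_op()
wrapper and the helper's name are illustrative assumptions about the
guest OS, not something this patch introduces.

    /* Hypothetical guest-side helper; assumes the usual two-argument
     * hypercall wrapper HYPERVISOR_event_channel_op(cmd, arg) and the
     * guest's copy of xen/public/event_channel.h. */
    static int bind_pirq_to_vcpu(uint32_t pirq, uint32_t vcpu)
    {
        struct evtchn_bind_pirq bind_pirq = {
            .pirq  = pirq,
            .flags = BIND_PIRQ__WILL_SHARE, /* allow sharing with other domains */
        };
        struct evtchn_bind_vcpu bind_vcpu;
        int rc;

        /* Creates an ECS_PIRQ port bound to vCPU0. */
        rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
        if ( rc )
            return rc;

        /* Retargeting the port now also updates the IRQ's affinity. */
        bind_vcpu.port = bind_pirq.port;
        bind_vcpu.vcpu = vcpu;
        return HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu);
    }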