
[Xen-changelog] Merged.



# HG changeset patch
# User emellor@xxxxxxxxxxxxxxxxxxxxxx
# Node ID 4b89195850398b85cd5a3b57ba8228209f010fd9
# Parent  642b26779c4ecb1538032f5fb66b3a83f3ce9d73
# Parent  821368442403cb9110f466a9c7c2c9849bef9733
Merged.

diff -r 642b26779c4e -r 4b8919585039 linux-2.6-xen-sparse/arch/xen/i386/kernel/entry.S
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/entry.S Thu Jan 12 12:13:34 2006
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/entry.S Thu Jan 12 12:20:04 2006
@@ -76,7 +76,9 @@
 DF_MASK                = 0x00000400 
 NT_MASK                = 0x00004000
 VM_MASK                = 0x00020000
-
+/* Pseudo-eflags. */
+NMI_MASK       = 0x80000000
+       
 /* Offsets into shared_info_t. */
 #define evtchn_upcall_pending          /* 0 */
 #define evtchn_upcall_mask             1
@@ -305,8 +307,8 @@
        je ldt_ss                       # returning to user-space with LDT SS
 #endif /* XEN */
 restore_nocheck:
-       testl $VM_MASK, EFLAGS(%esp)
-       jnz resume_vm86
+       testl $(VM_MASK|NMI_MASK), EFLAGS(%esp)
+       jnz hypervisor_iret
        movb EVENT_MASK(%esp), %al
        notb %al                        # %al == ~saved_mask
        XEN_GET_VCPU_INFO(%esi)
@@ -328,11 +330,11 @@
        .long 1b,iret_exc
 .previous
 
-resume_vm86:
-       XEN_UNBLOCK_EVENTS(%esi)
+hypervisor_iret:
+       andl $~NMI_MASK, EFLAGS(%esp)
        RESTORE_REGS
        movl %eax,(%esp)
-       movl $__HYPERVISOR_switch_vm86,%eax
+       movl $__HYPERVISOR_iret,%eax
        int $0x82
        ud2
 
@@ -691,6 +693,15 @@
        call do_debug
        jmp ret_from_exception
 
+ENTRY(nmi)
+       pushl %eax
+       SAVE_ALL
+       xorl %edx,%edx          # zero error code
+       movl %esp,%eax          # pt_regs pointer
+       call do_nmi
+       orl  $NMI_MASK, EFLAGS(%esp)
+       jmp restore_all
+
 #if 0 /* XEN */
 /*
  * NMI is doubly nasty. It can happen _while_ we're handling
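
The i386 entry.S changes above work by stealing bit 31 of the saved EFLAGS word
as a pseudo-flag, NMI_MASK. Bit 31 is reserved-zero in real EFLAGS, so the NMI
stub can set it after do_nmi returns, and the common restore path can test it
(together with VM_MASK) to divert the return through the new iret hypercall
rather than a plain iret. A minimal C sketch of that decision logic, with a
hypothetical saved_eflags standing in for EFLAGS(%esp):

    #define VM_MASK   0x00020000UL  /* real EFLAGS.VM bit */
    #define NMI_MASK  0x80000000UL  /* pseudo-flag; reserved-zero in real EFLAGS */

    /* Mirrors 'testl $(VM_MASK|NMI_MASK), EFLAGS(%esp); jnz hypervisor_iret'. */
    static int must_iret_via_hypervisor(unsigned long saved_eflags)
    {
        return (saved_eflags & (VM_MASK | NMI_MASK)) != 0;
    }

    /* Mirrors 'andl $~NMI_MASK, EFLAGS(%esp)': the pseudo-flag must be
     * stripped before the frame is handed back to Xen. */
    static unsigned long eflags_for_hypercall(unsigned long saved_eflags)
    {
        return saved_eflags & ~NMI_MASK;
    }
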
diff -r 642b26779c4e -r 4b8919585039 linux-2.6-xen-sparse/arch/xen/i386/kernel/traps.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/traps.c Thu Jan 12 12:13:34 2006
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/traps.c Thu Jan 12 12:20:04 2006
@@ -506,18 +506,11 @@
 
 static void io_check_error(unsigned char reason, struct pt_regs * regs)
 {
-       unsigned long i;
-
        printk("NMI: IOCK error (debug interrupt?)\n");
        show_registers(regs);
 
        /* Re-enable the IOCK line, wait for a few seconds */
-       reason = (reason & 0xf) | 8;
-       outb(reason, 0x61);
-       i = 2000;
-       while (--i) udelay(1000);
-       reason &= ~8;
-       outb(reason, 0x61);
+       clear_io_check_error(reason);
 }
 
 static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
diff -r 642b26779c4e -r 4b8919585039 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/entry.S
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/entry.S       Thu Jan 12 12:13:34 2006
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/entry.S       Thu Jan 12 12:20:04 2006
@@ -121,19 +121,19 @@
        .endm
 
         /*
-         * Must be consistent with the definition in arch_x86_64.h:    
-         *     struct switch_to_user {
+         * Must be consistent with the definition in arch-x86_64.h:    
+         *     struct iret_context {
          *        u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
          *     };
          * #define VGCF_IN_SYSCALL (1<<8) 
          */
-        .macro SWITCH_TO_USER flag
+        .macro HYPERVISOR_IRET flag
        subq $8*4,%rsp                   # reuse rip, cs, rflags, rsp, ss in the stack
         movq %rax,(%rsp)
         movq %r11,1*8(%rsp)
         movq %rcx,2*8(%rsp)              # we saved %rcx upon exceptions
         movq $\flag,3*8(%rsp)
-        movq $__HYPERVISOR_switch_to_user,%rax
+        movq $__HYPERVISOR_iret,%rax
         syscall
         .endm
 
@@ -225,7 +225,7 @@
        jnz  sysret_careful 
         XEN_UNBLOCK_EVENTS(%rsi)                
        RESTORE_ARGS 0,8,0
-        SWITCH_TO_USER VGCF_IN_SYSCALL
+        HYPERVISOR_IRET VGCF_IN_SYSCALL
 
        /* Handle reschedules */
        /* edx: work, edi: workmask */  
@@ -478,7 +478,7 @@
         orb   $3,1*8(%rsp)
        iretq
 user_mode:
-       SWITCH_TO_USER 0                        
+       HYPERVISOR_IRET 0
        
        /* edi: workmask, edx: work */  
 retint_careful:
@@ -719,6 +719,18 @@
        call evtchn_do_upcall
         jmp  error_exit
 
+#ifdef CONFIG_X86_LOCAL_APIC
+ENTRY(nmi)
+       zeroentry do_nmi_callback
+ENTRY(do_nmi_callback)
+        addq $8, %rsp
+        call do_nmi
+        RESTORE_REST
+        XEN_BLOCK_EVENTS(%rsi)
+        GET_THREAD_INFO(%rcx)
+        jmp  retint_restore_args
+#endif
+
         ALIGN
 restore_all_enable_events:  
        XEN_UNBLOCK_EVENTS(%rsi)        # %rsi is already set up...
@@ -733,7 +745,7 @@
         orb   $3,1*8(%rsp)
         iretq
 crit_user_mode:
-        SWITCH_TO_USER 0
+        HYPERVISOR_IRET 0
         
 14:    XEN_LOCKED_BLOCK_EVENTS(%rsi)
        XEN_PUT_VCPU_INFO(%rsi)
diff -r 642b26779c4e -r 4b8919585039 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c       Thu Jan 12 12:13:34 2006
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c       Thu Jan 12 12:20:04 2006
@@ -62,6 +62,7 @@
 #include <asm-xen/xen-public/physdev.h>
 #include "setup_arch_pre.h"
 #include <asm/hypervisor.h>
+#include <asm-xen/xen-public/nmi.h>
 #define PFN_UP(x)       (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
 #define PFN_PHYS(x)     ((x) << PAGE_SHIFT)
 #define end_pfn_map end_pfn
diff -r 642b26779c4e -r 4b8919585039 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/traps.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/traps.c       Thu Jan 12 12:13:34 2006
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/traps.c       Thu Jan 12 12:20:04 2006
@@ -559,9 +559,11 @@
        printk("Uhhuh. NMI received. Dazed and confused, but trying to 
continue\n");
        printk("You probably have a hardware problem with your RAM chips\n");
 
+#if 0 /* XEN */
        /* Clear and disable the memory parity error line. */
        reason = (reason & 0xf) | 4;
        outb(reason, 0x61);
+#endif /* XEN */
 }
 
 static void io_check_error(unsigned char reason, struct pt_regs * regs)
@@ -569,12 +571,14 @@
        printk("NMI: IOCK error (debug interrupt?)\n");
        show_registers(regs);
 
+#if 0 /* XEN */
        /* Re-enable the IOCK line, wait for a few seconds */
        reason = (reason & 0xf) | 8;
        outb(reason, 0x61);
        mdelay(2000);
        reason &= ~8;
        outb(reason, 0x61);
+#endif /* XEN */
 }
 
 static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
diff -r 642b26779c4e -r 4b8919585039 linux-2.6-xen-sparse/include/asm-xen/asm-i386/hypercall.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/hypercall.h Thu Jan 12 12:13:34 2006
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/hypercall.h Thu Jan 12 12:20:04 2006
@@ -32,6 +32,7 @@
 
 #include <asm-xen/xen-public/xen.h>
 #include <asm-xen/xen-public/sched.h>
+#include <asm-xen/xen-public/nmi.h>
 
 #define _hypercall0(type, name)                        \
 ({                                             \
@@ -300,6 +301,14 @@
                           SHUTDOWN_suspend, srec);
 }
 
+static inline int
+HYPERVISOR_nmi_op(
+       unsigned long op,
+       unsigned long arg)
+{
+       return _hypercall2(int, nmi_op, op, arg);
+}
+
 #endif /* __HYPERCALL_H__ */
 
 /*
diff -r 642b26779c4e -r 4b8919585039 linux-2.6-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_post.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_post.h  Thu Jan 12 12:13:34 2006
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_post.h  Thu Jan 12 12:20:04 2006
@@ -29,6 +29,7 @@
 
 extern void hypervisor_callback(void);
 extern void failsafe_callback(void);
+extern void nmi(void);
 
 static void __init machine_specific_arch_setup(void)
 {
@@ -36,5 +37,7 @@
            __KERNEL_CS, (unsigned long)hypervisor_callback,
            __KERNEL_CS, (unsigned long)failsafe_callback);
 
+       HYPERVISOR_nmi_op(XENNMI_register_callback, (unsigned long)&nmi);
+
        machine_specific_modify_cpu_capabilities(&boot_cpu_data);
 }
diff -r 642b26779c4e -r 4b8919585039 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/hypercall.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/hypercall.h       Thu Jan 12 12:13:34 2006
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/hypercall.h       Thu Jan 12 12:20:04 2006
@@ -287,9 +287,9 @@
 }
 
 static inline int
-HYPERVISOR_switch_to_user(void)
-{
-       return _hypercall0(int, switch_to_user);
+HYPERVISOR_iret(void)
+{
+       return _hypercall0(int, iret);
 }
 
 static inline int
@@ -305,6 +305,14 @@
 {
        return _hypercall3(int, sched_op, SCHEDOP_shutdown,
                           SHUTDOWN_suspend, srec);
+}
+
+static inline int
+HYPERVISOR_nmi_op(
+       unsigned long op,
+       unsigned long arg)
+{
+       return _hypercall2(int, nmi_op, op, arg);
 }
 
 #endif /* __HYPERCALL_H__ */
diff -r 642b26779c4e -r 4b8919585039 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/setup_arch_post.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/setup_arch_post.h        Thu Jan 12 12:13:34 2006
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/setup_arch_post.h        Thu Jan 12 12:20:04 2006
@@ -35,6 +35,7 @@
 
 extern void hypervisor_callback(void);
 extern void failsafe_callback(void);
+extern void nmi(void);
 
 static void __init machine_specific_arch_setup(void)
 {
@@ -43,5 +44,9 @@
                 (unsigned long) failsafe_callback,
                 (unsigned long) system_call);
 
+#ifdef CONFIG_X86_LOCAL_APIC
+       HYPERVISOR_nmi_op(XENNMI_register_callback, (unsigned long)&nmi);
+#endif
+
        machine_specific_modify_cpu_capabilities(&boot_cpu_data);
 }
diff -r 642b26779c4e -r 4b8919585039 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Thu Jan 12 12:13:34 2006
+++ b/xen/arch/x86/domain.c     Thu Jan 12 12:20:04 2006
@@ -288,9 +288,7 @@
 
 #if defined(__i386__)
 
-    d->arch.mapcache.l1tab = d->arch.mm_perdomain_pt +
-        (GDT_LDT_MBYTES << (20 - PAGE_SHIFT));
-    spin_lock_init(&d->arch.mapcache.lock);
+    mapcache_init(d);
 
 #else /* __x86_64__ */
 
@@ -481,14 +479,6 @@
 
 
 #ifdef __x86_64__
-
-void toggle_guest_mode(struct vcpu *v)
-{
-    v->arch.flags ^= TF_kernel_mode;
-    __asm__ __volatile__ ( "swapgs" );
-    update_pagetables(v);
-    write_ptbase(v);
-}
 
 #define loadsegment(seg,value) ({               \
     int __r = 1;                                \
@@ -659,35 +649,6 @@
     percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
 }
 
-long do_switch_to_user(void)
-{
-    struct cpu_user_regs  *regs = guest_cpu_user_regs();
-    struct switch_to_user  stu;
-    struct vcpu    *v = current;
-
-    if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) ||
-         unlikely(pagetable_get_paddr(v->arch.guest_table_user) == 0) )
-        return -EFAULT;
-
-    toggle_guest_mode(v);
-
-    regs->rip    = stu.rip;
-    regs->cs     = stu.cs | 3; /* force guest privilege */
-    regs->rflags = (stu.rflags & ~(EF_IOPL|EF_VM)) | EF_IE;
-    regs->rsp    = stu.rsp;
-    regs->ss     = stu.ss | 3; /* force guest privilege */
-
-    if ( !(stu.flags & VGCF_IN_SYSCALL) )
-    {
-        regs->entry_vector = 0;
-        regs->r11 = stu.r11;
-        regs->rcx = stu.rcx;
-    }
-
-    /* Saved %rax gets written back to regs->rax in entry.S. */
-    return stu.rax;
-}
-
 #define switch_kernel_stack(_n,_c) ((void)0)
 
 #elif defined(__i386__)
diff -r 642b26779c4e -r 4b8919585039 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Thu Jan 12 12:13:34 2006
+++ b/xen/arch/x86/mm.c Thu Jan 12 12:20:04 2006
@@ -297,7 +297,6 @@
 
 #if defined(__x86_64__)
     /* If in user mode, switch to kernel mode just to read LDT mapping. */
-    extern void toggle_guest_mode(struct vcpu *);
     int user_mode = !(v->arch.flags & TF_kernel_mode);
 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
 #elif defined(__i386__)
@@ -2971,7 +2970,6 @@
 
 #ifdef CONFIG_X86_64
     struct vcpu *v = current;
-    extern void toggle_guest_mode(struct vcpu *);
     int user_mode = !(v->arch.flags & TF_kernel_mode);
 #endif
 
diff -r 642b26779c4e -r 4b8919585039 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Thu Jan 12 12:13:34 2006
+++ b/xen/arch/x86/traps.c      Thu Jan 12 12:20:04 2006
@@ -596,7 +596,6 @@
     u16 x;
 #if defined(__x86_64__)
     /* If in user mode, switch to kernel mode just to read I/O bitmap. */
-    extern void toggle_guest_mode(struct vcpu *);
     int user_mode = !(v->arch.flags & TF_kernel_mode);
 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
 #elif defined(__i386__)
@@ -1080,26 +1079,23 @@
     return 0;
 }
 
-
-/* Defer dom0 notification to softirq context (unsafe in NMI context). */
-static unsigned long nmi_dom0_softirq_reason;
-#define NMI_DOM0_PARITY_ERR 0
-#define NMI_DOM0_IO_ERR     1
-#define NMI_DOM0_UNKNOWN    2
-
-static void nmi_dom0_softirq(void)
-{
-    if ( dom0 == NULL )
+static void nmi_softirq(void)
+{
+    /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. */
+    evtchn_notify(dom0->vcpu[0]);
+}
+
+static void nmi_dom0_report(unsigned int reason_idx)
+{
+    struct domain *d;
+
+    if ( (d = dom0) == NULL )
         return;
 
-    if ( test_and_clear_bit(NMI_DOM0_PARITY_ERR, &nmi_dom0_softirq_reason) )
-        send_guest_virq(dom0->vcpu[0], VIRQ_PARITY_ERR);
-
-    if ( test_and_clear_bit(NMI_DOM0_IO_ERR, &nmi_dom0_softirq_reason) )
-        send_guest_virq(dom0->vcpu[0], VIRQ_IO_ERR);
-
-    if ( test_and_clear_bit(NMI_DOM0_UNKNOWN, &nmi_dom0_softirq_reason) )
-        send_guest_virq(dom0->vcpu[0], VIRQ_NMI);
+    set_bit(reason_idx, &d->shared_info->arch.nmi_reason);
+
+    if ( test_and_set_bit(_VCPUF_nmi_pending, &d->vcpu[0]->vcpu_flags) )
+        raise_softirq(NMI_SOFTIRQ); /* not safe to wake up a vcpu here */
 }
 
 asmlinkage void mem_parity_error(struct cpu_user_regs *regs)
@@ -1107,8 +1103,7 @@
     switch ( opt_nmi[0] )
     {
     case 'd': /* 'dom0' */
-        set_bit(NMI_DOM0_PARITY_ERR, &nmi_dom0_softirq_reason);
-        raise_softirq(NMI_DOM0_SOFTIRQ);
+        nmi_dom0_report(_XEN_NMIREASON_parity_error);
     case 'i': /* 'ignore' */
         break;
     default:  /* 'fatal' */
@@ -1127,8 +1122,7 @@
     switch ( opt_nmi[0] )
     {
     case 'd': /* 'dom0' */
-        set_bit(NMI_DOM0_IO_ERR, &nmi_dom0_softirq_reason);
-        raise_softirq(NMI_DOM0_SOFTIRQ);
+        nmi_dom0_report(_XEN_NMIREASON_io_error);
     case 'i': /* 'ignore' */
         break;
     default:  /* 'fatal' */
@@ -1147,8 +1141,7 @@
     switch ( opt_nmi[0] )
     {
     case 'd': /* 'dom0' */
-        set_bit(NMI_DOM0_UNKNOWN, &nmi_dom0_softirq_reason);
-        raise_softirq(NMI_DOM0_SOFTIRQ);
+        nmi_dom0_report(_XEN_NMIREASON_unknown);
     case 'i': /* 'ignore' */
         break;
     default:  /* 'fatal' */
@@ -1347,7 +1340,7 @@
 
     cpu_init();
 
-    open_softirq(NMI_DOM0_SOFTIRQ, nmi_dom0_softirq);
+    open_softirq(NMI_SOFTIRQ, nmi_softirq);
 }
 
 
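The rework above replaces the three per-reason softirq bits with a single
latch-and-notify pattern: the NMI handler records the reason in shared memory
and sets a per-VCPU pending flag, deferring only the (NMI-unsafe) event-channel
wakeup to NMI_SOFTIRQ. A condensed, self-contained C sketch of how a reason bit
round-trips to the guest's synthesized port-0x61 value, using the constants
from the new public/nmi.h and mach_traps.h further below:

    #include <stdint.h>

    #define _XEN_NMIREASON_io_error     0
    #define _XEN_NMIREASON_parity_error 1

    /* Xen side, conceptually nmi_dom0_report(): latch the reason bit. */
    static void report(unsigned long *nmi_reason, unsigned int reason_idx)
    {
        *nmi_reason |= 1UL << reason_idx;   /* set_bit() in the real code */
    }

    /* Guest side, conceptually get_nmi_reason(): fake a port-0x61 read. */
    static uint8_t synthesize_port61(unsigned long nmi_reason)
    {
        uint8_t reason = 0;
        if (nmi_reason & (1UL << _XEN_NMIREASON_io_error))
            reason |= 0x40;                 /* IOCHK: port 0x61, bit 6 */
        if (nmi_reason & (1UL << _XEN_NMIREASON_parity_error))
            reason |= 0x80;                 /* PERR:  port 0x61, bit 7 */
        return reason;
    }
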
diff -r 642b26779c4e -r 4b8919585039 xen/arch/x86/x86_32/asm-offsets.c
--- a/xen/arch/x86/x86_32/asm-offsets.c Thu Jan 12 12:13:34 2006
+++ b/xen/arch/x86/x86_32/asm-offsets.c Thu Jan 12 12:20:04 2006
@@ -65,6 +65,10 @@
            arch.guest_context.kernel_ss);
     OFFSET(VCPU_kernel_sp, struct vcpu,
            arch.guest_context.kernel_sp);
+    OFFSET(VCPU_flags, struct vcpu, vcpu_flags);
+    OFFSET(VCPU_nmi_addr, struct vcpu, nmi_addr);
+    DEFINE(_VCPUF_nmi_pending, _VCPUF_nmi_pending);
+    DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked);
     BLANK();
 
     OFFSET(VCPUINFO_upcall_pending, vcpu_info_t, evtchn_upcall_pending);
diff -r 642b26779c4e -r 4b8919585039 xen/arch/x86/x86_32/domain_page.c
--- a/xen/arch/x86/x86_32/domain_page.c Thu Jan 12 12:13:34 2006
+++ b/xen/arch/x86/x86_32/domain_page.c Thu Jan 12 12:20:04 2006
@@ -20,33 +20,16 @@
 #include <asm/flushtlb.h>
 #include <asm/hardirq.h>
 
-#define MAPCACHE_ORDER    10
-#define MAPCACHE_ENTRIES  (1 << MAPCACHE_ORDER)
-
-/* Use a spare PTE bit to mark entries ready for recycling. */
-#define READY_FOR_TLB_FLUSH (1<<10)
-
-static void flush_all_ready_maps(void)
-{
-    struct mapcache *cache = &current->domain->arch.mapcache;
-    unsigned int i;
-
-    for ( i = 0; i < MAPCACHE_ENTRIES; i++ )
-        if ( (l1e_get_flags(cache->l1tab[i]) & READY_FOR_TLB_FLUSH) )
-            cache->l1tab[i] = l1e_empty();
-}
-
-void *map_domain_pages(unsigned long pfn, unsigned int order)
+void *map_domain_page(unsigned long pfn)
 {
     unsigned long va;
-    unsigned int idx, i, flags, vcpu = current->vcpu_id;
+    unsigned int idx, i, vcpu = current->vcpu_id;
     struct domain *d;
     struct mapcache *cache;
-#ifndef NDEBUG
-    unsigned int flush_count = 0;
-#endif
+    struct vcpu_maphash_entry *hashent;
 
     ASSERT(!in_irq());
+
     perfc_incrc(map_domain_page_count);
 
     /* If we are the idle domain, ensure that we run on our own page tables. */
@@ -56,6 +39,18 @@
 
     cache = &d->arch.mapcache;
 
+    hashent = &cache->vcpu_maphash[vcpu].hash[MAPHASH_HASHFN(pfn)];
+#if 0
+    if ( hashent->pfn == pfn )
+    {
+        idx = hashent->idx;
+        hashent->refcnt++;
+        ASSERT(hashent->refcnt != 0);
+        ASSERT(l1e_get_pfn(cache->l1tab[idx]) == pfn);
+        goto out;
+    }
+#endif
+
     spin_lock(&cache->lock);
 
     /* Has some other CPU caused a wrap? We must flush if so. */
@@ -70,45 +65,97 @@
         }
     }
 
-    do {
-        idx = cache->cursor = (cache->cursor + 1) & (MAPCACHE_ENTRIES - 1);
-        if ( unlikely(idx == 0) )
-        {
-            ASSERT(flush_count++ == 0);
-            flush_all_ready_maps();
-            perfc_incrc(domain_page_tlb_flush);
-            local_flush_tlb();
-            cache->shadow_epoch[vcpu] = ++cache->epoch;
-            cache->tlbflush_timestamp = tlbflush_current_time();
-        }
-
-        flags = 0;
-        for ( i = 0; i < (1U << order); i++ )
-            flags |= l1e_get_flags(cache->l1tab[idx+i]);
-    }
-    while ( flags & _PAGE_PRESENT );
-
-    for ( i = 0; i < (1U << order); i++ )
-        cache->l1tab[idx+i] = l1e_from_pfn(pfn+i, __PAGE_HYPERVISOR);
+    idx = find_next_zero_bit(cache->inuse, MAPCACHE_ENTRIES, cache->cursor);
+    if ( unlikely(idx >= MAPCACHE_ENTRIES) )
+    {
+        /* /First/, clean the garbage map and update the inuse list. */
+        for ( i = 0; i < ARRAY_SIZE(cache->garbage); i++ )
+        {
+            unsigned long x = xchg(&cache->garbage[i], 0);
+            cache->inuse[i] &= ~x;
+        }
+
+        /* /Second/, flush TLBs. */
+        perfc_incrc(domain_page_tlb_flush);
+        local_flush_tlb();
+        cache->shadow_epoch[vcpu] = ++cache->epoch;
+        cache->tlbflush_timestamp = tlbflush_current_time();
+
+        idx = find_first_zero_bit(cache->inuse, MAPCACHE_ENTRIES);
+        ASSERT(idx < MAPCACHE_ENTRIES);
+    }
+
+    set_bit(idx, cache->inuse);
+    cache->cursor = idx + 1;
 
     spin_unlock(&cache->lock);
 
+    cache->l1tab[idx] = l1e_from_pfn(pfn, __PAGE_HYPERVISOR);
+
+/*out:*/
     va = MAPCACHE_VIRT_START + (idx << PAGE_SHIFT);
     return (void *)va;
 }
 
-void unmap_domain_pages(void *va, unsigned int order)
-{
-    unsigned int idx, i;
+void unmap_domain_page(void *va)
+{
+    unsigned int idx;
     struct mapcache *cache = &current->domain->arch.mapcache;
+    unsigned long pfn;
+    struct vcpu_maphash_entry *hashent;
+
+    ASSERT(!in_irq());
 
     ASSERT((void *)MAPCACHE_VIRT_START <= va);
     ASSERT(va < (void *)MAPCACHE_VIRT_END);
 
     idx = ((unsigned long)va - MAPCACHE_VIRT_START) >> PAGE_SHIFT;
-
-    for ( i = 0; i < (1U << order); i++ )
-        l1e_add_flags(cache->l1tab[idx+i], READY_FOR_TLB_FLUSH);
+    pfn = l1e_get_pfn(cache->l1tab[idx]);
+    hashent = &cache->vcpu_maphash[current->vcpu_id].hash[MAPHASH_HASHFN(pfn)];
+
+    if ( hashent->idx == idx )
+    {
+        ASSERT(hashent->pfn == pfn);
+        ASSERT(hashent->refcnt != 0);
+        hashent->refcnt--;
+    }
+    else if ( hashent->refcnt == 0 )
+    {
+        if ( hashent->idx != MAPHASHENT_NOTINUSE )
+        {
+            /* /First/, zap the PTE. */
+            ASSERT(l1e_get_pfn(cache->l1tab[hashent->idx]) == hashent->pfn);
+            cache->l1tab[hashent->idx] = l1e_empty();
+            /* /Second/, mark as garbage. */
+            set_bit(hashent->idx, cache->garbage);
+        }
+
+        /* Add newly-freed mapping to the maphash. */
+        hashent->pfn = pfn;
+        hashent->idx = idx;
+    }
+    else
+    {
+        /* /First/, zap the PTE. */
+        cache->l1tab[idx] = l1e_empty();
+        /* /Second/, mark as garbage. */
+        set_bit(idx, cache->garbage);
+    }
+}
+
+void mapcache_init(struct domain *d)
+{
+    unsigned int i, j;
+
+    d->arch.mapcache.l1tab = d->arch.mm_perdomain_pt +
+        (GDT_LDT_MBYTES << (20 - PAGE_SHIFT));
+    spin_lock_init(&d->arch.mapcache.lock);
+
+    /* Mark all maphash entries as not in use. */
+    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
+        for ( j = 0; j < MAPHASH_ENTRIES; j++ )
+            d->arch.mapcache.vcpu_maphash[i].hash[j].idx =
+                MAPHASHENT_NOTINUSE;
 }
 
 #define GLOBALMAP_BITS (IOREMAP_MBYTES << (20 - PAGE_SHIFT))
@@ -128,15 +175,10 @@
 
     spin_lock(&globalmap_lock);
 
-    for ( ; ; )
-    {
-        idx = find_next_zero_bit(inuse, GLOBALMAP_BITS, inuse_cursor);
-        va = IOREMAP_VIRT_START + (idx << PAGE_SHIFT);
-
-        /* End of round? If not then we're done in this loop. */
-        if ( va < FIXADDR_START )
-            break;
-
+    idx = find_next_zero_bit(inuse, GLOBALMAP_BITS, inuse_cursor);
+    va = IOREMAP_VIRT_START + (idx << PAGE_SHIFT);
+    if ( unlikely(va >= FIXADDR_START) )
+    {
         /* /First/, clean the garbage map and update the inuse list. */
         for ( i = 0; i < ARRAY_SIZE(garbage); i++ )
         {
@@ -147,7 +189,9 @@
         /* /Second/, flush all TLBs to get rid of stale garbage mappings. */
         flush_tlb_all();
 
-        inuse_cursor = 0;
+        idx = find_first_zero_bit(inuse, GLOBALMAP_BITS);
+        va = IOREMAP_VIRT_START + (idx << PAGE_SHIFT);
+        ASSERT(va < FIXADDR_START);
     }
 
     set_bit(idx, inuse);
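
map_domain_page() now allocates slots from an inuse bitmap and defers
reclamation: unmap_domain_page() only marks a slot in the garbage bitmap, and
when the allocator runs dry it folds the garbage back into inuse and pays for
one TLB flush per batch (an "epoch"). A simplified, single-word C sketch of
that allocate/reclaim cycle, with the spinlock and per-VCPU hash elided:

    #include <stdint.h>

    #define ENTRIES 64u                 /* MAPCACHE_ENTRIES is 1024 in Xen */
    static uint64_t inuse, garbage;     /* one word stands in for the arrays */
    static unsigned int cursor;

    static unsigned int find_zero_from(uint64_t map, unsigned int start)
    {
        for (unsigned int i = start; i < ENTRIES; i++)
            if (!(map & (1ULL << i)))
                return i;
        return ENTRIES;                 /* nothing free: time to reclaim */
    }

    static unsigned int alloc_slot(void)
    {
        unsigned int idx = find_zero_from(inuse, cursor);
        if (idx == ENTRIES) {
            inuse &= ~garbage;          /* First, reap the garbage slots;   */
            garbage = 0;                /* second, the real code then calls */
                                        /* local_flush_tlb() to kill their  */
                                        /* stale mappings.                  */
            idx = find_zero_from(inuse, 0);
        }
        inuse |= 1ULL << idx;
        cursor = idx + 1;
        return idx;
    }

    static void free_slot(unsigned int idx)
    {
        garbage |= 1ULL << idx;         /* the real code zaps the PTE first */
    }
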
diff -r 642b26779c4e -r 4b8919585039 xen/arch/x86/x86_32/entry.S
--- a/xen/arch/x86/x86_32/entry.S       Thu Jan 12 12:13:34 2006
+++ b/xen/arch/x86/x86_32/entry.S       Thu Jan 12 12:20:04 2006
@@ -326,7 +326,9 @@
         shl  $IRQSTAT_shift,%eax
         test %ecx,irq_stat(%eax,1)
         jnz  process_softirqs
-/*test_guest_events:*/
+        btr  $_VCPUF_nmi_pending,VCPU_flags(%ebx)
+        jc   process_nmi
+test_guest_events:
         movl VCPU_vcpu_info(%ebx),%eax
         testb $0xFF,VCPUINFO_upcall_mask(%eax)
         jnz  restore_all_guest
@@ -348,7 +350,24 @@
         sti       
         call do_softirq
         jmp  test_all_events
-                
+       
+       ALIGN
+process_nmi:
+        movl VCPU_nmi_addr(%ebx),%eax
+        test %eax,%eax
+        jz   test_all_events
+        bts  $_VCPUF_nmi_masked,VCPU_flags(%ebx)
+        jc   1f
+        sti
+        leal VCPU_trap_bounce(%ebx),%edx
+        movl %eax,TRAPBOUNCE_eip(%edx)
+        movw $FLAT_KERNEL_CS,TRAPBOUNCE_cs(%edx)
+        movw $TBF_INTERRUPT,TRAPBOUNCE_flags(%edx)
+        call create_bounce_frame
+        jmp  test_all_events
+1:      bts  $_VCPUF_nmi_pending,VCPU_flags(%ebx)
+        jmp  test_guest_events
+
 /* CREATE A BASIC EXCEPTION FRAME ON GUEST OS (RING-1) STACK:            */
 /*   {EIP, CS, EFLAGS, [ESP, SS]}                                        */
 /* %edx == trap_bounce, %ebx == struct vcpu                       */
@@ -620,9 +639,7 @@
         jne   defer_nmi
 
 continue_nmi:
-        movl  $(__HYPERVISOR_DS),%edx
-        movl  %edx,%ds
-        movl  %edx,%es
+        SET_XEN_SEGMENTS(d)
         movl  %esp,%edx
         pushl %edx
         call  do_nmi
@@ -659,42 +676,6 @@
         GET_GUEST_REGS(%ecx)
         movl %eax,UREGS_eax(%ecx)
         jmp  do_sched_op
-
-do_switch_vm86:
-        # Reset the stack pointer
-        GET_GUEST_REGS(%ecx)
-        movl %ecx,%esp
-
-        # GS:ESI == Ring-1 stack activation
-        movl UREGS_esp(%esp),%esi
-VFLT1:  mov  UREGS_ss(%esp),%gs
-
-        # ES:EDI == Ring-0 stack activation
-        leal UREGS_eip(%esp),%edi
-
-        # Restore the hypercall-number-clobbered EAX on our stack frame
-VFLT2:  movl %gs:(%esi),%eax
-        movl %eax,UREGS_eax(%esp)
-        addl $4,%esi
-               
-       # Copy the VM86 activation from the ring-1 stack to the ring-0 stack
-        movl $(UREGS_user_sizeof-UREGS_eip)/4,%ecx
-VFLT3:  movl %gs:(%esi),%eax
-        stosl
-        addl $4,%esi
-        loop VFLT3
-
-        # Fix up EFLAGS: IOPL=0, IF=1, VM=1
-        andl $~X86_EFLAGS_IOPL,UREGS_eflags(%esp)
-        orl  $X86_EFLAGS_IF|X86_EFLAGS_VM,UREGS_eflags(%esp)
-        
-        jmp test_all_events
-
-.section __ex_table,"a"
-        .long VFLT1,domain_crash_synchronous
-        .long VFLT2,domain_crash_synchronous
-        .long VFLT3,domain_crash_synchronous
-.previous
 
 .data
 
@@ -744,11 +725,12 @@
         .long do_grant_table_op     /* 20 */
         .long do_vm_assist
         .long do_update_va_mapping_otherdomain
-        .long do_switch_vm86
+        .long do_iret
         .long do_vcpu_op
         .long do_ni_hypercall       /* 25 */
         .long do_mmuext_op
-        .long do_acm_op             /* 27 */
+        .long do_acm_op
+        .long do_nmi_op
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -777,11 +759,12 @@
         .byte 3 /* do_grant_table_op    */  /* 20 */
         .byte 2 /* do_vm_assist         */
         .byte 5 /* do_update_va_mapping_otherdomain */
-        .byte 0 /* do_switch_vm86       */
+        .byte 0 /* do_iret              */
         .byte 3 /* do_vcpu_op           */
         .byte 0 /* do_ni_hypercall      */  /* 25 */
         .byte 4 /* do_mmuext_op         */
         .byte 1 /* do_acm_op            */
+        .byte 2 /* do_nmi_op            */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
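
The btr/bts sequence above implements a small delivery protocol:
test_all_events atomically clears _VCPUF_nmi_pending (btr) and, if it was set,
process_nmi tries to take _VCPUF_nmi_masked (bts); if the mask is already
held, the pending bit is re-latched and delivery retries after the guest's
do_iret clears the mask. A compilable C model under stated assumptions
(a non-atomic stand-in for Xen's bitops; the bounce frame reduced to a
comment):

    #include <stdbool.h>

    #define _VCPUF_nmi_pending 8
    #define _VCPUF_nmi_masked  9

    /* Stand-in for the atomic bts instruction used in entry.S. */
    static bool test_and_set(unsigned long *w, int bit)
    {
        bool old = (*w >> bit) & 1;
        *w |= 1UL << bit;
        return old;
    }

    struct vcpu_model { unsigned long flags, nmi_addr; };

    /* Mirrors process_nmi: deliver at most one callback at a time. */
    static void process_nmi(struct vcpu_model *v)
    {
        if (v->nmi_addr == 0)
            return;                               /* no callback: drop it */
        if (test_and_set(&v->flags, _VCPUF_nmi_masked)) {
            test_and_set(&v->flags, _VCPUF_nmi_pending);  /* retry later */
            return;
        }
        /* create_bounce_frame(): vector the guest to v->nmi_addr. */
    }
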
diff -r 642b26779c4e -r 4b8919585039 xen/arch/x86/x86_32/traps.c
--- a/xen/arch/x86/x86_32/traps.c       Thu Jan 12 12:13:34 2006
+++ b/xen/arch/x86/x86_32/traps.c       Thu Jan 12 12:20:04 2006
@@ -157,6 +157,64 @@
         __asm__ __volatile__ ( "hlt" );
 }
 
+static inline void pop_from_guest_stack(
+    void *dst, struct cpu_user_regs *regs, unsigned int bytes)
+{
+    if ( unlikely(__copy_from_user(dst, (void __user *)regs->esp, bytes)) )
+        domain_crash_synchronous();
+    regs->esp += bytes;
+}
+
+asmlinkage unsigned long do_iret(void)
+{
+    struct cpu_user_regs *regs = guest_cpu_user_regs();
+    u32 eflags;
+
+    /* Check worst-case stack frame for overlap with Xen protected area. */
+    if ( unlikely(!access_ok(regs->esp, 40)) )
+        domain_crash_synchronous();
+
+    /* Pop and restore EAX (clobbered by hypercall). */
+    pop_from_guest_stack(&regs->eax, regs, 4);
+
+    /* Pop and restore CS and EIP. */
+    pop_from_guest_stack(&regs->eip, regs, 8);
+
+    /*
+     * Pop, fix up and restore EFLAGS. We fix up in a local staging area
+     * to avoid firing the BUG_ON(IOPL) check in arch_getdomaininfo_ctxt.
+     */
+    pop_from_guest_stack(&eflags, regs, 4);
+    regs->eflags = (eflags & ~X86_EFLAGS_IOPL) | X86_EFLAGS_IF;
+
+    if ( VM86_MODE(regs) )
+    {
+        /* Return to VM86 mode: pop and restore ESP,SS,ES,DS,FS and GS. */
+        pop_from_guest_stack(&regs->esp, regs, 24);
+    }
+    else if ( unlikely(RING_0(regs)) )
+    {
+        domain_crash_synchronous();
+    }
+    else if ( !RING_1(regs) )
+    {
+        /* Return to ring 2/3: pop and restore ESP and SS. */
+        pop_from_guest_stack(&regs->esp, regs, 8);
+    }
+
+    /* No longer in NMI context. */
+    clear_bit(_VCPUF_nmi_masked, &current->vcpu_flags);
+
+    /* Restore upcall mask from saved value. */
+    current->vcpu_info->evtchn_upcall_mask = regs->saved_upcall_mask;
+
+    /*
+     * The hypercall exit path will overwrite EAX with this return
+     * value.
+     */
+    return regs->eax;
+}
+
 BUILD_SMP_INTERRUPT(deferred_nmi, TRAP_deferred_nmi)
 asmlinkage void smp_deferred_nmi(struct cpu_user_regs regs)
 {
diff -r 642b26779c4e -r 4b8919585039 xen/arch/x86/x86_64/asm-offsets.c
--- a/xen/arch/x86/x86_64/asm-offsets.c Thu Jan 12 12:13:34 2006
+++ b/xen/arch/x86/x86_64/asm-offsets.c Thu Jan 12 12:20:04 2006
@@ -65,6 +65,10 @@
            arch.guest_context.syscall_callback_eip);
     OFFSET(VCPU_kernel_sp, struct vcpu,
            arch.guest_context.kernel_sp);
+    OFFSET(VCPU_flags, struct vcpu, vcpu_flags);
+    OFFSET(VCPU_nmi_addr, struct vcpu, nmi_addr);
+    DEFINE(_VCPUF_nmi_pending, _VCPUF_nmi_pending);
+    DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked);
     BLANK();
 
     OFFSET(VCPUINFO_upcall_pending, vcpu_info_t, evtchn_upcall_pending);
diff -r 642b26779c4e -r 4b8919585039 xen/arch/x86/x86_64/entry.S
--- a/xen/arch/x86/x86_64/entry.S       Thu Jan 12 12:13:34 2006
+++ b/xen/arch/x86/x86_64/entry.S       Thu Jan 12 12:20:04 2006
@@ -171,7 +171,9 @@
         leaq  irq_stat(%rip),%rcx
         testl $~0,(%rcx,%rax,1)
         jnz   process_softirqs
-/*test_guest_events:*/
+        btr   $_VCPUF_nmi_pending,VCPU_flags(%rbx)
+        jc    process_nmi
+test_guest_events:
         movq  VCPU_vcpu_info(%rbx),%rax
         testb $0xFF,VCPUINFO_upcall_mask(%rax)
         jnz   restore_all_guest
@@ -322,6 +324,23 @@
         call do_softirq
         jmp  test_all_events
 
+       ALIGN
+/* %rbx: struct vcpu */
+process_nmi:
+        movq VCPU_nmi_addr(%rbx),%rax
+        test %rax,%rax
+        jz   test_all_events
+        bts  $_VCPUF_nmi_masked,VCPU_flags(%rbx)
+        jc   1f
+        sti
+        leaq VCPU_trap_bounce(%rbx),%rdx
+        movq %rax,TRAPBOUNCE_eip(%rdx)
+        movw $(TBF_INTERRUPT|TBF_SLOW_IRET),TRAPBOUNCE_flags(%rdx)
+        call create_bounce_frame
+        jmp  test_all_events
+1:      bts  $_VCPUF_nmi_pending,VCPU_flags(%rbx)
+        jmp  test_guest_events
+       
 /* CREATE A BASIC EXCEPTION FRAME ON GUEST OS STACK:                     */
 /*   { RCX, R11, [DS-GS,] [CR2,] [ERRCODE,] RIP, CS, RFLAGS, RSP, SS }   */
 /* %rdx: trap_bounce, %rbx: struct vcpu                           */
@@ -339,6 +358,9 @@
 1:      /* In kernel context already: push new frame at existing %rsp. */
         movq  UREGS_rsp+8(%rsp),%rsi
         andb  $0xfc,UREGS_cs+8(%rsp)    # Indicate kernel context to guest.
+       testw $(TBF_SLOW_IRET),TRAPBOUNCE_flags(%rdx)
+       jz    2f
+       orb   $0x01,UREGS_cs+8(%rsp)
 2:      andq  $~0xf,%rsi                # Stack frames are 16-byte aligned.
         movq  $HYPERVISOR_VIRT_START,%rax
         cmpq  %rax,%rsi
@@ -569,7 +591,7 @@
         SAVE_ALL
         movq  %rsp,%rdi
         call  do_nmi
-       jmp   restore_all_xen
+        jmp   ret_from_intr
 
 do_arch_sched_op:
         # Ensure we return success even if we return via schedule_tail()
@@ -626,11 +648,12 @@
         .quad do_grant_table_op     /* 20 */
         .quad do_vm_assist
         .quad do_update_va_mapping_otherdomain
-        .quad do_switch_to_user
+        .quad do_iret
         .quad do_vcpu_op
         .quad do_set_segment_base   /* 25 */
         .quad do_mmuext_op
         .quad do_acm_op
+        .quad do_nmi_op
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .quad do_ni_hypercall
         .endr
@@ -659,11 +682,12 @@
         .byte 3 /* do_grant_table_op    */  /* 20 */
         .byte 2 /* do_vm_assist         */
         .byte 4 /* do_update_va_mapping_otherdomain */
-        .byte 0 /* do_switch_to_user    */
+        .byte 0 /* do_iret              */
         .byte 3 /* do_vcpu_op           */
         .byte 2 /* do_set_segment_base  */  /* 25 */
         .byte 4 /* do_mmuext_op         */
         .byte 1 /* do_acm_op            */
+        .byte 2 /* do_nmi_op            */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
diff -r 642b26779c4e -r 4b8919585039 xen/arch/x86/x86_64/traps.c
--- a/xen/arch/x86/x86_64/traps.c       Thu Jan 12 12:13:34 2006
+++ b/xen/arch/x86/x86_64/traps.c       Thu Jan 12 12:20:04 2006
@@ -12,6 +12,7 @@
 #include <asm/current.h>
 #include <asm/flushtlb.h>
 #include <asm/msr.h>
+#include <asm/shadow.h>
 #include <asm/vmx.h>
 
 void show_registers(struct cpu_user_regs *regs)
@@ -113,6 +114,52 @@
         __asm__ __volatile__ ( "hlt" );
 }
 
+void toggle_guest_mode(struct vcpu *v)
+{
+    v->arch.flags ^= TF_kernel_mode;
+    __asm__ __volatile__ ( "swapgs" );
+    update_pagetables(v);
+    write_ptbase(v);
+}
+
+long do_iret(void)
+{
+    struct cpu_user_regs *regs = guest_cpu_user_regs();
+    struct iret_context iret_saved;
+    struct vcpu *v = current;
+
+    if ( unlikely(copy_from_user(&iret_saved, (void *)regs->rsp,
+                                 sizeof(iret_saved))) )
+        domain_crash_synchronous();
+
+    /* Returning to user mode? */
+    if ( (iret_saved.cs & 3) == 3 )
+    {
+        if ( unlikely(pagetable_get_paddr(v->arch.guest_table_user) == 0) )
+            return -EFAULT;
+        toggle_guest_mode(v);
+    }
+
+    regs->rip    = iret_saved.rip;
+    regs->cs     = iret_saved.cs | 3; /* force guest privilege */
+    regs->rflags = (iret_saved.rflags & ~(EF_IOPL|EF_VM)) | EF_IE;
+    regs->rsp    = iret_saved.rsp;
+    regs->ss     = iret_saved.ss | 3; /* force guest privilege */
+
+    if ( !(iret_saved.flags & VGCF_IN_SYSCALL) )
+    {
+        regs->entry_vector = 0;
+        regs->r11 = iret_saved.r11;
+        regs->rcx = iret_saved.rcx;
+    }
+
+    /* No longer in NMI context. */
+    clear_bit(_VCPUF_nmi_masked, &current->vcpu_flags);
+
+    /* Saved %rax gets written back to regs->rax in entry.S. */
+    return iret_saved.rax;
+}
+
 asmlinkage void syscall_enter(void);
 void __init percpu_traps_init(void)
 {
diff -r 642b26779c4e -r 4b8919585039 xen/common/dom0_ops.c
--- a/xen/common/dom0_ops.c     Thu Jan 12 12:13:34 2006
+++ b/xen/common/dom0_ops.c     Thu Jan 12 12:20:04 2006
@@ -323,7 +323,7 @@
         new_affinity = v->cpu_affinity;
         memcpy(cpus_addr(new_affinity),
                &op->u.setvcpuaffinity.cpumap,
-               min((int)BITS_TO_LONGS(NR_CPUS),
+               min((int)(BITS_TO_LONGS(NR_CPUS) * sizeof(long)),
                    (int)sizeof(op->u.setvcpuaffinity.cpumap)));
 
         ret = vcpu_set_affinity(v, &new_affinity);
@@ -501,7 +501,7 @@
         op->u.getvcpuinfo.cpumap   = 0;
         memcpy(&op->u.getvcpuinfo.cpumap,
                cpus_addr(v->cpu_affinity),
-               min((int)BITS_TO_LONGS(NR_CPUS),
+               min((int)(BITS_TO_LONGS(NR_CPUS) * sizeof(long)),
                    (int)sizeof(op->u.getvcpuinfo.cpumap)));
         ret = 0;
 
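The two dom0_ops.c hunks fix a unit mismatch: BITS_TO_LONGS(NR_CPUS) counts
longs, but memcpy()'s length argument is bytes, so on a 64-bit build the old
code could copy as little as one-eighth of the intended cpumap. A tiny
standalone check of the arithmetic, assuming a hypothetical NR_CPUS of 256:

    #include <stdio.h>

    #define BITS_PER_LONG       (8 * sizeof(long))
    #define BITS_TO_LONGS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG)

    int main(void)
    {
        unsigned long nr_cpus = 256;    /* hypothetical 64-bit configuration */
        printf("old length: %lu longs\n",
               (unsigned long)BITS_TO_LONGS(nr_cpus));                  /* 4  */
        printf("new length: %lu bytes\n",
               (unsigned long)(BITS_TO_LONGS(nr_cpus) * sizeof(long))); /* 32 */
        return 0;
    }
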
diff -r 642b26779c4e -r 4b8919585039 xen/common/kernel.c
--- a/xen/common/kernel.c       Thu Jan 12 12:13:34 2006
+++ b/xen/common/kernel.c       Thu Jan 12 12:20:04 2006
@@ -11,6 +11,7 @@
 #include <xen/compile.h>
 #include <xen/sched.h>
 #include <asm/current.h>
+#include <public/nmi.h>
 #include <public/version.h>
 
 void cmdline_parse(char *cmdline)
@@ -146,6 +147,43 @@
     }
 
     return -ENOSYS;
+}
+
+long do_nmi_op(unsigned int cmd, void *arg)
+{
+    struct vcpu *v = current;
+    struct domain *d = current->domain;
+    long rc = 0;
+
+    switch ( cmd )
+    {
+    case XENNMI_register_callback:
+        if ( (d->domain_id != 0) || (v->vcpu_id != 0) )
+        { 
+           rc = -EINVAL;
+        }
+        else
+        {
+            v->nmi_addr = (unsigned long)arg;
+#ifdef CONFIG_X86
+            /*
+             * If no handler was registered we can 'lose the NMI edge'.
+             * Re-assert it now.
+             */
+            if ( d->shared_info->arch.nmi_reason != 0 )
+                set_bit(_VCPUF_nmi_pending, &v->vcpu_flags);
+#endif
+        }
+        break;
+    case XENNMI_unregister_callback:
+        v->nmi_addr = 0;
+        break;
+    default:
+        rc = -ENOSYS;
+        break;
+    }
+
+    return rc;
 }
 
 long do_vm_assist(unsigned int cmd, unsigned int type)
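
do_nmi_op() is reached through the new __HYPERVISOR_nmi_op entry added to both
architectures' hypercall tables in the entry.S hunks above; its public number
(28) is defined in xen.h below. On the guest side, registration reduces to the
single call already visible in the setup_arch_post.h hunks; a sketch, assuming
the HYPERVISOR_nmi_op wrapper from hypercall.h and the kernel's printk:

    extern void nmi(void);              /* the kernel's NMI entry stub */

    static void register_nmi_callback(void)
    {
        /* Only dom0/vcpu0 may register; all other callers get -EINVAL. */
        if (HYPERVISOR_nmi_op(XENNMI_register_callback, (unsigned long)&nmi))
            printk("Xen NMI callback registration failed\n");
    }
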
diff -r 642b26779c4e -r 4b8919585039 xen/common/schedule.c
--- a/xen/common/schedule.c     Thu Jan 12 12:13:34 2006
+++ b/xen/common/schedule.c     Thu Jan 12 12:20:04 2006
@@ -207,7 +207,10 @@
 
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity)
 {
-    if ( cpus_empty(*affinity) )
+    cpumask_t online_affinity;
+
+    cpus_and(online_affinity, *affinity, cpu_online_map);
+    if ( cpus_empty(online_affinity) )
         return -EINVAL;
 
     return SCHED_OP(set_affinity, v, affinity);
diff -r 642b26779c4e -r 4b8919585039 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Thu Jan 12 12:13:34 2006
+++ b/xen/include/asm-x86/domain.h      Thu Jan 12 12:20:04 2006
@@ -13,13 +13,43 @@
     unsigned long  eip;
 };
 
+#define MAPHASH_ENTRIES 8
+#define MAPHASH_HASHFN(pfn) ((pfn) & (MAPHASH_ENTRIES-1))
+#define MAPHASHENT_NOTINUSE ((u16)~0U)
+struct vcpu_maphash {
+    struct vcpu_maphash_entry {
+        unsigned long pfn;
+        uint16_t      idx;
+        uint16_t      refcnt;
+    } hash[MAPHASH_ENTRIES];
+} __cacheline_aligned;
+
+#define MAPCACHE_ORDER   10
+#define MAPCACHE_ENTRIES (1 << MAPCACHE_ORDER)
 struct mapcache {
+    /* The PTEs that provide the mappings, and a cursor into the array. */
     l1_pgentry_t *l1tab;
     unsigned int cursor;
+
+    /* Protects map_domain_page(). */
+    spinlock_t lock;
+
+    /* Garbage mappings are flushed from TLBs in batches called 'epochs'. */
     unsigned int epoch, shadow_epoch[MAX_VIRT_CPUS];
     u32 tlbflush_timestamp;
-    spinlock_t lock;
+
+    /* Which mappings are in use, and which are garbage to reap next epoch? */
+    unsigned long inuse[BITS_TO_LONGS(MAPCACHE_ENTRIES)];
+    unsigned long garbage[BITS_TO_LONGS(MAPCACHE_ENTRIES)];
+
+    /* Lock-free per-VCPU hash of recently-used mappings. */
+    struct vcpu_maphash vcpu_maphash[MAX_VIRT_CPUS];
 };
+
+extern void mapcache_init(struct domain *);
+
+/* x86/64: toggle guest between kernel and user modes. */
+extern void toggle_guest_mode(struct vcpu *);
 
 struct arch_domain
 {
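
The new vcpu_maphash gives each VCPU a small lock-free cache of recently used
mappings: unmap_domain_page() parks a mapping whose hash slot is idle instead
of freeing it, so an immediate re-map of the same pfn (the currently
commented-out fast path in map_domain_page()) can skip the lock entirely. A
simplified model of the parking decision, with the PTE zap and garbage
marking reduced to comments:

    #include <stdint.h>

    struct maphash_entry { unsigned long pfn; uint16_t idx, refcnt; };
    #define NOTINUSE ((uint16_t)~0U)    /* MAPHASHENT_NOTINUSE */

    static void unmap_slot(struct maphash_entry *h, unsigned long pfn,
                           uint16_t idx)
    {
        if (h->idx == idx) {
            h->refcnt--;                /* dropping a ref on the cached map */
        } else if (h->refcnt == 0) {
            if (h->idx != NOTINUSE) {
                /* Evict previous occupant: zap its PTE, mark it garbage. */
            }
            h->pfn = pfn;               /* park the newly-freed mapping */
            h->idx = idx;
        } else {
            /* Hash slot busy: zap the PTE and mark the slot garbage now. */
        }
    }
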
diff -r 642b26779c4e -r 4b8919585039 xen/include/asm-x86/nmi.h
--- a/xen/include/asm-x86/nmi.h Thu Jan 12 12:13:34 2006
+++ b/xen/include/asm-x86/nmi.h Thu Jan 12 12:20:04 2006
@@ -1,6 +1,8 @@
 
 #ifndef ASM_NMI_H
 #define ASM_NMI_H
+
+#include <public/nmi.h>
 
 struct cpu_user_regs;
  
diff -r 642b26779c4e -r 4b8919585039 xen/include/asm-x86/processor.h
--- a/xen/include/asm-x86/processor.h   Thu Jan 12 12:13:34 2006
+++ b/xen/include/asm-x86/processor.h   Thu Jan 12 12:20:04 2006
@@ -123,6 +123,7 @@
 #define TBF_EXCEPTION_ERRCODE  2
 #define TBF_INTERRUPT          8
 #define TBF_FAILSAFE          16
+#define TBF_SLOW_IRET         32
 
 /* 'arch_vcpu' flags values */
 #define _TF_kernel_mode        0
diff -r 642b26779c4e -r 4b8919585039 xen/include/public/arch-x86_32.h
--- a/xen/include/public/arch-x86_32.h  Thu Jan 12 12:13:34 2006
+++ b/xen/include/public/arch-x86_32.h  Thu Jan 12 12:20:04 2006
@@ -135,6 +135,7 @@
     unsigned long max_pfn;                  /* max pfn that appears in table */
     /* Frame containing list of mfns containing list of mfns containing p2m. */
     unsigned long pfn_to_mfn_frame_list_list; 
+    unsigned long nmi_reason;
 } arch_shared_info_t;
 
 typedef struct {
diff -r 642b26779c4e -r 4b8919585039 xen/include/public/arch-x86_64.h
--- a/xen/include/public/arch-x86_64.h  Thu Jan 12 12:13:34 2006
+++ b/xen/include/public/arch-x86_64.h  Thu Jan 12 12:20:04 2006
@@ -88,11 +88,20 @@
 #define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
 
 /*
- * int HYPERVISOR_switch_to_user(void)
+ * int HYPERVISOR_iret(void)
  * All arguments are on the kernel stack, in the following format.
  * Never returns if successful. Current kernel context is lost.
+ * The saved CS is mapped as follows:
+ *   RING0 -> RING3 kernel mode.
+ *   RING1 -> RING3 kernel mode.
+ *   RING2 -> RING3 kernel mode.
+ *   RING3 -> RING3 user mode.
+ * However, RING0 indicates that the guest kernel should return to itself
+ * directly with
+ *      orb   $3,1*8(%rsp)
+ *      iretq
  * If flags contains VGCF_IN_SYSCALL:
- *   Restore RAX, RIP, RFLAGS, RSP. 
+ *   Restore RAX, RIP, RFLAGS, RSP.
  *   Discard R11, RCX, CS, SS.
  * Otherwise:
  *   Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
@@ -100,10 +109,19 @@
  */
 /* Guest exited in SYSCALL context? Return to guest with SYSRET? */
 #define VGCF_IN_SYSCALL (1<<8)
+struct iret_context {
+    /* Top of stack (%rsp at point of hypercall). */
+    uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
+    /* Bottom of iret stack frame. */
+};
+/*
+ * For compatibility with HYPERVISOR_switch_to_user which is the old
+ * name for HYPERVISOR_iret.
+ */
 struct switch_to_user {
     /* Top of stack (%rsp at point of hypercall). */
     uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
-    /* Bottom of switch_to_user stack frame. */
+    /* Bottom of iret stack frame. */
 };
 
 /*
@@ -202,6 +220,7 @@
     unsigned long max_pfn;                  /* max pfn that appears in table */
     /* Frame containing list of mfns containing list of mfns containing p2m. */
     unsigned long pfn_to_mfn_frame_list_list; 
+    unsigned long nmi_reason;
 } arch_shared_info_t;
 
 typedef struct {
diff -r 642b26779c4e -r 4b8919585039 xen/include/public/xen.h
--- a/xen/include/public/xen.h  Thu Jan 12 12:13:34 2006
+++ b/xen/include/public/xen.h  Thu Jan 12 12:20:04 2006
@@ -53,12 +53,14 @@
 #define __HYPERVISOR_grant_table_op       20
 #define __HYPERVISOR_vm_assist            21
 #define __HYPERVISOR_update_va_mapping_otherdomain 22
-#define __HYPERVISOR_switch_vm86          23 /* x86/32 only */
-#define __HYPERVISOR_switch_to_user       23 /* x86/64 only */
+#define __HYPERVISOR_iret                 23 /* x86 only */
+#define __HYPERVISOR_switch_vm86          23 /* x86/32 only (obsolete name) */
+#define __HYPERVISOR_switch_to_user       23 /* x86/64 only (obsolete name) */
 #define __HYPERVISOR_vcpu_op              24
 #define __HYPERVISOR_set_segment_base     25 /* x86/64 only */
 #define __HYPERVISOR_mmuext_op            26
 #define __HYPERVISOR_acm_op               27
+#define __HYPERVISOR_nmi_op               28
 
 /* 
  * VIRTUAL INTERRUPTS
@@ -69,10 +71,7 @@
 #define VIRQ_DEBUG      1  /* Request guest to dump debug info.           */
 #define VIRQ_CONSOLE    2  /* (DOM0) Bytes received on emergency console. */
 #define VIRQ_DOM_EXC    3  /* (DOM0) Exceptional event for some domain.   */
-#define VIRQ_PARITY_ERR 4  /* (DOM0) NMI parity error (port 0x61, bit 7). */
-#define VIRQ_IO_ERR     5  /* (DOM0) NMI I/O error    (port 0x61, bit 6). */
 #define VIRQ_DEBUGGER   6  /* (DOM0) A domain has paused for debugging.   */
-#define VIRQ_NMI        7  /* (DOM0) Unknown NMI (not from ISA port 0x61).*/
 #define NR_VIRQS        8
 
 /*
diff -r 642b26779c4e -r 4b8919585039 xen/include/xen/domain_page.h
--- a/xen/include/xen/domain_page.h     Thu Jan 12 12:13:34 2006
+++ b/xen/include/xen/domain_page.h     Thu Jan 12 12:20:04 2006
@@ -10,24 +10,19 @@
 #include <xen/config.h>
 #include <xen/mm.h>
 
-#define map_domain_page(pfn)   map_domain_pages(pfn,0)
-#define unmap_domain_page(va)  unmap_domain_pages(va,0)
-
 #ifdef CONFIG_DOMAIN_PAGE
 
 /*
- * Maps a given range of page frames, returning the mapped virtual address. The
- * pages are now accessible within the current VCPU until a corresponding
- * call to unmap_domain_page().
+ * Map a given page frame, returning the mapped virtual address. The page is
+ * then accessible within the current VCPU until a corresponding unmap call.
  */
-extern void *map_domain_pages(unsigned long pfn, unsigned int order);
+extern void *map_domain_page(unsigned long pfn);
 
 /*
- * Pass a VA within the first page of a range previously mapped in the context
- * of the currently-executing VCPU via a call to map_domain_pages(). Those
- * pages will then be removed from the mapping lists.
+ * Pass a VA within a page previously mapped in the context of the
+ * currently-executing VCPU via a call to map_domain_page().
  */
-extern void unmap_domain_pages(void *va, unsigned int order);
+extern void unmap_domain_page(void *va);
 
 /*
  * Similar to the above calls, except the mapping is accessible in all
@@ -97,8 +92,8 @@
 
 #else /* !CONFIG_DOMAIN_PAGE */
 
-#define map_domain_pages(pfn,order)         phys_to_virt((pfn)<<PAGE_SHIFT)
-#define unmap_domain_pages(va,order)        ((void)((void)(va),(void)(order)))
+#define map_domain_page(pfn)                phys_to_virt((pfn)<<PAGE_SHIFT)
+#define unmap_domain_page(va)               ((void)(va))
 
 #define map_domain_page_global(pfn)         phys_to_virt((pfn)<<PAGE_SHIFT)
 #define unmap_domain_page_global(va)        ((void)(va))
diff -r 642b26779c4e -r 4b8919585039 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Thu Jan 12 12:13:34 2006
+++ b/xen/include/xen/sched.h   Thu Jan 12 12:20:04 2006
@@ -80,6 +80,8 @@
 
     /* Bitmask of CPUs on which this VCPU may run. */
     cpumask_t        cpu_affinity;
+
+    unsigned long    nmi_addr;      /* NMI callback address. */
 
     /* Bitmask of CPUs which are holding onto this VCPU's state. */
     cpumask_t        vcpu_dirty_cpumask;
@@ -361,6 +363,12 @@
  /* VCPU is not-runnable */
 #define _VCPUF_down            5
 #define VCPUF_down             (1UL<<_VCPUF_down)
+ /* NMI callback pending for this VCPU? */
+#define _VCPUF_nmi_pending     8
+#define VCPUF_nmi_pending      (1UL<<_VCPUF_nmi_pending)
+ /* Avoid NMI reentry by allowing NMIs to be masked for short periods. */
+#define _VCPUF_nmi_masked      9
+#define VCPUF_nmi_masked       (1UL<<_VCPUF_nmi_masked)
 
 /*
  * Per-domain flags (domain_flags).
diff -r 642b26779c4e -r 4b8919585039 xen/include/xen/softirq.h
--- a/xen/include/xen/softirq.h Thu Jan 12 12:13:34 2006
+++ b/xen/include/xen/softirq.h Thu Jan 12 12:20:04 2006
@@ -6,7 +6,7 @@
 #define SCHEDULE_SOFTIRQ                  1
 #define NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ 2
 #define KEYPRESS_SOFTIRQ                  3
-#define NMI_DOM0_SOFTIRQ                  4
+#define NMI_SOFTIRQ                       4
 #define PAGE_SCRUB_SOFTIRQ                5
 #define DOMAIN_SHUTDOWN_FINALISE_SOFTIRQ  6
 #define NR_SOFTIRQS                       7
diff -r 642b26779c4e -r 4b8919585039 linux-2.6-xen-sparse/include/asm-xen/asm-i386/mach-xen/mach_traps.h
--- /dev/null   Thu Jan 12 12:13:34 2006
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mach-xen/mach_traps.h       Thu Jan 12 12:20:04 2006
@@ -0,0 +1,33 @@
+/*
+ *  include/asm-xen/asm-i386/mach-xen/mach_traps.h
+ *
+ *  Machine specific NMI handling for Xen
+ */
+#ifndef _MACH_TRAPS_H
+#define _MACH_TRAPS_H
+
+#include <linux/bitops.h>
+#include <asm-xen/xen-public/nmi.h>
+
+static inline void clear_mem_error(unsigned char reason) {}
+static inline void clear_io_check_error(unsigned char reason) {}
+
+static inline unsigned char get_nmi_reason(void)
+{
+       shared_info_t *s = HYPERVISOR_shared_info;
+       unsigned char reason = 0;
+
+       /* construct a value which looks like it came from
+        * port 0x61.
+        */
+       if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
+               reason |= 0x40;
+       if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
+               reason |= 0x80;
+
+        return reason;
+}
+
+static inline void reassert_nmi(void) {}
+
+#endif /* !_MACH_TRAPS_H */
diff -r 642b26779c4e -r 4b8919585039 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/nmi.h
--- /dev/null   Thu Jan 12 12:13:34 2006
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/nmi.h     Thu Jan 12 12:20:04 2006
@@ -0,0 +1,75 @@
+/*
+ *  linux/include/asm-x86_64/nmi.h
+ */
+#ifndef ASM_NMI_H
+#define ASM_NMI_H
+
+#include <linux/pm.h>
+
+#include <asm-xen/xen-public/nmi.h>
+
+struct pt_regs;
+ 
+typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
+ 
+/** 
+ * set_nmi_callback
+ *
+ * Set a handler for an NMI. Only one handler may be
+ * set. Return 1 if the NMI was handled.
+ */
+void set_nmi_callback(nmi_callback_t callback);
+ 
+/** 
+ * unset_nmi_callback
+ *
+ * Remove the handler previously set.
+ */
+void unset_nmi_callback(void);
+ 
+#ifdef CONFIG_PM
+ 
+/** Replace the PM callback routine for NMI. */
+struct pm_dev * set_nmi_pm_callback(pm_callback callback);
+
+/** Unset the PM callback routine back to the default. */
+void unset_nmi_pm_callback(struct pm_dev * dev);
+
+#else
+
+static inline struct pm_dev * set_nmi_pm_callback(pm_callback callback)
+{
+       return 0;
+} 
+ 
+static inline void unset_nmi_pm_callback(struct pm_dev * dev)
+{
+}
+
+#endif /* CONFIG_PM */
+ 
+extern void default_do_nmi(struct pt_regs *);
+extern void die_nmi(char *str, struct pt_regs *regs);
+
+static inline unsigned char get_nmi_reason(void)
+{
+        shared_info_t *s = HYPERVISOR_shared_info;
+        unsigned char reason = 0;
+
+        /* construct a value which looks like it came from
+         * port 0x61.
+         */
+        if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
+                reason |= 0x40;
+        if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
+                reason |= 0x80;
+
+        return reason;
+}
+
+extern int panic_on_timeout;
+extern int unknown_nmi_panic;
+
+extern int check_nmi_watchdog(void);
+ 
+#endif /* ASM_NMI_H */
diff -r 642b26779c4e -r 4b8919585039 patches/linux-2.6.12/i386-mach-io-check-nmi.patch
--- /dev/null   Thu Jan 12 12:13:34 2006
+++ b/patches/linux-2.6.12/i386-mach-io-check-nmi.patch Thu Jan 12 12:20:04 2006
@@ -0,0 +1,43 @@
+--- ref-linux-2.6.12/arch/i386/kernel/traps.c  2005-12-19 09:23:44.000000000 +0000
++++ linux-2.6.12-xen0/arch/i386/kernel/traps.c 2006-01-05 15:51:52.000000000 +0000
+@@ -521,18 +521,11 @@
+ 
+ static void io_check_error(unsigned char reason, struct pt_regs * regs)
+ {
+-      unsigned long i;
+-
+       printk("NMI: IOCK error (debug interrupt?)\n");
+       show_registers(regs);
+ 
+       /* Re-enable the IOCK line, wait for a few seconds */
+-      reason = (reason & 0xf) | 8;
+-      outb(reason, 0x61);
+-      i = 2000;
+-      while (--i) udelay(1000);
+-      reason &= ~8;
+-      outb(reason, 0x61);
++      clear_io_check_error(reason);
+ }
+ 
+ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+--- ref-linux-2.6.12/include/asm-i386/mach-default/mach_traps.h        2005-06-17 20:48:29.000000000 +0100
++++ linux-2.6.12-xen0/include/asm-i386/mach-default/mach_traps.h       2006-01-05 15:52:33.000000000 +0000
+@@ -15,6 +15,18 @@
+       outb(reason, 0x61);
+ }
+ 
++static inline void clear_io_check_error(unsigned char reason)
++{
++      unsigned long i;
++
++      reason = (reason & 0xf) | 8;
++      outb(reason, 0x61);
++      i = 2000;
++      while (--i) udelay(1000);
++      reason &= ~8;
++      outb(reason, 0x61);
++}
++
+ static inline unsigned char get_nmi_reason(void)
+ {
+       return inb(0x61);
diff -r 642b26779c4e -r 4b8919585039 xen/include/public/nmi.h
--- /dev/null   Thu Jan 12 12:13:34 2006
+++ b/xen/include/public/nmi.h  Thu Jan 12 12:20:04 2006
@@ -0,0 +1,54 @@
+/******************************************************************************
+ * nmi.h
+ * 
+ * NMI callback registration and reason codes.
+ * 
+ * Copyright (c) 2005, Keir Fraser <keir@xxxxxxxxxxxxx>
+ */
+
+#ifndef __XEN_PUBLIC_NMI_H__
+#define __XEN_PUBLIC_NMI_H__
+
+/*
+ * NMI reason codes:
+ * Currently these are x86-specific, stored in arch_shared_info.nmi_reason.
+ */
+ /* I/O-check error reported via ISA port 0x61, bit 6. */
+#define _XEN_NMIREASON_io_error     0
+#define XEN_NMIREASON_io_error      (1UL << _XEN_NMIREASON_io_error)
+ /* Parity error reported via ISA port 0x61, bit 7. */
+#define _XEN_NMIREASON_parity_error 1
+#define XEN_NMIREASON_parity_error  (1UL << _XEN_NMIREASON_parity_error)
+ /* Unknown hardware-generated NMI. */
+#define _XEN_NMIREASON_unknown      2
+#define XEN_NMIREASON_unknown       (1UL << _XEN_NMIREASON_unknown)
+
+/*
+ * long nmi_op(unsigned int cmd, void *arg)
+ * NB. All ops return zero on success, else a negative error code.
+ */
+
+/*
+ * Register NMI callback for this (calling) VCPU. Currently this only makes
+ * sense for domain 0, vcpu 0. All other callers will be returned EINVAL.
+ * arg == address of callback function.
+ */
+#define XENNMI_register_callback   0
+
+/*
+ * Deregister NMI callback for this (calling) VCPU.
+ * arg == NULL.
+ */
+#define XENNMI_unregister_callback 1
+
+#endif /* __XEN_PUBLIC_NMI_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog