[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] hvm save restore support



Zhai, Edwin wrote:
ian/keir,
this is the new hvm save/restore patch on top of r12898.
although not tested intensively across all platform combinations, general
linux/windows guest works fine W/O break anything.

So what are the plans for merging save/restore?

If possible, it would be nice to see some changes first. For instance, it seems a real shame to not use the existing QEMU format. There's really no technical reason why a saved Xen VM shouldn't be loadable in QEMU (and vice versa).

Perhaps we should also considering using a higher level kernel interface too? 2.6.20 is already going to contain a kernel interface for transferring guest state (for KVM). It would be nice if we didn't keep introducing Xen specific things to our qemu fork so that we could eventually get this stuff upstream.

Regards,

Anthony Liguori

because recent changes in unstable make rebase hard, we submit this patch now
with hope that check in first and fix bug over time if possible.

thanks,


--
best rgds,
edwin
------------------------------------------------------------------------

Signed-off-by: Zhai Edwin <edwin.zhai@xxxxxxxxx>
Signed-off-by: Nakajima Jun <jun.nakajima@xxxxxxxxx>

diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/ioemu/hw/cirrus_vga.c
--- a/tools/ioemu/hw/cirrus_vga.c       Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/ioemu/hw/cirrus_vga.c       Wed Dec 13 22:52:02 2006 +0800
@@ -3010,11 +3010,44 @@ static CPUWriteMemoryFunc *cirrus_mmio_w
     cirrus_mmio_writel,
 };

+void cirrus_stop_acc(CirrusVGAState *s)
+{
+    if (s->map_addr){
+        int error;
+        s->map_addr = 0;
+        error = unset_vram_mapping(s->cirrus_lfb_addr,
+                s->cirrus_lfb_end);
+        fprintf(stderr, "cirrus_stop_acc:unset_vram_mapping.\n");
+
+        munmap(s->vram_ptr, VGA_RAM_SIZE);
+    }
+}
+
+void cirrus_restart_acc(CirrusVGAState *s)
+{
+    if (s->cirrus_lfb_addr && s->cirrus_lfb_end) {
+        void *vram_pointer, *old_vram;
+        fprintf(stderr, "cirrus_vga_load:re-enable vga acc.lfb_addr=0x%lx, 
lfb_end=0x%lx.\n",
+                s->cirrus_lfb_addr, s->cirrus_lfb_end);
+        vram_pointer = set_vram_mapping(s->cirrus_lfb_addr ,s->cirrus_lfb_end);
+        if (!vram_pointer){
+            fprintf(stderr, "cirrus_vga_load:NULL vram_pointer\n");
+        } else {
+            old_vram = vga_update_vram((VGAState *)s, vram_pointer,
+                    VGA_RAM_SIZE);
+            qemu_free(old_vram);
+            s->map_addr = s->cirrus_lfb_addr;
+            s->map_end = s->cirrus_lfb_end;
+        }
+    }
+}
+
 /* load/save state */

 static void cirrus_vga_save(QEMUFile *f, void *opaque)
 {
     CirrusVGAState *s = opaque;
+    uint8_t vga_acc;

     qemu_put_be32s(f, &s->latch);
     qemu_put_8s(f, &s->sr_index);
@@ -3049,11 +3082,20 @@ static void cirrus_vga_save(QEMUFile *f,
     qemu_put_be32s(f, &s->hw_cursor_y);
     /* XXX: we do not save the bitblt state - we assume we do not save
        the state when the blitter is active */
+
+    vga_acc = (!!s->map_addr);
+    qemu_put_8s(f, &vga_acc);
+    qemu_put_be64s(f, (uint64_t*)&s->cirrus_lfb_addr);
+    qemu_put_be64s(f, (uint64_t*)&s->cirrus_lfb_end);
+    qemu_put_buffer(f, s->vram_ptr, VGA_RAM_SIZE);
+    if (vga_acc)
+        cirrus_stop_acc(s);
 }

 static int cirrus_vga_load(QEMUFile *f, void *opaque, int version_id)
 {
     CirrusVGAState *s = opaque;
+    uint8_t vga_acc = 0;

     if (version_id != 1)
         return -EINVAL;
@@ -3091,6 +3133,14 @@ static int cirrus_vga_load(QEMUFile *f,

     qemu_get_be32s(f, &s->hw_cursor_x);
     qemu_get_be32s(f, &s->hw_cursor_y);
+
+    qemu_get_8s(f, &vga_acc);
+    qemu_get_be64s(f, (uint64_t*)&s->cirrus_lfb_addr);
+    qemu_get_be64s(f, (uint64_t*)&s->cirrus_lfb_end);
+    qemu_get_buffer(f, s->vram_ptr, VGA_RAM_SIZE);
+    if (vga_acc){
+        cirrus_restart_acc(s);
+    }

     /* force refresh */
     s->graphic_mode = -1;
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/ioemu/target-i386-dm/helper2.c
--- a/tools/ioemu/target-i386-dm/helper2.c      Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/ioemu/target-i386-dm/helper2.c      Wed Dec 13 22:52:02 2006 +0800
@@ -525,6 +525,7 @@ int main_loop(void)
 {
     extern int vm_running;
     extern int shutdown_requested;
+    extern int suspend_requested;
     CPUState *env = cpu_single_env;
     int evtchn_fd = xc_evtchn_fd(xce_handle);

@@ -542,12 +543,24 @@ int main_loop(void)
                 qemu_system_reset();
                 reset_requested = 0;
             }
+            if (suspend_requested) {
+                fprintf(logfile, "device model received suspend signal!\n");
+                break;
+            }
         }

         /* Wait up to 10 msec. */
         main_loop_wait(10);
     }
-    destroy_hvm_domain();
+    if (!suspend_requested)
+        destroy_hvm_domain();
+    else {
+        char qemu_file[20];
+        sprintf(qemu_file, "/tmp/xen.qemu-dm.%d", domid);
+        if (qemu_savevm(qemu_file) < 0)
+            fprintf(stderr, "qemu save fail.\n");
+    }
+
     return 0;
 }

diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/ioemu/target-i386-dm/piix_pci-dm.c
--- a/tools/ioemu/target-i386-dm/piix_pci-dm.c  Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/ioemu/target-i386-dm/piix_pci-dm.c  Wed Dec 13 22:52:02 2006 +0800
@@ -83,6 +83,11 @@ PCIBus *i440fx_init(void)
 /* PIIX3 PCI to ISA bridge */

 static PCIDevice *piix3_dev;
+static int pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num)
+{
+    /* This is the barber's pole mapping used by Xen. */
+    return (irq_num + (pci_dev->devfn >> 3)) & 3;
+}

 static void piix3_write_config(PCIDevice *d,
                                uint32_t address, uint32_t val, int len)
@@ -150,3 +155,227 @@ int piix3_init(PCIBus *bus)
 }

 void pci_bios_init(void) {}
+
+/***********************************************************/
+/* XXX: the following should be moved to the PC BIOS */
+
+static __attribute__((unused)) uint32_t isa_inb(uint32_t addr)
+{
+    return cpu_inb(NULL, addr);
+}
+
+static void isa_outb(uint32_t val, uint32_t addr)
+{
+    cpu_outb(NULL, addr, val);
+}
+
+static __attribute__((unused)) uint32_t isa_inw(uint32_t addr)
+{
+    return cpu_inw(NULL, addr);
+}
+
+static __attribute__((unused)) void isa_outw(uint32_t val, uint32_t addr)
+{
+    cpu_outw(NULL, addr, val);
+}
+
+static __attribute__((unused)) uint32_t isa_inl(uint32_t addr)
+{
+    return cpu_inl(NULL, addr);
+}
+
+static __attribute__((unused)) void isa_outl(uint32_t val, uint32_t addr)
+{
+    cpu_outl(NULL, addr, val);
+}
+
+static uint32_t pci_bios_io_addr;
+static uint32_t pci_bios_mem_addr;
+/* host irqs corresponding to PCI irqs A-D */
+static uint8_t pci_irqs[4] = { 5, 6, 10, 11 };
+
+static void pci_config_writel(PCIDevice *d, uint32_t addr, uint32_t val)
+{
+    PCIBus *s = d->bus;
+    addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
+    pci_data_write(s, addr, val, 4);
+}
+
+static void pci_config_writew(PCIDevice *d, uint32_t addr, uint32_t val)
+{
+    PCIBus *s = d->bus;
+    addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
+    pci_data_write(s, addr, val, 2);
+}
+
+static void pci_config_writeb(PCIDevice *d, uint32_t addr, uint32_t val)
+{
+    PCIBus *s = d->bus;
+    addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
+    pci_data_write(s, addr, val, 1);
+}
+
+static __attribute__((unused)) uint32_t pci_config_readl(PCIDevice *d, 
uint32_t addr)
+{
+    PCIBus *s = d->bus;
+    addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
+    return pci_data_read(s, addr, 4);
+}
+
+static uint32_t pci_config_readw(PCIDevice *d, uint32_t addr)
+{
+    PCIBus *s = d->bus;
+    addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
+    return pci_data_read(s, addr, 2);
+}
+
+static uint32_t pci_config_readb(PCIDevice *d, uint32_t addr)
+{
+    PCIBus *s = d->bus;
+    addr |= (pci_bus_num(s) << 16) | (d->devfn << 8);
+    return pci_data_read(s, addr, 1);
+}
+
+static void pci_set_io_region_addr(PCIDevice *d, int region_num, uint32_t addr)
+{
+    PCIIORegion *r;
+    uint16_t cmd;
+    uint32_t ofs;
+
+    if ( region_num == PCI_ROM_SLOT ) {
+        ofs = 0x30;
+    }else{
+        ofs = 0x10 + region_num * 4;
+    }
+
+    pci_config_writel(d, ofs, addr);
+    r = &d->io_regions[region_num];
+
+    /* enable memory mappings */
+    cmd = pci_config_readw(d, PCI_COMMAND);
+    if ( region_num == PCI_ROM_SLOT )
+        cmd |= 2;
+    else if (r->type & PCI_ADDRESS_SPACE_IO)
+        cmd |= 1;
+    else
+        cmd |= 2;
+    pci_config_writew(d, PCI_COMMAND, cmd);
+}
+
+static void pci_bios_init_device(PCIDevice *d)
+{
+    int class;
+    PCIIORegion *r;
+    uint32_t *paddr;
+    int i, pin, pic_irq, vendor_id, device_id;
+
+    class = pci_config_readw(d, PCI_CLASS_DEVICE);
+    vendor_id = pci_config_readw(d, PCI_VENDOR_ID);
+    device_id = pci_config_readw(d, PCI_DEVICE_ID);
+    switch(class) {
+    case 0x0101:
+        if (vendor_id == 0x8086 && device_id == 0x7010) {
+            /* PIIX3 IDE */
+            pci_config_writew(d, 0x40, 0x8000); // enable IDE0
+            pci_config_writew(d, 0x42, 0x8000); // enable IDE1
+            goto default_map;
+        } else {
+            /* IDE: we map it as in ISA mode */
+            pci_set_io_region_addr(d, 0, 0x1f0);
+            pci_set_io_region_addr(d, 1, 0x3f4);
+            pci_set_io_region_addr(d, 2, 0x170);
+            pci_set_io_region_addr(d, 3, 0x374);
+        }
+        break;
+    case 0x0680:
+        if (vendor_id == 0x8086 && device_id == 0x7113) {
+            /*
+             * PIIX4 ACPI PM.
+             * Special device with special PCI config space. No ordinary BARs.
+             */
+            pci_config_writew(d, 0x20, 0x0000); // No smb bus IO enable
+            pci_config_writew(d, 0x22, 0x0000);
+            pci_config_writew(d, 0x3c, 0x0009); // Hardcoded IRQ9
+            pci_config_writew(d, 0x3d, 0x0001);
+        }
+        break;
+    case 0x0300:
+        if (vendor_id != 0x1234)
+            goto default_map;
+        /* VGA: map frame buffer to default Bochs VBE address */
+        pci_set_io_region_addr(d, 0, 0xE0000000);
+        break;
+    case 0x0800:
+        /* PIC */
+        vendor_id = pci_config_readw(d, PCI_VENDOR_ID);
+        device_id = pci_config_readw(d, PCI_DEVICE_ID);
+        if (vendor_id == 0x1014) {
+            /* IBM */
+            if (device_id == 0x0046 || device_id == 0xFFFF) {
+                /* MPIC & MPIC2 */
+                pci_set_io_region_addr(d, 0, 0x80800000 + 0x00040000);
+            }
+        }
+        break;
+    case 0xff00:
+        if (vendor_id == 0x0106b &&
+            (device_id == 0x0017 || device_id == 0x0022)) {
+            /* macio bridge */
+            pci_set_io_region_addr(d, 0, 0x80800000);
+        }
+        break;
+    default:
+    default_map:
+        /* default memory mappings */
+        for(i = 0; i < PCI_NUM_REGIONS; i++) {
+            r = &d->io_regions[i];
+            if (r->size) {
+                if (r->type & PCI_ADDRESS_SPACE_IO)
+                    paddr = &pci_bios_io_addr;
+                else
+                    paddr = &pci_bios_mem_addr;
+                *paddr = (*paddr + r->size - 1) & ~(r->size - 1);
+                pci_set_io_region_addr(d, i, *paddr);
+                *paddr += r->size;
+            }
+        }
+        break;
+    }
+
+    /* map the interrupt */
+    pin = pci_config_readb(d, PCI_INTERRUPT_PIN);
+    if (pin != 0) {
+        pin = pci_slot_get_pirq(d, pin - 1);
+        pic_irq = pci_irqs[pin];
+        pci_config_writeb(d, PCI_INTERRUPT_LINE, pic_irq);
+    }
+}
+
+/*
+ * This function initializes the PCI devices as a normal PCI BIOS
+ * would do. It is provided just in case the BIOS has no support for
+ * PCI.
+ */
+void pci_setup(void)
+{
+    int i, irq;
+    uint8_t elcr[2];
+
+    pci_bios_io_addr = 0xc000;
+    pci_bios_mem_addr = HVM_BELOW_4G_MMIO_START;
+
+    /* activate IRQ mappings */
+    elcr[0] = 0x00;
+    elcr[1] = 0x00;
+    for(i = 0; i < 4; i++) {
+        irq = pci_irqs[i];
+        /* set to trigger level */
+        elcr[irq >> 3] |= (1 << (irq & 7));
+        /* activate irq remapping in PIIX */
+        pci_config_writeb(piix3_dev, 0x60 + i, irq);
+    }
+    isa_outb(elcr[0], 0x4d0);
+    isa_outb(elcr[1], 0x4d1);
+
+    pci_for_each_device(pci_bios_init_device);
+}
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/ioemu/vl.c
--- a/tools/ioemu/vl.c  Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/ioemu/vl.c  Wed Dec 13 22:52:02 2006 +0800
@@ -4441,6 +4441,11 @@ int qemu_loadvm(const char *filename)
         qemu_fseek(f, cur_pos + record_len, SEEK_SET);
     }
     fclose(f);
+
+    /* del tmp file */
+    if (unlink(filename) == -1)
+        fprintf(stderr, "delete tmp qemu state file failed.\n");
+
     ret = 0;
  the_end:
     if (saved_vm_running)
@@ -5027,6 +5032,7 @@ static QEMUResetEntry *first_reset_entry
 static QEMUResetEntry *first_reset_entry;
 int reset_requested;
 int shutdown_requested;
+int suspend_requested;
 static int powerdown_requested;

 void qemu_register_reset(QEMUResetHandler *func, void *opaque)
@@ -5806,6 +5812,14 @@ int set_mm_mapping(int xc_handle, uint32
     }

     return 0;
+}
+
+void suspend(int sig)
+{
+   fprintf(logfile, "suspend sig handler called with requested=%d!\n", 
suspend_requested);
+    if (sig != SIGUSR1)
+        fprintf(logfile, "suspend signal dismatch, get sig=%d!\n", sig);
+    suspend_requested = 1;
 }

 #if defined(__i386__) || defined(__x86_64__)
@@ -6709,8 +6723,12 @@ int main(int argc, char **argv)
         }
     } else
 #endif
-    if (loadvm)
+    if (loadvm) {
+        /*XXX: ugly, since pci_bios_init are moved to hvmloader*/
+        extern void pci_setup(void);
+        pci_setup();
         qemu_loadvm(loadvm);
+    }

     {
         /* XXX: simplify init */
@@ -6719,6 +6737,26 @@ int main(int argc, char **argv)
             vm_start();
         }
     }
+
+    /* register signal for the suspend request when save */
+    {
+        struct sigaction act;
+        sigset_t set;
+        act.sa_handler = suspend;
+        act.sa_flags = SA_RESTART;
+        sigemptyset(&act.sa_mask);
+
+        sigaction(SIGUSR1, &act, NULL);
+
+        /* control panel mask some signals when spawn qemu, need unmask here*/
+        sigemptyset(&set);
+        sigaddset(&set, SIGUSR1);
+        sigaddset(&set, SIGTERM);
+        if (sigprocmask(SIG_UNBLOCK, &set, NULL) == -1)
+            fprintf(stderr, "unblock signal fail, possible issue for HVM 
save!\n");
+
+    }
+
     main_loop();
     quit_timers();
     return 0;
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/Makefile
--- a/tools/libxc/Makefile      Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/libxc/Makefile      Wed Dec 13 22:52:02 2006 +0800
@@ -27,7 +27,7 @@ GUEST_SRCS-$(CONFIG_X86) += xc_linux_bui
 GUEST_SRCS-$(CONFIG_X86) += xc_linux_build.c
 GUEST_SRCS-$(CONFIG_IA64) += xc_linux_build.c
 GUEST_SRCS-$(CONFIG_MIGRATE) += xc_linux_restore.c xc_linux_save.c
-GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c
+GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c xc_hvm_restore.c xc_hvm_save.c

 -include $(XEN_TARGET_ARCH)/Makefile

diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c   Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/libxc/xc_domain.c   Wed Dec 13 22:52:02 2006 +0800
@@ -233,6 +233,50 @@ int xc_domain_getinfolist(int xc_handle,
     unlock_pages(info, max_domains*sizeof(xc_domaininfo_t));

     return ret;
+}
+
+/* get info from hvm guest for save */
+int xc_domain_hvm_getcontext(int xc_handle,
+                             uint32_t domid,
+                             hvm_domain_context_t *hvm_ctxt)
+{
+    int rc;
+    DECLARE_DOMCTL;
+
+    domctl.cmd = XEN_DOMCTL_gethvmcontext;
+    domctl.domain = (domid_t)domid;
+    set_xen_guest_handle(domctl.u.hvmcontext.ctxt, hvm_ctxt);
+
+    if ( (rc = mlock(hvm_ctxt, sizeof(*hvm_ctxt))) != 0 )
+        return rc;
+
+    rc = do_domctl(xc_handle, &domctl);
+
+    safe_munlock(hvm_ctxt, sizeof(*hvm_ctxt));
+
+    return rc;
+}
+
+/* set info to hvm guest for restore */
+int xc_domain_hvm_setcontext(int xc_handle,
+                             uint32_t domid,
+                             hvm_domain_context_t *hvm_ctxt)
+{
+    int rc;
+    DECLARE_DOMCTL;
+
+    domctl.cmd = XEN_DOMCTL_sethvmcontext;
+    domctl.domain = domid;
+    set_xen_guest_handle(domctl.u.hvmcontext.ctxt, hvm_ctxt);
+
+    if ( (rc = mlock(hvm_ctxt, sizeof(*hvm_ctxt))) != 0 )
+        return rc;
+
+    rc = do_domctl(xc_handle, &domctl);
+
+    safe_munlock(hvm_ctxt, sizeof(*hvm_ctxt));
+
+    return rc;
 }

 int xc_vcpu_getcontext(int xc_handle,
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c        Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/libxc/xc_hvm_build.c        Wed Dec 13 22:52:02 2006 +0800
@@ -86,7 +86,7 @@ static void build_e820map(void *e820_pag

     /* 0x0-0x9F000: Ordinary RAM. */
     e820entry[nr_map].addr = 0x0;
-    e820entry[nr_map].size = 0x9F000;
+    e820entry[nr_map].size = 0x90000;
     e820entry[nr_map].type = E820_RAM;
     nr_map++;

@@ -96,7 +96,7 @@ static void build_e820map(void *e820_pag
      * TODO: SMBIOS tables should be moved higher (>=0xE0000).
      *       They are unusually low in our memory map: could cause problems?
      */
-    e820entry[nr_map].addr = 0x9F000;
+    e820entry[nr_map].addr = 0x90000;
     e820entry[nr_map].size = 0x1000;
     e820entry[nr_map].type = E820_RESERVED;
     nr_map++;
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_linux_save.c
--- a/tools/libxc/xc_linux_save.c       Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/libxc/xc_linux_save.c       Wed Dec 13 22:52:02 2006 +0800
@@ -261,15 +261,6 @@ static int ratewrite(int io_fd, void *bu
 #endif


-static inline ssize_t write_exact(int fd, void *buf, size_t count)
-{
-    if(write(fd, buf, count) != count)
-        return 0;
-    return 1;
-}
-
-
-
 static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
                        xc_shadow_op_stats_t *stats, int print)
 {
@@ -356,7 +347,7 @@ static int analysis_phase(int xc_handle,
 }


-static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
                              int dom, xc_dominfo_t *info,
                              vcpu_guest_context_t *ctxt)
 {
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/libxc/xenctrl.h     Wed Dec 13 22:52:02 2006 +0800
@@ -313,6 +313,30 @@ int xc_domain_getinfolist(int xc_handle,
                           xc_domaininfo_t *info);

 /**
+ * This function returns information about the context of a hvm domain
+ * @parm xc_handle a handle to an open hypervisor interface
+ * @parm domid the domain to get information from
+ * @parm hvm_ctxt a pointer to a structure to store the execution context of 
the
+ *            hvm domain
+ * @return 0 on success, -1 on failure
+ */
+int xc_domain_hvm_getcontext(int xc_handle,
+                             uint32_t domid,
+                             hvm_domain_context_t *hvm_ctxt);
+
+/**
+ * This function will set the context for hvm domain
+ *
+ * @parm xc_handle a handle to an open hypervisor interface
+ * @parm domid the domain to set the hvm domain context for
+ * @parm hvm_ctxt pointer to the the hvm context with the values to set
+ * @return 0 on success, -1 on failure
+ */
+int xc_domain_hvm_setcontext(int xc_handle,
+                             uint32_t domid,
+                             hvm_domain_context_t *hvm_ctxt);
+
+/**
  * This function returns information about the execution context of a
  * particular vcpu of a domain.
  *
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h    Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/libxc/xenguest.h    Wed Dec 13 22:52:02 2006 +0800
@@ -11,6 +11,7 @@

 #define XCFLAGS_LIVE      1
 #define XCFLAGS_DEBUG     2
+#define XCFLAGS_HVM       4


 /**
@@ -25,6 +26,13 @@ int xc_linux_save(int xc_handle, int io_
                   uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
                   int (*suspend)(int domid));

+/**
+ * This function will save a hvm domain running unmodified guest.
+ * @return 0 on success, -1 on failure
+ */
+int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+                  uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
+                  int (*suspend)(int domid));

 /**
  * This function will restore a saved domain running Linux.
@@ -41,6 +49,18 @@ int xc_linux_restore(int xc_handle, int
                      unsigned long nr_pfns, unsigned int store_evtchn,
                      unsigned long *store_mfn, unsigned int console_evtchn,
                      unsigned long *console_mfn);
+
+/**
+ * This function will restore a saved hvm domain running unmodified guest.
+ *
+ * @parm store_mfn pass mem size & returned with the mfn of the store page
+ * @return 0 on success, -1 on failure
+ */
+int xc_hvm_restore(int xc_handle, int io_fd, uint32_t dom,
+                      unsigned long nr_pfns, unsigned int store_evtchn,
+                      unsigned long *store_mfn, unsigned int console_evtchn,
+                      unsigned long *console_mfn,
+                      unsigned int pae, unsigned int apic);

 /**
  * This function will create a domain for a paravirtualized Linux
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h     Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/libxc/xg_save_restore.h     Wed Dec 13 22:52:02 2006 +0800
@@ -65,6 +65,16 @@ static int get_platform_info(int xc_hand
     return 1;
 }

+static inline ssize_t write_exact(int fd, void *buf, size_t count)
+{
+    if(write(fd, buf, count) != count)
+        return 0;
+    return 1;
+}
+
+extern int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+                             int dom, xc_dominfo_t *info,
+                             vcpu_guest_context_t *ctxt);

 /*
 ** Save/restore deal with the mfn_to_pfn (M2P) and pfn_to_mfn (P2M) tables.
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/python/xen/lowlevel/xc/xc.c Wed Dec 13 22:52:02 2006 +0800
@@ -158,6 +158,20 @@ static PyObject *pyxc_domain_destroy(XcO
 static PyObject *pyxc_domain_destroy(XcObject *self, PyObject *args)
 {
     return dom_op(self, args, xc_domain_destroy);
+}
+
+static PyObject *pyxc_domain_shutdown(XcObject *self, PyObject *args)
+{
+    uint32_t dom, reason;
+
+    if (!PyArg_ParseTuple(args, "ii", &dom, &reason))
+      return NULL;
+
+    if (xc_domain_shutdown(self->xc_handle, dom, reason) != 0)
+        return pyxc_error_to_exception();
+
+    Py_INCREF(zero);
+    return zero;
 }


@@ -969,6 +983,14 @@ static PyMethodDef pyxc_methods[] = {
       METH_VARARGS, "\n"
       "Destroy a domain.\n"
       " dom [int]:    Identifier of domain to be destroyed.\n\n"
+      "Returns: [int] 0 on success; -1 on error.\n" },
+
+    { "domain_shutdown",
+      (PyCFunction)pyxc_domain_shutdown,
+      METH_VARARGS, "\n"
+      "Shutdown a domain.\n"
+      " dom       [int, 0]:      Domain identifier to use.\n"
+      " reason     [int, 0]:      Reason for shutdown.\n"
       "Returns: [int] 0 on success; -1 on error.\n" },

     { "vcpu_setaffinity",
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py   Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/python/xen/xend/XendCheckpoint.py   Wed Dec 13 22:52:02 2006 +0800
@@ -22,11 +22,14 @@ from xen.xend.XendConstants import *
 from xen.xend.XendConstants import *

 SIGNATURE = "LinuxGuestRecord"
+QEMU_SIGNATURE = "QemuDeviceModelRecord"
+dm_batch = 512
 XC_SAVE = "xc_save"
 XC_RESTORE = "xc_restore"


 sizeof_int = calcsize("i")
+sizeof_unsigned_int = calcsize("I")
 sizeof_unsigned_long = calcsize("L")


@@ -69,6 +72,11 @@ def save(fd, dominfo, network, live, dst
                     "could not write guest state file: config len")
         write_exact(fd, config, "could not write guest state file: config")

+        image_cfg = dominfo.info.get('image', {})
+        hvm = image_cfg.has_key('hvm')
+
+        if hvm:
+            log.info("save hvm domain")
         # xc_save takes three customization parameters: maxit, max_f, and
         # flags the last controls whether or not save is 'live', while the
         # first two further customize behaviour when 'live' save is
@@ -76,7 +84,7 @@ def save(fd, dominfo, network, live, dst
         # libxenguest; see the comments and/or code in xc_linux_save() for
         # more information.
         cmd = [xen.util.auxbin.pathTo(XC_SAVE), str(fd),
-               str(dominfo.getDomid()), "0", "0", str(int(live)) ]
+               str(dominfo.getDomid()), "0", "0", str(int(live) | (int(hvm) << 
2)) ]
         log.debug("[xc_save]: %s", string.join(cmd))

         def saveInputHandler(line, tochild):
@@ -90,11 +98,28 @@ def save(fd, dominfo, network, live, dst
                 log.info("Domain %d suspended.", dominfo.getDomid())
                 dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP3,
                                        domain_name)
+                #send signal to device model for save
+                if hvm:
+                    log.info("release_devices for hvm domain")
+                    dominfo._releaseDevices(True)
                 tochild.write("done\n")
                 tochild.flush()
                 log.debug('Written done')

         forkHelper(cmd, fd, saveInputHandler, False)
+
+        # put qemu device model state
+        if hvm:
+            write_exact(fd, QEMU_SIGNATURE, "could not write qemu signature")
+            qemu_fd = os.open("/tmp/xen.qemu-dm.%d" % dominfo.getDomid(), 
os.O_RDONLY)
+            while True:
+                buf = os.read(qemu_fd, dm_batch)
+                if len(buf):
+                    write_exact(fd, buf, "could not write device model state")
+                else:
+                    break
+            os.close(qemu_fd)
+            os.remove("/tmp/xen.qemu-dm.%d" % dominfo.getDomid())

         dominfo.destroyDomain()
         try:
@@ -147,19 +172,38 @@ def restore(xd, fd, dominfo = None, paus
     assert store_port
     assert console_port

+    #if hvm, pass mem size to calculate the store_mfn
+    hvm = 0
+    apic = 0
+    pae = 0
+    image_cfg = dominfo.info.get('image', {})
+    hvm = image_cfg.has_key('hvm')
+    if hvm:
+        #the 'memory' in config has been removed
+        hvm  = dominfo.info['memory_static_min']
+        apic = dominfo.info['image']['hvm'].get('apic', 0)
+        pae  = dominfo.info['image']['hvm'].get('pae',  0)
+        log.info("restore hvm domain %d, mem=%d, apic=%d, pae=%d", 
dominfo.domid, hvm, apic, pae)
+
     try:
-        l = read_exact(fd, sizeof_unsigned_long,
-                       "not a valid guest state file: pfn count read")
-        nr_pfns = unpack("L", l)[0]    # native sizeof long
+        if hvm:
+            l = read_exact(fd, sizeof_unsigned_int,
+                    "not a valid hvm guest state file: pfn count read")
+            nr_pfns = unpack("I", l)[0]    # native sizeof int
+        else:
+            l = read_exact(fd, sizeof_unsigned_long,
+                           "not a valid guest state file: pfn count read")
+            nr_pfns = unpack("L", l)[0]    # native sizeof long
         if nr_pfns > 16*1024*1024:     # XXX
             raise XendError(
                 "not a valid guest state file: pfn count out of range")

         balloon.free(xc.pages_to_kib(nr_pfns))
+        log.info("HVM restore:balloon free 0x%x pages.", nr_pfns)

         cmd = map(str, [xen.util.auxbin.pathTo(XC_RESTORE),
                         fd, dominfo.getDomid(), nr_pfns,
-                        store_port, console_port])
+                        store_port, console_port, hvm, pae, apic])
         log.debug("[xc_restore]: %s", string.join(cmd))

         handler = RestoreInputHandler()
@@ -169,10 +213,29 @@ def restore(xd, fd, dominfo = None, paus
         if handler.store_mfn is None or handler.console_mfn is None:
             raise XendError('Could not read store/console MFN')

-        os.read(fd, 1)           # Wait for source to close connection
         dominfo.waitForDevices() # Wait for backends to set up
         if not paused:
             dominfo.unpause()
+
+         # get qemu state and create a tmp file for dm restore
+        if hvm:
+            qemu_signature = read_exact(fd, len(QEMU_SIGNATURE),
+                    "not a valid device model state: signature read")
+            if qemu_signature != QEMU_SIGNATURE:
+                raise XendError("not a valid device model state: found '%s'" %
+                        qemu_signature)
+            qemu_fd = os.open("/tmp/xen.qemu-dm.%d" % dominfo.getDomid(),
+                    os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
+            while True:
+                buf = os.read(fd, dm_batch)
+                if len(buf):
+                    write_exact(qemu_fd, buf, "could not write dm state to tmp 
file")
+                else:
+                    break
+            os.close(qemu_fd)
+
+        os.read(fd, 1)           # Wait for source to close connection
+

         dominfo.completeRestore(handler.store_mfn, handler.console_mfn)

diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/python/xen/xend/XendDomainInfo.py   Wed Dec 13 22:52:02 2006 +0800
@@ -488,6 +488,16 @@ class XendDomainInfo:
         self._removeVm('xend/previous_restart_time')
         self.storeDom("control/shutdown", reason)

+        ## shutdown hypercall for hvm domain desides xenstore write
+        image_cfg = self.info.get('image', {})
+        hvm = image_cfg.has_key('hvm')
+        if hvm:
+            for code in DOMAIN_SHUTDOWN_REASONS.keys():
+                if DOMAIN_SHUTDOWN_REASONS[code] == reason:
+                    break
+            xc.domain_shutdown(self.domid, code)
+
+
     def pause(self):
         """Pause domain

@@ -1203,8 +1213,11 @@ class XendDomainInfo:
         if self.image:
             self.image.createDeviceModel()

-    def _releaseDevices(self):
+    def _releaseDevices(self, suspend = False):
         """Release all domain's devices.  Nothrow guarantee."""
+        if suspend and self.image:
+            self.image.destroy(suspend)
+            return

         while True:
             t = xstransact("%s/device" % self.dompath)
@@ -1473,6 +1486,16 @@ class XendDomainInfo:
         self.console_mfn = console_mfn

         self._introduceDomain()
+        image_cfg = self.info.get('image', {})
+        hvm = image_cfg.has_key('hvm')
+        if hvm:
+            self.image = image.create(self,
+                    self.info,
+                    self.info['image'],
+                    self.info['devices'])
+            if self.image:
+                self.image.createDeviceModel(True)
+                self.image.register_shutdown_watch()
         self._storeDomDetails()
         self._registerWatches()
         self.refreshShutdown()
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py    Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/python/xen/xend/image.py    Wed Dec 13 22:52:02 2006 +0800
@@ -157,7 +157,7 @@ class ImageHandler:
         """Build the domain. Define in subclass."""
         raise NotImplementedError()

-    def createDeviceModel(self):
+    def createDeviceModel(self, restore = False):
         """Create device model for the domain (define in subclass if 
needed)."""
         pass

@@ -405,7 +405,7 @@ class HVMImageHandler(ImageHandler):

         return ret

-    def createDeviceModel(self):
+    def createDeviceModel(self, restore = False):
         if self.pid:
             return
         # Execute device model.
@@ -414,6 +414,8 @@ class HVMImageHandler(ImageHandler):
         args = args + ([ "-d",  "%d" % self.vm.getDomid(),
                   "-m", "%s" % (self.getRequiredInitialReservation() / 1024)])
         args = args + self.dmargs
+        if restore:
+            args = args + ([ "-loadvm", "/tmp/xen.qemu-dm.%d" % 
self.vm.getDomid() ])
         env = dict(os.environ)
         if self.display:
             env['DISPLAY'] = self.display
@@ -432,12 +434,16 @@ class HVMImageHandler(ImageHandler):
         self.register_reboot_feature_watch()
         self.pid = self.vm.gatherDom(('image/device-model-pid', int))

-    def destroy(self):
+    def destroy(self, suspend = False):
         self.unregister_shutdown_watch()
         self.unregister_reboot_feature_watch();
         if self.pid:
             try:
-                os.kill(self.pid, signal.SIGKILL)
+                sig = signal.SIGKILL
+                if suspend:
+                    log.info("use sigusr1 to signal qemu %d", self.pid)
+                    sig = signal.SIGUSR1
+                os.kill(self.pid, sig)
             except OSError, exn:
                 log.exception(exn)
             try:
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/xcutils/xc_restore.c
--- a/tools/xcutils/xc_restore.c        Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/xcutils/xc_restore.c        Wed Dec 13 22:52:02 2006 +0800
@@ -19,12 +19,13 @@ main(int argc, char **argv)
 main(int argc, char **argv)
 {
     unsigned int xc_fd, io_fd, domid, nr_pfns, store_evtchn, console_evtchn;
+    unsigned int hvm, pae, apic;
     int ret;
     unsigned long store_mfn, console_mfn;

-    if (argc != 6)
+    if (argc != 9)
        errx(1,
-            "usage: %s iofd domid nr_pfns store_evtchn console_evtchn",
+            "usage: %s iofd domid nr_pfns store_evtchn console_evtchn hvm pae 
apic",
             argv[0]);

     xc_fd = xc_interface_open();
@@ -36,9 +37,19 @@ main(int argc, char **argv)
     nr_pfns = atoi(argv[3]);
     store_evtchn = atoi(argv[4]);
     console_evtchn = atoi(argv[5]);
+    hvm  = atoi(argv[6]);
+    pae  = atoi(argv[7]);
+    apic = atoi(argv[8]);

-    ret = xc_linux_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn,
-                          &store_mfn, console_evtchn, &console_mfn);
+    if (hvm) {
+         /* pass the memsize to xc_hvm_restore to find the store_mfn */
+        store_mfn = hvm;
+        ret = xc_hvm_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn,
+                &store_mfn, console_evtchn, &console_mfn, pae, apic);
+    } else
+        ret = xc_linux_restore(xc_fd, io_fd, domid, nr_pfns, store_evtchn,
+                &store_mfn, console_evtchn, &console_mfn);
+
     if (ret == 0) {
        printf("store-mfn %li\n", store_mfn);
        printf("console-mfn %li\n", console_mfn);
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/xcutils/xc_save.c
--- a/tools/xcutils/xc_save.c   Fri Sep 15 17:05:38 2006 +0800
+++ b/tools/xcutils/xc_save.c   Wed Dec 13 22:52:02 2006 +0800
@@ -51,7 +51,10 @@ main(int argc, char **argv)
     max_f = atoi(argv[4]);
     flags = atoi(argv[5]);

-    ret = xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags, &suspend);
+    if (flags & XCFLAGS_HVM)
+        ret = xc_hvm_save(xc_fd, io_fd, domid, maxit, max_f, flags, &suspend);
+    else
+        ret = xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags, 
&suspend);

     xc_interface_close(xc_fd);

diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/domain.c     Wed Dec 13 22:52:02 2006 +0800
@@ -330,6 +330,7 @@ int arch_set_info_guest(
     else
     {
         hvm_load_cpu_guest_regs(v, &v->arch.guest_context.user_regs);
+        hvm_load_cpu_context(v, &v->arch.guest_context.hvmcpu_ctxt);
     }

     if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c     Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/domctl.c     Wed Dec 13 22:52:02 2006 +0800
@@ -297,6 +297,7 @@ void arch_getdomaininfo_ctxt(
     if ( is_hvm_vcpu(v) )
     {
         hvm_store_cpu_guest_regs(v, &c->user_regs, c->ctrlreg);
+        hvm_save_cpu_context(v, &c->hvmcpu_ctxt);
     }
     else
     {
@@ -314,6 +315,22 @@ void arch_getdomaininfo_ctxt(
     c->ctrlreg[3] = xen_pfn_to_cr3(pagetable_get_pfn(v->arch.guest_table));

     c->vm_assist = v->domain->vm_assist;
+}
+
+int arch_gethvm_ctxt(
+    struct vcpu *v, struct hvm_domain_context *c)
+{
+    if ( !is_hvm_vcpu(v) )
+        return -1;
+
+    return hvm_save(v, c);
+
+}
+
+int arch_sethvm_ctxt(
+        struct vcpu *v, struct hvm_domain_context *c)
+{
+    return hvm_load(v, c);
 }

 /*
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/hvm/hvm.c    Wed Dec 13 22:52:02 2006 +0800
@@ -182,9 +182,18 @@ int hvm_domain_initialise(struct domain

 void hvm_domain_destroy(struct domain *d)
 {
+    HVMStateEntry *se, *dse;
     kill_timer(&d->arch.hvm_domain.pl_time.periodic_tm.timer);
     rtc_deinit(d);
     pmtimer_deinit(d);
+
+    se = d->arch.hvm_domain.first_se;
+    while (se) {
+        dse = se;
+        se = se->next;
+        xfree(dse);
+    }
+

     if ( d->arch.hvm_domain.shared_page_va )
         unmap_domain_page_global(
@@ -225,6 +234,9 @@ int hvm_vcpu_initialise(struct vcpu *v)
     pit_init(v, cpu_khz);
     rtc_init(v, RTC_PORT(0), RTC_IRQ);
     pmtimer_init(v, ACPI_PM_TMR_BLK_ADDRESS);
+
+    /* init hvm sharepage */
+    shpage_init(v->domain, get_sp(v->domain));

     /* Init guest TSC to start from zero. */
     hvm_set_guest_time(v, 0);
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/i8254.c
--- a/xen/arch/x86/hvm/i8254.c  Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/hvm/i8254.c  Wed Dec 13 22:52:02 2006 +0800
@@ -203,11 +203,11 @@ static inline void pit_load_count(PITCha
     switch (s->mode) {
         case 2:
             /* create periodic time */
-            s->pt = create_periodic_time (period, 0, 0, pit_time_fired, s);
+            s->pt = create_periodic_time (current->domain, period, 0, 0, 
pit_time_fired, s);
             break;
         case 1:
             /* create one shot time */
-            s->pt = create_periodic_time (period, 0, 1, pit_time_fired, s);
+            s->pt = create_periodic_time (current->domain, period, 0, 1, 
pit_time_fired, s);
 #ifdef DEBUG_PIT
             printk("HVM_PIT: create one shot time.\n");
 #endif
@@ -345,6 +345,152 @@ static uint32_t pit_ioport_read(void *op
     return ret;
 }

+#ifdef HVM_DEBUG_SUSPEND
+static void pit_info(PITState *pit)
+{
+    PITChannelState *s;
+    int i;
+
+    for(i = 0; i < 3; i++) {
+        printk("*****pit channel %d's state:*****\n", i);
+        s = &pit->channels[i];
+        printk("pit 0x%x.\n", s->count);
+        printk("pit 0x%x.\n", s->latched_count);
+        printk("pit 0x%x.\n", s->count_latched);
+        printk("pit 0x%x.\n", s->status_latched);
+        printk("pit 0x%x.\n", s->status);
+        printk("pit 0x%x.\n", s->read_state);
+        printk("pit 0x%x.\n", s->write_state);
+        printk("pit 0x%x.\n", s->write_latch);
+        printk("pit 0x%x.\n", s->rw_mode);
+        printk("pit 0x%x.\n", s->mode);
+        printk("pit 0x%x.\n", s->bcd);
+        printk("pit 0x%x.\n", s->gate);
+        printk("pit %"PRId64"\n", s->count_load_time);
+
+        if (s->pt) {
+            struct periodic_time *pt = s->pt;
+            printk("pit channel %d has a periodic timer:\n", i);
+            printk("pt %d.\n", pt->enabled);
+            printk("pt %d.\n", pt->one_shot);
+            printk("pt %d.\n", pt->irq);
+            printk("pt %d.\n", pt->first_injected);
+
+            printk("pt %d.\n", pt->pending_intr_nr);
+            printk("pt %d.\n", pt->period);
+            printk("pt %"PRId64"\n", pt->period_cycles);
+            printk("pt %"PRId64"\n", pt->last_plt_gtime);
+        }
+    }
+
+}
+#else
+static void pit_info(PITState *pit)
+{
+}
+#endif
+
+static void pit_save(hvm_domain_context_t *h, void *opaque)
+{
+    struct domain *d = opaque;
+    PITState *pit = &d->arch.hvm_domain.pl_time.vpit;
+    PITChannelState *s;
+    struct periodic_time *pt;
+    int i, pti = -1;
+
+    pit_info(pit);
+
+    for(i = 0; i < 3; i++) {
+        s = &pit->channels[i];
+        hvm_put_32u(h, s->count);
+        hvm_put_16u(h, s->latched_count);
+        hvm_put_8u(h, s->count_latched);
+        hvm_put_8u(h, s->status_latched);
+        hvm_put_8u(h, s->status);
+        hvm_put_8u(h, s->read_state);
+        hvm_put_8u(h, s->write_state);
+        hvm_put_8u(h, s->write_latch);
+        hvm_put_8u(h, s->rw_mode);
+        hvm_put_8u(h, s->mode);
+        hvm_put_8u(h, s->bcd);
+        hvm_put_8u(h, s->gate);
+        hvm_put_64u(h, s->count_load_time);
+
+        if (s->pt && pti == -1)
+            pti = i;
+    }
+
+    /* save guest time */
+    pt = pit->channels[pti].pt;
+    hvm_put_8u(h, pti);
+    hvm_put_8u(h, pt->first_injected);
+    hvm_put_32u(h, pt->pending_intr_nr);
+    hvm_put_64u(h, pt->last_plt_gtime);
+
+}
+
+static int pit_load(hvm_domain_context_t *h, void *opaque, int version_id)
+{
+    struct domain *d = opaque;
+    PITState *pit = &d->arch.hvm_domain.pl_time.vpit;
+    PITChannelState *s;
+    int i, pti;
+    u32 period;
+
+    if (version_id != 1)
+        return -EINVAL;
+
+    for(i = 0; i < 3; i++) {
+        s = &pit->channels[i];
+        s->count = hvm_get_32u(h);
+        s->latched_count = hvm_get_16u(h);
+        s->count_latched = hvm_get_8u(h);
+        s->status_latched = hvm_get_8u(h);
+        s->status = hvm_get_8u(h);
+        s->read_state = hvm_get_8u(h);
+        s->write_state = hvm_get_8u(h);
+        s->write_latch = hvm_get_8u(h);
+        s->rw_mode = hvm_get_8u(h);
+        s->mode = hvm_get_8u(h);
+        s->bcd = hvm_get_8u(h);
+        s->gate = hvm_get_8u(h);
+        s->count_load_time = hvm_get_64u(h);
+    }
+
+    pti = hvm_get_8u(h);
+    if ( pti < 0 || pti > 2) {
+        printk("pit load get a wrong channel %d when HVM resume.\n", pti);
+        return -EINVAL;
+    }
+
+    s = &pit->channels[pti];
+    period = DIV_ROUND((s->count * 1000000000ULL), PIT_FREQ);
+
+    printk("recreate periodic timer %d in mode %d, freq=%d.\n", pti, s->mode, 
period);
+    switch (s->mode) {
+        case 2:
+            /* create periodic time */
+            s->pt = create_periodic_time (d, period, 0, 0, pit_time_fired, s);
+            s->pt->first_injected = hvm_get_8u(h);
+            s->pt->pending_intr_nr = hvm_get_32u(h);
+            s->pt->last_plt_gtime = hvm_get_64u(h);
+            break;
+        case 1:
+            /* create one shot time */
+            s->pt = create_periodic_time (d, period, 0, 1, pit_time_fired, s);
+            break;
+        default:
+            printk("pit mode %"PRId8" should not use periodic timer!\n", 
s->mode);
+            return -EINVAL;
+    }
+
+    /*XXX: need set_guest_time here or do this when post_inject? */
+
+    pit_info(pit);
+
+    return 0;
+}
+
 static void pit_reset(void *opaque)
 {
     PITState *pit = opaque;
@@ -373,6 +519,8 @@ void pit_init(struct vcpu *v, unsigned l
     s->vcpu = v;
     s++; s->vcpu = v;
     s++; s->vcpu = v;
+
+    hvm_register_savevm(v->domain, "xen_hvm_i8254", PIT_BASE, 1, pit_save, 
pit_load, v->domain);

     register_portio_handler(v->domain, PIT_BASE, 4, handle_pit_io);
     /* register the speaker port */
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/intercept.c
--- a/xen/arch/x86/hvm/intercept.c      Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/hvm/intercept.c      Wed Dec 13 22:52:02 2006 +0800
@@ -29,6 +29,8 @@
 #include <asm/current.h>
 #include <io_ports.h>
 #include <xen/event.h>
+#include <xen/compile.h>
+#include <public/version.h>


 extern struct hvm_mmio_handler vlapic_mmio_handler;
@@ -314,13 +316,14 @@ void pickup_deactive_ticks(struct period
  * period: fire frequency in ns.
  */
 struct periodic_time * create_periodic_time(
+        struct domain *d,
         u32 period,
         char irq,
         char one_shot,
         time_cb *cb,
         void *data)
 {
-    struct periodic_time *pt = 
&(current->domain->arch.hvm_domain.pl_time.periodic_tm);
+    struct periodic_time *pt = &(d->arch.hvm_domain.pl_time.periodic_tm);
     if ( pt->enabled ) {
         stop_timer (&pt->timer);
         pt->enabled = 0;
@@ -353,6 +356,278 @@ void destroy_periodic_time(struct period
         stop_timer(&pt->timer);
         pt->enabled = 0;
     }
+}
+
+/* save/restore support */
+#define HVM_FILE_MAGIC   0x54381286
+#define HVM_FILE_VERSION 0x00000001
+
+int hvm_register_savevm(struct domain *d,
+                    const char *idstr,
+                    int instance_id,
+                    int version_id,
+                    SaveStateHandler *save_state,
+                    LoadStateHandler *load_state,
+                    void *opaque)
+{
+    HVMStateEntry *se, **pse;
+
+    if ( (se = xmalloc(struct HVMStateEntry)) == NULL ){
+        printk("allocat hvmstate entry fail.\n");
+        return -1;
+    }
+
+    strncpy(se->idstr, idstr, HVM_SE_IDSTR_LEN);
+
+    se->instance_id = instance_id;
+    se->version_id = version_id;
+    se->save_state = save_state;
+    se->load_state = load_state;
+    se->opaque = opaque;
+    se->next = NULL;
+
+    /* add at the end of list */
+    pse = &d->arch.hvm_domain.first_se;
+    while (*pse != NULL)
+        pse = &(*pse)->next;
+    *pse = se;
+    return 0;
+}
+
+int hvm_save(struct vcpu *v, hvm_domain_context_t *h)
+{
+    uint32_t len, len_pos, cur_pos;
+    uint32_t eax, ebx, ecx, edx;
+    HVMStateEntry *se;
+    char *chgset;
+
+    if (!is_hvm_vcpu(v)) {
+        printk("hvm_save only for hvm guest!\n");
+        return -1;
+    }
+
+    memset(h, 0, sizeof(hvm_domain_context_t));
+    hvm_put_32u(h, HVM_FILE_MAGIC);
+    hvm_put_32u(h, HVM_FILE_VERSION);
+
+    /* save xen changeset */
+    chgset = strrchr(XEN_CHANGESET, ' ') + 1;
+
+    len = strlen(chgset);
+    hvm_put_8u(h, len);
+    hvm_put_buffer(h, chgset, len);
+
+    /* save cpuid */
+    cpuid(1, &eax, &ebx, &ecx, &edx);
+    hvm_put_32u(h, eax);
+
+    for(se = v->domain->arch.hvm_domain.first_se; se != NULL; se = se->next) {
+        /* ID string */
+        len = strnlen(se->idstr, HVM_SE_IDSTR_LEN);
+        hvm_put_8u(h, len);
+        hvm_put_buffer(h, se->idstr, len);
+
+        hvm_put_32u(h, se->instance_id);
+        hvm_put_32u(h, se->version_id);
+
+        /* record size */
+        len_pos = hvm_ctxt_tell(h);
+        hvm_put_32u(h, 0);
+
+        se->save_state(h, se->opaque);
+
+        cur_pos = hvm_ctxt_tell(h);
+        len = cur_pos - len_pos - 4;
+        hvm_ctxt_seek(h, len_pos);
+        hvm_put_32u(h, len);
+        hvm_ctxt_seek(h, cur_pos);
+
+    }
+
+    h->size = hvm_ctxt_tell(h);
+    hvm_ctxt_seek(h, 0);
+
+    if (h->size >= HVM_CTXT_SIZE) {
+        printk("hvm_domain_context overflow when hvm_save! need %"PRId32" bytes for 
use.\n", h->size);
+        return -1;
+    }
+
+    return 0;
+
+}
+
+static HVMStateEntry *find_se(struct domain *d, const char *idstr, int 
instance_id)
+{
+    HVMStateEntry *se;
+
+    for(se = d->arch.hvm_domain.first_se; se != NULL; se = se->next) {
+        if (!strncmp(se->idstr, idstr, HVM_SE_IDSTR_LEN) &&
+            instance_id == se->instance_id){
+            return se;
+        }
+    }
+    return NULL;
+}
+
+int hvm_load(struct vcpu *v, hvm_domain_context_t *h)
+{
+    uint32_t len, rec_len, rec_pos, magic, instance_id, version_id;
+    uint32_t eax, ebx, ecx, edx;
+    HVMStateEntry *se;
+    char idstr[HVM_SE_IDSTR_LEN];
+    xen_changeset_info_t chgset;
+    char *cur_chgset;
+    int ret;
+
+    if (!is_hvm_vcpu(v)) {
+        printk("hvm_load only for hvm guest!\n");
+        return -1;
+    }
+
+    if (h->size >= HVM_CTXT_SIZE) {
+        printk("hvm_load fail! seems hvm_domain_context overflow when hvm_save! need 
%"PRId32" bytes.\n", h->size);
+        return -1;
+    }
+
+    hvm_ctxt_seek(h, 0);
+
+    magic = hvm_get_32u(h);
+    if (magic != HVM_FILE_MAGIC) {
+        printk("HVM restore magic dismatch!\n");
+        return -1;
+    }
+
+    magic = hvm_get_32u(h);
+    if (magic != HVM_FILE_VERSION) {
+        printk("HVM restore version dismatch!\n");
+        return -1;
+    }
+
+    /* check xen change set */
+    cur_chgset = strrchr(XEN_CHANGESET, ' ') + 1;
+
+    len = hvm_get_8u(h);
+    if (len > 20) { /*typical length is 18 -- "revision number:changeset id" */
+        printk("wrong change set length %d when hvm restore!\n", len);
+        return -1;
+    }
+
+    hvm_get_buffer(h, chgset, len);
+    chgset[len] = '\0';
+    if (strncmp(cur_chgset, chgset, len + 1))
+        printk("warnings: try to restore hvm guest(%s) on a different changeset 
%s.\n",
+                chgset, cur_chgset);
+
+    /* check cpuid */
+    cpuid(1, &eax, &ebx, &ecx, &edx);
+    ebx = hvm_get_32u(h);
+    /*TODO: need difine how big difference is acceptable */
+    if (ebx != eax)
+        printk("warnings: try to restore hvm guest(0x%"PRIx32") "
+               "on a different type processor(0x%"PRIx32").\n",
+                ebx,
+                eax);
+
+    while(1) {
+        if (hvm_ctxt_end(h)) {
+            break;
+        }
+
+        /* ID string */
+        len = hvm_get_8u(h);
+        if (len > HVM_SE_IDSTR_LEN) {
+            printk("wrong HVM save entry idstr len %d!", len);
+            return -1;
+        }
+
+        hvm_get_buffer(h, idstr, len);
+        idstr[len] = '\0';
+
+        instance_id = hvm_get_32u(h);
+        version_id = hvm_get_32u(h);
+
+        rec_len = hvm_get_32u(h);
+        rec_pos = hvm_ctxt_tell(h);
+
+        se = find_se(v->domain, idstr, instance_id);
+        if (se == NULL) {
+            printk("warnings: hvm load can't find device %s's instance %d!\n",
+                    idstr, instance_id);
+        } else {
+            ret = se->load_state(h, se->opaque, version_id);
+            if (ret < 0)
+                printk("warnings: loading state fail for device %s instance 
%d!\n",
+                        idstr, instance_id);
+        }
+
+
+        /* make sure to jump end of record */
+        if ( hvm_ctxt_tell(h) - rec_pos != rec_len) {
+            printk("wrong hvm record size, maybe some dismatch between save&restore 
handler!\n");
+        }
+        hvm_ctxt_seek(h, rec_pos + rec_len);
+    }
+
+    return 0;
+}
+
+#ifdef HVM_DEBUG_SUSPEND
+static void shpage_info(shared_iopage_t *sh)
+{
+
+    vcpu_iodata_t *p = &sh->vcpu_iodata[0];
+    ioreq_t *req = &p->vp_ioreq;
+    printk("*****sharepage_info******!\n");
+    printk("vp_eport=%d\n", p->vp_eport);
+    printk("io packet: "
+                     "state:%x, pvalid: %x, dir:%x, port: %"PRIx64", "
+                     "data: %"PRIx64", count: %"PRIx64", size: %"PRIx64"\n",
+                     req->state, req->data_is_ptr, req->dir, req->addr,
+                     req->data, req->count, req->size);
+}
+#else
+static void shpage_info(shared_iopage_t *sh)
+{
+}
+#endif
+
+static void shpage_save(hvm_domain_context_t *h, void *opaque)
+{
+    /* XXX:no action required for shpage save/restore, since it's in guest 
memory
+     * keep it for debug purpose only */
+
+#if 0
+    struct shared_iopage *s = opaque;
+    /* XXX:smp */
+    struct ioreq *req = &s->vcpu_iodata[0].vp_ioreq;
+
+    shpage_info(s);
+
+    hvm_put_buffer(h, (char*)req, sizeof(struct ioreq));
+#endif
+}
+
+static int shpage_load(hvm_domain_context_t *h, void *opaque, int version_id)
+{
+    struct shared_iopage *s = opaque;
+#if 0
+    /* XXX:smp */
+    struct ioreq *req = &s->vcpu_iodata[0].vp_ioreq;
+
+    if (version_id != 1)
+        return -EINVAL;
+
+    hvm_get_buffer(h, (char*)req, sizeof(struct ioreq));
+
+
+#endif
+    shpage_info(s);
+    return 0;
+}
+
+void shpage_init(struct domain *d, shared_iopage_t *sp)
+{
+    hvm_register_savevm(d, "xen_hvm_shpage", 0x10, 1, shpage_save, 
shpage_load, sp);
 }

 /*
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/vioapic.c
--- a/xen/arch/x86/hvm/vioapic.c        Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/hvm/vioapic.c        Wed Dec 13 22:52:02 2006 +0800
@@ -466,10 +466,138 @@ void vioapic_update_EOI(struct domain *d
     spin_unlock(&hvm_irq->lock);
 }

+#ifdef HVM_DEBUG_SUSPEND
+static void ioapic_info(struct vioapic *s)
+{
+    int i;
+    printk("*****ioapic state:*****\n");
+    printk("ioapic 0x%x.\n", s->ioregsel);
+    printk("ioapic 0x%x.\n", s->id);
+    printk("ioapic 0x%lx.\n", s->base_address);
+    for (i = 0; i < VIOAPIC_NUM_PINS; i++) {
+        printk("ioapic redirtbl[%d]:0x%"PRIx64"\n", i, s->redirtbl[i].bits);
+    }
+
+}
+static void hvmirq_info(struct hvm_irq *hvm_irq)
+{
+    int i;
+    printk("*****hvmirq state:*****\n");
+    for (i = 0; i < BITS_TO_LONGS(32*4); i++)
+        printk("hvmirq pci_intx[%d]:0x%lx.\n", i, hvm_irq->pci_intx[i]);
+
+    for (i = 0; i < BITS_TO_LONGS(16); i++)
+        printk("hvmirq isa_irq[%d]:0x%lx.\n", i, hvm_irq->isa_irq[i]);
+
+    for (i = 0; i < BITS_TO_LONGS(1); i++)
+        printk("hvmirq callback_irq_wire[%d]:0x%lx.\n", i, 
hvm_irq->callback_irq_wire[i]);
+
+    printk("hvmirq callback_gsi:0x%x.\n", hvm_irq->callback_gsi);
+
+    for (i = 0; i < 4; i++)
+        printk("hvmirq pci_link_route[%d]:0x%"PRIx8".\n", i, 
hvm_irq->pci_link_route[i]);
+
+    for (i = 0; i < 4; i++)
+        printk("hvmirq pci_link_assert_count[%d]:0x%"PRIx8".\n", i, 
hvm_irq->pci_link_assert_count[i]);
+
+    for (i = 0; i < 4; i++)
+        printk("hvmirq gsi_assert_count[%d]:0x%"PRIx8".\n", i, 
hvm_irq->gsi_assert_count[i]);
+
+    printk("hvmirq round_robin_prev_vcpu:0x%"PRIx8".\n", 
hvm_irq->round_robin_prev_vcpu);
+}
+#else
+static void ioapic_info(struct vioapic *s)
+{
+}
+static void hvmirq_info(struct hvm_irq *hvm_irq)
+{
+}
+#endif
+
+static void ioapic_save(hvm_domain_context_t *h, void *opaque)
+{
+    int i;
+    struct domain *d = opaque;
+    struct vioapic *s = domain_vioapic(d);
+    struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
+
+    ioapic_info(s);
+    hvmirq_info(hvm_irq);
+
+    /* save iopaic state*/
+    hvm_put_32u(h, s->ioregsel);
+    hvm_put_32u(h, s->id);
+    hvm_put_64u(h, s->base_address);
+    for (i = 0; i < VIOAPIC_NUM_PINS; i++) {
+        hvm_put_64u(h, s->redirtbl[i].bits);
+    }
+
+    /* save hvm irq state */
+    hvm_put_buffer(h, (char*)hvm_irq->pci_intx, 16);
+    hvm_put_buffer(h, (char*)hvm_irq->isa_irq, 2);
+    hvm_put_buffer(h, (char*)hvm_irq->callback_irq_wire, 1);
+    hvm_put_32u(h, hvm_irq->callback_gsi);
+
+    for (i = 0; i < 4; i++)
+        hvm_put_8u(h, hvm_irq->pci_link_route[i]);
+
+    for (i = 0; i < 4; i++)
+        hvm_put_8u(h, hvm_irq->pci_link_assert_count[i]);
+
+    for (i = 0; i < VIOAPIC_NUM_PINS; i++)
+        hvm_put_8u(h, hvm_irq->gsi_assert_count[i]);
+
+    hvm_put_8u(h, hvm_irq->round_robin_prev_vcpu);
+
+}
+
+static int ioapic_load(hvm_domain_context_t *h, void *opaque, int version_id)
+{
+    int i;
+    struct domain *d = opaque;
+    struct vioapic *s = domain_vioapic(d);
+    struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
+
+    if (version_id != 1)
+        return -EINVAL;
+
+    /* restore ioapic state */
+    s->ioregsel = hvm_get_32u(h);
+    s->id = hvm_get_32u(h);
+    s->base_address = hvm_get_64u(h);
+    for (i = 0; i < VIOAPIC_NUM_PINS; i++) {
+        s->redirtbl[i].bits = hvm_get_64u(h);
+    }
+
+    /* restore irq state */
+    hvm_get_buffer(h, (char*)hvm_irq->pci_intx, 16);
+    hvm_get_buffer(h, (char*)hvm_irq->isa_irq, 2);
+    hvm_get_buffer(h, (char*)hvm_irq->callback_irq_wire, 1);
+    hvm_irq->callback_gsi = hvm_get_32u(h);
+
+    for (i = 0; i < 4; i++)
+        hvm_irq->pci_link_route[i] = hvm_get_8u(h);
+
+    for (i = 0; i < 4; i++)
+        hvm_irq->pci_link_assert_count[i] = hvm_get_8u(h);
+
+    for (i = 0; i < VIOAPIC_NUM_PINS; i++)
+        hvm_irq->gsi_assert_count[i] = hvm_get_8u(h);
+
+    hvm_irq->round_robin_prev_vcpu = hvm_get_8u(h);
+
+    ioapic_info(s);
+    hvmirq_info(hvm_irq);
+
+    return 0;
+}
+
 void vioapic_init(struct domain *d)
 {
     struct vioapic *vioapic = domain_vioapic(d);
     int i;
+
+    hvm_register_savevm(d, "xen_hvm_ioapic", 0, 1, ioapic_save, ioapic_load, 
d);

     memset(vioapic, 0, sizeof(*vioapic));
     for ( i = 0; i < VIOAPIC_NUM_PINS; i++ )
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/vlapic.c
--- a/xen/arch/x86/hvm/vlapic.c Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/hvm/vlapic.c Wed Dec 13 22:52:02 2006 +0800
@@ -921,6 +921,82 @@ static int vlapic_reset(struct vlapic *v
     return 1;
 }

+#ifdef HVM_DEBUG_SUSPEND
+static void lapic_info(struct vlapic *s)
+{
+    printk("*****lapic state:*****\n");
+    printk("lapic 0x%"PRIx64".\n", s->apic_base_msr);
+    printk("lapic 0x%x.\n", s->disabled);
+    printk("lapic 0x%x.\n", s->timer_divisor);
+    printk("lapic 0x%x.\n", s->timer_pending_count);
+}
+#else
+static void lapic_info(struct vlapic *s)
+{
+}
+#endif
+
+static void lapic_save(hvm_domain_context_t *h, void *opaque)
+{
+    struct vlapic *s = opaque;
+
+    lapic_info(s);
+
+    hvm_put_64u(h, s->apic_base_msr);
+    hvm_put_32u(h, s->disabled);
+    hvm_put_32u(h, s->timer_divisor);
+
+    /*XXX: need this?*/
+    hvm_put_32u(h, s->timer_pending_count);
+
+    hvm_put_buffer(h, (char*)s->regs, 0x3f0);
+
+}
+
+static int lapic_load(hvm_domain_context_t *h, void *opaque, int version_id)
+{
+    struct vlapic *s = opaque;
+    uint32_t tmict;
+
+    if (version_id != 1)
+        return -EINVAL;
+
+    s->apic_base_msr = hvm_get_64u(h);
+    s->disabled = hvm_get_32u(h);
+    s->timer_divisor = hvm_get_32u(h);
+
+    /*XXX: need this?*/
+    s->timer_pending_count = hvm_get_32u(h);
+
+    hvm_get_buffer(h, (char*)s->regs, 0x3f0);
+
+    /* rearm the actiemr if needed */
+    tmict = vlapic_get_reg(s, APIC_TMICT);
+    if (tmict > 0) {
+        s_time_t now = NOW(), offset;
+        stop_timer(&s->vlapic_timer);
+        vlapic_set_reg(s, APIC_TMCCT, tmict);
+        s->timer_last_update = now;
+
+        offset = APIC_BUS_CYCLE_NS * s->timer_divisor * tmict;
+
+        set_timer(&s->vlapic_timer, now + offset);
+
+        printk("lapic_load to rearm the actimer:"
+                    "bus cycle is %"PRId64"ns, now 0x%016"PRIx64", "
+                    "timer initial count 0x%x, offset 0x%016"PRIx64", "
+                    "expire @ 0x%016"PRIx64".",
+                    APIC_BUS_CYCLE_NS, now,
+                    vlapic_get_reg(s, APIC_TMICT),
+                    offset, now + offset);
+    }
+
+
+    lapic_info(s);
+
+    return 0;
+}
+
 int vlapic_init(struct vcpu *v)
 {
     struct vlapic *vlapic = vcpu_vlapic(v);
@@ -939,6 +1015,7 @@ int vlapic_init(struct vcpu *v)
     vlapic->regs = map_domain_page_global(page_to_mfn(vlapic->regs_page));
     memset(vlapic->regs, 0, PAGE_SIZE);

+    hvm_register_savevm(v->domain, "xen_hvm_lapic", v->vcpu_id, 1, lapic_save, 
lapic_load, vlapic);
     vlapic_reset(vlapic);

     vlapic->apic_base_msr = MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Wed Dec 13 22:52:02 2006 +0800
@@ -426,6 +426,319 @@ static void vmx_store_cpu_guest_regs(
     vmx_vmcs_exit(v);
 }

+static int __get_instruction_length(void);
+int vmx_vmcs_save(struct vcpu *v, struct vmcs_data *c)
+{
+    unsigned long inst_len;
+
+    inst_len = __get_instruction_length();
+    c->eip = __vmread(GUEST_RIP);
+
+#ifdef HVM_DEBUG_SUSPEND
+    printk("vmx_vmcs_save: inst_len=0x%lx, eip=0x%"PRIx64".\n",
+            inst_len, c->eip);
+#endif
+
+    c->esp = __vmread(GUEST_RSP);
+    c->eflags = __vmread(GUEST_RFLAGS);
+
+    c->cr0 = v->arch.hvm_vmx.cpu_shadow_cr0;
+    c->cr3 = v->arch.hvm_vmx.cpu_cr3;
+    c->cr4 = v->arch.hvm_vmx.cpu_shadow_cr4;
+
+#ifdef HVM_DEBUG_SUSPEND
+    printk("vmx_vmcs_save: cr3=0x%"PRIx64", cr0=0x%"PRIx64", 
cr4=0x%"PRIx64".\n",
+            c->cr3,
+            c->cr0,
+            c->cr4);
+#endif
+
+    c->idtr_limit = __vmread(GUEST_IDTR_LIMIT);
+    c->idtr_base = __vmread(GUEST_IDTR_BASE);
+
+    c->gdtr_limit = __vmread(GUEST_GDTR_LIMIT);
+    c->gdtr_base = __vmread(GUEST_GDTR_BASE);
+
+    c->cs_sel = __vmread(GUEST_CS_SELECTOR);
+    c->cs_limit = __vmread(GUEST_CS_LIMIT);
+    c->cs_base = __vmread(GUEST_CS_BASE);
+    c->cs_arbytes = __vmread(GUEST_CS_AR_BYTES);
+
+    c->ds_sel = __vmread(GUEST_DS_SELECTOR);
+    c->ds_limit = __vmread(GUEST_DS_LIMIT);
+    c->ds_base = __vmread(GUEST_DS_BASE);
+    c->ds_arbytes = __vmread(GUEST_DS_AR_BYTES);
+
+    c->es_sel = __vmread(GUEST_ES_SELECTOR);
+    c->es_limit = __vmread(GUEST_ES_LIMIT);
+    c->es_base = __vmread(GUEST_ES_BASE);
+    c->es_arbytes = __vmread(GUEST_ES_AR_BYTES);
+
+    c->ss_sel = __vmread(GUEST_SS_SELECTOR);
+    c->ss_limit = __vmread(GUEST_SS_LIMIT);
+    c->ss_base = __vmread(GUEST_SS_BASE);
+    c->ss_arbytes = __vmread(GUEST_SS_AR_BYTES);
+
+    c->fs_sel = __vmread(GUEST_FS_SELECTOR);
+    c->fs_limit = __vmread(GUEST_FS_LIMIT);
+    c->fs_base = __vmread(GUEST_FS_BASE);
+    c->fs_arbytes = __vmread(GUEST_FS_AR_BYTES);
+
+    c->gs_sel = __vmread(GUEST_GS_SELECTOR);
+    c->gs_limit = __vmread(GUEST_GS_LIMIT);
+    c->gs_base = __vmread(GUEST_GS_BASE);
+    c->gs_arbytes = __vmread(GUEST_GS_AR_BYTES);
+
+    c->tr_sel = __vmread(GUEST_TR_SELECTOR);
+    c->tr_limit = __vmread(GUEST_TR_LIMIT);
+    c->tr_base = __vmread(GUEST_TR_BASE);
+    c->tr_arbytes = __vmread(GUEST_TR_AR_BYTES);
+
+    c->ldtr_sel = __vmread(GUEST_LDTR_SELECTOR);
+    c->ldtr_limit = __vmread(GUEST_LDTR_LIMIT);
+    c->ldtr_base = __vmread(GUEST_LDTR_BASE);
+    c->ldtr_arbytes = __vmread(GUEST_LDTR_AR_BYTES);
+
+    c->sysenter_cs = __vmread(GUEST_SYSENTER_CS);
+    c->sysenter_esp = __vmread(GUEST_SYSENTER_ESP);
+    c->sysenter_eip = __vmread(GUEST_SYSENTER_EIP);
+
+    return 1;
+}
+
+int vmx_vmcs_restore(struct vcpu *v, struct vmcs_data *c)
+{
+    unsigned long mfn, old_cr4, old_base_mfn;
+    int error = 0;
+
+    __vmwrite(GUEST_RIP, c->eip);
+    __vmwrite(GUEST_RSP, c->esp);
+    __vmwrite(GUEST_RFLAGS, c->eflags);
+
+    v->arch.hvm_vmx.cpu_shadow_cr0 = c->cr0;
+    __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr0);
+
+    old_cr4 = __vmread(CR4_READ_SHADOW);
+    __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
+
+    v->arch.hvm_vmx.cpu_shadow_cr4 = c->cr4;
+    __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vmx.cpu_shadow_cr4);
+
+#ifdef HVM_DEBUG_SUSPEND
+    printk("vmx_vmcs_restore: cr3=0x%"PRIx64", cr0=0x%"PRIx64", 
cr4=0x%"PRIx64".\n",
+            c->cr3,
+            c->cr0,
+            c->cr4);
+#endif
+
+    if (!vmx_paging_enabled(v)) {
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
+        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
+        goto skip_cr3;
+    }
+
+    if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
+        /*
+         * This is simple TLB flush, implying the guest has
+         * removed some translation or changed page attributes.
+         * We simply invalidate the shadow.
+         */
+        mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
+        if (mfn != pagetable_get_pfn(v->arch.guest_table)) {
+            printk("Invalid CR3 value=%"PRIx64"", c->cr3);
+            domain_crash(v->domain);
+            return 0;
+        }
+    } else {
+        /*
+         * If different, make a shadow. Check if the PDBR is valid
+         * first.
+         */
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %"PRIx64"", c->cr3);
+        if ((c->cr3 >> PAGE_SHIFT) > v->domain->max_pages) {
+            printk("Invalid CR3 value=%"PRIx64"", c->cr3);
+            domain_crash(v->domain);
+            return 0;
+        }
+
+        /* current!=vcpu as not called by arch_vmx_do_launch */
+        mfn = gmfn_to_mfn(v->domain, c->cr3 >> PAGE_SHIFT);
+        if(!get_page(mfn_to_page(mfn), v->domain)) {
+            struct page_info *page = mfn_to_page(mfn);
+            printk("get_page for mfn failed. CR3 value=%"PRIx64", 
count_info=0x%"PRIx32", type_info=0x%lx, owner=%d.\n", c->cr3,
+                    page->count_info,
+                    page->u.inuse.type_info,
+                    page->u.inuse._domain);
+            return 0;
+        }
+        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+        v->arch.guest_table = pagetable_from_pfn(mfn);
+        if (old_base_mfn)
+             put_page(mfn_to_page(old_base_mfn));
+        /*
+         * arch.shadow_table should now hold the next CR3 for shadow
+         */
+        v->arch.hvm_vmx.cpu_cr3 = c->cr3;
+    }
+
+ skip_cr3:
+#if defined(__x86_64__)
+    if (vmx_long_mode_enabled(v)) {
+        unsigned long vm_entry_value;
+        vm_entry_value = __vmread(VM_ENTRY_CONTROLS);
+        vm_entry_value |= VM_ENTRY_IA32E_MODE;
+        __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
+    }
+#endif
+
+    shadow_update_paging_modes(v);
+    __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
+
+    __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
+    __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
+
+    __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
+    __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
+
+    __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
+    __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
+    __vmwrite(GUEST_CS_BASE, c->cs_base);
+    __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes);
+
+    __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
+    __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
+    __vmwrite(GUEST_DS_BASE, c->ds_base);
+    __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes);
+
+    __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
+    __vmwrite(GUEST_ES_LIMIT, c->es_limit);
+    __vmwrite(GUEST_ES_BASE, c->es_base);
+    __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes);
+
+    __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
+    __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
+    __vmwrite(GUEST_SS_BASE, c->ss_base);
+    __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes);
+
+    __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
+    __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
+    __vmwrite(GUEST_FS_BASE, c->fs_base);
+    __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes);
+
+    __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
+    __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
+    __vmwrite(GUEST_GS_BASE, c->gs_base);
+    __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes);
+
+    __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
+    __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
+    __vmwrite(GUEST_TR_BASE, c->tr_base);
+    __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes);
+
+    __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
+    __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
+    __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
+    __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes);
+
+    __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs);
+    __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp);
+    __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip);
+
+    return !error;
+}
+
+#ifdef HVM_DEBUG_SUSPEND
+static void dump_msr_state(struct vmx_msr_state *m)
+{
+    int i = 0;
+    printk("**** msr state ****\n");
+    printk("shadow_gs=0x%lx, flags=0x%lx, msr_items:", m->shadow_gs, m->flags);
+    for (i = 0; i < VMX_MSR_COUNT; i++)
+        printk("0x%lx,", m->msrs[i]);
+    printk("\n");
+}
+#else
+static void dump_msr_state(struct vmx_msr_state *m)
+{
+}
+#endif
+
+void vmx_save_cpu_state(struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    struct vmcs_data *data = &ctxt->data;
+    struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
+    unsigned long guest_flags = guest_state->flags;
+    int i = 0;
+
+    data->shadow_gs = guest_state->shadow_gs;
+    data->vmxassist_enabled = v->arch.hvm_vmx.vmxassist_enabled;
+    /* save msrs */
+    data->flags = guest_flags;
+    for (i = 0; i < VMX_MSR_COUNT; i++)
+        data->msr_items[i] = guest_state->msrs[i];
+
+    dump_msr_state(guest_state);
+}
+
+void vmx_load_cpu_state(struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    int i = 0;
+    struct vmcs_data *data = &ctxt->data;
+    struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state;
+
+    /* restore msrs */
+    guest_state->flags = data->flags;
+    for (i = 0; i < VMX_MSR_COUNT; i++)
+        guest_state->msrs[i] = data->msr_items[i];
+
+    guest_state->shadow_gs = data->shadow_gs;
+
+    /*XXX:no need to restore msrs, current!=vcpu as not called by 
arch_vmx_do_launch */
+/*    vmx_restore_guest_msrs(v);*/
+
+    v->arch.hvm_vmx.vmxassist_enabled = data->vmxassist_enabled;
+
+    dump_msr_state(guest_state);
+}
+
+void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    struct vmcs_data *data = &ctxt->data;
+
+    /* set valid flag to recover whole vmcs when restore */
+    ctxt->valid = 1;
+
+    vmx_save_cpu_state(v, ctxt);
+
+    vmx_vmcs_enter(v);
+
+    vmx_vmcs_save(v, data);
+
+    vmx_vmcs_exit(v);
+
+}
+
+void vmx_load_vmcs_ctxt(struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    if (!ctxt->valid)
+        return;
+
+    vmx_load_cpu_state(v, ctxt);
+
+    vmx_vmcs_enter(v);
+
+    if (!vmx_vmcs_restore(v, &ctxt->data)) {
+        printk("vmx_vmcs restore failed!\n");
+        domain_crash(v->domain);
+    }
+
+    /* only load vmcs once */
+    ctxt->valid = 0;
+
+    vmx_vmcs_exit(v);
+
+}
+
 /*
  * The VMX spec (section 4.3.1.2, Checks on Guest Segment
  * Registers) says that virtual-8086 mode guests' segment
@@ -737,6 +1050,9 @@ static void vmx_setup_hvm_funcs(void)

     hvm_funcs.store_cpu_guest_regs = vmx_store_cpu_guest_regs;
     hvm_funcs.load_cpu_guest_regs = vmx_load_cpu_guest_regs;
+
+    hvm_funcs.save_cpu_ctxt = vmx_save_vmcs_ctxt;
+    hvm_funcs.load_cpu_ctxt = vmx_load_vmcs_ctxt;

     hvm_funcs.paging_enabled = vmx_paging_enabled;
     hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/hvm/vpic.c
--- a/xen/arch/x86/hvm/vpic.c   Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/hvm/vpic.c   Wed Dec 13 22:52:02 2006 +0800
@@ -378,6 +378,87 @@ static int vpic_intercept_elcr_io(ioreq_
     return 1;
 }

+#ifdef HVM_DEBUG_SUSPEND
+static void vpic_info(struct vpic *s)
+{
+    printk("*****pic state:*****\n");
+    printk("pic 0x%x.\n", s->irr);
+    printk("pic 0x%x.\n", s->imr);
+    printk("pic 0x%x.\n", s->isr);
+    printk("pic 0x%x.\n", s->irq_base);
+    printk("pic 0x%x.\n", s->init_state);
+    printk("pic 0x%x.\n", s->priority_add);
+    printk("pic 0x%x.\n", s->readsel_isr);
+    printk("pic 0x%x.\n", s->poll);
+    printk("pic 0x%x.\n", s->auto_eoi);
+    printk("pic 0x%x.\n", s->rotate_on_auto_eoi);
+    printk("pic 0x%x.\n", s->special_fully_nested_mode);
+    printk("pic 0x%x.\n", s->special_mask_mode);
+    printk("pic 0x%x.\n", s->elcr);
+    printk("pic 0x%x.\n", s->int_output);
+    printk("pic 0x%x.\n", s->is_master);
+}
+#else
+static void vpic_info(struct vpic *s)
+{
+}
+#endif
+
+static void vpic_save(hvm_domain_context_t *h, void *opaque)
+{
+    struct vpic *s = opaque;
+
+    vpic_info(s);
+
+    hvm_put_8u(h, s->irr);
+    hvm_put_8u(h, s->imr);
+    hvm_put_8u(h, s->isr);
+    hvm_put_8u(h, s->irq_base);
+    hvm_put_8u(h, s->init_state);
+    hvm_put_8u(h, s->priority_add);
+    hvm_put_8u(h, s->readsel_isr);
+
+    hvm_put_8u(h, s->poll);
+    hvm_put_8u(h, s->auto_eoi);
+
+    hvm_put_8u(h, s->rotate_on_auto_eoi);
+    hvm_put_8u(h, s->special_fully_nested_mode);
+    hvm_put_8u(h, s->special_mask_mode);
+
+    hvm_put_8u(h, s->elcr);
+    hvm_put_8u(h, s->int_output);
+}
+
+static int vpic_load(hvm_domain_context_t *h, void *opaque, int version_id)
+{
+    struct vpic *s = opaque;
+
+    if (version_id != 1)
+        return -EINVAL;
+
+    s->irr = hvm_get_8u(h);
+    s->imr = hvm_get_8u(h);
+    s->isr = hvm_get_8u(h);
+    s->irq_base = hvm_get_8u(h);
+    s->init_state = hvm_get_8u(h);
+    s->priority_add = hvm_get_8u(h);
+    s->readsel_isr = hvm_get_8u(h);
+
+    s->poll = hvm_get_8u(h);
+    s->auto_eoi = hvm_get_8u(h);
+
+    s->rotate_on_auto_eoi = hvm_get_8u(h);
+    s->special_fully_nested_mode = hvm_get_8u(h);
+    s->special_mask_mode = hvm_get_8u(h);
+
+    s->elcr = hvm_get_8u(h);
+    s->int_output = hvm_get_8u(h);
+
+    vpic_info(s);
+
+    return 0;
+}
+
 void vpic_init(struct domain *d)
 {
     struct vpic *vpic;
@@ -387,12 +468,14 @@ void vpic_init(struct domain *d)
     memset(vpic, 0, sizeof(*vpic));
     vpic->is_master = 1;
     vpic->elcr      = 1 << 2;
+    hvm_register_savevm(d, "xen_hvm_i8259", 0x20, 1, vpic_save, vpic_load, 
vpic);
     register_portio_handler(d, 0x20, 2, vpic_intercept_pic_io);
     register_portio_handler(d, 0x4d0, 1, vpic_intercept_elcr_io);

     /* Slave PIC. */
     vpic++;
     memset(vpic, 0, sizeof(*vpic));
+    hvm_register_savevm(d, "xen_hvm_i8259", 0xa0, 1, vpic_save, vpic_load, 
vpic);
     register_portio_handler(d, 0xa0, 2, vpic_intercept_pic_io);
     register_portio_handler(d, 0x4d1, 1, vpic_intercept_elcr_io);
 }
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/mm/shadow/common.c
--- a/xen/arch/x86/mm/shadow/common.c   Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/mm/shadow/common.c   Wed Dec 13 22:52:02 2006 +0800
@@ -2145,7 +2145,7 @@ int shadow_remove_all_mappings(struct vc
         /* Don't complain if we're in HVM and there's one extra mapping:
          * The qemu helper process has an untyped mapping of this dom's RAM */
         if ( !(shadow_mode_external(v->domain)
-               && (page->count_info & PGC_count_mask) <= 2
+               && (page->count_info & PGC_count_mask) <= 3 /* vmx restore add 
one extra mapping*/
                && (page->u.inuse.type_info & PGT_count_mask) == 0) )
         {
             SHADOW_ERROR("can't find all mappings of mfn %lx: "
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/arch/x86/mm/shadow/multi.c    Wed Dec 13 22:52:02 2006 +0800
@@ -1613,6 +1613,14 @@ sh_make_shadow(struct vcpu *v, mfn_t gmf
         }
     }

+    {
+        struct page_info *page = mfn_to_page(gmfn);
+        /* XXX: add it to emulate a touched page */
+        if ((page->u.inuse.type_info & PGT_type_mask) == PGT_none){
+            page->u.inuse.type_info |= (PGT_writable_page | PGT_validated);
+        }
+    }
+
     shadow_promote(v, gmfn, shadow_type);
     set_shadow_status(v, gmfn, shadow_type, smfn);

diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/common/domain.c
--- a/xen/common/domain.c       Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/common/domain.c       Wed Dec 13 22:52:02 2006 +0800
@@ -24,6 +24,7 @@
 #include <xen/percpu.h>
 #include <xen/multicall.h>
 #include <asm/debugger.h>
+#include <asm/hvm/support.h>
 #include <public/sched.h>
 #include <public/vcpu.h>

@@ -454,8 +455,14 @@ int set_info_guest(struct domain *d,
     domain_pause(d);

     rc = -EFAULT;
-    if ( copy_from_guest(c, vcpucontext->ctxt, 1) == 0 )
+    if ( copy_from_guest(c, vcpucontext->ctxt, 1) == 0 ) {
         rc = arch_set_info_guest(v, c);
+        if ( v->vcpu_id != 0 &&
+                is_hvm_vcpu(v) &&
+                test_and_clear_bit(_VCPUF_down, &v->vcpu_flags) ) {
+            vcpu_wake(v);
+        }
+    }

     domain_unpause(d);

diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/common/domctl.c
--- a/xen/common/domctl.c       Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/common/domctl.c       Wed Dec 13 22:52:02 2006 +0800
@@ -26,6 +26,10 @@ extern long arch_do_domctl(
     struct xen_domctl *op, XEN_GUEST_HANDLE(xen_domctl_t) u_domctl);
 extern void arch_getdomaininfo_ctxt(
     struct vcpu *, struct vcpu_guest_context *);
+extern int arch_gethvm_ctxt(
+    struct vcpu *, struct hvm_domain_context *);
+extern int arch_sethvm_ctxt(
+        struct vcpu *, struct hvm_domain_context *);

 void cpumask_to_xenctl_cpumap(
     struct xenctl_cpumap *xenctl_cpumap, cpumask_t *cpumask)
@@ -205,6 +209,37 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
     }
     break;

+    case XEN_DOMCTL_sethvmcontext:
+    {
+        struct hvm_domain_context *c;
+        struct domain             *d;
+        struct vcpu               *v;
+
+        ret = -ESRCH;
+        if ( (d = find_domain_by_id(op->domain)) == NULL )
+            break;
+
+        ret = -ENOMEM;
+        if ( (c = xmalloc(struct hvm_domain_context)) == NULL )
+            goto sethvmcontext_out;
+
+        /*XXX: need check input vcpu when smp */
+        v = d->vcpu[0];
+
+        ret = -EFAULT;
+        if ( copy_from_guest(c, op->u.hvmcontext.ctxt, 1) != 0 )
+            goto sethvmcontext_out;
+
+        ret = arch_sethvm_ctxt(v, c);
+
+        xfree(c);
+
+    sethvmcontext_out:
+        put_domain(d);
+
+    }
+    break;
+
     case XEN_DOMCTL_pausedomain:
     {
         struct domain *d = find_domain_by_id(op->domain);
@@ -489,6 +524,44 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc

     getvcpucontext_out:
         put_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_gethvmcontext:
+    {
+        struct hvm_domain_context *c;
+        struct domain             *d;
+        struct vcpu               *v;
+
+        ret = -ESRCH;
+        if ( (d = find_domain_by_id(op->domain)) == NULL )
+            break;
+
+        ret = -ENOMEM;
+        if ( (c = xmalloc(struct hvm_domain_context)) == NULL )
+            goto gethvmcontext_out;
+
+        v = d->vcpu[0];
+
+        ret = -ENODATA;
+        if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+            goto gethvmcontext_out;
+
+        ret = 0;
+        if (arch_gethvm_ctxt(v, c) == -1)
+            ret = -EFAULT;
+
+        if ( copy_to_guest(op->u.hvmcontext.ctxt, c, 1) )
+            ret = -EFAULT;
+
+        xfree(c);
+
+        if ( copy_to_guest(u_domctl, op, 1) )
+            ret = -EFAULT;
+
+    gethvmcontext_out:
+        put_domain(d);
+
     }
     break;

diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/asm-x86/hvm/domain.h
--- a/xen/include/asm-x86/hvm/domain.h  Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/include/asm-x86/hvm/domain.h  Wed Dec 13 22:52:02 2006 +0800
@@ -27,6 +27,20 @@
 #include <asm/hvm/io.h>
 #include <public/hvm/params.h>

+typedef void SaveStateHandler(hvm_domain_context_t *h, void *opaque);
+typedef int LoadStateHandler(hvm_domain_context_t *h, void *opaque, int 
version_id);
+
+#define HVM_SE_IDSTR_LEN 32
+typedef struct HVMStateEntry {
+    char idstr[HVM_SE_IDSTR_LEN];
+    int instance_id;
+    int version_id;
+    SaveStateHandler *save_state;
+    LoadStateHandler *load_state;
+    void *opaque;
+    struct HVMStateEntry *next;
+} HVMStateEntry;
+
 struct hvm_domain {
     unsigned long          shared_page_va;
     unsigned long          buffered_io_va;
@@ -44,6 +58,9 @@ struct hvm_domain {
     spinlock_t             pbuf_lock;

     uint64_t               params[HVM_NR_PARAMS];
+
+    struct hvm_domain_context *hvm_ctxt;
+    HVMStateEntry *first_se;
 };

 #endif /* __ASM_X86_HVM_DOMAIN_H__ */
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h     Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/include/asm-x86/hvm/hvm.h     Wed Dec 13 22:52:02 2006 +0800
@@ -79,6 +79,13 @@ struct hvm_function_table {
         struct vcpu *v, struct cpu_user_regs *r, unsigned long *crs);
     void (*load_cpu_guest_regs)(
         struct vcpu *v, struct cpu_user_regs *r);
+
+    /* save and load hvm guest cpu context for save/restore */
+    void (*save_cpu_ctxt)(
+        struct vcpu *v, struct hvmcpu_context *ctxt);
+    void (*load_cpu_ctxt)(
+        struct vcpu *v, struct hvmcpu_context *ctxt);
+
     /*
      * Examine specifics of the guest state:
      * 1) determine whether paging is enabled,
@@ -152,6 +159,20 @@ hvm_load_cpu_guest_regs(struct vcpu *v,
     hvm_funcs.load_cpu_guest_regs(v, r);
 }

+static inline void
+hvm_save_cpu_context(
+        struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    hvm_funcs.save_cpu_ctxt(v, ctxt);
+}
+
+static inline void
+hvm_load_cpu_context(
+        struct vcpu *v, struct hvmcpu_context *ctxt)
+{
+    hvm_funcs.load_cpu_ctxt(v, ctxt);
+}
+
 static inline int
 hvm_paging_enabled(struct vcpu *v)
 {
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/asm-x86/hvm/support.h
--- a/xen/include/asm-x86/hvm/support.h Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/include/asm-x86/hvm/support.h Wed Dec 13 22:52:02 2006 +0800
@@ -121,6 +121,130 @@ extern unsigned int opt_hvm_debug_level;
 #define TRACE_VMEXIT(index, value)                              \
     current->arch.hvm_vcpu.hvm_trace_values[index] = (value)

+/* save/restore support */
+
+//#define HVM_DEBUG_SUSPEND
+
+extern int hvm_register_savevm(struct domain *d,
+                    const char *idstr,
+                    int instance_id,
+                    int version_id,
+                    SaveStateHandler *save_state,
+                    LoadStateHandler *load_state,
+                    void *opaque);
+
+static inline void hvm_ctxt_seek(hvm_domain_context_t *h, unsigned int pos)
+{
+    h->cur = pos;
+}
+
+static inline uint32_t hvm_ctxt_tell(hvm_domain_context_t *h)
+{
+    return h->cur;
+}
+
+static inline int hvm_ctxt_end(hvm_domain_context_t *h)
+{
+    return (h->cur >= h->size || h->cur >= HVM_CTXT_SIZE);
+}
+
+static inline void hvm_put_byte(hvm_domain_context_t *h, unsigned int i)
+{
+    if (h->cur >= HVM_CTXT_SIZE) {
+        h->cur++;
+        return;
+    }
+    h->data[h->cur++] = (char)i;
+}
+
+static inline void hvm_put_8u(hvm_domain_context_t *h, uint8_t b)
+{
+    hvm_put_byte(h, b);
+}
+
+static inline void hvm_put_16u(hvm_domain_context_t *h, uint16_t b)
+{
+    hvm_put_8u(h, b >> 8);
+    hvm_put_8u(h, b);
+}
+
+static inline void hvm_put_32u(hvm_domain_context_t *h, uint32_t b)
+{
+    hvm_put_16u(h, b >> 16);
+    hvm_put_16u(h, b);
+}
+
+static inline void hvm_put_64u(hvm_domain_context_t *h, uint64_t b)
+{
+    hvm_put_32u(h, b >> 32);
+    hvm_put_32u(h, b);
+}
+
+static inline void hvm_put_buffer(hvm_domain_context_t *h, const char *buf, 
int len)
+{
+    memcpy(&h->data[h->cur], buf, len);
+    h->cur += len;
+}
+
+
+static inline char hvm_get_byte(hvm_domain_context_t *h)
+{
+    if (h->cur >= HVM_CTXT_SIZE) {
+        printk("hvm_get_byte overflow.\n");
+        return -1;
+    }
+
+    if (h->cur >= h->size) {
+        printk("hvm_get_byte exceed data area.\n");
+        return -1;
+    }
+
+    return h->data[h->cur++];
+}
+
+static inline uint8_t hvm_get_8u(hvm_domain_context_t *h)
+{
+    return hvm_get_byte(h);
+}
+
+static inline uint16_t hvm_get_16u(hvm_domain_context_t *h)
+{
+    uint16_t v;
+    v =  hvm_get_8u(h) << 8;
+    v |= hvm_get_8u(h);
+
+    return v;
+}
+
+static inline uint32_t hvm_get_32u(hvm_domain_context_t *h)
+{
+    uint32_t v;
+    v =  hvm_get_16u(h) << 16;
+    v |= hvm_get_16u(h);
+
+    return v;
+}
+
+static inline uint64_t hvm_get_64u(hvm_domain_context_t *h)
+{
+    uint64_t v;
+    v =  (uint64_t)hvm_get_32u(h) << 32;
+    v |= hvm_get_32u(h);
+
+    return v;
+}
+
+static inline void hvm_get_buffer(hvm_domain_context_t *h, char *buf, int len)
+{
+    memcpy(buf, &h->data[h->cur], len);
+    h->cur += len;
+}
+
+extern int hvm_save(struct vcpu*, hvm_domain_context_t *h);
+extern int hvm_load(struct vcpu*, hvm_domain_context_t *h);
+
+extern void shpage_init(struct domain *d, shared_iopage_t *sp);
+
 extern int hvm_enabled;

 int hvm_copy_to_guest_phys(paddr_t paddr, void *buf, int size);
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/asm-x86/hvm/vpt.h
--- a/xen/include/asm-x86/hvm/vpt.h     Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/include/asm-x86/hvm/vpt.h     Wed Dec 13 22:52:02 2006 +0800
@@ -123,7 +123,7 @@ extern void hvm_hooks_assist(struct vcpu
 extern void hvm_hooks_assist(struct vcpu *v);
 extern void pickup_deactive_ticks(struct periodic_time *vpit);
 extern struct periodic_time *create_periodic_time(
-    u32 period, char irq, char one_shot, time_cb *cb, void *data);
+    struct domain* d, u32 period, char irq, char one_shot, time_cb *cb, void 
*data);
 extern void destroy_periodic_time(struct periodic_time *pt);
 void pit_init(struct vcpu *v, unsigned long cpu_khz);
 void rtc_init(struct vcpu *v, int base, int irq);
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/public/arch-x86_32.h
--- a/xen/include/public/arch-x86_32.h  Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/include/public/arch-x86_32.h  Wed Dec 13 22:52:02 2006 +0800
@@ -181,6 +181,13 @@ DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t)
 DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);

 typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+
+#include "vmcs_data.h"
+
+struct hvmcpu_context {
+    uint32_t valid;
+    struct vmcs_data data;
+};

 /*
  * The following is all CPU context. Note that the fpu_ctxt block is filled
@@ -210,6 +217,7 @@ struct vcpu_guest_context {
     unsigned long failsafe_callback_cs;     /* CS:EIP of failsafe callback  */
     unsigned long failsafe_callback_eip;
     unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
+    struct hvmcpu_context hvmcpu_ctxt;          /* whole vmcs region */
 };
 typedef struct vcpu_guest_context vcpu_guest_context_t;
 DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/public/arch-x86_64.h
--- a/xen/include/public/arch-x86_64.h  Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/include/public/arch-x86_64.h  Wed Dec 13 22:52:02 2006 +0800
@@ -255,6 +255,13 @@ DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t)

 typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */

+#include "vmcs_data.h"
+
+struct hvmcpu_context {
+    uint32_t valid;
+    struct vmcs_data data;
+};
+
 /*
  * The following is all CPU context. Note that the fpu_ctxt block is filled
  * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
@@ -288,6 +295,7 @@ struct vcpu_guest_context {
     uint64_t      fs_base;
     uint64_t      gs_base_kernel;
     uint64_t      gs_base_user;
+    struct hvmcpu_context hvmcpu_ctxt;          /* whole vmcs region */
 };
 typedef struct vcpu_guest_context vcpu_guest_context_t;
 DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/public/domctl.h
--- a/xen/include/public/domctl.h       Fri Sep 15 17:05:38 2006 +0800
+++ b/xen/include/public/domctl.h       Wed Dec 13 22:52:02 2006 +0800
@@ -384,6 +384,21 @@ struct xen_domctl_settimeoffset {
 };
 typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
+
+#define HVM_CTXT_SIZE        6144
+typedef struct hvm_domain_context {
+    uint32_t cur;
+    uint32_t size;
+    uint8_t data[HVM_CTXT_SIZE];
+} hvm_domain_context_t;
+DEFINE_XEN_GUEST_HANDLE(hvm_domain_context_t);
+
+#define XEN_DOMCTL_gethvmcontext   33
+#define XEN_DOMCTL_sethvmcontext   34
+typedef struct xen_domctl_hvmcontext {
+    XEN_GUEST_HANDLE(hvm_domain_context_t) ctxt;  /* IN/OUT */
+} xen_domctl_hvmcontext_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_t);

 struct xen_domctl {
     uint32_t cmd;
@@ -410,6 +425,7 @@ struct xen_domctl {
         struct xen_domctl_hypercall_init    hypercall_init;
         struct xen_domctl_arch_setup        arch_setup;
         struct xen_domctl_settimeoffset     settimeoffset;
+        struct xen_domctl_hvmcontext        hvmcontext;
         uint8_t                             pad[128];
     } u;
 };
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_hvm_restore.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_hvm_restore.c      Wed Dec 13 22:52:02 2006 +0800
@@ -0,0 +1,280 @@
+/******************************************************************************
+ * xc_hvm_restore.c
+ *
+ * Restore the state of a HVM guest.
+ *
+ * Copyright (c) 2003, K A Fraser.
+ * Copyright (c) 2006 Intel Corperation
+ * rewriten for hvm guest by Zhai Edwin <edwin.zhai@xxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#include <xen/hvm/ioreq.h>
+#include <xen/hvm/params.h>
+#include <xen/hvm/e820.h>
+
+/* max mfn of the whole machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the currrent guest */
+static unsigned int pt_levels;
+
+/* total number of pages used by the current guest */
+static unsigned long max_pfn;
+
+/* A table mapping each PFN to its new MFN. */
+static xen_pfn_t *p2m = NULL;
+
+static ssize_t
+read_exact(int fd, void *buf, size_t count)
+{
+    int r = 0, s;
+    unsigned char *b = buf;
+
+    while (r < count) {
+        s = read(fd, &b[r], count - r);
+        if ((s == -1) && (errno == EINTR))
+            continue;
+        if (s <= 0) {
+            break;
+        }
+        r += s;
+    }
+
+    return (r == count) ? 1 : 0;
+}
+
+int xc_hvm_restore(int xc_handle, int io_fd,
+                     uint32_t dom, unsigned long nr_pfns,
+                     unsigned int store_evtchn, unsigned long *store_mfn,
+                     unsigned int console_evtchn, unsigned long *console_mfn,
+                     unsigned int pae, unsigned int apic)
+{
+    DECLARE_DOMCTL;
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A copy of the CPU context of the guest. */
+    vcpu_guest_context_t ctxt;
+
+    char *region_base;
+
+    xc_mmu_t *mmu = NULL;
+
+    xc_dominfo_t info;
+    unsigned int rc = 1, i;
+    uint32_t rec_len, nr_vcpus;
+    hvm_domain_context_t hvm_ctxt;
+    unsigned long long v_end, memsize;
+    unsigned long shared_page_nr;
+
+    /* hvm guest mem size (Mb) */
+    memsize = (unsigned long long)*store_mfn;
+    v_end = memsize << 20;
+
+    DPRINTF("xc_hvm_restore:dom=%d, nr_pfns=0x%lx, store_evtchn=%d, *store_mfn=%ld, 
console_evtchn=%d, *console_mfn=%ld, pae=%u, apic=%u.\n",
+            dom, nr_pfns, store_evtchn, *store_mfn, console_evtchn, 
*console_mfn, pae, apic);
+
+
+
+    /*XXX: caculate the VGA hole, it's better derived from memsize*/
+    max_pfn = nr_pfns + 0x20;
+
+    if(!get_platform_info(xc_handle, dom,
+                          &max_mfn, &hvirt_start, &pt_levels)) {
+        ERROR("Unable to get platform info.");
+        return 1;
+    }
+
+    DPRINTF("xc_hvm_restore start: max_pfn = %lx, max_mfn = %lx, hvirt_start=%lx, 
pt_levels=%d\n",
+            max_pfn,
+            max_mfn,
+            hvirt_start,
+            pt_levels);
+
+    if (mlock(&ctxt, sizeof(ctxt))) {
+        /* needed for build dom0 op, but might as well do early */
+        ERROR("Unable to mlock ctxt");
+        return 1;
+    }
+
+
+    p2m        = malloc(max_pfn * sizeof(xen_pfn_t));
+
+    if (p2m == NULL) {
+        ERROR("memory alloc failed");
+        errno = ENOMEM;
+        goto out;
+    }
+
+    /* Get the domain's shared-info frame. */
+    domctl.cmd = XEN_DOMCTL_getdomaininfo;
+    domctl.domain = (domid_t)dom;
+    if (xc_domctl(xc_handle, &domctl) < 0) {
+        ERROR("Could not get information on new domain");
+        goto out;
+    }
+    shared_info_frame = domctl.u.getdomaininfo.shared_info_frame;
+
+    if(xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) {
+        errno = ENOMEM;
+        goto out;
+    }
+
+    for ( i = 0; i < max_pfn; i++ )
+        p2m[i] = i;
+    for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < max_pfn; i++ )
+        p2m[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
+
+    /* Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000. */
+    rc = xc_domain_memory_populate_physmap(
+        xc_handle, dom, (max_pfn > 0xa0) ? 0xa0 : max_pfn,
+        0, 0, &p2m[0x00]);
+    if ( (rc == 0) && (max_pfn > 0xc0) )
+        rc = xc_domain_memory_populate_physmap(
+            xc_handle, dom, max_pfn - 0xc0, 0, 0, &p2m[0xc0]);
+    if ( rc != 0 )
+    {
+        PERROR("Could not allocate memory for HVM guest.\n");
+        goto out;
+    }
+
+
+    /**********XXXXXXXXXXXXXXXX******************/
+    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
+        ERROR("Could not get domain info");
+        return 1;
+    }
+
+    domctl.cmd = XEN_DOMCTL_getdomaininfo;
+    domctl.domain = (domid_t)dom;
+    if (xc_domctl(xc_handle, &domctl) < 0) {
+        ERROR("Could not get information on new domain");
+        goto out;
+    }
+
+    for ( i = 0; i < max_pfn; i++)
+        p2m[i] = i;
+
+    /* resotre memory */
+    if ( (region_base = xc_map_foreign_batch(xc_handle, dom, PROT_READ | 
PROT_WRITE, p2m, max_pfn) ) == 0) {
+        ERROR("HVM:map page_array failed!\n");
+        goto out;
+    }
+
+    for (i = 0; i < max_pfn; i++) {
+        void *zpage = region_base + i * PAGE_SIZE;
+        if ( p2m[i] == (~0UL)) { /*invalid mfn*/
+            continue;
+        }
+        if (i >= 0xa0 && i < 0xc0) {
+            continue;
+        }
+
+        if (!read_exact(io_fd, zpage, PAGE_SIZE)) {
+            ERROR("HVM:read page %d failed!\n", i);
+            goto out;
+        }
+    }
+
+    (void)munmap(region_base, max_pfn*PAGE_SIZE);
+
+
+/*    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_APIC_ENABLED, apic);*/
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_PAE_ENABLED, pae);
+
+    if ( v_end > HVM_BELOW_4G_RAM_END )
+        shared_page_nr = (HVM_BELOW_4G_RAM_END >> PAGE_SHIFT) - 1;
+    else
+        shared_page_nr = (v_end >> PAGE_SHIFT) - 1;
+
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, shared_page_nr-2);
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, shared_page_nr);
+
+    /* caculate the store_mfn , wrong val cause hang when introduceDomain */
+    *store_mfn = p2m[(v_end >> PAGE_SHIFT) - 2];
+    DPRINTF("hvm restore:calculate new store_mfn=0x%lx,v_end=0x%llx..\n", 
*store_mfn, v_end);
+
+    /* restore hvm context including pic/pit/shpage */
+    if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) {
+        ERROR("error read hvm context size!\n");
+        goto out;
+    }
+    if (rec_len != sizeof(hvm_ctxt)) {
+        ERROR("hvm context size dismatch!\n");
+        goto out;
+    }
+
+    if (!read_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt))) {
+        ERROR("error read hvm context!\n");
+        goto out;
+    }
+
+    if (( rc = xc_domain_hvm_setcontext(xc_handle, dom, &hvm_ctxt))) {
+        ERROR("error set hvm context!\n");
+        goto out;
+    }
+
+    if (!read_exact(io_fd, &nr_vcpus, sizeof(uint32_t))) {
+        ERROR("error read nr vcpu !\n");
+        goto out;
+    }
+    DPRINTF("hvm restore:get nr_vcpus=%d.\n", nr_vcpus);
+
+    for (i =0; i < nr_vcpus; i++) {
+        if (!read_exact(io_fd, &rec_len, sizeof(uint32_t))) {
+            ERROR("error read vcpu context size!\n");
+            goto out;
+        }
+        if (rec_len != sizeof(ctxt)) {
+            ERROR("vcpu context size dismatch!\n");
+            goto out;
+        }
+
+        if (!read_exact(io_fd, &(ctxt), sizeof(ctxt))) {
+            ERROR("error read vcpu context.\n");
+            goto out;
+        }
+
+        if ( (rc = xc_vcpu_setcontext(xc_handle, dom, i, &ctxt)) ) {
+            ERROR("Could not set vcpu context, rc=%d", rc);
+            goto out;
+        }
+    }
+
+    rc = 0;
+    goto out;
+
+ out:
+    if ( (rc != 0) && (dom != 0) )
+        xc_domain_destroy(xc_handle, dom);
+    free(mmu);
+    free(p2m);
+
+    DPRINTF("Restore exit with rc=%d\n", rc);
+
+    return rc;
+}
diff -r 7c0030214af1 -r 3c0bd8907fd9 tools/libxc/xc_hvm_save.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_hvm_save.c Wed Dec 13 22:52:02 2006 +0800
@@ -0,0 +1,248 @@
+/******************************************************************************
+ * xc_hvm_save.c
+ *
+ * Save the state of a running HVM guest.
+ *
+ * Copyright (c) 2003, K A Fraser.
+ * Copyright (c) 2006 Intel Corperation
+ * rewriten for hvm guest by Zhai Edwin <edwin.zhai@xxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include "xc_private.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#define DEF_MAX_ITERS    (4 - 1)       /* limit us to 4 times round loop  */
+#define DEF_MAX_FACTOR   3             /* never send more than 3x nr_pfns */
+
+/* max mfn of the whole machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the currrent guest */
+static unsigned int pt_levels;
+
+/* total number of pages used by the current guest */
+static unsigned long max_pfn;
+
+#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
+
+int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+                  uint32_t max_factor, uint32_t flags, int (*suspend)(int))
+{
+    xc_dominfo_t info;
+
+    int rc = 1, i;
+    int live  = (flags & XCFLAGS_LIVE);
+    int debug = (flags & XCFLAGS_DEBUG);
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A copy of the CPU context of the guest. */
+    vcpu_guest_context_t ctxt;
+
+    /* A copy of hvm domain context */
+    hvm_domain_context_t hvm_ctxt;
+
+    /* Live mapping of shared info structure */
+    shared_info_t *live_shinfo = NULL;
+
+    /* base of the region in which domain memory is mapped */
+    unsigned char *region_base = NULL;
+
+    uint32_t nr_pfns, max_pfns, rec_size, nr_vcpus;
+    unsigned long *page_array;
+
+    DPRINTF("xc_hvm_save:dom=%d, max_iters=%d, max_factor=%d, flags=0x%x.\n",
+            dom, max_iters, max_factor, flags);
+
+    /* If no explicit control parameters given, use defaults */
+    if(!max_iters)
+        max_iters = DEF_MAX_ITERS;
+    if(!max_factor)
+        max_factor = DEF_MAX_FACTOR;
+
+/*    initialize_mbit_rate();*/
+
+    if(!get_platform_info(xc_handle, dom,
+                          &max_mfn, &hvirt_start, &pt_levels)) {
+        ERROR("HVM:Unable to get platform info.");
+        return 1;
+    }
+
+    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
+        ERROR("HVM:Could not get domain info");
+        return 1;
+    }
+    nr_vcpus = info.nr_online_vcpus;
+
+    if (mlock(&ctxt, sizeof(ctxt))) {
+        ERROR("HVM:Unable to mlock ctxt");
+        return 1;
+    }
+
+    /* Only have to worry about vcpu 0 even for SMP */
+    if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
+        ERROR("HVM:Could not get vcpu context");
+        goto out;
+    }
+    shared_info_frame = info.shared_info_frame;
+
+    /* A cheesy test to see whether the domain contains valid state. */
+    if (ctxt.ctrlreg[3] == 0)
+    {
+        ERROR("Domain is not in a valid HVM guest state");
+        goto out;
+    }
+
+   /* cheesy sanity check */
+    if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
+        ERROR("Invalid HVM state record -- pfn count out of range: %lu",
+            (info.max_memkb >> (PAGE_SHIFT - 10)));
+        goto out;
+    }
+
+    /* Map the shared info frame */
+    if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                            PROT_READ, shared_info_frame))) {
+        ERROR("HVM:Couldn't map live_shinfo");
+        goto out;
+    }
+
+    max_pfn = live_shinfo->arch.max_pfn;
+
+    DPRINTF("saved hvm domain info:max_memkb=0x%lx, max_mfn=0x%lx, 
nr_pages=0x%lx\n", info.max_memkb, max_mfn, info.nr_pages);
+
+    if (live) {
+        ERROR("hvm domain doesn't support live migration now.\n");
+        if (debug)
+            ERROR("hvm domain debug on.\n");
+        goto out;
+    }
+
+    /* suspend hvm domain */
+    if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt)) {
+        ERROR("HVM Domain appears not to have suspended");
+        goto out;
+    }
+
+    nr_pfns = info.nr_pages;
+    DPRINTF("after suspend hvm domain nr_pages=0x%x, max_memkb=0x%lx.\n", 
nr_pfns, info.max_memkb);
+
+    /*XXX: caculate the VGA hole*/
+    max_pfns = nr_pfns + 0x20;
+
+    /* get all the HVM domain pfns */
+    if ( (page_array = (unsigned long *) malloc (sizeof(unsigned long) * 
max_pfns)) == NULL) {
+        ERROR("HVM:malloc fail!\n");
+        goto out;
+    }
+
+    for ( i = 0; i < max_pfns; i++)
+        page_array[i] = i;
+
+    if ( (region_base = xc_map_foreign_batch(xc_handle, dom, PROT_READ | 
PROT_WRITE, page_array, max_pfns) ) == 0) {
+        ERROR("HVM domain map pages failed!\n");
+        goto out;
+    }
+
+
+    /* Start writing out the saved-domain record. begin with mem */
+    if (!write_exact(io_fd, &nr_pfns, sizeof(unsigned int))) {
+        ERROR("write: nr_pfns");
+        goto out;
+    }
+
+    for (i = 0; i < max_pfns; i++) {
+        int ret;
+        void *zpage = region_base + i * PAGE_SIZE;
+        if ( page_array[i] == (~0UL)) {
+            continue;
+        }
+        if (i >= 0xa0 && i < 0xc0) {
+            continue;
+        }
+
+        if ((ret = ratewrite(io_fd, zpage, PAGE_SIZE)) != PAGE_SIZE) {
+            ERROR("HVM:read page %d failed, mfn=0x%lx.\n", i, page_array[i]);
+            goto out;
+        }
+    }
+
+    /* save hvm hypervisor state including pic/pit/shpage */
+    if (mlock(&hvm_ctxt, sizeof(hvm_ctxt))) {
+        ERROR("Unable to mlock ctxt");
+        return 1;
+    }
+
+    if (xc_domain_hvm_getcontext(xc_handle, dom, &hvm_ctxt)){
+        ERROR("HVM:Could not get hvm context");
+        goto out;
+    }
+
+    rec_size = sizeof(hvm_ctxt);
+    if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
+        ERROR("error write hvm ctxt size");
+        goto out;
+    }
+
+    if ( !write_exact(io_fd, &hvm_ctxt, sizeof(hvm_ctxt)) ) {
+        ERROR("write HVM info failed!\n");
+    }
+
+    /* save vcpu/vmcs context */
+    if (!write_exact(io_fd, &nr_vcpus, sizeof(uint32_t))) {
+        ERROR("error write nr vcpus");
+        goto out;
+    }
+
+    /*XXX: need a online map to exclude down cpu */
+    for (i = 0; i < nr_vcpus; i++) {
+
+        if (xc_vcpu_getcontext(xc_handle, dom, i, &ctxt)) {
+            ERROR("HVM:Could not get vcpu context");
+            goto out;
+        }
+
+        rec_size = sizeof(ctxt);
+        DPRINTF("write %d vcpucontext of total %d.\n", i, nr_vcpus);
+        if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
+            ERROR("error write vcpu ctxt size");
+            goto out;
+        }
+
+        if (!write_exact(io_fd, &(ctxt), sizeof(ctxt)) ) {
+            ERROR("write vmcs failed!\n");
+            goto out;
+        }
+    }
+
+    /* Success! */
+    rc = 0;
+
+ out:
+    return !!rc;
+}
diff -r 7c0030214af1 -r 3c0bd8907fd9 xen/include/public/vmcs_data.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/public/vmcs_data.h    Wed Dec 13 22:52:02 2006 +0800
@@ -0,0 +1,68 @@
+/******************************************************************************
+ * vmcs_data.h
+ *
+ * Copyright (c) 2006 Intel Corperation
+ *
+ */
+
+#ifndef __XEN_PUBLIC_VMCS_DATA_H__
+#define __XEN_PUBLIC_VMCS_DATA_H__
+
+/*
+ * World vmcs state
+ */
+struct vmcs_data {
+    uint64_t  eip;        /* execution pointer */
+    uint64_t  esp;        /* stack pointer */
+    uint64_t  eflags;     /* flags register */
+    uint64_t  cr0;
+    uint64_t  cr3;        /* page table directory */
+    uint64_t  cr4;
+    uint32_t  idtr_limit; /* idt */
+    uint64_t  idtr_base;
+    uint32_t  gdtr_limit; /* gdt */
+    uint64_t  gdtr_base;
+    uint32_t  cs_sel;     /* cs selector */
+    uint32_t  cs_limit;
+    uint64_t  cs_base;
+    uint32_t  cs_arbytes;
+    uint32_t  ds_sel;     /* ds selector */
+    uint32_t  ds_limit;
+    uint64_t  ds_base;
+    uint32_t  ds_arbytes;
+    uint32_t  es_sel;     /* es selector */
+    uint32_t  es_limit;
+    uint64_t  es_base;
+    uint32_t  es_arbytes;
+    uint32_t  ss_sel;     /* ss selector */
+    uint32_t  ss_limit;
+    uint64_t  ss_base;
+    uint32_t  ss_arbytes;
+    uint32_t  fs_sel;     /* fs selector */
+    uint32_t  fs_limit;
+    uint64_t  fs_base;
+    uint32_t  fs_arbytes;
+    uint32_t  gs_sel;     /* gs selector */
+    uint32_t  gs_limit;
+    uint64_t  gs_base;
+    uint32_t  gs_arbytes;
+    uint32_t  tr_sel;     /* task selector */
+    uint32_t  tr_limit;
+    uint64_t  tr_base;
+    uint32_t  tr_arbytes;
+    uint32_t  ldtr_sel;   /* ldtr selector */
+    uint32_t  ldtr_limit;
+    uint64_t  ldtr_base;
+    uint32_t  ldtr_arbytes;
+    uint32_t  sysenter_cs;
+    uint64_t  sysenter_esp;
+    uint64_t  sysenter_eip;
+    /* msr for em64t */
+    uint64_t shadow_gs;
+    uint64_t flags;
+    /* same size as VMX_MSR_COUNT */
+    uint64_t msr_items[6];
+    uint64_t vmxassist_enabled;
+};
+typedef struct vmcs_data vmcs_data_t;
+#endif
------------------------------------------------------------------------

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.