[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH]: kexec: framework and i386 (Take IV)



Hi, 

here is the latest update of the kexec xen/dom0 patch.

-- 
Horms

kexec: framework and i386

This is an implementation of kexec for dom0/xen, that allows
kexecing of the physical machine from xen. The approach taken is
to move the architecture-dependent kexec code into a new hypercall.

Some notes:
  * machine_kexec_cleanup() and machine_kexec_prepare() don't do
    anything in i386. So while this patch adds a framework for them,
    I am not sure what parameters are needed at this stage.
  * Only works for UP, as machine_shutdown is not implemented yet
  * kexecing into xen does not seem to work, I think that 
    kexec-tools needs updating, but I have not investigated yet
  * Kdump works by first copying the kernel into dom0 segments
    and relocating them later in xen, the same way that kexec does.
    The only difference is that the relocation is made into
    an area reserved by xen
  * Kdump reservation is made using the xen command line parameters,
    kdump_megabytes and kdump_megabytes_base, rather than
    the linux option crashkernel, which is now ignored.
    Two parameters are used instead of one to simplify parsing.
    This can be cleaned up later if desired. But the reservation
    seems to need to be made by xen to make sure that it happens
    early enough.
  * This patch uses dom0_op for hypercalls

Highlights since the previous posted version:
  * Use dom0_op instead of a new kexec hypercall
    - the hypercall table is currently full, so there is nowhere to
      put a new kexec hypercall
    - This kexec patch makes sense for dom0 at this stage
  * Kernel notes are filled in for kdump
    - UP only, this patch does not support SMP kdump yet
  * Share x86 code between x86_64 and x86_32 
    (though x86_64 is not finished and not included in this patch)
  * Doesn't break x86_64 build

Prepared by Horms and Magnus Damm

Signed-Off-By: Magnus Damm <magnus@xxxxxxxxxxxxx>
Signed-Off-By: Horms <horms@xxxxxxxxxxxx>

 linux-2.6-xen-sparse/arch/i386/Kconfig                |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/Makefile        |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c     |   26 ++
 linux-2.6-xen-sparse/drivers/xen/core/Makefile        |    1 
 linux-2.6-xen-sparse/drivers/xen/core/crash.c         |   98 +++++++++
 linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c |   78 +++++++
 linux-2.6-xen-sparse/drivers/xen/core/reboot.c        |    7 
 ref-linux-2.6.16/drivers/base/cpu.c                   |    4 
 ref-linux-2.6.16/kernel/kexec.c                       |   52 ++++-
 xen/arch/x86/Makefile                                 |    1 
 xen/arch/x86/dom0_ops.c                               |   33 +++
 xen/arch/x86/machine_kexec.c                          |  174 +++++++++++++++++
 xen/arch/x86/setup.c                                  |   75 ++++++-
 xen/common/page_alloc.c                               |   33 ++-
 xen/include/public/dom0_ops.h                         |   23 ++
 xen/include/public/xen.h                              |    8 
 xen/include/xen/mm.h                                  |    1 
 17 files changed, 585 insertions(+), 33 deletions(-)

--- x/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ x/linux-2.6-xen-sparse/arch/i386/Kconfig
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
        bool "kexec system call (EXPERIMENTAL)"
-       depends on EXPERIMENTAL && !X86_XEN
+       depends on EXPERIMENTAL
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
@@ -92,7 +92,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o machine_kexec.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
@@ -68,6 +68,10 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/dom0_ops.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -932,6 +936,7 @@ static void __init parse_cmdline_early (
                 * after a kernel panic.
                 */
                else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
                        unsigned long size, base;
                        size = memparse(from+12, &from);
                        if (*from == '@') {
@@ -942,6 +947,10 @@ static void __init parse_cmdline_early (
                                crashk_res.start = base;
                                crashk_res.end   = base + size - 1;
                        }
+#else
+                       printk("Ignoring crashkernel command line, "
+                              "parameter will be supplied by xen\n");
+#endif
                }
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1318,9 +1327,23 @@ void __init setup_bootmem_allocator(void
        }
 #endif
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
        if (crashk_res.start != crashk_res.end)
                reserve_bootmem(crashk_res.start,
                        crashk_res.end - crashk_res.start + 1);
+#else
+       {
+               dom0_op_t op;
+               op.cmd = DOM0_KEXEC;
+               op.u.kexec.op = KEXEC_CMD_reserve;
+               BUG_ON(HYPERVISOR_dom0_op(&op));
+               if (op.u.kexec.u.reserve.size) {
+                       crashk_res.start = op.u.kexec.u.reserve.start;
+                       crashk_res.end = op.u.kexec.u.reserve.start + 
+                               op.u.kexec.u.reserve.size - 1;
+               }
+       }
+#endif
 #endif
 
        if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1395,6 +1418,9 @@ legacy_init_iomem_resources(struct resou
                res->end = map[i].end - 1;
                res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
                request_resource(&iomem_resource, res);
+#ifdef CONFIG_KEXEC
+        request_resource(res, &crashk_res);
+#endif
        }
 
        free_bootmem(__pa(map), PAGE_SIZE);
--- x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
+++ x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_NET)     += skbuff.o
 obj-$(CONFIG_SMP)     += smpboot.o
 obj-$(CONFIG_SYSFS)   += hypervisor_sysfs.o
 obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
+obj-$(CONFIG_KEXEC)   += machine_kexec.o crash.o
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/crash.c
@@ -0,0 +1,98 @@
+/*
+ * Architecture specific (i386-xen) functions for kexec based crash dumps.
+ *
+ * Created by: Horms <horms@xxxxxxxxxxxx>
+ *
+ */
+
+#include <linux/kernel.h> /* For printk */
+
+/* XXX: final_note(), crash_save_this_cpu() and crash_save_self()
+ * are copied from arch/i386/kernel/crash.c, might be good to either
+ * make the original functions non-static and use them, or just
+ * merge this into that file.
+ */
+#include <linux/elf.h>     /* For struct elf_note */
+#include <linux/elfcore.h> /* For struct elf_prstatus */
+#include <linux/kexec.h>   /* crash_notes */
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+                                                              size_t data_len)
+{
+       struct elf_note note;
+
+       note.n_namesz = strlen(name) + 1;
+       note.n_descsz = data_len;
+       note.n_type   = type;
+       memcpy(buf, &note, sizeof(note));
+       buf += (sizeof(note) +3)/4;
+       memcpy(buf, name, note.n_namesz);
+       buf += (note.n_namesz + 3)/4;
+       memcpy(buf, data, note.n_descsz);
+       buf += (note.n_descsz + 3)/4;
+
+       return buf;
+}
+
+static void final_note(u32 *buf)
+{
+       struct elf_note note;
+
+       note.n_namesz = 0;
+       note.n_descsz = 0;
+       note.n_type   = 0;
+       memcpy(buf, &note, sizeof(note));
+}
+
+static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
+{
+       struct elf_prstatus prstatus;
+       u32 *buf;
+
+       if ((cpu < 0) || (cpu >= NR_CPUS))
+               return;
+
+       /* Using ELF notes here is opportunistic.
+        * I need a well defined structure format
+        * for the data I pass, and I need tags
+        * on the data to indicate what information I have
+        * squirrelled away.  ELF notes happen to provide
+        * all of that that no need to invent something new.
+        */
+       buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+       if (!buf)
+               return;
+       memset(&prstatus, 0, sizeof(prstatus));
+       prstatus.pr_pid = current->pid;
+       elf_core_copy_regs(&prstatus.pr_reg, regs);
+       buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+                               sizeof(prstatus));
+       final_note(buf);
+}
+
+static void crash_save_self(struct pt_regs *regs)
+{
+       int cpu;
+
+       cpu = smp_processor_id();
+       crash_save_this_cpu(regs, cpu);
+}
+
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+       /* XXX: This should do something */
+       printk("xen-kexec: Need to turn of other CPUS in "
+              "machine_crash_shutdown()\n");
+       crash_save_self(regs);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
@@ -0,0 +1,78 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@xxxxxxxxxxxx>
+ *
+ * Loosely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <linux/kexec.h>
+#include <xen/interface/dom0_ops.h>
+#include <linux/mm.h>
+#include <asm/hypercall.h>
+
+const extern unsigned char relocate_new_kernel[];
+extern unsigned int relocate_new_kernel_size;
+
+/*
+ * An architecture hook called to validate the
+ * proposed image and prepare the control pages
+ * as needed.  The pages for KEXEC_CONTROL_CODE_SIZE
+ * have been allocated, but the segments have not yet
+ * been copied into the kernel.
+ *
+ * Do whatever setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ *
+ * Currently nothing.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+    struct dom0_op op;
+    op.cmd = DOM0_KEXEC;
+    op.u.kexec.op = KEXEC_CMD_kexec_prepare;
+    op.u.kexec.u.helper.data = 0;
+    return HYPERVISOR_dom0_op(&op);
+}
+
+/*
+ * Undo anything leftover by machine_kexec_prepare
+ * when an image is freed.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+    struct dom0_op op;
+    op.cmd = DOM0_KEXEC;
+    op.u.kexec.op = KEXEC_CMD_kexec_cleanup;
+    op.u.kexec.u.helper.data = 0;
+    HYPERVISOR_dom0_op(&op);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+    struct dom0_op op;
+    op.cmd = DOM0_KEXEC;
+    op.u.kexec.op = KEXEC_CMD_kexec;
+    op.u.kexec.u.kexec.indirection_page = image->head;
+    op.u.kexec.u.kexec.reboot_code_buffer = 
+            pfn_to_mfn(page_to_pfn(image->control_code_page)) << PAGE_SHIFT;
+    op.u.kexec.u.kexec.start_address = image->start;
+    op.u.kexec.u.kexec.relocate_new_kernel = relocate_new_kernel;
+    op.u.kexec.u.kexec.relocate_new_kernel_size = relocate_new_kernel_size;
+    HYPERVISOR_dom0_op(&op);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
+++ x/linux-2.6-xen-sparse/drivers/xen/core/reboot.c
@@ -370,6 +370,13 @@ static int __init setup_shutdown_event(v
 
 subsys_initcall(setup_shutdown_event);
 
+#ifdef CONFIG_KEXEC
+void machine_shutdown(void) 
+{
+       printk("machine_shutdown: does nothing\n");
+}
+#endif
+
 /*
  * Local variables:
  *  c-file-style: "linux"
--- x/ref-linux-2.6.16/drivers/base/cpu.c
+++ x/ref-linux-2.6.16/drivers/base/cpu.c
@@ -101,7 +101,11 @@ static ssize_t show_crash_notes(struct s
         * boot up and this data does not change there after. Hence this
         * operation should be safe. No locking required.
         */
+#ifndef CONFIG_XEN
        addr = __pa(per_cpu_ptr(crash_notes, cpunum));
+#else
+       addr = virt_to_machine(per_cpu_ptr(crash_notes, cpunum));
+#endif
        rc = sprintf(buf, "%Lx\n", addr);
        return rc;
 }
--- x/ref-linux-2.6.16/kernel/kexec.c
+++ x/ref-linux-2.6.16/kernel/kexec.c
@@ -38,6 +38,20 @@ struct resource crashk_res = {
        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
 };
 
+/* Kexec needs to know about the actual physical address.
+ * But in xen, a physical address is a pseudo-physical address. */
+#ifndef CONFIG_XEN
+#define kexec_page_to_pfn(page)  page_to_pfn(page)
+#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#else
+#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#endif
+
 int kexec_should_crash(struct task_struct *p)
 {
        if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
@@ -403,7 +417,7 @@ static struct page *kimage_alloc_normal_
                pages = kimage_alloc_pages(GFP_KERNEL, order);
                if (!pages)
                        break;
-               pfn   = page_to_pfn(pages);
+               pfn   = kexec_page_to_pfn(pages);
                epfn  = pfn + count;
                addr  = pfn << PAGE_SHIFT;
                eaddr = epfn << PAGE_SHIFT;
@@ -437,6 +451,7 @@ static struct page *kimage_alloc_normal_
        return pages;
 }
 
+#ifndef CONFIG_XEN
 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
                                                      unsigned int order)
 {
@@ -490,7 +505,7 @@ static struct page *kimage_alloc_crash_c
                }
                /* If I don't overlap any segments I have found my hole! */
                if (i == image->nr_segments) {
-                       pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+                       pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
                        break;
                }
        }
@@ -517,6 +532,13 @@ struct page *kimage_alloc_control_pages(
 
        return pages;
 }
+#else /* !CONFIG_XEN */
+struct page *kimage_alloc_control_pages(struct kimage *image,
+                                        unsigned int order)
+{
+       return kimage_alloc_normal_control_pages(image, order);
+}
+#endif
 
 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 {
@@ -532,7 +554,7 @@ static int kimage_add_entry(struct kimag
                        return -ENOMEM;
 
                ind_page = page_address(page);
-               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+               *image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
                image->entry = ind_page;
                image->last_entry = ind_page +
                                      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
@@ -593,13 +615,13 @@ static int kimage_terminate(struct kimag
 #define for_each_kimage_entry(image, ptr, entry) \
        for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
                ptr = (entry & IND_INDIRECTION)? \
-                       phys_to_virt((entry & PAGE_MASK)): ptr +1)
+                       kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
 
 static void kimage_free_entry(kimage_entry_t entry)
 {
        struct page *page;
 
-       page = pfn_to_page(entry >> PAGE_SHIFT);
+       page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
        kimage_free_pages(page);
 }
 
@@ -686,7 +708,7 @@ static struct page *kimage_alloc_page(st
         * have a match.
         */
        list_for_each_entry(page, &image->dest_pages, lru) {
-               addr = page_to_pfn(page) << PAGE_SHIFT;
+               addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
                if (addr == destination) {
                        list_del(&page->lru);
                        return page;
@@ -701,12 +723,12 @@ static struct page *kimage_alloc_page(st
                if (!page)
                        return NULL;
                /* If the page cannot be used file it away */
-               if (page_to_pfn(page) >
+               if (kexec_page_to_pfn(page) >
                                (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
                        list_add(&page->lru, &image->unuseable_pages);
                        continue;
                }
-               addr = page_to_pfn(page) << PAGE_SHIFT;
+               addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
 
                /* If it is the destination page we want use it */
                if (addr == destination)
@@ -729,7 +751,7 @@ static struct page *kimage_alloc_page(st
                        struct page *old_page;
 
                        old_addr = *old & PAGE_MASK;
-                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+                       old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
                        copy_highpage(page, old_page);
                        *old = addr | (*old & ~PAGE_MASK);
 
@@ -779,7 +801,7 @@ static int kimage_load_normal_segment(st
                        result  = -ENOMEM;
                        goto out;
                }
-               result = kimage_add_page(image, page_to_pfn(page)
+               result = kimage_add_page(image, kexec_page_to_pfn(page)
                                                                << PAGE_SHIFT);
                if (result < 0)
                        goto out;
@@ -811,6 +833,7 @@ out:
        return result;
 }
 
+#ifndef CONFIG_XEN
 static int kimage_load_crash_segment(struct kimage *image,
                                        struct kexec_segment *segment)
 {
@@ -833,7 +856,7 @@ static int kimage_load_crash_segment(str
                char *ptr;
                size_t uchunk, mchunk;
 
-               page = pfn_to_page(maddr >> PAGE_SHIFT);
+               page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
                if (page == 0) {
                        result  = -ENOMEM;
                        goto out;
@@ -881,6 +904,13 @@ static int kimage_load_segment(struct ki
 
        return result;
 }
+#else /* CONFIG_XEN */
+static int kimage_load_segment(struct kimage *image,
+                               struct kexec_segment *segment)
+{
+       return kimage_load_normal_segment(image, segment);
+}
+#endif
 
 /*
  * Exec Kernel system call: for obvious reasons only root may call it.
--- x/xen/arch/x86/Makefile
+++ x/xen/arch/x86/Makefile
@@ -38,6 +38,7 @@ obj-y += trampoline.o
 obj-y += traps.o
 obj-y += usercopy.o
 obj-y += x86_emulate.o
+obj-y += machine_kexec.o
 
 ifneq ($(pae),n)
 obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o
--- x/xen/arch/x86/dom0_ops.c
+++ x/xen/arch/x86/dom0_ops.c
@@ -29,6 +29,13 @@
 #include <asm/mtrr.h>
 #include "cpu/mtrr/mtrr.h"
 
+extern int machine_kexec_prepare(struct dom0_kexec *arg);
+extern void machine_kexec_cleanup(struct dom0_kexec *arg);
+extern void machine_kexec(struct dom0_kexec *arg);
+
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
 #define TRC_DOM0OP_ENTER_BASE  0x00020000
 #define TRC_DOM0OP_LEAVE_BASE  0x00030000
 
@@ -445,6 +452,32 @@ long arch_do_dom0_op(struct dom0_op *op,
     }
     break;
 
+    case DOM0_KEXEC:
+               switch(op->u.kexec.op) {
+       case KEXEC_CMD_kexec:
+            machine_kexec(&op->u.kexec);
+            ret = -EINVAL; /* Not Reached */
+           break;
+       case KEXEC_CMD_kexec_prepare:
+            ret = machine_kexec_prepare(&op->u.kexec);
+           break;
+       case KEXEC_CMD_kexec_cleanup:
+            machine_kexec_cleanup(&op->u.kexec);
+           ret = 0;
+           break;
+       case KEXEC_CMD_reserve:
+            op->u.kexec.u.reserve.size = opt_kdump_megabytes << 20;
+            op->u.kexec.u.reserve.start = opt_kdump_megabytes_base << 20;
+            if ( unlikely(copy_to_guest(u_dom0_op, op, 1) != 0) )
+            {
+                printk("arch_do_dom0_op: kexec: copy_to_guest failed");
+                return -EFAULT;
+            }
+           ret = 0;
+           break;
+       }
+    break;
+
     default:
         ret = -ENOSYS;
         break;
--- /dev/null
+++ x/xen/arch/x86/machine_kexec.c
@@ -0,0 +1,174 @@
+/******************************************************************************
+ * arch/x86/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
+ */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/domain_page.h> 
+#include <xen/timer.h>
+#include <xen/sched.h>
+#include <asm/page.h> 
+#include <asm/flushtlb.h>
+#include <public/xen.h>
+#include <public/dom0_ops.h>
+
+#ifdef CONFIG_X86_32
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+                    unsigned long indirection_page,
+                    unsigned long reboot_code_buffer,
+                    unsigned long start_address,
+                    unsigned int has_pae);
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L2_ATTR (_PAGE_PRESENT)
+
+#ifndef CONFIG_X86_PAE
+
+static u32 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    unsigned long mfn;
+    u32 *pgtable_level2;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level2 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    write_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level2);
+}
+
+#else
+static u64 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+static u64 pgtable_level2[L2_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    int mfn;
+    intpte_t *pgtable_level3;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level3 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+    set_64bit(&pgtable_level3[l3_table_offset(address)],
+             __pa(pgtable_level2) | L2_ATTR);
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    load_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level3);
+}
+#endif
+
+static void kexec_load_segments(void)
+{
+#define __SSTR(X) #X
+#define SSTR(X) __SSTR(X)
+    __asm__ __volatile__ (
+        "\tljmp $"SSTR(__HYPERVISOR_CS)",$1f\n"
+        "\t1:\n"
+        "\tmovl $"SSTR(__HYPERVISOR_DS)",%%eax\n"
+        "\tmovl %%eax,%%ds\n"
+        "\tmovl %%eax,%%es\n"
+        "\tmovl %%eax,%%fs\n"
+        "\tmovl %%eax,%%gs\n"
+        "\tmovl %%eax,%%ss\n"
+        ::: "eax", "memory");
+#undef SSTR
+#undef __SSTR
+}
+
+#define kexec_load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
+static void kexec_set_idt(void *newidt, __u16 limit)
+{
+    struct Xgt_desc_struct curidt;
+
+    /* ia32 supports unaliged loads & stores */
+    curidt.size    = limit;
+    curidt.address = (unsigned long)newidt;
+    
+    kexec_load_idt(&curidt);
+
+};
+
+#define kexec_load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
+static void kexec_set_gdt(void *newgdt, __u16 limit)
+{
+    struct Xgt_desc_struct curgdt;
+
+    /* ia32 supports unaligned loads & stores */
+    curgdt.size    = limit;
+    curgdt.address = (unsigned long)newgdt;
+
+    kexec_load_gdt(&curgdt);
+};
+
+#endif
+
+int machine_kexec_prepare(struct dom0_kexec *arg)
+{
+       return 0;
+}
+
+void machine_kexec_cleanup(struct dom0_kexec *arg)
+{
+}
+
+void machine_kexec(struct dom0_kexec *arg)
+{
+#ifdef CONFIG_X86_32
+    relocate_new_kernel_t rnk;
+
+    local_irq_disable();
+
+    identity_map_page(arg->u.kexec.reboot_code_buffer);
+
+    copy_from_user((void *)arg->u.kexec.reboot_code_buffer, 
+           arg->u.kexec.relocate_new_kernel,
+           arg->u.kexec.relocate_new_kernel_size);
+
+    kexec_load_segments();
+
+    kexec_set_gdt(__va(0),0);
+
+    kexec_set_idt(__va(0),0);
+
+    rnk = (relocate_new_kernel_t) arg->u.kexec.reboot_code_buffer;
+
+    (*rnk)(arg->u.kexec.indirection_page, arg->u.kexec.reboot_code_buffer, 
+           arg->u.kexec.start_address, cpu_has_pae);
+#endif
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/setup.c
+++ x/xen/arch/x86/setup.c
@@ -37,6 +37,11 @@ static unsigned int opt_xenheap_megabyte
 integer_param("xenheap_megabytes", opt_xenheap_megabytes);
 #endif
 
+unsigned int opt_kdump_megabytes = 0;
+integer_param("kdump_megabytes", opt_kdump_megabytes);
+unsigned int opt_kdump_megabytes_base = 0;
+integer_param("kdump_megabytes_base", opt_kdump_megabytes_base);
+
 /* opt_nosmp: If true, secondary processors are ignored. */
 static int opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
@@ -159,6 +164,20 @@ void discard_initial_images(void)
     init_domheap_pages(initial_images_start, initial_images_end);
 }
 
+void __init move_memory(unsigned long dst, 
+                          unsigned long src_start, unsigned long src_end)
+{
+#if defined(CONFIG_X86_32)
+    memmove((void *)dst,  /* use low mapping */
+            (void *)src_start,      /* use low mapping */
+            src_end - src_start);
+#elif defined(CONFIG_X86_64)
+    memmove(__va(dst),
+            __va(src_start),
+            src_end - src_start);
+#endif
+}
+
 void __init __start_xen(multiboot_info_t *mbi)
 {
     char *cmdline;
@@ -289,15 +308,8 @@ void __init __start_xen(multiboot_info_t
         initial_images_start = xenheap_phys_end;
     initial_images_end = initial_images_start + modules_length;
 
-#if defined(CONFIG_X86_32)
-    memmove((void *)initial_images_start,  /* use low mapping */
-            (void *)mod[0].mod_start,      /* use low mapping */
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#elif defined(CONFIG_X86_64)
-    memmove(__va(initial_images_start),
-            __va(mod[0].mod_start),
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#endif
+    move_memory(initial_images_start, 
+                mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
 
     /* Initialise boot-time allocator with all RAM situated after modules. */
     xenheap_phys_start = init_boot_allocator(__pa(&_end));
@@ -344,6 +356,51 @@ void __init __start_xen(multiboot_info_t
 #endif
     }
 
+    if (opt_kdump_megabytes) {
+        unsigned long kdump_start, kdump_size, k;
+
+        /* mark images pages as free for now */
+
+        init_boot_pages(initial_images_start, initial_images_end);
+
+        kdump_start = opt_kdump_megabytes_base << 20;
+        kdump_size = opt_kdump_megabytes << 20;
+
+        printk("Kdump: %luMB (%lukB) at 0x%lx\n", 
+               kdump_size >> 20,
+               kdump_size >> 10,
+               kdump_start);
+
+        if ((kdump_start & ~PAGE_MASK) || (kdump_size & ~PAGE_MASK))
+            panic("Kdump parameters not page aligned\n");
+
+        kdump_start >>= PAGE_SHIFT;
+        kdump_size >>= PAGE_SHIFT;
+
+        /* allocate pages for Kdump memory area */
+
+        k = alloc_boot_pages_at(kdump_size, kdump_start);
+
+        if (k != kdump_start)
+            panic("Unable to reserve Kdump memory\n");
+
+        /* allocate pages for relocated initial images */
+
+        k = ((initial_images_end - initial_images_start) & ~PAGE_MASK) ? 1 : 0;
+        k += (initial_images_end - initial_images_start) >> PAGE_SHIFT;
+
+        k = alloc_boot_pages(k, 1);
+
+        if (!k)
+            panic("Unable to allocate initial images memory\n");
+
+        move_memory(k << PAGE_SHIFT, initial_images_start, initial_images_end);
+
+        initial_images_end -= initial_images_start;
+        initial_images_start = k << PAGE_SHIFT;
+        initial_images_end += initial_images_start;
+    }        
+
     memguard_init();
 
     printk("System RAM: %luMB (%lukB)\n", 
--- x/xen/common/page_alloc.c
+++ x/xen/common/page_alloc.c
@@ -212,24 +212,35 @@ void init_boot_pages(paddr_t ps, paddr_t
     }
 }
 
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at)
+{
+    unsigned long i;
+
+    for ( i = 0; i < nr_pfns; i++ )
+        if ( allocated_in_map(pfn_at + i) )
+             break;
+
+    if ( i == nr_pfns )
+    {
+        map_alloc(pfn_at, nr_pfns);
+        return pfn_at;
+    }
+
+    return 0;
+}
+
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
 {
-    unsigned long pg, i;
+    unsigned long pg, i = 0;
 
     for ( pg = 0; (pg + nr_pfns) < max_page; pg += pfn_align )
     {
-        for ( i = 0; i < nr_pfns; i++ )
-            if ( allocated_in_map(pg + i) )
-                 break;
-
-        if ( i == nr_pfns )
-        {
-            map_alloc(pg, nr_pfns);
-            return pg;
-        }
+        i = alloc_boot_pages_at(nr_pfns, pg);
+        if (i != 0)
+            break;
     }
 
-    return 0;
+    return i;
 }
 
 
--- x/xen/include/public/dom0_ops.h
+++ x/xen/include/public/dom0_ops.h
@@ -472,6 +472,28 @@ typedef struct dom0_hypercall_init {
 } dom0_hypercall_init_t;
 DEFINE_GUEST_HANDLE(dom0_hypercall_init_t);
 
+#define DOM0_KEXEC   49
+typedef struct dom0_kexec{
+    unsigned long op;
+    union {
+        struct {
+            unsigned long data; /* Not sure what this should be yet */
+        } helper;
+        struct {
+            unsigned long indirection_page;
+            unsigned long reboot_code_buffer;
+            unsigned long start_address;
+            const char *relocate_new_kernel;
+            unsigned int relocate_new_kernel_size;
+        } kexec;
+        struct {
+            unsigned long size;
+            unsigned long start;
+        } reserve;
+    } u;
+} dom0_kexec_t;
+DEFINE_GUEST_HANDLE(dom0_kexec_t);
+
 typedef struct dom0_op {
     uint32_t cmd;
     uint32_t interface_version; /* DOM0_INTERFACE_VERSION */
@@ -513,6 +535,7 @@ typedef struct dom0_op {
         struct dom0_irq_permission    irq_permission;
         struct dom0_iomem_permission  iomem_permission;
         struct dom0_hypercall_init    hypercall_init;
+        struct dom0_kexec             kexec;
         uint8_t                       pad[128];
     } u;
 } dom0_op_t;
--- x/xen/include/public/xen.h
+++ x/xen/include/public/xen.h
@@ -215,6 +215,14 @@ DEFINE_GUEST_HANDLE(mmuext_op_t);
 #define VMASST_TYPE_writable_pagetables  2
 #define MAX_VMASST_TYPE 2
 
+/*
+ * Operations for kexec.
+ */
+#define KEXEC_CMD_kexec                 0
+#define KEXEC_CMD_kexec_prepare         1
+#define KEXEC_CMD_kexec_cleanup         2
+#define KEXEC_CMD_reserve               3
+
 #ifndef __ASSEMBLY__
 
 typedef uint16_t domid_t;
--- x/xen/include/xen/mm.h
+++ x/xen/include/xen/mm.h
@@ -40,6 +40,7 @@ struct page_info;
 paddr_t init_boot_allocator(paddr_t bitmap_start);
 void init_boot_pages(paddr_t ps, paddr_t pe);
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align);
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at);
 void end_boot_allocator(void);
 
 /* Generic allocator. These functions are *not* interrupt-safe. */

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.