[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH]: kexec: framework and i386 (Take VI)



Hi, 

I will be out of the office until next Monday, so here is the latest and
greatest before I go. Tested against 9896, should also work fine
with tip (9903).

-- 
Horms                                           http://www.vergenet.net/~horms/

kexec: framework and i386

This is an implementation of kexec for dom0/xen, that allows
kexecing of the physical machine from xen. The approach taken is
to move the architecture-dependent kexec code into a new hypercall.

Some notes:
  * machine_kexec_cleanup() and machine_kexec_prepare() don't do
    anything in i386. So while this patch adds a framework for them,
    I am not sure what parameters are needed at this stage.
  * Only works for UP, as machine_shutdown is not implemented yet
  * kexecing into xen does not seem to work, I think that 
    kexec-tools needs updating, but I have not investigated yet
  * Kdump works by first copying the kernel into dom0 segments
    and relocating them later in xen, the same way that kexec does.
    The only difference is that the relocation is made into
    an area reserved by xen
  * Kdump reservation is made using the xen command line parameters,
    kdump_megabytes and kdump_megabytes_base, rather than
    the linux option crashkernel, which is now ignored.
    Two parameters are used instead of one to simplify parsing.
    This can be cleaned up later if desired. But the reservation
    seems to need to be made by xen to make sure that it happens
    early enough.
  * This patch uses a new kexec hypercall

Highlights since the previously posted version:
 
  * SMP kexec (not kdump yet)
  * Split x86_32 specific xen code out

Prepared by Horms and Magnus Damm

Signed-Off-By: Magnus Damm <magnus@xxxxxxxxxxxxx>
Signed-Off-By: Horms <horms@xxxxxxxxxxxx>

 linux-2.6-xen-sparse/arch/i386/Kconfig                         |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/Makefile                 |    2 
 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c              |   24 +
 linux-2.6-xen-sparse/drivers/xen/core/Makefile                 |    1 
 linux-2.6-xen-sparse/drivers/xen/core/crash.c                  |   98 ++++
 linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c          |   73 +++
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h |   10 
 ref-linux-2.6.16/drivers/base/cpu.c                            |    4 
 ref-linux-2.6.16/kernel/kexec.c                                |   52 +-
 xen/arch/x86/Makefile                                          |    1 
 xen/arch/x86/dom0_ops.c                                        |    3 
 xen/arch/x86/machine_kexec.c                                   |   27 +
 xen/arch/x86/setup.c                                           |   75 +++
 xen/arch/x86/x86_32/Makefile                                   |    1 
 xen/arch/x86/x86_32/entry.S                                    |    2 
 xen/arch/x86/x86_32/machine_kexec.c                            |  206 ++++++++++
 xen/arch/x86/x86_64/Makefile                                   |    1 
 xen/arch/x86/x86_64/machine_kexec.c                            |   24 +
 xen/common/Makefile                                            |    1 
 xen/common/kexec.c                                             |   73 +++
 xen/common/page_alloc.c                                        |   33 +
 xen/include/asm-x86/hypercall.h                                |    5 
 xen/include/public/kexec.h                                     |   46 ++
 xen/include/public/xen.h                                       |    9 
 xen/include/xen/mm.h                                           |    1 
 25 files changed, 741 insertions(+), 33 deletions(-)

--- x/linux-2.6-xen-sparse/arch/i386/Kconfig
+++ x/linux-2.6-xen-sparse/arch/i386/Kconfig
@@ -726,7 +726,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
        bool "kexec system call (EXPERIMENTAL)"
-       depends on EXPERIMENTAL && !X86_XEN
+       depends on EXPERIMENTAL
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
--- x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
@@ -89,7 +89,7 @@ include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
-n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o machine_kexec.o crash.o
 
 obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
 obj-y := $(call cherrypickxen, $(obj-y))
--- x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
+++ x/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c
@@ -68,6 +68,10 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
+
 /* Forward Declaration. */
 void __init find_max_pfn(void);
 
@@ -932,6 +936,7 @@ static void __init parse_cmdline_early (
                 * after a kernel panic.
                 */
                else if (!memcmp(from, "crashkernel=", 12)) {
+#ifndef CONFIG_XEN
                        unsigned long size, base;
                        size = memparse(from+12, &from);
                        if (*from == '@') {
@@ -942,6 +947,10 @@ static void __init parse_cmdline_early (
                                crashk_res.start = base;
                                crashk_res.end   = base + size - 1;
                        }
+#else
+                       printk("Ignoring crashkernel command line, "
+                              "parameter will be supplied by xen\n");
+#endif
                }
 #endif
 #ifdef CONFIG_PROC_VMCORE
@@ -1318,9 +1327,21 @@ void __init setup_bootmem_allocator(void
        }
 #endif
 #ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
        if (crashk_res.start != crashk_res.end)
                reserve_bootmem(crashk_res.start,
                        crashk_res.end - crashk_res.start + 1);
+#else
+       {
+               struct kexec_arg xen_kexec_arg;
+               BUG_ON(HYPERVISOR_kexec(KEXEC_CMD_reserve, &xen_kexec_arg));
+               if (xen_kexec_arg.u.reserve.size) {
+                       crashk_res.start = xen_kexec_arg.u.reserve.start;
+                       crashk_res.end = xen_kexec_arg.u.reserve.start + 
+                               xen_kexec_arg.u.reserve.size - 1;
+               }
+       }
+#endif
 #endif
 
        if (!xen_feature(XENFEAT_auto_translated_physmap))
@@ -1395,6 +1416,9 @@ legacy_init_iomem_resources(struct resou
                res->end = map[i].end - 1;
                res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
                request_resource(&iomem_resource, res);
+#ifdef CONFIG_KEXEC
+        request_resource(res, &crashk_res);
+#endif
        }
 
        free_bootmem(__pa(map), PAGE_SIZE);
--- x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
+++ x/linux-2.6-xen-sparse/drivers/xen/core/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_NET)     += skbuff.o
 obj-$(CONFIG_SMP)     += smpboot.o
 obj-$(CONFIG_SYSFS)   += hypervisor_sysfs.o
 obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
+obj-$(CONFIG_KEXEC)   += machine_kexec.o crash.o
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/crash.c
@@ -0,0 +1,98 @@
+/*
+ * Architecture specific (i386-xen) functions for kexec based crash dumps.
+ *
+ * Created by: Horms <horms@xxxxxxxxxxxx>
+ *
+ */
+
+#include <linux/kernel.h> /* For printk */
+
+/* XXX: final_note(), crash_save_this_cpu() and crash_save_self()
+ * are copied from arch/i386/kernel/crash.c, might be good to either
+ * make the original functions non-static and use them, or just
+ * merge this into that file.
+ */
+#include <linux/elf.h>     /* For struct elf_note */
+#include <linux/elfcore.h> /* For struct elf_prstatus */
+#include <linux/kexec.h>   /* crash_notes */
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+                                                              size_t data_len)
+{
+       struct elf_note note;
+
+       note.n_namesz = strlen(name) + 1;
+       note.n_descsz = data_len;
+       note.n_type   = type;
+       memcpy(buf, &note, sizeof(note));
+       buf += (sizeof(note) +3)/4;
+       memcpy(buf, name, note.n_namesz);
+       buf += (note.n_namesz + 3)/4;
+       memcpy(buf, data, note.n_descsz);
+       buf += (note.n_descsz + 3)/4;
+
+       return buf;
+}
+
+static void final_note(u32 *buf)
+{
+       struct elf_note note;
+
+       note.n_namesz = 0;
+       note.n_descsz = 0;
+       note.n_type   = 0;
+       memcpy(buf, &note, sizeof(note));
+}
+
+static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
+{
+       struct elf_prstatus prstatus;
+       u32 *buf;
+
+       if ((cpu < 0) || (cpu >= NR_CPUS))
+               return;
+
+       /* Using ELF notes here is opportunistic.
+        * I need a well defined structure format
+        * for the data I pass, and I need tags
+        * on the data to indicate what information I have
+        * squirrelled away.  ELF notes happen to provide
+        * all of that, so there is no need to invent something new.
+        */
+       buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+       if (!buf)
+               return;
+       memset(&prstatus, 0, sizeof(prstatus));
+       prstatus.pr_pid = current->pid;
+       elf_core_copy_regs(&prstatus.pr_reg, regs);
+       buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+                               sizeof(prstatus));
+       final_note(buf);
+}
+
+static void crash_save_self(struct pt_regs *regs)
+{
+       int cpu;
+
+       cpu = smp_processor_id();
+       crash_save_this_cpu(regs, cpu);
+}
+
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+       /* XXX: This should do something */
+       printk("xen-kexec: Need to turn of other CPUS in "
+              "machine_crash_shutdown()\n");
+       crash_save_self(regs);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- /dev/null
+++ x/linux-2.6-xen-sparse/drivers/xen/core/machine_kexec.c
@@ -0,0 +1,73 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ *
+ * Created By: Horms <horms@xxxxxxxxxxxx>
+ *
+ * Loosely based on arch/i386/kernel/machine_kexec.c
+ */
+
+#include <linux/kexec.h>
+#include <xen/interface/kexec.h>
+#include <linux/mm.h>
+#include <asm/hypercall.h>
+
+const extern unsigned char relocate_new_kernel[];
+extern unsigned int relocate_new_kernel_size;
+
+/*
+ * An architecture hook called to validate the
+ * proposed image and prepare the control pages
+ * as needed.  The pages for KEXEC_CONTROL_CODE_SIZE
+ * have been allocated, but the segments have not yet
+ * been copied into the kernel.
+ *
+ * Do whatever setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ *
+ * Currently nothing.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+       kexec_arg_t hypercall_arg;
+               hypercall_arg.u.helper.data = NULL;
+       return HYPERVISOR_kexec(KEXEC_CMD_kexec_prepare, &hypercall_arg);
+}
+
+/*
+ * Undo anything leftover by machine_kexec_prepare
+ * when an image is freed.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+       kexec_arg_t hypercall_arg;
+       hypercall_arg.u.helper.data = NULL;
+       HYPERVISOR_kexec(KEXEC_CMD_kexec_cleanup, &hypercall_arg);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+       kexec_arg_t hypercall_arg;
+       hypercall_arg.u.kexec.indirection_page = image->head;
+       hypercall_arg.u.kexec.reboot_code_buffer = 
+               pfn_to_mfn(page_to_pfn(image->control_code_page)) << PAGE_SHIFT;
+       hypercall_arg.u.kexec.start_address = image->start;
+       hypercall_arg.u.kexec.relocate_new_kernel = relocate_new_kernel;
+       hypercall_arg.u.kexec.relocate_new_kernel_size = 
+               relocate_new_kernel_size;
+       HYPERVISOR_kexec(KEXEC_CMD_kexec, &hypercall_arg);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
+++ x/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
@@ -37,6 +37,8 @@
 # error "please don't include this file directly"
 #endif
 
+#include <xen/interface/kexec.h>
+
 #define __STR(x) #x
 #define STR(x) __STR(x)
 
@@ -357,6 +359,14 @@ HYPERVISOR_xenoprof_op(
        return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline int
+HYPERVISOR_kexec(
+       unsigned long op, kexec_arg_t * arg)
+{
+       return _hypercall2(int, kexec_op, op, arg); 
+}
+
+
 
 #endif /* __HYPERCALL_H__ */
 
--- x/ref-linux-2.6.16/drivers/base/cpu.c
+++ x/ref-linux-2.6.16/drivers/base/cpu.c
@@ -101,7 +101,11 @@ static ssize_t show_crash_notes(struct s
         * boot up and this data does not change there after. Hence this
         * operation should be safe. No locking required.
         */
+#ifndef CONFIG_XEN
        addr = __pa(per_cpu_ptr(crash_notes, cpunum));
+#else
+       addr = virt_to_machine(per_cpu_ptr(crash_notes, cpunum));
+#endif
        rc = sprintf(buf, "%Lx\n", addr);
        return rc;
 }
--- x/ref-linux-2.6.16/kernel/kexec.c
+++ x/ref-linux-2.6.16/kernel/kexec.c
@@ -38,6 +38,20 @@ struct resource crashk_res = {
        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
 };
 
+/* Kexec needs to know about the actual physical address.
+ * But in xen, a physical address is a pseudo-physical address. */
+#ifndef CONFIG_XEN
+#define kexec_page_to_pfn(page)  page_to_pfn(page)
+#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#else
+#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#endif
+
 int kexec_should_crash(struct task_struct *p)
 {
        if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
@@ -403,7 +417,7 @@ static struct page *kimage_alloc_normal_
                pages = kimage_alloc_pages(GFP_KERNEL, order);
                if (!pages)
                        break;
-               pfn   = page_to_pfn(pages);
+               pfn   = kexec_page_to_pfn(pages);
                epfn  = pfn + count;
                addr  = pfn << PAGE_SHIFT;
                eaddr = epfn << PAGE_SHIFT;
@@ -437,6 +451,7 @@ static struct page *kimage_alloc_normal_
        return pages;
 }
 
+#ifndef CONFIG_XEN
 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
                                                      unsigned int order)
 {
@@ -490,7 +505,7 @@ static struct page *kimage_alloc_crash_c
                }
                /* If I don't overlap any segments I have found my hole! */
                if (i == image->nr_segments) {
-                       pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+                       pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
                        break;
                }
        }
@@ -517,6 +532,13 @@ struct page *kimage_alloc_control_pages(
 
        return pages;
 }
+#else /* !CONFIG_XEN */
+struct page *kimage_alloc_control_pages(struct kimage *image,
+                                        unsigned int order)
+{
+       return kimage_alloc_normal_control_pages(image, order);
+}
+#endif
 
 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 {
@@ -532,7 +554,7 @@ static int kimage_add_entry(struct kimag
                        return -ENOMEM;
 
                ind_page = page_address(page);
-               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+               *image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
                image->entry = ind_page;
                image->last_entry = ind_page +
                                      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
@@ -593,13 +615,13 @@ static int kimage_terminate(struct kimag
 #define for_each_kimage_entry(image, ptr, entry) \
        for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
                ptr = (entry & IND_INDIRECTION)? \
-                       phys_to_virt((entry & PAGE_MASK)): ptr +1)
+                       kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
 
 static void kimage_free_entry(kimage_entry_t entry)
 {
        struct page *page;
 
-       page = pfn_to_page(entry >> PAGE_SHIFT);
+       page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
        kimage_free_pages(page);
 }
 
@@ -686,7 +708,7 @@ static struct page *kimage_alloc_page(st
         * have a match.
         */
        list_for_each_entry(page, &image->dest_pages, lru) {
-               addr = page_to_pfn(page) << PAGE_SHIFT;
+               addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
                if (addr == destination) {
                        list_del(&page->lru);
                        return page;
@@ -701,12 +723,12 @@ static struct page *kimage_alloc_page(st
                if (!page)
                        return NULL;
                /* If the page cannot be used file it away */
-               if (page_to_pfn(page) >
+               if (kexec_page_to_pfn(page) >
                                (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
                        list_add(&page->lru, &image->unuseable_pages);
                        continue;
                }
-               addr = page_to_pfn(page) << PAGE_SHIFT;
+               addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
 
                /* If it is the destination page we want use it */
                if (addr == destination)
@@ -729,7 +751,7 @@ static struct page *kimage_alloc_page(st
                        struct page *old_page;
 
                        old_addr = *old & PAGE_MASK;
-                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+                       old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
                        copy_highpage(page, old_page);
                        *old = addr | (*old & ~PAGE_MASK);
 
@@ -779,7 +801,7 @@ static int kimage_load_normal_segment(st
                        result  = -ENOMEM;
                        goto out;
                }
-               result = kimage_add_page(image, page_to_pfn(page)
+               result = kimage_add_page(image, kexec_page_to_pfn(page)
                                                                << PAGE_SHIFT);
                if (result < 0)
                        goto out;
@@ -811,6 +833,7 @@ out:
        return result;
 }
 
+#ifndef CONFIG_XEN
 static int kimage_load_crash_segment(struct kimage *image,
                                        struct kexec_segment *segment)
 {
@@ -833,7 +856,7 @@ static int kimage_load_crash_segment(str
                char *ptr;
                size_t uchunk, mchunk;
 
-               page = pfn_to_page(maddr >> PAGE_SHIFT);
+               page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
                if (page == 0) {
                        result  = -ENOMEM;
                        goto out;
@@ -881,6 +904,13 @@ static int kimage_load_segment(struct ki
 
        return result;
 }
+#else /* CONFIG_XEN */
+static int kimage_load_segment(struct kimage *image,
+                               struct kexec_segment *segment)
+{
+       return kimage_load_normal_segment(image, segment);
+}
+#endif
 
 /*
  * Exec Kernel system call: for obvious reasons only root may call it.
--- x/xen/arch/x86/Makefile
+++ x/xen/arch/x86/Makefile
@@ -39,6 +39,7 @@ obj-y += trampoline.o
 obj-y += traps.o
 obj-y += usercopy.o
 obj-y += x86_emulate.o
+obj-y += machine_kexec.o
 
 ifneq ($(pae),n)
 obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o
--- x/xen/arch/x86/dom0_ops.c
+++ x/xen/arch/x86/dom0_ops.c
@@ -29,6 +29,9 @@
 #include <asm/mtrr.h>
 #include "cpu/mtrr/mtrr.h"
 
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
 #define TRC_DOM0OP_ENTER_BASE  0x00020000
 #define TRC_DOM0OP_LEAVE_BASE  0x00030000
 
--- /dev/null
+++ x/xen/arch/x86/machine_kexec.c
@@ -0,0 +1,27 @@
+/******************************************************************************
+ * arch/x86/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ */
+
+#include <public/kexec.h>
+
+int machine_kexec_prepare(struct kexec_arg *arg)
+{
+       return 0;
+}
+
+void machine_kexec_cleanup(struct kexec_arg *arg)
+{
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/setup.c
+++ x/xen/arch/x86/setup.c
@@ -38,6 +38,11 @@ static unsigned int opt_xenheap_megabyte
 integer_param("xenheap_megabytes", opt_xenheap_megabytes);
 #endif
 
+unsigned int opt_kdump_megabytes = 0;
+integer_param("kdump_megabytes", opt_kdump_megabytes);
+unsigned int opt_kdump_megabytes_base = 0;
+integer_param("kdump_megabytes_base", opt_kdump_megabytes_base);
+
 /* opt_nosmp: If true, secondary processors are ignored. */
 static int opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
@@ -192,6 +197,20 @@ static void percpu_free_unused_areas(voi
                        __pa(__per_cpu_end));
 }
 
+void __init move_memory(unsigned long dst, 
+                          unsigned long src_start, unsigned long src_end)
+{
+#if defined(CONFIG_X86_32)
+    memmove((void *)dst,  /* use low mapping */
+            (void *)src_start,      /* use low mapping */
+            src_end - src_start);
+#elif defined(CONFIG_X86_64)
+    memmove(__va(dst),
+            __va(src_start),
+            src_end - src_start);
+#endif
+}
+
 void __init __start_xen(multiboot_info_t *mbi)
 {
     char __cmdline[] = "", *cmdline = __cmdline;
@@ -327,15 +346,8 @@ void __init __start_xen(multiboot_info_t
         initial_images_start = xenheap_phys_end;
     initial_images_end = initial_images_start + modules_length;
 
-#if defined(CONFIG_X86_32)
-    memmove((void *)initial_images_start,  /* use low mapping */
-            (void *)mod[0].mod_start,      /* use low mapping */
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#elif defined(CONFIG_X86_64)
-    memmove(__va(initial_images_start),
-            __va(mod[0].mod_start),
-            mod[mbi->mods_count-1].mod_end - mod[0].mod_start);
-#endif
+    move_memory(initial_images_start, 
+                mod[0].mod_start, mod[mbi->mods_count-1].mod_end);
 
     /* Initialise boot-time allocator with all RAM situated after modules. */
     xenheap_phys_start = init_boot_allocator(__pa(&_end));
@@ -383,6 +395,51 @@ void __init __start_xen(multiboot_info_t
 #endif
     }
 
+    if (opt_kdump_megabytes) {
+        unsigned long kdump_start, kdump_size, k;
+
+        /* mark images pages as free for now */
+
+        init_boot_pages(initial_images_start, initial_images_end);
+
+        kdump_start = opt_kdump_megabytes_base << 20;
+        kdump_size = opt_kdump_megabytes << 20;
+
+        printk("Kdump: %luMB (%lukB) at 0x%lx\n", 
+               kdump_size >> 20,
+               kdump_size >> 10,
+               kdump_start);
+
+        if ((kdump_start & ~PAGE_MASK) || (kdump_size & ~PAGE_MASK))
+            panic("Kdump parameters not page aligned\n");
+
+        kdump_start >>= PAGE_SHIFT;
+        kdump_size >>= PAGE_SHIFT;
+
+        /* allocate pages for Kdump memory area */
+
+        k = alloc_boot_pages_at(kdump_size, kdump_start);
+
+        if (k != kdump_start)
+            panic("Unable to reserve Kdump memory\n");
+
+        /* allocate pages for relocated initial images */
+
+        k = ((initial_images_end - initial_images_start) & ~PAGE_MASK) ? 1 : 0;
+        k += (initial_images_end - initial_images_start) >> PAGE_SHIFT;
+
+        k = alloc_boot_pages(k, 1);
+
+        if (!k)
+            panic("Unable to allocate initial images memory\n");
+
+        move_memory(k << PAGE_SHIFT, initial_images_start, initial_images_end);
+
+        initial_images_end -= initial_images_start;
+        initial_images_start = k << PAGE_SHIFT;
+        initial_images_end += initial_images_start;
+    }        
+
     memguard_init();
 
     printk("System RAM: %luMB (%lukB)\n", 
--- x/xen/arch/x86/x86_32/Makefile
+++ x/xen/arch/x86/x86_32/Makefile
@@ -3,5 +3,6 @@ obj-y += entry.o
 obj-y += mm.o
 obj-y += seg_fixup.o
 obj-y += traps.o
+obj-y += machine_kexec.o
 
 obj-$(supervisor_mode_kernel) += supervisor_mode_kernel.o
--- x/xen/arch/x86/x86_32/entry.S
+++ x/xen/arch/x86/x86_32/entry.S
@@ -648,6 +648,7 @@ ENTRY(hypercall_table)
         .long do_xenoprof_op
         .long do_event_channel_op
         .long do_physdev_op
+        .long do_kexec
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .long do_ni_hypercall
         .endr
@@ -687,6 +688,7 @@ ENTRY(hypercall_args_table)
         .byte 2 /* do_xenoprof_op       */
         .byte 2 /* do_event_channel_op  */
         .byte 2 /* do_physdev_op        */
+        .byte 2 /* do_kexec             */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- /dev/null
+++ x/xen/arch/x86/x86_32/machine_kexec.c
@@ -0,0 +1,206 @@
+/******************************************************************************
+ * arch/x86/x86_32/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
+ */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/domain_page.h> 
+#include <xen/timer.h>
+#include <xen/sched.h>
+#include <xen/reboot.h>
+#include <xen/console.h>
+#include <asm/page.h> 
+#include <asm/flushtlb.h>
+#include <public/xen.h>
+#include <public/kexec.h>
+
+static void __machine_kexec(struct kexec_arg *arg);
+
+typedef asmlinkage void (*relocate_new_kernel_t)(
+                    unsigned long indirection_page,
+                    unsigned long reboot_code_buffer,
+                    unsigned long start_address,
+                    unsigned int has_pae);
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L2_ATTR (_PAGE_PRESENT)
+
+#ifndef CONFIG_X86_PAE
+
+static u32 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+static void identity_map_page(unsigned long address)
+{
+    unsigned long mfn;
+    u32 *pgtable_level2;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level2 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    write_cr3(mfn << PAGE_SHIFT);
+
+    unmap_domain_page(pgtable_level2);
+}
+
+#else
+static u64 pgtable_level1[L1_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+static u64 pgtable_level2[L2_PAGETABLE_ENTRIES] PAGE_ALIGNED;
+
+/* PAE variant: identity-map the page at address using the static tables. */
+static void identity_map_page(unsigned long address)
+{
+    unsigned long mfn; /* unsigned: mfn << PAGE_SHIFT must not overflow int */
+    intpte_t *pgtable_level3;
+
+    /* Find the current page table */
+    mfn = read_cr3() >> PAGE_SHIFT;
+    pgtable_level3 = map_domain_page(mfn);
+
+    /* Identity map the page table entry */
+    pgtable_level1[l1_table_offset(address)] = address | L0_ATTR;
+    pgtable_level2[l2_table_offset(address)] = __pa(pgtable_level1) | L1_ATTR;
+    set_64bit(&pgtable_level3[l3_table_offset(address)],
+             __pa(pgtable_level2) | L2_ATTR);
+
+    /* Flush the tlb so the new mapping takes effect.
+     * Global tlb entries are not flushed but that is not an issue.
+     */
+    load_cr3(mfn << PAGE_SHIFT);
+    unmap_domain_page(pgtable_level3);
+}
+#endif
+
+static void kexec_load_segments(void)
+{
+#define __SSTR(X) #X
+#define SSTR(X) __SSTR(X)
+    __asm__ __volatile__ (
+        "\tljmp $"SSTR(__HYPERVISOR_CS)",$1f\n"
+        "\t1:\n"
+        "\tmovl $"SSTR(__HYPERVISOR_DS)",%%eax\n"
+        "\tmovl %%eax,%%ds\n"
+        "\tmovl %%eax,%%es\n"
+        "\tmovl %%eax,%%fs\n"
+        "\tmovl %%eax,%%gs\n"
+        "\tmovl %%eax,%%ss\n"
+        ::: "eax", "memory");
+#undef SSTR
+#undef __SSTR
+}
+
+#define kexec_load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
+/* Load an IDT descriptor built from the given base address and limit. */
+static void kexec_set_idt(void *newidt, __u16 limit)
+{
+    struct Xgt_desc_struct curidt;
+
+    /* ia32 supports unaligned loads & stores */
+    curidt.size    = limit;
+    curidt.address = (unsigned long)newidt;
+
+    kexec_load_idt(&curidt);
+}
+
+#define kexec_load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
+/* Load a GDT descriptor built from the given base address and limit. */
+static void kexec_set_gdt(void *newgdt, __u16 limit)
+{
+    struct Xgt_desc_struct curgdt;
+
+    /* ia32 supports unaligned loads & stores */
+    curgdt.size    = limit;
+    curgdt.address = (unsigned long)newgdt;
+    kexec_load_gdt(&curgdt);
+}
+
+static void __machine_shutdown(void *data)
+{
+    struct kexec_arg *arg = (struct kexec_arg *)data;
+
+    printk("__machine_shutdown: cpu=%u\n", smp_processor_id());
+
+    watchdog_disable();
+    console_start_sync();
+
+    smp_send_stop();
+
+#ifdef CONFIG_X86_IO_APIC
+    disable_IO_APIC();
+#endif   
+
+    __machine_kexec(arg);
+}
+
+void machine_shutdown(struct kexec_arg *arg)
+{
+    int reboot_cpu_id;
+    cpumask_t reboot_cpu;
+
+
+    reboot_cpu_id = 0;
+
+    if (!cpu_isset(reboot_cpu_id, cpu_online_map))
+        reboot_cpu_id = smp_processor_id();
+    
+    if (reboot_cpu_id != smp_processor_id()) {
+        cpus_clear(reboot_cpu);
+        cpu_set(reboot_cpu_id, reboot_cpu);
+        on_selected_cpus(reboot_cpu, __machine_shutdown, arg, 1, 0);
+       for (;;)
+               ; /* nothing */
+    }
+    else
+        __machine_shutdown(arg);
+    BUG();
+}
+
+static void __machine_kexec(struct kexec_arg *arg)
+{
+    relocate_new_kernel_t rnk;
+
+    local_irq_disable();
+
+    identity_map_page(arg->u.kexec.reboot_code_buffer);
+
+    copy_from_user((void *)arg->u.kexec.reboot_code_buffer, 
+           arg->u.kexec.relocate_new_kernel,
+           arg->u.kexec.relocate_new_kernel_size);
+
+    kexec_load_segments();
+    kexec_set_gdt(__va(0),0);
+    kexec_set_idt(__va(0),0);
+
+    rnk = (relocate_new_kernel_t) arg->u.kexec.reboot_code_buffer;
+    (*rnk)(arg->u.kexec.indirection_page, arg->u.kexec.reboot_code_buffer, 
+           arg->u.kexec.start_address, cpu_has_pae);
+}
+
+void machine_kexec(struct kexec_arg *arg)
+{
+    machine_shutdown(arg);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/arch/x86/x86_64/Makefile
+++ x/xen/arch/x86/x86_64/Makefile
@@ -1,3 +1,4 @@
 obj-y += entry.o
 obj-y += mm.o
 obj-y += traps.o
+obj-y += machine_kexec.o
--- /dev/null
+++ x/xen/arch/x86/x86_64/machine_kexec.c
@@ -0,0 +1,24 @@
+/******************************************************************************
+ * arch/x86/x86_64/machine_kexec.c
+ * 
+ * Created By: Horms
+ *
+ * Based heavily on arch/i386/machine_kexec.c from Linux 2.6.16
+ */
+
+#include <public/kexec.h>
+
+void machine_kexec(struct kexec_arg *arg)
+{
+    printk("machine_kexec: not implemented\n");
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/common/Makefile
+++ x/xen/common/Makefile
@@ -7,6 +7,7 @@ obj-y += event_channel.o
 obj-y += grant_table.o
 obj-y += kernel.o
 obj-y += keyhandler.o
+obj-y += kexec.o
 obj-y += lib.o
 obj-y += memory.o
 obj-y += multicall.o
--- /dev/null
+++ x/xen/common/kexec.c
@@ -0,0 +1,97 @@
+/*
+ * Architecture independent kexec code for Xen
+ *
+ * At this stage, just a switch for the kexec hypercall into
+ * architecture dependent code.
+ *
+ * Created By: Horms <horms@xxxxxxxxxxxx>
+ */
+
+#include <xen/lib.h>
+#include <xen/errno.h>
+#include <xen/guest_access.h>
+#include <xen/sched.h>
+#include <xen/string.h>
+#include <public/xen.h>
+#include <public/kexec.h>
+
+extern int machine_kexec_prepare(struct kexec_arg *arg);
+extern void machine_kexec_cleanup(struct kexec_arg *arg);
+extern void machine_kexec(struct kexec_arg *arg);
+
+extern unsigned int opt_kdump_megabytes;
+extern unsigned int opt_kdump_megabytes_base;
+
+/*
+ * Entry point for the kexec hypercall.
+ *
+ * op:   one of the KEXEC_CMD_* operations.
+ * uarg: guest handle to a kexec_arg_t; written for KEXEC_CMD_reserve,
+ *       read for every other operation.
+ *
+ * Returns -EPERM if the calling domain is not privileged, -EFAULT if the
+ * argument cannot be copied to or from the guest, the result of
+ * machine_kexec_prepare() for KEXEC_CMD_kexec_prepare, 0 for a completed
+ * reserve or cleanup, and -EINVAL for an unknown op or if machine_kexec()
+ * returns.
+ */
+int do_kexec(unsigned long op,
+             XEN_GUEST_HANDLE(kexec_arg_t) uarg)
+{
+    struct kexec_arg arg;
+
+    if ( !IS_PRIV(current->domain) )
+        return -EPERM;
+
+    if ( op == KEXEC_CMD_reserve )
+    {
+        /*
+         * Only u.reserve is filled in below but the whole structure is
+         * copied out, so clear it first rather than hand uninitialised
+         * hypervisor stack contents to the guest.
+         */
+        memset(&arg, 0, sizeof(arg));
+
+        /* Do the shift in unsigned long, the type of the result fields. */
+        arg.u.reserve.size = (unsigned long)opt_kdump_megabytes << 20;
+        arg.u.reserve.start = (unsigned long)opt_kdump_megabytes_base << 20;
+
+        if ( unlikely(copy_to_guest(uarg, &arg, 1) != 0) )
+        {
+            printk("do_kexec: copy_to_guest failed\n");
+            return -EFAULT;
+        }
+        return 0;
+    }
+
+    if ( unlikely(copy_from_guest(&arg, uarg, 1) != 0) )
+    {
+        printk("do_kexec: copy_from_guest failed\n");
+        return -EFAULT;
+    }
+
+    switch ( op )
+    {
+    case KEXEC_CMD_kexec:
+        machine_kexec(&arg);
+        return -EINVAL; /* Only reached if machine_kexec() returns */
+    case KEXEC_CMD_kexec_prepare:
+        return machine_kexec_prepare(&arg);
+    case KEXEC_CMD_kexec_cleanup:
+        machine_kexec_cleanup(&arg);
+        return 0;
+    default:
+        break;
+    }
+
+    return -EINVAL;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
--- x/xen/common/page_alloc.c
+++ x/xen/common/page_alloc.c
@@ -212,24 +212,46 @@ void init_boot_pages(paddr_t ps, paddr_t
     }
 }
 
+/*
+ * Try to allocate exactly the range [pfn_at, pfn_at + nr_pfns) from the
+ * boot allocator.
+ *
+ * Returns pfn_at when every page in the range was free (the range is then
+ * marked allocated), or 0 when the range cannot be handed out.  Because 0
+ * is the failure value, a range starting at pfn 0 is never allocated.
+ */
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at)
+{
+    unsigned long i;
+
+    /* pfn 0 cannot be told apart from failure by the caller. */
+    if ( pfn_at == 0 )
+        return 0;
+
+    /* Reject ranges that wrap or run past max_page before probing the map. */
+    if ( (nr_pfns > max_page) || (pfn_at > (max_page - nr_pfns)) )
+        return 0;
+
+    for ( i = 0; i < nr_pfns; i++ )
+        if ( allocated_in_map(pfn_at + i) )
+            return 0;
+
+    map_alloc(pfn_at, nr_pfns);
+    return pfn_at;
+}
+
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
 {
-    unsigned long pg, i;
+    unsigned long pg, i = 0;
 
     for ( pg = 0; (pg + nr_pfns) < max_page; pg += pfn_align )
     {
-        for ( i = 0; i < nr_pfns; i++ )
-            if ( allocated_in_map(pg + i) )
-                 break;
-
-        if ( i == nr_pfns )
-        {
-            map_alloc(pg, nr_pfns);
-            return pg;
-        }
+        i = alloc_boot_pages_at(nr_pfns, pg);
+        if (i != 0)
+            break;
     }
 
-    return 0;
+    return i;
 }
 
 
--- x/xen/include/asm-x86/hypercall.h
+++ x/xen/include/asm-x86/hypercall.h
@@ -6,6 +6,7 @@
 #define __ASM_X86_HYPERCALL_H__
 
 #include <public/physdev.h>
+#include <public/kexec.h>
 
 extern long
 do_event_channel_op_compat(
@@ -87,6 +88,10 @@ extern long
 arch_do_vcpu_op(
     int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg);
 
+extern int
+do_kexec(
+    unsigned long op, XEN_GUEST_HANDLE(kexec_arg_t) uarg);
+
 #ifdef __x86_64__
 
 extern long
--- /dev/null
+++ x/xen/include/public/kexec.h
@@ -0,0 +1,46 @@
+/*
+ * kexec.h: Xen kexec public
+ *
+ * Created By: Horms <horms@xxxxxxxxxxxx>
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+#include <xen/types.h>
+#include <public/xen.h>
+
+/*
+ * Scratch space for passing arguments to the kexec hypercall
+ */
+typedef struct kexec_arg {
+    union { /* presumably the member in use is selected by the KEXEC_CMD_* op -- confirm */
+        struct {
+            unsigned long data; /* Not sure what this should be yet */
+        } helper;
+        struct { /* read by __machine_kexec() on i386 */
+            unsigned long indirection_page; /* 1st argument to the relocation stub */
+            unsigned long reboot_code_buffer; /* where the stub is copied to and called */
+            unsigned long start_address; /* 3rd argument to the relocation stub */
+            const char *relocate_new_kernel; /* source of the stub, read with copy_from_user() */
+            unsigned int relocate_new_kernel_size; /* number of bytes of stub to copy */
+        } kexec;
+        struct { /* filled in by do_kexec() for KEXEC_CMD_reserve */
+            unsigned long size; /* opt_kdump_megabytes << 20 */
+            unsigned long start; /* opt_kdump_megabytes_base << 20 */
+        } reserve;
+    } u;
+} kexec_arg_t;
+DEFINE_XEN_GUEST_HANDLE(kexec_arg_t);
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- x/xen/include/public/xen.h
+++ x/xen/include/public/xen.h
@@ -64,6 +64,7 @@
 #define __HYPERVISOR_xenoprof_op          31
 #define __HYPERVISOR_event_channel_op     32
 #define __HYPERVISOR_physdev_op           33
+#define __HYPERVISOR_kexec_op             34
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48
@@ -238,6 +239,14 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
 #define VMASST_TYPE_writable_pagetables  2
 #define MAX_VMASST_TYPE 2
 
+/*
+ * Operations for kexec.
+ */
+#define KEXEC_CMD_kexec                 0
+#define KEXEC_CMD_kexec_prepare         1
+#define KEXEC_CMD_kexec_cleanup         2
+#define KEXEC_CMD_reserve               3
+
 #ifndef __ASSEMBLY__
 
 typedef uint16_t domid_t;
--- x/xen/include/xen/mm.h
+++ x/xen/include/xen/mm.h
@@ -40,6 +40,7 @@ struct page_info;
 paddr_t init_boot_allocator(paddr_t bitmap_start);
 void init_boot_pages(paddr_t ps, paddr_t pe);
 unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align);
+unsigned long alloc_boot_pages_at(unsigned long nr_pfns, unsigned long pfn_at);
 void end_boot_allocator(void);
 
 /* Generic allocator. These functions are *not* interrupt-safe. */


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.