
[Xen-changelog] [xen-unstable] [LINUX] Kexec: add kexec files to sparse tree.



# HG changeset patch
# User Ian Campbell <ian.campbell@xxxxxxxxxxxxx>
# Node ID d51e5a7317bb6f9f02b60a9bbb8519817ce55d68
# Parent  1db125262365b993d486272cc28e084bf57e0d66
[LINUX] Kexec: add kexec files to sparse tree.

Signed-off-by: Ian Campbell <ian.campbell@xxxxxxxxxxxxx>
---
 linux-2.6-xen-sparse/arch/i386/kernel/crash.c           |  183 ++
 linux-2.6-xen-sparse/arch/i386/kernel/machine_kexec.c   |   89 +
 linux-2.6-xen-sparse/arch/x86_64/kernel/crash.c         |  186 ++
 linux-2.6-xen-sparse/arch/x86_64/kernel/machine_kexec.c |  173 ++
 linux-2.6-xen-sparse/include/asm-i386/kexec.h           |  103 +
 linux-2.6-xen-sparse/include/asm-x86_64/kexec.h         |   96 +
 linux-2.6-xen-sparse/include/linux/kexec.h              |  139 ++
 linux-2.6-xen-sparse/kernel/kexec.c                     | 1081 ++++++++++++++++
 8 files changed, 2050 insertions(+)

diff -r 1db125262365 -r d51e5a7317bb linux-2.6-xen-sparse/arch/i386/kernel/crash.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/crash.c     Fri Dec 08 11:47:09 2006 +0000
@@ -0,0 +1,183 @@
+/*
+ * Architecture specific (i386) functions for kexec based crash dumps.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@xxxxxxxxxx)
+ *
+ * Copyright (C) IBM Corporation, 2004. All rights reserved.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+
+#include <asm/processor.h>
+#include <asm/hardirq.h>
+#include <asm/nmi.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <mach_ipi.h>
+
+
+/* This keeps track of which cpu is the crashing cpu. */
+static int crashing_cpu;
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+                                                              size_t data_len)
+{
+       struct elf_note note;
+
+       note.n_namesz = strlen(name) + 1;
+       note.n_descsz = data_len;
+       note.n_type   = type;
+       memcpy(buf, &note, sizeof(note));
+       buf += (sizeof(note) + 3)/4;
+       memcpy(buf, name, note.n_namesz);
+       buf += (note.n_namesz + 3)/4;
+       memcpy(buf, data, note.n_descsz);
+       buf += (note.n_descsz + 3)/4;
+
+       return buf;
+}
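+
+/* For reference, append_elf_note() lays each note out as a 4-byte
+ * aligned triple; e.g. for the "CORE"/NT_PRSTATUS note used below:
+ *
+ *   struct elf_note { n_namesz = 5, n_descsz = sizeof(prstatus),
+ *                     n_type = NT_PRSTATUS }
+ *   "CORE\0", padded to 8 bytes
+ *   prstatus data, padded up to a 4-byte boundary
+ *
+ * which is why each memcpy is followed by a (len + 3)/4 advance of
+ * the u32 buffer pointer.
+ */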
+
+static void final_note(u32 *buf)
+{
+       struct elf_note note;
+
+       note.n_namesz = 0;
+       note.n_descsz = 0;
+       note.n_type   = 0;
+       memcpy(buf, &note, sizeof(note));
+}
+
+static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
+{
+       struct elf_prstatus prstatus;
+       u32 *buf;
+
+       if ((cpu < 0) || (cpu >= NR_CPUS))
+               return;
+
+       /* Using ELF notes here is opportunistic.
+        * I need a well defined structure format
+        * for the data I pass, and I need tags
+        * on the data to indicate what information I have
+        * squirrelled away.  ELF notes happen to provide
+        * all of that, so there is no need to invent something new.
+        */
+       buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+       if (!buf)
+               return;
+       memset(&prstatus, 0, sizeof(prstatus));
+       prstatus.pr_pid = current->pid;
+       elf_core_copy_regs(&prstatus.pr_reg, regs);
+       buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+                               sizeof(prstatus));
+       final_note(buf);
+}
+
+static void crash_save_self(struct pt_regs *regs)
+{
+       int cpu;
+
+       cpu = smp_processor_id();
+       crash_save_this_cpu(regs, cpu);
+}
+
+#ifdef CONFIG_SMP
+static atomic_t waiting_for_crash_ipi;
+
+static int crash_nmi_callback(struct pt_regs *regs, int cpu)
+{
+       struct pt_regs fixed_regs;
+
+       /* Don't do anything if this handler is invoked on the crashing cpu.
+        * Otherwise, the system will completely hang. The crashing cpu can
+        * get an NMI if the system was initially booted with the
+        * nmi_watchdog parameter.
+        */
+       if (cpu == crashing_cpu)
+               return 1;
+       local_irq_disable();
+
+       if (!user_mode(regs)) {
+               crash_fixup_ss_esp(&fixed_regs, regs);
+               regs = &fixed_regs;
+       }
+       crash_save_this_cpu(regs, cpu);
+       disable_local_APIC();
+       atomic_dec(&waiting_for_crash_ipi);
+       /* Assume hlt works */
+       halt();
+       for(;;);
+
+       return 1;
+}
+
+/*
+ * By using the NMI code instead of a vector we just sneak through the
+ * word generator coming out with just what we want.  AND it does
+ * not matter if clustered_apic_mode is set or not.
+ */
+static void smp_send_nmi_allbutself(void)
+{
+       send_IPI_allbutself(APIC_DM_NMI);
+}
+
+static void nmi_shootdown_cpus(void)
+{
+       unsigned long msecs;
+
+       atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+       /* Would it be better to replace the trap vector here? */
+       set_nmi_callback(crash_nmi_callback);
+       /* Ensure the new callback function is set before sending
+        * out the NMI
+        */
+       wmb();
+
+       smp_send_nmi_allbutself();
+
+       msecs = 1000; /* Wait at most a second for the other cpus to stop */
+       while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+               mdelay(1);
+               msecs--;
+       }
+
+       /* Leave the nmi callback set */
+       disable_local_APIC();
+}
+#else
+static void nmi_shootdown_cpus(void)
+{
+       /* There are no cpus to shootdown */
+}
+#endif
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+       /* This function is only called after the system
+        * has panicked or is otherwise in a critical state.
+        * The minimum amount of code needed to allow a kexec'd kernel
+        * to run successfully must happen here.
+        *
+        * In practice this means shooting down the other cpus in
+        * an SMP system.
+        */
+       /* The kernel is broken so disable interrupts */
+       local_irq_disable();
+
+       /* Make a note of the crashing cpu. Will be used in the NMI callback. */
+       crashing_cpu = smp_processor_id();
+       nmi_shootdown_cpus();
+       lapic_shutdown();
+#if defined(CONFIG_X86_IO_APIC)
+       disable_IO_APIC();
+#endif
+       crash_save_self(regs);
+}
diff -r 1db125262365 -r d51e5a7317bb linux-2.6-xen-sparse/arch/i386/kernel/machine_kexec.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/machine_kexec.c     Fri Dec 08 11:47:09 2006 +0000
@@ -0,0 +1,89 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xxxxxxxxxxxx>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/desc.h>
+#include <asm/system.h>
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+static u32 kexec_pgd[1024] PAGE_ALIGNED;
+#ifdef CONFIG_X86_PAE
+static u32 kexec_pmd0[1024] PAGE_ALIGNED;
+static u32 kexec_pmd1[1024] PAGE_ALIGNED;
+#endif
+static u32 kexec_pte0[1024] PAGE_ALIGNED;
+static u32 kexec_pte1[1024] PAGE_ALIGNED;
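+
+/* As I understand it, these statically allocated tables exist so the
+ * relocation stub can switch page tables without allocating: both the
+ * physical and virtual addresses of these pages are handed to
+ * relocate_kernel via page_list below, from which the stub can build
+ * a transitional mapping that identity maps its own control page.
+ * The pmd pages are only needed with CONFIG_X86_PAE, which adds a
+ * paging level.
+ */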
+
+/*
+ * An architecture hook called to validate the
+ * proposed image and prepare the control pages
+ * as needed.  The pages for KEXEC_CONTROL_CODE_SIZE
+ * have been allocated, but the segments have not yet
+ * been copied into the kernel.
+ *
+ * Do whatever setup is needed on the image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ *
+ * Currently nothing.
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+       return 0;
+}
+
+/*
+ * Undo anything leftover by machine_kexec_prepare
+ * when an image is freed.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+       unsigned long page_list[PAGES_NR];
+       void *control_page;
+
+       /* Interrupts aren't acceptable while we reboot */
+       local_irq_disable();
+
+       control_page = page_address(image->control_code_page);
+       memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+       page_list[PA_CONTROL_PAGE] = __pa(control_page);
+       page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+       page_list[PA_PGD] = __pa(kexec_pgd);
+       page_list[VA_PGD] = (unsigned long)kexec_pgd;
+#ifdef CONFIG_X86_PAE
+       page_list[PA_PMD_0] = __pa(kexec_pmd0);
+       page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
+       page_list[PA_PMD_1] = __pa(kexec_pmd1);
+       page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
+#endif
+       page_list[PA_PTE_0] = __pa(kexec_pte0);
+       page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
+       page_list[PA_PTE_1] = __pa(kexec_pte1);
+       page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+
+       relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+                       image->start, cpu_has_pae);
+}
diff -r 1db125262365 -r d51e5a7317bb linux-2.6-xen-sparse/arch/x86_64/kernel/crash.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/crash.c   Fri Dec 08 11:47:09 2006 +0000
@@ -0,0 +1,186 @@
+/*
+ * Architecture specific (x86_64) functions for kexec based crash dumps.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@xxxxxxxxxx)
+ *
+ * Copyright (C) IBM Corporation, 2004. All rights reserved.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+
+#include <asm/processor.h>
+#include <asm/hardirq.h>
+#include <asm/nmi.h>
+#include <asm/hw_irq.h>
+#include <asm/mach_apic.h>
+
+/* This keeps track of which cpu is the crashing cpu. */
+static int crashing_cpu;
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type,
+                                               void *data, size_t data_len)
+{
+       struct elf_note note;
+
+       note.n_namesz = strlen(name) + 1;
+       note.n_descsz = data_len;
+       note.n_type   = type;
+       memcpy(buf, &note, sizeof(note));
+       buf += (sizeof(note) + 3)/4;
+       memcpy(buf, name, note.n_namesz);
+       buf += (note.n_namesz + 3)/4;
+       memcpy(buf, data, note.n_descsz);
+       buf += (note.n_descsz + 3)/4;
+
+       return buf;
+}
+
+static void final_note(u32 *buf)
+{
+       struct elf_note note;
+
+       note.n_namesz = 0;
+       note.n_descsz = 0;
+       note.n_type   = 0;
+       memcpy(buf, &note, sizeof(note));
+}
+
+static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
+{
+       struct elf_prstatus prstatus;
+       u32 *buf;
+
+       if ((cpu < 0) || (cpu >= NR_CPUS))
+               return;
+
+       /* Using ELF notes here is opportunistic.
+        * I need a well defined structure format
+        * for the data I pass, and I need tags
+        * on the data to indicate what information I have
+        * squirrelled away.  ELF notes happen to provide
+        * all of that, so there is no need to invent something new.
+        */
+
+       buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+
+       if (!buf)
+               return;
+
+       memset(&prstatus, 0, sizeof(prstatus));
+       prstatus.pr_pid = current->pid;
+       elf_core_copy_regs(&prstatus.pr_reg, regs);
+       buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+                                       sizeof(prstatus));
+       final_note(buf);
+}
+
+static void crash_save_self(struct pt_regs *regs)
+{
+       int cpu;
+
+       cpu = smp_processor_id();
+       crash_save_this_cpu(regs, cpu);
+}
+
+#ifdef CONFIG_SMP
+static atomic_t waiting_for_crash_ipi;
+
+static int crash_nmi_callback(struct pt_regs *regs, int cpu)
+{
+       /*
+        * Don't do anything if this handler is invoked on the crashing cpu.
+        * Otherwise, the system will completely hang. The crashing cpu can
+        * get an NMI if the system was initially booted with the
+        * nmi_watchdog parameter.
+        */
+       if (cpu == crashing_cpu)
+               return 1;
+       local_irq_disable();
+
+       crash_save_this_cpu(regs, cpu);
+       disable_local_APIC();
+       atomic_dec(&waiting_for_crash_ipi);
+       /* Assume hlt works */
+       for(;;)
+               asm("hlt");
+
+       return 1;
+}
+
+static void smp_send_nmi_allbutself(void)
+{
+       send_IPI_allbutself(APIC_DM_NMI);
+}
+
+/*
+ * This code is a best effort heuristic to get the
+ * other cpus to stop executing. So races with
+ * cpu hotplug shouldn't matter.
+ */
+
+static void nmi_shootdown_cpus(void)
+{
+       unsigned long msecs;
+
+       atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+       set_nmi_callback(crash_nmi_callback);
+
+       /*
+        * Ensure the new callback function is set before sending
+        * out the NMI
+        */
+       wmb();
+
+       smp_send_nmi_allbutself();
+
+       msecs = 1000; /* Wait at most a second for the other cpus to stop */
+       while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+               mdelay(1);
+               msecs--;
+       }
+       /* Leave the nmi callback set */
+       disable_local_APIC();
+}
+#else
+static void nmi_shootdown_cpus(void)
+{
+       /* There are no cpus to shootdown */
+}
+#endif
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+       /*
+        * This function is only called after the system
+        * has panicked or is otherwise in a critical state.
+        * The minimum amount of code needed to allow a kexec'd kernel
+        * to run successfully must happen here.
+        *
+        * In practice this means shooting down the other cpus in
+        * an SMP system.
+        */
+       /* The kernel is broken so disable interrupts */
+       local_irq_disable();
+
+       /* Make a note of the crashing cpu. Will be used in the NMI callback. */
+       crashing_cpu = smp_processor_id();
+       nmi_shootdown_cpus();
+
+       if (cpu_has_apic)
+               disable_local_APIC();
+
+#if defined(CONFIG_X86_IO_APIC)
+       disable_IO_APIC();
+#endif
+
+       crash_save_self(regs);
+}
diff -r 1db125262365 -r d51e5a7317bb linux-2.6-xen-sparse/arch/x86_64/kernel/machine_kexec.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/machine_kexec.c   Fri Dec 08 11:47:09 2006 +0000
@@ -0,0 +1,173 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xxxxxxxxxxxx>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/string.h>
+#include <linux/reboot.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+static u64 kexec_pgd[512] PAGE_ALIGNED;
+static u64 kexec_pud0[512] PAGE_ALIGNED;
+static u64 kexec_pmd0[512] PAGE_ALIGNED;
+static u64 kexec_pte0[512] PAGE_ALIGNED;
+static u64 kexec_pud1[512] PAGE_ALIGNED;
+static u64 kexec_pmd1[512] PAGE_ALIGNED;
+static u64 kexec_pte1[512] PAGE_ALIGNED;
+
+static void init_level2_page(pmd_t *level2p, unsigned long addr)
+{
+       unsigned long end_addr;
+
+       addr &= PAGE_MASK;
+       end_addr = addr + PUD_SIZE;
+       while (addr < end_addr) {
+               set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+               addr += PMD_SIZE;
+       }
+}
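+
+/* A quick size check on the loop above, assuming the standard x86_64
+ * constants (PUD_SIZE = 1GB, PMD_SIZE = 2MB): one call fills
+ * 1GB / 2MB = 512 pmd entries, exactly one page of 8-byte entries,
+ * each mapping a 2MB large page of the identity map.
+ */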
+
+static int init_level3_page(struct kimage *image, pud_t *level3p,
+                               unsigned long addr, unsigned long last_addr)
+{
+       unsigned long end_addr;
+       int result;
+
+       result = 0;
+       addr &= PAGE_MASK;
+       end_addr = addr + PGDIR_SIZE;
+       while ((addr < last_addr) && (addr < end_addr)) {
+               struct page *page;
+               pmd_t *level2p;
+
+               page = kimage_alloc_control_pages(image, 0);
+               if (!page) {
+                       result = -ENOMEM;
+                       goto out;
+               }
+               level2p = (pmd_t *)page_address(page);
+               init_level2_page(level2p, addr);
+               set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
+               addr += PUD_SIZE;
+       }
+       /* clear the unused entries */
+       while (addr < end_addr) {
+               pud_clear(level3p++);
+               addr += PUD_SIZE;
+       }
+out:
+       return result;
+}
+
+
+static int init_level4_page(struct kimage *image, pgd_t *level4p,
+                               unsigned long addr, unsigned long last_addr)
+{
+       unsigned long end_addr;
+       int result;
+
+       result = 0;
+       addr &= PAGE_MASK;
+       end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
+       while ((addr < last_addr) && (addr < end_addr)) {
+               struct page *page;
+               pud_t *level3p;
+
+               page = kimage_alloc_control_pages(image, 0);
+               if (!page) {
+                       result = -ENOMEM;
+                       goto out;
+               }
+               level3p = (pud_t *)page_address(page);
+               result = init_level3_page(image, level3p, addr, last_addr);
+               if (result) {
+                       goto out;
+               }
+               set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
+               addr += PGDIR_SIZE;
+       }
+       /* clear the unused entries */
+       while (addr < end_addr) {
+               pgd_clear(level4p++);
+               addr += PGDIR_SIZE;
+       }
+out:
+       return result;
+}
+
+
+static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+{
+       pgd_t *level4p;
+       level4p = (pgd_t *)__va(start_pgtable);
+       return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+}
+
+int machine_kexec_prepare(struct kimage *image)
+{
+       unsigned long start_pgtable;
+       int result;
+
+       /* Calculate the offsets */
+       start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+
+       /* Setup the identity mapped 64bit page table */
+       result = init_pgtable(image, start_pgtable);
+       if (result)
+               return result;
+
+       return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+       return;
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+       unsigned long page_list[PAGES_NR];
+       void *control_page;
+
+       /* Interrupts aren't acceptable while we reboot */
+       local_irq_disable();
+
+       control_page = page_address(image->control_code_page) + PAGE_SIZE;
+       memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+       page_list[PA_CONTROL_PAGE] = __pa(control_page);
+       page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+       page_list[PA_PGD] = __pa(kexec_pgd);
+       page_list[VA_PGD] = (unsigned long)kexec_pgd;
+       page_list[PA_PUD_0] = __pa(kexec_pud0);
+       page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
+       page_list[PA_PMD_0] = __pa(kexec_pmd0);
+       page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
+       page_list[PA_PTE_0] = __pa(kexec_pte0);
+       page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
+       page_list[PA_PUD_1] = __pa(kexec_pud1);
+       page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
+       page_list[PA_PMD_1] = __pa(kexec_pmd1);
+       page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
+       page_list[PA_PTE_1] = __pa(kexec_pte1);
+       page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+
+       page_list[PA_TABLE_PAGE] =
+         (unsigned long)__pa(page_address(image->control_code_page));
+
+       relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+                       image->start);
+}
diff -r 1db125262365 -r d51e5a7317bb linux-2.6-xen-sparse/include/asm-i386/kexec.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6-xen-sparse/include/asm-i386/kexec.h     Fri Dec 08 11:47:09 2006 +0000
@@ -0,0 +1,103 @@
+#ifndef _I386_KEXEC_H
+#define _I386_KEXEC_H
+
+#define PA_CONTROL_PAGE  0
+#define VA_CONTROL_PAGE  1
+#define PA_PGD           2
+#define VA_PGD           3
+#define PA_PTE_0         4
+#define VA_PTE_0         5
+#define PA_PTE_1         6
+#define VA_PTE_1         7
+#ifdef CONFIG_X86_PAE
+#define PA_PMD_0         8
+#define VA_PMD_0         9
+#define PA_PMD_1         10
+#define VA_PMD_1         11
+#define PAGES_NR         12
+#else
+#define PAGES_NR         8
+#endif
+
+#ifndef __ASSEMBLY__
+
+#include <asm/fixmap.h>
+#include <asm/ptrace.h>
+#include <asm/string.h>
+
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT is the maximum page get_free_page can return.
+ * I.e. the maximum page that is mapped directly into kernel memory,
+ * for which kmap is not required.
+ *
+ * Someone correct me if FIXADDR_START - PAGE_OFFSET is not the correct
+ * calculation for the amount of memory directly mappable into the
+ * kernel memory space.
+ */
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+/* Maximum address we can use for the control code buffer */
+#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
+
+#define KEXEC_CONTROL_CODE_SIZE        4096
+
+/* The native architecture */
+#define KEXEC_ARCH KEXEC_ARCH_386
+
+#define MAX_NOTE_BYTES 1024
+
+/* The CPU does not save ss and esp on the stack if execution was already
+ * running in kernel mode at the time of the NMI. This code
+ * fixes that up.
+ */
+static inline void crash_fixup_ss_esp(struct pt_regs *newregs,
+                                       struct pt_regs *oldregs)
+{
+       memcpy(newregs, oldregs, sizeof(*newregs));
+       newregs->esp = (unsigned long)&(oldregs->esp);
+       __asm__ __volatile__(
+                       "xorl %%eax, %%eax\n\t"
+                       "movw %%ss, %%ax\n\t"
+                       :"=a"(newregs->xss));
+}
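+
+/* Why &oldregs->esp works above: for an exception taken in kernel
+ * mode the CPU pushes no ss/esp, so struct pt_regs ends exactly where
+ * the pre-exception stack top was; the address of the (absent) esp
+ * slot is therefore the old stack pointer itself.
+ */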
+
+/*
+ * This function is responsible for capturing register states if coming
+ * via panic otherwise just fix up the ss and esp if coming via kernel
+ * mode exception.
+ */
+static inline void crash_setup_regs(struct pt_regs *newregs,
+                                       struct pt_regs *oldregs)
+{
+       if (oldregs)
+               crash_fixup_ss_esp(newregs, oldregs);
+       else {
+               __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx));
+               __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx));
+               __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx));
+               __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi));
+               __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi));
+               __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp));
+               __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax));
+               __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp));
+               __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->xss));
+               __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->xcs));
+               __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->xds));
+               __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->xes));
+               __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags));
+
+               newregs->eip = (unsigned long)current_text_addr();
+       }
+}
+asmlinkage NORET_TYPE void
+relocate_kernel(unsigned long indirection_page,
+               unsigned long control_page,
+               unsigned long start_address,
+               unsigned int has_pae) ATTRIB_NORET;
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _I386_KEXEC_H */
diff -r 1db125262365 -r d51e5a7317bb linux-2.6-xen-sparse/include/asm-x86_64/kexec.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6-xen-sparse/include/asm-x86_64/kexec.h   Fri Dec 08 11:47:09 2006 +0000
@@ -0,0 +1,96 @@
+#ifndef _X86_64_KEXEC_H
+#define _X86_64_KEXEC_H
+
+#define PA_CONTROL_PAGE  0
+#define VA_CONTROL_PAGE  1
+#define PA_PGD           2
+#define VA_PGD           3
+#define PA_PUD_0         4
+#define VA_PUD_0         5
+#define PA_PMD_0         6
+#define VA_PMD_0         7
+#define PA_PTE_0         8
+#define VA_PTE_0         9
+#define PA_PUD_1         10
+#define VA_PUD_1         11
+#define PA_PMD_1         12
+#define VA_PMD_1         13
+#define PA_PTE_1         14
+#define VA_PTE_1         15
+#define PA_TABLE_PAGE    16
+#define PAGES_NR         17
+
+#ifndef __ASSEMBLY__
+
+#include <linux/string.h>
+
+#include <asm/page.h>
+#include <asm/ptrace.h>
+
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT is the maximum page get_free_page can return.
+ * I.e. the maximum page that is mapped directly into kernel memory,
+ * for which kmap is not required.
+ *
+ * So far x86_64 is limited to 40 physical address bits.
+ */
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT      (0xFFFFFFFFFFUL)
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
+/* Maximum address we can use for the control pages */
+#define KEXEC_CONTROL_MEMORY_LIMIT     (0xFFFFFFFFFFUL)
+
+/* Allocate one page for the pdp and the second for the code */
+#define KEXEC_CONTROL_CODE_SIZE  (4096UL + 4096UL)
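+
+/* Of these two pages, machine_kexec() copies relocate_kernel into the
+ * second (control_code_page + PAGE_SIZE) and passes the first to the
+ * stub as PA_TABLE_PAGE; see arch/x86_64/kernel/machine_kexec.c in
+ * this patch.
+ */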
+
+/* The native architecture */
+#define KEXEC_ARCH KEXEC_ARCH_X86_64
+
+#define MAX_NOTE_BYTES 1024
+
+/*
+ * Save the registers of the cpu on which the panic occurred in
+ * crash_kexec so as to save a valid sp. The registers of the other
+ * cpus will be saved in machine_crash_shutdown while shooting them
+ * down.
+ */
+
+static inline void crash_setup_regs(struct pt_regs *newregs,
+                                               struct pt_regs *oldregs)
+{
+       if (oldregs)
+               memcpy(newregs, oldregs, sizeof(*newregs));
+       else {
+               __asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->rbx));
+               __asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->rcx));
+               __asm__ __volatile__("movq %%rdx,%0" : "=m"(newregs->rdx));
+               __asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->rsi));
+               __asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->rdi));
+               __asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->rbp));
+               __asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->rax));
+               __asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->rsp));
+               __asm__ __volatile__("movq %%r8,%0" : "=m"(newregs->r8));
+               __asm__ __volatile__("movq %%r9,%0" : "=m"(newregs->r9));
+               __asm__ __volatile__("movq %%r10,%0" : "=m"(newregs->r10));
+               __asm__ __volatile__("movq %%r11,%0" : "=m"(newregs->r11));
+               __asm__ __volatile__("movq %%r12,%0" : "=m"(newregs->r12));
+               __asm__ __volatile__("movq %%r13,%0" : "=m"(newregs->r13));
+               __asm__ __volatile__("movq %%r14,%0" : "=m"(newregs->r14));
+               __asm__ __volatile__("movq %%r15,%0" : "=m"(newregs->r15));
+               __asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss));
+               __asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs));
+               __asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->eflags));
+
+               newregs->rip = (unsigned long)current_text_addr();
+       }
+}
+
+NORET_TYPE void
+relocate_kernel(unsigned long indirection_page,
+               unsigned long page_list,
+               unsigned long start_address) ATTRIB_NORET;
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _X86_64_KEXEC_H */
diff -r 1db125262365 -r d51e5a7317bb linux-2.6-xen-sparse/include/linux/kexec.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6-xen-sparse/include/linux/kexec.h        Fri Dec 08 11:47:09 2006 +0000
@@ -0,0 +1,139 @@
+#ifndef LINUX_KEXEC_H
+#define LINUX_KEXEC_H
+
+#ifdef CONFIG_KEXEC
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/linkage.h>
+#include <linux/compat.h>
+#include <linux/ioport.h>
+#include <asm/kexec.h>
+
+/* Verify architecture specific macros are defined */
+
+#ifndef KEXEC_SOURCE_MEMORY_LIMIT
+#error KEXEC_SOURCE_MEMORY_LIMIT not defined
+#endif
+
+#ifndef KEXEC_DESTINATION_MEMORY_LIMIT
+#error KEXEC_DESTINATION_MEMORY_LIMIT not defined
+#endif
+
+#ifndef KEXEC_CONTROL_MEMORY_LIMIT
+#error KEXEC_CONTROL_MEMORY_LIMIT not defined
+#endif
+
+#ifndef KEXEC_CONTROL_CODE_SIZE
+#error KEXEC_CONTROL_CODE_SIZE not defined
+#endif
+
+#ifndef KEXEC_ARCH
+#error KEXEC_ARCH not defined
+#endif
+
+/*
+ * This structure is used to hold the arguments that are used when loading
+ * kernel binaries.
+ */
+
+typedef unsigned long kimage_entry_t;
+#define IND_DESTINATION  0x1
+#define IND_INDIRECTION  0x2
+#define IND_DONE         0x4
+#define IND_SOURCE       0x8
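+
+/* Each kimage_entry_t is a page-aligned physical address with the low
+ * bits used as a type tag: IND_DESTINATION sets where the following
+ * IND_SOURCE pages are to be copied, IND_INDIRECTION chains to the
+ * next page of entries, and IND_DONE terminates the list (see
+ * for_each_kimage_entry in kernel/kexec.c below).
+ */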
+
+#define KEXEC_SEGMENT_MAX 16
+struct kexec_segment {
+       void __user *buf;
+       size_t bufsz;
+       unsigned long mem;      /* User space sees this as a (void *) ... */
+       size_t memsz;
+};
+
+#ifdef CONFIG_COMPAT
+struct compat_kexec_segment {
+       compat_uptr_t buf;
+       compat_size_t bufsz;
+       compat_ulong_t mem;     /* User space sees this as a (void *) ... */
+       compat_size_t memsz;
+};
+#endif
+
+struct kimage {
+       kimage_entry_t head;
+       kimage_entry_t *entry;
+       kimage_entry_t *last_entry;
+
+       unsigned long destination;
+
+       unsigned long start;
+       struct page *control_code_page;
+
+       unsigned long nr_segments;
+       struct kexec_segment segment[KEXEC_SEGMENT_MAX];
+
+       struct list_head control_pages;
+       struct list_head dest_pages;
+       struct list_head unuseable_pages;
+
+       /* Address of next control page to allocate for crash kernels. */
+       unsigned long control_page;
+
+       /* Flags to indicate special processing */
+       unsigned int type : 1;
+#define KEXEC_TYPE_DEFAULT 0
+#define KEXEC_TYPE_CRASH   1
+};
+
+
+
+/* kexec interface functions */
+extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
+extern int machine_kexec_prepare(struct kimage *image);
+extern void machine_kexec_cleanup(struct kimage *image);
+extern asmlinkage long sys_kexec_load(unsigned long entry,
+                                       unsigned long nr_segments,
+                                       struct kexec_segment __user *segments,
+                                       unsigned long flags);
+#ifdef CONFIG_COMPAT
+extern asmlinkage long compat_sys_kexec_load(unsigned long entry,
+                               unsigned long nr_segments,
+                               struct compat_kexec_segment __user *segments,
+                               unsigned long flags);
+#endif
+extern struct page *kimage_alloc_control_pages(struct kimage *image,
+                                               unsigned int order);
+extern void crash_kexec(struct pt_regs *);
+int kexec_should_crash(struct task_struct *);
+extern struct kimage *kexec_image;
+
+#define KEXEC_ON_CRASH  0x00000001
+#define KEXEC_ARCH_MASK 0xffff0000
+
+/* These values match the ELF architecture values.
+ * Unless there is a good reason, that should continue to be the case.
+ */
+#define KEXEC_ARCH_DEFAULT ( 0 << 16)
+#define KEXEC_ARCH_386     ( 3 << 16)
+#define KEXEC_ARCH_X86_64  (62 << 16)
+#define KEXEC_ARCH_PPC     (20 << 16)
+#define KEXEC_ARCH_PPC64   (21 << 16)
+#define KEXEC_ARCH_IA_64   (50 << 16)
+#define KEXEC_ARCH_S390    (22 << 16)
+#define KEXEC_ARCH_SH      (42 << 16)
+
+#define KEXEC_FLAGS    (KEXEC_ON_CRASH)  /* List of defined/legal kexec flags */
+
+/* Location of a reserved region to hold the crash kernel.
+ */
+extern struct resource crashk_res;
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+extern note_buf_t *crash_notes;
+
+#else /* !CONFIG_KEXEC */
+struct pt_regs;
+struct task_struct;
+static inline void crash_kexec(struct pt_regs *regs) { }
+static inline int kexec_should_crash(struct task_struct *p) { return 0; }
+#endif /* CONFIG_KEXEC */
+#endif /* LINUX_KEXEC_H */
diff -r 1db125262365 -r d51e5a7317bb linux-2.6-xen-sparse/kernel/kexec.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6-xen-sparse/kernel/kexec.c       Fri Dec 08 11:47:09 2006 +0000
@@ -0,0 +1,1081 @@
+/*
+ * kexec.c - kexec system call
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xxxxxxxxxxxx>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/ioport.h>
+#include <linux/hardirq.h>
+
+#include <asm/page.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/system.h>
+#include <asm/semaphore.h>
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t* crash_notes;
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+       .name  = "Crash kernel",
+       .start = 0,
+       .end   = 0,
+       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+int kexec_should_crash(struct task_struct *p)
+{
+       if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+               return 1;
+       return 0;
+}
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses.  On processors
+ * where you can disable the MMU this is trivial, and easy.  For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place.  This means I can only support memory whose
+ * physical address can fit in an unsigned long.  In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to
+ * the new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages.  As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it).  The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ *  - allocating a page table with the control code buffer identity
+ *    mapped, to simplify machine_kexec and make kexec_on_panic more
+ *    reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address, used for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static int kimage_is_destination_range(struct kimage *image,
+                                      unsigned long start, unsigned long end);
+static struct page *kimage_alloc_page(struct kimage *image,
+                                      gfp_t gfp_mask,
+                                      unsigned long dest);
+
+static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
+                           unsigned long nr_segments,
+                            struct kexec_segment __user *segments)
+{
+       size_t segment_bytes;
+       struct kimage *image;
+       unsigned long i;
+       int result;
+
+       /* Allocate a controlling structure */
+       result = -ENOMEM;
+       image = kmalloc(sizeof(*image), GFP_KERNEL);
+       if (!image)
+               goto out;
+
+       memset(image, 0, sizeof(*image));
+       image->head = 0;
+       image->entry = &image->head;
+       image->last_entry = &image->head;
+       image->control_page = ~0; /* By default this does not apply */
+       image->start = entry;
+       image->type = KEXEC_TYPE_DEFAULT;
+
+       /* Initialize the list of control pages */
+       INIT_LIST_HEAD(&image->control_pages);
+
+       /* Initialize the list of destination pages */
+       INIT_LIST_HEAD(&image->dest_pages);
+
+       /* Initialize the list of unuseable pages */
+       INIT_LIST_HEAD(&image->unuseable_pages);
+
+       /* Read in the segments */
+       image->nr_segments = nr_segments;
+       segment_bytes = nr_segments * sizeof(*segments);
+       result = copy_from_user(image->segment, segments, segment_bytes);
+       if (result)
+               goto out;
+
+       /*
+        * Verify we have good destination addresses.  The caller is
+        * responsible for making certain we don't attempt to load
+        * the new image into invalid or reserved areas of RAM.  This
+        * just verifies it is an address we can use.
+        *
+        * Since the kernel does everything in page size chunks, ensure
+        * the destination addresses are page aligned.  Too many
+        * special cases crop up when we don't do this.  The most
+        * insidious is getting overlapping destination addresses
+        * simply because addresses are changed to page size
+        * granularity.
+        */
+       result = -EADDRNOTAVAIL;
+       for (i = 0; i < nr_segments; i++) {
+               unsigned long mstart, mend;
+
+               mstart = image->segment[i].mem;
+               mend   = mstart + image->segment[i].memsz;
+               if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+                       goto out;
+               if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+                       goto out;
+       }
+
+       /* Verify our destination addresses do not overlap.
+        * If we allowed overlapping destination addresses
+        * through, very weird things could happen with no
+        * easy explanation as one segment stops on another.
+        */
+       result = -EINVAL;
+       for (i = 0; i < nr_segments; i++) {
+               unsigned long mstart, mend;
+               unsigned long j;
+
+               mstart = image->segment[i].mem;
+               mend   = mstart + image->segment[i].memsz;
+               for (j = 0; j < i; j++) {
+                       unsigned long pstart, pend;
+                       pstart = image->segment[j].mem;
+                       pend   = pstart + image->segment[j].memsz;
+                       /* Do the segments overlap ? */
+                       if ((mend > pstart) && (mstart < pend))
+                               goto out;
+               }
+       }
+
+       /* Ensure our buffer sizes are strictly less than
+        * our memory sizes.  This should always be the case,
+        * and it is easier to check up front than to be surprised
+        * later on.
+        */
+       result = -EINVAL;
+       for (i = 0; i < nr_segments; i++) {
+               if (image->segment[i].bufsz > image->segment[i].memsz)
+                       goto out;
+       }
+
+       result = 0;
+out:
+       if (result == 0)
+               *rimage = image;
+       else
+               kfree(image);
+
+       return result;
+}
+
+static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
+                               unsigned long nr_segments,
+                               struct kexec_segment __user *segments)
+{
+       int result;
+       struct kimage *image;
+
+       /* Allocate and initialize a controlling structure */
+       image = NULL;
+       result = do_kimage_alloc(&image, entry, nr_segments, segments);
+       if (result)
+               goto out;
+
+       *rimage = image;
+
+       /*
+        * Find a location for the control code buffer, and add it to
+        * the vector of segments so that its pages will also be
+        * counted as destination pages.
+        */
+       result = -ENOMEM;
+       image->control_code_page = kimage_alloc_control_pages(image,
+                                          get_order(KEXEC_CONTROL_CODE_SIZE));
+       if (!image->control_code_page) {
+               printk(KERN_ERR "Could not allocate control_code_buffer\n");
+               goto out;
+       }
+
+       result = 0;
+ out:
+       if (result == 0)
+               *rimage = image;
+       else
+               kfree(image);
+
+       return result;
+}
+
+static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
+                               unsigned long nr_segments,
+                               struct kexec_segment __user *segments)
+{
+       int result;
+       struct kimage *image;
+       unsigned long i;
+
+       image = NULL;
+       /* Verify we have a valid entry point */
+       if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
+               result = -EADDRNOTAVAIL;
+               goto out;
+       }
+
+       /* Allocate and initialize a controlling structure */
+       result = do_kimage_alloc(&image, entry, nr_segments, segments);
+       if (result)
+               goto out;
+
+       /* Enable the special crash kernel control page
+        * allocation policy.
+        */
+       image->control_page = crashk_res.start;
+       image->type = KEXEC_TYPE_CRASH;
+
+       /*
+        * Verify we have good destination addresses.  Normally
+        * the caller is responsible for making certain we don't
+        * attempt to load the new image into invalid or reserved
+        * areas of RAM.  But crash kernels are preloaded into a
+        * reserved area of RAM.  We must ensure the addresses
+        * are in the reserved area, otherwise preloading the
+        * kernel could corrupt things.
+        */
+       result = -EADDRNOTAVAIL;
+       for (i = 0; i < nr_segments; i++) {
+               unsigned long mstart, mend;
+
+               mstart = image->segment[i].mem;
+               mend = mstart + image->segment[i].memsz - 1;
+               /* Ensure we are within the crash kernel limits */
+               if ((mstart < crashk_res.start) || (mend > crashk_res.end))
+                       goto out;
+       }
+
+       /*
+        * Find a location for the control code buffer, and add it to
+        * the vector of segments so that its pages will also be
+        * counted as destination pages.
+        */
+       result = -ENOMEM;
+       image->control_code_page = kimage_alloc_control_pages(image,
+                                          get_order(KEXEC_CONTROL_CODE_SIZE));
+       if (!image->control_code_page) {
+               printk(KERN_ERR "Could not allocate control_code_buffer\n");
+               goto out;
+       }
+
+       result = 0;
+out:
+       if (result == 0)
+               *rimage = image;
+       else
+               kfree(image);
+
+       return result;
+}
+
+static int kimage_is_destination_range(struct kimage *image,
+                                       unsigned long start,
+                                       unsigned long end)
+{
+       unsigned long i;
+
+       for (i = 0; i < image->nr_segments; i++) {
+               unsigned long mstart, mend;
+
+               mstart = image->segment[i].mem;
+               mend = mstart + image->segment[i].memsz;
+               if ((end > mstart) && (start < mend))
+                       return 1;
+       }
+
+       return 0;
+}
+
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+{
+       struct page *pages;
+
+       pages = alloc_pages(gfp_mask, order);
+       if (pages) {
+               unsigned int count, i;
+               pages->mapping = NULL;
+               set_page_private(pages, order);
+               count = 1 << order;
+               for (i = 0; i < count; i++)
+                       SetPageReserved(pages + i);
+       }
+
+       return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+       unsigned int order, count, i;
+
+       order = page_private(page);
+       count = 1 << order;
+       for (i = 0; i < count; i++)
+               ClearPageReserved(page + i);
+       __free_pages(page, order);
+}
+
+static void kimage_free_page_list(struct list_head *list)
+{
+       struct list_head *pos, *next;
+
+       list_for_each_safe(pos, next, list) {
+               struct page *page;
+
+               page = list_entry(pos, struct page, lru);
+               list_del(&page->lru);
+               kimage_free_pages(page);
+       }
+}
+
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+                                                       unsigned int order)
+{
+       /* Control pages are special, they are the intermediaries
+        * that are needed while we copy the rest of the pages
+        * to their final resting place.  As such they must
+        * not conflict with either the destination addresses
+        * or memory the kernel is already using.
+        *
+        * The only case where we really need more than one of
+        * these is for architectures where we cannot disable
+        * the MMU and must instead generate an identity mapped
+        * page table for all of the memory.
+        *
+        * At worst this runs in O(N) of the image size.
+        */
+       struct list_head extra_pages;
+       struct page *pages;
+       unsigned int count;
+
+       count = 1 << order;
+       INIT_LIST_HEAD(&extra_pages);
+
+       /* Loop while I can allocate a page and the page allocated
+        * is a destination page.
+        */
+       do {
+               unsigned long pfn, epfn, addr, eaddr;
+
+               pages = kimage_alloc_pages(GFP_KERNEL, order);
+               if (!pages)
+                       break;
+               pfn   = page_to_pfn(pages);
+               epfn  = pfn + count;
+               addr  = pfn << PAGE_SHIFT;
+               eaddr = epfn << PAGE_SHIFT;
+               if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+                             kimage_is_destination_range(image, addr, eaddr)) {
+                       list_add(&pages->lru, &extra_pages);
+                       pages = NULL;
+               }
+       } while (!pages);
+
+       if (pages) {
+               /* Remember the allocated page... */
+               list_add(&pages->lru, &image->control_pages);
+
+               /* Because the page is already in its destination
+                * location we will never allocate another page at
+                * that address.  Therefore kimage_alloc_pages
+                * will not return it (again) and we don't need
+                * to give it an entry in image->segment[].
+                */
+       }
+       /* Deal with the destination pages I have inadvertently allocated.
+        *
+        * Ideally I would convert multi-page allocations into single
+        * page allocations, and add everything to image->dest_pages.
+        *
+        * For now it is simpler to just free the pages.
+        */
+       kimage_free_page_list(&extra_pages);
+
+       return pages;
+}
+
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+                                                     unsigned int order)
+{
+       /* Control pages are special, they are the intermediaries
+        * that are needed while we copy the rest of the pages
+        * to their final resting place.  As such they must
+        * not conflict with either the destination addresses
+        * or memory the kernel is already using.
+        *
+        * Control pages are also the only pages we must allocate
+        * when loading a crash kernel.  All of the other pages
+        * are specified by the segments and we just memcpy
+        * into them directly.
+        *
+        * The only case where we really need more than one of
+        * these is for architectures where we cannot disable
+        * the MMU and must instead generate an identity mapped
+        * page table for all of the memory.
+        *
+        * Given the low demand this implements a very simple
+        * allocator that finds the first hole of the appropriate
+        * size in the reserved memory region, and allocates all
+        * of the memory up to and including the hole.
+        */
+       unsigned long hole_start, hole_end, size;
+       struct page *pages;
+
+       pages = NULL;
+       size = (1 << order) << PAGE_SHIFT;
+       hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+       hole_end   = hole_start + size - 1;
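+       /* Worked example of the rounding above: with 4K pages and
+        * order 1, size is 8K (0x2000); a control_page of 0x123400
+        * rounds up to hole_start 0x124000 and hole_end 0x125fff
+        * (inclusive).
+        */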
+       while (hole_end <= crashk_res.end) {
+               unsigned long i;
+
+               if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
+                       break;
+               if (hole_end > crashk_res.end)
+                       break;
+               /* See if I overlap any of the segments */
+               for (i = 0; i < image->nr_segments; i++) {
+                       unsigned long mstart, mend;
+
+                       mstart = image->segment[i].mem;
+                       mend   = mstart + image->segment[i].memsz - 1;
+                       if ((hole_end >= mstart) && (hole_start <= mend)) {
+                               /* Advance the hole to the end of the segment */
+                               hole_start = (mend + (size - 1)) & ~(size - 1);
+                               hole_end   = hole_start + size - 1;
+                               break;
+                       }
+               }
+               /* If I don't overlap any segments I have found my hole! */
+               if (i == image->nr_segments) {
+                       pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+                       break;
+               }
+       }
+       if (pages)
+               image->control_page = hole_end;
+
+       return pages;
+}
+
+
+struct page *kimage_alloc_control_pages(struct kimage *image,
+                                        unsigned int order)
+{
+       struct page *pages = NULL;
+
+       switch (image->type) {
+       case KEXEC_TYPE_DEFAULT:
+               pages = kimage_alloc_normal_control_pages(image, order);
+               break;
+       case KEXEC_TYPE_CRASH:
+               pages = kimage_alloc_crash_control_pages(image, order);
+               break;
+       }
+
+       return pages;
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+       if (*image->entry != 0)
+               image->entry++;
+
+       if (image->entry == image->last_entry) {
+               kimage_entry_t *ind_page;
+               struct page *page;
+
+               page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+               if (!page)
+                       return -ENOMEM;
+
+               ind_page = page_address(page);
+               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+               image->entry = ind_page;
+               image->last_entry = ind_page +
+                                     ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+       }
+       *image->entry = entry;
+       image->entry++;
+       *image->entry = 0;
+
+       return 0;
+}
+
+static int kimage_set_destination(struct kimage *image,
+                                  unsigned long destination)
+{
+       int result;
+
+       destination &= PAGE_MASK;
+       result = kimage_add_entry(image, destination | IND_DESTINATION);
+       if (result == 0)
+               image->destination = destination;
+
+       return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+       int result;
+
+       page &= PAGE_MASK;
+       result = kimage_add_entry(image, page | IND_SOURCE);
+       if (result == 0)
+               image->destination += PAGE_SIZE;
+
+       return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+       /* Walk through and free any extra destination pages I may have */
+       kimage_free_page_list(&image->dest_pages);
+
+       /* Walk through and free any unuseable pages I have cached */
+       kimage_free_page_list(&image->unuseable_pages);
+}
+
+static int kimage_terminate(struct kimage *image)
+{
+       if (*image->entry != 0)
+               image->entry++;
+
+       *image->entry = IND_DONE;
+
+       return 0;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+       for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+               ptr = (entry & IND_INDIRECTION)? \
+                       phys_to_virt((entry & PAGE_MASK)): ptr +1)
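+
+/* An example list as traversed by the macro above, with d a
+ * destination address and s1..s3 source pages:
+ *
+ *   first page: d|IND_DESTINATION, s1|IND_SOURCE, s2|IND_SOURCE,
+ *               next|IND_INDIRECTION
+ *   next page:  s3|IND_SOURCE, IND_DONE
+ *
+ * The macro yields every entry, including the IND_INDIRECTION links
+ * themselves, follows those links into the next page, and stops when
+ * it reaches IND_DONE.
+ */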
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+       struct page *page;
+
+       page = pfn_to_page(entry >> PAGE_SHIFT);
+       kimage_free_pages(page);
+}
+
+static void kimage_free(struct kimage *image)
+{
+       kimage_entry_t *ptr, entry;
+       kimage_entry_t ind = 0;
+
+       if (!image)
+               return;
+
+       kimage_free_extra_pages(image);
+       for_each_kimage_entry(image, ptr, entry) {
+               if (entry & IND_INDIRECTION) {
+                       /* Free the previous indirection page */
+                       if (ind & IND_INDIRECTION)
+                               kimage_free_entry(ind);
+                       /* Save this indirection page until we are
+                        * done with it.
+                        */
+                       ind = entry;
+               }
+               else if (entry & IND_SOURCE)
+                       kimage_free_entry(entry);
+       }
+       /* Free the final indirection page */
+       if (ind & IND_INDIRECTION)
+               kimage_free_entry(ind);
+
+       /* Handle any machine specific cleanup */
+       machine_kexec_cleanup(image);
+
+       /* Free the kexec control pages... */
+       kimage_free_page_list(&image->control_pages);
+       kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+                                       unsigned long page)
+{
+       kimage_entry_t *ptr, entry;
+       unsigned long destination = 0;
+
+       for_each_kimage_entry(image, ptr, entry) {
+               if (entry & IND_DESTINATION)
+                       destination = entry & PAGE_MASK;
+               else if (entry & IND_SOURCE) {
+                       if (page == destination)
+                               return ptr;
+                       destination += PAGE_SIZE;
+               }
+       }
+
+       return NULL;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image,
+                                       gfp_t gfp_mask,
+                                       unsigned long destination)
+{
+       /*
+        * Here we implement safeguards to ensure that a source page
+        * is not copied to its destination page before the data on
+        * the destination page is no longer useful.
+        *
+        * To do this we maintain the invariant that a source page is
+        * either its own destination page, or it is not a
+        * destination page at all.
+        *
+        * That is slightly stronger than required, but the proof
+        * that no problems will occur is trivial, and the
+        * implementation is simple to verify.
+        *
+        * When allocating all pages normally this algorithm will run
+        * in O(N) time, but in the worst case it will run in O(N^2)
+        * time.   If the runtime is a problem the data structures can
+        * be fixed.
+        */
+       struct page *page;
+       unsigned long addr;
+
+       /*
+        * Walk through the list of destination pages, and see if I
+        * have a match.
+        */
+       list_for_each_entry(page, &image->dest_pages, lru) {
+               addr = page_to_pfn(page) << PAGE_SHIFT;
+               if (addr == destination) {
+                       list_del(&page->lru);
+                       return page;
+               }
+       }
+       page = NULL;
+       while (1) {
+               kimage_entry_t *old;
+
+               /* Allocate a page; if we run out of memory, give up */
+               page = kimage_alloc_pages(gfp_mask, 0);
+               if (!page)
+                       return NULL;
+               /* If the page cannot be used, file it away */
+               if (page_to_pfn(page) >
+                               (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+                       list_add(&page->lru, &image->unuseable_pages);
+                       continue;
+               }
+               addr = page_to_pfn(page) << PAGE_SHIFT;
+
+               /* If it is the destination page we want, use it */
+               if (addr == destination)
+                       break;
+
+               /* If the page is not a destination page, use it */
+               if (!kimage_is_destination_range(image, addr,
+                                                 addr + PAGE_SIZE))
+                       break;
+
+               /*
+                * I know that the page is someone's destination page.
+                * See if there is already a source page for this
+                * destination page, and if so swap the source pages.
+                */
+               old = kimage_dst_used(image, addr);
+               if (old) {
+                       /* If so move it */
+                       unsigned long old_addr;
+                       struct page *old_page;
+
+                       old_addr = *old & PAGE_MASK;
+                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+                       copy_highpage(page, old_page);
+                       *old = addr | (*old & ~PAGE_MASK);
+
+                       /* The old page I have found cannot be a
+                        * destination page, so return it.
+                        */
+                       addr = old_addr;
+                       page = old_page;
+                       break;
+               }
+               else {
+                       /* Place the page on the destination list; I
+                        * will use it later.
+                        */
+                       list_add(&page->lru, &image->dest_pages);
+               }
+       }
+
+       return page;
+}
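
The subtle case above is the swap: the freshly allocated page turns out to
be some other entry's destination, so the data that belongs there is
already staged in a different source page.  Copying that staged data home
and handing the displaced page back to the caller preserves the invariant.
A toy rendering, with integers standing in for page frames (all names are
inventions of the sketch):

#include <stdio.h>

struct ex_entry { int dst, src; };	/* data for dst is staged in src */

int main(void)
{
	struct ex_entry e = { .dst = 5, .src = 9 };
	int fresh = 5;		/* new allocation collides with e.dst */
	int returned;

	/* Copy the staged data into its real home and repoint the
	 * entry; the displaced page goes to the caller.  Page 9 is
	 * nobody's destination, so the invariant still holds. */
	printf("copy page %d -> page %d\n", e.src, fresh);
	returned = e.src;
	e.src = fresh;

	printf("entry: dst=%d src=%d, caller gets page %d\n",
	       e.dst, e.src, returned);
	return 0;
}
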
+
+static int kimage_load_normal_segment(struct kimage *image,
+                                        struct kexec_segment *segment)
+{
+       unsigned long maddr;
+       unsigned long ubytes, mbytes;
+       int result;
+       unsigned char __user *buf;
+
+       result = 0;
+       buf = segment->buf;
+       ubytes = segment->bufsz;
+       mbytes = segment->memsz;
+       maddr = segment->mem;
+
+       result = kimage_set_destination(image, maddr);
+       if (result < 0)
+               goto out;
+
+       while (mbytes) {
+               struct page *page;
+               char *ptr;
+               size_t uchunk, mchunk;
+
+               page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+               if (!page) {
+                       result  = -ENOMEM;
+                       goto out;
+               }
+               result = kimage_add_page(image, page_to_pfn(page)
+                                                               << PAGE_SHIFT);
+               if (result < 0)
+                       goto out;
+
+               ptr = kmap(page);
+               /* Start with a clear page */
+               memset(ptr, 0, PAGE_SIZE);
+               ptr += maddr & ~PAGE_MASK;
+               mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
+               if (mchunk > mbytes)
+                       mchunk = mbytes;
+
+               uchunk = mchunk;
+               if (uchunk > ubytes)
+                       uchunk = ubytes;
+
+               result = copy_from_user(ptr, buf, uchunk);
+               kunmap(page);
+               if (result) {
+                       result = (result < 0) ? result : -EIO;
+                       goto out;
+               }
+               ubytes -= uchunk;
+               maddr  += mchunk;
+               buf    += mchunk;
+               mbytes -= mchunk;
+       }
+out:
+       return result;
+}
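
The per-page arithmetic above is worth seeing with numbers: mchunk is the
run from maddr to the end of its page, uchunk the part of that run still
backed by user data, and the memset of each fresh page supplies the zero
fill once ubytes is exhausted.  A standalone sketch (EX_* names and the
sample values are invented):

#include <stdio.h>

#define EX_PAGE_SIZE 4096UL
#define EX_PAGE_MASK (~(EX_PAGE_SIZE - 1))

int main(void)
{
	unsigned long maddr  = 0x100ff0; /* destination, mid-page  */
	unsigned long mbytes = 8192;	 /* memory left to fill    */
	unsigned long ubytes = 20;	 /* user data left to copy */
	unsigned long mchunk, uchunk;

	mchunk = EX_PAGE_SIZE - (maddr & ~EX_PAGE_MASK);
	if (mchunk > mbytes)
		mchunk = mbytes;
	uchunk = mchunk;
	if (uchunk > ubytes)
		uchunk = ubytes;

	/* Prints mchunk=16 uchunk=16: 16 bytes to the page boundary,
	 * all backed by user data; the next pass copies the last 4
	 * user bytes and leaves the rest of that page zeroed. */
	printf("mchunk=%lu uchunk=%lu\n", mchunk, uchunk);
	return 0;
}
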
+
+static int kimage_load_crash_segment(struct kimage *image,
+                                       struct kexec_segment *segment)
+{
+       /* For crash dump kernels we simply copy the data from
+        * user space to its destination.
+        * We do things a page at a time for the sake of kmap.
+        */
+       unsigned long maddr;
+       unsigned long ubytes, mbytes;
+       int result;
+       unsigned char __user *buf;
+
+       result = 0;
+       buf = segment->buf;
+       ubytes = segment->bufsz;
+       mbytes = segment->memsz;
+       maddr = segment->mem;
+       while (mbytes) {
+               struct page *page;
+               char *ptr;
+               size_t uchunk, mchunk;
+
+               page = pfn_to_page(maddr >> PAGE_SHIFT);
+               if (!page) {
+                       result  = -ENOMEM;
+                       goto out;
+               }
+               ptr = kmap(page);
+               ptr += maddr & ~PAGE_MASK;
+               mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
+               if (mchunk > mbytes)
+                       mchunk = mbytes;
+
+               uchunk = mchunk;
+               if (uchunk > ubytes) {
+                       uchunk = ubytes;
+                       /* Zero the trailing part of the page */
+                       memset(ptr + uchunk, 0, mchunk - uchunk);
+               }
+               result = copy_from_user(ptr, buf, uchunk);
+               kunmap(page);
+               if (result) {
+                       result = (result < 0) ? result : -EIO;
+                       goto out;
+               }
+               ubytes -= uchunk;
+               maddr  += mchunk;
+               buf    += mchunk;
+               mbytes -= mchunk;
+       }
+out:
+       return result;
+}
+
+static int kimage_load_segment(struct kimage *image,
+                               struct kexec_segment *segment)
+{
+       int result = -ENOMEM;
+
+       switch (image->type) {
+       case KEXEC_TYPE_DEFAULT:
+               result = kimage_load_normal_segment(image, segment);
+               break;
+       case KEXEC_TYPE_CRASH:
+               result = kimage_load_crash_segment(image, segment);
+               break;
+       }
+
+       return result;
+}
+
+/*
+ * Exec Kernel system call: for obvious reasons only root may call it.
+ *
+ * This call breaks up into three pieces.
+ * - A generic part which loads the new kernel from the current
+ *   address space, and very carefully places the data in the
+ *   allocated pages.
+ *
+ * - A generic part that interacts with the kernel and tells all of
+ *   the devices to shut down, preventing on-going DMA and placing
+ *   the devices in a consistent state so a later kernel can
+ *   reinitialize them.
+ *
+ * - A machine specific part that includes the syscall number and
+ *   copies the image to its final destination, then jumps into the
+ *   image at the entry point.
+ *
+ * kexec does not sync or unmount filesystems, so if you need that
+ * to happen you must do it yourself.
+ */
+struct kimage *kexec_image = NULL;
+static struct kimage *kexec_crash_image = NULL;
+/*
+ * A home-grown binary mutex.
+ * Nothing can wait, so this mutex is safe to use
+ * in interrupt context :)
+ */
+static int kexec_lock = 0;
+
+asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
+                               struct kexec_segment __user *segments,
+                               unsigned long flags)
+{
+       struct kimage **dest_image, *image;
+       int locked;
+       int result;
+
+       /* We only trust the superuser with rebooting the system. */
+       if (!capable(CAP_SYS_BOOT))
+               return -EPERM;
+
+       /*
+        * Verify we have a legal set of flags
+        * This leaves us room for future extensions.
+        */
+       if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
+               return -EINVAL;
+
+       /* Verify we are on the appropriate architecture */
+       if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
+               ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
+               return -EINVAL;
+
+       /* Put an artificial cap on the number
+        * of segments passed to kexec_load.
+        */
+       if (nr_segments > KEXEC_SEGMENT_MAX)
+               return -EINVAL;
+
+       image = NULL;
+       result = 0;
+
+       /* Because we write directly to the reserved memory
+        * region when loading crash kernels we need a mutex here to
+        * prevent multiple crash kernels from attempting to load
+        * simultaneously, and to prevent a crash kernel from loading
+        * over the top of an in-use crash kernel.
+        *
+        * KISS: always take the mutex.
+        */
+       locked = xchg(&kexec_lock, 1);
+       if (locked)
+               return -EBUSY;
+
+       dest_image = &kexec_image;
+       if (flags & KEXEC_ON_CRASH)
+               dest_image = &kexec_crash_image;
+       if (nr_segments > 0) {
+               unsigned long i;
+
+               /* Loading another kernel to reboot into */
+               if ((flags & KEXEC_ON_CRASH) == 0)
+                       result = kimage_normal_alloc(&image, entry,
+                                                       nr_segments, segments);
+               /* Loading another kernel to switch to if this one crashes */
+               else {
+                       /* Free any current crash dump kernel before
+                        * we corrupt it.
+                        */
+                       kimage_free(xchg(&kexec_crash_image, NULL));
+                       result = kimage_crash_alloc(&image, entry,
+                                                    nr_segments, segments);
+               }
+               if (result)
+                       goto out;
+
+               result = machine_kexec_prepare(image);
+               if (result)
+                       goto out;
+
+               for (i = 0; i < nr_segments; i++) {
+                       result = kimage_load_segment(image, &image->segment[i]);
+                       if (result)
+                               goto out;
+               }
+               result = kimage_terminate(image);
+               if (result)
+                       goto out;
+       }
+       /* Install the new kernel and uninstall the old */
+       image = xchg(dest_image, image);
+
+out:
+       xchg(&kexec_lock, 0); /* Release the mutex */
+       kimage_free(image);
+
+       return result;
+}
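
The flags check at the top of sys_kexec_load() masks the architecture
bits out and insists that whatever remains is a known flag.  A sketch of
the same test (the EX_* values mirror the i386 definitions used by this
tree, but are restated here as assumptions of the example):

#include <stdio.h>

#define EX_KEXEC_ON_CRASH	0x00000001
#define EX_KEXEC_ARCH_MASK	0xffff0000
#define EX_KEXEC_FLAGS		(EX_KEXEC_ON_CRASH)

static int flags_ok(unsigned long flags)
{
	return (flags & EX_KEXEC_FLAGS) == (flags & ~EX_KEXEC_ARCH_MASK);
}

int main(void)
{
	printf("%d\n", flags_ok(EX_KEXEC_ON_CRASH)); /* 1: known flag  */
	printf("%d\n", flags_ok(0x00000002));        /* 0: unknown bit */
	printf("%d\n", flags_ok(0x00120000));        /* 1: arch bits only */
	return 0;
}
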
+
+#ifdef CONFIG_COMPAT
+asmlinkage long compat_sys_kexec_load(unsigned long entry,
+                               unsigned long nr_segments,
+                               struct compat_kexec_segment __user *segments,
+                               unsigned long flags)
+{
+       struct compat_kexec_segment in;
+       struct kexec_segment out, __user *ksegments;
+       unsigned long i, result;
+
+       /* Don't allow clients that don't understand the native
+        * architecture to do anything.
+        */
+       if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
+               return -EINVAL;
+
+       if (nr_segments > KEXEC_SEGMENT_MAX)
+               return -EINVAL;
+
+       ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
+       for (i = 0; i < nr_segments; i++) {
+               result = copy_from_user(&in, &segments[i], sizeof(in));
+               if (result)
+                       return -EFAULT;
+
+               out.buf   = compat_ptr(in.buf);
+               out.bufsz = in.bufsz;
+               out.mem   = in.mem;
+               out.memsz = in.memsz;
+
+               result = copy_to_user(&ksegments[i], &out, sizeof(out));
+               if (result)
+                       return -EFAULT;
+       }
+
+       return sys_kexec_load(entry, nr_segments, ksegments, flags);
+}
+#endif
+
+void crash_kexec(struct pt_regs *regs)
+{
+       struct kimage *image;
+       int locked;
+
+       /* Take the kexec_lock here to prevent sys_kexec_load
+        * running on one cpu from replacing the crash kernel
+        * we are using after a panic on a different cpu.
+        *
+        * If the crash kernel was not located in a fixed area
+        * of memory the xchg(&kexec_crash_image) would be
+        * sufficient.  But since I reuse the memory...
+        */
+       locked = xchg(&kexec_lock, 1);
+       if (!locked) {
+               image = xchg(&kexec_crash_image, NULL);
+               if (image) {
+                       struct pt_regs fixed_regs;
+                       crash_setup_regs(&fixed_regs, regs);
+                       machine_crash_shutdown(&fixed_regs);
+                       machine_kexec(image);
+               }
+               xchg(&kexec_lock, 0);
+       }
+}
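
Both sys_kexec_load() and crash_kexec() take kexec_lock with a bare
xchg(): a test-and-set trylock that never sleeps, which is what makes it
usable from a panicking CPU.  A user-space equivalent built on C11
atomics (the ex_* names are invented for the sketch):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int ex_lock;

static int ex_trylock(void)
{
	/* atomic_exchange() returns the previous value, so 0 means
	 * the lock was free and we now hold it. */
	return atomic_exchange(&ex_lock, 1);
}

static void ex_unlock(void)
{
	atomic_store(&ex_lock, 0);
}

int main(void)
{
	if (!ex_trylock()) {
		puts("lock taken; critical section runs");
		ex_unlock();
	} else {
		puts("busy; a syscall would return -EBUSY");
	}
	return 0;
}
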
+
+static int __init crash_notes_memory_init(void)
+{
+       /* Allocate memory for saving cpu registers. */
+       crash_notes = alloc_percpu(note_buf_t);
+       if (!crash_notes) {
+               printk("Kexec: Memory allocation for saving cpu register"
+               " states failed\n");
+               return -ENOMEM;
+       }
+       return 0;
+}
+module_init(crash_notes_memory_init)

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog