diff -Npru kexec-kernel-only/arch/i386/kernel/time-xen.c kexec-kernel-only_20120522/arch/i386/kernel/time-xen.c --- kexec-kernel-only/arch/i386/kernel/time-xen.c 2012-01-25 14:15:45.000000000 +0100 +++ kexec-kernel-only_20120522/arch/i386/kernel/time-xen.c 2012-05-21 13:05:16.000000000 +0200 @@ -1072,10 +1072,11 @@ int local_setup_timer(unsigned int cpu) return 0; } +#endif +#if defined(CONFIG_SMP) || defined(CONFIG_KEXEC) void local_teardown_timer(unsigned int cpu) { - BUG_ON(cpu == 0); unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL); } #endif diff -Npru kexec-kernel-only/arch/i386/mm/hypervisor.c kexec-kernel-only_20120522/arch/i386/mm/hypervisor.c --- kexec-kernel-only/arch/i386/mm/hypervisor.c 2012-01-25 14:15:45.000000000 +0100 +++ kexec-kernel-only_20120522/arch/i386/mm/hypervisor.c 2012-02-22 16:20:31.000000000 +0100 @@ -392,6 +392,7 @@ void xen_destroy_contiguous_region(unsig balloon_unlock(flags); } +EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); #ifdef __i386__ int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b) diff -Npru kexec-kernel-only/arch/x86_64/Kconfig kexec-kernel-only_20120522/arch/x86_64/Kconfig --- kexec-kernel-only/arch/x86_64/Kconfig 2012-01-25 14:15:38.000000000 +0100 +++ kexec-kernel-only_20120522/arch/x86_64/Kconfig 2012-05-22 12:59:53.000000000 +0200 @@ -589,6 +589,12 @@ config CRASH_DUMP help Generate crash dump after being started by kexec. +config PHYSICAL_START + hex "Physical address where the kernel is loaded" if CRASH_DUMP + default "0x200000" + ---help--- + This gives the physical address where the kernel is loaded. + config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS diff -Npru kexec-kernel-only/arch/x86_64/kernel/crash.c kexec-kernel-only_20120522/arch/x86_64/kernel/crash.c --- kexec-kernel-only/arch/x86_64/kernel/crash.c 2012-01-25 14:15:33.000000000 +0100 +++ kexec-kernel-only_20120522/arch/x86_64/kernel/crash.c 2012-05-20 16:53:52.000000000 +0200 @@ -231,6 +231,9 @@ void machine_crash_shutdown(struct pt_re printk(KERN_CRIT "CFG = %x\n", cfg); pci_write_config_dword(mcp55_rewrite, 0x74, cfg); } +#else + if (!is_initial_xendomain()) + xen_pv_kexec_smp_send_stop(); #endif /* CONFIG_XEN */ crash_save_self(regs); } diff -Npru kexec-kernel-only/arch/x86_64/kernel/crash_dump.c kexec-kernel-only_20120522/arch/x86_64/kernel/crash_dump.c --- kexec-kernel-only/arch/x86_64/kernel/crash_dump.c 2006-09-20 05:42:06.000000000 +0200 +++ kexec-kernel-only_20120522/arch/x86_64/kernel/crash_dump.c 2012-05-21 18:43:13.000000000 +0200 @@ -7,9 +7,74 @@ #include #include +#include +#include +#include #include #include +#include +#include + +#ifdef CONFIG_XEN +static void *map_oldmem_page(unsigned long pfn) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + struct vm_struct *area; + + area = get_vm_area(PAGE_SIZE, VM_IOREMAP); + + if (!area) + return NULL; + + pgd = pgd_offset_k((unsigned long)area->addr); + + pud = pud_alloc(&init_mm, pgd, (unsigned long)area->addr); + + if (!pud) + goto err; + + pmd = pmd_alloc(&init_mm, pud, (unsigned long)area->addr); + + if (!pmd) + goto err; + + pte = pte_alloc_kernel(pmd, (unsigned long)area->addr); + + if (!pte) + goto err; + + if (HYPERVISOR_update_va_mapping((unsigned long)area->addr, + pfn_pte_ma(pfn_to_mfn(pfn), + PAGE_KERNEL_RO), 0)) + goto err; + + return area->addr; + +err: + vunmap(area->addr); + + return NULL; +} + +static void unmap_oldmem_page(void *ptr) +{ + vunmap(ptr); +} +#else +static void *map_oldmem_page(unsigned long pfn) +{ + return 
ioremap(PFN_PHYS(pfn), PAGE_SIZE); +} + +static void unmap_oldmem_page(void *ptr) +{ + iounmap(ptr); +} +#endif /* CONFIG_XEN */ /** * copy_oldmem_page - copy one page from "oldmem" @@ -32,16 +97,29 @@ ssize_t copy_oldmem_page(unsigned long p if (!csize) return 0; - vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); +#ifdef CONFIG_XEN + if (!phys_to_machine_mapping_valid(pfn)) { + memset(buf, 0, csize); + return csize; + } +#endif + + vaddr = map_oldmem_page(pfn); + + if (!vaddr) { + memset(buf, 0, csize); + return csize; + } if (userbuf) { if (copy_to_user(buf, (vaddr + offset), csize)) { - iounmap(vaddr); + unmap_oldmem_page(vaddr); return -EFAULT; } } else - memcpy(buf, (vaddr + offset), csize); + memcpy(buf, (vaddr + offset), csize); + + unmap_oldmem_page(vaddr); - iounmap(vaddr); return csize; } diff -Npru kexec-kernel-only/arch/x86_64/kernel/e820-xen.c kexec-kernel-only_20120522/arch/x86_64/kernel/e820-xen.c --- kexec-kernel-only/arch/x86_64/kernel/e820-xen.c 2012-01-25 14:15:30.000000000 +0100 +++ kexec-kernel-only_20120522/arch/x86_64/kernel/e820-xen.c 2012-05-21 19:17:52.000000000 +0200 @@ -125,6 +125,7 @@ e820_any_mapped(unsigned long start, uns } return 0; } +EXPORT_SYMBOL_GPL(e820_any_mapped); /* * This function checks if the entire range is mapped with type. @@ -315,10 +316,10 @@ void __init e820_reserve_resources(struc * so we try it repeatedly and let the resource manager * test it. */ -#ifndef CONFIG_XEN - request_resource(res, &code_resource); - request_resource(res, &data_resource); -#endif + if (!is_initial_xendomain()) { + request_resource(res, &code_resource); + request_resource(res, &data_resource); + } #ifdef CONFIG_KEXEC if (crashk_res.start != crashk_res.end) request_resource(res, &crashk_res); diff -Npru kexec-kernel-only/arch/x86_64/kernel/head-xen.S kexec-kernel-only_20120522/arch/x86_64/kernel/head-xen.S --- kexec-kernel-only/arch/x86_64/kernel/head-xen.S 2012-01-25 14:15:04.000000000 +0100 +++ kexec-kernel-only_20120522/arch/x86_64/kernel/head-xen.S 2012-05-22 13:01:35.000000000 +0200 @@ -89,7 +89,7 @@ NEXT_PAGE(hypercall_page) .data - .align 16 + .align PAGE_SIZE .globl cpu_gdt_descr cpu_gdt_descr: .word gdt_end-cpu_gdt_table-1 @@ -166,7 +166,7 @@ ENTRY(empty_zero_page) .ascii ",ELF_PADDR_OFFSET=0x" utoh __START_KERNEL_map .ascii ",VIRT_ENTRY=0x" - utoh (__START_KERNEL_map + 0x200000 + VIRT_ENTRY_OFFSET) + utoh (__START_KERNEL_map + CONFIG_PHYSICAL_START + VIRT_ENTRY_OFFSET) .ascii ",HYPERCALL_PAGE=0x" utoh (phys_hypercall_page >> PAGE_SHIFT) .ascii ",FEATURES=writable_page_tables" diff -Npru kexec-kernel-only/arch/x86_64/kernel/machine_kexec.c kexec-kernel-only_20120522/arch/x86_64/kernel/machine_kexec.c --- kexec-kernel-only/arch/x86_64/kernel/machine_kexec.c 2012-01-25 14:15:17.000000000 +0100 +++ kexec-kernel-only_20120522/arch/x86_64/kernel/machine_kexec.c 2012-05-22 14:37:25.000000000 +0200 @@ -1,9 +1,27 @@ /* - * machine_kexec.c - handle transition of Linux booting another kernel * Copyright (C) 2002-2005 Eric Biederman + * Copyright (c) 2011-2012 Acunu Limited * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. + * kexec/kdump implementation for Xen domU guests was written by Daniel Kiper. + * + * Some ideas are taken from: + * - native kexec/kdump implementation, + * - kexec/kdump implementation for Xen Linux Kernel Ver. 2.6.18, + * - PV-GRUB. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include @@ -11,20 +29,16 @@ #include #include #include +#include +#include + +#include + #include #include #include #include -#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) -static u64 kexec_pgd[512] PAGE_ALIGNED; -static u64 kexec_pud0[512] PAGE_ALIGNED; -static u64 kexec_pmd0[512] PAGE_ALIGNED; -static u64 kexec_pte0[512] PAGE_ALIGNED; -static u64 kexec_pud1[512] PAGE_ALIGNED; -static u64 kexec_pmd1[512] PAGE_ALIGNED; -static u64 kexec_pte1[512] PAGE_ALIGNED; - #ifdef CONFIG_XEN /* In the case of Xen, override hypervisor functions to be able to create @@ -34,17 +48,37 @@ static u64 kexec_pte1[512] PAGE_ALIGNED; #include #include +#define x__pte(x) ((pte_t) { (x) } ) #define x__pmd(x) ((pmd_t) { (x) } ) #define x__pud(x) ((pud_t) { (x) } ) #define x__pgd(x) ((pgd_t) { (x) } ) +#define x_pte_val(x) ((x).pte) #define x_pmd_val(x) ((x).pmd) #define x_pud_val(x) ((x).pud) #define x_pgd_val(x) ((x).pgd) +static inline void x_set_pte(pte_t *dst, pte_t val) +{ + x_pte_val(*dst) = phys_to_machine(x_pte_val(val)); +} + +static inline void x_pte_clear(pte_t *pte) +{ + x_pte_val(*pte) = 0; +} + static inline void x_set_pmd(pmd_t *dst, pmd_t val) { - x_pmd_val(*dst) = x_pmd_val(val); + if (is_initial_xendomain()) + x_pmd_val(*dst) = x_pmd_val(val); + else + x_pmd_val(*dst) = phys_to_machine(x_pmd_val(val)); +} + +static inline void x_pmd_clear(pmd_t *pmd) +{ + x_pmd_val(*pmd) = 0; } static inline void x_set_pud(pud_t *dst, pud_t val) @@ -67,11 +101,12 @@ static inline void x_pgd_clear (pgd_t * x_pgd_val(*pgd) = 0; } +#define X__PAGE_KERNEL_EXEC \ + _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED #define X__PAGE_KERNEL_LARGE_EXEC \ _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE #define X_KERNPG_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY - -#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT) +#define X_KERNPG_TABLE_RO _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_DIRTY #if PAGES_NR > KEXEC_XEN_NO_PAGES #error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break @@ -81,28 +116,322 @@ static inline void x_pgd_clear (pgd_t * #error PA_CONTROL_PAGE is non zero - Xen support will break #endif +#define UPDATE_VA_MAPPING_BATCH 8 + +#define M2P_UPDATES_SIZE 4 + +typedef int (*update_pgprot_t)(int cpu, int flush, pgd_t *pgd, unsigned long paddr, pgprot_t pgprot); + +/* We need this to fix xenstore and console mapping. 
*/ +static struct mmu_update m2p_updates[M2P_UPDATES_SIZE]; + +static DEFINE_PER_CPU(multicall_entry_t[UPDATE_VA_MAPPING_BATCH], pb_mcl); + +static void remap_page(pgd_t *pgd, unsigned long paddr, unsigned long maddr) +{ + pmd_t *pmd; + pud_t *pud; + pte_t *pte; + + pud = __va(pgd_val(pgd[pgd_index(paddr)]) & PHYSICAL_PAGE_MASK); + pmd = __va(pud_val(pud[pud_index(paddr)]) & PHYSICAL_PAGE_MASK); + pte = __va(pmd_val(pmd[pmd_index(paddr)]) & PHYSICAL_PAGE_MASK); + pte = &pte[pte_index(paddr)]; + + x_set_pte(pte, x__pte(machine_to_phys(maddr) | + (x_pte_val(*pte) & ~PHYSICAL_PAGE_MASK))); +} + +static int native_page_set_prot(int cpu, int flush, pgd_t *pgd, unsigned long paddr, pgprot_t pgprot) +{ + pmd_t *pmd; + pud_t *pud; + pte_t *pte; + + pud = __va(pgd_val(pgd[pgd_index(paddr)]) & PHYSICAL_PAGE_MASK); + pmd = __va(pud_val(pud[pud_index(paddr)]) & PHYSICAL_PAGE_MASK); + pte = __va(pmd_val(pmd[pmd_index(paddr)]) & PHYSICAL_PAGE_MASK); + pte = &pte[pte_index(paddr)]; + + x_set_pte(pte, x__pte(paddr | pgprot_val(pgprot))); + + return 0; +} + +static int xen_page_set_prot(int cpu, int flush, pgd_t *pgd, unsigned long paddr, pgprot_t pgprot) +{ + int result = 0; + static int seq = 0; + + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq++, (unsigned long)__va(paddr), + pfn_pte(PFN_DOWN(paddr), pgprot), UVMF_INVLPG | UVMF_ALL); + + if (unlikely(seq == UPDATE_VA_MAPPING_BATCH || flush)) { + result = HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), seq, NULL); + seq = 0; + if (unlikely(result)) + pr_info("kexec: %s: HYPERVISOR_multicall_check() failed: %i\n", + __func__, result); + } + + return result; +} + +static int pgtable_walk(pgd_t *pgds, pgd_t *pgdd, + update_pgprot_t update_pgprot, + pgprot_t pgprot) +{ + int cpu, i, j, k, result; + pmd_t *pmd; + pud_t *pud; + unsigned long paddr; + + cpu = get_cpu(); + + for (i = 0; i < PTRS_PER_PGD; ++i) { + /* Skip Xen mappings. */ + if (i == ROOT_PAGETABLE_FIRST_XEN_SLOT) + i += ROOT_PAGETABLE_XEN_SLOTS; + + if (pgd_none(pgds[i])) + continue; + + paddr = pgd_val(pgds[i]) & PHYSICAL_PAGE_MASK; + pud = __va(paddr); + + result = (*update_pgprot)(cpu, 0, pgdd, paddr, pgprot); + + if (result) + goto err; + + for (j = 0; j < PTRS_PER_PUD; ++j) { + if (pud_none(pud[j])) + continue; + + paddr = pud_val(pud[j]) & PHYSICAL_PAGE_MASK; + pmd = __va(paddr); + + result = (*update_pgprot)(cpu, 0, pgdd, paddr, pgprot); + + if (result) + goto err; + + for (k = 0; k < PTRS_PER_PMD; ++k) { + if (pmd_none(pmd[k])) + continue; + + paddr = pmd_val(pmd[k]) & PHYSICAL_PAGE_MASK; + + result = (*update_pgprot)(cpu, 0, pgdd, paddr, pgprot); + + if (result) + goto err; + } + } + } + + result = (*update_pgprot)(cpu, 1, pgdd, __pa(pgds), pgprot); + +err: + put_cpu(); + + return result; +} + +static int init_transition_pgtable(struct kimage *image) +{ + int result; + pgd_t *pgd; + pmd_t *pmd; + pud_t *pud; + pte_t *pte; + struct mmuext_op pin_op; + unsigned long addr; + + /* Map control page at its virtual address. */ + addr = (unsigned long)page_address(image->control_code_page) + PAGE_SIZE; + + pgd = (pgd_t *)&image->pgd[pgd_index(addr)]; + x_set_pgd(pgd, x__pgd(__pa(image->pud0) | X_KERNPG_TABLE)); + + pud = (pud_t *)&image->pud0[pud_index(addr)]; + x_set_pud(pud, x__pud(__pa(image->pmd0) | X_KERNPG_TABLE)); + + pmd = (pmd_t *)&image->pmd0[pmd_index(addr)]; + x_set_pmd(pmd, x__pmd(__pa(image->pte0) | X_KERNPG_TABLE)); + + pte = (pte_t *)&image->pte0[pte_index(addr)]; + x_set_pte(pte, x__pte(__pa(addr) | X__PAGE_KERNEL_EXEC)); + + /* Map control page at its physical address. 
*/
+	addr = __pa(addr);
+
+	pgd = (pgd_t *)&image->pgd[pgd_index(addr)];
+	x_set_pgd(pgd, x__pgd(__pa(image->pud1) | X_KERNPG_TABLE));
+
+	pud = (pud_t *)&image->pud1[pud_index(addr)];
+	x_set_pud(pud, x__pud(__pa(image->pmd1) | X_KERNPG_TABLE));
+
+	pmd = (pmd_t *)&image->pmd1[pmd_index(addr)];
+	x_set_pmd(pmd, x__pmd(__pa(image->pte1) | X_KERNPG_TABLE));
+
+	pte = (pte_t *)&image->pte1[pte_index(addr)];
+	x_set_pte(pte, x__pte(addr | X__PAGE_KERNEL_EXEC));
+
+	result = pgtable_walk((pgd_t *)image->pgd, NULL,
+			      xen_page_set_prot, PAGE_KERNEL_RO);
+
+	if (result)
+		return result;
+
+	pin_op.cmd = MMUEXT_PIN_L4_TABLE;
+	pin_op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(image->pgd)));
+
+	result = HYPERVISOR_mmuext_op(&pin_op, 1, NULL, DOMID_SELF);
+
+	if (result)
+		pr_info("kexec: %s: HYPERVISOR_mmuext_op() failed: %i\n",
+			__func__, result);
+
+	return result;
+}
+
+static int destroy_transition_pgtable(struct kimage *image)
+{
+	int result;
+	struct mmuext_op unpin_op;
+
+	unpin_op.cmd = MMUEXT_UNPIN_TABLE;
+	unpin_op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(image->pgd)));
+
+	result = HYPERVISOR_mmuext_op(&unpin_op, 1, NULL, DOMID_SELF);
+
+	if (result) {
+		pr_info("kexec: %s: HYPERVISOR_mmuext_op() failed: %i\n",
+			__func__, result);
+		return result;
+	}
+
+	return pgtable_walk((pgd_t *)image->pgd, NULL,
+			    xen_page_set_prot, PAGE_KERNEL);
+}
+
+static int is_new_start_info(start_info_t *new_start_info)
+{
+	/* Is it the new start info? */
+	if (memcmp(new_start_info->magic, xen_start_info->magic,
+		   sizeof(new_start_info->magic)))
+		return 0;
+
+	/* It looks like the new start info, but double-check it... */
+	if (new_start_info->store_mfn != xen_start_info->store_mfn)
+		return 0;
+
+	if (new_start_info->console.domU.mfn != xen_start_info->console.domU.mfn)
+		return 0;
+
+	/*
+	 * Here we are almost sure that
+	 * we have found the new start info.
+	 */
+	return 1;
+}
+
+/*
+ * Magic pages are placed right after the start info.
+ * This assumption is made by the xen-pv loader
+ * in kexec-tools.
+ */
+
+static unsigned long find_magic_pages(struct kimage *image)
+{
+	unsigned long i, segment_start;
+
+	for (i = image->nr_segments - 1; i; --i) {
+		segment_start = image->segment[i].mem;
+
+		if (!is_new_start_info(__va(segment_start)))
+			continue;
+
+		return segment_start + PAGE_SIZE;
+	}
+
+	return 0;
+}
+
+/*
+ * Remap the xenstore and console pages (in this order).
+ * This function depends on assumptions made
+ * by the xen-pv loader in kexec-tools.
+ */
+
+static void remap_magic_pages(struct kimage *image, pgd_t *pgd)
+{
+	unsigned long magic_paddr;
+
+	memset(m2p_updates, 0, sizeof(m2p_updates));
+
+	magic_paddr = find_magic_pages(image);
+
+	if (!magic_paddr)
+		return;
+
+	/* Remap xenstore page. */
+	remap_page(pgd, magic_paddr, PFN_PHYS(xen_start_info->store_mfn));
+	remap_page(pgd, PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn)),
+		   phys_to_machine(magic_paddr));
+
+	m2p_updates[0].ptr = PFN_PHYS(xen_start_info->store_mfn);
+	m2p_updates[0].ptr |= MMU_MACHPHYS_UPDATE;
+	m2p_updates[0].val = PFN_DOWN(magic_paddr);
+
+	m2p_updates[1].ptr = phys_to_machine(magic_paddr);
+	m2p_updates[1].ptr |= MMU_MACHPHYS_UPDATE;
+	m2p_updates[1].val = mfn_to_pfn(xen_start_info->store_mfn);
+
+	magic_paddr += PAGE_SIZE;
+
+	/* Remap console page.
*/ + remap_page(pgd, magic_paddr, PFN_PHYS(xen_start_info->console.domU.mfn)); + remap_page(pgd, PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn)), + phys_to_machine(magic_paddr)); + + m2p_updates[2].ptr = PFN_PHYS(xen_start_info->console.domU.mfn); + m2p_updates[2].ptr |= MMU_MACHPHYS_UPDATE; + m2p_updates[2].val = PFN_DOWN(magic_paddr); + + m2p_updates[3].ptr = phys_to_machine(magic_paddr); + m2p_updates[3].ptr |= MMU_MACHPHYS_UPDATE; + m2p_updates[3].val = mfn_to_pfn(xen_start_info->console.domU.mfn); +} + void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) { void *control_page; void *table_page; + table_page = page_address(image->control_code_page); + + if (!is_initial_xendomain()) { + remap_magic_pages(image, table_page); + return; + } + memset(xki->page_list, 0, sizeof(xki->page_list)); control_page = page_address(image->control_code_page) + PAGE_SIZE; memcpy(control_page, relocate_kernel, PAGE_SIZE); - table_page = page_address(image->control_code_page); - - xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); - xki->page_list[PA_TABLE_PAGE] = __ma(table_page); + xki->page_list[PA_CONTROL_PAGE] = virt_to_machine(control_page); + xki->page_list[PA_TABLE_PAGE] = virt_to_machine(table_page); - xki->page_list[PA_PGD] = __ma(kexec_pgd); - xki->page_list[PA_PUD_0] = __ma(kexec_pud0); - xki->page_list[PA_PUD_1] = __ma(kexec_pud1); - xki->page_list[PA_PMD_0] = __ma(kexec_pmd0); - xki->page_list[PA_PMD_1] = __ma(kexec_pmd1); - xki->page_list[PA_PTE_0] = __ma(kexec_pte0); - xki->page_list[PA_PTE_1] = __ma(kexec_pte1); + xki->page_list[PA_PGD] = virt_to_machine(image->pgd); + xki->page_list[PA_PUD_0] = virt_to_machine(image->pud0); + xki->page_list[PA_PUD_1] = virt_to_machine(image->pud1); + xki->page_list[PA_PMD_0] = virt_to_machine(image->pmd0); + xki->page_list[PA_PMD_1] = virt_to_machine(image->pmd1); + xki->page_list[PA_PTE_0] = virt_to_machine(image->pte0); + xki->page_list[PA_PTE_1] = virt_to_machine(image->pte1); } #else /* CONFIG_XEN */ @@ -123,16 +452,60 @@ void machine_kexec_setup_load_arg(xen_ke #endif /* CONFIG_XEN */ -static void init_level2_page(pmd_t *level2p, unsigned long addr) +#ifdef CONFIG_XEN +static void init_level1_page(pte_t *level1p, unsigned long addr) { unsigned long end_addr; addr &= PAGE_MASK; + end_addr = addr + PMD_SIZE; + while (addr < end_addr) { + x_set_pte(level1p++, x__pte(addr | X__PAGE_KERNEL_EXEC)); + addr += PAGE_SIZE; + } +} +#endif + +static int init_level2_page(struct kimage *image, pmd_t *level2p, + unsigned long addr, unsigned long last_addr) +{ + unsigned long end_addr; + int result = 0; + + addr &= PAGE_MASK; end_addr = addr + PUD_SIZE; + + if (is_initial_xendomain()) { + while (addr < end_addr) { + x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC)); + addr += PMD_SIZE; + } + return 0; + } + +#ifdef CONFIG_XEN + while ((addr < last_addr) && (addr < end_addr)) { + struct page *page; + pte_t *level1p; + + page = kimage_alloc_control_pages(image, 0); + if (!page) { + result = -ENOMEM; + goto out; + } + level1p = (pte_t *)page_address(page); + init_level1_page(level1p, addr); + x_set_pmd(level2p++, x__pmd(__pa(level1p) | X_KERNPG_TABLE)); + addr += PMD_SIZE; + } + /* clear the unused entries */ while (addr < end_addr) { - x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC)); + x_pmd_clear(level2p++); addr += PMD_SIZE; } +out: + return result; +#endif } static int init_level3_page(struct kimage *image, pud_t *level3p, @@ -154,7 +527,7 @@ static int init_level3_page(struct kimag goto out; } 
level2p = (pmd_t *)page_address(page); - init_level2_page(level2p, addr); + init_level2_page(image, level2p, addr, last_addr); x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE)); addr += PUD_SIZE; } @@ -167,7 +540,6 @@ out: return result; } - static int init_level4_page(struct kimage *image, pgd_t *level4p, unsigned long addr, unsigned long last_addr) { @@ -203,39 +575,112 @@ out: return result; } - -static int init_pgtable(struct kimage *image, unsigned long start_pgtable) +#ifdef CONFIG_XEN +static int init_pgtable(struct kimage *image, pgd_t *level4p) { - pgd_t *level4p; - unsigned long x_end_pfn = end_pfn; + int result; + unsigned long x_max_pfn; -#ifdef CONFIG_XEN - x_end_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); -#endif + if (is_initial_xendomain()) + x_max_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); + else { + result = init_transition_pgtable(image); - level4p = (pgd_t *)__va(start_pgtable); - return init_level4_page(image, level4p, 0, x_end_pfn << PAGE_SHIFT); -} + if (result) + return result; -int machine_kexec_prepare(struct kimage *image) -{ - unsigned long start_pgtable; - int result; + x_max_pfn = min(xen_start_info->nr_pages, max_pfn); + } - /* Calculate the offsets */ - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + result = init_level4_page(image, level4p, 0, PFN_PHYS(x_max_pfn)); - /* Setup the identity mapped 64bit page table */ - result = init_pgtable(image, start_pgtable); if (result) return result; + if (!is_initial_xendomain()) { + pgtable_walk(level4p, level4p, native_page_set_prot, + __pgprot(X_KERNPG_TABLE_RO)); + pgtable_walk((pgd_t *)image->pgd, level4p, native_page_set_prot, + __pgprot(X_KERNPG_TABLE_RO)); + } + return 0; } +#else +static int init_pgtable(struct kimage *image, pgd_t *level4p) +{ + /* Setup the identity mapped 64bit page table */ + return init_level4_page(image, level4p, 0, PFN_PHYS(max_pfn)); +} +#endif /* CONFIG_XEN */ + +static void free_transition_pgtable(struct kimage *image) +{ + free_page((unsigned long)image->pgd); + free_page((unsigned long)image->pud0); + free_page((unsigned long)image->pud1); + free_page((unsigned long)image->pmd0); + free_page((unsigned long)image->pmd1); + free_page((unsigned long)image->pte0); + free_page((unsigned long)image->pte1); +} + +int machine_kexec_prepare(struct kimage *image) +{ + image->pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); + + if (!image->pgd) + goto err; + + image->pud0 = (pud_t *)get_zeroed_page(GFP_KERNEL); + + if (!image->pud0) + goto err; + + image->pud1 = (pud_t *)get_zeroed_page(GFP_KERNEL); + + if (!image->pud1) + goto err; + + image->pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); + + if (!image->pmd0) + goto err; + + image->pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); + + if (!image->pmd1) + goto err; + + image->pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL); + + if (!image->pte0) + goto err; + + image->pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL); + + if (!image->pte1) + goto err; + + return init_pgtable(image, page_address(image->control_code_page)); + +err: + free_transition_pgtable(image); + + return -ENOMEM; +} void machine_kexec_cleanup(struct kimage *image) { - return; +#ifdef CONFIG_XEN + if (is_initial_xendomain()) + free_transition_pgtable(image); + else + if (!destroy_transition_pgtable(image)) + free_transition_pgtable(image); +#else + free_transition_pgtable(image); +#endif } void arch_crash_save_vmcoreinfo(void) @@ -267,20 +712,20 @@ NORET_TYPE void machine_kexec(struct kim page_list[PA_CONTROL_PAGE] = 
__pa(control_page);
 	page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
-	page_list[PA_PGD] = __pa_symbol(&kexec_pgd);
-	page_list[VA_PGD] = (unsigned long)kexec_pgd;
-	page_list[PA_PUD_0] = __pa_symbol(&kexec_pud0);
-	page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
-	page_list[PA_PMD_0] = __pa_symbol(&kexec_pmd0);
-	page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
-	page_list[PA_PTE_0] = __pa_symbol(&kexec_pte0);
-	page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
-	page_list[PA_PUD_1] = __pa_symbol(&kexec_pud1);
-	page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
-	page_list[PA_PMD_1] = __pa_symbol(&kexec_pmd1);
-	page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
-	page_list[PA_PTE_1] = __pa_symbol(&kexec_pte1);
-	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+	page_list[PA_PGD] = __pa_symbol(image->pgd);
+	page_list[VA_PGD] = (unsigned long)image->pgd;
+	page_list[PA_PUD_0] = __pa_symbol(image->pud0);
+	page_list[VA_PUD_0] = (unsigned long)image->pud0;
+	page_list[PA_PMD_0] = __pa_symbol(image->pmd0);
+	page_list[VA_PMD_0] = (unsigned long)image->pmd0;
+	page_list[PA_PTE_0] = __pa_symbol(image->pte0);
+	page_list[VA_PTE_0] = (unsigned long)image->pte0;
+	page_list[PA_PUD_1] = __pa_symbol(image->pud1);
+	page_list[VA_PUD_1] = (unsigned long)image->pud1;
+	page_list[PA_PMD_1] = __pa_symbol(image->pmd1);
+	page_list[VA_PMD_1] = (unsigned long)image->pmd1;
+	page_list[PA_PTE_1] = __pa_symbol(image->pte1);
+	page_list[VA_PTE_1] = (unsigned long)image->pte1;
 
 	page_list[PA_TABLE_PAGE] =
 		(unsigned long)__pa(page_address(image->control_code_page));
@@ -288,4 +733,124 @@ NORET_TYPE void machine_kexec(struct kim
 	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
 			image->start);
 }
+#else
+typedef NORET_TYPE void (*xen_pv_relocate_kernel_t)(unsigned long indirection_page,
+						    unsigned long page_list,
+						    unsigned long start_address,
+						    int num_cpus, int cpu) ATTRIB_NORET;
+
+extern void local_teardown_timer(unsigned int cpu);
+extern void __xen_smp_intr_exit(unsigned int cpu);
+
+#ifdef CONFIG_SMP
+static atomic_t control_page_ready = ATOMIC_INIT(0);
+static xen_pv_kexec_halt_t xpkh_relocated;
+
+xen_pv_kexec_halt_t get_relocated_xpkh(void)
+{
+	while (!atomic_read(&control_page_ready))
+		udelay(1000);
+
+	return xpkh_relocated;
+}
+#endif
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void xen_pv_machine_kexec(struct kimage *image)
+{
+#ifdef CONFIG_SMP
+	int i;
+#endif
+	pgd_t *pgd;
+	struct mmuext_op ldt_op = {
+		.cmd = MMUEXT_SET_LDT,
+		.arg1.linear_addr = 0,
+		.arg2.nr_ents = 0
+	};
+	struct page *next, *page;
+	unsigned long page_list[PAGES_NR];
+	void *table_page;
+	xen_pv_relocate_kernel_t control_page;
+
+	/* Interrupts aren't acceptable while we reboot. */
+	local_irq_disable();
+
+	table_page = page_address(image->control_code_page);
+	control_page = table_page + PAGE_SIZE;
+
+#ifdef CONFIG_SMP
+	xpkh_relocated = (xen_pv_kexec_halt_t)control_page;
+	xpkh_relocated += (void *)xen_pv_kexec_halt - (void *)xen_pv_relocate_kernel;
+#endif
+
+	page_list[PA_CONTROL_PAGE] = __pa(control_page);
+	page_list[PA_TABLE_PAGE] = virt_to_machine(table_page);
+	page_list[VA_PGD] = __pa_symbol(image->pgd);
+	page_list[PA_PGD] = virt_to_machine(image->pgd) | X__PAGE_KERNEL_EXEC;
+	page_list[VA_PUD_0] = __pa_symbol(image->pud0);
+	page_list[PA_PUD_0] = virt_to_machine(image->pud0) | X__PAGE_KERNEL_EXEC;
+	page_list[VA_PMD_0] = __pa_symbol(image->pmd0);
+	page_list[PA_PMD_0] = virt_to_machine(image->pmd0) | X__PAGE_KERNEL_EXEC;
+	page_list[VA_PTE_0] = __pa_symbol(image->pte0);
+	page_list[PA_PTE_0] = virt_to_machine(image->pte0) | X__PAGE_KERNEL_EXEC;
+	page_list[VA_PUD_1] = __pa_symbol(image->pud1);
+	page_list[PA_PUD_1] = virt_to_machine(image->pud1) | X__PAGE_KERNEL_EXEC;
+	page_list[VA_PMD_1] = __pa_symbol(image->pmd1);
+	page_list[PA_PMD_1] = virt_to_machine(image->pmd1) | X__PAGE_KERNEL_EXEC;
+	page_list[VA_PTE_1] = __pa_symbol(image->pte1);
+	page_list[PA_PTE_1] = virt_to_machine(image->pte1) | X__PAGE_KERNEL_EXEC;
+
+	memcpy(control_page, xen_pv_relocate_kernel, PAGE_SIZE);
+
+#ifdef CONFIG_SMP
+	wmb();
+
+	atomic_inc(&control_page_ready);
+#endif
+
+	/* Stop singleshot timer. */
+	if (HYPERVISOR_set_timer_op(0))
+		BUG();
+
+#ifdef CONFIG_SMP
+	for_each_present_cpu(i)
+		__xen_smp_intr_exit(i);
+#else
+	local_teardown_timer(smp_processor_id());
+#endif
+
+	/* Unpin all page tables. */
+	for (page = pgd_list; page; page = next) {
+		next = (struct page *)page->index;
+		pgd = ((struct mm_struct *)page->mapping)->pgd;
+		xen_pgd_unpin(__pa(pgd));
+		xen_pgd_unpin(__pa(__user_pgd(pgd)));
+	}
+
+	xen_pgd_unpin(__pa_symbol(init_level4_user_pgt));
+	xen_pgd_unpin(__pa(xen_start_info->pt_base));
+	xen_pgd_unpin(__pa(init_mm.pgd));
+
+	/* Move the NULL segment selector into the %ds and %es registers. */
+	asm volatile("movl %0, %%ds; movl %0, %%es" : : "r" (0));
+
+	/* Destroy GDT. */
+	if (HYPERVISOR_set_gdt(NULL, 0))
+		BUG();
+
+	/* Destroy LDT. */
+	if (HYPERVISOR_mmuext_op(&ldt_op, 1, NULL, DOMID_SELF))
+		BUG();
+
+	if (m2p_updates[0].ptr)
+		if (HYPERVISOR_mmu_update(m2p_updates, M2P_UPDATES_SIZE,
+					  NULL, DOMID_SELF))
+			BUG();
+
+	(*control_page)((unsigned long)image->head, (unsigned long)page_list,
+			image->start, num_present_cpus(), smp_processor_id());
+}
+#endif /* CONFIG_XEN */
diff -Npru kexec-kernel-only/arch/x86_64/kernel/relocate_kernel.S kexec-kernel-only_20120522/arch/x86_64/kernel/relocate_kernel.S
--- kexec-kernel-only/arch/x86_64/kernel/relocate_kernel.S	2012-01-25 14:15:10.000000000 +0100
+++ kexec-kernel-only_20120522/arch/x86_64/kernel/relocate_kernel.S	2012-05-21 14:23:42.000000000 +0200
@@ -14,6 +14,12 @@
  * Must be relocatable PIC code callable as a C function
  */
 
+#define DOMID_SELF 0x7ff0
+
+#define UVMF_INVLPG 2
+
+#define TRANSITION_PGTABLE_SIZE 7
+
 #define PTR(x) (x << 3)
 #define PAGE_ALIGNED (1 << PAGE_SHIFT)
 #define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
@@ -292,7 +298,7 @@ identity_mapped:
 	xorq	%rbp, %rbp
 	xorq	%r8, %r8
 	xorq	%r9, %r9
-	xorq	%r10, %r9
+	xorq	%r10, %r10
 	xorq	%r11, %r11
 	xorq	%r12, %r12
 	xorq	%r13, %r13
@@ -314,3 +320,379 @@ gdt_80:
 idt_80:
 	.word	0	/* limit */
 	.quad	0	/* base */
+
+#ifdef CONFIG_XEN
+	.globl xen_pv_relocate_kernel
+
+xen_pv_relocate_kernel:
+	/*
+	 * %rdi - indirection_page,
+	 * %rsi - page_list,
+	 * %rdx - start_address,
+	 * %ecx - num_cpus,
+	 * %r8d - cpu.
+	 */
+
+	/* We need these arguments later. Store them in a safe place. */
+	movq	%rdi, %r13
+	movq	%rdx, %r14
+	movl	%ecx, %r15d
+
+#ifdef CONFIG_SMP
+	/* Do not take into account our CPU. */
+	decl	%r15d
+
+0:
+	/* Is everybody at entry stage? */
+	cmpl	%r15d, xpkh_stage_cpus(%rip)
+	jne	0b
+
+	/* Reset stage counter. */
+	movl	$0, xpkh_stage_cpus(%rip)
+#endif
+
+	/* Store transition page table addresses in a safe place, too. */
+	leaq	transition_pgtable_uvm(%rip), %rax
+	movq	%rax, %rbx
+	addq	$0x10, %rax	/* *vaddr */
+	addq	$0x18, %rbx	/* *pte */
+
+	movq	PTR(VA_PGD)(%rsi), %rcx
+	movq	PTR(PA_PGD)(%rsi), %rdx
+	movq	%rcx, (%rax)
+	movq	%rdx, (%rbx)
+
+	addq	$0x40, %rax
+	addq	$0x40, %rbx
+
+	movq	PTR(VA_PUD_0)(%rsi), %rcx
+	movq	PTR(PA_PUD_0)(%rsi), %rdx
+	movq	%rcx, (%rax)
+	movq	%rdx, (%rbx)
+
+	addq	$0x40, %rax
+	addq	$0x40, %rbx
+
+	movq	PTR(VA_PMD_0)(%rsi), %rcx
+	movq	PTR(PA_PMD_0)(%rsi), %rdx
+	movq	%rcx, (%rax)
+	movq	%rdx, (%rbx)
+
+	addq	$0x40, %rax
+	addq	$0x40, %rbx
+
+	movq	PTR(VA_PTE_0)(%rsi), %rcx
+	movq	PTR(PA_PTE_0)(%rsi), %rdx
+	movq	%rcx, (%rax)
+	movq	%rdx, (%rbx)
+
+	addq	$0x40, %rax
+	addq	$0x40, %rbx
+
+	movq	PTR(VA_PUD_1)(%rsi), %rcx
+	movq	PTR(PA_PUD_1)(%rsi), %rdx
+	movq	%rcx, (%rax)
+	movq	%rdx, (%rbx)
+
+	addq	$0x40, %rax
+	addq	$0x40, %rbx
+
+	movq	PTR(VA_PMD_1)(%rsi), %rcx
+	movq	PTR(PA_PMD_1)(%rsi), %rdx
+	movq	%rcx, (%rax)
+	movq	%rdx, (%rbx)
+
+	addq	$0x40, %rax
+	addq	$0x40, %rbx
+
+	movq	PTR(VA_PTE_1)(%rsi), %rcx
+	movq	PTR(PA_PTE_1)(%rsi), %rdx
+	movq	%rcx, (%rax)
+	movq	%rdx, (%rbx)
+
+	/*
+	 * Get control page physical address now.
+	 * This is impossible after page table switch.
+	 */
+	movq	PTR(PA_CONTROL_PAGE)(%rsi), %rbp
+
+	/* Get identity page table MFN now too. */
+	movq	PTR(PA_TABLE_PAGE)(%rsi), %r12
+	shrq	$PAGE_SHIFT, %r12
+
+	/* Store transition page table MFN. */
+	movq	PTR(PA_PGD)(%rsi), %rax
+	shrq	$PAGE_SHIFT, %rax
+	movq	%rax, mmuext_new_baseptr(%rip)
+	movq	%rax, mmuext_new_user_baseptr(%rip)
+	movq	%rax, mmuext_unpin_table(%rip)
+
+	/* Switch to transition page table. */
+	leaq	mmuext_args(%rip), %rdi
+	movq	$2, %rsi
+	xorq	%rdx, %rdx
+	movq	$DOMID_SELF, %r10
+	movq	$__HYPERVISOR_mmuext_op, %rax
+	syscall
+	testq	%rax, %rax
+	jz	0f
+	ud2a
+
+0:
+	/* Go to control page physical address. */
+	leaq	(0f - xen_pv_relocate_kernel)(%rbp), %rax
+	jmpq	*%rax
+
+0:
+#ifdef CONFIG_SMP
+	sfence
+
+	/* Store control page physical address. */
+	movq	%rbp, cp_paddr(%rip)
+
+0:
+	/* Is everybody at transition stage? */
+	cmpl	%r15d, xpkh_stage_cpus(%rip)
+	jne	0b
+
+	/* Reset stage counter. */
+	movl	$0, xpkh_stage_cpus(%rip)
+#endif
+
+	/* Store identity page table MFN. */
+	movq	%r12, mmuext_new_baseptr(%rip)
+	movq	%r12, mmuext_new_user_baseptr(%rip)
+
+	/* Switch to identity page table. */
+	leaq	mmuext_args(%rip), %rdi
+	movq	$3, %rsi
+	xorq	%rdx, %rdx
+	movq	$DOMID_SELF, %r10
+	movq	$__HYPERVISOR_mmuext_op, %rax
+	syscall
+	testq	%rax, %rax
+	jz	0f
+	ud2a
+
+0:
+#ifdef CONFIG_SMP
+	sfence
+
+	/* Signal that we are at identity stage. */
+	lock incb xprk_stage_identity(%rip)
+
+0:
+	/* Is everybody at identity stage? */
+	cmpl	%r15d, xpkh_stage_cpus(%rip)
+	jne	0b
+#endif
+
+	/* Map transition page table pages with _PAGE_RW bit set. */
+	leaq	transition_pgtable_uvm(%rip), %rdi
+	movq	$TRANSITION_PGTABLE_SIZE, %rsi
+	movq	$__HYPERVISOR_multicall, %rax
+	syscall
+	testq	%rax, %rax
+	jz	0f
+	ud2a
+
+0:
+	/* Do the copies */
+	movq	%r13, %rcx	/* Put the page_list in %rcx */
+	xorq	%rdi, %rdi
+	xorq	%rsi, %rsi
+	jmp	1f
+
+0:	/* top, read another word for the indirection page */
+
+	movq	(%rbx), %rcx
+	addq	$8, %rbx
+1:
+	testq	$0x1, %rcx	/* is it a destination page? */
+	jz	2f
+	movq	%rcx, %rdi
+	andq	$0xfffffffffffff000, %rdi
+	jmp	0b
+2:
+	testq	$0x2, %rcx	/* is it an indirection page? */
+	jz	2f
+	movq	%rcx, %rbx
+	andq	$0xfffffffffffff000, %rbx
+	jmp	0b
+2:
+	testq	$0x4, %rcx	/* is it the done indicator? */
+	jz	2f
+	jmp	3f
+2:
+	testq	$0x8, %rcx	/* is it the source indicator? */
+	jz	0b		/* Ignore it otherwise */
+	movq	%rcx, %rsi	/* For every source page do a copy */
+	andq	$0xfffffffffffff000, %rsi
+
+	movq	$512, %rcx
+	rep ; movsq
+	jmp	0b
+
+3:
+#ifdef CONFIG_SMP
+	sfence
+
+	/* Store purgatory() physical address. */
+	movq	%r14, %rax
+	movq	%r14, purgatory_paddr(%rip)
+#endif
+
+	/* Store current CPU number. */
+	movl	%r8d, %r14d
+
+	/* Set unused registers to known values. */
+	xorq	%rbx, %rbx
+	xorq	%rcx, %rcx
+	xorq	%rdx, %rdx
+	xorq	%rsi, %rsi
+	xorq	%rdi, %rdi
+	xorq	%rbp, %rbp
+	xorq	%r8, %r8
+	xorq	%r9, %r9
+	xorq	%r10, %r10
+	xorq	%r11, %r11
+	xorq	%r12, %r12
+	xorq	%r13, %r13
+
+	jmpq	*%rax
+
+#ifdef CONFIG_SMP
+	.globl xen_pv_kexec_halt
+
+xen_pv_kexec_halt:
+	/* %edi - cpu. */
+
+	/* Store current CPU number. */
+	movl	%edi, %r14d
+
+	/* Signal that we are at entry stage. */
+	lock incl xpkh_stage_cpus(%rip)
+
+0:
+	/* Wait for control page physical address. */
+	cmpq	$0, cp_paddr(%rip)
+	jz	0b
+
+	lfence
+
+	movq	cp_paddr(%rip), %rbp
+	movq	cp_paddr(%rip), %r15
+
+	/* Switch to transition page table. */
+	leaq	mmuext_args(%rip), %rdi
+	movq	$2, %rsi
+	xorq	%rdx, %rdx
+	movq	$DOMID_SELF, %r10
+	movq	$__HYPERVISOR_mmuext_op, %rax
+	syscall
+	testq	%rax, %rax
+	jz	0f
+	ud2a
+
+0:
+	/* Go to control page physical address. */
+	leaq	(0f - xen_pv_relocate_kernel)(%rbp), %rax
+	jmpq	*%rax
+
+0:
+	/* Signal that we are at transition stage. */
+	lock incl xpkh_stage_cpus(%rip)
+
+0:
+	/* Is xen_pv_relocate_kernel() at identity stage? */
+	cmpb	$0, xprk_stage_identity(%rip)
+	jz	0b
+
+	lfence
+
+	/* Switch to identity page table.
*/ + leaq mmuext_args(%rip), %rdi + movq $2, %rsi + xorq %rdx, %rdx + movq $DOMID_SELF, %r10 + movq $__HYPERVISOR_mmuext_op, %rax + syscall + testq %rax, %rax + jz 0f + ud2a + +0: + /* Signal that we are at identity stage. */ + lock incl xpkh_stage_cpus(%rip) + +0: + /* Wait for purgatory() physical address. */ + cmpq $0, purgatory_paddr(%rip) + jz 0b + + lfence + + movq purgatory_paddr(%rip), %rbx + + /* Set unused registers to known values. */ + xorq %rax, %rax + xorq %rcx, %rcx + xorq %rdx, %rdx + xorq %rsi, %rsi + xorq %rdi, %rdi + xorq %rbp, %rbp + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + xorq %r12, %r12 + xorq %r13, %r13 + xorq %r15, %r15 + + jmpq *%rbx + + .align 8 + +cp_paddr: + .quad 0 /* Control page physical address. */ + +purgatory_paddr: + .quad 0 /* purgatory() physical address. */ + +xpkh_stage_cpus: + .long 0 /* Number of CPUs at given stage in xen_pv_kexec_halt(). */ + +xprk_stage_identity: + .byte 0 /* xen_pv_relocate_kernel() is at identity stage. */ +#endif + +mmuext_args: + .long MMUEXT_NEW_BASEPTR /* Operation */ + .long 0 /* PAD */ + +mmuext_new_baseptr: + .quad 0 /* MFN of target page table directory */ + .quad 0 /* UNUSED */ + + .long MMUEXT_NEW_USER_BASEPTR /* Operation */ + .long 0 /* PAD */ + +mmuext_new_user_baseptr: + .quad 0 /* MFN of user target page table directory */ + .quad 0 /* UNUSED */ + + .long MMUEXT_UNPIN_TABLE /* Operation */ + .long 0 /* PAD */ + +mmuext_unpin_table: + .quad 0 /* MFN of old page table directory */ + .quad 0 /* UNUSED */ + +transition_pgtable_uvm: + .rept TRANSITION_PGTABLE_SIZE + .quad __HYPERVISOR_update_va_mapping + .fill 3, 8, 0 + .quad UVMF_INVLPG + .fill 3, 8, 0 + .endr +#endif diff -Npru kexec-kernel-only/arch/x86_64/kernel/setup-xen.c kexec-kernel-only_20120522/arch/x86_64/kernel/setup-xen.c --- kexec-kernel-only/arch/x86_64/kernel/setup-xen.c 2012-01-25 14:15:36.000000000 +0100 +++ kexec-kernel-only_20120522/arch/x86_64/kernel/setup-xen.c 2012-04-26 23:53:01.000000000 +0200 @@ -102,8 +102,10 @@ static struct notifier_block xen_panic_b unsigned long *phys_to_machine_mapping; unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512]; +unsigned long p2m_max_pfn; EXPORT_SYMBOL(phys_to_machine_mapping); +EXPORT_SYMBOL(p2m_max_pfn); DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]); DEFINE_PER_CPU(int, nr_multicall_ents); @@ -475,18 +477,21 @@ static __init void parse_cmdline_early ( * after a kernel panic. 
*/ else if (!memcmp(from, "crashkernel=", 12)) { -#ifndef CONFIG_XEN - unsigned long size, base; - size = memparse(from+12, &from); - if (*from == '@') { - base = memparse(from+1, &from); - crashk_res.start = base; - crashk_res.end = base + size - 1; +#ifdef CONFIG_XEN + if (is_initial_xendomain()) + printk("Ignoring crashkernel command line, " + "parameter will be supplied by xen\n"); + else +#endif + { + unsigned long size, base; + size = memparse(from+12, &from); + if (*from == '@') { + base = memparse(from+1, &from); + crashk_res.start = base; + crashk_res.end = base + size - 1; + } } -#else - printk("Ignoring crashkernel command line, " - "parameter will be supplied by xen\n"); -#endif } #endif @@ -785,22 +790,16 @@ void __init setup_arch(char **cmdline_p) #endif /* !CONFIG_XEN */ #ifdef CONFIG_KEXEC #ifdef CONFIG_XEN - xen_machine_kexec_setup_resources(); -#else - if ((crashk_res.start < crashk_res.end) && - (crashk_res.end <= (end_pfn << PAGE_SHIFT))) { - reserve_bootmem_generic(crashk_res.start, + if (is_initial_xendomain()) + xen_machine_kexec_setup_resources(); + else +#endif + { + if (crashk_res.start != crashk_res.end) + reserve_bootmem_generic(crashk_res.start, crashk_res.end - crashk_res.start + 1, BOOTMEM_EXCLUSIVE); } - else { - printk(KERN_ERR "Memory for crash kernel (0x%lx to 0x%lx) not" - "within permissible range\ndisabling kdump\n", - crashk_res.start, crashk_res.end); - crashk_res.end = 0; - crashk_res.start = 0; - } -#endif #endif paging_init(); @@ -814,10 +813,10 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_XEN { int i, j, k, fpp; - unsigned long p2m_pages; - p2m_pages = end_pfn; - if (xen_start_info->nr_pages > end_pfn) { + p2m_max_pfn = saved_max_pfn ? saved_max_pfn : end_pfn; + + if (xen_start_info->nr_pages > end_pfn && !saved_max_pfn) { /* * the end_pfn was shrunk (probably by mem= * kernel parameter); shrink reservation with the HV @@ -839,18 +838,17 @@ void __init setup_arch(char **cmdline_p) &reservation); BUG_ON (ret != difference); } - else if (end_pfn > xen_start_info->nr_pages) - p2m_pages = xen_start_info->nr_pages; if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* Make sure we have a large enough P->M table. */ phys_to_machine_mapping = alloc_bootmem_pages( - end_pfn * sizeof(unsigned long)); + p2m_max_pfn * sizeof(unsigned long)); memset(phys_to_machine_mapping, ~0, - end_pfn * sizeof(unsigned long)); + p2m_max_pfn * sizeof(unsigned long)); memcpy(phys_to_machine_mapping, (unsigned long *)xen_start_info->mfn_list, - p2m_pages * sizeof(unsigned long)); + min(xen_start_info->nr_pages, p2m_max_pfn) * + sizeof(unsigned long)); free_bootmem( __pa(xen_start_info->mfn_list), PFN_PHYS(PFN_UP(xen_start_info->nr_pages * @@ -938,21 +936,23 @@ void __init setup_arch(char **cmdline_p) * and also for regions reported as reserved by the e820. 
*/ probe_roms(); + #ifdef CONFIG_XEN + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, machine_e820.map); + if (is_initial_xendomain()) { - memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, machine_e820.map); - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) BUG(); - machine_e820.nr_map = memmap.nr_entries; + } else + if (HYPERVISOR_memory_op(XENMEM_memory_map, &memmap)) + BUG(); - e820_reserve_resources(machine_e820.map, machine_e820.nr_map); - } -#else - e820_reserve_resources(e820.map, e820.nr_map); + machine_e820.nr_map = memmap.nr_entries; #endif + e820_reserve_resources(e820.map, e820.nr_map); + request_resource(&iomem_resource, &video_ram_resource); { diff -Npru kexec-kernel-only/arch/x86_64/kernel/vmlinux.lds.S kexec-kernel-only_20120522/arch/x86_64/kernel/vmlinux.lds.S --- kexec-kernel-only/arch/x86_64/kernel/vmlinux.lds.S 2012-01-25 14:15:45.000000000 +0100 +++ kexec-kernel-only_20120522/arch/x86_64/kernel/vmlinux.lds.S 2012-05-22 13:02:24.000000000 +0200 @@ -24,7 +24,7 @@ SECTIONS { /* XEN x86_64 don't work with relocations yet quintela@xxxxxxxxxx */ #ifdef CONFIG_X86_64_XEN - . = __START_KERNEL_map + 0x200000; + . = __START_KERNEL_map + CONFIG_PHYSICAL_START; #else . = __START_KERNEL_map; #endif diff -Npru kexec-kernel-only/drivers/char/mem.c kexec-kernel-only_20120522/drivers/char/mem.c --- kexec-kernel-only/drivers/char/mem.c 2012-01-25 14:15:39.000000000 +0100 +++ kexec-kernel-only_20120522/drivers/char/mem.c 2012-04-27 00:09:57.000000000 +0200 @@ -322,7 +322,7 @@ static ssize_t read_oldmem(struct file * while (count) { pfn = *ppos / PAGE_SIZE; - if (pfn > saved_max_pfn) + if (pfn >= saved_max_pfn) return read; offset = (unsigned long)(*ppos % PAGE_SIZE); diff -Npru kexec-kernel-only/drivers/xen/core/evtchn.c kexec-kernel-only_20120522/drivers/xen/core/evtchn.c --- kexec-kernel-only/drivers/xen/core/evtchn.c 2012-01-25 14:15:38.000000000 +0100 +++ kexec-kernel-only_20120522/drivers/xen/core/evtchn.c 2012-05-21 13:34:24.000000000 +0200 @@ -510,6 +510,12 @@ int bind_ipi_to_irqhandler( } EXPORT_SYMBOL_GPL(bind_ipi_to_irqhandler); +void __unbind_from_irqhandler(unsigned int irq, void *dev_id) +{ + unbind_from_irq(irq); +} +EXPORT_SYMBOL_GPL(__unbind_from_irqhandler); + void unbind_from_irqhandler(unsigned int irq, void *dev_id) { free_irq(irq, dev_id); diff -Npru kexec-kernel-only/drivers/xen/core/machine_kexec.c kexec-kernel-only_20120522/drivers/xen/core/machine_kexec.c --- kexec-kernel-only/drivers/xen/core/machine_kexec.c 2012-01-25 14:15:23.000000000 +0100 +++ kexec-kernel-only_20120522/drivers/xen/core/machine_kexec.c 2012-05-22 14:52:00.000000000 +0200 @@ -1,16 +1,44 @@ /* - * drivers/xen/core/machine_kexec.c - * handle transition of Linux booting another kernel + * Copyright (c) 2011-2012 Acunu Limited + * + * kexec/kdump implementation for Xen domU guests was written by Daniel Kiper. + * + * Some ideas are taken from: + * - native kexec/kdump implementation, + * - kexec/kdump implementation for Xen Linux Kernel Ver. 2.6.18, + * - PV-GRUB. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
 #include
 #include
 #include
 #include
+#include
+#include
+#include
+#include
+#include
+
 #include
 
-extern void machine_kexec_setup_load_arg(xen_kexec_image_t *xki,
-					 struct kimage *image);
+extern char hypercall_page[PAGE_SIZE];
+
+static struct bin_attribute p2m_attr;
+static struct bin_attribute si_attr;
 
 int xen_max_nr_phys_cpus;
 struct resource xen_hypervisor_res;
@@ -19,6 +47,13 @@ struct resource *xen_phys_cpus;
 size_t vmcoreinfo_size_xen;
 unsigned long paddr_vmcoreinfo_xen;
 
+#ifdef CONFIG_SMP
+static atomic_t waiting_for_down;
+#endif
+
+extern void machine_kexec_setup_load_arg(xen_kexec_image_t *xki,
+					 struct kimage *image);
+
 void xen_machine_kexec_setup_resources(void)
 {
 	xen_kexec_range_t range;
@@ -124,6 +159,9 @@ void xen_machine_kexec_register_resource
 {
 	int k;
 
+	if (!is_initial_xendomain())
+		return;
+
 	request_resource(res, &xen_hypervisor_res);
 
 	for (k = 0; k < xen_max_nr_phys_cpus; k++)
@@ -152,6 +190,10 @@ int xen_machine_kexec_load(struct kimage
 	memset(&xkl, 0, sizeof(xkl));
 	xkl.type = image->type;
 	setup_load_arg(&xkl.image, image);
+
+	if (!is_initial_xendomain())
+		return 0;
+
 	return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &xkl);
 }
 
@@ -165,6 +207,9 @@ void xen_machine_kexec_unload(struct kim
 {
 	xen_kexec_load_t xkl;
 
+	if (!is_initial_xendomain())
+		return;
+
 	memset(&xkl, 0, sizeof(xkl));
 	xkl.type = image->type;
 	HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &xkl);
@@ -182,17 +227,246 @@ NORET_TYPE void machine_kexec(struct kim
 {
 	xen_kexec_exec_t xke;
 
-	memset(&xke, 0, sizeof(xke));
-	xke.type = image->type;
-	HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &xke);
-	panic("KEXEC_CMD_kexec hypercall should not return\n");
+	if (is_initial_xendomain()) {
+		memset(&xke, 0, sizeof(xke));
+		xke.type = image->type;
+		HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &xke);
+		panic("KEXEC_CMD_kexec hypercall should not return\n");
+	} else
+		xen_pv_machine_kexec(image);
 }
 
+#ifdef CONFIG_SMP
+static void xen_pv_kexec_stop_this_cpu(void *dummy)
+{
+	struct mmuext_op ldt_op = {
+		.cmd = MMUEXT_SET_LDT,
+		.arg1.linear_addr = 0,
+		.arg2.nr_ents = 0
+	};
+	xen_pv_kexec_halt_t xpkh_relocated;
+
+	/* Interrupts aren't acceptable while we reboot. */
+	local_irq_disable();
+
+	cpu_clear(smp_processor_id(), cpu_online_map);
+
+	/* Stop singleshot timer. */
+	if (HYPERVISOR_set_timer_op(0))
+		BUG();
+
+	/* Move the NULL segment selector into the %ds and %es registers. */
+	asm volatile("movl %0, %%ds; movl %0, %%es" : : "r" (0));
+
+	/* Destroy GDT. */
+	if (HYPERVISOR_set_gdt(NULL, 0))
+		BUG();
+
+	/* Destroy LDT. */
+	if (HYPERVISOR_mmuext_op(&ldt_op, 1, NULL, DOMID_SELF))
+		BUG();
+
+	atomic_dec(&waiting_for_down);
+
+	xpkh_relocated = get_relocated_xpkh();
+
+	(*xpkh_relocated)(smp_processor_id());
+}
+
+void xen_pv_kexec_smp_send_stop(void)
+{
+	atomic_set(&waiting_for_down, num_present_cpus() - 1);
+
+	smp_call_function(xen_pv_kexec_stop_this_cpu, NULL, 1, 0);
+
+	/* Wait until all CPUs are almost ready to enter the down state. */
+	while (atomic_read(&waiting_for_down))
+		udelay(1000);
+}
+
+void machine_shutdown(void)
+{
+	int reboot_cpu_id;
+
+	if (is_initial_xendomain())
+		return;
+
+	/* The boot cpu is always logical cpu 0. */
+	reboot_cpu_id = 0;
+
+	/* Make certain the cpu I'm rebooting on is online.
*/ + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) + reboot_cpu_id = smp_processor_id(); + + /* Make certain I only run on the appropriate processor. */ + set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); + + /* + * O.K. Now that I'm on the appropriate processor, + * stop all of the others. + */ + xen_pv_kexec_smp_send_stop(); +} +#else void machine_shutdown(void) { /* do nothing */ } +#endif /* CONFIG_SMP */ + +static ssize_t si_read(struct kobject *kobj, char *buf, loff_t off, size_t count) +{ + if (off >= si_attr.size) + return 0; + + count = min(si_attr.size - (size_t)off, count); + memcpy(buf, &((char *)xen_start_info)[off], count); + + return count; +} + +static int si_mmap(struct kobject *kobj, + struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + unsigned long off, size; + + if (vma->vm_flags & VM_SHARED) + return -EACCES; + + off = vma->vm_pgoff << PAGE_SHIFT; + size = vma->vm_end - vma->vm_start; + + if (off + size > PAGE_SIZE) + return -EINVAL; + + vma->vm_pgoff += PFN_DOWN(__pa(xen_start_info)); + vma->vm_flags &= ~VM_MAYSHARE; + + return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + size, vma->vm_page_prot); +} + +static ssize_t hypercall_page_read(struct kobject *kobj, char *buf, + loff_t off, size_t count) +{ + if (off >= PAGE_SIZE) + return 0; + + count = min(PAGE_SIZE - (size_t)off, count); + memcpy(buf, &hypercall_page[off], count); + + return count; +} + +static int hypercall_page_mmap(struct kobject *kobj, + struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + unsigned long size = vma->vm_end - vma->vm_start; + + if (vma->vm_flags & VM_SHARED) + return -EACCES; + + if (vma->vm_pgoff + size > PAGE_SIZE) + return -EINVAL; + + vma->vm_pgoff = PFN_DOWN(__pa_symbol(hypercall_page)); + vma->vm_flags &= ~VM_MAYSHARE; + + return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + size, vma->vm_page_prot); +} + +static ssize_t p2m_read(struct kobject *kobj, char *buf, loff_t off, size_t count) +{ + if (off >= p2m_attr.size) + return 0; + + count = min(p2m_attr.size - (size_t)off, count); + memcpy(buf, &((char *)phys_to_machine_mapping)[off], count); + + return count; +} + +static int p2m_mmap(struct kobject *kobj, + struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + unsigned long off, size; + + if (vma->vm_flags & VM_SHARED) + return -EACCES; + + off = vma->vm_pgoff << PAGE_SHIFT; + size = vma->vm_end - vma->vm_start; + + if (off + size > roundup(p2m_attr.size, PAGE_SIZE)) + return -EINVAL; + + vma->vm_pgoff += PFN_DOWN(__pa(phys_to_machine_mapping)); + vma->vm_flags &= ~VM_MAYSHARE; + + return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + size, vma->vm_page_prot); +} + +static struct bin_attribute si_attr = { + .attr = { + .name = "start_info", + .mode = S_IRUSR, + .owner = THIS_MODULE + }, + .size = sizeof(*xen_start_info), + .read = si_read, + .mmap = si_mmap +}; + +static struct bin_attribute hypercall_page_attr = { + .attr = { + .name = "hypercall_page", + .mode = S_IRUSR, + .owner = THIS_MODULE + }, + .size = PAGE_SIZE, + .read = hypercall_page_read, + .mmap = hypercall_page_mmap +}; + +static struct bin_attribute p2m_attr = { + .attr = { + .name = "p2m", + .mode = S_IRUSR, + .owner = THIS_MODULE + }, + .read = p2m_read, + .mmap = p2m_mmap +}; + +static int __init kexec_xen_sysfs_init(void) +{ + int rc; + + if (is_initial_xendomain()) + return 0; + + rc = sysfs_create_bin_file(&kernel_subsys.kset.kobj, &si_attr); + + if (rc) + return rc; + + rc = sysfs_create_bin_file(&kernel_subsys.kset.kobj, &hypercall_page_attr); 
+ + if (rc) + return rc; + + p2m_attr.size = min(xen_start_info->nr_pages, max_pfn); + p2m_attr.size *= sizeof(unsigned long); + + return sysfs_create_bin_file(&kernel_subsys.kset.kobj, &p2m_attr); +} +subsys_initcall(kexec_xen_sysfs_init); /* * Local variables: diff -Npru kexec-kernel-only/drivers/xen/core/smpboot.c kexec-kernel-only_20120522/drivers/xen/core/smpboot.c --- kexec-kernel-only/drivers/xen/core/smpboot.c 2012-01-25 14:15:23.000000000 +0100 +++ kexec-kernel-only_20120522/drivers/xen/core/smpboot.c 2012-05-21 13:51:38.000000000 +0200 @@ -148,6 +148,15 @@ static int xen_smp_intr_init(unsigned in return rc; } +#ifdef CONFIG_KEXEC +void __xen_smp_intr_exit(unsigned int cpu) +{ + local_teardown_timer(cpu); + unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); + __unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); +} +#endif + #ifdef CONFIG_HOTPLUG_CPU static void xen_smp_intr_exit(unsigned int cpu) { diff -Npru kexec-kernel-only/drivers/xen/xenbus/xenbus_comms.c kexec-kernel-only_20120522/drivers/xen/xenbus/xenbus_comms.c --- kexec-kernel-only/drivers/xen/xenbus/xenbus_comms.c 2012-01-25 14:15:19.000000000 +0100 +++ kexec-kernel-only_20120522/drivers/xen/xenbus/xenbus_comms.c 2012-02-10 22:06:16.000000000 +0100 @@ -188,8 +188,22 @@ int xb_read(void *data, unsigned len) /* Set up interrupt handler off store event channel. */ int xb_init_comms(void) { + struct xenstore_domain_interface *intf = xen_store_interface; int err; + if (intf->req_prod != intf->req_cons) + printk(KERN_ERR "XENBUS request ring is not quiescent " + "(%08x:%08x)!\n", intf->req_cons, intf->req_prod); + + if (intf->rsp_prod != intf->rsp_cons) { + printk(KERN_WARNING "XENBUS response ring is not quiescent " + "(%08x:%08x): fixing up\n", + intf->rsp_cons, intf->rsp_prod); + /* breaks kdump */ + if (!reset_devices) + intf->rsp_cons = intf->rsp_prod; + } + if (xenbus_irq) unbind_from_irqhandler(xenbus_irq, &xb_waitq); diff -Npru kexec-kernel-only/drivers/xen/xenbus/xenbus_probe.c kexec-kernel-only_20120522/drivers/xen/xenbus/xenbus_probe.c --- kexec-kernel-only/drivers/xen/xenbus/xenbus_probe.c 2012-01-25 14:15:32.000000000 +0100 +++ kexec-kernel-only_20120522/drivers/xen/xenbus/xenbus_probe.c 2012-02-10 21:17:54.000000000 +0100 @@ -1032,11 +1032,136 @@ void unregister_xenstore_notifier(struct } EXPORT_SYMBOL_GPL(unregister_xenstore_notifier); +#ifdef CONFIG_KEXEC +static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq); +static int backend_state; + +static void xenbus_reset_backend_state_changed(struct xenbus_watch *w, + const char **v, unsigned int l) +{ + if (xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i", &backend_state) != 1) + backend_state = XenbusStateUnknown; + printk(KERN_DEBUG "XENBUS: backend %s %s\n", + v[XS_WATCH_PATH], xenbus_strstate(backend_state)); + wake_up(&backend_state_wq); +} + +static void xenbus_reset_wait_for_backend(char *be, int expected) +{ + long timeout; + timeout = wait_event_interruptible_timeout(backend_state_wq, + backend_state == expected, 5 * HZ); + if (timeout <= 0) + printk(KERN_INFO "XENBUS: backend %s timed out.\n", be); +} + +/* + * Reset frontend if it is in Connected or Closed state. + * Wait for backend to catch up. + * State Connected happens during kdump, Closed after kexec. 
+ */ +static void xenbus_reset_frontend(char *fe, char *be, int be_state) +{ + struct xenbus_watch be_watch; + + printk(KERN_DEBUG "XENBUS: backend %s %s\n", + be, xenbus_strstate(be_state)); + + memset(&be_watch, 0, sizeof(be_watch)); + be_watch.node = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/state", be); + if (!be_watch.node) + return; + + be_watch.callback = xenbus_reset_backend_state_changed; + backend_state = XenbusStateUnknown; + + printk(KERN_INFO "XENBUS: triggering reconnect on %s\n", be); + register_xenbus_watch(&be_watch); + + /* fall through to forward backend to state XenbusStateInitialising */ + switch (be_state) { + case XenbusStateConnected: + xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosing); + xenbus_reset_wait_for_backend(be, XenbusStateClosing); + + case XenbusStateClosing: + xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosed); + xenbus_reset_wait_for_backend(be, XenbusStateClosed); + + case XenbusStateClosed: + xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateInitialising); + xenbus_reset_wait_for_backend(be, XenbusStateInitWait); + } + + unregister_xenbus_watch(&be_watch); + printk(KERN_INFO "XENBUS: reconnect done on %s\n", be); + kfree(be_watch.node); +} + +static void xenbus_check_frontend(char *class, char *dev) +{ + int be_state, fe_state, err; + char *backend, *frontend; + + frontend = kasprintf(GFP_NOIO | __GFP_HIGH, "device/%s/%s", class, dev); + if (!frontend) + return; + + err = xenbus_scanf(XBT_NIL, frontend, "state", "%i", &fe_state); + if (err != 1) + goto out; + + switch (fe_state) { + case XenbusStateConnected: + case XenbusStateClosed: + printk(KERN_DEBUG "XENBUS: frontend %s %s\n", + frontend, xenbus_strstate(fe_state)); + backend = xenbus_read(XBT_NIL, frontend, "backend", NULL); + if (!backend || IS_ERR(backend)) + goto out; + err = xenbus_scanf(XBT_NIL, backend, "state", "%i", &be_state); + if (err == 1) + xenbus_reset_frontend(frontend, backend, be_state); + kfree(backend); + break; + default: + break; + } +out: + kfree(frontend); +} + +static void xenbus_reset_state(void) +{ + char **devclass, **dev; + int devclass_n, dev_n; + int i, j; + + devclass = xenbus_directory(XBT_NIL, "device", "", &devclass_n); + if (IS_ERR(devclass)) + return; + + for (i = 0; i < devclass_n; i++) { + dev = xenbus_directory(XBT_NIL, "device", devclass[i], &dev_n); + if (IS_ERR(dev)) + continue; + for (j = 0; j < dev_n; j++) + xenbus_check_frontend(devclass[i], dev[j]); + kfree(dev); + } + kfree(devclass); +} +#endif void xenbus_probe(void *unused) { BUG_ON((xenstored_ready <= 0)); +#ifdef CONFIG_KEXEC + /* reset devices in Connected or Closed state */ + xenbus_reset_state(); +#endif + /* Enumerate devices in xenstore. 
diff -Npru kexec-kernel-only/drivers/xen/xenbus/xenbus_xs.c kexec-kernel-only_20120522/drivers/xen/xenbus/xenbus_xs.c
--- kexec-kernel-only/drivers/xen/xenbus/xenbus_xs.c	2012-01-25 14:15:39.000000000 +0100
+++ kexec-kernel-only_20120522/drivers/xen/xenbus/xenbus_xs.c	2012-02-11 19:04:11.000000000 +0100
@@ -591,6 +591,24 @@ static struct xenbus_watch *find_watch(c
 	return NULL;
 }
 
+static void xs_reset_watches(void)
+{
+#ifdef CONFIG_KEXEC
+	int err, supported = 0;
+
+	err = xenbus_scanf(XBT_NIL, "control",
+			"platform-feature-xs_reset_watches", "%d",
+			&supported);
+
+	if (err != 1 || !supported)
+		return;
+
+	err = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL));
+	if (err && err != -EEXIST)
+		printk(KERN_WARNING "xs_reset_watches failed: %d\n", err);
+#endif
+}
+
 /* Register callback to watch this node. */
 int register_xenbus_watch(struct xenbus_watch *watch)
 {
@@ -609,8 +627,37 @@ int register_xenbus_watch(struct xenbus_
 
 	err = xs_watch(watch->node, token);
 
-	/* Ignore errors due to multiple registration. */
-	if ((err != 0) && (err != -EEXIST)) {
+	/* This code fragment is far too generic!
+	 * Move it into the blkfront and netfront drivers. */
+	/* vbd vbd-51712: 17 adding watch on /local/domain/0/backend/vbd/181/51712/state
+	 * xenbus: failed to write error node for device/vbd/51712 (17 adding watch on /local/domain/0/backend/vbd/181/51712/state)
+	 * xenbus_probe: watch_otherend on device/vbd/51712 failed.
+	 * vbd: probe of vbd-51712 failed with error -17
+	 * vbd vbd-51728: 17 adding watch on /local/domain/0/backend/vbd/181/51728/state
+	 * xenbus: failed to write error node for device/vbd/51728 (17 adding watch on /local/domain/0/backend/vbd/181/51728/state)
+	 * xenbus_probe: watch_otherend on device/vbd/51728 failed.
+	 * vbd: probe of vbd-51728 failed with error -17
+	 * netfront: Initialising virtual ethernet driver.
+	 * vif vif-0: 17 adding watch on /local/domain/0/backend/vif/181/0/state
+	 * xenbus: failed to write error node for device/vif/0 (17 adding watch on /local/domain/0/backend/vif/181/0/state)
+	 * xenbus_probe: watch_otherend on device/vif/0 failed.
+	 * vif: probe of vif-0 failed with error -17
+	 *
+	 * a good starting point for analysis is xenbus_dev_probe() -> watch_otherend() */
+	if (err == -EEXIST) {
+		err = xs_unwatch(watch->node, token);
+
+		if (err)
+			printk(KERN_WARNING
+			       "XENBUS Failed to release watch %s: %i\n",
+			       watch->node, err);
+		else
+			err = xs_watch(watch->node, token);
+	}
+	/* This code fragment is far too generic!
+	 * Move it into the blkfront and netfront drivers. */
+
+	if (err) {
 		spin_lock(&watches_lock);
 		list_del(&watch->list);
 		spin_unlock(&watches_lock);
@@ -877,5 +924,8 @@ int xs_init(void)
 	if (IS_ERR(task))
 		return PTR_ERR(task);
 
+	/* shut down watches for kexec boot */
+	xs_reset_watches();
+
 	return 0;
 }
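
The -EEXIST branch added above recovers from watches that the pre-kexec kernel left registered in xenstored: the stale registration is dropped and the watch is installed again. The retry in isolation (xs_watch() and xs_unwatch() are the helpers this file already uses; sketch only -- per the comment above it arguably belongs in blkfront/netfront rather than here):

	/* Re-register a watch whose token already exists in xenstored,
	 * e.g. because it survived from the kernel we kexec'ed out of. */
	static int rewatch(const char *node, const char *token)
	{
		int err = xs_watch(node, token);

		if (err == -EEXIST) {
			err = xs_unwatch(node, token);	/* drop stale watch */
			if (!err)
				err = xs_watch(node, token);
		}
		return err;
	}
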
diff -Npru kexec-kernel-only/include/asm-x86_64/kexec.h kexec-kernel-only_20120522/include/asm-x86_64/kexec.h
--- kexec-kernel-only/include/asm-x86_64/kexec.h	2012-01-25 14:15:10.000000000 +0100
+++ kexec-kernel-only_20120522/include/asm-x86_64/kexec.h	2012-05-20 16:49:27.000000000 +0200
@@ -22,8 +22,10 @@
 
 #ifndef __ASSEMBLY__
 
+#include
 #include
 
+#include
 #include
 #include
 
@@ -91,17 +93,54 @@ relocate_kernel(unsigned long indirectio
 		unsigned long page_list,
 		unsigned long start_address) ATTRIB_NORET;
 
-/* Under Xen we need to work with machine addresses. These macros give the
+/* Under Xen we need to work with machine addresses. These functions give the
  * machine address of a certain page to the generic kexec code instead of
  * the pseudo physical address which would be given by the default macros.
  */
 
 #ifdef CONFIG_XEN
+
+typedef NORET_TYPE void (*xen_pv_kexec_halt_t)(int cpu) ATTRIB_NORET;
+
 #define KEXEC_ARCH_HAS_PAGE_MACROS
-#define kexec_page_to_pfn(page)   pfn_to_mfn(page_to_pfn(page))
-#define kexec_pfn_to_page(pfn)    pfn_to_page(mfn_to_pfn(pfn))
-#define kexec_virt_to_phys(addr)  virt_to_machine(addr)
-#define kexec_phys_to_virt(addr)  phys_to_virt(machine_to_phys(addr))
+
+static inline unsigned long kexec_page_to_pfn(struct page *page) {
+	if (is_initial_xendomain())
+		return pfn_to_mfn(page_to_pfn(page));
+	else
+		return page_to_pfn(page);
+}
+
+static inline struct page *kexec_pfn_to_page(unsigned long pfn) {
+	if (is_initial_xendomain())
+		return pfn_to_page(mfn_to_pfn(pfn));
+	else
+		return pfn_to_page(pfn);
+}
+
+static inline unsigned long kexec_virt_to_phys(void *addr) {
+	if (is_initial_xendomain())
+		return virt_to_machine(addr);
+	else
+		return virt_to_phys(addr);
+}
+
+static inline void *kexec_phys_to_virt(unsigned long addr) {
+	if (is_initial_xendomain())
+		return phys_to_virt(machine_to_phys(addr));
+	else
+		return phys_to_virt(addr);
+}
+
+extern xen_pv_kexec_halt_t get_relocated_xpkh(void);
+extern void xen_pv_kexec_smp_send_stop(void);
+
+extern NORET_TYPE void xen_pv_relocate_kernel(unsigned long indirection_page,
+					      unsigned long page_list,
+					      unsigned long start_address,
+					      int num_cpus, int cpu) ATTRIB_NORET;
+
+extern NORET_TYPE void xen_pv_kexec_halt(int cpu) ATTRIB_NORET;
 #endif
 
 #endif /* __ASSEMBLY__ */
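
The macro-to-function conversion above turns kexec's address translation into a run-time decision: dom0 hands machine frame numbers to the generic kexec code (the view the hardware needs), while a paravirtualized domU keeps pseudo-physical frames and leaves the final translation to xen_pv_relocate_kernel(). The property this relies on, as a sketch (pfn_to_mfn()/mfn_to_pfn() as defined in maddr.h; illustrative, not from the patch):

	/* p2m and m2p are inverse tables for frames the domain owns, so the
	 * dom0 round trip comes back to the original pfn; the domU path
	 * never converts, so the identity holds trivially there. */
	static inline int kexec_pfn_round_trip_ok(unsigned long pfn, int dom0)
	{
		unsigned long boot_pfn = dom0 ? pfn_to_mfn(pfn) : pfn;

		return (dom0 ? mfn_to_pfn(boot_pfn) : boot_pfn) == pfn;
	}
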
diff -Npru kexec-kernel-only/include/asm-x86_64/mach-xen/asm/maddr.h kexec-kernel-only_20120522/include/asm-x86_64/mach-xen/asm/maddr.h
--- kexec-kernel-only/include/asm-x86_64/mach-xen/asm/maddr.h	2012-01-25 14:15:19.000000000 +0100
+++ kexec-kernel-only_20120522/include/asm-x86_64/mach-xen/asm/maddr.h	2012-04-26 19:52:25.000000000 +0200
@@ -16,6 +16,7 @@ typedef unsigned long maddr_t;
 
 #ifdef CONFIG_XEN
 extern unsigned long *phys_to_machine_mapping;
+extern unsigned long p2m_max_pfn;
 #undef machine_to_phys_mapping
 extern unsigned long *machine_to_phys_mapping;
 
@@ -25,7 +26,7 @@ static inline unsigned long pfn_to_mfn(u
 {
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return pfn;
-	BUG_ON(end_pfn && pfn >= end_pfn);
+	BUG_ON(p2m_max_pfn && pfn >= p2m_max_pfn);
 	return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
 }
 
@@ -33,7 +34,7 @@ static inline int phys_to_machine_mappin
 {
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return 1;
-	BUG_ON(end_pfn && pfn >= end_pfn);
+	BUG_ON(p2m_max_pfn && pfn >= p2m_max_pfn);
 	return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
 }
 
@@ -45,7 +46,7 @@ static inline unsigned long mfn_to_pfn(u
 		return mfn;
 
 	if (unlikely((mfn >> machine_to_phys_order) != 0))
-		return end_pfn;
+		return p2m_max_pfn;
 
 	/* The array access can fail (e.g., device space beyond end of RAM). */
 	asm (
@@ -60,7 +61,7 @@ static inline unsigned long mfn_to_pfn(u
 	  "   .quad 1b,3b\n"
 	  ".previous"
 	  : "=r" (pfn)
-	  : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
+	  : "m" (machine_to_phys_mapping[mfn]), "m" (p2m_max_pfn) );
 
 	return pfn;
 }
@@ -88,16 +89,16 @@ static inline unsigned long mfn_to_pfn(u
 static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
 {
 	unsigned long pfn = mfn_to_pfn(mfn);
-	if ((pfn < end_pfn)
+	if ((pfn < p2m_max_pfn)
 	    && !xen_feature(XENFEAT_auto_translated_physmap)
 	    && (phys_to_machine_mapping[pfn] != mfn))
-		return end_pfn; /* force !pfn_valid() */
+		return p2m_max_pfn; /* force !pfn_valid() */
 	return pfn;
 }
 
 static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
-	BUG_ON(end_pfn && pfn >= end_pfn);
+	BUG_ON(p2m_max_pfn && pfn >= p2m_max_pfn);
 	if (xen_feature(XENFEAT_auto_translated_physmap)) {
 		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
 		return;
diff -Npru kexec-kernel-only/include/asm-x86_64/mach-xen/asm/page.h kexec-kernel-only_20120522/include/asm-x86_64/mach-xen/asm/page.h
--- kexec-kernel-only/include/asm-x86_64/mach-xen/asm/page.h	2012-01-25 14:15:19.000000000 +0100
+++ kexec-kernel-only_20120522/include/asm-x86_64/mach-xen/asm/page.h	2012-04-26 19:08:54.000000000 +0200
@@ -188,7 +188,7 @@ static inline pgd_t __pgd(unsigned long
 #define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
 
 #ifdef CONFIG_FLATMEM
-#define pfn_valid(pfn)		((pfn) < end_pfn)
+#define pfn_valid(pfn)		((pfn) < p2m_max_pfn)
 #endif
 
 #define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
diff -Npru kexec-kernel-only/include/asm-x86_64/mach-xen/asm/pgalloc.h kexec-kernel-only_20120522/include/asm-x86_64/mach-xen/asm/pgalloc.h
--- kexec-kernel-only/include/asm-x86_64/mach-xen/asm/pgalloc.h	2012-01-25 14:15:02.000000000 +0100
+++ kexec-kernel-only_20120522/include/asm-x86_64/mach-xen/asm/pgalloc.h	2012-02-02 09:27:14.000000000 +0100
@@ -110,10 +110,12 @@ static inline void pud_free(pud_t *pud)
 	free_page((unsigned long)pud);
 }
 
-static inline void pgd_list_add(pgd_t *pgd)
+static inline void pgd_list_add(pgd_t *pgd, void *mm)
 {
 	struct page *page = virt_to_page(pgd);
 
+	page->mapping = mm;
+
 	spin_lock(&pgd_lock);
 	page->index = (pgoff_t)pgd_list;
 	if (pgd_list)
@@ -134,6 +136,8 @@ static inline void pgd_list_del(pgd_t *p
 	if (next)
 		next->private = (unsigned long)pprev;
 	spin_unlock(&pgd_lock);
+
+	page->mapping = NULL;
 }
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
@@ -146,7 +150,7 @@ static inline pgd_t *pgd_alloc(struct mm
 	if (!pgd)
 		return NULL;
 
-	pgd_list_add(pgd);
+	pgd_list_add(pgd, mm);
 
 	/*
 	 * Copy kernel pointers in from init.
 	 * Could keep a freelist or slab cache of those because the kernel
@@ -171,6 +175,8 @@ static inline void pgd_free(pgd_t *pgd)
 {
 	pte_t *ptep = virt_to_ptep(pgd);
 
+	pgd_list_del(pgd);
+
 	if (!pte_write(*ptep)) {
 		xen_pgd_unpin(__pa(pgd));
 		BUG_ON(HYPERVISOR_update_va_mapping(
@@ -190,7 +196,6 @@ static inline void pgd_free(pgd_t *pgd)
 			0));
 	}
 
-	pgd_list_del(pgd);
 	free_pages((unsigned long)pgd, 1);
 }
diff -Npru kexec-kernel-only/include/linux/kexec.h kexec-kernel-only_20120522/include/linux/kexec.h
--- kexec-kernel-only/include/linux/kexec.h	2012-01-25 14:15:25.000000000 +0100
+++ kexec-kernel-only_20120522/include/linux/kexec.h	2012-05-22 13:08:23.000000000 +0200
@@ -96,6 +96,16 @@ struct kimage {
 	unsigned int type : 1;
 #define KEXEC_TYPE_DEFAULT 0
 #define KEXEC_TYPE_CRASH   1
+
+#ifdef CONFIG_X86_64
+	pgd_t *pgd;
+	pud_t *pud0;
+	pud_t *pud1;
+	pmd_t *pmd0;
+	pmd_t *pmd1;
+	pte_t *pte0;
+	pte_t *pte1;
+#endif
 };
 
 
@@ -109,6 +119,7 @@ extern int xen_machine_kexec_load(struct
 extern void xen_machine_kexec_unload(struct kimage *image);
 extern void xen_machine_kexec_setup_resources(void);
 extern void xen_machine_kexec_register_resources(struct resource *res);
+extern NORET_TYPE void xen_pv_machine_kexec(struct kimage *image) ATTRIB_NORET;
 #endif
 
 extern asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
diff -Npru kexec-kernel-only/include/xen/evtchn.h kexec-kernel-only_20120522/include/xen/evtchn.h
--- kexec-kernel-only/include/xen/evtchn.h	2012-01-25 14:15:26.000000000 +0100
+++ kexec-kernel-only_20120522/include/xen/evtchn.h	2012-05-21 13:35:58.000000000 +0200
@@ -82,10 +82,11 @@ extern int bind_ipi_to_irqhandler(
 	void *dev_id);
 
 /*
- * Common unbind function for all event sources. Takes IRQ to unbind from.
+ * Common unbind functions for all event sources. Each takes the IRQ to unbind from.
  * Automatically closes the underlying event channel (even for bindings
  * made with bind_evtchn_to_irqhandler()).
  */
+extern void __unbind_from_irqhandler(unsigned int irq, void *dev_id);
 extern void unbind_from_irqhandler(unsigned int irq, void *dev_id);
 
 extern void irq_resume(void);
diff -Npru kexec-kernel-only/include/xen/hypercall.h kexec-kernel-only_20120522/include/xen/hypercall.h
--- kexec-kernel-only/include/xen/hypercall.h	1970-01-01 01:00:00.000000000 +0100
+++ kexec-kernel-only_20120522/include/xen/hypercall.h	2011-10-22 18:12:49.000000000 +0200
@@ -0,0 +1,30 @@
+#ifndef __XEN_HYPERCALL_H__
+#define __XEN_HYPERCALL_H__
+
+#include
+
+static inline int __must_check
+HYPERVISOR_multicall_check(
+	multicall_entry_t *call_list, unsigned int nr_calls,
+	const unsigned long *rc_list)
+{
+	int rc = HYPERVISOR_multicall(call_list, nr_calls);
+
+	if (unlikely(rc < 0))
+		return rc;
+	BUG_ON(rc);
+	BUG_ON((int)nr_calls < 0);
+
+	for ( ; nr_calls > 0; --nr_calls, ++call_list)
+		if (unlikely(call_list->result != (rc_list ? *rc_list++ : 0)))
+			return nr_calls;
+
+	return 0;
+}
+
+/* A construct to ignore the return value of hypercall wrappers in a few
+ * exceptional cases (simply casting the function result to void doesn't
+ * avoid the compiler warning): */
+#define VOID(expr)	((void)((expr)?:0))
+
+#endif /* __XEN_HYPERCALL_H__ */
diff -Npru kexec-kernel-only/include/xen/interface/arch-x86_64.h kexec-kernel-only_20120522/include/xen/interface/arch-x86_64.h
--- kexec-kernel-only/include/xen/interface/arch-x86_64.h	2012-01-25 14:15:01.000000000 +0100
+++ kexec-kernel-only_20120522/include/xen/interface/arch-x86_64.h	2012-05-20 22:44:26.000000000 +0200
@@ -105,6 +105,11 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
 #define FLAT_USER_SS32 FLAT_RING3_SS32
 #define FLAT_USER_SS   FLAT_USER_SS64
 
+#define ROOT_PAGETABLE_FIRST_XEN_SLOT 256
+#define ROOT_PAGETABLE_LAST_XEN_SLOT  271
+#define ROOT_PAGETABLE_XEN_SLOTS \
+    (ROOT_PAGETABLE_LAST_XEN_SLOT - ROOT_PAGETABLE_FIRST_XEN_SLOT + 1)
+
 #define __HYPERVISOR_VIRT_START 0xFFFF800000000000
 #define __HYPERVISOR_VIRT_END   0xFFFF880000000000
 #define __MACH2PHYS_VIRT_START  0xFFFF800000000000
diff -Npru kexec-kernel-only/include/xen/interface/io/xs_wire.h kexec-kernel-only_20120522/include/xen/interface/io/xs_wire.h
--- kexec-kernel-only/include/xen/interface/io/xs_wire.h	2012-01-25 14:15:01.000000000 +0100
+++ kexec-kernel-only_20120522/include/xen/interface/io/xs_wire.h	2012-02-10 21:25:12.000000000 +0100
@@ -26,7 +26,11 @@ enum xsd_sockmsg_type
     XS_SET_PERMS,
     XS_WATCH_EVENT,
     XS_ERROR,
-    XS_IS_DOMAIN_INTRODUCED
+    XS_IS_DOMAIN_INTRODUCED,
+    XS_RESUME,
+    XS_SET_TARGET,
+    XS_RESTRICT,
+    XS_RESET_WATCHES
 };
 
 #define XS_WRITE_NONE "NONE"
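
The ROOT_PAGETABLE_*_XEN_SLOT constants above pin down which L4 (PML4) slots Xen reserves for itself: each of the 512 slots maps 512 GiB, and slots 256-271 are exactly the hypervisor hole 0xFFFF800000000000-0xFFFF880000000000 declared next to them. A standalone check of that arithmetic (nothing here is from the patch):

	#include <stdint.h>
	#include <stdio.h>

	#define L4_SHIFT 39	/* each L4 slot maps 1UL << 39 = 512 GiB */

	/* Canonical (sign-extended) start address of an L4 slot. */
	static uint64_t l4_slot_to_va(unsigned int slot)
	{
		uint64_t va = (uint64_t)slot << L4_SHIFT;

		return slot >= 256 ? va | 0xFFFF000000000000ULL : va;
	}

	int main(void)
	{
		/* prints 0xffff800000000000 (__HYPERVISOR_VIRT_START) and
		 * 0xffff880000000000 (__HYPERVISOR_VIRT_END): 16 slots of
		 * 512 GiB, i.e. ROOT_PAGETABLE_XEN_SLOTS * 512 GiB = 8 TiB */
		printf("%#llx %#llx\n",
		       (unsigned long long)l4_slot_to_va(256),
		       (unsigned long long)l4_slot_to_va(272));
		return 0;
	}
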
diff -Npru kexec-kernel-only/kernel/kexec.c kexec-kernel-only_20120522/kernel/kexec.c
--- kexec-kernel-only/kernel/kexec.c	2012-01-25 14:15:25.000000000 +0100
+++ kexec-kernel-only_20120522/kernel/kexec.c	2012-05-21 19:00:36.000000000 +0200
@@ -351,17 +351,19 @@ static struct page *kimage_alloc_pages(g
 	if (pages) {
 		unsigned int count, i;
 #ifdef CONFIG_XEN
-		int address_bits;
+		if (is_initial_xendomain()) {
+			int address_bits;
 
-		if (limit == ~0UL)
-			address_bits = BITS_PER_LONG;
-		else
-			address_bits = long_log2(limit);
-
-		if (xen_create_contiguous_region((unsigned long)page_address(pages),
-						 order, address_bits) < 0) {
-			__free_pages(pages, order);
-			return NULL;
+			if (limit == ~0UL)
+				address_bits = BITS_PER_LONG;
+			else
+				address_bits = long_log2(limit);
+
+			if (xen_create_contiguous_region((unsigned long)page_address(pages),
+							 order, address_bits) < 0) {
+				__free_pages(pages, order);
+				return NULL;
+			}
 		}
 #endif
 		pages->mapping = NULL;
@@ -383,7 +385,8 @@ static void kimage_free_pages(struct pag
 	for (i = 0; i < count; i++)
 		ClearPageReserved(page + i);
 #ifdef CONFIG_XEN
-	xen_destroy_contiguous_region((unsigned long)page_address(page), order);
+	if (is_initial_xendomain())
+		xen_destroy_contiguous_region((unsigned long)page_address(page), order);
 #endif
 	__free_pages(page, order);
 }
@@ -467,7 +470,6 @@ static struct page *kimage_alloc_normal_
 	return pages;
 }
 
-#ifndef CONFIG_XEN
 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 						     unsigned int order)
 {
@@ -542,19 +544,16 @@ struct page *kimage_alloc_control_pages(
 		pages = kimage_alloc_normal_control_pages(image, order);
 		break;
 	case KEXEC_TYPE_CRASH:
+#ifdef CONFIG_XEN
+		if (is_initial_xendomain())
+			return kimage_alloc_normal_control_pages(image, order);
+#endif
 		pages = kimage_alloc_crash_control_pages(image, order);
 		break;
 	}
 
 	return pages;
 }
-#else /* !CONFIG_XEN */
-struct page *kimage_alloc_control_pages(struct kimage *image,
-					unsigned int order)
-{
-	return kimage_alloc_normal_control_pages(image, order);
-}
-#endif
 
 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 {
@@ -857,7 +856,6 @@ out:
 	return result;
 }
 
-#ifndef CONFIG_XEN
 static int kimage_load_crash_segment(struct kimage *image,
 				     struct kexec_segment *segment)
 {
@@ -923,19 +921,16 @@ static int kimage_load_segment(struct ki
 		result = kimage_load_normal_segment(image, segment);
 		break;
 	case KEXEC_TYPE_CRASH:
+#ifdef CONFIG_XEN
+		if (is_initial_xendomain())
+			return kimage_load_normal_segment(image, segment);
+#endif
 		result = kimage_load_crash_segment(image, segment);
 		break;
 	}
 
 	return result;
 }
-#else /* CONFIG_XEN */
-static int kimage_load_segment(struct kimage *image,
-			       struct kexec_segment *segment)
-{
-	return kimage_load_normal_segment(image, segment);
-}
-#endif
 
 /*
  * Exec Kernel system call: for obvious reasons only root may call it.