[Xen-changelog] [xen master] x86: rename domain_build.c to dom0_build.c
commit d4188929e49cf5801e296f33dd7a477689bbed05
Author:     Wei Liu <wei.liu2@xxxxxxxxxx>
AuthorDate: Thu Mar 16 16:40:53 2017 +0000
Commit:     Wei Liu <wei.liu2@xxxxxxxxxx>
CommitDate: Mon Mar 20 15:39:45 2017 +0000

    x86: rename domain_build.c to dom0_build.c

    To reflect the true nature of this file. No functional change.

    Signed-off-by: Wei Liu <wei.liu2@xxxxxxxxxx>
    Reviewed-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
 xen/arch/x86/Makefile       |    2 +-
 xen/arch/x86/dom0_build.c   | 2438 +++++++++++++++++++++++++++++++++++++++++++
 xen/arch/x86/domain_build.c | 2438 -------------------------------------------
 3 files changed, 2439 insertions(+), 2439 deletions(-)

diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
index f75eca0..adc768f 100644
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -23,7 +23,7 @@ obj-y += delay.o
 obj-bin-y += dmi_scan.init.o
 obj-y += domctl.o
 obj-y += domain.o
-obj-bin-y += domain_build.init.o
+obj-bin-y += dom0_build.init.o
 obj-y += domain_page.o
 obj-y += e820.o
 obj-y += extable.o
diff --git a/xen/arch/x86/dom0_build.c b/xen/arch/x86/dom0_build.c
new file mode 100644
index 0000000..1c723c9
--- /dev/null
+++ b/xen/arch/x86/dom0_build.c
@@ -0,0 +1,2438 @@
+/******************************************************************************
+ * dom0_build.c
+ *
+ * Copyright (c) 2002-2005, K A Fraser
+ */
+
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/ctype.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/smp.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/console.h>
+#include <xen/kernel.h>
+#include <xen/domain.h>
+#include <xen/version.h>
+#include <xen/iocap.h>
+#include <xen/bitops.h>
+#include <xen/compat.h>
+#include <xen/libelf.h>
+#include <xen/pfn.h>
+#include <xen/guest_access.h>
+#include <xen/acpi.h>
+#include <asm/regs.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <asm/e820.h>
+#include <asm/acpi.h>
+#include <asm/setup.h>
+#include <asm/bzimage.h> /* for bzimage_parse */
+#include <asm/io_apic.h>
+#include <asm/hpet.h>
+
+#include <acpi/actables.h>
+
+#include <public/version.h>
+#include <public/hvm/hvm_info_table.h>
+#include <public/arch-x86/hvm/start_info.h>
+#include <public/hvm/hvm_vcpu.h>
+
+static long __initdata dom0_nrpages;
+static long __initdata dom0_min_nrpages;
+static long __initdata dom0_max_nrpages = LONG_MAX;
+
+/*
+ * Have the TSS cover the ISA port range, which makes it
+ * - 104 bytes base structure
+ * - 32 bytes interrupt redirection bitmap
+ * - 128 bytes I/O bitmap
+ * - one trailing byte
+ * or a total of 265 bytes.
+ *
+ * NB: as PVHv2 Dom0 doesn't have legacy devices (ISA), it shouldn't have any
+ * business in accessing the ISA port range, much less in real mode, and due to
+ * the lack of firmware it shouldn't also execute any INT instruction. This is
+ * done just for consistency with what hvmloader does.
+ */
+#define HVM_VM86_TSS_SIZE 265
+
+static unsigned int __initdata acpi_intr_overrides;
+static struct acpi_madt_interrupt_override __initdata *intsrcovr;
+
+static unsigned int __initdata acpi_nmi_sources;
+static struct acpi_madt_nmi_source __initdata *nmisrc;
+
+/*
+ * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
+ *
+ * <min_amt>: The minimum amount of memory which should be allocated for dom0.
+ * <max_amt>: The maximum amount of memory which should be allocated for dom0.
+ * <amt>: The precise amount of memory to allocate for dom0. + * + * Notes: + * 1. <amt> is clamped from below by <min_amt> and from above by available + * memory and <max_amt> + * 2. <min_amt> is clamped from above by available memory and <max_amt> + * 3. <min_amt> is ignored if it is greater than <max_amt> + * 4. If <amt> is not specified, it is calculated as follows: + * "All of memory is allocated to domain 0, minus 1/16th which is reserved + * for uses such as DMA buffers (the reservation is clamped to 128MB)." + * + * Each value can be specified as positive or negative: + * If +ve: The specified amount is an absolute value. + * If -ve: The specified amount is subtracted from total available memory. + */ +static long __init parse_amt(const char *s, const char **ps) +{ + long pages = parse_size_and_unit((*s == '-') ? s+1 : s, ps) >> PAGE_SHIFT; + return (*s == '-') ? -pages : pages; +} +static void __init parse_dom0_mem(const char *s) +{ + do { + if ( !strncmp(s, "min:", 4) ) + dom0_min_nrpages = parse_amt(s+4, &s); + else if ( !strncmp(s, "max:", 4) ) + dom0_max_nrpages = parse_amt(s+4, &s); + else + dom0_nrpages = parse_amt(s, &s); + } while ( *s++ == ',' ); +} +custom_param("dom0_mem", parse_dom0_mem); + +static unsigned int __initdata opt_dom0_max_vcpus_min = 1; +static unsigned int __initdata opt_dom0_max_vcpus_max = UINT_MAX; + +static void __init parse_dom0_max_vcpus(const char *s) +{ + if ( *s == '-' ) /* -M */ + opt_dom0_max_vcpus_max = simple_strtoul(s + 1, &s, 0); + else /* N, N-, or N-M */ + { + opt_dom0_max_vcpus_min = simple_strtoul(s, &s, 0); + if ( opt_dom0_max_vcpus_min == 0 ) + opt_dom0_max_vcpus_min = 1; + if ( !*s ) /* N */ + opt_dom0_max_vcpus_max = opt_dom0_max_vcpus_min; + else if ( *s++ == '-' && *s ) /* N-M */ + opt_dom0_max_vcpus_max = simple_strtoul(s, &s, 0); + } +} +custom_param("dom0_max_vcpus", parse_dom0_max_vcpus); + +static __initdata unsigned int dom0_nr_pxms; +static __initdata unsigned int dom0_pxms[MAX_NUMNODES] = + { [0 ... 
MAX_NUMNODES - 1] = ~0 }; +static __initdata bool_t dom0_affinity_relaxed; + +static void __init parse_dom0_nodes(const char *s) +{ + do { + if ( isdigit(*s) ) + dom0_pxms[dom0_nr_pxms] = simple_strtoul(s, &s, 0); + else if ( !strncmp(s, "relaxed", 7) && (!s[7] || s[7] == ',') ) + { + dom0_affinity_relaxed = 1; + s += 7; + } + else if ( !strncmp(s, "strict", 6) && (!s[6] || s[6] == ',') ) + { + dom0_affinity_relaxed = 0; + s += 6; + } + else + break; + } while ( ++dom0_nr_pxms < ARRAY_SIZE(dom0_pxms) && *s++ == ',' ); +} +custom_param("dom0_nodes", parse_dom0_nodes); + +static cpumask_t __initdata dom0_cpus; + +static struct vcpu *__init setup_dom0_vcpu(struct domain *d, + unsigned int vcpu_id, + unsigned int cpu) +{ + struct vcpu *v = alloc_vcpu(d, vcpu_id, cpu); + + if ( v ) + { + if ( !d->is_pinned && !dom0_affinity_relaxed ) + cpumask_copy(v->cpu_hard_affinity, &dom0_cpus); + cpumask_copy(v->cpu_soft_affinity, &dom0_cpus); + } + + return v; +} + +static nodemask_t __initdata dom0_nodes; + +unsigned int __init dom0_max_vcpus(void) +{ + unsigned int i, max_vcpus, limit; + nodeid_t node; + + for ( i = 0; i < dom0_nr_pxms; ++i ) + if ( (node = pxm_to_node(dom0_pxms[i])) != NUMA_NO_NODE ) + node_set(node, dom0_nodes); + nodes_and(dom0_nodes, dom0_nodes, node_online_map); + if ( nodes_empty(dom0_nodes) ) + dom0_nodes = node_online_map; + for_each_node_mask ( node, dom0_nodes ) + cpumask_or(&dom0_cpus, &dom0_cpus, &node_to_cpumask(node)); + cpumask_and(&dom0_cpus, &dom0_cpus, cpupool0->cpu_valid); + if ( cpumask_empty(&dom0_cpus) ) + cpumask_copy(&dom0_cpus, cpupool0->cpu_valid); + + max_vcpus = cpumask_weight(&dom0_cpus); + if ( opt_dom0_max_vcpus_min > max_vcpus ) + max_vcpus = opt_dom0_max_vcpus_min; + if ( opt_dom0_max_vcpus_max < max_vcpus ) + max_vcpus = opt_dom0_max_vcpus_max; + limit = dom0_pvh ? HVM_MAX_VCPUS : MAX_VIRT_CPUS; + if ( max_vcpus > limit ) + max_vcpus = limit; + + return max_vcpus; +} + +struct vcpu *__init alloc_dom0_vcpu0(struct domain *dom0) +{ + unsigned int max_vcpus = dom0_max_vcpus(); + + dom0->node_affinity = dom0_nodes; + dom0->auto_node_affinity = !dom0_nr_pxms; + + dom0->vcpu = xzalloc_array(struct vcpu *, max_vcpus); + if ( !dom0->vcpu ) + return NULL; + dom0->max_vcpus = max_vcpus; + + return setup_dom0_vcpu(dom0, 0, cpumask_first(&dom0_cpus)); +} + +#ifdef CONFIG_SHADOW_PAGING +bool __initdata opt_dom0_shadow; +boolean_param("dom0_shadow", opt_dom0_shadow); +#endif +bool __initdata dom0_pvh; + +/* + * List of parameters that affect Dom0 creation: + * + * - pvh Create a PVHv2 Dom0. + * - shadow Use shadow paging for Dom0. + */ +static void __init parse_dom0_param(char *s) +{ + char *ss; + + do { + + ss = strchr(s, ','); + if ( ss ) + *ss = '\0'; + + if ( !strcmp(s, "pvh") ) + dom0_pvh = true; +#ifdef CONFIG_SHADOW_PAGING + else if ( !strcmp(s, "shadow") ) + opt_dom0_shadow = true; +#endif + + s = ss + 1; + } while ( ss ); +} +custom_param("dom0", parse_dom0_param); + +static char __initdata opt_dom0_ioports_disable[200] = ""; +string_param("dom0_ioports_disable", opt_dom0_ioports_disable); + +static bool_t __initdata ro_hpet = 1; +boolean_param("ro-hpet", ro_hpet); + +/* Allow ring-3 access in long mode as guest cannot use ring 1 ... */ +#define BASE_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER) +#define L1_PROT (BASE_PROT|_PAGE_GUEST_KERNEL) +/* ... except for compatibility mode guests. 
*/ +#define COMPAT_L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED) +#define L2_PROT (BASE_PROT|_PAGE_DIRTY) +#define L3_PROT (BASE_PROT|_PAGE_DIRTY) +#define L4_PROT (BASE_PROT|_PAGE_DIRTY) + +static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node; + +static struct page_info * __init alloc_chunk( + struct domain *d, unsigned long max_pages) +{ + static unsigned int __initdata last_order = MAX_ORDER; + struct page_info *page; + unsigned int order = get_order_from_pages(max_pages), free_order; + + if ( order > last_order ) + order = last_order; + else if ( max_pages & (max_pages - 1) ) + --order; + while ( (page = alloc_domheap_pages(d, order, memflags)) == NULL ) + if ( order-- == 0 ) + break; + if ( page ) + last_order = order; + else if ( memflags ) + { + /* + * Allocate up to 2MB at a time: It prevents allocating very large + * chunks from DMA pools before the >4GB pool is fully depleted. + */ + last_order = 21 - PAGE_SHIFT; + memflags = 0; + return alloc_chunk(d, max_pages); + } + + /* + * Make a reasonable attempt at finding a smaller chunk at a higher + * address, to avoid allocating from low memory as much as possible. + */ + for ( free_order = order; !memflags && page && order--; ) + { + struct page_info *pg2; + + if ( d->tot_pages + (1 << order) > d->max_pages ) + continue; + pg2 = alloc_domheap_pages(d, order, MEMF_exact_node); + if ( pg2 > page ) + { + free_domheap_pages(page, free_order); + page = pg2; + free_order = order; + } + else if ( pg2 ) + free_domheap_pages(pg2, order); + } + return page; +} + +static unsigned long __init dom0_paging_pages(const struct domain *d, + unsigned long nr_pages) +{ + /* Copied from: libxl_get_required_shadow_memory() */ + unsigned long memkb = nr_pages * (PAGE_SIZE / 1024); + + memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024)); + + return ((memkb + 1023) / 1024) << (20 - PAGE_SHIFT); +} + +static unsigned long __init compute_dom0_nr_pages( + struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len) +{ + nodeid_t node; + unsigned long avail = 0, nr_pages, min_pages, max_pages; + bool_t need_paging; + + for_each_node_mask ( node, dom0_nodes ) + avail += avail_domheap_pages_region(node, 0, 0) + + initial_images_nrpages(node); + + /* Reserve memory for further dom0 vcpu-struct allocations... */ + avail -= (d->max_vcpus - 1UL) + << get_order_from_bytes(sizeof(struct vcpu)); + /* ...and compat_l4's, if needed. */ + if ( is_pv_32bit_domain(d) ) + avail -= d->max_vcpus - 1; + + /* Reserve memory for iommu_dom0_init() (rough estimate). */ + if ( iommu_enabled ) + { + unsigned int s; + + for ( s = 9; s < BITS_PER_LONG; s += 9 ) + avail -= max_pdx >> s; + } + + need_paging = is_hvm_domain(d) ? !iommu_hap_pt_share || !paging_mode_hap(d) + : opt_dom0_shadow; + for ( ; ; need_paging = 0 ) + { + nr_pages = dom0_nrpages; + min_pages = dom0_min_nrpages; + max_pages = dom0_max_nrpages; + + /* + * If allocation isn't specified, reserve 1/16th of available memory + * for things like DMA buffers. This reservation is clamped to a + * maximum of 128MB. + */ + if ( nr_pages == 0 ) + nr_pages = -min(avail / 16, 128UL << (20 - PAGE_SHIFT)); + + /* Negative specification means "all memory - specified amount". */ + if ( (long)nr_pages < 0 ) nr_pages += avail; + if ( (long)min_pages < 0 ) min_pages += avail; + if ( (long)max_pages < 0 ) max_pages += avail; + + /* Clamp according to min/max limits and available memory. 
*/ + nr_pages = max(nr_pages, min_pages); + nr_pages = min(nr_pages, max_pages); + nr_pages = min(nr_pages, avail); + + if ( !need_paging ) + break; + + /* Reserve memory for shadow or HAP. */ + avail -= dom0_paging_pages(d, nr_pages); + } + + if ( is_pv_domain(d) && + (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) && + ((dom0_min_nrpages <= 0) || (nr_pages > min_pages)) ) + { + /* + * Legacy Linux kernels (i.e. such without a XEN_ELFNOTE_INIT_P2M + * note) require that there is enough virtual space beyond the initial + * allocation to set up their initial page tables. This space is + * roughly the same size as the p2m table, so make sure the initial + * allocation doesn't consume more than about half the space that's + * available between params.virt_base and the address space end. + */ + unsigned long vstart, vend, end; + size_t sizeof_long = is_pv_32bit_domain(d) ? sizeof(int) : sizeof(long); + + vstart = parms->virt_base; + vend = round_pgup(parms->virt_kend); + if ( !parms->unmapped_initrd ) + vend += round_pgup(initrd_len); + end = vend + nr_pages * sizeof_long; + + if ( end > vstart ) + end += end - vstart; + if ( end <= vstart || + (sizeof_long < sizeof(end) && end > (1UL << (8 * sizeof_long))) ) + { + end = sizeof_long >= sizeof(end) ? 0 : 1UL << (8 * sizeof_long); + nr_pages = (end - vend) / (2 * sizeof_long); + if ( dom0_min_nrpages > 0 && nr_pages < min_pages ) + nr_pages = min_pages; + printk("Dom0 memory clipped to %lu pages\n", nr_pages); + } + } + + d->max_pages = min_t(unsigned long, max_pages, UINT_MAX); + + return nr_pages; +} + +static void __init process_dom0_ioports_disable(struct domain *dom0) +{ + unsigned long io_from, io_to; + char *t, *s = opt_dom0_ioports_disable; + const char *u; + + if ( *s == '\0' ) + return; + + while ( (t = strsep(&s, ",")) != NULL ) + { + io_from = simple_strtoul(t, &u, 16); + if ( u == t ) + { + parse_error: + printk("Invalid ioport range <%s> " + "in dom0_ioports_disable, skipping\n", t); + continue; + } + + if ( *u == '\0' ) + io_to = io_from; + else if ( *u == '-' ) + io_to = simple_strtoul(u + 1, &u, 16); + else + goto parse_error; + + if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) ) + goto parse_error; + + printk("Disabling dom0 access to ioport range %04lx-%04lx\n", + io_from, io_to); + + if ( ioports_deny_access(dom0, io_from, io_to) != 0 ) + BUG(); + } +} + +static __init void dom0_update_physmap(struct domain *d, unsigned long pfn, + unsigned long mfn, unsigned long vphysmap_s) +{ + if ( !is_pv_32bit_domain(d) ) + ((unsigned long *)vphysmap_s)[pfn] = mfn; + else + ((unsigned int *)vphysmap_s)[pfn] = mfn; + + set_gpfn_from_mfn(mfn, pfn); +} + +static __init void mark_pv_pt_pages_rdonly(struct domain *d, + l4_pgentry_t *l4start, + unsigned long vpt_start, + unsigned long nr_pt_pages) +{ + unsigned long count; + struct page_info *page; + l4_pgentry_t *pl4e; + l3_pgentry_t *pl3e; + l2_pgentry_t *pl2e; + l1_pgentry_t *pl1e; + + pl4e = l4start + l4_table_offset(vpt_start); + pl3e = l4e_to_l3e(*pl4e); + pl3e += l3_table_offset(vpt_start); + pl2e = l3e_to_l2e(*pl3e); + pl2e += l2_table_offset(vpt_start); + pl1e = l2e_to_l1e(*pl2e); + pl1e += l1_table_offset(vpt_start); + for ( count = 0; count < nr_pt_pages; count++ ) + { + l1e_remove_flags(*pl1e, _PAGE_RW); + page = mfn_to_page(l1e_get_pfn(*pl1e)); + + /* Read-only mapping + PGC_allocated + page-table page. */ + page->count_info = PGC_allocated | 3; + page->u.inuse.type_info |= PGT_validated | 1; + + /* Top-level p.t. is pinned. 
*/ + if ( (page->u.inuse.type_info & PGT_type_mask) == + (!is_pv_32bit_domain(d) ? + PGT_l4_page_table : PGT_l3_page_table) ) + { + page->count_info += 1; + page->u.inuse.type_info += 1 | PGT_pinned; + } + + /* Iterate. */ + if ( !((unsigned long)++pl1e & (PAGE_SIZE - 1)) ) + { + if ( !((unsigned long)++pl2e & (PAGE_SIZE - 1)) ) + { + if ( !((unsigned long)++pl3e & (PAGE_SIZE - 1)) ) + pl3e = l4e_to_l3e(*++pl4e); + pl2e = l3e_to_l2e(*pl3e); + } + pl1e = l2e_to_l1e(*pl2e); + } + } +} + +static __init void setup_pv_physmap(struct domain *d, unsigned long pgtbl_pfn, + unsigned long v_start, unsigned long v_end, + unsigned long vphysmap_start, + unsigned long vphysmap_end, + unsigned long nr_pages) +{ + struct page_info *page = NULL; + l4_pgentry_t *pl4e, *l4start = map_domain_page(_mfn(pgtbl_pfn)); + l3_pgentry_t *pl3e = NULL; + l2_pgentry_t *pl2e = NULL; + l1_pgentry_t *pl1e = NULL; + + if ( v_start <= vphysmap_end && vphysmap_start <= v_end ) + panic("DOM0 P->M table overlaps initial mapping"); + + while ( vphysmap_start < vphysmap_end ) + { + if ( d->tot_pages + ((round_pgup(vphysmap_end) - vphysmap_start) + >> PAGE_SHIFT) + 3 > nr_pages ) + panic("Dom0 allocation too small for initial P->M table"); + + if ( pl1e ) + { + unmap_domain_page(pl1e); + pl1e = NULL; + } + if ( pl2e ) + { + unmap_domain_page(pl2e); + pl2e = NULL; + } + if ( pl3e ) + { + unmap_domain_page(pl3e); + pl3e = NULL; + } + pl4e = l4start + l4_table_offset(vphysmap_start); + if ( !l4e_get_intpte(*pl4e) ) + { + page = alloc_domheap_page(d, 0); + if ( !page ) + break; + + /* No mapping, PGC_allocated + page-table page. */ + page->count_info = PGC_allocated | 2; + page->u.inuse.type_info = PGT_l3_page_table | PGT_validated | 1; + pl3e = __map_domain_page(page); + clear_page(pl3e); + *pl4e = l4e_from_page(page, L4_PROT); + } else + pl3e = map_domain_page(_mfn(l4e_get_pfn(*pl4e))); + + pl3e += l3_table_offset(vphysmap_start); + if ( !l3e_get_intpte(*pl3e) ) + { + if ( cpu_has_page1gb && + !(vphysmap_start & ((1UL << L3_PAGETABLE_SHIFT) - 1)) && + vphysmap_end >= vphysmap_start + (1UL << L3_PAGETABLE_SHIFT) && + (page = alloc_domheap_pages(d, + L3_PAGETABLE_SHIFT - PAGE_SHIFT, + 0)) != NULL ) + { + *pl3e = l3e_from_page(page, L1_PROT|_PAGE_DIRTY|_PAGE_PSE); + vphysmap_start += 1UL << L3_PAGETABLE_SHIFT; + continue; + } + if ( (page = alloc_domheap_page(d, 0)) == NULL ) + break; + + /* No mapping, PGC_allocated + page-table page. */ + page->count_info = PGC_allocated | 2; + page->u.inuse.type_info = PGT_l2_page_table | PGT_validated | 1; + pl2e = __map_domain_page(page); + clear_page(pl2e); + *pl3e = l3e_from_page(page, L3_PROT); + } + else + pl2e = map_domain_page(_mfn(l3e_get_pfn(*pl3e))); + + pl2e += l2_table_offset(vphysmap_start); + if ( !l2e_get_intpte(*pl2e) ) + { + if ( !(vphysmap_start & ((1UL << L2_PAGETABLE_SHIFT) - 1)) && + vphysmap_end >= vphysmap_start + (1UL << L2_PAGETABLE_SHIFT) && + (page = alloc_domheap_pages(d, + L2_PAGETABLE_SHIFT - PAGE_SHIFT, + 0)) != NULL ) + { + *pl2e = l2e_from_page(page, L1_PROT|_PAGE_DIRTY|_PAGE_PSE); + if ( opt_allow_superpage ) + get_superpage(page_to_mfn(page), d); + vphysmap_start += 1UL << L2_PAGETABLE_SHIFT; + continue; + } + if ( (page = alloc_domheap_page(d, 0)) == NULL ) + break; + + /* No mapping, PGC_allocated + page-table page. 
*/ + page->count_info = PGC_allocated | 2; + page->u.inuse.type_info = PGT_l1_page_table | PGT_validated | 1; + pl1e = __map_domain_page(page); + clear_page(pl1e); + *pl2e = l2e_from_page(page, L2_PROT); + } + else + pl1e = map_domain_page(_mfn(l2e_get_pfn(*pl2e))); + + pl1e += l1_table_offset(vphysmap_start); + BUG_ON(l1e_get_intpte(*pl1e)); + page = alloc_domheap_page(d, 0); + if ( !page ) + break; + + *pl1e = l1e_from_page(page, L1_PROT|_PAGE_DIRTY); + vphysmap_start += PAGE_SIZE; + vphysmap_start &= PAGE_MASK; + } + if ( !page ) + panic("Not enough RAM for DOM0 P->M table"); + + if ( pl1e ) + unmap_domain_page(pl1e); + if ( pl2e ) + unmap_domain_page(pl2e); + if ( pl3e ) + unmap_domain_page(pl3e); + + unmap_domain_page(l4start); +} + +static int __init setup_permissions(struct domain *d) +{ + unsigned long mfn; + unsigned int i; + int rc; + + /* The hardware domain is initially permitted full I/O capabilities. */ + rc = ioports_permit_access(d, 0, 0xFFFF); + rc |= iomem_permit_access(d, 0UL, (1UL << (paddr_bits - PAGE_SHIFT)) - 1); + rc |= irqs_permit_access(d, 1, nr_irqs_gsi - 1); + + /* Modify I/O port access permissions. */ + + /* Master Interrupt Controller (PIC). */ + rc |= ioports_deny_access(d, 0x20, 0x21); + /* Slave Interrupt Controller (PIC). */ + rc |= ioports_deny_access(d, 0xA0, 0xA1); + /* Interval Timer (PIT). */ + rc |= ioports_deny_access(d, 0x40, 0x43); + /* PIT Channel 2 / PC Speaker Control. */ + rc |= ioports_deny_access(d, 0x61, 0x61); + /* ACPI PM Timer. */ + if ( pmtmr_ioport ) + rc |= ioports_deny_access(d, pmtmr_ioport, pmtmr_ioport + 3); + /* PCI configuration space (NB. 0xcf8 has special treatment). */ + rc |= ioports_deny_access(d, 0xcfc, 0xcff); + /* Command-line I/O ranges. */ + process_dom0_ioports_disable(d); + + /* Modify I/O memory access permissions. */ + + /* Local APIC. */ + if ( mp_lapic_addr != 0 ) + { + mfn = paddr_to_pfn(mp_lapic_addr); + rc |= iomem_deny_access(d, mfn, mfn); + } + /* I/O APICs. */ + for ( i = 0; i < nr_ioapics; i++ ) + { + mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr); + if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) ) + rc |= iomem_deny_access(d, mfn, mfn); + } + /* MSI range. */ + rc |= iomem_deny_access(d, paddr_to_pfn(MSI_ADDR_BASE_LO), + paddr_to_pfn(MSI_ADDR_BASE_LO + + MSI_ADDR_DEST_ID_MASK)); + /* HyperTransport range. */ + if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) + rc |= iomem_deny_access(d, paddr_to_pfn(0xfdULL << 32), + paddr_to_pfn((1ULL << 40) - 1)); + + /* Remove access to E820_UNUSABLE I/O regions above 1MB. 
*/ + for ( i = 0; i < e820.nr_map; i++ ) + { + unsigned long sfn, efn; + sfn = max_t(unsigned long, paddr_to_pfn(e820.map[i].addr), 0x100ul); + efn = paddr_to_pfn(e820.map[i].addr + e820.map[i].size - 1); + if ( (e820.map[i].type == E820_UNUSABLE) && + (e820.map[i].size != 0) && + (sfn <= efn) ) + rc |= iomem_deny_access(d, sfn, efn); + } + + /* Prevent access to HPET */ + if ( hpet_address ) + { + u8 prot_flags = hpet_flags & ACPI_HPET_PAGE_PROTECT_MASK; + + mfn = paddr_to_pfn(hpet_address); + if ( prot_flags == ACPI_HPET_PAGE_PROTECT4 ) + rc |= iomem_deny_access(d, mfn, mfn); + else if ( prot_flags == ACPI_HPET_PAGE_PROTECT64 ) + rc |= iomem_deny_access(d, mfn, mfn + 15); + else if ( ro_hpet ) + rc |= rangeset_add_singleton(mmio_ro_ranges, mfn); + } + + return rc; +} + +static int __init construct_dom0_pv( + struct domain *d, + const module_t *image, unsigned long image_headroom, + module_t *initrd, + void *(*bootstrap_map)(const module_t *), + char *cmdline) +{ + int i, cpu, rc, compatible, compat32, order, machine; + struct cpu_user_regs *regs; + unsigned long pfn, mfn; + unsigned long nr_pages; + unsigned long nr_pt_pages; + unsigned long alloc_spfn; + unsigned long alloc_epfn; + unsigned long initrd_pfn = -1, initrd_mfn = 0; + unsigned long count; + struct page_info *page = NULL; + start_info_t *si; + struct vcpu *v = d->vcpu[0]; + unsigned long long value; + void *image_base = bootstrap_map(image); + unsigned long image_len = image->mod_end; + void *image_start = image_base + image_headroom; + unsigned long initrd_len = initrd ? initrd->mod_end : 0; + l4_pgentry_t *l4tab = NULL, *l4start = NULL; + l3_pgentry_t *l3tab = NULL, *l3start = NULL; + l2_pgentry_t *l2tab = NULL, *l2start = NULL; + l1_pgentry_t *l1tab = NULL, *l1start = NULL; + + /* + * This fully describes the memory layout of the initial domain. All + * *_start address are page-aligned, except v_start (and v_end) which are + * superpage-aligned. + */ + struct elf_binary elf; + struct elf_dom_parms parms; + unsigned long vkern_start; + unsigned long vkern_end; + unsigned long vinitrd_start; + unsigned long vinitrd_end; + unsigned long vphysmap_start; + unsigned long vphysmap_end; + unsigned long vstartinfo_start; + unsigned long vstartinfo_end; + unsigned long vstack_start; + unsigned long vstack_end; + unsigned long vpt_start; + unsigned long vpt_end; + unsigned long v_start; + unsigned long v_end; + + /* Machine address of next candidate page-table page. */ + paddr_t mpt_alloc; + + printk("*** LOADING DOMAIN 0 ***\n"); + + d->max_pages = ~0U; + + if ( (rc = bzimage_parse(image_base, &image_start, &image_len)) != 0 ) + return rc; + + if ( (rc = elf_init(&elf, image_start, image_len)) != 0 ) + return rc; +#ifdef CONFIG_VERBOSE_DEBUG + elf_set_verbose(&elf); +#endif + elf_parse_binary(&elf); + if ( (rc = elf_xen_parse(&elf, &parms)) != 0 ) + goto out; + + /* compatibility check */ + compatible = 0; + compat32 = 0; + machine = elf_uval(&elf, elf.ehdr, e_machine); + printk(" Xen kernel: 64-bit, lsb, compat32\n"); + if (elf_32bit(&elf) && parms.pae == XEN_PAE_BIMODAL) + parms.pae = XEN_PAE_EXTCR3; + if (elf_32bit(&elf) && parms.pae && machine == EM_386) + { + compat32 = 1; + compatible = 1; + } + if (elf_64bit(&elf) && machine == EM_X86_64) + compatible = 1; + printk(" Dom0 kernel: %s%s, %s, paddr %#" PRIx64 " -> %#" PRIx64 "\n", + elf_64bit(&elf) ? "64-bit" : "32-bit", + parms.pae ? ", PAE" : "", + elf_msb(&elf) ? 
"msb" : "lsb", + elf.pstart, elf.pend); + if ( elf.bsd_symtab_pstart ) + printk(" Dom0 symbol map %#" PRIx64 " -> %#" PRIx64 "\n", + elf.bsd_symtab_pstart, elf.bsd_symtab_pend); + + if ( !compatible ) + { + printk("Mismatch between Xen and DOM0 kernel\n"); + rc = -EINVAL; + goto out; + } + + if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE ) + { + if ( !test_bit(XENFEAT_dom0, parms.f_supported) ) + { + printk("Kernel does not support Dom0 operation\n"); + rc = -EINVAL; + goto out; + } + } + + if ( compat32 ) + { + d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1; + v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0]; + if ( setup_compat_arg_xlat(v) != 0 ) + BUG(); + } + + nr_pages = compute_dom0_nr_pages(d, &parms, initrd_len); + + if ( parms.pae == XEN_PAE_EXTCR3 ) + set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist); + + if ( (parms.virt_hv_start_low != UNSET_ADDR) && elf_32bit(&elf) ) + { + unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1; + value = (parms.virt_hv_start_low + mask) & ~mask; + BUG_ON(!is_pv_32bit_domain(d)); + if ( value > __HYPERVISOR_COMPAT_VIRT_START ) + panic("Domain 0 expects too high a hypervisor start address"); + HYPERVISOR_COMPAT_VIRT_START(d) = + max_t(unsigned int, m2p_compat_vstart, value); + } + + if ( (parms.p2m_base != UNSET_ADDR) && elf_32bit(&elf) ) + { + printk(XENLOG_WARNING "P2M table base ignored\n"); + parms.p2m_base = UNSET_ADDR; + } + + domain_set_alloc_bitsize(d); + + /* + * Why do we need this? The number of page-table frames depends on the + * size of the bootstrap address space. But the size of the address space + * depends on the number of page-table frames (since each one is mapped + * read-only). We have a pair of simultaneous equations in two unknowns, + * which we solve by exhaustive search. + */ + v_start = parms.virt_base; + vkern_start = parms.virt_kstart; + vkern_end = parms.virt_kend; + if ( parms.unmapped_initrd ) + { + vinitrd_start = vinitrd_end = 0; + vphysmap_start = round_pgup(vkern_end); + } + else + { + vinitrd_start = round_pgup(vkern_end); + vinitrd_end = vinitrd_start + initrd_len; + vphysmap_start = round_pgup(vinitrd_end); + } + vphysmap_end = vphysmap_start + (nr_pages * (!is_pv_32bit_domain(d) ? + sizeof(unsigned long) : + sizeof(unsigned int))); + if ( parms.p2m_base != UNSET_ADDR ) + vphysmap_end = vphysmap_start; + vstartinfo_start = round_pgup(vphysmap_end); + vstartinfo_end = (vstartinfo_start + + sizeof(struct start_info) + + sizeof(struct dom0_vga_console_info)); + + vpt_start = round_pgup(vstartinfo_end); + for ( nr_pt_pages = 2; ; nr_pt_pages++ ) + { + vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE); + vstack_start = vpt_end; + vstack_end = vstack_start + PAGE_SIZE; + v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1); + if ( (v_end - vstack_end) < (512UL << 10) ) + v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */ +#define NR(_l,_h,_s) \ + (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \ + ((_l) & ~((1UL<<(_s))-1))) >> (_s)) + if ( (!is_pv_32bit_domain(d) + /* # L4 */ + NR(v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */ + (!is_pv_32bit_domain(d) ? 
+ NR(v_start, v_end, L3_PAGETABLE_SHIFT) : /* # L2 */ + 4) + /* # compat L2 */ + NR(v_start, v_end, L2_PAGETABLE_SHIFT)) /* # L1 */ + <= nr_pt_pages ) + break; + } + + count = v_end - v_start; + if ( vinitrd_start ) + count -= PAGE_ALIGN(initrd_len); + order = get_order_from_bytes(count); + if ( (1UL << order) + PFN_UP(initrd_len) > nr_pages ) + panic("Domain 0 allocation is too small for kernel image"); + + if ( parms.p2m_base != UNSET_ADDR ) + { + vphysmap_start = parms.p2m_base; + vphysmap_end = vphysmap_start + nr_pages * sizeof(unsigned long); + } + page = alloc_domheap_pages(d, order, 0); + if ( page == NULL ) + panic("Not enough RAM for domain 0 allocation"); + alloc_spfn = page_to_mfn(page); + alloc_epfn = alloc_spfn + d->tot_pages; + + if ( initrd_len ) + { + initrd_pfn = vinitrd_start ? + (vinitrd_start - v_start) >> PAGE_SHIFT : + d->tot_pages; + initrd_mfn = mfn = initrd->mod_start; + count = PFN_UP(initrd_len); + if ( d->arch.physaddr_bitsize && + ((mfn + count - 1) >> (d->arch.physaddr_bitsize - PAGE_SHIFT)) ) + { + order = get_order_from_pages(count); + page = alloc_domheap_pages(d, order, 0); + if ( !page ) + panic("Not enough RAM for domain 0 initrd"); + for ( count = -count; order--; ) + if ( count & (1UL << order) ) + { + free_domheap_pages(page, order); + page += 1UL << order; + } + memcpy(page_to_virt(page), mfn_to_virt(initrd->mod_start), + initrd_len); + mpt_alloc = (paddr_t)initrd->mod_start << PAGE_SHIFT; + init_domheap_pages(mpt_alloc, + mpt_alloc + PAGE_ALIGN(initrd_len)); + initrd->mod_start = initrd_mfn = page_to_mfn(page); + } + else + { + while ( count-- ) + if ( assign_pages(d, mfn_to_page(mfn++), 0, 0) ) + BUG(); + } + initrd->mod_end = 0; + } + + printk("PHYSICAL MEMORY ARRANGEMENT:\n" + " Dom0 alloc.: %"PRIpaddr"->%"PRIpaddr, + pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn)); + if ( d->tot_pages < nr_pages ) + printk(" (%lu pages to be allocated)", + nr_pages - d->tot_pages); + if ( initrd ) + { + mpt_alloc = (paddr_t)initrd->mod_start << PAGE_SHIFT; + printk("\n Init. ramdisk: %"PRIpaddr"->%"PRIpaddr, + mpt_alloc, mpt_alloc + initrd_len); + } + printk("\nVIRTUAL MEMORY ARRANGEMENT:\n" + " Loaded kernel: %p->%p\n" + " Init. ramdisk: %p->%p\n" + " Phys-Mach map: %p->%p\n" + " Start info: %p->%p\n" + " Page tables: %p->%p\n" + " Boot stack: %p->%p\n" + " TOTAL: %p->%p\n", + _p(vkern_start), _p(vkern_end), + _p(vinitrd_start), _p(vinitrd_end), + _p(vphysmap_start), _p(vphysmap_end), + _p(vstartinfo_start), _p(vstartinfo_end), + _p(vpt_start), _p(vpt_end), + _p(vstack_start), _p(vstack_end), + _p(v_start), _p(v_end)); + printk(" ENTRY ADDRESS: %p\n", _p(parms.virt_entry)); + + process_pending_softirqs(); + + mpt_alloc = (vpt_start - v_start) + pfn_to_paddr(alloc_spfn); + if ( vinitrd_start ) + mpt_alloc -= PAGE_ALIGN(initrd_len); + + /* Overlap with Xen protected area? */ + if ( !is_pv_32bit_domain(d) ? + ((v_start < HYPERVISOR_VIRT_END) && + (v_end > HYPERVISOR_VIRT_START)) : + (v_end > HYPERVISOR_COMPAT_VIRT_START(d)) ) + { + printk("DOM0 image overlaps with Xen private area.\n"); + rc = -EINVAL; + goto out; + } + + if ( is_pv_32bit_domain(d) ) + { + v->arch.pv_vcpu.failsafe_callback_cs = FLAT_COMPAT_KERNEL_CS; + v->arch.pv_vcpu.event_callback_cs = FLAT_COMPAT_KERNEL_CS; + } + + /* WARNING: The new domain must have its 'processor' field filled in! 
*/ + if ( !is_pv_32bit_domain(d) ) + { + maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table; + l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; + } + else + { + page = alloc_domheap_page(d, MEMF_no_owner); + if ( !page ) + panic("Not enough RAM for domain 0 PML4"); + page->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1; + l4start = l4tab = page_to_virt(page); + maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table; + l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; + } + clear_page(l4tab); + init_guest_l4_table(l4tab, d, 0); + v->arch.guest_table = pagetable_from_paddr(__pa(l4start)); + if ( is_pv_32bit_domain(d) ) + v->arch.guest_table_user = v->arch.guest_table; + + l4tab += l4_table_offset(v_start); + pfn = alloc_spfn; + for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ ) + { + if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) ) + { + maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table; + l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; + clear_page(l1tab); + if ( count == 0 ) + l1tab += l1_table_offset(v_start); + if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) ) + { + maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table; + l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; + clear_page(l2tab); + if ( count == 0 ) + l2tab += l2_table_offset(v_start); + if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) ) + { + if ( count || !l3start ) + { + maddr_to_page(mpt_alloc)->u.inuse.type_info = + PGT_l3_page_table; + l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; + } + l3tab = l3start; + clear_page(l3tab); + if ( count == 0 ) + l3tab += l3_table_offset(v_start); + *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT); + l4tab++; + } + *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT); + l3tab++; + } + *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT); + l2tab++; + } + if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) ) + mfn = pfn++; + else + mfn = initrd_mfn++; + *l1tab = l1e_from_pfn(mfn, (!is_pv_32bit_domain(d) ? + L1_PROT : COMPAT_L1_PROT)); + l1tab++; + + if ( !paging_mode_translate(d) ) + { + page = mfn_to_page(mfn); + if ( !page->u.inuse.type_info && + !get_page_and_type(page, d, PGT_writable_page) ) + BUG(); + } + } + + if ( is_pv_32bit_domain(d) ) + { + /* Ensure the first four L3 entries are all populated. */ + for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab ) + { + if ( !l3e_get_intpte(*l3tab) ) + { + maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table; + l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; + clear_page(l2tab); + *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT); + } + if ( i == 3 ) + l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2; + } + /* Install read-only guest visible MPT mapping. */ + l2tab = l3e_to_l2e(l3start[3]); + memcpy(&l2tab[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)], + &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)], + COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab)); + } + + /* Pages that are part of page tables must be read only. */ + if ( is_pv_domain(d) ) + mark_pv_pt_pages_rdonly(d, l4start, vpt_start, nr_pt_pages); + + /* Mask all upcalls... 
*/ + for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ ) + shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1; + + printk("Dom0 has maximum %u VCPUs\n", d->max_vcpus); + + cpu = v->processor; + for ( i = 1; i < d->max_vcpus; i++ ) + { + cpu = cpumask_cycle(cpu, &dom0_cpus); + setup_dom0_vcpu(d, i, cpu); + } + + d->arch.paging.mode = 0; + + /* Set up CR3 value for write_ptbase */ + if ( paging_mode_enabled(d) ) + paging_update_paging_modes(v); + else + update_cr3(v); + + /* We run on dom0's page tables for the final part of the build process. */ + write_ptbase(v); + mapcache_override_current(v); + + /* Copy the OS image and free temporary buffer. */ + elf.dest_base = (void*)vkern_start; + elf.dest_size = vkern_end - vkern_start; + elf_set_vcpu(&elf, v); + rc = elf_load_binary(&elf); + if ( rc < 0 ) + { + printk("Failed to load the kernel binary\n"); + goto out; + } + bootstrap_map(NULL); + + if ( UNSET_ADDR != parms.virt_hypercall ) + { + if ( (parms.virt_hypercall < v_start) || + (parms.virt_hypercall >= v_end) ) + { + mapcache_override_current(NULL); + write_ptbase(current); + printk("Invalid HYPERCALL_PAGE field in ELF notes.\n"); + rc = -1; + goto out; + } + hypercall_page_initialise( + d, (void *)(unsigned long)parms.virt_hypercall); + } + + /* Free temporary buffers. */ + discard_initial_images(); + + /* Set up start info area. */ + si = (start_info_t *)vstartinfo_start; + clear_page(si); + si->nr_pages = nr_pages; + + si->shared_info = virt_to_maddr(d->shared_info); + + si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN; + if ( !vinitrd_start && initrd_len ) + si->flags |= SIF_MOD_START_PFN; + si->flags |= (xen_processor_pmbits << 8) & SIF_PM_MASK; + si->pt_base = vpt_start; + si->nr_pt_frames = nr_pt_pages; + si->mfn_list = vphysmap_start; + snprintf(si->magic, sizeof(si->magic), "xen-3.0-x86_%d%s", + elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : ""); + + count = d->tot_pages; + + /* Set up the phys->machine table if not part of the initial mapping. */ + if ( is_pv_domain(d) && parms.p2m_base != UNSET_ADDR ) + { + pfn = pagetable_get_pfn(v->arch.guest_table); + setup_pv_physmap(d, pfn, v_start, v_end, vphysmap_start, vphysmap_end, + nr_pages); + } + + /* Write the phys->machine and machine->phys table entries. 
*/ + for ( pfn = 0; pfn < count; pfn++ ) + { + mfn = pfn + alloc_spfn; + if ( pfn >= initrd_pfn ) + { + if ( pfn < initrd_pfn + PFN_UP(initrd_len) ) + mfn = initrd->mod_start + (pfn - initrd_pfn); + else + mfn -= PFN_UP(initrd_len); + } +#ifndef NDEBUG +#define REVERSE_START ((v_end - v_start) >> PAGE_SHIFT) + if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) ) + mfn = alloc_epfn - (pfn - REVERSE_START); +#endif + dom0_update_physmap(d, pfn, mfn, vphysmap_start); + if (!(pfn & 0xfffff)) + process_pending_softirqs(); + } + si->first_p2m_pfn = pfn; + si->nr_p2m_frames = d->tot_pages - count; + page_list_for_each ( page, &d->page_list ) + { + mfn = page_to_mfn(page); + BUG_ON(SHARED_M2P(get_gpfn_from_mfn(mfn))); + if ( get_gpfn_from_mfn(mfn) >= count ) + { + BUG_ON(is_pv_32bit_domain(d)); + if ( !paging_mode_translate(d) && !page->u.inuse.type_info && + !get_page_and_type(page, d, PGT_writable_page) ) + BUG(); + + dom0_update_physmap(d, pfn, mfn, vphysmap_start); + ++pfn; + if (!(pfn & 0xfffff)) + process_pending_softirqs(); + } + } + BUG_ON(pfn != d->tot_pages); +#ifndef NDEBUG + alloc_epfn += PFN_UP(initrd_len) + si->nr_p2m_frames; +#endif + while ( pfn < nr_pages ) + { + if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL ) + panic("Not enough RAM for DOM0 reservation"); + while ( pfn < d->tot_pages ) + { + mfn = page_to_mfn(page); +#ifndef NDEBUG +#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn))) +#endif + dom0_update_physmap(d, pfn, mfn, vphysmap_start); +#undef pfn + page++; pfn++; + if (!(pfn & 0xfffff)) + process_pending_softirqs(); + } + } + + if ( initrd_len != 0 ) + { + si->mod_start = vinitrd_start ?: initrd_pfn; + si->mod_len = initrd_len; + } + + memset(si->cmd_line, 0, sizeof(si->cmd_line)); + if ( cmdline != NULL ) + strlcpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line)); + + if ( fill_console_start_info((void *)(si + 1)) ) + { + si->console.dom0.info_off = sizeof(struct start_info); + si->console.dom0.info_size = sizeof(struct dom0_vga_console_info); + } + + if ( is_pv_32bit_domain(d) ) + xlat_start_info(si, XLAT_start_info_console_dom0); + + /* Return to idle domain's page tables. */ + mapcache_override_current(NULL); + write_ptbase(current); + + update_domain_wallclock_time(d); + + v->is_initialised = 1; + clear_bit(_VPF_down, &v->pause_flags); + + /* + * Initial register values: + * DS,ES,FS,GS = FLAT_KERNEL_DS + * CS:rIP = FLAT_KERNEL_CS:start_pc + * SS:rSP = FLAT_KERNEL_SS:start_stack + * rSI = start_info + * [rAX,rBX,rCX,rDX,rDI,rBP,R8-R15 are zero] + */ + regs = &v->arch.user_regs; + regs->ds = regs->es = regs->fs = regs->gs = + !is_pv_32bit_domain(d) ? FLAT_KERNEL_DS : FLAT_COMPAT_KERNEL_DS; + regs->ss = (!is_pv_32bit_domain(d) ? + FLAT_KERNEL_SS : FLAT_COMPAT_KERNEL_SS); + regs->cs = (!is_pv_32bit_domain(d) ? + FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS); + regs->rip = parms.virt_entry; + regs->rsp = vstack_end; + regs->rsi = vstartinfo_start; + regs->eflags = X86_EFLAGS_IF; + +#ifdef CONFIG_SHADOW_PAGING + if ( opt_dom0_shadow && paging_enable(d, PG_SH_enable) == 0 ) + paging_update_paging_modes(v); +#endif + + /* + * PVH Fixme: XENFEAT_supervisor_mode_kernel has been reused in PVH with a + * different meaning. 
+ */ + if ( test_bit(XENFEAT_supervisor_mode_kernel, parms.f_required) ) + panic("Dom0 requires supervisor-mode execution"); + + rc = setup_permissions(d); + BUG_ON(rc != 0); + + if ( elf_check_broken(&elf) ) + printk(" Xen warning: dom0 kernel broken ELF: %s\n", + elf_check_broken(&elf)); + + if ( d->domain_id == hardware_domid ) + iommu_hwdom_init(d); + + return 0; + +out: + if ( elf_check_broken(&elf) ) + printk(" Xen dom0 kernel broken ELF: %s\n", + elf_check_broken(&elf)); + + return rc; +} + +static int __init modify_identity_mmio(struct domain *d, unsigned long pfn, + unsigned long nr_pages, const bool map) +{ + int rc; + + for ( ; ; ) + { + rc = (map ? map_mmio_regions : unmap_mmio_regions) + (d, _gfn(pfn), nr_pages, _mfn(pfn)); + if ( rc == 0 ) + break; + if ( rc < 0 ) + { + printk(XENLOG_WARNING + "Failed to identity %smap [%#lx,%#lx) for d%d: %d\n", + map ? "" : "un", pfn, pfn + nr_pages, d->domain_id, rc); + break; + } + nr_pages -= rc; + pfn += rc; + process_pending_softirqs(); + } + + return rc; +} + +/* Populate a HVM memory range using the biggest possible order. */ +static int __init pvh_populate_memory_range(struct domain *d, + unsigned long start, + unsigned long nr_pages) +{ + unsigned int order, i = 0; + struct page_info *page; + int rc; +#define MAP_MAX_ITER 64 + + order = MAX_ORDER; + while ( nr_pages != 0 ) + { + unsigned int range_order = get_order_from_pages(nr_pages + 1); + + order = min(range_order ? range_order - 1 : 0, order); + page = alloc_domheap_pages(d, order, memflags); + if ( page == NULL ) + { + if ( order == 0 && memflags ) + { + /* Try again without any memflags. */ + memflags = 0; + order = MAX_ORDER; + continue; + } + if ( order == 0 ) + { + printk("Unable to allocate memory with order 0!\n"); + return -ENOMEM; + } + order--; + continue; + } + + rc = guest_physmap_add_page(d, _gfn(start), _mfn(page_to_mfn(page)), + order); + if ( rc != 0 ) + { + printk("Failed to populate memory: [%#lx,%lx): %d\n", + start, start + (1UL << order), rc); + return -ENOMEM; + } + start += 1UL << order; + nr_pages -= 1UL << order; + if ( (++i % MAP_MAX_ITER) == 0 ) + process_pending_softirqs(); + } + + return 0; +#undef MAP_MAX_ITER +} + +/* Steal RAM from the end of a memory region. */ +static int __init pvh_steal_ram(struct domain *d, unsigned long size, + unsigned long align, paddr_t limit, + paddr_t *addr) +{ + unsigned int i = d->arch.nr_e820; + + /* + * Alignment 0 should be set to 1, so it doesn't wrap around in the + * calculations below. + */ + align = align ? : 1; + while ( i-- ) + { + struct e820entry *entry = &d->arch.e820[i]; + + if ( entry->type != E820_RAM || entry->addr + entry->size > limit || + entry->addr < MB(1) ) + continue; + + *addr = (entry->addr + entry->size - size) & ~(align - 1); + if ( *addr < entry->addr ) + continue; + + entry->size = *addr - entry->addr; + return 0; + } + + return -ENOMEM; +} + +/* NB: memory map must be sorted at all times for this to work correctly. 
*/ +static int __init pvh_add_mem_range(struct domain *d, uint64_t s, uint64_t e, + unsigned int type) +{ + struct e820entry *map; + unsigned int i; + + for ( i = 0; i < d->arch.nr_e820; i++ ) + { + uint64_t rs = d->arch.e820[i].addr; + uint64_t re = rs + d->arch.e820[i].size; + + if ( rs == e && d->arch.e820[i].type == type ) + { + d->arch.e820[i].addr = s; + return 0; + } + + if ( re == s && d->arch.e820[i].type == type && + (i + 1 == d->arch.nr_e820 || d->arch.e820[i + 1].addr >= e) ) + { + d->arch.e820[i].size += e - s; + return 0; + } + + if ( rs >= e ) + break; + + if ( re > s ) + return -EEXIST; + } + + map = xzalloc_array(struct e820entry, d->arch.nr_e820 + 1); + if ( !map ) + { + printk(XENLOG_WARNING "E820: out of memory to add region\n"); + return -ENOMEM; + } + + memcpy(map, d->arch.e820, i * sizeof(*d->arch.e820)); + memcpy(map + i + 1, d->arch.e820 + i, + (d->arch.nr_e820 - i) * sizeof(*d->arch.e820)); + map[i].addr = s; + map[i].size = e - s; + map[i].type = type; + xfree(d->arch.e820); + d->arch.e820 = map; + d->arch.nr_e820++; + + return 0; +} + +static int __init pvh_setup_vmx_realmode_helpers(struct domain *d) +{ + p2m_type_t p2mt; + uint32_t rc, *ident_pt; + mfn_t mfn; + paddr_t gaddr; + struct vcpu *v = d->vcpu[0]; + + /* + * Steal some space from the last RAM region below 4GB and use it to + * store the real-mode TSS. It needs to be aligned to 128 so that the + * TSS structure (which accounts for the first 104b) doesn't cross + * a page boundary. + */ + if ( !pvh_steal_ram(d, HVM_VM86_TSS_SIZE, 128, GB(4), &gaddr) ) + { + if ( hvm_copy_to_guest_phys(gaddr, NULL, HVM_VM86_TSS_SIZE, v) != + HVMCOPY_okay ) + printk("Unable to zero VM86 TSS area\n"); + d->arch.hvm_domain.params[HVM_PARAM_VM86_TSS_SIZED] = + VM86_TSS_UPDATED | ((uint64_t)HVM_VM86_TSS_SIZE << 32) | gaddr; + if ( pvh_add_mem_range(d, gaddr, gaddr + HVM_VM86_TSS_SIZE, + E820_RESERVED) ) + printk("Unable to set VM86 TSS as reserved in the memory map\n"); + } + else + printk("Unable to allocate VM86 TSS area\n"); + + /* Steal some more RAM for the identity page tables. */ + if ( pvh_steal_ram(d, PAGE_SIZE, PAGE_SIZE, GB(4), &gaddr) ) + { + printk("Unable to find memory to stash the identity page tables\n"); + return -ENOMEM; + } + + /* + * Identity-map page table is required for running with CR0.PG=0 + * when using Intel EPT. Create a 32-bit non-PAE page directory of + * superpages. + */ + ident_pt = map_domain_gfn(p2m_get_hostp2m(d), _gfn(PFN_DOWN(gaddr)), + &mfn, &p2mt, 0, &rc); + if ( ident_pt == NULL ) + { + printk("Unable to map identity page tables\n"); + return -ENOMEM; + } + write_32bit_pse_identmap(ident_pt); + unmap_domain_page(ident_pt); + put_page(mfn_to_page(mfn_x(mfn))); + d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] = gaddr; + if ( pvh_add_mem_range(d, gaddr, gaddr + PAGE_SIZE, E820_RESERVED) ) + printk("Unable to set identity page tables as reserved in the memory map\n"); + + return 0; +} + +/* Assign the low 1MB to Dom0. 
*/ +static void __init pvh_steal_low_ram(struct domain *d, unsigned long start, + unsigned long nr_pages) +{ + unsigned long mfn; + + ASSERT(start + nr_pages <= PFN_DOWN(MB(1))); + + for ( mfn = start; mfn < start + nr_pages; mfn++ ) + { + struct page_info *pg = mfn_to_page(mfn); + int rc; + + rc = unshare_xen_page_with_guest(pg, dom_io); + if ( rc ) + { + printk("Unable to unshare Xen mfn %#lx: %d\n", mfn, rc); + continue; + } + + share_xen_page_with_guest(pg, d, XENSHARE_writable); + rc = guest_physmap_add_entry(d, _gfn(mfn), _mfn(mfn), 0, p2m_ram_rw); + if ( rc ) + printk("Unable to add mfn %#lx to p2m: %d\n", mfn, rc); + } +} + +static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages) +{ + struct e820entry *entry, *entry_guest; + unsigned int i; + unsigned long pages, cur_pages = 0; + uint64_t start, end; + + /* + * Craft the e820 memory map for Dom0 based on the hardware e820 map. + */ + d->arch.e820 = xzalloc_array(struct e820entry, e820.nr_map); + if ( !d->arch.e820 ) + panic("Unable to allocate memory for Dom0 e820 map"); + entry_guest = d->arch.e820; + + /* Clamp e820 memory map to match the memory assigned to Dom0 */ + for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ ) + { + if ( entry->type != E820_RAM ) + { + *entry_guest = *entry; + goto next; + } + + if ( nr_pages == cur_pages ) + { + /* + * We already have all the assigned memory, + * skip this entry + */ + continue; + } + + /* + * Make sure the start and length are aligned to PAGE_SIZE, because + * that's the minimum granularity of the 2nd stage translation. Since + * the p2m code uses PAGE_ORDER_4K internally, also use it here in + * order to prevent this code from getting out of sync. + */ + start = ROUNDUP(entry->addr, PAGE_SIZE << PAGE_ORDER_4K); + end = (entry->addr + entry->size) & + ~((PAGE_SIZE << PAGE_ORDER_4K) - 1); + if ( start >= end ) + continue; + + entry_guest->type = E820_RAM; + entry_guest->addr = start; + entry_guest->size = end - start; + pages = PFN_DOWN(entry_guest->size); + if ( (cur_pages + pages) > nr_pages ) + { + /* Truncate region */ + entry_guest->size = (nr_pages - cur_pages) << PAGE_SHIFT; + cur_pages = nr_pages; + } + else + { + cur_pages += pages; + } + next: + d->arch.nr_e820++; + entry_guest++; + } + ASSERT(cur_pages == nr_pages); + ASSERT(d->arch.nr_e820 <= e820.nr_map); +} + +static int __init pvh_setup_p2m(struct domain *d) +{ + struct vcpu *v = d->vcpu[0]; + unsigned long nr_pages; + unsigned int i; + int rc; + bool preempted; +#define MB1_PAGES PFN_DOWN(MB(1)) + + nr_pages = compute_dom0_nr_pages(d, NULL, 0); + + pvh_setup_e820(d, nr_pages); + do { + preempted = false; + paging_set_allocation(d, dom0_paging_pages(d, nr_pages), + &preempted); + process_pending_softirqs(); + } while ( preempted ); + + /* + * Memory below 1MB is identity mapped. + * NB: this only makes sense when booted from legacy BIOS. + */ + rc = modify_identity_mmio(d, 0, MB1_PAGES, true); + if ( rc ) + { + printk("Failed to identity map low 1MB: %d\n", rc); + return rc; + } + + /* Populate memory map. 
*/ + for ( i = 0; i < d->arch.nr_e820; i++ ) + { + unsigned long addr, size; + + if ( d->arch.e820[i].type != E820_RAM ) + continue; + + addr = PFN_DOWN(d->arch.e820[i].addr); + size = PFN_DOWN(d->arch.e820[i].size); + + if ( addr >= MB1_PAGES ) + rc = pvh_populate_memory_range(d, addr, size); + else + { + ASSERT(addr + size < MB1_PAGES); + pvh_steal_low_ram(d, addr, size); + } + + if ( rc ) + return rc; + } + + if ( cpu_has_vmx && paging_mode_hap(d) && !vmx_unrestricted_guest(v) ) + { + /* + * Since Dom0 cannot be migrated, we will only setup the + * unrestricted guest helpers if they are needed by the current + * hardware we are running on. + */ + rc = pvh_setup_vmx_realmode_helpers(d); + if ( rc ) + return rc; + } + + return 0; +#undef MB1_PAGES +} + +static int __init pvh_load_kernel(struct domain *d, const module_t *image, + unsigned long image_headroom, + module_t *initrd, void *image_base, + char *cmdline, paddr_t *entry, + paddr_t *start_info_addr) +{ + void *image_start = image_base + image_headroom; + unsigned long image_len = image->mod_end; + struct elf_binary elf; + struct elf_dom_parms parms; + paddr_t last_addr; + struct hvm_start_info start_info = { 0 }; + struct hvm_modlist_entry mod = { 0 }; + struct vcpu *v = d->vcpu[0]; + int rc; + + if ( (rc = bzimage_parse(image_base, &image_start, &image_len)) != 0 ) + { + printk("Error trying to detect bz compressed kernel\n"); + return rc; + } + + if ( (rc = elf_init(&elf, image_start, image_len)) != 0 ) + { + printk("Unable to init ELF\n"); + return rc; + } +#ifdef VERBOSE + elf_set_verbose(&elf); +#endif + elf_parse_binary(&elf); + if ( (rc = elf_xen_parse(&elf, &parms)) != 0 ) + { + printk("Unable to parse kernel for ELFNOTES\n"); + return rc; + } + + if ( parms.phys_entry == UNSET_ADDR32 ) + { + printk("Unable to find XEN_ELFNOTE_PHYS32_ENTRY address\n"); + return -EINVAL; + } + + printk("OS: %s version: %s loader: %s bitness: %s\n", parms.guest_os, + parms.guest_ver, parms.loader, + elf_64bit(&elf) ? "64-bit" : "32-bit"); + + /* Copy the OS image and free temporary buffer. */ + elf.dest_base = (void *)(parms.virt_kstart - parms.virt_base); + elf.dest_size = parms.virt_kend - parms.virt_kstart; + + elf_set_vcpu(&elf, v); + rc = elf_load_binary(&elf); + if ( rc < 0 ) + { + printk("Failed to load kernel: %d\n", rc); + printk("Xen dom0 kernel broken ELF: %s\n", elf_check_broken(&elf)); + return rc; + } + + last_addr = ROUNDUP(parms.virt_kend - parms.virt_base, PAGE_SIZE); + + if ( initrd != NULL ) + { + rc = hvm_copy_to_guest_phys(last_addr, mfn_to_virt(initrd->mod_start), + initrd->mod_end, v); + if ( rc ) + { + printk("Unable to copy initrd to guest\n"); + return rc; + } + + mod.paddr = last_addr; + mod.size = initrd->mod_end; + last_addr += ROUNDUP(initrd->mod_end, PAGE_SIZE); + } + + /* Free temporary buffers. */ + discard_initial_images(); + + if ( cmdline != NULL ) + { + rc = hvm_copy_to_guest_phys(last_addr, cmdline, strlen(cmdline) + 1, v); + if ( rc ) + { + printk("Unable to copy guest command line\n"); + return rc; + } + start_info.cmdline_paddr = last_addr; + /* + * Round up to 32/64 bits (depending on the guest kernel bitness) so + * the modlist/start_info is aligned. + */ + last_addr += ROUNDUP(strlen(cmdline) + 1, elf_64bit(&elf) ? 
8 : 4); + } + if ( initrd != NULL ) + { + rc = hvm_copy_to_guest_phys(last_addr, &mod, sizeof(mod), v); + if ( rc ) + { + printk("Unable to copy guest modules\n"); + return rc; + } + start_info.modlist_paddr = last_addr; + start_info.nr_modules = 1; + last_addr += sizeof(mod); + } + + start_info.magic = XEN_HVM_START_MAGIC_VALUE; + start_info.flags = SIF_PRIVILEGED | SIF_INITDOMAIN; + rc = hvm_copy_to_guest_phys(last_addr, &start_info, sizeof(start_info), v); + if ( rc ) + { + printk("Unable to copy start info to guest\n"); + return rc; + } + + *entry = parms.phys_entry; + *start_info_addr = last_addr; + + return 0; +} + +static int __init pvh_setup_cpus(struct domain *d, paddr_t entry, + paddr_t start_info) +{ + struct vcpu *v = d->vcpu[0]; + unsigned int cpu, i; + int rc; + /* + * This sets the vCPU state according to the state described in + * docs/misc/hvmlite.markdown. + */ + vcpu_hvm_context_t cpu_ctx = { + .mode = VCPU_HVM_MODE_32B, + .cpu_regs.x86_32.ebx = start_info, + .cpu_regs.x86_32.eip = entry, + .cpu_regs.x86_32.cr0 = X86_CR0_PE | X86_CR0_ET, + .cpu_regs.x86_32.cs_limit = ~0u, + .cpu_regs.x86_32.ds_limit = ~0u, + .cpu_regs.x86_32.ss_limit = ~0u, + .cpu_regs.x86_32.tr_limit = 0x67, + .cpu_regs.x86_32.cs_ar = 0xc9b, + .cpu_regs.x86_32.ds_ar = 0xc93, + .cpu_regs.x86_32.ss_ar = 0xc93, + .cpu_regs.x86_32.tr_ar = 0x8b, + }; + + cpu = v->processor; + for ( i = 1; i < d->max_vcpus; i++ ) + { + cpu = cpumask_cycle(cpu, &dom0_cpus); + setup_dom0_vcpu(d, i, cpu); + } + + rc = arch_set_info_hvm_guest(v, &cpu_ctx); + if ( rc ) + { + printk("Unable to setup Dom0 BSP context: %d\n", rc); + return rc; + } + + rc = setup_permissions(d); + if ( rc ) + { + panic("Unable to setup Dom0 permissions: %d\n", rc); + return rc; + } + + update_domain_wallclock_time(d); + + clear_bit(_VPF_down, &v->pause_flags); + + return 0; +} + +static int __init acpi_count_intr_ovr(struct acpi_subtable_header *header, + const unsigned long end) +{ + + acpi_intr_overrides++; + return 0; +} + +static int __init acpi_set_intr_ovr(struct acpi_subtable_header *header, + const unsigned long end) +{ + const struct acpi_madt_interrupt_override *intr = + container_of(header, struct acpi_madt_interrupt_override, header); + + *intsrcovr = *intr; + intsrcovr++; + + return 0; +} + +static int __init acpi_count_nmi_src(struct acpi_subtable_header *header, + const unsigned long end) +{ + + acpi_nmi_sources++; + return 0; +} + +static int __init acpi_set_nmi_src(struct acpi_subtable_header *header, + const unsigned long end) +{ + const struct acpi_madt_nmi_source *src = + container_of(header, struct acpi_madt_nmi_source, header); + + *nmisrc = *src; + nmisrc++; + + return 0; +} + +static int __init pvh_setup_acpi_madt(struct domain *d, paddr_t *addr) +{ + struct acpi_table_madt *madt; + struct acpi_table_header *table; + struct acpi_madt_io_apic *io_apic; + struct acpi_madt_local_x2apic *x2apic; + acpi_status status; + unsigned long size; + unsigned int i, max_vcpus; + int rc; + + /* Count number of interrupt overrides in the MADT. */ + acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, + acpi_count_intr_ovr, UINT_MAX); + + /* Count number of NMI sources in the MADT. */ + acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_count_nmi_src, + UINT_MAX); + + max_vcpus = dom0_max_vcpus(); + /* Calculate the size of the crafted MADT. */ + size = sizeof(*madt); + /* + * FIXME: the current vIO-APIC code just supports one IO-APIC instance + * per domain. 
This must be fixed in order to provide the same amount of + * IO APICs as available on bare metal. + */ + size += sizeof(*io_apic); + size += sizeof(*intsrcovr) * acpi_intr_overrides; + size += sizeof(*nmisrc) * acpi_nmi_sources; + size += sizeof(*x2apic) * max_vcpus; + + madt = xzalloc_bytes(size); + if ( !madt ) + { + printk("Unable to allocate memory for MADT table\n"); + rc = -ENOMEM; + goto out; + } + + /* Copy the native MADT table header. */ + status = acpi_get_table(ACPI_SIG_MADT, 0, &table); + if ( !ACPI_SUCCESS(status) ) + { + printk("Failed to get MADT ACPI table, aborting.\n"); + rc = -EINVAL; + goto out; + } + madt->header = *table; + madt->address = APIC_DEFAULT_PHYS_BASE; + /* + * NB: this is currently set to 4, which is the revision in the ACPI + * spec 6.1. Sadly ACPICA doesn't provide revision numbers for the + * tables described in the headers. + */ + madt->header.revision = min_t(unsigned char, table->revision, 4); + + /* + * Setup the IO APIC entry. + * FIXME: the current vIO-APIC code just supports one IO-APIC instance + * per domain. This must be fixed in order to provide the same amount of + * IO APICs as available on bare metal, and with the same IDs as found in + * the native IO APIC MADT entries. + */ + if ( nr_ioapics > 1 ) + printk("WARNING: found %d IO APICs, Dom0 will only have access to 1 emulated IO APIC\n", + nr_ioapics); + io_apic = (void *)(madt + 1); + io_apic->header.type = ACPI_MADT_TYPE_IO_APIC; + io_apic->header.length = sizeof(*io_apic); + io_apic->id = domain_vioapic(d)->id; + io_apic->address = VIOAPIC_DEFAULT_BASE_ADDRESS; + + x2apic = (void *)(io_apic + 1); + for ( i = 0; i < max_vcpus; i++ ) + { + x2apic->header.type = ACPI_MADT_TYPE_LOCAL_X2APIC; + x2apic->header.length = sizeof(*x2apic); + x2apic->uid = i; + x2apic->local_apic_id = i * 2; + x2apic->lapic_flags = ACPI_MADT_ENABLED; + x2apic++; + } + + /* Setup interrupt overrides. */ + intsrcovr = (void *)x2apic; + acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_set_intr_ovr, + acpi_intr_overrides); + + /* Setup NMI sources. */ + nmisrc = (void *)intsrcovr; + acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_set_nmi_src, + acpi_nmi_sources); + + ASSERT(((void *)nmisrc - (void *)madt) == size); + madt->header.length = size; + /* + * Calling acpi_tb_checksum here is a layering violation, but + * introducing a wrapper for such simple usage seems overkill. + */ + madt->header.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, madt), size); + + /* Place the new MADT in guest memory space. */ + if ( pvh_steal_ram(d, size, 0, GB(4), addr) ) + { + printk("Unable to find allocate guest RAM for MADT\n"); + rc = -ENOMEM; + goto out; + } + + /* Mark this region as E820_ACPI. 
*/ + if ( pvh_add_mem_range(d, *addr, *addr + size, E820_ACPI) ) + printk("Unable to add MADT region to memory map\n"); + + rc = hvm_copy_to_guest_phys(*addr, madt, size, d->vcpu[0]); + if ( rc ) + { + printk("Unable to copy MADT into guest memory\n"); + goto out; + } + + rc = 0; + + out: + xfree(madt); + + return rc; +} + +static bool __init acpi_memory_banned(unsigned long address, + unsigned long size) +{ + unsigned long mfn, nr_pages, i; + + mfn = PFN_DOWN(address); + nr_pages = PFN_UP((address & ~PAGE_MASK) + size); + for ( i = 0 ; i < nr_pages; i++ ) + if ( !page_is_ram_type(mfn + i, RAM_TYPE_RESERVED) && + !page_is_ram_type(mfn + i, RAM_TYPE_ACPI) ) + return true; + + return false; +} + +static bool __init pvh_acpi_table_allowed(const char *sig) +{ + static const char __initconst banned_tables[][ACPI_NAME_SIZE] = { + ACPI_SIG_HPET, ACPI_SIG_SLIT, ACPI_SIG_SRAT, ACPI_SIG_MPST, + ACPI_SIG_PMTT, ACPI_SIG_MADT, ACPI_SIG_DMAR}; + unsigned int i; + + for ( i = 0 ; i < ARRAY_SIZE(banned_tables); i++ ) + if ( strncmp(sig, banned_tables[i], ACPI_NAME_SIZE) == 0 ) + return false; + + /* Make sure table doesn't reside in a RAM region. */ + if ( acpi_memory_banned(acpi_gbl_root_table_list.tables[i].address, + acpi_gbl_root_table_list.tables[i].length) ) + { + printk("Skipping table %.4s because resides in a non-ACPI, non-reserved region\n", + sig); + return false; + } + + return true; +} + +static int __init pvh_setup_acpi_xsdt(struct domain *d, paddr_t madt_addr, + paddr_t *addr) +{ + struct acpi_table_xsdt *xsdt; + struct acpi_table_header *table; + struct acpi_table_rsdp *rsdp; + unsigned long size = sizeof(*xsdt); + unsigned int i, j, num_tables = 0; + paddr_t xsdt_paddr; + int rc; + + /* + * Restore original DMAR table signature, we are going to filter it from + * the new XSDT that is presented to the guest, so it is no longer + * necessary to have it's signature zapped. + */ + acpi_dmar_reinstate(); + + /* Count the number of tables that will be added to the XSDT. */ + for( i = 0; i < acpi_gbl_root_table_list.count; i++ ) + { + const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii; + + if ( pvh_acpi_table_allowed(sig) ) + num_tables++; + } + + /* + * No need to add or subtract anything because struct acpi_table_xsdt + * includes one array slot already, and we have filtered out the original + * MADT and we are going to add a custom built MADT. + */ + size += num_tables * sizeof(xsdt->table_offset_entry[0]); + + xsdt = xzalloc_bytes(size); + if ( !xsdt ) + { + printk("Unable to allocate memory for XSDT table\n"); + rc = -ENOMEM; + goto out; + } + + /* Copy the native XSDT table header. */ + rsdp = acpi_os_map_memory(acpi_os_get_root_pointer(), sizeof(*rsdp)); + if ( !rsdp ) + { + printk("Unable to map RSDP\n"); + rc = -EINVAL; + goto out; + } + xsdt_paddr = rsdp->xsdt_physical_address; + acpi_os_unmap_memory(rsdp, sizeof(*rsdp)); + table = acpi_os_map_memory(xsdt_paddr, sizeof(*table)); + if ( !table ) + { + printk("Unable to map XSDT\n"); + rc = -EINVAL; + goto out; + } + xsdt->header = *table; + acpi_os_unmap_memory(table, sizeof(*table)); + + /* Add the custom MADT. */ + xsdt->table_offset_entry[0] = madt_addr; + + /* Copy the addresses of the rest of the allowed tables. 
*/ + for( i = 0, j = 1; i < acpi_gbl_root_table_list.count; i++ ) + { + const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii; + + if ( pvh_acpi_table_allowed(sig) ) + xsdt->table_offset_entry[j++] = + acpi_gbl_root_table_list.tables[i].address; + } + + xsdt->header.revision = 1; + xsdt->header.length = size; + /* + * Calling acpi_tb_checksum here is a layering violation, but + * introducing a wrapper for such simple usage seems overkill. + */ + xsdt->header.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, xsdt), size); + + /* Place the new XSDT in guest memory space. */ + if ( pvh_steal_ram(d, size, 0, GB(4), addr) ) + { + printk("Unable to find guest RAM for XSDT\n"); + rc = -ENOMEM; + goto out; + } + + /* Mark this region as E820_ACPI. */ + if ( pvh_add_mem_range(d, *addr, *addr + size, E820_ACPI) ) + printk("Unable to add XSDT region to memory map\n"); + + rc = hvm_copy_to_guest_phys(*addr, xsdt, size, d->vcpu[0]); + if ( rc ) + { + printk("Unable to copy XSDT into guest memory\n"); + goto out; + } + + rc = 0; + + out: + xfree(xsdt); + + return rc; +} + +static int __init pvh_setup_acpi(struct domain *d, paddr_t start_info) +{ + unsigned long pfn, nr_pages; + paddr_t madt_paddr, xsdt_paddr, rsdp_paddr; + unsigned int i; + int rc; + struct acpi_table_rsdp *native_rsdp, rsdp = { + .signature = ACPI_SIG_RSDP, + .revision = 2, + .length = sizeof(rsdp), + }; + + + /* Scan top-level tables and add their regions to the guest memory map. */ + for( i = 0; i < acpi_gbl_root_table_list.count; i++ ) + { + const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii; + unsigned long addr = acpi_gbl_root_table_list.tables[i].address; + unsigned long size = acpi_gbl_root_table_list.tables[i].length; + + /* + * Make sure the original MADT is also mapped, so that Dom0 can + * properly access the data returned by _MAT methods in case it's + * re-using MADT memory. + */ + if ( strncmp(sig, ACPI_SIG_MADT, ACPI_NAME_SIZE) + ? pvh_acpi_table_allowed(sig) + : !acpi_memory_banned(addr, size) ) + pvh_add_mem_range(d, addr, addr + size, E820_ACPI); + } + + /* Identity map ACPI e820 regions. */ + for ( i = 0; i < d->arch.nr_e820; i++ ) + { + if ( d->arch.e820[i].type != E820_ACPI && + d->arch.e820[i].type != E820_NVS ) + continue; + + pfn = PFN_DOWN(d->arch.e820[i].addr); + nr_pages = PFN_UP((d->arch.e820[i].addr & ~PAGE_MASK) + + d->arch.e820[i].size); + + rc = modify_identity_mmio(d, pfn, nr_pages, true); + if ( rc ) + { + printk("Failed to map ACPI region [%#lx, %#lx) into Dom0 memory map\n", + pfn, pfn + nr_pages); + return rc; + } + } + + rc = pvh_setup_acpi_madt(d, &madt_paddr); + if ( rc ) + return rc; + + rc = pvh_setup_acpi_xsdt(d, madt_paddr, &xsdt_paddr); + if ( rc ) + return rc; + + /* Craft a custom RSDP. */ + native_rsdp = acpi_os_map_memory(acpi_os_get_root_pointer(), sizeof(rsdp)); + if ( !native_rsdp ) + { + printk("Failed to map native RSDP\n"); + return -ENOMEM; + } + memcpy(rsdp.oem_id, native_rsdp->oem_id, sizeof(rsdp.oem_id)); + acpi_os_unmap_memory(native_rsdp, sizeof(rsdp)); + rsdp.xsdt_physical_address = xsdt_paddr; + /* + * Calling acpi_tb_checksum here is a layering violation, but + * introducing a wrapper for such simple usage seems overkill. + */ + rsdp.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, &rsdp), + ACPI_RSDP_REV0_SIZE); + rsdp.extended_checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, &rsdp), + sizeof(rsdp)); + + /* + * Place the new RSDP in guest memory space. 
+ * + * NB: this RSDP is not going to replace the original RSDP, which should + * still be accessible to the guest. However that RSDP is going to point to + * the native RSDT, and should not be used for the Dom0 kernel's boot + * purposes (we keep it visible for post boot access). + */ + if ( pvh_steal_ram(d, sizeof(rsdp), 0, GB(4), &rsdp_paddr) ) + { + printk("Unable to allocate guest RAM for RSDP\n"); + return -ENOMEM; + } + + /* Mark this region as E820_ACPI. */ + if ( pvh_add_mem_range(d, rsdp_paddr, rsdp_paddr + sizeof(rsdp), + E820_ACPI) ) + printk("Unable to add RSDP region to memory map\n"); + + /* Copy RSDP into guest memory. */ + rc = hvm_copy_to_guest_phys(rsdp_paddr, &rsdp, sizeof(rsdp), d->vcpu[0]); + if ( rc ) + { + printk("Unable to copy RSDP into guest memory\n"); + return rc; + } + + /* Copy RSDP address to start_info. */ + rc = hvm_copy_to_guest_phys(start_info + + offsetof(struct hvm_start_info, rsdp_paddr), + &rsdp_paddr, + sizeof(((struct hvm_start_info *) + 0)->rsdp_paddr), + d->vcpu[0]); + if ( rc ) + { + printk("Unable to copy RSDP into guest memory\n"); + return rc; + } + + return 0; +} + +static int __init construct_dom0_pvh(struct domain *d, const module_t *image, + unsigned long image_headroom, + module_t *initrd, + void *(*bootstrap_map)(const module_t *), + char *cmdline) +{ + paddr_t entry, start_info; + int rc; + + printk("** Building a PVH Dom0 **\n"); + + iommu_hwdom_init(d); + + rc = pvh_setup_p2m(d); + if ( rc ) + { + printk("Failed to setup Dom0 physical memory map\n"); + return rc; + } + + rc = pvh_load_kernel(d, image, image_headroom, initrd, bootstrap_map(image), + cmdline, &entry, &start_info); + if ( rc ) + { + printk("Failed to load Dom0 kernel\n"); + return rc; + } + + rc = pvh_setup_cpus(d, entry, start_info); + if ( rc ) + { + printk("Failed to setup Dom0 CPUs: %d\n", rc); + return rc; + } + + rc = pvh_setup_acpi(d, start_info); + if ( rc ) + { + printk("Failed to setup Dom0 ACPI tables: %d\n", rc); + return rc; + } + + panic("Building a PVHv2 Dom0 is not yet supported."); + return 0; +} + +int __init construct_dom0(struct domain *d, const module_t *image, + unsigned long image_headroom, module_t *initrd, + void *(*bootstrap_map)(const module_t *), + char *cmdline) +{ + /* Sanity! */ + BUG_ON(d->domain_id != 0); + BUG_ON(d->vcpu[0] == NULL); + BUG_ON(d->vcpu[0]->is_initialised); + + process_pending_softirqs(); + + return (is_hvm_domain(d) ? 
construct_dom0_pvh : construct_dom0_pv) + (d, image, image_headroom, initrd,bootstrap_map, cmdline); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c deleted file mode 100644 index 00ce5f2..0000000 --- a/xen/arch/x86/domain_build.c +++ /dev/null @@ -1,2438 +0,0 @@ -/****************************************************************************** - * domain_build.c - * - * Copyright (c) 2002-2005, K A Fraser - */ - -#include <xen/init.h> -#include <xen/lib.h> -#include <xen/ctype.h> -#include <xen/sched.h> -#include <xen/sched-if.h> -#include <xen/smp.h> -#include <xen/delay.h> -#include <xen/event.h> -#include <xen/console.h> -#include <xen/kernel.h> -#include <xen/domain.h> -#include <xen/version.h> -#include <xen/iocap.h> -#include <xen/bitops.h> -#include <xen/compat.h> -#include <xen/libelf.h> -#include <xen/pfn.h> -#include <xen/guest_access.h> -#include <xen/acpi.h> -#include <asm/regs.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/processor.h> -#include <asm/desc.h> -#include <asm/i387.h> -#include <asm/paging.h> -#include <asm/p2m.h> -#include <asm/e820.h> -#include <asm/acpi.h> -#include <asm/setup.h> -#include <asm/bzimage.h> /* for bzimage_parse */ -#include <asm/io_apic.h> -#include <asm/hpet.h> - -#include <acpi/actables.h> - -#include <public/version.h> -#include <public/hvm/hvm_info_table.h> -#include <public/arch-x86/hvm/start_info.h> -#include <public/hvm/hvm_vcpu.h> - -static long __initdata dom0_nrpages; -static long __initdata dom0_min_nrpages; -static long __initdata dom0_max_nrpages = LONG_MAX; - -/* - * Have the TSS cover the ISA port range, which makes it - * - 104 bytes base structure - * - 32 bytes interrupt redirection bitmap - * - 128 bytes I/O bitmap - * - one trailing byte - * or a total of 265 bytes. - * - * NB: as PVHv2 Dom0 doesn't have legacy devices (ISA), it shouldn't have any - * business in accessing the ISA port range, much less in real mode, and due to - * the lack of firmware it shouldn't also execute any INT instruction. This is - * done just for consistency with what hvmloader does. - */ -#define HVM_VM86_TSS_SIZE 265 - -static unsigned int __initdata acpi_intr_overrides; -static struct acpi_madt_interrupt_override __initdata *intsrcovr; - -static unsigned int __initdata acpi_nmi_sources; -static struct acpi_madt_nmi_source __initdata *nmisrc; - -/* - * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>] - * - * <min_amt>: The minimum amount of memory which should be allocated for dom0. - * <max_amt>: The maximum amount of memory which should be allocated for dom0. - * <amt>: The precise amount of memory to allocate for dom0. - * - * Notes: - * 1. <amt> is clamped from below by <min_amt> and from above by available - * memory and <max_amt> - * 2. <min_amt> is clamped from above by available memory and <max_amt> - * 3. <min_amt> is ignored if it is greater than <max_amt> - * 4. If <amt> is not specified, it is calculated as follows: - * "All of memory is allocated to domain 0, minus 1/16th which is reserved - * for uses such as DMA buffers (the reservation is clamped to 128MB)." - * - * Each value can be specified as positive or negative: - * If +ve: The specified amount is an absolute value. - * If -ve: The specified amount is subtracted from total available memory. 
- */ -static long __init parse_amt(const char *s, const char **ps) -{ - long pages = parse_size_and_unit((*s == '-') ? s+1 : s, ps) >> PAGE_SHIFT; - return (*s == '-') ? -pages : pages; -} -static void __init parse_dom0_mem(const char *s) -{ - do { - if ( !strncmp(s, "min:", 4) ) - dom0_min_nrpages = parse_amt(s+4, &s); - else if ( !strncmp(s, "max:", 4) ) - dom0_max_nrpages = parse_amt(s+4, &s); - else - dom0_nrpages = parse_amt(s, &s); - } while ( *s++ == ',' ); -} -custom_param("dom0_mem", parse_dom0_mem); - -static unsigned int __initdata opt_dom0_max_vcpus_min = 1; -static unsigned int __initdata opt_dom0_max_vcpus_max = UINT_MAX; - -static void __init parse_dom0_max_vcpus(const char *s) -{ - if ( *s == '-' ) /* -M */ - opt_dom0_max_vcpus_max = simple_strtoul(s + 1, &s, 0); - else /* N, N-, or N-M */ - { - opt_dom0_max_vcpus_min = simple_strtoul(s, &s, 0); - if ( opt_dom0_max_vcpus_min == 0 ) - opt_dom0_max_vcpus_min = 1; - if ( !*s ) /* N */ - opt_dom0_max_vcpus_max = opt_dom0_max_vcpus_min; - else if ( *s++ == '-' && *s ) /* N-M */ - opt_dom0_max_vcpus_max = simple_strtoul(s, &s, 0); - } -} -custom_param("dom0_max_vcpus", parse_dom0_max_vcpus); - -static __initdata unsigned int dom0_nr_pxms; -static __initdata unsigned int dom0_pxms[MAX_NUMNODES] = - { [0 ... MAX_NUMNODES - 1] = ~0 }; -static __initdata bool_t dom0_affinity_relaxed; - -static void __init parse_dom0_nodes(const char *s) -{ - do { - if ( isdigit(*s) ) - dom0_pxms[dom0_nr_pxms] = simple_strtoul(s, &s, 0); - else if ( !strncmp(s, "relaxed", 7) && (!s[7] || s[7] == ',') ) - { - dom0_affinity_relaxed = 1; - s += 7; - } - else if ( !strncmp(s, "strict", 6) && (!s[6] || s[6] == ',') ) - { - dom0_affinity_relaxed = 0; - s += 6; - } - else - break; - } while ( ++dom0_nr_pxms < ARRAY_SIZE(dom0_pxms) && *s++ == ',' ); -} -custom_param("dom0_nodes", parse_dom0_nodes); - -static cpumask_t __initdata dom0_cpus; - -static struct vcpu *__init setup_dom0_vcpu(struct domain *d, - unsigned int vcpu_id, - unsigned int cpu) -{ - struct vcpu *v = alloc_vcpu(d, vcpu_id, cpu); - - if ( v ) - { - if ( !d->is_pinned && !dom0_affinity_relaxed ) - cpumask_copy(v->cpu_hard_affinity, &dom0_cpus); - cpumask_copy(v->cpu_soft_affinity, &dom0_cpus); - } - - return v; -} - -static nodemask_t __initdata dom0_nodes; - -unsigned int __init dom0_max_vcpus(void) -{ - unsigned int i, max_vcpus, limit; - nodeid_t node; - - for ( i = 0; i < dom0_nr_pxms; ++i ) - if ( (node = pxm_to_node(dom0_pxms[i])) != NUMA_NO_NODE ) - node_set(node, dom0_nodes); - nodes_and(dom0_nodes, dom0_nodes, node_online_map); - if ( nodes_empty(dom0_nodes) ) - dom0_nodes = node_online_map; - for_each_node_mask ( node, dom0_nodes ) - cpumask_or(&dom0_cpus, &dom0_cpus, &node_to_cpumask(node)); - cpumask_and(&dom0_cpus, &dom0_cpus, cpupool0->cpu_valid); - if ( cpumask_empty(&dom0_cpus) ) - cpumask_copy(&dom0_cpus, cpupool0->cpu_valid); - - max_vcpus = cpumask_weight(&dom0_cpus); - if ( opt_dom0_max_vcpus_min > max_vcpus ) - max_vcpus = opt_dom0_max_vcpus_min; - if ( opt_dom0_max_vcpus_max < max_vcpus ) - max_vcpus = opt_dom0_max_vcpus_max; - limit = dom0_pvh ? 
HVM_MAX_VCPUS : MAX_VIRT_CPUS; - if ( max_vcpus > limit ) - max_vcpus = limit; - - return max_vcpus; -} - -struct vcpu *__init alloc_dom0_vcpu0(struct domain *dom0) -{ - unsigned int max_vcpus = dom0_max_vcpus(); - - dom0->node_affinity = dom0_nodes; - dom0->auto_node_affinity = !dom0_nr_pxms; - - dom0->vcpu = xzalloc_array(struct vcpu *, max_vcpus); - if ( !dom0->vcpu ) - return NULL; - dom0->max_vcpus = max_vcpus; - - return setup_dom0_vcpu(dom0, 0, cpumask_first(&dom0_cpus)); -} - -#ifdef CONFIG_SHADOW_PAGING -bool __initdata opt_dom0_shadow; -boolean_param("dom0_shadow", opt_dom0_shadow); -#endif -bool __initdata dom0_pvh; - -/* - * List of parameters that affect Dom0 creation: - * - * - pvh Create a PVHv2 Dom0. - * - shadow Use shadow paging for Dom0. - */ -static void __init parse_dom0_param(char *s) -{ - char *ss; - - do { - - ss = strchr(s, ','); - if ( ss ) - *ss = '\0'; - - if ( !strcmp(s, "pvh") ) - dom0_pvh = true; -#ifdef CONFIG_SHADOW_PAGING - else if ( !strcmp(s, "shadow") ) - opt_dom0_shadow = true; -#endif - - s = ss + 1; - } while ( ss ); -} -custom_param("dom0", parse_dom0_param); - -static char __initdata opt_dom0_ioports_disable[200] = ""; -string_param("dom0_ioports_disable", opt_dom0_ioports_disable); - -static bool_t __initdata ro_hpet = 1; -boolean_param("ro-hpet", ro_hpet); - -/* Allow ring-3 access in long mode as guest cannot use ring 1 ... */ -#define BASE_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER) -#define L1_PROT (BASE_PROT|_PAGE_GUEST_KERNEL) -/* ... except for compatibility mode guests. */ -#define COMPAT_L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED) -#define L2_PROT (BASE_PROT|_PAGE_DIRTY) -#define L3_PROT (BASE_PROT|_PAGE_DIRTY) -#define L4_PROT (BASE_PROT|_PAGE_DIRTY) - -static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node; - -static struct page_info * __init alloc_chunk( - struct domain *d, unsigned long max_pages) -{ - static unsigned int __initdata last_order = MAX_ORDER; - struct page_info *page; - unsigned int order = get_order_from_pages(max_pages), free_order; - - if ( order > last_order ) - order = last_order; - else if ( max_pages & (max_pages - 1) ) - --order; - while ( (page = alloc_domheap_pages(d, order, memflags)) == NULL ) - if ( order-- == 0 ) - break; - if ( page ) - last_order = order; - else if ( memflags ) - { - /* - * Allocate up to 2MB at a time: It prevents allocating very large - * chunks from DMA pools before the >4GB pool is fully depleted. - */ - last_order = 21 - PAGE_SHIFT; - memflags = 0; - return alloc_chunk(d, max_pages); - } - - /* - * Make a reasonable attempt at finding a smaller chunk at a higher - * address, to avoid allocating from low memory as much as possible. 
- */ - for ( free_order = order; !memflags && page && order--; ) - { - struct page_info *pg2; - - if ( d->tot_pages + (1 << order) > d->max_pages ) - continue; - pg2 = alloc_domheap_pages(d, order, MEMF_exact_node); - if ( pg2 > page ) - { - free_domheap_pages(page, free_order); - page = pg2; - free_order = order; - } - else if ( pg2 ) - free_domheap_pages(pg2, order); - } - return page; -} - -static unsigned long __init dom0_paging_pages(const struct domain *d, - unsigned long nr_pages) -{ - /* Copied from: libxl_get_required_shadow_memory() */ - unsigned long memkb = nr_pages * (PAGE_SIZE / 1024); - - memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024)); - - return ((memkb + 1023) / 1024) << (20 - PAGE_SHIFT); -} - -static unsigned long __init compute_dom0_nr_pages( - struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len) -{ - nodeid_t node; - unsigned long avail = 0, nr_pages, min_pages, max_pages; - bool_t need_paging; - - for_each_node_mask ( node, dom0_nodes ) - avail += avail_domheap_pages_region(node, 0, 0) + - initial_images_nrpages(node); - - /* Reserve memory for further dom0 vcpu-struct allocations... */ - avail -= (d->max_vcpus - 1UL) - << get_order_from_bytes(sizeof(struct vcpu)); - /* ...and compat_l4's, if needed. */ - if ( is_pv_32bit_domain(d) ) - avail -= d->max_vcpus - 1; - - /* Reserve memory for iommu_dom0_init() (rough estimate). */ - if ( iommu_enabled ) - { - unsigned int s; - - for ( s = 9; s < BITS_PER_LONG; s += 9 ) - avail -= max_pdx >> s; - } - - need_paging = is_hvm_domain(d) ? !iommu_hap_pt_share || !paging_mode_hap(d) - : opt_dom0_shadow; - for ( ; ; need_paging = 0 ) - { - nr_pages = dom0_nrpages; - min_pages = dom0_min_nrpages; - max_pages = dom0_max_nrpages; - - /* - * If allocation isn't specified, reserve 1/16th of available memory - * for things like DMA buffers. This reservation is clamped to a - * maximum of 128MB. - */ - if ( nr_pages == 0 ) - nr_pages = -min(avail / 16, 128UL << (20 - PAGE_SHIFT)); - - /* Negative specification means "all memory - specified amount". */ - if ( (long)nr_pages < 0 ) nr_pages += avail; - if ( (long)min_pages < 0 ) min_pages += avail; - if ( (long)max_pages < 0 ) max_pages += avail; - - /* Clamp according to min/max limits and available memory. */ - nr_pages = max(nr_pages, min_pages); - nr_pages = min(nr_pages, max_pages); - nr_pages = min(nr_pages, avail); - - if ( !need_paging ) - break; - - /* Reserve memory for shadow or HAP. */ - avail -= dom0_paging_pages(d, nr_pages); - } - - if ( is_pv_domain(d) && - (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) && - ((dom0_min_nrpages <= 0) || (nr_pages > min_pages)) ) - { - /* - * Legacy Linux kernels (i.e. such without a XEN_ELFNOTE_INIT_P2M - * note) require that there is enough virtual space beyond the initial - * allocation to set up their initial page tables. This space is - * roughly the same size as the p2m table, so make sure the initial - * allocation doesn't consume more than about half the space that's - * available between params.virt_base and the address space end. - */ - unsigned long vstart, vend, end; - size_t sizeof_long = is_pv_32bit_domain(d) ? sizeof(int) : sizeof(long); - - vstart = parms->virt_base; - vend = round_pgup(parms->virt_kend); - if ( !parms->unmapped_initrd ) - vend += round_pgup(initrd_len); - end = vend + nr_pages * sizeof_long; - - if ( end > vstart ) - end += end - vstart; - if ( end <= vstart || - (sizeof_long < sizeof(end) && end > (1UL << (8 * sizeof_long))) ) - { - end = sizeof_long >= sizeof(end) ? 
0 : 1UL << (8 * sizeof_long); - nr_pages = (end - vend) / (2 * sizeof_long); - if ( dom0_min_nrpages > 0 && nr_pages < min_pages ) - nr_pages = min_pages; - printk("Dom0 memory clipped to %lu pages\n", nr_pages); - } - } - - d->max_pages = min_t(unsigned long, max_pages, UINT_MAX); - - return nr_pages; -} - -static void __init process_dom0_ioports_disable(struct domain *dom0) -{ - unsigned long io_from, io_to; - char *t, *s = opt_dom0_ioports_disable; - const char *u; - - if ( *s == '\0' ) - return; - - while ( (t = strsep(&s, ",")) != NULL ) - { - io_from = simple_strtoul(t, &u, 16); - if ( u == t ) - { - parse_error: - printk("Invalid ioport range <%s> " - "in dom0_ioports_disable, skipping\n", t); - continue; - } - - if ( *u == '\0' ) - io_to = io_from; - else if ( *u == '-' ) - io_to = simple_strtoul(u + 1, &u, 16); - else - goto parse_error; - - if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) ) - goto parse_error; - - printk("Disabling dom0 access to ioport range %04lx-%04lx\n", - io_from, io_to); - - if ( ioports_deny_access(dom0, io_from, io_to) != 0 ) - BUG(); - } -} - -static __init void dom0_update_physmap(struct domain *d, unsigned long pfn, - unsigned long mfn, unsigned long vphysmap_s) -{ - if ( !is_pv_32bit_domain(d) ) - ((unsigned long *)vphysmap_s)[pfn] = mfn; - else - ((unsigned int *)vphysmap_s)[pfn] = mfn; - - set_gpfn_from_mfn(mfn, pfn); -} - -static __init void mark_pv_pt_pages_rdonly(struct domain *d, - l4_pgentry_t *l4start, - unsigned long vpt_start, - unsigned long nr_pt_pages) -{ - unsigned long count; - struct page_info *page; - l4_pgentry_t *pl4e; - l3_pgentry_t *pl3e; - l2_pgentry_t *pl2e; - l1_pgentry_t *pl1e; - - pl4e = l4start + l4_table_offset(vpt_start); - pl3e = l4e_to_l3e(*pl4e); - pl3e += l3_table_offset(vpt_start); - pl2e = l3e_to_l2e(*pl3e); - pl2e += l2_table_offset(vpt_start); - pl1e = l2e_to_l1e(*pl2e); - pl1e += l1_table_offset(vpt_start); - for ( count = 0; count < nr_pt_pages; count++ ) - { - l1e_remove_flags(*pl1e, _PAGE_RW); - page = mfn_to_page(l1e_get_pfn(*pl1e)); - - /* Read-only mapping + PGC_allocated + page-table page. */ - page->count_info = PGC_allocated | 3; - page->u.inuse.type_info |= PGT_validated | 1; - - /* Top-level p.t. is pinned. */ - if ( (page->u.inuse.type_info & PGT_type_mask) == - (!is_pv_32bit_domain(d) ? - PGT_l4_page_table : PGT_l3_page_table) ) - { - page->count_info += 1; _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxx https://lists.xenproject.org/xen-changelog
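The crafted MADT, XSDT and RSDP in the new dom0_build.c all patch their checksum with the same "header.checksum -= acpi_tb_checksum(...)" pattern. That works because an ACPI table is considered valid when the byte sum of the entire table, checksum field included, is 0 modulo 256, so subtracting the current sum from the checksum byte zeroes the total. A minimal standalone sketch of the idea (byte_sum() is a stand-in for ACPICA's acpi_tb_checksum(), and the 44-byte dummy table is made up purely for illustration):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sum all bytes of a buffer modulo 256 (what acpi_tb_checksum() returns). */
static uint8_t byte_sum(const void *buf, size_t len)
{
    const uint8_t *p = buf;
    uint8_t sum = 0;

    while ( len-- )
        sum += *p++;

    return sum;
}

int main(void)
{
    uint8_t table[44];                /* dummy table; contents don't matter */

    memset(table, 0xab, sizeof(table));

    /* The checksum byte sits at offset 9 of the standard ACPI table header.
     * Subtracting the current sum works whatever the old checksum value was. */
    table[9] -= byte_sum(table, sizeof(table));

    /* The whole table now sums to zero, which is what parsers verify. */
    printf("byte sum after fixup: %u\n", byte_sum(table, sizeof(table)));

    return 0;
}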
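For the paging reservation, dom0_paging_pages() (kept verbatim across the rename, and copied from libxl_get_required_shadow_memory()) amounts to roughly 1 MiB per vCPU plus 8 KiB per MiB of dom0 RAM. As a worked example, assuming a 4-vCPU dom0 with 4 GiB of RAM (1048576 4-KiB pages):

    memkb = 4 * (256 * 4 + 2 * (4194304 / 1024))
          = 4 * (1024 + 8192)
          = 36864 KiB

and ((36864 + 1023) / 1024) << (20 - PAGE_SHIFT) = 36 << 8 = 9216 pages, i.e. 36 MiB are subtracted from the available memory for shadow/HAP structures before the final allocation is computed.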
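The dom0_mem grammar documented at the top of the file (min:, max: and a plain amount, each optionally negative to mean "all of memory minus this much") combines with the clamping done in compute_dom0_nr_pages(). A simplified, self-contained sketch of that behaviour follows; mib_to_pages() is a made-up stand-in for Xen's parse_size_and_unit() (which also understands unit suffixes), and the 8 GiB "avail" figure is just an example:

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SHIFT 12   /* usual x86 page size of 4 KiB */

/* Hypothetical stand-in for parse_size_and_unit(): bare MiB count -> pages. */
static long mib_to_pages(const char *s, const char **ps)
{
    char *end;
    long mib = strtol(s, &end, 10);

    *ps = end;
    return mib << (20 - PAGE_SHIFT);
}

/* Same shape as parse_amt(): leading '-' = "all of memory minus this much". */
static long parse_amt(const char *s, const char **ps)
{
    long pages = mib_to_pages((*s == '-') ? s + 1 : s, ps);

    return (*s == '-') ? -pages : pages;
}

int main(void)
{
    const char *s = "min:512,max:-1024,4096";   /* example option string */
    long nr = 0, min_nr = 0, max_nr = LONG_MAX;
    long avail = 8L << (30 - PAGE_SHIFT);       /* pretend 8 GiB are free */

    do {
        if ( !strncmp(s, "min:", 4) )
            min_nr = parse_amt(s + 4, &s);
        else if ( !strncmp(s, "max:", 4) )
            max_nr = parse_amt(s + 4, &s);
        else
            nr = parse_amt(s, &s);
    } while ( *s++ == ',' );

    /* Negative specification means "all memory minus the given amount".
     * (The real code also derives a default when no amount is given at all;
     * that part is omitted here.) */
    if ( nr < 0 )     nr += avail;
    if ( min_nr < 0 ) min_nr += avail;
    if ( max_nr < 0 ) max_nr += avail;

    /* Clamp from below by min, then from above by max and available memory,
     * so min is effectively ignored if it exceeds max. */
    if ( nr < min_nr ) nr = min_nr;
    if ( nr > max_nr ) nr = max_nr;
    if ( nr > avail )  nr = avail;

    printf("dom0 gets %ld pages (%ld MiB)\n", nr, nr >> (20 - PAGE_SHIFT));
    return 0;
}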