[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v1 15/18] x86: rework domain page allocation


  • To: "Daniel P. Smith" <dpsmith@xxxxxxxxxxxxxxxxxxxx>
  • From: Jan Beulich <jbeulich@xxxxxxxx>
  • Date: Wed, 27 Jul 2022 15:22:08 +0200
  • Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=suse.com; dmarc=pass action=none header.from=suse.com; dkim=pass header.d=suse.com; arc=none
  • Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-AntiSpam-MessageData-ChunkCount:X-MS-Exchange-AntiSpam-MessageData-0:X-MS-Exchange-AntiSpam-MessageData-1; bh=Uz1Pzwt+NpeMuxYKJIKtLCQNN3fyW25kuPYevPazw7U=; b=kO1XAAhsmrzQUKDvr8TFFGe32N5jFVPYaLuCIRnAcMm2w/YtdkcF2J2TQEIURN0hT4INn0msxBPfXC8Jl9muk0dwd/5v8DfI73ly+DA0NKVGgttMBPO4/SvksceYr6lGP0VaLQ/4VeRmrLrTiT0kYeFHPnQLOYV5VXoyzow3BYcXi9u/Z/yfUsel3r0/REl5YWbFkUdKSTPr4fJT0dlcOOpXHYwQ/2DxmiAKJlVpznqgjbGR6oSfNvFsLsSlW20jBWVV/DZYD47ZoyJwpRWjNBBwrjh2kMIZ4voW4BhdVq/lZN5OBct1w6PqlY6mxkqELPQ5yURSZRBt/ySJRdDmJA==
  • Arc-seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=E4tjM0nVtB8NfrraIpCZSsHIpn5YLuoT9YU+B21UYBJMT6+wcP0K3iRhAvpDb4mXX/QnBFEip2d1cdR8MWlGBHpJRHrogBgb7s9U62aV5JDdts+UJDx52jdayiItazw4f9kekO7ADtIKeUyfSEMChHE6hM006Eix8vVhqSHtNvHdcbRhY4Qvjy2ZS6vrjaKlV3RycnLXUcoScpZK9p2kp/wsKcS5Ghv8iaK+C8iiCGbkQwAqc7k2kubVYHeWZGpmpxnuhrd7uJRkD5CrFu/jrB1VNlFCY8JvWt6jSPr7oVjETGaXUY8mxkQEQgSWSL5KvVTliLyRXxoZpu2bBBxfng==
  • Authentication-results: dkim=none (message not signed) header.d=none;dmarc=none action=none header.from=suse.com;
  • Cc: scott.davis@xxxxxxxxxx, christopher.clark@xxxxxxxxxx, Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>, xen-devel@xxxxxxxxxxxxxxxxxxxx, Wei Liu <wl@xxxxxxx>
  • Delivery-date: Wed, 27 Jul 2022 13:22:31 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

On 06.07.2022 23:04, Daniel P. Smith wrote:
> This reworks all the dom0 page allocation functions for general domain
> construction. Where possible, common logic between the two was split into a
> separate function for reuse by the two functions.

You absolutely need to mention what behavioral / functional changes there
are (intended), even if the answer is "none".

> --- a/xen/arch/x86/dom0_build.c
> +++ b/xen/arch/x86/dom0_build.c
> @@ -320,69 +320,31 @@ static unsigned long __init default_nr_pages(unsigned 
> long avail)
>  }
>  
>  unsigned long __init dom0_compute_nr_pages(
> -    struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len)
> +    struct boot_domain *bd, struct elf_dom_parms *parms,
> +    unsigned long initrd_len)
>  {
> -    nodeid_t node;
> -    unsigned long avail = 0, nr_pages, min_pages, max_pages, iommu_pages = 0;
> +    unsigned long avail, nr_pages, min_pages, max_pages;
>  
>      /* The ordering of operands is to work around a clang5 issue. */
>      if ( CONFIG_DOM0_MEM[0] && !dom0_mem_set )
>          parse_dom0_mem(CONFIG_DOM0_MEM);
>  
> -    for_each_node_mask ( node, dom0_nodes )
> -        avail += avail_domheap_pages_region(node, 0, 0) +
> -                 initial_images_nrpages(node);
> -
> -    /* Reserve memory for further dom0 vcpu-struct allocations... */
> -    avail -= (d->max_vcpus - 1UL)
> -             << get_order_from_bytes(sizeof(struct vcpu));
> -    /* ...and compat_l4's, if needed. */
> -    if ( is_pv_32bit_domain(d) )
> -        avail -= d->max_vcpus - 1;
> -
> -    /* Reserve memory for iommu_dom0_init() (rough estimate). */
> -    if ( is_iommu_enabled(d) && !iommu_hwdom_passthrough )
> -    {
> -        unsigned int s;
> -
> -        for ( s = 9; s < BITS_PER_LONG; s += 9 )
> -            iommu_pages += max_pdx >> s;
> -
> -        avail -= iommu_pages;
> -    }
> +    avail = dom_avail_nr_pages(bd, dom0_nodes);
>  
> -    if ( paging_mode_enabled(d) || opt_dom0_shadow || opt_pv_l1tf_hwdom )
> +    /* command line overrides configuration */
> +    if (  dom0_mem_set )

Nit: Stray double blanks.

>      {
> -        unsigned long cpu_pages;
> -
> -        nr_pages = get_memsize(&dom0_size, avail) ?: default_nr_pages(avail);
> -
> -        /*
> -         * Clamp according to min/max limits and available memory
> -         * (preliminary).
> -         */
> -        nr_pages = max(nr_pages, get_memsize(&dom0_min_size, avail));
> -        nr_pages = min(nr_pages, get_memsize(&dom0_max_size, avail));
> -        nr_pages = min(nr_pages, avail);
> -
> -        cpu_pages = dom0_paging_pages(d, nr_pages);
> -
> -        if ( !iommu_use_hap_pt(d) )
> -            avail -= cpu_pages;
> -        else if ( cpu_pages > iommu_pages )
> -            avail -= cpu_pages - iommu_pages;

I can't see any of this represented in the new code. Have you gone through
the history of this code, to understand why things are the way they are,
and hence what (corner) cases need to remain behaviorally unchanged?

> @@ -40,6 +42,106 @@ struct vcpu *__init alloc_dom_vcpu0(struct boot_domain 
> *bd)
>  }
>  
>  
> +unsigned long __init dom_avail_nr_pages(
> +    struct boot_domain *bd, nodemask_t nodes)
> +{
> +    unsigned long avail = 0, iommu_pages = 0;
> +    bool is_ctldom = false, is_hwdom = false;
> +    unsigned long nr_pages = bd->meminfo.mem_size.nr_pages;
> +    nodeid_t node;
> +
> +    if ( builder_is_ctldom(bd) )
> +        is_ctldom = true;
> +    if ( builder_is_hwdom(bd) )
> +        is_hwdom = true;
> +
> +    for_each_node_mask ( node, nodes )
> +        avail += avail_domheap_pages_region(node, 0, 0) +
> +                 initial_images_nrpages(node);

I don't think this is suitable for other than Dom0, so I question the
splitting out and generalizing of this logic. For "ordinary" domains
their memory size should be well-defined rather than inferred from
host capacity.

Starting from host capacity also means you become ordering dependent
when it comes to creating (not starting) all the domains: Which one
is to come first? And even with this limited to just Dom0 - is its
size calculated before or after all the other domains were created?

> +    /* Reserve memory for further dom0 vcpu-struct allocations... */

dom0?

> +    avail -= (bd->domain->max_vcpus - 1UL)
> +             << get_order_from_bytes(sizeof(struct vcpu));
> +    /* ...and compat_l4's, if needed. */
> +    if ( is_pv_32bit_domain(bd->domain) )
> +        avail -= bd->domain->max_vcpus - 1;
> +
> +    /* Reserve memory for iommu_dom0_init() (rough estimate). */
> +    if ( is_hwdom && is_iommu_enabled(bd->domain) && 
> !iommu_hwdom_passthrough )

Again the question why this would be Dom0-only.

> +    {
> +        unsigned int s;
> +
> +        for ( s = 9; s < BITS_PER_LONG; s += 9 )
> +            iommu_pages += max_pdx >> s;
> +
> +        avail -= iommu_pages;
> +    }
> +
> +    if ( paging_mode_enabled(bd->domain) ||
> +         (is_ctldom && opt_dom0_shadow) ||
> +         (is_hwdom && opt_pv_l1tf_hwdom) )

An interesting combination of conditions. It (again) looks to me as if
it first needs properly separating Dom0 from hwdom, in an abstract
sense.

> +    {
> +        unsigned long cpu_pages = dom0_paging_pages(bd->domain, nr_pages);
> +
> +        if ( !iommu_use_hap_pt(bd->domain) )
> +            avail -= cpu_pages;
> +        else if ( cpu_pages > iommu_pages )
> +            avail -= cpu_pages - iommu_pages;
> +    }
> +
> +    return avail;
> +}
> +
> +unsigned long __init dom_compute_nr_pages(
> +    struct boot_domain *bd, struct elf_dom_parms *parms,
> +    unsigned long initrd_len)
> +{
> +    unsigned long avail, nr_pages = bd->meminfo.mem_size.nr_pages;
> +
> +    if ( builder_is_initdom(bd) )
> +        return dom0_compute_nr_pages(bd, parms, initrd_len);
> +
> +    avail = dom_avail_nr_pages(bd, node_online_map);
> +
> +    if ( is_pv_domain(bd->domain) && (parms->p2m_base == UNSET_ADDR) )
> +    {
> +        /*
> +         * Legacy Linux kernels (i.e. such without a XEN_ELFNOTE_INIT_P2M
> +         * note) require that there is enough virtual space beyond the 
> initial
> +         * allocation to set up their initial page tables. This space is
> +         * roughly the same size as the p2m table, so make sure the initial
> +         * allocation doesn't consume more than about half the space that's
> +         * available between params.virt_base and the address space end.
> +         */

This duplicates an existing comment (and hence below likely also
existing code) rather than replacing / moving the original. As in
an earlier case - how are the two going to remain in sync?

Jan



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.