[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

RE: [XEN RFC PATCH 12/40] xen/x86: Move numa_initmem_init to common


  • To: Julien Grall <julien@xxxxxxx>, "xen-devel@xxxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxxx>, "sstabellini@xxxxxxxxxx" <sstabellini@xxxxxxxxxx>
  • From: Wei Chen <Wei.Chen@xxxxxxx>
  • Date: Wed, 25 Aug 2021 11:15:15 +0000
  • Accept-language: en-US
  • Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=arm.com; dmarc=pass action=none header.from=arm.com; dkim=pass header.d=arm.com; arc=none
  • Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=o7b9NfXRP5iYc+BDdzVlW9+N+dX/idxhiVCxcaLW/6c=; b=a/8VvJi+Y7jsopYxiIONujhyF4SVAtlS6fiRxnVAqstIffZVpNj00jwT69R0bLTYzwBRBANBLD/k/Zpj7JzPMaDiVhRvRzTAMJqMpknnmfmGLocWadJDnlyIZPqtUqQeiNTerQF+J7WK+eb5eX5ZVPwRFr0p/Sq7Ke+JNSF1uwnbhLDFsEiJfMl2KbPTAK0rhSWvbU6KqPhiCj3UrIuZUBp1aV4lqDBrltH76G398IV32Xb4ohYv1J6fABrhYOKOxnNHY5DpfjBhhsKEmbmlHOyyfGEMYUObcOgOVmoyxbtn8K2HgTwp/oUtvd3DydYcSTE0l6QHp/5US3R6npyJ5Q==
  • Arc-seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=A8VCkJ59zsIjjOrqlCb5vnvkapQiQMFDOkEWVWKo8PUEMVvCvUPLENm+gIVJ03k7woxAevVnVV5FjVAbYgR6Ev3WiNURP9Gmjyo4wk7oXzQnmZlAl1Ar4snXWDR48G/RHJcFyCBFn/EGcuTN8EH72997dTmn3fhs3j07DDJV8WsAFT7njSzBb95DkymLTCcZBreCM7DpEutZtM/q+AKEOk3z7/D8QmPh0x64qt06vYd6lakavj3I4W+bOd/g8BqEK7e1g631TfXIyOTWN3G3L6XMre0vkYNWDwdkGnmLdaLQAObgMhjVN+5Z43FHuMOEGaUVS6+wxpCdZLodXbYqPw==
  • Authentication-results-original: xen.org; dkim=none (message not signed) header.d=none;xen.org; dmarc=none action=none header.from=arm.com;
  • Cc: Bertrand Marquis <Bertrand.Marquis@xxxxxxx>
  • Delivery-date: Wed, 25 Aug 2021 11:15:37 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>
  • Nodisclaimer: true
  • Original-authentication-results: xen.org; dkim=none (message not signed) header.d=none;xen.org; dmarc=none action=none header.from=arm.com;
  • Thread-index: AQHXjpswu81a+7plB06pZowt7s9MAKuEGMmAgAANx9A=
  • Thread-topic: [XEN RFC PATCH 12/40] xen/x86: Move numa_initmem_init to common

Hi Julien,

> -----Original Message-----
> From: Julien Grall <julien@xxxxxxx>
> Sent: 2021年8月25日 18:22
> To: Wei Chen <Wei.Chen@xxxxxxx>; xen-devel@xxxxxxxxxxxxxxxxxxxx;
> sstabellini@xxxxxxxxxx; jbeulich@xxxxxxxx
> Cc: Bertrand Marquis <Bertrand.Marquis@xxxxxxx>
> Subject: Re: [XEN RFC PATCH 12/40] xen/x86: Move numa_initmem_init to
> common
> 
> Hi Wei,
> 
> On 11/08/2021 11:23, Wei Chen wrote:
> > This function can be reused by Arm device tree based
> > NUMA support. So we move it from x86 to common, as well
> > as its related variables and functions:
> > setup_node_bootmem, numa_init_array and numa_emulation.
> >
> > As numa_initmem_init has been moved to common, _memnodemap
> > is not used cross files. We can restore _memnodemap to
> > static.
> 
> As we discussed on a previous patch, we should try to avoid this kind of
> dance. I can help to find a split that would achieve that.
> 

Yes, thanks!

> >
> > Signed-off-by: Wei Chen <wei.chen@xxxxxxx>
> > ---
> >   xen/arch/x86/numa.c         | 118 ----------------------------------
> >   xen/common/numa.c           | 122 +++++++++++++++++++++++++++++++++++-
> >   xen/include/asm-x86/numa.h  |   5 --
> >   xen/include/asm-x86/setup.h |   1 -
> >   xen/include/xen/numa.h      |   8 ++-
> >   5 files changed, 128 insertions(+), 126 deletions(-)
> >
> > diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
> > index f2626b3968..6908738305 100644
> > --- a/xen/arch/x86/numa.c
> > +++ b/xen/arch/x86/numa.c
> > @@ -38,7 +38,6 @@ nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
> >
> >   nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };
> >
> > -bool numa_off;
> >   s8 acpi_numa = 0;
> >
> >   int srat_disabled(void)
> > @@ -46,123 +45,6 @@ int srat_disabled(void)
> >       return numa_off || acpi_numa < 0;
> >   }
> >
> > -/* initialize NODE_DATA given nodeid and start/end */
> > -void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)
> > -{
> > -    unsigned long start_pfn, end_pfn;
> > -
> > -    start_pfn = start >> PAGE_SHIFT;
> > -    end_pfn = end >> PAGE_SHIFT;
> > -
> > -    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
> > -    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
> > -
> > -    node_set_online(nodeid);
> > -}
> > -
> > -void __init numa_init_array(void)
> > -{
> > -    int rr, i;
> > -
> > -    /* There are unfortunately some poorly designed mainboards around
> > -       that only connect memory to a single CPU. This breaks the 1:1
> cpu->node
> > -       mapping. To avoid this fill in the mapping for all possible
> > -       CPUs, as the number of CPUs is not known yet.
> > -       We round robin the existing nodes. */
> > -    rr = first_node(node_online_map);
> > -    for ( i = 0; i < nr_cpu_ids; i++ )
> > -    {
> > -        if ( cpu_to_node[i] != NUMA_NO_NODE )
> > -            continue;
> > -        numa_set_node(i, rr);
> > -        rr = cycle_node(rr, node_online_map);
> > -    }
> > -}
> > -
> > -#ifdef CONFIG_NUMA_EMU
> > -static int numa_fake __initdata = 0;
> > -
> > -/* Numa emulation */
> > -static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
> > -{
> > -    int i;
> > -    struct node nodes[MAX_NUMNODES];
> > -    u64 sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
> > -
> > -    /* Kludge needed for the hash function */
> > -    if ( hweight64(sz) > 1 )
> > -    {
> > -        u64 x = 1;
> > -        while ( (x << 1) < sz )
> > -            x <<= 1;
> > -        if ( x < sz/2 )
> > -            printk(KERN_ERR "Numa emulation unbalanced. Complain to
> maintainer\n");
> > -        sz = x;
> > -    }
> > -
> > -    memset(&nodes,0,sizeof(nodes));
> > -    for ( i = 0; i < numa_fake; i++ )
> > -    {
> > -        nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
> > -        if ( i == numa_fake - 1 )
> > -            sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
> > -        nodes[i].end = nodes[i].start + sz;
> > -        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64"
> (%"PRIu64"MB)\n",
> > -               i,
> > -               nodes[i].start, nodes[i].end,
> > -               (nodes[i].end - nodes[i].start) >> 20);
> > -        node_set_online(i);
> > -    }
> > -    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
> > -    if ( memnode_shift < 0 )
> > -    {
> > -        memnode_shift = 0;
> > -        printk(KERN_ERR "No NUMA hash function found. Emulation
> disabled.\n");
> > -        return -1;
> > -    }
> > -    for_each_online_node ( i )
> > -        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
> > -    numa_init_array();
> > -
> > -    return 0;
> > -}
> > -#endif
> > -
> > -void __init numa_initmem_init(unsigned long start_pfn, unsigned long
> end_pfn)
> > -{
> > -    int i;
> > -
> > -#ifdef CONFIG_NUMA_EMU
> > -    if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
> > -        return;
> > -#endif
> > -
> > -#ifdef CONFIG_ACPI_NUMA
> > -    if ( !numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
> > -         (u64)end_pfn << PAGE_SHIFT) )
> > -        return;
> > -#endif
> > -
> > -    printk(KERN_INFO "%s\n",
> > -           numa_off ? "NUMA turned off" : "No NUMA configuration
> found");
> > -
> > -    printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
> > -           (u64)start_pfn << PAGE_SHIFT,
> > -           (u64)end_pfn << PAGE_SHIFT);
> > -    /* setup dummy node covering all memory */
> > -    memnode_shift = BITS_PER_LONG - 1;
> > -    memnodemap = _memnodemap;
> > -    memnodemapsize = ARRAY_SIZE(_memnodemap);
> > -
> > -    nodes_clear(node_online_map);
> > -    node_set_online(0);
> > -    for ( i = 0; i < nr_cpu_ids; i++ )
> > -        numa_set_node(i, 0);
> > -    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
> > -    setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT,
> > -                    (u64)end_pfn << PAGE_SHIFT);
> > -}
> > -
> >   void numa_set_node(int cpu, nodeid_t node)
> >   {
> >       cpu_to_node[cpu] = node;
> > diff --git a/xen/common/numa.c b/xen/common/numa.c
> > index 1facc8fe2b..26c0006d04 100644
> > --- a/xen/common/numa.c
> > +++ b/xen/common/numa.c
> > @@ -14,12 +14,13 @@
> >   #include <xen/smp.h>
> >   #include <xen/pfn.h>
> >   #include <xen/sched.h>
> 
> NIT: We tend to add a newline betwen <xen/...> headers and <asm/...>
> headers.
> 

got it

> > +#include <asm/acpi.h>
> >
> >   struct node_data node_data[MAX_NUMNODES];
> >
> >   /* Mapping from pdx to node id */
> >   int memnode_shift;
> > -typeof(*memnodemap) _memnodemap[64];
> > +static typeof(*memnodemap) _memnodemap[64];
> >   unsigned long memnodemapsize;
> >   u8 *memnodemap;
> >
> > @@ -34,6 +35,8 @@ int num_node_memblks;
> >   struct node node_memblk_range[NR_NODE_MEMBLKS];
> >   nodeid_t memblk_nodeid[NR_NODE_MEMBLKS];
> >
> > +bool numa_off;
> > +
> >   /*
> >    * Given a shift value, try to populate memnodemap[]
> >    * Returns :
> > @@ -191,3 +194,120 @@ void numa_add_cpu(int cpu)
> >   {
> >       cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
> >   }
> > +
> > +/* initialize NODE_DATA given nodeid and start/end */
> > +void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)
> 
>  From an abstract PoV, start and end should be paddr_t. This should be
> done on a separate patch though.
> 

Ok.

> > +{
> > +    unsigned long start_pfn, end_pfn;
> > +
> > +    start_pfn = start >> PAGE_SHIFT;
> > +    end_pfn = end >> PAGE_SHIFT;
> > +
> > +    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
> > +    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
> > +
> > +    node_set_online(nodeid);
> > +}
> > +
> > +void __init numa_init_array(void)
> > +{
> > +    int rr, i;
> > +
> > +    /* There are unfortunately some poorly designed mainboards around
> > +       that only connect memory to a single CPU. This breaks the 1:1
> cpu->node
> > +       mapping. To avoid this fill in the mapping for all possible
> > +       CPUs, as the number of CPUs is not known yet.
> > +       We round robin the existing nodes. */
> > +    rr = first_node(node_online_map);
> > +    for ( i = 0; i < nr_cpu_ids; i++ )
> > +    {
> > +        if ( cpu_to_node[i] != NUMA_NO_NODE )
> > +            continue;
> > +        numa_set_node(i, rr);
> > +        rr = cycle_node(rr, node_online_map);
> > +    }
> > +}
> > +
> > +#ifdef CONFIG_NUMA_EMU
> > +int numa_fake __initdata = 0;
> > +
> > +/* Numa emulation */
> > +static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
> 
> Here, this should be either "unsigned long" or ideally "mfn_t".
> Although, if you use "unsigned long", you will need to...
> 

Do we need a separate patch to do it?

> > +{
> > +    int i;
> > +    struct node nodes[MAX_NUMNODES];
> > +    u64 sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
> 
> ... cast "(end_pfn - start_pfn)" to uin64_t or use pfn_to_paddr().
> 

Ok

> > +
> > +    /* Kludge needed for the hash function */
> > +    if ( hweight64(sz) > 1 )
> > +    {
> > +        u64 x = 1;
> > +        while ( (x << 1) < sz )
> > +            x <<= 1;
> > +        if ( x < sz/2 )
> > +            printk(KERN_ERR "Numa emulation unbalanced. Complain to
> maintainer\n");
> > +        sz = x;
> > +    }
> > +
> > +    memset(&nodes,0,sizeof(nodes));
> > +    for ( i = 0; i < numa_fake; i++ )
> > +    {
> > +        nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
> > +        if ( i == numa_fake - 1 )
> > +            sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
> > +        nodes[i].end = nodes[i].start + sz;
> > +        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64"
> (%"PRIu64"MB)\n",
> > +               i,
> > +               nodes[i].start, nodes[i].end,
> > +               (nodes[i].end - nodes[i].start) >> 20);
> > +        node_set_online(i);
> > +    }
> > +    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
> > +    if ( memnode_shift < 0 )
> > +    {
> > +        memnode_shift = 0;
> > +        printk(KERN_ERR "No NUMA hash function found. Emulation
> disabled.\n");
> > +        return -1;
> > +    }
> > +    for_each_online_node ( i )
> > +        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
> > +    numa_init_array();
> > +
> > +    return 0;
> > +}
> > +#endif
> > +
> > +void __init numa_initmem_init(unsigned long start_pfn, unsigned long
> end_pfn)
> > +{
> > +    int i;
> > +
> > +#ifdef CONFIG_NUMA_EMU
> > +    if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
> > +        return;
> > +#endif
> > +
> > +#ifdef CONFIG_ACPI_NUMA
> > +    if ( !numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
> > +         (u64)end_pfn << PAGE_SHIFT) )
> 
> (u64)v << PAGE_SHIFT should be switched to use pfn_to_paddr() or
> mfn_to_paddr() if you decide to make start_pfn and end_pfn typesafe.
> 

Still need a separate patch to change it before move?

> > +        return;
> > +#endif
> > +
> > +    printk(KERN_INFO "%s\n",
> > +           numa_off ? "NUMA turned off" : "No NUMA configuration
> found");
> > +
> > +    printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
> > +           (u64)start_pfn << PAGE_SHIFT,
> > +           (u64)end_pfn << PAGE_SHIFT);
> 
> Same remark here. PRIx64 would also have to be switched to PRIpaddr.
> 

Hmm, It seems I'd better to use a separate patch to do PRIpaddr clean up
before move code.

> > +    /* setup dummy node covering all memory */
> > +    memnode_shift = BITS_PER_LONG - 1;
> > +    memnodemap = _memnodemap;
> > +    memnodemapsize = ARRAY_SIZE(_memnodemap);
> > +
> > +    nodes_clear(node_online_map);
> > +    node_set_online(0);
> > +    for ( i = 0; i < nr_cpu_ids; i++ )
> > +        numa_set_node(i, 0);
> > +    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
> > +    setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT,
> > +                    (u64)end_pfn << PAGE_SHIFT);
> > +}
> > diff --git a/xen/include/asm-x86/numa.h b/xen/include/asm-x86/numa.h
> > index e8a92ad9df..f8e4e15586 100644
> > --- a/xen/include/asm-x86/numa.h
> > +++ b/xen/include/asm-x86/numa.h
> > @@ -21,16 +21,11 @@ extern nodeid_t pxm_to_node(unsigned int pxm);
> >
> >   #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
> >
> > -extern void numa_init_array(void);
> > -extern bool numa_off;
> > -
> > -
> >   extern int srat_disabled(void);
> >   extern void numa_set_node(int cpu, nodeid_t node);
> >   extern nodeid_t setup_node(unsigned int pxm);
> >   extern void srat_detect_node(int cpu);
> >
> > -extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end);
> >   extern nodeid_t apicid_to_node[];
> >   extern void init_cpu_to_node(void);
> >
> > diff --git a/xen/include/asm-x86/setup.h b/xen/include/asm-x86/setup.h
> > index 24be46115d..63838ba2d1 100644
> > --- a/xen/include/asm-x86/setup.h
> > +++ b/xen/include/asm-x86/setup.h
> > @@ -17,7 +17,6 @@ void early_time_init(void);
> >
> >   void set_nr_cpu_ids(unsigned int max_cpus);
> >
> > -void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
> >   void arch_init_memory(void);
> >   void subarch_init_memory(void);
> >
> > diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h
> > index 67b79a73a3..258a5cb3db 100644
> > --- a/xen/include/xen/numa.h
> > +++ b/xen/include/xen/numa.h
> > @@ -26,7 +26,6 @@
> >   extern int memnode_shift;
> >   extern unsigned long memnodemapsize;
> >   extern u8 *memnodemap;
> > -extern typeof(*memnodemap) _memnodemap[64];
> >
> >   struct node_data {
> >       unsigned long node_start_pfn;
> > @@ -69,6 +68,13 @@ extern int conflicting_memblks(u64 start, u64 end);
> >   extern void cutoff_node(int i, u64 start, u64 end);
> >   extern int valid_numa_range(u64 start, u64 end, nodeid_t node);
> >
> > +extern void numa_init_array(void);
> > +extern void numa_initmem_init(unsigned long start_pfn, unsigned long
> end_pfn);
> > +extern bool numa_off;
> > +extern int numa_fake;
> > +
> > +extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end);
> > +
> >   #endif /* CONFIG_NUMA */
> >
> >   #endif /* _XEN_NUMA_H */
> >
> 
> Cheers,
> 
> --
> Julien Grall

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.