[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v3 2/6] xen/x86: move generically usable NUMA code from x86 to common


  • To: Wei Chen <wei.chen@xxxxxxx>
  • From: Jan Beulich <jbeulich@xxxxxxxx>
  • Date: Thu, 25 Aug 2022 12:58:09 +0200
  • Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=suse.com; dmarc=pass action=none header.from=suse.com; dkim=pass header.d=suse.com; arc=none
  • Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-AntiSpam-MessageData-ChunkCount:X-MS-Exchange-AntiSpam-MessageData-0:X-MS-Exchange-AntiSpam-MessageData-1; bh=kVcDTDa4xb0OK4+x7pmNUaClrXv5F5u4nG3dDHNIolU=; b=CPqK/FBDVZd2KblkHO/ISjtRXM41bbWVDjGbcjQiTj1Rqimn0bdMd9WUp4O2k7Xc8moRlL0zj24lRkNmt49kX6DoWCK4MWnoXEJP+vhoaNG/pDsn4KprTx6WgPVEYVzHBdJ2UksBjUrPswcEndiUj74FWyHM59AbPbzdp7kL8hL7aPZkTNhi3zprkEY7WmJh5uoOO0lCMPhAYoGsv+/9sMxJaI3yNQ/BoYKH5Qani1+HJrN/l4BUmzR58JZlZaXz4INjnygdKAmM9Hs3ZOW5AbZdi8r0+5fqpZccUlYDNppw6BPsWaYL4jYY7T8Hfu5mPNoOtIOUQ8lBNEPnxL7LCQ==
  • Arc-seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=JOf1WFivHEUjCVLkzk7adBI3nlvYvO4wrk4X3pX96bPbH+oTRYU0hMM4x0JB5qUZUm0+RdfS1aGtzsURmLW0wgB67Klo7ztstnRIPU2eH/zPgSXRRvLxwJrUp3MHtRB2aeFNxzfVCc7hp2uuCpf7UFArxqeG1lh46NU6WfxO5uFcOOyB8TdWfGUsfCYKG+3vOqUdSj41kJd3KXUDKfX3a1CBSGbjQJ1ICK/0gacEV9YmwAXquGIDL6UgEfURWkPsqtJgiIjHCsbBrjioCWNyPigGKBtzlzPhLCvXvS+AZiM8bZeu83nmswqFirdf5bsHZP60lZwb4dvtNcxIEkJkMA==
  • Authentication-results: dkim=none (message not signed) header.d=none;dmarc=none action=none header.from=suse.com;
  • Cc: nd@xxxxxxx, Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>, Wei Liu <wl@xxxxxxx>, George Dunlap <george.dunlap@xxxxxxxxxx>, Julien Grall <julien@xxxxxxx>, Stefano Stabellini <sstabellini@xxxxxxxxxx>, xen-devel@xxxxxxxxxxxxxxxxxxxx
  • Delivery-date: Thu, 25 Aug 2022 10:58:21 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

On 22.08.2022 04:58, Wei Chen wrote:
> --- /dev/null
> +++ b/xen/common/numa.c
> @@ -0,0 +1,440 @@
> +/*
> + * Generic VM initialization for NUMA setups.
> + * Copyright 2002,2003 Andi Kleen, SuSE Labs.
> + * Adapted for Xen: Ryan Harper <ryanh@xxxxxxxxxx>
> + */
> +
> +#include <xen/init.h>
> +#include <xen/keyhandler.h>
> +#include <xen/mm.h>
> +#include <xen/nodemask.h>
> +#include <xen/numa.h>
> +#include <xen/param.h>
> +#include <xen/sched.h>
> +#include <xen/softirq.h>
> +
> +struct node_data __ro_after_init node_data[MAX_NUMNODES];
> +
> +/* Mapping from pdx to node id */
> +unsigned int __ro_after_init memnode_shift;
> +unsigned long __ro_after_init memnodemapsize;
> +uint8_t *__ro_after_init memnodemap;
> +static uint8_t __ro_after_init _memnodemap[64];
> +
> +nodeid_t __ro_after_init cpu_to_node[NR_CPUS] = {

I don't think this can be __ro_after_init, or you'll break CPU
hotplug.

> +    [0 ... NR_CPUS-1] = NUMA_NO_NODE
> +};
> +
> +cpumask_t __ro_after_init node_to_cpumask[MAX_NUMNODES];

Same here.

> +nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };
> +
> +bool __read_mostly numa_off;

This, otoh, can be, or have I missed a place where it's written by a
non-__init function?

> +bool numa_disabled(void)
> +{
> +    return numa_off || arch_numa_disabled(false);
> +}
> +
> +/*
> + * Given a shift value, try to populate memnodemap[]
> + * Returns :
> + * 1 if OK
> + * 0 if memnodmap[] too small (of shift too small)
> + * -1 if node overlap or lost ram (shift too big)
> + */
> +static int __init populate_memnodemap(const struct node *nodes,
> +                                      nodeid_t numnodes, unsigned int shift,

I don't think you can use nodeid_t for a variable holding a node count.
Think of what would happen if there were 256 nodes, the IDs of which
all fit in nodeid_t. (Same again further down.)

> +                                      nodeid_t *nodeids)
> +{
> +    unsigned long spdx, epdx;
> +    nodeid_t i;

This is likely inefficient for a loop counter variable. Note how you
use "unsigned int" in e.g. extract_lsb_from_nodes().

> +unsigned int __init compute_hash_shift(const struct node *nodes,
> +                                       nodeid_t numnodes, nodeid_t *nodeids)
> +{
> +    unsigned int shift;
> +
> +    shift = extract_lsb_from_nodes(nodes, numnodes);
> +    if ( memnodemapsize <= ARRAY_SIZE(_memnodemap) )
> +        memnodemap = _memnodemap;
> +    else if ( allocate_cachealigned_memnodemap() )
> +        return -1;

With this the function can't very well have "unsigned int" return type.

> +void __init numa_init_array(void)
> +{
> +    int rr, i;

"unsigned int" for i and perhaps nodeid_t for rr?

> +static int __init numa_emulation(unsigned long start_pfn,
> +                                 unsigned long end_pfn)
> +{
> +    unsigned int i;
> +    struct node nodes[MAX_NUMNODES];
> +    uint64_t sz = pfn_to_paddr(end_pfn - start_pfn) / numa_fake;
> +
> +    /* Kludge needed for the hash function */
> +    if ( hweight64(sz) > 1 )
> +    {
> +        u64 x = 1;

uint64_t and a blank line between declaration(s) and statement(s)
please.

> +        while ( (x << 1) < sz )
> +            x <<= 1;
> +        if ( x < sz / 2 )
> +            printk(KERN_ERR "Numa emulation unbalanced. Complain to 
> maintainer\n");
> +        sz = x;
> +    }
> +
> +    memset(&nodes, 0, sizeof(nodes));
> +    for ( i = 0; i < numa_fake; i++ )
> +    {
> +        nodes[i].start = pfn_to_paddr(start_pfn) + i * sz;
> +        if ( i == numa_fake - 1 )
> +            sz = pfn_to_paddr(end_pfn) - nodes[i].start;
> +        nodes[i].end = nodes[i].start + sz;
> +        printk(KERN_INFO "Faking node %u at %"PRIx64"-%"PRIx64" 
> (%"PRIu64"MB)\n",
> +               i, nodes[i].start, nodes[i].end,
> +               (nodes[i].end - nodes[i].start) >> 20);
> +        node_set_online(i);
> +    }
> +    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
> +    if ( memnode_shift < 0 )

Does the compiler not warn here, comparing an unsigned value for being
negative?

> --- a/xen/include/xen/numa.h
> +++ b/xen/include/xen/numa.h
> @@ -18,4 +18,70 @@
>    (((d)->vcpu != NULL && (d)->vcpu[0] != NULL) \
>     ? vcpu_to_node((d)->vcpu[0]) : NUMA_NO_NODE)
>  
> +/* The following content can be used when NUMA feature is enabled */
> +#ifdef CONFIG_NUMA
> +
> +extern nodeid_t      cpu_to_node[NR_CPUS];
> +extern cpumask_t     node_to_cpumask[];
> +
> +#define cpu_to_node(cpu)        (cpu_to_node[cpu])
> +#define parent_node(node)       (node)
> +#define node_to_first_cpu(node) (__ffs(node_to_cpumask[node]))
> +#define node_to_cpumask(node)   (node_to_cpumask[node])

Please could you take the opportunity and drop unnecessary parentheses
from here? Afaict only parent_node() need them to be kept.

> +struct node {
> +    paddr_t start, end;
> +};
> +
> +extern unsigned int compute_hash_shift(const struct node *nodes,
> +                                       nodeid_t numnodes, nodeid_t *nodeids);
> +
> +#define VIRTUAL_BUG_ON(x)
> +
> +extern bool numa_off;
> +extern void numa_add_cpu(unsigned int cpu);

Please can you have variable and function declarations visually separated,
by adding a blank line between them?

> +extern void numa_init_array(void);
> +extern void numa_set_node(unsigned int cpu, nodeid_t node);
> +extern void numa_initmem_init(unsigned long start_pfn, unsigned long 
> end_pfn);
> +extern int  numa_scan_nodes(paddr_t start, paddr_t end);
> +
> +extern int arch_numa_setup(const char *opt);
> +extern bool arch_numa_disabled(bool init_as_disable);
> +extern void setup_node_bootmem(nodeid_t nodeid, paddr_t start, paddr_t end);
> +
> +static inline void clear_node_cpumask(unsigned int cpu)
> +{
> +    cpumask_clear_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
> +}
> +
> +/* Simple perfect hash to map pdx to node numbers */
> +extern unsigned int memnode_shift;
> +extern unsigned long memnodemapsize;
> +extern uint8_t *memnodemap;
> +
> +struct node_data {
> +    unsigned long node_start_pfn;
> +    unsigned long node_spanned_pages;
> +};
> +
> +extern struct node_data node_data[];
> +
> +static inline __attribute_pure__ nodeid_t phys_to_nid(paddr_t addr)

Nit: The conventional place for attributes is between return type
and function (or object) name.

> +{
> +    nodeid_t nid;
> +    VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= memnodemapsize);
> +    nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift];
> +    VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
> +    return nid;
> +}
> +
> +#define NODE_DATA(nid)          (&(node_data[nid]))

Again please take the opportunity and drop the unnecessary inner
parentheses.

> +#define node_start_pfn(nid)     (NODE_DATA(nid)->node_start_pfn)
> +#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages)
> +#define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn + \
> +                                NODE_DATA(nid)->node_spanned_pages)

Pleae correct indentation here - it was correct originally (except
for the fact that it was using hard tabs).

Jan



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.