[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 4/4] hvm: NUMA guest: inject NUMA topology into the guest



This patch extends the hvm_info_table to store the number of guest NUMA
nodes and creates a suitable ACPI SRAT table that describes the guest's
NUMA topology.

Signed-off-by: Andre Przywara <andre.przywara@xxxxxxx>

Regards,
Andre.

--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 277-84917
----to satisfy European Law for business letters:
AMD Saxony Limited Liability Company & Co. KG,
Wilschdorfer Landstr. 101, 01109 Dresden, Germany
Register Court Dresden: HRA 4896, General Partner authorized
to represent: AMD Saxony LLC (Wilmington, Delaware, US)
General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy
# HG changeset patch
# User Andre Przywara <andre.przywara@xxxxxxx>
# Date 1215084035 -7200
# Node ID aa69281c1ecf288c729a9fb5aaab1fa0983072bb
# Parent  b84c5f2fe83bd7c94ed956ba412689e614177f5c
advertise NUMA topology to the guest (via an ACPI table)

diff -r b84c5f2fe83b -r aa69281c1ecf tools/firmware/hvmloader/acpi/acpi2_0.h
--- a/tools/firmware/hvmloader/acpi/acpi2_0.h   Thu Jul 03 13:17:11 2008 +0200
+++ b/tools/firmware/hvmloader/acpi/acpi2_0.h   Thu Jul 03 13:20:35 2008 +0200
@@ -356,6 +356,61 @@
 };
 
 /*
+ * System Resource Affinity Table header definition (SRAT).
+ */
+struct acpi_20_srat {
+    struct acpi_header header;  /* common ACPI table header (signature, length, checksum, ...) */
+    uint32_t table_revision;    /* construct_srat() stores ACPI_SRAT_TABLE_REVISION here */
+    uint32_t reserved2[2];      /* left zero by construct_srat() */
+};
+
+#define ACPI_SRAT_TABLE_REVISION 1
+
+/*
+ * System Resource Affinity Table structure types.
+ */
+#define ACPI_PROCESSOR_AFFIN           0x00
+#define ACPI_MEMORY_AFFIN              0x01
+
+struct acpi_20_srat_processor {
+    uint8_t type;            /* entry type; construct_srat() writes ACPI_PROCESSOR_AFFIN */
+    uint8_t length;          /* size of this entry in bytes */
+    uint8_t domain;          /* NUMA node the VCPU is assigned to */
+    uint8_t apic_id;         /* local APIC id of the VCPU */
+    uint32_t flags;          /* ACPI_LOCAL_APIC_AFFIN_* bits */
+    uint8_t sapic_id;        /* always written as 0 by construct_srat() */
+    uint8_t domain_hi[3];    /* presumably the upper domain bits; left zero here - confirm */
+    uint32_t reserved;       /* left zero by construct_srat() */
+};
+
+/*
+ * Local APIC Affinity Flags.  All other bits are reserved and must be 0.
+ */
+#define ACPI_LOCAL_APIC_AFFIN_ENABLED (1 << 0)
+
+struct acpi_20_srat_memory {
+    uint8_t type;              /* entry type; construct_srat() writes ACPI_MEMORY_AFFIN */
+    uint8_t length;            /* size of this entry in bytes */
+    uint8_t domain;            /* NUMA node that owns the range */
+    uint8_t domain_hi[3];      /* this is ACPI 3.0, reserved in 2.0 */
+    uint16_t reserved;         /* left zero by construct_srat() */
+    uint32_t base_address_lo;  /* start address of the range, bits 31:0 */
+    uint32_t base_address_hi;  /* start address of the range, bits 63:32 */
+    uint32_t length_lo;        /* size of the range, bits 31:0 */
+    uint32_t length_hi;        /* size of the range, bits 63:32 */
+    uint32_t reserved2;        /* left zero by construct_srat() */
+    uint32_t flags;            /* ACPI_MEM_AFFIN_* bits */
+    uint32_t reserved3[2];     /* left zero by construct_srat() */
+};
+
+/*
+ * Memory Affinity Flags.  All other bits are reserved and must be 0.
+ */
+#define ACPI_MEM_AFFIN_ENABLED (1 << 0)
+#define ACPI_MEM_AFFIN_HOTPLUGGABLE (1 << 1)
+#define ACPI_MEM_AFFIN_NONVOLATILE (1 << 2)  /* this is ACPI 3.0 */
+
+/*
  * Table Signatures.
  */
 #define ACPI_2_0_RSDP_SIGNATURE ASCII64('R','S','D',' ','P','T','R',' ')
@@ -366,6 +421,7 @@
 #define ACPI_2_0_XSDT_SIGNATURE ASCII32('X','S','D','T')
 #define ACPI_2_0_TCPA_SIGNATURE ASCII32('T','C','P','A')
 #define ACPI_2_0_HPET_SIGNATURE ASCII32('H','P','E','T')
+#define ACPI_2_0_SRAT_SIGNATURE ASCII32('S','R','A','T')
 
 /*
  * Table revision numbers.
@@ -378,6 +434,7 @@
 #define ACPI_2_0_TCPA_REVISION 0x02
 #define ACPI_2_0_HPET_REVISION 0x01
 #define ACPI_1_0_FADT_REVISION 0x01
+#define ACPI_2_0_SRAT_REVISION 0x01
 
 #pragma pack ()
 
diff -r b84c5f2fe83b -r aa69281c1ecf tools/firmware/hvmloader/acpi/build.c
--- a/tools/firmware/hvmloader/acpi/build.c     Thu Jul 03 13:17:11 2008 +0200
+++ b/tools/firmware/hvmloader/acpi/build.c     Thu Jul 03 13:20:35 2008 +0200
@@ -20,6 +20,9 @@
 #include "ssdt_tpm.h"
 #include "../config.h"
 #include "../util.h"
+#include "../e820.h"
+
+#define ONEMB 0x100000
 
 #define align16(sz)        (((sz) + 15) & ~15)
 #define fixed_strcpy(d, s) strncpy((d), (s), sizeof(d))
@@ -45,6 +48,140 @@
 
     p = table;
     p[checksum_offset] = -sum;
+}
+
+/* Map a VCPU to its node; the first nr_vcpus % nodes nodes hold one extra VCPU. */
+static int vcpu_to_numa_node (int vcpu_id, int nr_vcpus)
+{
+    int nodes     = get_numanodes();   /* must be non-zero: used as divisor */
+    int per_node  = nr_vcpus / nodes;  /* VCPUs on each of the smaller nodes */
+    int big_nodes = nr_vcpus % nodes;  /* nodes that hold per_node + 1 VCPUs */
+    int big_vcpus = big_nodes * (per_node + 1);
+    if ( vcpu_id < big_vcpus ) return vcpu_id / (per_node + 1);
+    return (vcpu_id - big_vcpus) / per_node + big_nodes;
+}
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+
+/* Estimate guest RAM in bytes from the E820 entries starting at 1MB and at 4GB. */
+static uint64_t guessmemsize (void)
+{
+    const struct e820entry *entry = HVM_E820, *end = entry + *HVM_E820_NR;
+    uint64_t total = 0;
+    for ( ; entry < end; entry++ )
+    {
+        /* presumably the extra 1MB + 3 pages cover RAM outside this entry - confirm */
+        if ( entry->addr == ONEMB )
+            total += entry->size + PAGE_SIZE * 3 + ONEMB;
+        if ( entry->addr == (1ULL << 32) )
+            total += entry->size;
+    }
+    return total;
+}
+
+/*
+ * Build the ACPI SRAT at 'srat': the table header, one processor affinity
+ * entry per VCPU, the E820 RAM regions below 1MB (node 0) and one memory
+ * affinity entry per NUMA node. header.length is set to the exact table
+ * size; the return value is that size rounded up to a multiple of 16.
+ */
+int construct_srat(struct acpi_20_srat *srat)
+{
+    struct acpi_20_srat_processor *processor;
+    struct acpi_20_srat_memory    *memory;
+    struct e820entry *map = HVM_E820;
+    int i, offset = 0;
+    int nr_vcpus = get_vcpu_nr(), nr_nodes = get_numanodes();
+    uint64_t total_mem = guessmemsize(), hvm_node_mem, base, size;
+
+    memset(srat, 0, sizeof(*srat));
+    srat->header.signature    = ACPI_2_0_SRAT_SIGNATURE;
+    srat->header.revision     = ACPI_2_0_SRAT_REVISION;
+    fixed_strcpy(srat->header.oem_id, ACPI_OEM_ID);
+    fixed_strcpy(srat->header.oem_table_id, ACPI_OEM_TABLE_ID);
+    srat->header.oem_revision = ACPI_OEM_REVISION;
+    srat->header.creator_id   = ACPI_CREATOR_ID;
+    srat->header.creator_revision = ACPI_CREATOR_REVISION;
+    srat->table_revision      = ACPI_SRAT_TABLE_REVISION;
+    offset += sizeof(*srat);
+
+    /* Processor entries start right behind the fixed SRAT header. */
+    processor = (struct acpi_20_srat_processor *)(srat + 1);
+    for ( i = 0; i < nr_vcpus; i++ )
+    {
+        memset(processor, 0, sizeof(*processor));  /* also zeroes sapic_id */
+        processor->type    = ACPI_PROCESSOR_AFFIN;
+        processor->length  = sizeof(*processor);
+        processor->domain  = vcpu_to_numa_node(i, nr_vcpus);
+        processor->apic_id = LAPIC_ID(i);
+        processor->flags   = ACPI_LOCAL_APIC_AFFIN_ENABLED;
+        offset += sizeof(*processor);
+        processor++;
+    }
+
+ /*
+  * Equally distribute the memory on all NUMA nodes. Round up the size
+  * of available memory to whole megabytes, as (at least) Linux cannot cope
+  * with uneven NUMA node boundaries. The remaining part of memory will be
+  * assigned to the last NUMA node. The mapping of the first MB is copied
+  * from the E820 map and assigned to node 0
+  */
+    hvm_node_mem = (total_mem + ONEMB - 1) >> 20;
+ /* Divide in 32 bits: a 64bit/32bit division needs libgcc, which is missing */
+    hvm_node_mem = (uint32_t)hvm_node_mem / nr_nodes;
+    hvm_node_mem = hvm_node_mem << 20;
+
+    /* Node 0 gets every E820 RAM region that lies below 1MB. */
+    memory = (struct acpi_20_srat_memory *)(processor);
+    for ( i = 0; i < *HVM_E820_NR; i++ )
+    {
+        if ( map[i].type != E820_RAM ) continue;
+        if ( map[i].addr >= ONEMB ) break;
+
+        memset(memory, 0, sizeof(*memory));
+        memory->type        = ACPI_MEMORY_AFFIN;
+        memory->length      = sizeof(*memory);
+        memory->domain      = 0;
+        memory->base_address_lo = map[i].addr & 0xFFFFFFFFL;
+        memory->base_address_hi = map[i].addr >> 32;
+        memory->length_lo   = map[i].size & 0xFFFFFFFFL;
+        memory->length_hi   = map[i].size >> 32;
+        memory->flags       = ACPI_MEM_AFFIN_ENABLED;
+
+        offset += sizeof(*memory);
+        memory++;
+    }
+
+    /*
+     * NOTE(review): the node ranges are laid out back to back from 1MB, while
+     * guessmemsize() also counts the E820 entry at 4GB - confirm the layout.
+     */
+    for ( i = 0; i < nr_nodes; i++ )
+    {
+        base = ( i == 0 ) ? ONEMB : i * hvm_node_mem;
+        if ( i == 0 )                 size = hvm_node_mem - ONEMB;
+        else if ( i == nr_nodes - 1 ) size = total_mem - base;  /* remainder */
+        else                          size = hvm_node_mem;
+
+        memset(memory, 0, sizeof(*memory));
+        memory->type        = ACPI_MEMORY_AFFIN;
+        memory->length      = sizeof(*memory);
+        memory->domain      = i;
+        memory->base_address_lo = base & 0xFFFFFFFFL;
+        memory->base_address_hi = base >> 32;
+        memory->length_lo   = size & 0xFFFFFFFFL;
+        memory->length_hi   = size >> 32;
+        memory->flags       = ACPI_MEM_AFFIN_ENABLED;
+        offset += sizeof(*memory);
+        memory++;
+    }
+
+    srat->header.length = offset;
+    set_checksum(srat, offsetof(struct acpi_header, checksum), offset);
+
+    return align16(offset);
 }
 
 static int uart_exists(uint16_t uart_base)
@@ -192,6 +329,7 @@
 static int construct_secondary_tables(uint8_t *buf, unsigned long *table_ptrs)
 {
     int offset = 0, nr_tables = 0;
+    struct acpi_20_srat *srat;
     struct acpi_20_madt *madt;
     struct acpi_20_hpet *hpet;
     struct acpi_20_tcpa *tcpa;
@@ -204,6 +342,14 @@
         madt = (struct acpi_20_madt *)&buf[offset];
         offset += construct_madt(madt);
         table_ptrs[nr_tables++] = (unsigned long)madt;
+    }
+
+    /* SRAT. */
+    if ( get_numanodes() > 0 )
+    {
+        srat = (struct acpi_20_srat *)&buf[offset];
+        offset += construct_srat(srat);
+        table_ptrs[nr_tables++] = (unsigned long)srat;
     }
 
     /* HPET. */
diff -r b84c5f2fe83b -r aa69281c1ecf tools/firmware/hvmloader/util.c
--- a/tools/firmware/hvmloader/util.c   Thu Jul 03 13:17:11 2008 +0200
+++ b/tools/firmware/hvmloader/util.c   Thu Jul 03 13:20:35 2008 +0200
@@ -594,6 +594,12 @@
     return (t ? t->nr_vcpus : 1);
 }
 
+int get_numanodes(void) /* guest NUMA node count; 1 without an info table */
+{
+    struct hvm_info_table *info = get_hvm_info_table();
+    return info ? info->numanodes : 1;
+}
+
 int get_acpi_enabled(void)
 {
     struct hvm_info_table *t = get_hvm_info_table();
diff -r b84c5f2fe83b -r aa69281c1ecf tools/firmware/hvmloader/util.h
--- a/tools/firmware/hvmloader/util.h   Thu Jul 03 13:17:11 2008 +0200
+++ b/tools/firmware/hvmloader/util.h   Thu Jul 03 13:20:35 2008 +0200
@@ -104,6 +104,7 @@
 
 /* HVM-builder info. */
 int get_vcpu_nr(void);
+int get_numanodes(void);
 int get_acpi_enabled(void);
 int get_apic_mode(void);
 
diff -r b84c5f2fe83b -r aa69281c1ecf tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Thu Jul 03 13:17:11 2008 +0200
+++ b/tools/python/xen/lowlevel/xc/xc.c Thu Jul 03 13:20:35 2008 +0200
@@ -845,6 +845,18 @@
 
 #endif /* __i386__ || __x86_64__ */
 
+/* Return the number of bits set in 'value' (its Hamming weight). */
+static unsigned hweight_long (unsigned long value)
+{
+    unsigned nr_set = 0;
+
+    /* Shift the word out to the right, counting each 1 that falls off. */
+    for ( ; value != 0; value >>= 1 )
+        nr_set += (unsigned)(value & 1);
+
+    return nr_set;
+}
+
 static PyObject *pyxc_hvm_build(XcObject *self,
                                 PyObject *args,
                                 PyObject *kwds)
@@ -884,6 +896,7 @@
     va_hvm->acpi_enabled = acpi;
     va_hvm->apic_mode    = apic;
     va_hvm->nr_vcpus     = vcpus;
+    va_hvm->numanodes    = hweight_long(nodemask);
     for ( i = 0, sum = 0; i < va_hvm->length; i++ )
         sum += ((uint8_t *)va_hvm)[i];
     va_hvm->checksum = -sum;
diff -r b84c5f2fe83b -r aa69281c1ecf xen/include/public/hvm/hvm_info_table.h
--- a/xen/include/public/hvm/hvm_info_table.h   Thu Jul 03 13:17:11 2008 +0200
+++ b/xen/include/public/hvm/hvm_info_table.h   Thu Jul 03 13:20:35 2008 +0200
@@ -36,6 +36,7 @@
     uint8_t     acpi_enabled;
     uint8_t     apic_mode;
     uint32_t    nr_vcpus;
+    uint32_t    numanodes;
 };
 
 #endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.