[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 1/5] [POST-4.0]: HVM NUMA guest: generate SRAT table



The attached patch extends the hvm_info_table to include NUMA topology information. The hvmloader will read these entries and generate an appropriate ACPI SRAT table. For now the information is zeroed in libxc, so SRAT generation is skipped until it is activated in a later patch. Linux (at least) seems to be quite picky about the SRAT table: "odd" memory borders will crash it early. I am not sure whether sanity checks should be introduced here in hvmloader or in libxc.

Signed-off-by: Andre Przywara <andre.przywara@xxxxxxx>

Comments welcome.

Regards,
Andre.

--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 488-3567-12
----to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Karl-Hammerschmidt-Str. 34, 85609 Dornach b. Muenchen
Geschaeftsfuehrer: Andrew Bowd; Thomas M. McCoy; Giuliano Meroni
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
commit 9fed27f6b4ca3c3fae6aef792008ee620c3bc28d
Author: Andre Przywara <andre.przywara@xxxxxxx>
Date:   Mon Feb 1 00:47:32 2010 +0100

    generate SRAT table for HVM guests

diff --git a/tools/firmware/hvmloader/acpi/acpi2_0.h 
b/tools/firmware/hvmloader/acpi/acpi2_0.h
index c574f89..b237918 100644
--- a/tools/firmware/hvmloader/acpi/acpi2_0.h
+++ b/tools/firmware/hvmloader/acpi/acpi2_0.h
@@ -356,6 +356,61 @@ struct acpi_20_madt_intsrcovr {
 };
 
 /*
+ * System Resource Affinity Table header definition (SRAT).
+ */
+struct acpi_20_srat {
+    struct acpi_header header;  /* signature, revision, OEM/creator IDs, length, checksum */
+    uint32_t table_revision;    /* construct_srat() stores ACPI_SRAT_TABLE_REVISION here */
+    uint32_t reserved2[2];      /* left zero by the memset in construct_srat() */
+};
+
+#define ACPI_SRAT_TABLE_REVISION 1
+
+/*
+ * System Resource Affinity Table structure types.
+ */
+#define ACPI_PROCESSOR_AFFIN           0x00
+#define ACPI_MEMORY_AFFIN              0x01
+
+struct acpi_20_srat_processor {
+    uint8_t type;              /* ACPI_PROCESSOR_AFFIN */
+    uint8_t length;            /* sizeof(struct acpi_20_srat_processor) */
+    uint8_t domain;            /* NUMA node of the vCPU (hvm_info vcpu_to_node[]) */
+    uint8_t apic_id;           /* LAPIC_ID() of the vCPU */
+    uint32_t flags;            /* ACPI_LOCAL_APIC_AFFIN_* bits */
+    uint8_t sapic_id;          /* written as 0 by construct_srat() */
+    uint8_t domain_hi[3];      /* presumably upper domain bits; left 0 -- confirm vs. spec */
+    uint32_t reserved;         /* left zero */
+};
+
+/*
+ * Local APIC Affinity Flags.  All other bits are reserved and must be 0.
+ */
+#define ACPI_LOCAL_APIC_AFFIN_ENABLED (1 << 0)
+
+struct acpi_20_srat_memory {
+    uint8_t type;              /* ACPI_MEMORY_AFFIN */
+    uint8_t length;            /* sizeof(struct acpi_20_srat_memory) */
+    uint8_t domain;            /* index of the NUMA node owning this range */
+    uint8_t domain_hi[3];      /* this is ACPI 3.0, reserved in 2.0 */
+    uint16_t reserved;         /* left zero */
+    uint32_t base_address_lo;  /* range start in bytes, bits 31:0 */
+    uint32_t base_address_hi;  /* range start in bytes, bits 63:32 */
+    uint32_t length_lo;        /* range size in bytes, bits 31:0 */
+    uint32_t length_hi;        /* range size in bytes, bits 63:32 */
+    uint32_t reserved2;        /* left zero */
+    uint32_t flags;            /* ACPI_MEM_AFFIN_* bits */
+    uint32_t reserved3[2];     /* left zero */
+};
+
+/*
+ * Memory Affinity Flags.  All other bits are reserved and must be 0.
+ */
+#define ACPI_MEM_AFFIN_ENABLED (1 << 0)
+#define ACPI_MEM_AFFIN_HOTPLUGGABLE (1 << 1)
+#define ACPI_MEM_AFFIN_NONVOLATILE (1 << 2)  /* this is ACPI 3.0 */
+
+/*
  * Table Signatures.
  */
 #define ACPI_2_0_RSDP_SIGNATURE ASCII64('R','S','D',' ','P','T','R',' ')
@@ -366,6 +421,7 @@ struct acpi_20_madt_intsrcovr {
 #define ACPI_2_0_XSDT_SIGNATURE ASCII32('X','S','D','T')
 #define ACPI_2_0_TCPA_SIGNATURE ASCII32('T','C','P','A')
 #define ACPI_2_0_HPET_SIGNATURE ASCII32('H','P','E','T')
+#define ACPI_2_0_SRAT_SIGNATURE ASCII32('S','R','A','T')
 
 /*
  * Table revision numbers.
@@ -378,6 +434,7 @@ struct acpi_20_madt_intsrcovr {
 #define ACPI_2_0_TCPA_REVISION 0x02
 #define ACPI_2_0_HPET_REVISION 0x01
 #define ACPI_1_0_FADT_REVISION 0x01
+#define ACPI_2_0_SRAT_REVISION 0x01
 
 #pragma pack ()
 
diff --git a/tools/firmware/hvmloader/acpi/build.c 
b/tools/firmware/hvmloader/acpi/build.c
index cc42992..6c2bdef 100644
--- a/tools/firmware/hvmloader/acpi/build.c
+++ b/tools/firmware/hvmloader/acpi/build.c
@@ -51,6 +51,96 @@ static void set_checksum(
     p[checksum_offset] = -sum;
 }
 
+/*
+ * Build the SRAT at @srat from hvm_info: one processor affinity entry per
+ * vCPU, then the memory affinity entries.  Returns align16(table length).
+ */
+int construct_srat(struct acpi_20_srat *srat)
+{
+    struct acpi_20_srat_processor *processor;
+    struct acpi_20_srat_memory    *memory;
+    int i, offset = 0;
+    uint64_t mem_base, mem_len, mem_hole, low_end;
+
+    memset(srat, 0, sizeof(*srat));
+    srat->header.signature    = ACPI_2_0_SRAT_SIGNATURE;
+    srat->header.revision     = ACPI_2_0_SRAT_REVISION;
+    fixed_strcpy(srat->header.oem_id, ACPI_OEM_ID);
+    fixed_strcpy(srat->header.oem_table_id, ACPI_OEM_TABLE_ID);
+    srat->header.oem_revision = ACPI_OEM_REVISION;
+    srat->header.creator_id   = ACPI_CREATOR_ID;
+    srat->header.creator_revision = ACPI_CREATOR_REVISION;
+    srat->table_revision      = ACPI_SRAT_TABLE_REVISION;
+    offset += sizeof(*srat);
+
+    processor = (struct acpi_20_srat_processor *)(srat + 1);
+    for ( i = 0; i < hvm_info->nr_vcpus; i++ )
+    {
+        memset(processor, 0, sizeof(*processor));
+        processor->type    = ACPI_PROCESSOR_AFFIN;
+        processor->length  = sizeof(*processor);
+        processor->domain  = hvm_info->vcpu_to_node[i];
+        processor->apic_id = LAPIC_ID(i);
+        processor->flags   = ACPI_LOCAL_APIC_AFFIN_ENABLED;
+        processor->sapic_id= 0;
+        offset += sizeof(*processor);
+        processor++;
+    }
+
+    /*
+     * Populate the memory affinity entries from the HVM info table, leaving
+     * holes for 640KB-1MB and for the PCI hole just below 4GB (if
+     * applicable).  A node spanning a hole is emitted in pieces: "i--"
+     * revisits it, with mem_hole = bytes of the node already accounted for.
+     */
+    memory = (struct acpi_20_srat_memory *)(processor);
+    mem_base = mem_hole = 0;
+    low_end = (uint64_t)hvm_info->low_mem_pgend << 12;  /* widen, then shift */
+
+    for ( i = 0; i < hvm_info->num_nodes; i++ )
+    {
+        printf("NUMA: node %d: %d MB\n", i, hvm_info->node_mem[i]);
+        /* node_mem[] is in MB; widen before shifting to avoid truncation. */
+        mem_len = ((uint64_t)hvm_info->node_mem[i] << 20) - mem_hole;
+        memset(memory, 0, sizeof(*memory));
+        memory->type        = ACPI_MEMORY_AFFIN;
+        memory->length      = sizeof(*memory);
+        memory->domain      = i;
+        memory->flags       = ACPI_MEM_AFFIN_ENABLED;
+
+        if ( mem_base == low_end )     /* low RAM used up: go above 4GB */
+            mem_base = 1ULL << 32;
+        memory->base_address_lo = mem_base & 0xFFFFFFFFL;
+        memory->base_address_hi = mem_base >> 32;
+        if ( mem_base == 0 )
+        {
+            mem_len = 640 * 1024;      /* first entry covers 0-640KB only */
+            mem_hole = 1024 * 1024;    /* node 0 resumes at 1MB */
+            mem_base = 384 * 1024;     /* + mem_len below == 1MB */
+            i--;
+        }
+        else if ( mem_base < low_end && mem_base + mem_len > low_end )
+        {
+            mem_len = low_end - mem_base;  /* clip at end of low RAM */
+            mem_hole += mem_len;           /* keep earlier pieces counted */
+            i--;
+        }
+        else
+            mem_hole = 0;              /* node fully emitted */
+
+        memory->length_lo = mem_len & 0xFFFFFFFFL;
+        memory->length_hi = mem_len >> 32;
+        offset += sizeof(*memory);
+        memory++;
+        mem_base += mem_len;
+    }
+
+    srat->header.length = offset;
+    set_checksum(srat, offsetof(struct acpi_header, checksum), offset);
+
+    return align16(offset);
+}
+
 static uint8_t battery_port_exists(void)
 {
     return (inb(0x88) == 0x1F);
@@ -162,6 +252,7 @@ static int construct_hpet(struct acpi_20_hpet *hpet)
 static int construct_secondary_tables(uint8_t *buf, unsigned long *table_ptrs)
 {
     int offset = 0, nr_tables = 0;
+    struct acpi_20_srat *srat;
     struct acpi_20_madt *madt;
     struct acpi_20_hpet *hpet;
     struct acpi_20_tcpa *tcpa;
@@ -177,6 +268,14 @@ static int construct_secondary_tables(uint8_t *buf, 
unsigned long *table_ptrs)
         table_ptrs[nr_tables++] = (unsigned long)madt;
     }
 
+    /* SRAT. */
+    if ( hvm_info->num_nodes > 0 )
+    {
+        srat = (struct acpi_20_srat *)&buf[offset];
+        offset += construct_srat(srat);
+        table_ptrs[nr_tables++] = (unsigned long)srat;
+    }
+
     /* HPET. */
     if ( hpet_exists(ACPI_HPET_ADDRESS) )
     {
diff --git a/tools/libxc/xc_hvm_build.c b/tools/libxc/xc_hvm_build.c
index c460d97..8fc7ac5 100644
--- a/tools/libxc/xc_hvm_build.c
+++ b/tools/libxc/xc_hvm_build.c
@@ -60,6 +60,10 @@ static void build_hvm_info(void *hvm_info_page, uint64_t 
mem_size)
     hvm_info->high_mem_pgend = highmem_end >> PAGE_SHIFT;
     hvm_info->reserved_mem_pgstart = special_pfn(0);
 
+    hvm_info->num_nodes = 0;
+    memset(hvm_info->vcpu_to_node, 0, 4 * HVM_MAX_VCPUS);
+    memset(hvm_info->node_mem, 0, 4 * 64);
+
     /* Finish with the checksum. */
     for ( i = 0, sum = 0; i < hvm_info->length; i++ )
         sum += ((uint8_t *)hvm_info)[i];
diff --git a/xen/include/public/hvm/hvm_info_table.h 
b/xen/include/public/hvm/hvm_info_table.h
index adb3fb9..c2bea12 100644
--- a/xen/include/public/hvm/hvm_info_table.h
+++ b/xen/include/public/hvm/hvm_info_table.h
@@ -70,6 +70,9 @@ struct hvm_info_table {
 
     /* Bitmap of which CPUs are online at boot time. */
     uint8_t     vcpu_online[HVM_MAX_VCPUS/8];
+    uint32_t    num_nodes;
+    uint8_t     vcpu_to_node[HVM_MAX_VCPUS];
+    uint32_t    node_mem[64];
 };
 
 #endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.