
Re: [Xen-devel] [PATCH v4 14/14] xen/x86: setup PVHv2 Dom0 ACPI tables



>>> On 30.11.16 at 17:49, <roger.pau@xxxxxxxxxx> wrote:
> @@ -567,7 +573,7 @@ static __init void hvm_setup_e820(struct domain *d, unsigned long nr_pages)
>      /*
>       * Craft the e820 memory map for Dom0 based on the hardware e820 map.
>       */
> -    d->arch.e820 = xzalloc_array(struct e820entry, e820.nr_map);
> +    d->arch.e820 = xzalloc_array(struct e820entry, E820MAX);

I have to admit that I consider this both wasteful and inflexible.
While commonly you'll need far fewer entries, what if in fact you
need more?
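
Something like the following would at least avoid the waste
(DOM0_E820_EXTRA being a made-up headroom constant, purely for
illustration); truly addressing the inflexibility would mean growing
the array on demand:

    /* Hypothetical headroom for entries Xen itself may add. */
    #define DOM0_E820_EXTRA 8

    d->arch.e820 = xzalloc_array(struct e820entry,
                                 e820.nr_map + DOM0_E820_EXTRA);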

> +static int __init hvm_setup_acpi_madt(struct domain *d, paddr_t *addr)
> +{
> +    struct acpi_table_madt *madt;
> +    struct acpi_table_header *table;
> +    struct acpi_madt_io_apic *io_apic;
> +    struct acpi_madt_local_apic *local_apic;

I think I had asked about the absence of struct acpi_madt_local_x2apic
here before, but now that I look again I also wonder how you get away
without struct acpi_madt_nmi_source and struct acpi_madt_local_apic_nmi;
I can kind of see struct acpi_madt_local_apic_override not being
needed here (and I'd really hope anyway that no BIOS actually makes
use of it). The question mainly is what possible damage there may be
if what Dom0 gets to see disagrees with what the firmware acts on
(remember that AML code may alter MADT contents).
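
If these do turn out to be needed, the accounting and copying would
presumably want extending along the lines of the interrupt override
handling, e.g. (acpi_count_lapic_nmi and acpi_lapic_nmi_entries being
made-up counterparts, not present in the patch):

    acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI,
                          acpi_count_lapic_nmi, 0);
    size += sizeof(struct acpi_madt_local_apic_nmi) * acpi_lapic_nmi_entries;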

> +    acpi_status status;
> +    unsigned long size;
> +    unsigned int i;
> +    int rc;
> +
> +    /* Count number of interrupt overrides in the MADT. */
> +    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
> +                          acpi_count_intr_ovr, MAX_IRQ_SOURCES);

What's the reason for using MAX_IRQ_SOURCES here? There's no
static array involved here limiting the supported count.
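
Assuming the parser, like its Linux counterpart, treats a zero
max_entries as "no limit", simply

    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
                          acpi_count_intr_ovr, 0);

would seem more natural.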

> +    /* Calculate the size of the crafted MADT. */
> +    size = sizeof(*madt);
> +    size += sizeof(*io_apic);
> +    size += sizeof(*local_apic) * dom0_max_vcpus();
> +    size += sizeof(struct acpi_madt_interrupt_override) * acpi_intr_overrrides;

sizeof(*intsrcovr)

> +    madt = xzalloc_bytes(size);
> +    if ( !madt )
> +    {
> +        printk("Unable to allocate memory for MADT table\n");
> +        return -ENOMEM;
> +    }
> +
> +    /* Copy the native MADT table header. */
> +    status = acpi_get_table(ACPI_SIG_MADT, 0, &table);
> +    if ( !ACPI_SUCCESS(status) )
> +    {
> +        printk("Failed to get MADT ACPI table, aborting.\n");
> +        return -EINVAL;
> +    }
> +    *((struct acpi_table_header *)madt) = *table;

madt->header = *table;

> +    madt->address = APIC_DEFAULT_PHYS_BASE;
> +    /*
> +     * NB: this is currently set to 4, which is the revision in the ACPI
> +     * spec 6.1. Sadly ACPICA doesn't provide revision numbers for the
> +     * tables described in the headers.
> +     */
> +    madt->header.revision = 4;

I can see you wanting to cap the revision here, but is it safe to
possibly bump it from what firmware has?
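
If capping is indeed the intention, something like

    madt->header.revision = min_t(uint8_t, table->revision, 4);

would avoid raising it beyond what firmware reports.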

> +    /*
> +     * Setup the IO APIC entry.
> +     * FIXME: the current vIO-APIC code just supports one IO-APIC instance
> +     * per domain. This must be fixed in order to provide the same amount of
> +     * IO APICs as available on bare metal.
> +     */
> +    if ( nr_ioapics > 1 )
> +        printk("WARNING: found %d IO APICs, Dom0 will only have access to 1 
> emulated IO APIC\n",
> +               nr_ioapics);
> +    io_apic = (struct acpi_madt_io_apic *)(madt + 1);
> +    io_apic->header.type = ACPI_MADT_TYPE_IO_APIC;
> +    io_apic->header.length = sizeof(*io_apic);
> +    io_apic->id = 1;

Is it safe to use an ID other than what firmware did assign? Where
is this 1 coming from? And how does this get sync-ed up with the
respective struct hvm_hw_vioapic field?
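
If re-using the firmware-assigned ID is viable, a sketch (assuming
mp_ioapics[] is already populated at this point) would be

    io_apic->id = mp_ioapics[0].mpc_apicid;

but the vIO-APIC emulation would then need to report the same ID.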

> +    io_apic->address = VIOAPIC_DEFAULT_BASE_ADDRESS;
> +
> +    local_apic = (struct acpi_madt_local_apic *)(io_apic + 1);
> +    for ( i = 0; i < dom0_max_vcpus(); i++ )
> +    {
> +        local_apic->header.type = ACPI_MADT_TYPE_LOCAL_APIC;
> +        local_apic->header.length = sizeof(*local_apic);
> +        local_apic->processor_id = i;
> +        local_apic->id = i * 2;
> +        local_apic->lapic_flags = ACPI_MADT_ENABLED;
> +        local_apic++;
> +    }
> +
> +    /* Setup interrupt overrides. */
> +    intsrcovr = (struct acpi_madt_interrupt_override *)local_apic;
> +    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_set_intr_ovr,
> +                          MAX_IRQ_SOURCES);
> +    ASSERT(((unsigned char *)intsrcovr - (unsigned char *)madt) == size);

Likely easier to read if you used void * or long for the casts here.
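
E.g.

    ASSERT((void *)intsrcovr - (void *)madt == size);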

> +static bool __init hvm_acpi_table_allowed(const char *sig)
> +{
> +    static const char __init banned_tables[][ACPI_NAME_SIZE] = {
> +        ACPI_SIG_HPET, ACPI_SIG_SLIT, ACPI_SIG_SRAT, ACPI_SIG_MPST,
> +        ACPI_SIG_PMTT, ACPI_SIG_MADT, ACPI_SIG_DMAR};
> +    unsigned long pfn, nr_pages;
> +    int i;
> +
> +    for ( i = 0 ; i < ARRAY_SIZE(banned_tables); i++ )
> +        if ( strncmp(sig, banned_tables[i], ACPI_NAME_SIZE) == 0 )
> +            return false;
> +
> +    /* Make sure table doesn't reside in a RAM region. */
> +    pfn = PFN_DOWN(acpi_gbl_root_table_list.tables[i].address);
> +    nr_pages = DIV_ROUND_UP(
> +                    (acpi_gbl_root_table_list.tables[i].address & ~PAGE_MASK) +
> +                    acpi_gbl_root_table_list.tables[i].length, PAGE_SIZE);

How did you choose indentation here? I think there are a number
of ways compatible with how we commonly do it, but this is not one
of them. Preferably

    nr_pages =
        DIV_ROUND_UP((acpi_gbl_root_table_list.tables[i].address & ~PAGE_MASK) +
                     acpi_gbl_root_table_list.tables[i].length,
                     PAGE_SIZE);

or even

    nr_pages =
        PFN_UP((acpi_gbl_root_table_list.tables[i].address & ~PAGE_MASK) +
               acpi_gbl_root_table_list.tables[i].length);

> +    if ( acpi_memory_banned(pfn, nr_pages) )
> +    {
> +        printk("Skipping table %.4s because resides in a non ACPI or 
> reserved region\n",

I think this wording can be mis-read. How about "... because it resides
in a non-ACPI, non-reserved region"?

> +static int __init hvm_setup_acpi_xsdt(struct domain *d, paddr_t madt_addr,
> +                                      paddr_t *addr)
> +{
> +    struct acpi_table_xsdt *xsdt;
> +    struct acpi_table_header *table;
> +    struct acpi_table_rsdp *rsdp;
> +    unsigned long size;
> +    unsigned int i, num_tables;
> +    paddr_t xsdt_paddr;
> +    int j, rc;

How come i is properly unsigned, but j is plain int?

> +    /*
> +     * Restore original DMAR table signature, we are going to filter it
> +     * from the new XSDT that is presented to the guest, so it no longer
> +     * makes sense to have its signature zapped.
> +     */
> +    acpi_dmar_reinstate();
> +
> +    /* Account for the space needed by the XSDT. */
> +    size = sizeof(*xsdt);
> +    num_tables = 0;

Mind making these the initializers of the variables?
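
I.e.

    unsigned long size = sizeof(*xsdt);
    unsigned int i, num_tables = 0;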

> +    /* Count the number of tables that will be added to the XSDT. */
> +    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
> +    {
> +        const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii;
> +
> +        if ( hvm_acpi_table_allowed(sig) )
> +            num_tables++;
> +    }
> +
> +    /*
> +     * No need to subtract one because we will be adding a custom MADT (and
> +     * the native one is not accounted for).
> +     */
> +    size += num_tables * sizeof(xsdt->table_offset_entry[0]);

The comment has been confusing me each time I've come here so
far. The reason you don't need to add or subtract anything is that
struct acpi_table_xsdt already includes one array slot
(which isn't visible here, i.e. one would need to look at the header),
_and_ you've skipped the native MADT _and_ you're going to add a
made-up MADT, so I think it would be better to explain it in those
terms.
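
E.g. something like

    /*
     * struct acpi_table_xsdt already includes one array slot, the native
     * MADT was excluded from num_tables above, and the crafted MADT added
     * below takes up that slot.
     */
    size += num_tables * sizeof(xsdt->table_offset_entry[0]);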

> +    xsdt = xzalloc_bytes(size);
> +    if ( !xsdt )
> +    {
> +        printk("Unable to allocate memory for XSDT table\n");
> +        return -ENOMEM;
> +    }
> +
> +    /* Copy the native XSDT table header. */
> +    rsdp = acpi_os_map_memory(acpi_os_get_root_pointer(), sizeof(*rsdp));
> +    if ( !rsdp )
> +    {
> +        printk("Unable to map RSDP\n");
> +        return -EINVAL;
> +    }
> +    xsdt_paddr = rsdp->xsdt_physical_address;
> +    acpi_os_unmap_memory(rsdp, sizeof(*rsdp));
> +    table = acpi_os_map_memory(xsdt_paddr, sizeof(*table));
> +    if ( !table )
> +    {
> +        printk("Unable to map XSDT\n");
> +        return -EINVAL;
> +    }
> +    *((struct acpi_table_header *)xsdt) = *table;

xsdt->header = *table;

> +    acpi_os_unmap_memory(table, sizeof(*table));
> +
> +    /* Add the custom MADT. */
> +    j = 0;
> +    xsdt->table_offset_entry[j++] = madt_addr;

Please re-use num_tables for the counting here. It also would
likely be a little more clear if you used plain 0 as array index
and assigned 1 right away.
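
I.e.

    /* Add the custom MADT. */
    xsdt->table_offset_entry[0] = madt_addr;
    num_tables = 1;

with num_tables then also serving as the index when copying the
remaining tables.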

> +static int __init hvm_setup_acpi(struct domain *d, paddr_t start_info)
> +{
> +    struct acpi_table_rsdp rsdp, *native_rsdp;
> +    unsigned long pfn, nr_pages;
> +    paddr_t madt_paddr, xsdt_paddr, rsdp_paddr;
> +    unsigned int i;
> +    int rc;
> +
> +    /* Scan top-level tables and add their regions to the guest memory map. */
> +    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
> +    {
> +        const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii;
> +
> +        if ( hvm_acpi_table_allowed(sig) )
> +            hvm_add_mem_range(d, acpi_gbl_root_table_list.tables[i].address,
> +                              acpi_gbl_root_table_list.tables[i].address +
> +                              acpi_gbl_root_table_list.tables[i].length,
> +                              E820_ACPI);
> +    }
> +
> +    /* Identity map ACPI e820 regions. */
> +    for ( i = 0; i < d->arch.nr_e820; i++ )
> +    {
> +        if ( d->arch.e820[i].type != E820_ACPI &&
> +             d->arch.e820[i].type != E820_NVS )
> +            continue;
> +
> +        pfn = PFN_DOWN(d->arch.e820[i].addr);
> +        nr_pages = DIV_ROUND_UP((d->arch.e820[i].addr & ~PAGE_MASK) +
> +                                d->arch.e820[i].size, PAGE_SIZE);

Perhaps PFN_UP() again?
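
I.e.

    nr_pages = PFN_UP((d->arch.e820[i].addr & ~PAGE_MASK) +
                      d->arch.e820[i].size);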

> +        rc = modify_identity_mmio(d, pfn, nr_pages, true);
> +        if ( rc )
> +        {
> +            printk(
> +                "Failed to map ACPI region [%#lx, %#lx) into Dom0 memory map\n",

Please don't split lines like this.
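
I.e. keep the format string on one line (string literals are commonly
exempted from line length considerations):

    printk("Failed to map ACPI region [%#lx, %#lx) into Dom0 memory map\n",
           pfn, pfn + nr_pages);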

> +                   pfn, pfn + nr_pages);
> +            return rc;
> +        }
> +    }
> +
> +    rc = hvm_setup_acpi_madt(d, &madt_paddr);
> +    if ( rc )
> +        return rc;
> +
> +    rc = hvm_setup_acpi_xsdt(d, madt_paddr, &xsdt_paddr);
> +    if ( rc )
> +        return rc;
> +
> +    /* Craft a custom RSDP. */
> +    memset(&rsdp, 0, sizeof(rsdp));
> +    memcpy(rsdp.signature, ACPI_SIG_RSDP, sizeof(rsdp.signature));
> +    native_rsdp = acpi_os_map_memory(acpi_os_get_root_pointer(), sizeof(rsdp));

While this one is fine to call from here, ...

> +    memcpy(rsdp.oem_id, native_rsdp->oem_id, sizeof(rsdp.oem_id));
> +    acpi_os_unmap_memory(native_rsdp, sizeof(rsdp));
> +    rsdp.revision = 2;
> +    rsdp.xsdt_physical_address = xsdt_paddr;
> +    rsdp.rsdt_physical_address = 0;
> +    rsdp.length = sizeof(rsdp);
> +    rsdp.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, &rsdp),
> +                                      ACPI_RSDP_REV0_SIZE);
> +    rsdp.extended_checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, &rsdp),
> +                                               sizeof(rsdp));

... these two represent a layering violation: Code outside of
xen/drivers/acpi/*.c is not supposed to call anything in
xen/drivers/acpi/{tables,utilities}/. Since introducing a wrapper
is likely overkill for the purpose here I think we can tolerate it if
suitably annotated in a comment. (Granted, ARM has a similar
issue.)
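
E.g.

    /*
     * Calling acpi_tb_checksum() here is strictly a layering violation,
     * but introducing a wrapper just for this would be overkill.
     */
    rsdp.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, &rsdp),
                                      ACPI_RSDP_REV0_SIZE);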

Jan
