[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH v3.1 15/15] xen/x86: setup PVHv2 Dom0 ACPI tables



>>> On 29.10.16 at 11:00, <roger.pau@xxxxxxxxxx> wrote:
> Also, regions marked as E820_ACPI or E820_NVS are identity mapped into Dom0
> p2m, plus any top-level ACPI tables that should be accessible to Dom0 and
> that don't reside in RAM regions. This is needed because some memory maps
> don't properly account for all the memory used by ACPI, so it's common to
> find ACPI tables in holes.

I question whether this behavior should be enabled by default. Not
having seen the code yet I also wonder whether these regions
shouldn't simply be added to the guest's E820 as E820_ACPI, which
should then result in them getting mapped without further special
casing.

> +static int __init hvm_add_mem_range(struct domain *d, uint64_t s, uint64_t e,
> +                                    uint32_t type)

I see s and e being uint64_t, but I don't see why type can't be plain
unsigned int.

> +{
> +    unsigned int i;
> +
> +    for ( i = 0; i < d->arch.nr_e820; i++ )
> +    {
> +        uint64_t rs = d->arch.e820[i].addr;
> +        uint64_t re = rs + d->arch.e820[i].size;
> +
> +        if ( rs == e && d->arch.e820[i].type == type )
> +        {
> +            d->arch.e820[i].addr = s;
> +            return 0;
> +        }
> +
> +        if ( re == s && d->arch.e820[i].type == type &&
> +             (i + 1 == d->arch.nr_e820 || d->arch.e820[i + 1].addr >= e) )

I understand this to be overlap prevention, but there's no equivalent
in the earlier if(). Are you relying on the table being strictly sorted at
all times? If so, a comment should say so.

> +        {
> +            d->arch.e820[i].size += e - s;
> +            return 0;
> +        }
> +
> +        if ( rs >= e )
> +            break;
> +
> +        if ( re > s )
> +            return -ENOMEM;

I don't think ENOMEM is appropriate to signal an overlap. And don't
you need to reverse these last two if()s?

> @@ -2112,6 +2166,371 @@ static int __init hvm_setup_cpus(struct domain *d, 
> paddr_t entry,
>      return 0;
>  }
>  
> +static int __init acpi_count_intr_ov(struct acpi_subtable_header *header,
> +                                     const unsigned long end)
> +{
> +
> +    acpi_intr_overrrides++;
> +    return 0;
> +}
> +
> +static int __init acpi_set_intr_ov(struct acpi_subtable_header *header,
> +                                   const unsigned long end)

May I ask for "ov" to become at least "ovr" in all cases? Also stray
const.

> +{
> +    struct acpi_madt_interrupt_override *intr =
> +        container_of(header, struct acpi_madt_interrupt_override, header);

Yet missing const here.

> +    ACPI_MEMCPY(intsrcovr, intr, sizeof(*intr));

Structure assignment (for type safety; also elsewhere)?

> +static int __init hvm_setup_acpi_madt(struct domain *d, paddr_t *addr)
> +{
> +    struct acpi_table_madt *madt;
> +    struct acpi_table_header *table;
> +    struct acpi_madt_io_apic *io_apic;
> +    struct acpi_madt_local_apic *local_apic;
> +    struct vcpu *saved_current, *v = d->vcpu[0];
> +    acpi_status status;
> +    unsigned long size;
> +    unsigned int i;
> +    int rc;
> +
> +    /* Count number of interrupt overrides in the MADT. */
> +    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, 
> acpi_count_intr_ov,
> +                          MAX_IRQ_SOURCES);
> +
> +    /* Calculate the size of the crafted MADT. */
> +    size = sizeof(struct acpi_table_madt);
> +    size += sizeof(struct acpi_madt_interrupt_override) * 
> acpi_intr_overrrides;
> +    size += sizeof(struct acpi_madt_io_apic);
> +    size += sizeof(struct acpi_madt_local_apic) * dom0_max_vcpus();

All the sizeof()s would better use the variables declared above.

> +    madt = xzalloc_bytes(size);
> +    if ( !madt )
> +    {
> +        printk("Unable to allocate memory for MADT table\n");
> +        return -ENOMEM;
> +    }
> +
> +    /* Copy the native MADT table header. */
> +    status = acpi_get_table(ACPI_SIG_MADT, 0, &table);
> +    if ( !ACPI_SUCCESS(status) )
> +    {
> +        printk("Failed to get MADT ACPI table, aborting.\n");
> +        return -EINVAL;
> +    }
> +    ACPI_MEMCPY(madt, table, sizeof(*table));
> +    madt->address = APIC_DEFAULT_PHYS_BASE;

You may also need to override table revision (at least it shouldn't end
up larger than what we know about).

> +    /* Setup the IO APIC entry. */
> +    if ( nr_ioapics > 1 )
> +        printk("WARNING: found %d IO APICs, Dom0 will only have access to 1 
> emulated IO APIC\n",
> +               nr_ioapics);

I've said elsewhere already that I think we should provide 1 vIO-APIC
per physical one.

> +    io_apic = (struct acpi_madt_io_apic *)(madt + 1);
> +    io_apic->header.type = ACPI_MADT_TYPE_IO_APIC;
> +    io_apic->header.length = sizeof(*io_apic);
> +    io_apic->id = 1;
> +    io_apic->address = VIOAPIC_DEFAULT_BASE_ADDRESS;
> +
> +    local_apic = (struct acpi_madt_local_apic *)(io_apic + 1);
> +    for ( i = 0; i < dom0_max_vcpus(); i++ )
> +    {
> +        local_apic->header.type = ACPI_MADT_TYPE_LOCAL_APIC;
> +        local_apic->header.length = sizeof(*local_apic);
> +        local_apic->processor_id = i;
> +        local_apic->id = i * 2;
> +        local_apic->lapic_flags = ACPI_MADT_ENABLED;
> +        local_apic++;
> +    }

What about x2apic? And for lapic, do you limit vCPU count anywhere?

> +    /* Setup interrupt overwrites. */

overrides

> +static bool __init hvm_acpi_table_allowed(const char *sig)
> +{
> +    static const char __init banned_tables[][ACPI_NAME_SIZE] = {
> +        ACPI_SIG_HPET, ACPI_SIG_SLIT, ACPI_SIG_SRAT, ACPI_SIG_MPST,
> +        ACPI_SIG_PMTT, ACPI_SIG_MADT, ACPI_SIG_DMAR};
> +    unsigned long pfn, nr_pages;
> +    int i;
> +
> +    for ( i = 0 ; i < ARRAY_SIZE(banned_tables); i++ )
> +        if ( strncmp(sig, banned_tables[i], ACPI_NAME_SIZE) == 0 )
> +            return false;
> +
> +    /* Make sure table doesn't reside in a RAM region. */
> +    pfn = PFN_DOWN(acpi_gbl_root_table_list.tables[i].address);
> +    nr_pages = DIV_ROUND_UP(acpi_gbl_root_table_list.tables[i].length,
> +                            PAGE_SIZE);

You also need to add in the offset-into-page from the base address.

> +    if ( range_is_ram(pfn, nr_pages) )
> +    {
> +        printk("Skipping table %.4s because resides in a RAM region\n",
> +               sig);
> +        return false;

I think this should be more strict, at least to start with: Require the
table to be in an E820_ACPI region (or maybe an E820_RESERVED
one), but nothing else.

> +static int __init hvm_setup_acpi_xsdt(struct domain *d, paddr_t madt_addr,
> +                                      paddr_t *addr)
> +{
> +    struct acpi_table_xsdt *xsdt;
> +    struct acpi_table_header *table;
> +    struct acpi_table_rsdp *rsdp;
> +    struct vcpu *saved_current, *v = d->vcpu[0];
> +    unsigned long size;
> +    unsigned int i, num_tables;
> +    int j, rc;
> +
> +    /*
> +     * Restore original DMAR table signature, we are going to filter it
> +     * from the new XSDT that is presented to the guest, so it no longer
> +     * makes sense to have it's signature zapped.
> +     */
> +    acpi_dmar_reinstate();
> +
> +    /* Account for the space needed by the XSDT. */
> +    size = sizeof(*xsdt);
> +    num_tables = 0;
> +
> +    /* Count the number of tables that will be added to the XSDT. */
> +    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
> +    {
> +        const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii;
> +
> +        if ( !hvm_acpi_table_allowed(sig) )
> +            continue;
> +
> +        num_tables++;
> +    }

Unless you expect something to be added to this loop, please
simplify it by inverting the condition and dropping the continue.

> +    /*
> +     * No need to subtract one because we will be adding a custom MADT (and
> +     * the native one is not accounted for).
> +     */
> +    size += num_tables * sizeof(u64);

sizeof(xsdt->table_offset_entry[0])

> +    xsdt = xzalloc_bytes(size);
> +    if ( !xsdt )
> +    {
> +        printk("Unable to allocate memory for XSDT table\n");
> +        return -ENOMEM;
> +    }
> +
> +    /* Copy the native XSDT table header. */
> +    rsdp = acpi_os_map_memory(acpi_os_get_root_pointer(), sizeof(*rsdp));
> +    if ( !rsdp )
> +    {
> +        printk("Unable to map RSDP\n");
> +        return -EINVAL;
> +    }
> +    table = acpi_os_map_memory(rsdp->xsdt_physical_address, sizeof(*table));
> +    if ( !table )
> +    {
> +        printk("Unable to map XSDT\n");
> +        return -EINVAL;
> +    }
> +    ACPI_MEMCPY(xsdt, table, sizeof(*table));
> +    acpi_os_unmap_memory(table, sizeof(*table));
> +    acpi_os_unmap_memory(rsdp, sizeof(*rsdp));

At this point we're not in SYS_STATE_active yet, and hence there
can only be one mapping at a time. The way it's written right now
does not represent an active problem, but to prevent someone
falling into this trap you should unmap the first mapping before
establishing the second one.

> +    /* Add the custom MADT. */
> +    j = 0;
> +    xsdt->table_offset_entry[j++] = madt_addr;
> +
> +    /* Copy the address of the rest of the allowed tables. */

addresses?

> +    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
> +    {
> +        const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii;
> +        unsigned long pfn, nr_pages;
> +
> +        if ( !hvm_acpi_table_allowed(sig) )
> +            continue;
> +
> +        /* Make sure table doesn't reside in a RAM region. */
> +        pfn = PFN_DOWN(acpi_gbl_root_table_list.tables[i].address);
> +        nr_pages = DIV_ROUND_UP(acpi_gbl_root_table_list.tables[i].length,
> +                                PAGE_SIZE);

See above (and there appears to be at least one more further down).

> +        /* Make sure table is mapped. */
> +        rc = modify_identity_mmio(d, pfn, nr_pages, true);
> +        if ( rc )
> +            printk("Failed to map ACPI region [%#lx, %#lx) into Dom0 memory 
> map\n",
> +                   pfn, pfn + nr_pages);

Isn't the comment for this code block meant to go ahead of the earlier
one, in place of the comment that's there (and looks wrong)?

> +        xsdt->table_offset_entry[j++] =
> +                            acpi_gbl_root_table_list.tables[i].address;
> +    }
> +
> +    xsdt->header.length = size;
> +    xsdt->header.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, xsdt), size);
> +
> +    /* Place the new XSDT in guest memory space. */
> +    if ( hvm_steal_ram(d, size, GB(4), addr) )
> +    {
> +        printk("Unable to find allocate guest RAM for XSDT\n");

"find" or "allocate"?

> +        return -ENOMEM;
> +    }
> +
> +    /* Mark this region as E820_ACPI. */
> +    if ( hvm_add_mem_range(d, *addr, *addr + size, E820_ACPI) )
> +        printk("Unable to add XSDT region to memory map\n");
> +
> +    saved_current = current;
> +    set_current(v);
> +    rc = hvm_copy_to_guest_phys(*addr, xsdt, size);
> +    set_current(saved_current);

This pattern appears to be recurring - please make a helper function
(which then also eases possibly addressing my earlier remark
regarding that playing with current).

> +    if ( rc != HVMCOPY_okay )
> +    {
> +        printk("Unable to copy XSDT into guest memory\n");
> +        return -EFAULT;
> +    }
> +    xfree(xsdt);
> +
> +    return 0;
> +}
> +
> +
> +static int __init hvm_setup_acpi(struct domain *d, paddr_t start_info)

Only one blank line between functions please.

> +{
> +    struct vcpu *saved_current, *v = d->vcpu[0];
> +    struct acpi_table_rsdp rsdp;
> +    unsigned long pfn, nr_pages;
> +    paddr_t madt_paddr, xsdt_paddr, rsdp_paddr;
> +    unsigned int i;
> +    int rc;
> +
> +    /* Identity map ACPI e820 regions. */
> +    for ( i = 0; i < d->arch.nr_e820; i++ )
> +    {
> +        if ( d->arch.e820[i].type != E820_ACPI &&
> +             d->arch.e820[i].type != E820_NVS )
> +            continue;
> +
> +        pfn = PFN_DOWN(d->arch.e820[i].addr);
> +        nr_pages = DIV_ROUND_UP(d->arch.e820[i].size, PAGE_SIZE);
> +
> +        rc = modify_identity_mmio(d, pfn, nr_pages, true);
> +        if ( rc )
> +        {
> +            printk(
> +                "Failed to map ACPI region [%#lx, %#lx) into Dom0 memory 
> map\n",
> +                   pfn, pfn + nr_pages);
> +            return rc;
> +        }
> +    }
> +
> +    rc = hvm_setup_acpi_madt(d, &madt_paddr);
> +    if ( rc )
> +        return rc;
> +
> +    rc = hvm_setup_acpi_xsdt(d, madt_paddr, &xsdt_paddr);
> +    if ( rc )
> +        return rc;

Coming back to the initial comment: If you did the 1:1 mapping last
and if you added problematic ranges to the E820 map, you wouldn't
need to call modify_identity_mmio() in two places.

> +    /* Craft a custom RSDP. */
> +    memset(&rsdp, 0, sizeof(rsdp));
> +    memcpy(&rsdp.signature, ACPI_SIG_RSDP, sizeof(rsdp.signature));
> +    memcpy(&rsdp.oem_id, "XenVMM", sizeof(rsdp.oem_id));

Is that a good idea? I think Dom0 should get to see the real OEM.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.