|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [PATCH v3 06/11] vpci/header: Handle p2m range sets per BAR
On Thu, Sep 30, 2021 at 10:52:18AM +0300, Oleksandr Andrushchenko wrote:
> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@xxxxxxxx>
>
> Instead of handling a single range set, that contains all the memory
> regions of all the BARs and ROM, have them per BAR.
>
> This is in preparation of making non-identity mappings in p2m for the
> MMIOs/ROM.
>
> Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@xxxxxxxx>
> ---
> xen/drivers/vpci/header.c | 172 ++++++++++++++++++++++++++------------
> xen/include/xen/vpci.h | 3 +-
> 2 files changed, 122 insertions(+), 53 deletions(-)
>
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index ec4d215f36ff..9c603d26d302 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -131,49 +131,75 @@ static void modify_decoding(const struct pci_dev *pdev,
> uint16_t cmd,
>
> bool vpci_process_pending(struct vcpu *v)
> {
> - if ( v->vpci.mem )
> + if ( v->vpci.num_mem_ranges )
> {
> struct map_data data = {
> .d = v->domain,
> .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
> };
> - int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
> + struct pci_dev *pdev = v->vpci.pdev;
> + struct vpci_header *header = &pdev->vpci->header;
> + unsigned int i;
>
> - if ( rc == -ERESTART )
> - return true;
> + for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> + {
> + struct vpci_bar *bar = &header->bars[i];
> + int rc;
>
> - spin_lock(&v->vpci.pdev->vpci->lock);
> - /* Disable memory decoding unconditionally on failure. */
> - modify_decoding(v->vpci.pdev,
> - rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
> - !rc && v->vpci.rom_only);
> - spin_unlock(&v->vpci.pdev->vpci->lock);
> + if ( !bar->mem )
> + continue;
>
> - rangeset_destroy(v->vpci.mem);
> - v->vpci.mem = NULL;
> - if ( rc )
> - /*
> - * FIXME: in case of failure remove the device from the domain.
> - * Note that there might still be leftover mappings. While this
> is
> - * safe for Dom0, for DomUs the domain will likely need to be
> - * killed in order to avoid leaking stale p2m mappings on
> - * failure.
> - */
> - vpci_remove_device(v->vpci.pdev);
> + rc = rangeset_consume_ranges(bar->mem, map_range, &data);
> +
> + if ( rc == -ERESTART )
> + return true;
> +
> + spin_lock(&pdev->vpci->lock);
> + /* Disable memory decoding unconditionally on failure. */
> + modify_decoding(pdev,
> + rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY :
> v->vpci.cmd,
> + !rc && v->vpci.rom_only);
> + spin_unlock(&pdev->vpci->lock);
> +
> + rangeset_destroy(bar->mem);
Now that the rangesets are per-BAR we might have to consider
allocating them at initialization time and not destroying them when
empty. We could replace the NULL checks with rangeset_is_empty
instead. Not that you have to do this on this patch, but I think it's
worth mentioning.
> + bar->mem = NULL;
> + v->vpci.num_mem_ranges--;
> + if ( rc )
> + /*
> + * FIXME: in case of failure remove the device from the
> domain.
> + * Note that there might still be leftover mappings. While
> this is
> + * safe for Dom0, for DomUs the domain will likely need to be
> + * killed in order to avoid leaking stale p2m mappings on
> + * failure.
> + */
> + vpci_remove_device(pdev);
> + }
> }
>
> return false;
> }
>
> static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
> - struct rangeset *mem, uint16_t cmd)
> + uint16_t cmd)
> {
> struct map_data data = { .d = d, .map = true };
> - int rc;
> + struct vpci_header *header = &pdev->vpci->header;
> + int rc = 0;
> + unsigned int i;
> +
> + for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> + {
> + struct vpci_bar *bar = &header->bars[i];
>
> - while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) ==
> -ERESTART )
> - process_pending_softirqs();
> - rangeset_destroy(mem);
> + if ( !bar->mem )
> + continue;
> +
> + while ( (rc = rangeset_consume_ranges(bar->mem, map_range,
> + &data)) == -ERESTART )
> + process_pending_softirqs();
> + rangeset_destroy(bar->mem);
> + bar->mem = NULL;
> + }
> if ( !rc )
> modify_decoding(pdev, cmd, false);
>
> @@ -181,7 +207,7 @@ static int __init apply_map(struct domain *d, const
> struct pci_dev *pdev,
> }
>
> static void defer_map(struct domain *d, struct pci_dev *pdev,
> - struct rangeset *mem, uint16_t cmd, bool rom_only)
> + uint16_t cmd, bool rom_only, uint8_t num_mem_ranges)
Like mentioned below, I don't think you need to pass the number of
BARs that need mapping changes. Iff that's strictly needed, it should
be an unsigned int.
> {
> struct vcpu *curr = current;
>
> @@ -192,9 +218,9 @@ static void defer_map(struct domain *d, struct pci_dev
> *pdev,
> * started for the same device if the domain is not well-behaved.
> */
> curr->vpci.pdev = pdev;
> - curr->vpci.mem = mem;
> curr->vpci.cmd = cmd;
> curr->vpci.rom_only = rom_only;
> + curr->vpci.num_mem_ranges = num_mem_ranges;
> /*
> * Raise a scheduler softirq in order to prevent the guest from resuming
> * execution with pending mapping operations, to trigger the invocation
> @@ -206,42 +232,47 @@ static void defer_map(struct domain *d, struct pci_dev
> *pdev,
> static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool
> rom_only)
> {
> struct vpci_header *header = &pdev->vpci->header;
> - struct rangeset *mem = rangeset_new(NULL, NULL, 0);
> struct pci_dev *tmp, *dev = NULL;
> const struct vpci_msix *msix = pdev->vpci->msix;
> - unsigned int i;
> + unsigned int i, j;
> int rc;
> -
> - if ( !mem )
> - return -ENOMEM;
> + uint8_t num_mem_ranges;
>
> /*
> - * Create a rangeset that represents the current device BARs memory
> region
> + * Create a rangeset per BAR that represents the current device memory
> region
> * and compare it against all the currently active BAR memory regions. If
> * an overlap is found, subtract it from the region to be
> mapped/unmapped.
> *
> - * First fill the rangeset with all the BARs of this device or with the
> ROM
> + * First fill the rangesets with all the BARs of this device or with the
> ROM
> * BAR only, depending on whether the guest is toggling the memory decode
> * bit of the command register, or the enable bit of the ROM BAR
> register.
> */
> for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> {
> - const struct vpci_bar *bar = &header->bars[i];
> + struct vpci_bar *bar = &header->bars[i];
> unsigned long start = PFN_DOWN(bar->addr);
> unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>
> + bar->mem = NULL;
Why do you need to set mem to NULL here? I think we should instead
assert that bar->mem == NULL here.
> +
> if ( !MAPPABLE_BAR(bar) ||
> (rom_only ? bar->type != VPCI_BAR_ROM
> : (bar->type == VPCI_BAR_ROM &&
> !header->rom_enabled)) )
> continue;
>
> - rc = rangeset_add_range(mem, start, end);
> + bar->mem = rangeset_new(NULL, NULL, 0);
> + if ( !bar->mem )
> + {
> + rc = -ENOMEM;
> + goto fail;
> + }
> +
> + rc = rangeset_add_range(bar->mem, start, end);
> if ( rc )
> {
> printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
> start, end, rc);
> - rangeset_destroy(mem);
> - return rc;
> + goto fail;
> }
> }
>
> @@ -252,14 +283,21 @@ static int modify_bars(const struct pci_dev *pdev,
> uint16_t cmd, bool rom_only)
> unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
> vmsix_table_size(pdev->vpci, i) - 1);
>
> - rc = rangeset_remove_range(mem, start, end);
> - if ( rc )
> + for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
> {
> - printk(XENLOG_G_WARNING
> - "Failed to remove MSIX table [%lx, %lx]: %d\n",
> - start, end, rc);
> - rangeset_destroy(mem);
> - return rc;
> + const struct vpci_bar *bar = &header->bars[j];
> +
> + if ( !bar->mem )
> + continue;
> +
> + rc = rangeset_remove_range(bar->mem, start, end);
> + if ( rc )
> + {
> + printk(XENLOG_G_WARNING
> + "Failed to remove MSIX table [%lx, %lx]: %d\n",
> + start, end, rc);
> + goto fail;
> + }
> }
> }
>
> @@ -291,7 +329,8 @@ static int modify_bars(const struct pci_dev *pdev,
> uint16_t cmd, bool rom_only)
> unsigned long start = PFN_DOWN(bar->addr);
> unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
>
> - if ( !bar->enabled || !rangeset_overlaps_range(mem, start, end)
> ||
> + if ( !bar->enabled ||
> + !rangeset_overlaps_range(bar->mem, start, end) ||
> /*
> * If only the ROM enable bit is toggled check against other
> * BARs in the same device for overlaps, but not against the
> @@ -300,13 +339,12 @@ static int modify_bars(const struct pci_dev *pdev,
> uint16_t cmd, bool rom_only)
> (rom_only && tmp == pdev && bar->type == VPCI_BAR_ROM) )
> continue;
>
> - rc = rangeset_remove_range(mem, start, end);
> + rc = rangeset_remove_range(bar->mem, start, end);
> if ( rc )
> {
> printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
> start, end, rc);
> - rangeset_destroy(mem);
> - return rc;
> + goto fail;
> }
> }
> }
> @@ -324,12 +362,42 @@ static int modify_bars(const struct pci_dev *pdev,
> uint16_t cmd, bool rom_only)
> * will always be to establish mappings and process all the BARs.
> */
> ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
> - return apply_map(pdev->domain, pdev, mem, cmd);
> + return apply_map(pdev->domain, pdev, cmd);
> }
>
> - defer_map(dev->domain, dev, mem, cmd, rom_only);
> + /* Find out how many memory ranges has left after MSI and overlaps. */
> + num_mem_ranges = 0;
> + for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> + {
> + struct vpci_bar *bar = &header->bars[i];
There's no need to declare this local variable AFAICT, just use
header->bars[i].mem. In any case this is likely to go away if you
follow my recommendation below to just call defer_map unconditionally
like it's currently done.
> +
> + if ( !rangeset_is_empty(bar->mem) )
> + num_mem_ranges++;
> + }
> +
> + /*
> + * There are cases when PCI device, root port for example, has neither
> + * memory space nor IO. In this case PCI command register write is
> + * missed resulting in the underlying PCI device not functional, so:
> + * - if there are no regions write the command register now
> + * - if there are regions then defer work and write later on
> + */
> + if ( !num_mem_ranges )
> + pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
I think this is wrong, as not calling defer_map will prevent the
rangesets (bar[i]->mem) from being destroyed, so we are effectively
leaking memory.
You need to take a path similar to the failure one in case there are
no mappings pending, or even better just call defer_map anyway and let
it do it's thing, it should be capable of handling empty rangesets
just fine. That's how it's currently done.
> + else
> + defer_map(dev->domain, dev, cmd, rom_only, num_mem_ranges);
>
> return 0;
> +
> +fail:
We usually ask labels to be indented with one space.
> + for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
> + {
> + struct vpci_bar *bar = &header->bars[i];
> +
> + rangeset_destroy(bar->mem);
> + bar->mem = NULL;
> + }
> + return rc;
> }
>
> static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
> diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
> index a0320b22cb36..352e02d0106d 100644
> --- a/xen/include/xen/vpci.h
> +++ b/xen/include/xen/vpci.h
> @@ -80,6 +80,7 @@ struct vpci {
> /* Guest view of the BAR. */
> uint64_t guest_addr;
> uint64_t size;
> + struct rangeset *mem;
> enum {
> VPCI_BAR_EMPTY,
> VPCI_BAR_IO,
> @@ -154,9 +155,9 @@ struct vpci {
>
> struct vpci_vcpu {
> /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
> - struct rangeset *mem;
> struct pci_dev *pdev;
> uint16_t cmd;
> + uint8_t num_mem_ranges;
AFAICT This could be a simple bool:
bool map_pending : 1;
As there's no strict need to know how many BARs have pending mappings.
Thanks, Roger.
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |