[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [RFC PATCH 01/10] xen: pci: add per-domain pci list lock



On Wed, 31 Aug 2022, Volodymyr Babchuk wrote:
> domain->pdevs_lock protects access to domain->pdev_list.
> As this, it should be used when we are adding, removing on enumerating
> PCI devices assigned to a domain.
> 
> This enables more granular locking instead of one huge pcidevs_lock that
> locks entire PCI subsystem. Please note that pcidevs_lock() is still
> used, we are going to remove it in subsequent patches.
> 
> Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@xxxxxxxx>

I reviewed the patch, and made sure to pay extra attention to:
- error paths
- missing locks
- lock ordering
- interruptions

Here is what I found:


1) iommu.c:reassign_device_ownership and pci_amd_iommu.c:reassign_device
Both functions without any pdevs_lock locking do:
list_move(&pdev->domain_list, &target->pdev_list);

It seems to be it would need pdevs_lock. Maybe we need to change
list_move into list_del (protected by the pdevs_lock of the old domain)
and list_add (protected by the pdev_lock of the new domain).


2) has_arch_pdevs
has_arch_pdevs is implemented as list_empty and needs locking as well,
however no domain->pdevs_lock are added to protect has_arch_pdevs in
this patch. I think we need pdevs_lock around has_arch_pdevs.


Two more comments below about lock inversion and taking the same lock
twice



> ---
>  xen/common/domain.c                         |  1 +
>  xen/drivers/passthrough/amd/iommu_cmd.c     |  4 ++-
>  xen/drivers/passthrough/amd/pci_amd_iommu.c |  7 ++++-
>  xen/drivers/passthrough/pci.c               | 29 ++++++++++++++++++++-
>  xen/drivers/passthrough/vtd/iommu.c         |  9 +++++--
>  xen/drivers/vpci/header.c                   |  3 +++
>  xen/drivers/vpci/msi.c                      |  7 ++++-
>  xen/drivers/vpci/vpci.c                     |  4 +--
>  xen/include/xen/pci.h                       |  2 +-
>  xen/include/xen/sched.h                     |  1 +
>  10 files changed, 58 insertions(+), 9 deletions(-)
> 
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 7062393e37..4611141b87 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -618,6 +618,7 @@ struct domain *domain_create(domid_t domid,
>  
>  #ifdef CONFIG_HAS_PCI
>      INIT_LIST_HEAD(&d->pdev_list);
> +    spin_lock_init(&d->pdevs_lock);
>  #endif
>  
>      /* All error paths can depend on the above setup. */
> diff --git a/xen/drivers/passthrough/amd/iommu_cmd.c 
> b/xen/drivers/passthrough/amd/iommu_cmd.c
> index 40ddf366bb..47c45398d4 100644
> --- a/xen/drivers/passthrough/amd/iommu_cmd.c
> +++ b/xen/drivers/passthrough/amd/iommu_cmd.c
> @@ -308,11 +308,12 @@ void amd_iommu_flush_iotlb(u8 devfn, const struct 
> pci_dev *pdev,
>      flush_command_buffer(iommu, iommu_dev_iotlb_timeout);
>  }
>  
> -static void amd_iommu_flush_all_iotlbs(const struct domain *d, daddr_t daddr,
> +static void amd_iommu_flush_all_iotlbs(struct domain *d, daddr_t daddr,
>                                         unsigned int order)
>  {
>      struct pci_dev *pdev;
>  
> +    spin_lock(&d->pdevs_lock);
>      for_each_pdev( d, pdev )
>      {
>          u8 devfn = pdev->devfn;
> @@ -323,6 +324,7 @@ static void amd_iommu_flush_all_iotlbs(const struct 
> domain *d, daddr_t daddr,
>          } while ( devfn != pdev->devfn &&
>                    PCI_SLOT(devfn) == PCI_SLOT(pdev->devfn) );
>      }
> +    spin_unlock(&d->pdevs_lock);
>  }
>  
>  /* Flush iommu cache after p2m changes. */
> diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c 
> b/xen/drivers/passthrough/amd/pci_amd_iommu.c
> index 4ba8e764b2..64c016491d 100644
> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
> +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
> @@ -96,20 +96,25 @@ static int __must_check allocate_domain_resources(struct 
> domain *d)
>      return rc;
>  }
>  
> -static bool any_pdev_behind_iommu(const struct domain *d,
> +static bool any_pdev_behind_iommu(struct domain *d,
>                                    const struct pci_dev *exclude,
>                                    const struct amd_iommu *iommu)
>  {
>      const struct pci_dev *pdev;
>  
> +    spin_lock(&d->pdevs_lock);
>      for_each_pdev ( d, pdev )
>      {
>          if ( pdev == exclude )
>              continue;
>  
>          if ( find_iommu_for_device(pdev->seg, pdev->sbdf.bdf) == iommu )
> +     {
> +         spin_unlock(&d->pdevs_lock);
>              return true;
> +     }

code style: tabs instead of spaces


>      }
> +    spin_unlock(&d->pdevs_lock);
>  
>      return false;
>  }
> diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
> index cdaf5c247f..4366f8f965 100644
> --- a/xen/drivers/passthrough/pci.c
> +++ b/xen/drivers/passthrough/pci.c
> @@ -523,7 +523,9 @@ static void __init _pci_hide_device(struct pci_dev *pdev)
>      if ( pdev->domain )
>          return;
>      pdev->domain = dom_xen;
> +    spin_lock(&dom_xen->pdevs_lock);
>      list_add(&pdev->domain_list, &dom_xen->pdev_list);
> +    spin_unlock(&dom_xen->pdevs_lock);
>  }
>  
>  int __init pci_hide_device(unsigned int seg, unsigned int bus,
> @@ -595,7 +597,7 @@ struct pci_dev *pci_get_real_pdev(pci_sbdf_t sbdf)
>      return pdev;
>  }
>  
> -struct pci_dev *pci_get_pdev(const struct domain *d, pci_sbdf_t sbdf)
> +struct pci_dev *pci_get_pdev(struct domain *d, pci_sbdf_t sbdf)
>  {
>      struct pci_dev *pdev;
>  
> @@ -620,9 +622,16 @@ struct pci_dev *pci_get_pdev(const struct domain *d, 
> pci_sbdf_t sbdf)
>                  return pdev;
>      }
>      else
> +    {
> +        spin_lock(&d->pdevs_lock);
>          list_for_each_entry ( pdev, &d->pdev_list, domain_list )
>              if ( pdev->sbdf.bdf == sbdf.bdf )
> +            {
> +                spin_unlock(&d->pdevs_lock);
>                  return pdev;
> +            }
> +        spin_unlock(&d->pdevs_lock);
> +    }
>  
>      return NULL;
>  }
> @@ -817,7 +826,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>      if ( !pdev->domain )
>      {
>          pdev->domain = hardware_domain;
> +        spin_lock(&hardware_domain->pdevs_lock);
>          list_add(&pdev->domain_list, &hardware_domain->pdev_list);
> +        spin_unlock(&hardware_domain->pdevs_lock);
>  
>          /*
>           * For devices not discovered by Xen during boot, add vPCI handlers
> @@ -827,7 +838,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>          if ( ret )
>          {
>              printk(XENLOG_ERR "Setup of vPCI failed: %d\n", ret);
> +            spin_lock(&pdev->domain->pdevs_lock);
>              list_del(&pdev->domain_list);
> +            spin_unlock(&pdev->domain->pdevs_lock);
>              pdev->domain = NULL;
>              goto out;
>          }
> @@ -835,7 +848,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>          if ( ret )
>          {
>              vpci_remove_device(pdev);
> +            spin_lock(&pdev->domain->pdevs_lock);
>              list_del(&pdev->domain_list);
> +            spin_unlock(&pdev->domain->pdevs_lock);
>              pdev->domain = NULL;
>              goto out;
>          }
> @@ -885,7 +900,11 @@ int pci_remove_device(u16 seg, u8 bus, u8 devfn)
>              pci_cleanup_msi(pdev);
>              ret = iommu_remove_device(pdev);
>              if ( pdev->domain )
> +            {
> +                spin_lock(&pdev->domain->pdevs_lock);
>                  list_del(&pdev->domain_list);
> +                spin_unlock(&pdev->domain->pdevs_lock);
> +            }
>              printk(XENLOG_DEBUG "PCI remove device %pp\n", &pdev->sbdf);
>              free_pdev(pseg, pdev);
>              break;
> @@ -967,12 +986,14 @@ int pci_release_devices(struct domain *d)
>          pcidevs_unlock();
>          return ret;
>      }
> +    spin_lock(&d->pdevs_lock);
>      list_for_each_entry_safe ( pdev, tmp, &d->pdev_list, domain_list )
>      {
>          bus = pdev->bus;
>          devfn = pdev->devfn;
>          ret = deassign_device(d, pdev->seg, bus, devfn) ?: ret;

This causes pdevs_lock to be taken twice. deassign_device also takes a
pdevs_lock.  Probably we need to change all the
spin_lock(&d->pdevs_lock) into spin_lock_recursive.



>      }
> +    spin_unlock(&d->pdevs_lock);
>      pcidevs_unlock();
>  
>      return ret;
> @@ -1194,7 +1215,9 @@ static int __hwdom_init cf_check 
> _setup_hwdom_pci_devices(
>              if ( !pdev->domain )
>              {
>                  pdev->domain = ctxt->d;
> +                spin_lock(&pdev->domain->pdevs_lock);
>                  list_add(&pdev->domain_list, &ctxt->d->pdev_list);
> +                spin_unlock(&pdev->domain->pdevs_lock);
>                  setup_one_hwdom_device(ctxt, pdev);
>              }
>              else if ( pdev->domain == dom_xen )
> @@ -1556,6 +1579,7 @@ static int iommu_get_device_group(
>          return group_id;
>  
>      pcidevs_lock();
> +    spin_lock(&d->pdevs_lock);
>      for_each_pdev( d, pdev )
>      {
>          unsigned int b = pdev->bus;
> @@ -1571,6 +1595,7 @@ static int iommu_get_device_group(
>          if ( sdev_id < 0 )
>          {
>              pcidevs_unlock();
> +            spin_unlock(&d->pdevs_lock);

lock inversion


>              return sdev_id;
>          }
>  
> @@ -1581,6 +1606,7 @@ static int iommu_get_device_group(
>              if ( unlikely(copy_to_guest_offset(buf, i, &bdf, 1)) )
>              {
>                  pcidevs_unlock();
> +                spin_unlock(&d->pdevs_lock);

lock inversion


>                  return -EFAULT;
>              }
>              i++;
> @@ -1588,6 +1614,7 @@ static int iommu_get_device_group(
>      }
>  
>      pcidevs_unlock();
> +    spin_unlock(&d->pdevs_lock);

lock inversion


>      return i;
>  }
> diff --git a/xen/drivers/passthrough/vtd/iommu.c 
> b/xen/drivers/passthrough/vtd/iommu.c
> index 62e143125d..fff1442265 100644
> --- a/xen/drivers/passthrough/vtd/iommu.c
> +++ b/xen/drivers/passthrough/vtd/iommu.c
> @@ -183,12 +183,13 @@ static void cleanup_domid_map(domid_t domid, struct 
> vtd_iommu *iommu)
>      }
>  }
>  
> -static bool any_pdev_behind_iommu(const struct domain *d,
> +static bool any_pdev_behind_iommu(struct domain *d,
>                                    const struct pci_dev *exclude,
>                                    const struct vtd_iommu *iommu)
>  {
>      const struct pci_dev *pdev;
>  
> +    spin_lock(&d->pdevs_lock);
>      for_each_pdev ( d, pdev )
>      {
>          const struct acpi_drhd_unit *drhd;
> @@ -198,8 +199,12 @@ static bool any_pdev_behind_iommu(const struct domain *d,
>  
>          drhd = acpi_find_matched_drhd_unit(pdev);
>          if ( drhd && drhd->iommu == iommu )
> +        {
> +            spin_unlock(&d->pdevs_lock);
>              return true;
> +        }
>      }
> +    spin_unlock(&d->pdevs_lock);
>  
>      return false;
>  }
> @@ -208,7 +213,7 @@ static bool any_pdev_behind_iommu(const struct domain *d,
>   * If no other devices under the same iommu owned by this domain,
>   * clear iommu in iommu_bitmap and clear domain_id in domid_bitmap.
>   */
> -static void check_cleanup_domid_map(const struct domain *d,
> +static void check_cleanup_domid_map(struct domain *d,
>                                      const struct pci_dev *exclude,
>                                      struct vtd_iommu *iommu)
>  {
> diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
> index a1c928a0d2..a59aa7ad0b 100644
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -267,6 +267,7 @@ static int modify_bars(const struct pci_dev *pdev, 
> uint16_t cmd, bool rom_only)
>       * Check for overlaps with other BARs. Note that only BARs that are
>       * currently mapped (enabled) are checked for overlaps.
>       */
> +    spin_lock(&pdev->domain->pdevs_lock);
>      for_each_pdev ( pdev->domain, tmp )
>      {
>          if ( tmp == pdev )
> @@ -306,11 +307,13 @@ static int modify_bars(const struct pci_dev *pdev, 
> uint16_t cmd, bool rom_only)
>                  printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
>                         start, end, rc);
>                  rangeset_destroy(mem);
> +                spin_unlock( &pdev->domain->pdevs_lock);
>                  return rc;
>              }
>          }
>      }
>  
> +    spin_unlock( &pdev->domain->pdevs_lock);
>      ASSERT(dev);
>  
>      if ( system_state < SYS_STATE_active )
> diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
> index 8f2b59e61a..8969c335b0 100644
> --- a/xen/drivers/vpci/msi.c
> +++ b/xen/drivers/vpci/msi.c
> @@ -265,7 +265,7 @@ REGISTER_VPCI_INIT(init_msi, VPCI_PRIORITY_LOW);
>  
>  void vpci_dump_msi(void)
>  {
> -    const struct domain *d;
> +    struct domain *d;
>  
>      rcu_read_lock(&domlist_read_lock);
>      for_each_domain ( d )
> @@ -277,6 +277,9 @@ void vpci_dump_msi(void)
>  
>          printk("vPCI MSI/MSI-X d%d\n", d->domain_id);
>  
> +        if ( !spin_trylock(&d->pdevs_lock) )
> +            continue;
> +
>          for_each_pdev ( d, pdev )
>          {
>              const struct vpci_msi *msi;
> @@ -326,6 +329,8 @@ void vpci_dump_msi(void)
>              spin_unlock(&pdev->vpci->lock);
>              process_pending_softirqs();
>          }
> +        spin_unlock(&d->pdevs_lock);
> +
>      }
>      rcu_read_unlock(&domlist_read_lock);
>  }
> diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
> index 3467c0de86..7d1f9fd438 100644
> --- a/xen/drivers/vpci/vpci.c
> +++ b/xen/drivers/vpci/vpci.c
> @@ -312,7 +312,7 @@ static uint32_t merge_result(uint32_t data, uint32_t new, 
> unsigned int size,
>  
>  uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size)
>  {
> -    const struct domain *d = current->domain;
> +    struct domain *d = current->domain;
>      const struct pci_dev *pdev;
>      const struct vpci_register *r;
>      unsigned int data_offset = 0;
> @@ -415,7 +415,7 @@ static void vpci_write_helper(const struct pci_dev *pdev,
>  void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size,
>                  uint32_t data)
>  {
> -    const struct domain *d = current->domain;
> +    struct domain *d = current->domain;
>      const struct pci_dev *pdev;
>      const struct vpci_register *r;
>      unsigned int data_offset = 0;
> diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
> index 5975ca2f30..19047b4b20 100644
> --- a/xen/include/xen/pci.h
> +++ b/xen/include/xen/pci.h
> @@ -177,7 +177,7 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>  int pci_remove_device(u16 seg, u8 bus, u8 devfn);
>  int pci_ro_device(int seg, int bus, int devfn);
>  int pci_hide_device(unsigned int seg, unsigned int bus, unsigned int devfn);
> -struct pci_dev *pci_get_pdev(const struct domain *d, pci_sbdf_t sbdf);
> +struct pci_dev *pci_get_pdev(struct domain *d, pci_sbdf_t sbdf);
>  struct pci_dev *pci_get_real_pdev(pci_sbdf_t sbdf);
>  void pci_check_disable_device(u16 seg, u8 bus, u8 devfn);
>  
> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
> index 1cf629e7ec..0775228ba9 100644
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -457,6 +457,7 @@ struct domain
>  
>  #ifdef CONFIG_HAS_PCI
>      struct list_head pdev_list;
> +    spinlock_t pdevs_lock;

I think it would be better called "pdev_lock" but OK either way


>  #endif
>  
>  #ifdef CONFIG_HAS_PASSTHROUGH
> -- 
> 2.36.1
> 



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.