[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v8 for-xen-4.5 1/2] dpci: Move from an hvm_irq_dpci (and struct domain) to an hvm_dirq_dpci model.



When an interrupt for an PCI (or PCIe) passthrough device
is to be sent to a guest, we find the appropiate
'hvm_dirq_dpci' structure for the interrupt (PIRQ), set
a bit (masked), and schedule an tasklet.

Then the 'hvm_dirq_assist' tasklet gets called with the 'struct
domain' from where it iterates over the the radix-tree of
'hvm_dirq_dpci' (from zero to the number of PIRQs allocated)
which are masked to the guest and calls each 'hvm_pirq_assist'.
If the PIRQ has a bit set (masked) it figures out how to
inject the PIRQ to the guest.

This is inefficient and not fair as:
 - We iterate starting at PIRQ 0 and up every time. That means
   the PCIe devices that have lower PIRQs get to be called
   first.
 - If we have many PCIe devices passed in with many PIRQs and
   if most of the time only the highest numbered PIRQ get an
   interrupt (as the initial ones are for control) we end up
   iterating over many PIRQs.

But we could do beter - the 'hvm_dirq_dpci' has the field for
'struct domain', which we can use instead of having to pass in the
'struct domain'.

As such this patch moves the tasklet to the 'struct hvm_dirq_dpci'
and sets the 'dom' field to the domain. We also double-check
that the '->dom' is not reset before using it.

We have to be careful with this as that means we MUST have
'dom' set before pirq_guest_bind() is called. As such
we add the 'pirq_dpci->dom = d;' to cover for all such
cases.

The mechanism to tear it down is more complex as there
are two ways it can be executed:

 a) pci_clean_dpci_irq. This gets called when the guest is
    being destroyed. We end up calling 'tasklet_kill'.

    The scenarios in which the 'struct pirq' (and subsequently
    the 'hvm_pirq_dpci') gets destroyed is when:

    - guest did not use the pirq at all after setup.
    - guest did use pirq, but decided to mask and left it in that
      state.
    - guest did use pirq, but crashed.

    In all of those scenarios we end up calling 'tasklet_kill'
    which will spin on the tasklet if it is running.

 b) pt_irq_destroy_bind (guest disables the MSI). We double-check
    that the softirq has run by piggy-backing on the existing
    'pirq_cleanup_check' mechanism which calls 'pt_pirq_cleanup_check'.
    We add the extra call to 'pt_pirq_softirq_active' in
    'pt_pirq_cleanup_check'.

    NOTE: Guests that use event channels unbind first the
    event channel from PIRQs, so the 'pt_pirq_cleanup_check'
    won't be called as eventch is set to zero. In that case
    we either clean it up via the a) mechanism. It is OK
    to re-use the tasklet when 'pt_irq_create_bind' is called
    afterwards.

    There is an extra scenario regardless of eventch being
    set or not: the guest did 'pt_irq_destroy_bind' while an
    interrupt was triggered and tasklet was scheduled (but had not
    been run). It is OK to still run the tasklet as
    hvm_dirq_assist won't do anything (as the flags are
    set to zero). As such we can exit out of hvm_dirq_assist
    without doing anything.

Suggested-by: Jan Beulich <jbeulich@xxxxxxxx>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
---
 xen/drivers/passthrough/io.c  | 75 +++++++++++++++++++++++++++++--------------
 xen/drivers/passthrough/pci.c |  4 +--
 xen/include/xen/hvm/irq.h     |  2 +-
 3 files changed, 54 insertions(+), 27 deletions(-)

diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
index 4cd32b5..0cf0a52 100644
--- a/xen/drivers/passthrough/io.c
+++ b/xen/drivers/passthrough/io.c
@@ -27,7 +27,7 @@
 #include <xen/hvm/irq.h>
 #include <xen/tasklet.h>
 
-static void hvm_dirq_assist(unsigned long _d);
+static void hvm_dirq_assist(unsigned long arg);
 
 bool_t pt_irq_need_timer(uint32_t flags)
 {
@@ -114,9 +114,6 @@ int pt_irq_create_bind(
             spin_unlock(&d->event_lock);
             return -ENOMEM;
         }
-        softirq_tasklet_init(
-            &hvm_irq_dpci->dirq_tasklet,
-            hvm_dirq_assist, (unsigned long)d);
         for ( i = 0; i < NR_HVM_IRQS; i++ )
             INIT_LIST_HEAD(&hvm_irq_dpci->girq[i]);
 
@@ -130,6 +127,18 @@ int pt_irq_create_bind(
         return -ENOMEM;
     }
     pirq_dpci = pirq_dpci(info);
+    /*
+     * The 'pt_irq_create_bind' can be called right after 'pt_irq_destroy_bind'
+     * was called. The 'pirq_cleanup_check' which would free the structure
+     * is only called if the event channel for the PIRQ is active. However
+     * OS-es that use event channels usually bind the PIRQ to an event channel
+     * and also unbind it before 'pt_irq_destroy_bind' is called which means
+     * we end up re-using the 'dpci' structure. This can be easily reproduced
+     * with unloading and loading the driver for the device.
+     *
+     * As such on every 'pt_irq_create_bind' call we MUST reset the values.
+     */
+    pirq_dpci->dom = d;
 
     switch ( pt_irq_bind->irq_type )
     {
@@ -156,6 +165,7 @@ int pt_irq_create_bind(
             {
                 pirq_dpci->gmsi.gflags = 0;
                 pirq_dpci->gmsi.gvec = 0;
+                pirq_dpci->dom = NULL;
                 pirq_dpci->flags = 0;
                 pirq_cleanup_check(info, d);
                 spin_unlock(&d->event_lock);
@@ -232,7 +242,6 @@ int pt_irq_create_bind(
         {
             unsigned int share;
 
-            pirq_dpci->dom = d;
             if ( pt_irq_bind->irq_type == PT_IRQ_TYPE_MSI_TRANSLATE )
             {
                 pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED |
@@ -415,11 +424,18 @@ void pt_pirq_init(struct domain *d, struct hvm_pirq_dpci 
*dpci)
 {
     INIT_LIST_HEAD(&dpci->digl_list);
     dpci->gmsi.dest_vcpu_id = -1;
+    softirq_tasklet_init(&dpci->tasklet, hvm_dirq_assist, (unsigned long)dpci);
 }
 
 bool_t pt_pirq_cleanup_check(struct hvm_pirq_dpci *dpci)
 {
-    return !dpci->flags;
+    if ( !dpci->flags )
+    {
+        tasklet_kill(&dpci->tasklet);
+        dpci->dom = NULL;
+        return 1;
+    }
+    return 0;
 }
 
 int pt_pirq_iterate(struct domain *d,
@@ -459,7 +475,7 @@ int hvm_do_IRQ_dpci(struct domain *d, struct pirq *pirq)
         return 0;
 
     pirq_dpci->masked = 1;
-    tasklet_schedule(&dpci->dirq_tasklet);
+    tasklet_schedule(&pirq_dpci->tasklet);
     return 1;
 }
 
@@ -513,9 +529,27 @@ void hvm_dpci_msi_eoi(struct domain *d, int vector)
     spin_unlock(&d->event_lock);
 }
 
-static int _hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci,
-                            void *arg)
+static void hvm_dirq_assist(unsigned long arg)
 {
+    struct hvm_pirq_dpci *pirq_dpci = (struct hvm_pirq_dpci *)arg;
+    struct domain *d = pirq_dpci->dom;
+
+    /*
+     * We can be racing with 'pt_irq_destroy_bind' - with us being scheduled
+     * right before 'pirq_guest_unbind' gets called - but us not yet executed.
+     *
+     * And '->dom' gets cleared later in the destroy path. We exit and clear
+     * 'masked' - which is OK as later in this code we would
+     * do nothing except clear the ->masked field anyhow.
+     */
+    if ( !d )
+    {
+        pirq_dpci->masked = 0;
+        return;
+    }
+    ASSERT(d->arch.hvm_domain.irq.dpci);
+
+    spin_lock(&d->event_lock);
     if ( test_and_clear_bool(pirq_dpci->masked) )
     {
         struct pirq *pirq = dpci_pirq(pirq_dpci);
@@ -526,13 +560,17 @@ static int _hvm_dirq_assist(struct domain *d, struct 
hvm_pirq_dpci *pirq_dpci,
             send_guest_pirq(d, pirq);
 
             if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI )
-                return 0;
+            {
+                spin_unlock(&d->event_lock);
+                return;
+            }
         }
 
         if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI )
         {
             vmsi_deliver_pirq(d, pirq_dpci);
-            return 0;
+            spin_unlock(&d->event_lock);
+            return;
         }
 
         list_for_each_entry ( digl, &pirq_dpci->digl_list, list )
@@ -545,7 +583,8 @@ static int _hvm_dirq_assist(struct domain *d, struct 
hvm_pirq_dpci *pirq_dpci,
         {
             /* for translated MSI to INTx interrupt, eoi as early as possible 
*/
             __msi_pirq_eoi(pirq_dpci);
-            return 0;
+            spin_unlock(&d->event_lock);
+            return;
         }
 
         /*
@@ -558,18 +597,6 @@ static int _hvm_dirq_assist(struct domain *d, struct 
hvm_pirq_dpci *pirq_dpci,
         ASSERT(pt_irq_need_timer(pirq_dpci->flags));
         set_timer(&pirq_dpci->timer, NOW() + PT_IRQ_TIME_OUT);
     }
-
-    return 0;
-}
-
-static void hvm_dirq_assist(unsigned long _d)
-{
-    struct domain *d = (struct domain *)_d;
-
-    ASSERT(d->arch.hvm_domain.irq.dpci);
-
-    spin_lock(&d->event_lock);
-    pt_pirq_iterate(d, _hvm_dirq_assist, NULL);
     spin_unlock(&d->event_lock);
 }
 
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 1eba833..81e8a3a 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -767,6 +767,8 @@ static int pci_clean_dpci_irq(struct domain *d,
         xfree(digl);
     }
 
+    tasklet_kill(&pirq_dpci->tasklet);
+
     return 0;
 }
 
@@ -784,8 +786,6 @@ static void pci_clean_dpci_irqs(struct domain *d)
     hvm_irq_dpci = domain_get_irq_dpci(d);
     if ( hvm_irq_dpci != NULL )
     {
-        tasklet_kill(&hvm_irq_dpci->dirq_tasklet);
-
         pt_pirq_iterate(d, pci_clean_dpci_irq, NULL);
 
         d->arch.hvm_domain.irq.dpci = NULL;
diff --git a/xen/include/xen/hvm/irq.h b/xen/include/xen/hvm/irq.h
index c89f4b1..94a550a 100644
--- a/xen/include/xen/hvm/irq.h
+++ b/xen/include/xen/hvm/irq.h
@@ -88,7 +88,6 @@ struct hvm_irq_dpci {
     DECLARE_BITMAP(isairq_map, NR_ISAIRQS);
     /* Record of mapped Links */
     uint8_t link_cnt[NR_LINK];
-    struct tasklet dirq_tasklet;
 };
 
 /* Machine IRQ to guest device/intx mapping. */
@@ -100,6 +99,7 @@ struct hvm_pirq_dpci {
     struct domain *dom;
     struct hvm_gmsi_info gmsi;
     struct timer timer;
+    struct tasklet tasklet;
 };
 
 void pt_pirq_init(struct domain *, struct hvm_pirq_dpci *);
-- 
1.9.3


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.