[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [RFC PATCH] dpci: Put the dpci back on the list if running on another CPU.



> >> Additionally I think it should be considered whether the bitmap
> >> approach of interpreting ->state is the right one, and we don't
> >> instead want a clean 3-state (idle, sched, run) model.
> > 
> > Could you elaborate a bit more please? As in three different unsigned int
> > (or bool_t) that set in what state we are in?
> 
> An enum { STATE_IDLE, STATE_SCHED, STATE_RUN }. Especially
> if my comment above turns out to be wrong, you'd have no real
> need for the SCHED and RUN flags to be set at the same time.

I cobbled together what I believe is what you were thinking of.

As you can see to preserve the existing functionality such as
being able to schedule N amount of interrupt injections
for the N interrupts we might get - I modified '->masked'
to be an atomic counter.

The end result is that we can still live-lock. Unless we:
 - Drop on the floor the injection of N interrupts and
   just deliver at most one per VMX_EXIT (and not bother
   with interrupts arriving when we are in the VMX handler).

 - Alter the softirq code slightly - to have a variant
   which will only iterate once over the pending softirq
   bits per call. (so save a copy of the bitmap on the
   stack when entering the softirq handler - and use that.
   We could also xor it against the current one to catch any
   non-duplicate bits being set that we should deal with).

Here is the compile, but not run-time tested patch.

From e7d8bcd7c5d32c520554a4ad69c4716246036002 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
Date: Tue, 17 Mar 2015 13:31:52 -0400
Subject: [RFC PATCH] dpci: Switch to tristate instead of bitmap

*TODO*:
 - Writeup.
 - Tests

Suggested-by: Jan Beulich <jbeulich@xxxxxxxx>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
---
 xen/drivers/passthrough/io.c | 140 ++++++++++++++++++++++++-------------------
 xen/include/xen/hvm/irq.h    |   4 +-
 2 files changed, 82 insertions(+), 62 deletions(-)

diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
index ae050df..663e104 100644
--- a/xen/drivers/passthrough/io.c
+++ b/xen/drivers/passthrough/io.c
@@ -30,42 +30,28 @@
 static DEFINE_PER_CPU(struct list_head, dpci_list);
 
 /*
- * These two bit states help to safely schedule, deschedule, and wait until
- * the softirq has finished.
- *
- * The semantics behind these two bits is as follow:
- *  - STATE_SCHED - whoever modifies it has to ref-count the domain (->dom).
- *  - STATE_RUN - only softirq is allowed to set and clear it. If it has
- *      been set hvm_dirq_assist will RUN with a saved value of the
- *      'struct domain' copied from 'pirq_dpci->dom' before STATE_RUN was set.
- *
- * The usual states are: STATE_SCHED(set) -> STATE_RUN(set) ->
- * STATE_SCHED(unset) -> STATE_RUN(unset).
- *
- * However the states can also diverge such as: STATE_SCHED(set) ->
- * STATE_SCHED(unset) -> STATE_RUN(set) -> STATE_RUN(unset). That means
- * the 'hvm_dirq_assist' never run and that the softirq did not do any
- * ref-counting.
- */
-
-enum {
-    STATE_SCHED,
-    STATE_RUN
-};
-
-/*
  * This can be called multiple times, but the softirq is only raised once.
- * That is until the STATE_SCHED state has been cleared. The state can be
- * cleared by: the 'dpci_softirq' (when it has executed 'hvm_dirq_assist'),
- * or by 'pt_pirq_softirq_reset' (which will try to clear the state before
+ * That is until the state is STATE_INIT. The state can be changed by:
+ * the 'dpci_softirq' (when it has executed 'hvm_dirq_assist'),
+ * or by 'pt_pirq_softirq_reset' (which will try to init the state before
  * the softirq had a chance to run).
  */
 static void raise_softirq_for(struct hvm_pirq_dpci *pirq_dpci)
 {
     unsigned long flags;
 
-    if ( test_and_set_bit(STATE_SCHED, &pirq_dpci->state) )
+    switch ( cmpxchg(&pirq_dpci->state, STATE_INIT, STATE_SCHED) )
+    {
+    case STATE_RUN:
+    case STATE_SCHED:
+        /*
+         * The pirq_dpci->masked count has been incremented to let us know
+         * how many we have left to do.
+         */
         return;
+    case STATE_INIT:
+        break;
+    }
 
     get_knownalive_domain(pirq_dpci->dom);
 
@@ -85,7 +71,7 @@ static void raise_softirq_for(struct hvm_pirq_dpci *pirq_dpci)
  */
 bool_t pt_pirq_softirq_active(struct hvm_pirq_dpci *pirq_dpci)
 {
-    if ( pirq_dpci->state & ((1 << STATE_RUN) | (1 << STATE_SCHED)) )
+    if ( pirq_dpci->state != STATE_INIT )
         return 1;
 
     /*
@@ -109,22 +95,22 @@ static void pt_pirq_softirq_reset(struct hvm_pirq_dpci *pirq_dpci)
 
     ASSERT(spin_is_locked(&d->event_lock));
 
-    switch ( cmpxchg(&pirq_dpci->state, 1 << STATE_SCHED, 0) )
+    switch ( cmpxchg(&pirq_dpci->state, STATE_SCHED, STATE_INIT) )
     {
-    case (1 << STATE_SCHED):
+    case STATE_SCHED:
         /*
-         * We are going to try to de-schedule the softirq before it goes in
-         * STATE_RUN. Whoever clears STATE_SCHED MUST refcount the 'dom'.
+         * We are going to try to de-schedule the softirq before it goes to
+         * running state. Whoever moves from sched MUST refcount the 'dom'.
          */
         put_domain(d);
         /* fallthrough. */
-    case (1 << STATE_RUN):
-    case (1 << STATE_RUN) | (1 << STATE_SCHED):
+    case STATE_RUN:
+    case STATE_INIT:
         /*
-         * The reason it is OK to reset 'dom' when STATE_RUN bit is set is due
+         * The reason it is OK to reset 'dom' when init or running is set is due
          * to a shortcut the 'dpci_softirq' implements. It stashes the 'dom'
-         * in local variable before it sets STATE_RUN - and therefore will not
-         * dereference '->dom' which would crash.
+         * in local variable before it sets state to running - and therefore
+         * will not dereference '->dom' which would crash.
          */
         pirq_dpci->dom = NULL;
         break;
@@ -135,7 +121,7 @@ static void pt_pirq_softirq_reset(struct hvm_pirq_dpci *pirq_dpci)
      * or never initialized it). Note that we hold the lock that
      * 'hvm_dirq_assist' could be spinning on.
      */
-    pirq_dpci->masked = 0;
+    atomic_set(&pirq_dpci->masked, 0);
 }
 
 bool_t pt_irq_need_timer(uint32_t flags)
@@ -149,7 +135,8 @@ static int pt_irq_guest_eoi(struct domain *d, struct hvm_pirq_dpci *pirq_dpci,
     if ( __test_and_clear_bit(_HVM_IRQ_DPCI_EOI_LATCH_SHIFT,
                               &pirq_dpci->flags) )
     {
-        pirq_dpci->masked = 0;
+        ASSERT(atomic_read(&pirq_dpci->masked) <= 1);
+        atomic_set(&pirq_dpci->masked, 0);
         pirq_dpci->pending = 0;
         pirq_guest_eoi(dpci_pirq(pirq_dpci));
     }
@@ -278,6 +265,8 @@ int pt_irq_create_bind(
              * As such on every 'pt_irq_create_bind' call we MUST set it.
              */
             pirq_dpci->dom = d;
+            atomic_set(&pirq_dpci->masked, 0);
+            pirq_dpci->state = STATE_INIT;
             /* bind after hvm_irq_dpci is setup to avoid race with irq handler */
             rc = pirq_guest_bind(d->vcpu[0], info, 0);
             if ( rc == 0 && pt_irq_bind->u.msi.gtable )
@@ -398,6 +387,8 @@ int pt_irq_create_bind(
             if ( pt_irq_need_timer(pirq_dpci->flags) )
                 init_timer(&pirq_dpci->timer, pt_irq_time_out, pirq_dpci, 0);
             /* Deal with gsi for legacy devices */
+            atomic_set(&pirq_dpci->masked, 0);
+            pirq_dpci->state = STATE_INIT;
             rc = pirq_guest_bind(d->vcpu[0], info, share);
             if ( unlikely(rc) )
             {
@@ -576,6 +567,7 @@ bool_t pt_pirq_cleanup_check(struct hvm_pirq_dpci *dpci)
     if ( !dpci->flags && !pt_pirq_softirq_active(dpci) )
     {
         dpci->dom = NULL;
+        dpci->state = STATE_INIT;
         return 1;
     }
     return 0;
@@ -617,7 +609,7 @@ int hvm_do_IRQ_dpci(struct domain *d, struct pirq *pirq)
          !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
         return 0;
 
-    pirq_dpci->masked = 1;
+    atomic_inc(&pirq_dpci->masked);
     raise_softirq_for(pirq_dpci);
     return 1;
 }
@@ -672,12 +664,13 @@ void hvm_dpci_msi_eoi(struct domain *d, int vector)
     spin_unlock(&d->event_lock);
 }
 
-static void hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci)
+static bool_t hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci)
 {
+    int more;
     ASSERT(d->arch.hvm_domain.irq.dpci);
 
     spin_lock(&d->event_lock);
-    if ( test_and_clear_bool(pirq_dpci->masked) )
+    while ( (more = !atomic_dec_and_test(&pirq_dpci->masked)) )
     {
         struct pirq *pirq = dpci_pirq(pirq_dpci);
         const struct dev_intx_gsi_link *digl;
@@ -687,17 +680,13 @@ static void hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci)
             send_guest_pirq(d, pirq);
 
             if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI )
-            {
-                spin_unlock(&d->event_lock);
-                return;
-            }
+                break;
         }
 
         if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI )
         {
             vmsi_deliver_pirq(d, pirq_dpci);
-            spin_unlock(&d->event_lock);
-            return;
+            break;
         }
 
         list_for_each_entry ( digl, &pirq_dpci->digl_list, list )
@@ -710,8 +699,7 @@ static void hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci)
         {
             /* for translated MSI to INTx interrupt, eoi as early as possible */
             __msi_pirq_eoi(pirq_dpci);
-            spin_unlock(&d->event_lock);
-            return;
+            break;
         }
 
         /*
@@ -725,6 +713,8 @@ static void hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci)
         set_timer(&pirq_dpci->timer, NOW() + PT_IRQ_TIME_OUT);
     }
     spin_unlock(&d->event_lock);
+
+    return more;
 }
 
 static void __hvm_dpci_eoi(struct domain *d,
@@ -781,7 +771,7 @@ unlock:
 }
 
 /*
- * Note: 'pt_pirq_softirq_reset' can clear the STATE_SCHED before we get to
+ * Note: 'pt_pirq_softirq_reset' can move the state to init before we get to
 * doing it. If that is the case we let 'pt_pirq_softirq_reset' do ref-counting.
  */
 static void dpci_softirq(void)
@@ -797,23 +787,53 @@ static void dpci_softirq(void)
     {
         struct hvm_pirq_dpci *pirq_dpci;
         struct domain *d;
+        bool_t again = 0;
 
        pirq_dpci = list_entry(our_list.next, struct hvm_pirq_dpci, softirq_list);
         list_del(&pirq_dpci->softirq_list);
 
         d = pirq_dpci->dom;
         smp_mb(); /* 'd' MUST be saved before we set/clear the bits. */
-        if ( test_and_set_bit(STATE_RUN, &pirq_dpci->state) )
-            BUG();
-        /*
-         * The one who clears STATE_SCHED MUST refcount the domain.
-         */
-        if ( test_and_clear_bit(STATE_SCHED, &pirq_dpci->state) )
+
+        switch ( cmpxchg(&pirq_dpci->state, STATE_SCHED, STATE_RUN) )
+        {
+        case STATE_INIT:
+            /* pt_pirq_softirq_reset cleared it. Let it do the ref-counting. */
+            continue;
+        case STATE_RUN:
+            again = 1;
+            /* Fall through. */
+        case STATE_SCHED:
+            break;
+        }
+        if ( !again )
         {
-            hvm_dirq_assist(d, pirq_dpci);
+            /*
+             * The one who changes sched to running MUST refcount the domain.
+             */
+            again = hvm_dirq_assist(d, pirq_dpci);
             put_domain(d);
+            switch ( cmpxchg(&pirq_dpci->state, STATE_RUN, STATE_INIT) )
+            {
+            case STATE_SCHED:
+            case STATE_INIT:
+                BUG();
+            case STATE_RUN:
+                break;
+            }
+        }
+        if ( again )
+        {
+            unsigned long flags;
+
+            /* Put back on the list and retry. */
+            local_irq_save(flags);
+            list_add_tail(&pirq_dpci->softirq_list, &this_cpu(dpci_list));
+            local_irq_restore(flags);
+
+            raise_softirq(HVM_DPCI_SOFTIRQ);
+            continue;
         }
-        clear_bit(STATE_RUN, &pirq_dpci->state);
     }
 }
 
diff --git a/xen/include/xen/hvm/irq.h b/xen/include/xen/hvm/irq.h
index 3996f1f..48dbc51 100644
--- a/xen/include/xen/hvm/irq.h
+++ b/xen/include/xen/hvm/irq.h
@@ -93,8 +93,8 @@ struct hvm_irq_dpci {
 /* Machine IRQ to guest device/intx mapping. */
 struct hvm_pirq_dpci {
     uint32_t flags;
-    unsigned int state;
-    bool_t masked;
+    enum { STATE_INIT, STATE_SCHED, STATE_RUN } state;
+    atomic_t masked;
     uint16_t pending;
     struct list_head digl_list;
     struct domain *dom;
-- 
2.1.0

> 
> Jan
> 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.