[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v5][XSA-97] x86/paging: make log-dirty operations preemptible



Both the freeing and the inspection of the bitmap get done in (nested)
loops which - besides having a rather high iteration count in general,
albeit that would be covered by XSA-77 - have the number of non-trivial
iterations they need to perform (indirectly) controllable by both the
guest they are for and any domain controlling the guest (including the
one running qemu for it).

Note that the tying of the continuations to the invoking domain (which
previously [wrongly] used the invoking vCPU instead) implies that the
tools requesting such operations have to make sure they don't issue
multiple similar operations in parallel.

Note further that this breaks supervisor-mode kernel assumptions in
hypercall_create_continuation() (where regs->eip gets rewound to the
current hypercall stub beginning), but otoh
hypercall_cancel_continuation() doesn't work in that mode either.
Perhaps time to rip out all the remains of that feature?

This is part of CVE-2014-5146 / XSA-97.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
Reviewed-by: Tim Deegan <tim@xxxxxxx>
---
v5: Also wire up helper hypercall in HVM/PVH hypercall tables (pointed
    out by Tim).
v4: Fold in Andrew's paging_mode_log_dirty() guarding adjustments to
    paging_log_dirty_disable() and its caller. Convert tying
    continuations to domain (was vCPU). Introduce a separate internal
    hypercall (__HYPERCALL_arch_1) to deal with the continuation (both
    suggested by Tim and Andrew). As a result hap_domctl() and
    shadow_domctl() now don't propagate -ERESTART into paging_domctl()
    anymore, as that would imply setting d->arch.paging.preempt fields
    without holding the paging lock. Sadly this results in mixed
    methods used for continuations here.
v3: Convert if(!resuming) to ASSERT() in paging_log_dirty_op().
v2: Re-order L4 loop continuation/termination handling in
    paging_free_log_dirty_bitmap(). Add an ASSERT() in a special case
    exit path of paging_log_dirty_op().

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1924,7 +1924,9 @@ int domain_relinquish_resources(struct d
         pci_release_devices(d);
 
         /* Tear down paging-assistance stuff. */
-        paging_teardown(d);
+        ret = paging_teardown(d);
+        if ( ret )
+            return ret;
 
         /* Drop the in-use references to page-table bases. */
         for_each_vcpu ( d, v )
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -61,9 +61,11 @@ long arch_do_domctl(
 
     case XEN_DOMCTL_shadow_op:
     {
-        ret = paging_domctl(d,
-                            &domctl->u.shadow_op,
-                            guest_handle_cast(u_domctl, void));
+        ret = paging_domctl(d, &domctl->u.shadow_op,
+                            guest_handle_cast(u_domctl, void), 0);
+        if ( ret == -ERESTART )
+            return hypercall_create_continuation(__HYPERVISOR_arch_1,
+                                                 "h", u_domctl);
         copyback = 1;
     }
     break;
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -4794,7 +4794,8 @@ static hvm_hypercall_t *const hvm_hyperc
     HYPERCALL(hvm_op),
     HYPERCALL(sysctl),
     HYPERCALL(domctl),
-    HYPERCALL(tmem_op)
+    HYPERCALL(tmem_op),
+    [ __HYPERVISOR_arch_1 ] = (hvm_hypercall_t *)paging_domctl_continuation
 };
 
 #define COMPAT_CALL(x)                                        \
@@ -4814,7 +4815,8 @@ static hvm_hypercall_t *const hvm_hyperc
     HYPERCALL(hvm_op),
     HYPERCALL(sysctl),
     HYPERCALL(domctl),
-    HYPERCALL(tmem_op)
+    HYPERCALL(tmem_op),
+    [ __HYPERVISOR_arch_1 ] = (hvm_hypercall_t *)paging_domctl_continuation
 };
 
 /* PVH 32bitfixme. */
@@ -4832,7 +4834,8 @@ static hvm_hypercall_t *const pvh_hyperc
     [ __HYPERVISOR_physdev_op ]      = (hvm_hypercall_t *)hvm_physdev_op,
     HYPERCALL(hvm_op),
     HYPERCALL(sysctl),
-    HYPERCALL(domctl)
+    HYPERCALL(domctl),
+    [ __HYPERVISOR_arch_1 ] = (hvm_hypercall_t *)paging_domctl_continuation
 };
 
 int hvm_do_hypercall(struct cpu_user_regs *regs)
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -26,6 +26,7 @@
 #include <asm/shadow.h>
 #include <asm/p2m.h>
 #include <asm/hap.h>
+#include <asm/event.h>
 #include <asm/hvm/nestedhvm.h>
 #include <xen/numa.h>
 #include <xsm/xsm.h>
@@ -116,26 +117,46 @@ static void paging_free_log_dirty_page(s
     d->arch.paging.free_page(d, mfn_to_page(mfn));
 }
 
-void paging_free_log_dirty_bitmap(struct domain *d)
+static int paging_free_log_dirty_bitmap(struct domain *d, int rc)
 {
     mfn_t *l4, *l3, *l2;
     int i4, i3, i2;
 
+    paging_lock(d);
+
     if ( !mfn_valid(d->arch.paging.log_dirty.top) )
-        return;
+    {
+        paging_unlock(d);
+        return 0;
+    }
 
-    paging_lock(d);
+    if ( !d->arch.paging.preempt.dom )
+    {
+        memset(&d->arch.paging.preempt.log_dirty, 0,
+               sizeof(d->arch.paging.preempt.log_dirty));
+        ASSERT(rc <= 0);
+        d->arch.paging.preempt.log_dirty.done = -rc;
+    }
+    else if ( d->arch.paging.preempt.dom != current->domain ||
+              d->arch.paging.preempt.op != XEN_DOMCTL_SHADOW_OP_OFF )
+    {
+        paging_unlock(d);
+        return -EBUSY;
+    }
 
     l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+    i4 = d->arch.paging.preempt.log_dirty.i4;
+    i3 = d->arch.paging.preempt.log_dirty.i3;
+    rc = 0;
 
-    for ( i4 = 0; i4 < LOGDIRTY_NODE_ENTRIES; i4++ )
+    for ( ; i4 < LOGDIRTY_NODE_ENTRIES; i4++, i3 = 0 )
     {
         if ( !mfn_valid(l4[i4]) )
             continue;
 
         l3 = map_domain_page(mfn_x(l4[i4]));
 
-        for ( i3 = 0; i3 < LOGDIRTY_NODE_ENTRIES; i3++ )
+        for ( ; i3 < LOGDIRTY_NODE_ENTRIES; i3++ )
         {
             if ( !mfn_valid(l3[i3]) )
                 continue;
@@ -148,20 +169,54 @@ void paging_free_log_dirty_bitmap(struct
 
             unmap_domain_page(l2);
             paging_free_log_dirty_page(d, l3[i3]);
+            l3[i3] = _mfn(INVALID_MFN);
+
+            if ( i3 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
+            {
+                d->arch.paging.preempt.log_dirty.i3 = i3 + 1;
+                d->arch.paging.preempt.log_dirty.i4 = i4;
+                rc = -ERESTART;
+                break;
+            }
         }
 
         unmap_domain_page(l3);
+        if ( rc )
+            break;
         paging_free_log_dirty_page(d, l4[i4]);
+        l4[i4] = _mfn(INVALID_MFN);
+
+        if ( i4 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
+        {
+            d->arch.paging.preempt.log_dirty.i3 = 0;
+            d->arch.paging.preempt.log_dirty.i4 = i4 + 1;
+            rc = -ERESTART;
+            break;
+        }
     }
 
     unmap_domain_page(l4);
-    paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top);
-    d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
 
-    ASSERT(d->arch.paging.log_dirty.allocs == 0);
-    d->arch.paging.log_dirty.failed_allocs = 0;
+    if ( !rc )
+    {
+        paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top);
+        d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
+
+        ASSERT(d->arch.paging.log_dirty.allocs == 0);
+        d->arch.paging.log_dirty.failed_allocs = 0;
+
+        rc = -d->arch.paging.preempt.log_dirty.done;
+        d->arch.paging.preempt.dom = NULL;
+    }
+    else
+    {
+        d->arch.paging.preempt.dom = current->domain;
+        d->arch.paging.preempt.op = XEN_DOMCTL_SHADOW_OP_OFF;
+    }
 
     paging_unlock(d);
+
+    return rc;
 }
 
 int paging_log_dirty_enable(struct domain *d, bool_t log_global)
@@ -187,15 +242,25 @@ int paging_log_dirty_enable(struct domai
     return ret;
 }
 
-int paging_log_dirty_disable(struct domain *d)
+static int paging_log_dirty_disable(struct domain *d, bool_t resuming)
 {
-    int ret;
+    int ret = 1;
+
+    if ( !resuming )
+    {
+        domain_pause(d);
+        /* Safe because the domain is paused. */
+        if ( paging_mode_log_dirty(d) )
+        {
+            ret = d->arch.paging.log_dirty.disable_log_dirty(d);
+            ASSERT(ret <= 0);
+        }
+    }
+
+    ret = paging_free_log_dirty_bitmap(d, ret);
+    if ( ret == -ERESTART )
+        return ret;
 
-    domain_pause(d);
-    /* Safe because the domain is paused. */
-    ret = d->arch.paging.log_dirty.disable_log_dirty(d);
-    if ( !paging_mode_log_dirty(d) )
-        paging_free_log_dirty_bitmap(d);
     domain_unpause(d);
 
     return ret;
@@ -335,7 +400,9 @@ int paging_mfn_is_dirty(struct domain *d
 
 /* Read a domain's log-dirty bitmap and stats.  If the operation is a CLEAN,
  * clear the bitmap and stats as well. */
-int paging_log_dirty_op(struct domain *d, struct xen_domctl_shadow_op *sc)
+static int paging_log_dirty_op(struct domain *d,
+                               struct xen_domctl_shadow_op *sc,
+                               bool_t resuming)
 {
     int rv = 0, clean = 0, peek = 1;
     unsigned long pages = 0;
@@ -343,9 +410,22 @@ int paging_log_dirty_op(struct domain *d
     unsigned long *l1 = NULL;
     int i4, i3, i2;
 
-    domain_pause(d);
+    if ( !resuming )
+        domain_pause(d);
     paging_lock(d);
 
+    if ( !d->arch.paging.preempt.dom )
+        memset(&d->arch.paging.preempt.log_dirty, 0,
+               sizeof(d->arch.paging.preempt.log_dirty));
+    else if ( d->arch.paging.preempt.dom != current->domain ||
+              d->arch.paging.preempt.op != sc->op )
+    {
+        paging_unlock(d);
+        ASSERT(!resuming);
+        domain_unpause(d);
+        return -EBUSY;
+    }
+
     clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
 
     PAGING_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n",
@@ -374,17 +454,15 @@ int paging_log_dirty_op(struct domain *d
         goto out;
     }
 
-    pages = 0;
     l4 = paging_map_log_dirty_bitmap(d);
+    i4 = d->arch.paging.preempt.log_dirty.i4;
+    i3 = d->arch.paging.preempt.log_dirty.i3;
+    pages = d->arch.paging.preempt.log_dirty.done;
 
-    for ( i4 = 0;
-          (pages < sc->pages) && (i4 < LOGDIRTY_NODE_ENTRIES);
-          i4++ )
+    for ( ; (pages < sc->pages) && (i4 < LOGDIRTY_NODE_ENTRIES); i4++, i3 = 0 )
     {
         l3 = (l4 && mfn_valid(l4[i4])) ? map_domain_page(mfn_x(l4[i4])) : NULL;
-        for ( i3 = 0;
-              (pages < sc->pages) && (i3 < LOGDIRTY_NODE_ENTRIES);
-              i3++ )
+        for ( ; (pages < sc->pages) && (i3 < LOGDIRTY_NODE_ENTRIES); i3++ )
         {
             l2 = ((l3 && mfn_valid(l3[i3])) ?
                   map_domain_page(mfn_x(l3[i3])) : NULL);
@@ -419,18 +497,51 @@ int paging_log_dirty_op(struct domain *d
             }
             if ( l2 )
                 unmap_domain_page(l2);
+
+            if ( i3 < LOGDIRTY_NODE_ENTRIES - 1 && hypercall_preempt_check() )
+            {
+                d->arch.paging.preempt.log_dirty.i4 = i4;
+                d->arch.paging.preempt.log_dirty.i3 = i3 + 1;
+                rv = -ERESTART;
+                break;
+            }
         }
         if ( l3 )
             unmap_domain_page(l3);
+
+        if ( !rv && i4 < LOGDIRTY_NODE_ENTRIES - 1 &&
+             hypercall_preempt_check() )
+        {
+            d->arch.paging.preempt.log_dirty.i4 = i4 + 1;
+            d->arch.paging.preempt.log_dirty.i3 = 0;
+            rv = -ERESTART;
+        }
+        if ( rv )
+            break;
     }
     if ( l4 )
         unmap_domain_page(l4);
 
-    if ( pages < sc->pages )
-        sc->pages = pages;
+    if ( !rv )
+        d->arch.paging.preempt.dom = NULL;
+    else
+    {
+        d->arch.paging.preempt.dom = current->domain;
+        d->arch.paging.preempt.op = sc->op;
+        d->arch.paging.preempt.log_dirty.done = pages;
+    }
 
     paging_unlock(d);
 
+    if ( rv )
+    {
+        /* Never leave the domain paused for other errors. */
+        ASSERT(rv == -ERESTART);
+        return rv;
+    }
+
+    if ( pages < sc->pages )
+        sc->pages = pages;
     if ( clean )
     {
         /* We need to further call clean_dirty_bitmap() functions of specific
@@ -441,6 +552,7 @@ int paging_log_dirty_op(struct domain *d
     return rv;
 
  out:
+    d->arch.paging.preempt.dom = NULL;
     paging_unlock(d);
     domain_unpause(d);
 
@@ -504,12 +616,6 @@ void paging_log_dirty_init(struct domain
     d->arch.paging.log_dirty.clean_dirty_bitmap = clean_dirty_bitmap;
 }
 
-/* This function fress log dirty bitmap resources. */
-static void paging_log_dirty_teardown(struct domain*d)
-{
-    paging_free_log_dirty_bitmap(d);
-}
-
 /************************************************/
 /*           CODE FOR PAGING SUPPORT            */
 /************************************************/
@@ -551,7 +657,7 @@ void paging_vcpu_init(struct vcpu *v)
 
 
 int paging_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
-                  XEN_GUEST_HANDLE_PARAM(void) u_domctl)
+                  XEN_GUEST_HANDLE_PARAM(void) u_domctl, bool_t resuming)
 {
     int rc;
 
@@ -575,6 +681,20 @@ int paging_domctl(struct domain *d, xen_
         return -EINVAL;
     }
 
+    if ( resuming
+         ? (d->arch.paging.preempt.dom != current->domain ||
+            d->arch.paging.preempt.op != sc->op)
+         : (d->arch.paging.preempt.dom &&
+            sc->op != XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION) )
+    {
+        printk(XENLOG_G_DEBUG
+               "%pv: Paging op %#x on Dom%u with unfinished prior op %#x by 
Dom%u\n",
+               current, sc->op, d->domain_id, d->arch.paging.preempt.op,
+               d->arch.paging.preempt.dom
+               ? d->arch.paging.preempt.dom->domain_id : DOMID_INVALID);
+        return -EBUSY;
+    }
+
     rc = xsm_shadow_control(XSM_HOOK, d, sc->op);
     if ( rc )
         return rc;
@@ -597,14 +717,13 @@ int paging_domctl(struct domain *d, xen_
         return paging_log_dirty_enable(d, 1);
 
     case XEN_DOMCTL_SHADOW_OP_OFF:
-        if ( paging_mode_log_dirty(d) )
-            if ( (rc = paging_log_dirty_disable(d)) != 0 )
-                return rc;
+        if ( (rc = paging_log_dirty_disable(d, resuming)) != 0 )
+            return rc;
         break;
 
     case XEN_DOMCTL_SHADOW_OP_CLEAN:
     case XEN_DOMCTL_SHADOW_OP_PEEK:
-        return paging_log_dirty_op(d, sc);
+        return paging_log_dirty_op(d, sc, resuming);
     }
 
     /* Here, dispatch domctl to the appropriate paging code */
@@ -614,19 +733,67 @@ int paging_domctl(struct domain *d, xen_
         return shadow_domctl(d, sc, u_domctl);
 }
 
+long paging_domctl_continuation(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+{
+    struct xen_domctl op;
+    struct domain *d;
+    int ret;
+
+    if ( copy_from_guest(&op, u_domctl, 1) )
+        return -EFAULT;
+
+    if ( op.interface_version != XEN_DOMCTL_INTERFACE_VERSION ||
+         op.cmd != XEN_DOMCTL_shadow_op )
+        return -EBADRQC;
+
+    d = rcu_lock_domain_by_id(op.domain);
+    if ( d == NULL )
+        return -ESRCH;
+
+    ret = xsm_domctl(XSM_OTHER, d, op.cmd);
+    if ( !ret )
+    {
+        if ( domctl_lock_acquire() )
+        {
+            ret = paging_domctl(d, &op.u.shadow_op,
+                                guest_handle_cast(u_domctl, void), 1);
+
+            domctl_lock_release();
+        }
+        else
+            ret = -ERESTART;
+    }
+
+    rcu_unlock_domain(d);
+
+    if ( ret == -ERESTART )
+        ret = hypercall_create_continuation(__HYPERVISOR_arch_1,
+                                            "h", u_domctl);
+    else if ( __copy_field_to_guest(u_domctl, &op, u.shadow_op) )
+        ret = -EFAULT;
+
+    return ret;
+}
+
 /* Call when destroying a domain */
-void paging_teardown(struct domain *d)
+int paging_teardown(struct domain *d)
 {
+    int rc;
+
     if ( hap_enabled(d) )
         hap_teardown(d);
     else
         shadow_teardown(d);
 
     /* clean up log dirty resources. */
-    paging_log_dirty_teardown(d);
+    rc = paging_free_log_dirty_bitmap(d, 0);
+    if ( rc == -ERESTART )
+        return rc;
 
     /* Move populate-on-demand cache back to domain_list for destruction */
     p2m_pod_empty_cache(d);
+
+    return rc;
 }
 
 /* Call once all of the references to the domain have gone away */
--- a/xen/arch/x86/x86_64/compat/entry.S
+++ b/xen/arch/x86/x86_64/compat/entry.S
@@ -421,6 +421,7 @@ ENTRY(compat_hypercall_table)
         .quad compat_ni_hypercall
         .endr
         .quad do_mca                    /* 48 */
+        .quad paging_domctl_continuation
         .rept NR_hypercalls-((.-compat_hypercall_table)/8)
         .quad compat_ni_hypercall
         .endr
@@ -469,6 +470,7 @@ ENTRY(compat_hypercall_args_table)
         .byte 0 /* compat_ni_hypercall      */
         .endr
         .byte 1 /* do_mca                   */
+        .byte 1 /* paging_domctl_continuation      */
         .rept NR_hypercalls-(.-compat_hypercall_args_table)
         .byte 0 /* compat_ni_hypercall      */
         .endr
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -765,6 +765,7 @@ ENTRY(hypercall_table)
         .quad do_ni_hypercall
         .endr
         .quad do_mca                /* 48 */
+        .quad paging_domctl_continuation
         .rept NR_hypercalls-((.-hypercall_table)/8)
         .quad do_ni_hypercall
         .endr
@@ -813,6 +814,7 @@ ENTRY(hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
         .byte 1 /* do_mca               */  /* 48 */
+        .byte 1 /* paging_domctl_continuation */
         .rept NR_hypercalls-(.-hypercall_args_table)
         .byte 0 /* do_ni_hypercall      */
         .endr
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -615,7 +615,6 @@ int domain_kill(struct domain *d)
         {
             if ( rc == -ERESTART )
                 rc = -EAGAIN;
-            BUG_ON(rc != -EAGAIN);
             break;
         }
         if ( sched_move_domain(d, cpupool0) )
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -186,6 +186,20 @@ struct paging_domain {
     struct hap_domain       hap;
     /* log dirty support */
     struct log_dirty_domain log_dirty;
+
+    /* preemption handling */
+    struct {
+        const struct domain *dom;
+        unsigned int op;
+        union {
+            struct {
+                unsigned long done:PADDR_BITS - PAGE_SHIFT;
+                unsigned long i4:PAGETABLE_ORDER;
+                unsigned long i3:PAGETABLE_ORDER;
+            } log_dirty;
+        };
+    } preempt;
+
     /* alloc/free pages from the pool for paging-assistance structures
      * (used by p2m and log-dirty code for their tries) */
     struct page_info * (*alloc_page)(struct domain *d);
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -132,9 +132,6 @@ struct paging_mode {
 /*****************************************************************************
  * Log dirty code */
 
-/* free log dirty bitmap resource */
-void paging_free_log_dirty_bitmap(struct domain *d);
-
 /* get the dirty bitmap for a specific range of pfns */
 void paging_log_dirty_range(struct domain *d,
                             unsigned long begin_pfn,
@@ -144,9 +141,6 @@ void paging_log_dirty_range(struct domai
 /* enable log dirty */
 int paging_log_dirty_enable(struct domain *d, bool_t log_global);
 
-/* disable log dirty */
-int paging_log_dirty_disable(struct domain *d);
-
 /* log dirty initialization */
 void paging_log_dirty_init(struct domain *d,
                            int  (*enable_log_dirty)(struct domain *d,
@@ -203,10 +197,13 @@ int paging_domain_init(struct domain *d,
  * and disable ephemeral shadow modes (test mode and log-dirty mode) and
  * manipulate the log-dirty bitmap. */
 int paging_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
-                  XEN_GUEST_HANDLE_PARAM(void) u_domctl);
+                  XEN_GUEST_HANDLE_PARAM(void) u_domctl, bool_t resuming);
+
+/* Helper hypercall for dealing with continuations. */
+long paging_domctl_continuation(XEN_GUEST_HANDLE_PARAM(xen_domctl_t));
 
 /* Call when destroying a domain */
-void paging_teardown(struct domain *d);
+int paging_teardown(struct domain *d);
 
 /* Call once all of the references to the domain have gone away */
 void paging_final_teardown(struct domain *d);


Attachment: xsa97-hap.patch
Description: Text document

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.