[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v3] xen: introduce VCPUOP_register_runstate_phys_memory_area hypercall



From: Andrii Anisov <andrii_anisov@xxxxxxxx>

The existing interface to register the runstate area with its virtual
address is prone to issues which became more obvious with KPTI enablement in
guests. The nature of those issues is the fact that the guest could
be interrupted by the hypervisor at any time, and there is no guarantee
to have the registered virtual address translated with the currently
available guest's page tables. Before the KPTI such a situation was
possible in case the guest is caught in the middle of PT processing
(e.g. superpage shattering). With the KPTI this happens also when the
guest runs userspace, so has a pretty high probability.
So it was agreed to register runstate with the guest's physical address
so that its mapping is permanent from the hypervisor point of view. [1]

The hypercall employs the same vcpu_register_runstate_memory_area
structure for the interface, but requires a registered area to not
cross a page boundary.

[1] https://lists.xenproject.org/archives/html/xen-devel/2019-02/msg00416.html

Signed-off-by: Andrii Anisov <andrii_anisov@xxxxxxxx>
---
 xen/arch/arm/domain.c        |  58 ++++++++++++++++++---
 xen/arch/x86/domain.c        |  99 ++++++++++++++++++++++++++++++++---
 xen/arch/x86/x86_64/domain.c |  16 +++++-
 xen/common/domain.c          | 121 +++++++++++++++++++++++++++++++++++++++----
 xen/include/public/vcpu.h    |  15 ++++++
 xen/include/xen/sched.h      |  28 +++++++---
 6 files changed, 306 insertions(+), 31 deletions(-)

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index ff330b3..ecedf1c 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -274,17 +274,15 @@ static void ctxt_switch_to(struct vcpu *n)
     virt_timer_restore(n);
 }
 
-/* Update per-VCPU guest runstate shared memory area (if registered). */
-static void update_runstate_area(struct vcpu *v)
+static void update_runstate_by_gvaddr(struct vcpu *v)
 {
     void __user *guest_handle = NULL;
 
-    if ( guest_handle_is_null(runstate_guest(v)) )
-        return;
+    ASSERT(!guest_handle_is_null(runstate_guest_virt(v)));
 
     if ( VM_ASSIST(v->domain, runstate_update_flag) )
     {
-        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
+        guest_handle = &v->runstate_guest.virt.p->state_entry_time + 1;
         guest_handle--;
         v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
         __raw_copy_to_guest(guest_handle,
@@ -292,7 +290,7 @@ static void update_runstate_area(struct vcpu *v)
         smp_wmb();
     }
 
-    __copy_to_guest(runstate_guest(v), &v->runstate, 1);
+    __copy_to_guest(runstate_guest_virt(v), &v->runstate, 1);
 
     if ( guest_handle )
     {
@@ -303,6 +301,53 @@ static void update_runstate_area(struct vcpu *v)
     }
 }
 
+static void update_runstate_by_gpaddr(struct vcpu *v)
+{
+    struct vcpu_runstate_info *runstate =
+            (struct vcpu_runstate_info *)v->runstate_guest.phys;
+
+    ASSERT(runstate != NULL);
+
+    if ( VM_ASSIST(v->domain, runstate_update_flag) )
+    {
+        runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
+        smp_wmb();
+        v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
+    }
+
+    memcpy(v->runstate_guest.phys, &v->runstate, sizeof(v->runstate));
+
+    if ( VM_ASSIST(v->domain, runstate_update_flag) )
+    {
+        runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+        smp_wmb();
+        v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+    }
+}
+
+/* Update per-VCPU guest runstate shared memory area (if registered). */
+static void update_runstate_area(struct vcpu *v)
+{
+    if ( xchg(&v->runstate_in_use, 1) )
+        return;
+
+    switch ( v->runstate_guest_type )
+    {
+    case RUNSTATE_NONE:
+        break;
+
+    case RUNSTATE_VADDR:
+        update_runstate_by_gvaddr(v);
+        break;
+
+    case RUNSTATE_PADDR:
+        update_runstate_by_gpaddr(v);
+        break;
+    }
+
+    xchg(&v->runstate_in_use, 0);
+}
+
 static void schedule_tail(struct vcpu *prev)
 {
     ASSERT(prev != current);
@@ -998,6 +1043,7 @@ long do_arm_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) a
     {
         case VCPUOP_register_vcpu_info:
         case VCPUOP_register_runstate_memory_area:
+        case VCPUOP_register_runstate_phys_memory_area:
             return do_vcpu_op(cmd, vcpuid, arg);
         default:
             return -EINVAL;
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index ac960dd..fe71776 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1566,22 +1566,21 @@ void paravirt_ctxt_switch_to(struct vcpu *v)
 }
 
 /* Update per-VCPU guest runstate shared memory area (if registered). */
-bool update_runstate_area(struct vcpu *v)
+static bool update_runstate_by_gvaddr(struct vcpu *v)
 {
     bool rc;
     struct guest_memory_policy policy = { .nested_guest_mode = false };
     void __user *guest_handle = NULL;
 
-    if ( guest_handle_is_null(runstate_guest(v)) )
-        return true;
+    ASSERT(!guest_handle_is_null(runstate_guest_virt(v)));
 
     update_guest_memory_policy(v, &policy);
 
     if ( VM_ASSIST(v->domain, runstate_update_flag) )
     {
         guest_handle = has_32bit_shinfo(v->domain)
-            ? &v->runstate_guest.compat.p->state_entry_time + 1
-            : &v->runstate_guest.native.p->state_entry_time + 1;
+            ? &v->runstate_guest.virt.compat.p->state_entry_time + 1
+            : &v->runstate_guest.virt.native.p->state_entry_time + 1;
         guest_handle--;
         v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
         __raw_copy_to_guest(guest_handle,
@@ -1594,11 +1593,11 @@ bool update_runstate_area(struct vcpu *v)
         struct compat_vcpu_runstate_info info;
 
         XLAT_vcpu_runstate_info(&info, &v->runstate);
-        __copy_to_guest(v->runstate_guest.compat, &info, 1);
+        __copy_to_guest(v->runstate_guest.virt.compat, &info, 1);
         rc = true;
     }
     else
-        rc = __copy_to_guest(runstate_guest(v), &v->runstate, 1) !=
+        rc = __copy_to_guest(runstate_guest_virt(v), &v->runstate, 1) !=
              sizeof(v->runstate);
 
     if ( guest_handle )
@@ -1614,6 +1613,92 @@ bool update_runstate_area(struct vcpu *v)
     return rc;
 }
 
+static bool update_runstate_by_gpaddr_native(struct vcpu *v)
+{
+    struct vcpu_runstate_info *runstate =
+            (struct vcpu_runstate_info *)v->runstate_guest.phys;
+
+    ASSERT(runstate != NULL);
+
+    if ( VM_ASSIST(v->domain, runstate_update_flag) )
+    {
+        runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
+        smp_wmb();
+        v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
+    }
+
+    memcpy(v->runstate_guest.phys, &v->runstate, sizeof(v->runstate));
+
+    if ( VM_ASSIST(v->domain, runstate_update_flag) )
+    {
+        runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+        smp_wmb();
+        v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+    }
+
+    return true;
+}
+
+static bool update_runstate_by_gpaddr_compat(struct vcpu *v)
+{
+    struct compat_vcpu_runstate_info *runstate =
+            (struct compat_vcpu_runstate_info *)v->runstate_guest.phys;
+
+    ASSERT(runstate != NULL);
+
+    if ( VM_ASSIST(v->domain, runstate_update_flag) )
+    {
+        runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
+        smp_wmb();
+        v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
+    }
+
+    {
+        struct compat_vcpu_runstate_info info;
+
+        XLAT_vcpu_runstate_info(&info, &v->runstate);
+        memcpy(v->runstate_guest.phys, &info, sizeof(info));
+    }
+
+    if ( VM_ASSIST(v->domain, runstate_update_flag) )
+    {
+        runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+        smp_wmb();
+        v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+    }
+
+    return true;
+}
+
+bool update_runstate_area(struct vcpu *v)
+{
+    bool rc = true;
+
+    if ( xchg(&v->runstate_in_use, 1) )
+        return rc;
+
+    switch ( v->runstate_guest_type )
+    {
+    case RUNSTATE_NONE:
+        break;
+
+    case RUNSTATE_VADDR:
+        rc = update_runstate_by_gvaddr(v);
+        break;
+
+    case RUNSTATE_PADDR:
+        if ( has_32bit_shinfo(v->domain) )
+            rc = update_runstate_by_gpaddr_compat(v);
+        else
+            rc = update_runstate_by_gpaddr_native(v);
+        break;
+    }
+
+    xchg(&v->runstate_in_use, 0);
+    return rc;
+}
+
 static void _update_runstate_area(struct vcpu *v)
 {
     if ( !update_runstate_area(v) && is_pv_vcpu(v) &&
diff --git a/xen/arch/x86/x86_64/domain.c b/xen/arch/x86/x86_64/domain.c
index c46dccc..85d0072 100644
--- a/xen/arch/x86/x86_64/domain.c
+++ b/xen/arch/x86/x86_64/domain.c
@@ -12,6 +12,8 @@
 CHECK_vcpu_get_physid;
 #undef xen_vcpu_get_physid
 
+extern void discard_runstate_area(struct vcpu *v);
+
 int
 arch_compat_vcpu_op(
     int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg)
@@ -35,8 +37,16 @@ arch_compat_vcpu_op(
              !compat_handle_okay(area.addr.h, 1) )
             break;
 
+        while ( xchg(&v->runstate_in_use, 1) );
+
+        discard_runstate_area(v);
+
         rc = 0;
-        guest_from_compat_handle(v->runstate_guest.compat, area.addr.h);
+
+        guest_from_compat_handle(v->runstate_guest.virt.compat,
+                                 area.addr.h);
+
+        v->runstate_guest_type = RUNSTATE_VADDR;
 
         if ( v == current )
         {
@@ -49,7 +59,9 @@ arch_compat_vcpu_op(
             vcpu_runstate_get(v, &runstate);
             XLAT_vcpu_runstate_info(&info, &runstate);
         }
-        __copy_to_guest(v->runstate_guest.compat, &info, 1);
+        __copy_to_guest(v->runstate_guest.virt.compat, &info, 1);
+
+        xchg(&v->runstate_in_use, 0);
 
         break;
     }
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 90c6607..d276b87 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -698,6 +698,74 @@ int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d)
     return 0;
 }
 
+static void unmap_runstate_area(struct vcpu *v)
+{
+    mfn_t mfn;
+
+    if ( !v->runstate_guest.phys )
+        return;
+
+    mfn = domain_page_map_to_mfn(v->runstate_guest.phys);
+
+    unmap_domain_page_global((void *)
+                             ((unsigned long)v->runstate_guest.phys &
+                              PAGE_MASK));
+
+    v->runstate_guest.phys = NULL;
+    put_page_and_type(mfn_to_page(mfn));
+}
+
+static int map_runstate_area(struct vcpu *v,
+                      struct vcpu_register_runstate_memory_area *area)
+{
+    unsigned long offset = area->addr.p & ~PAGE_MASK;
+    gfn_t gfn = gaddr_to_gfn(area->addr.p);
+    struct domain *d = v->domain;
+    void *mapping;
+    struct page_info *page;
+    size_t size = sizeof(struct vcpu_runstate_info);
+
+    if ( offset > (PAGE_SIZE - size) )
+        return -EINVAL;
+
+    page = get_page_from_gfn(d, gfn_x(gfn), NULL, P2M_ALLOC);
+    if ( !page )
+        return -EINVAL;
+
+    if ( !get_page_type(page, PGT_writable_page) )
+    {
+        put_page(page);
+        return -EINVAL;
+    }
+
+    mapping = __map_domain_page_global(page);
+
+    if ( mapping == NULL )
+    {
+        put_page_and_type(page);
+        return -ENOMEM;
+    }
+
+    v->runstate_guest.phys = mapping + offset;
+
+    return 0;
+}
+
+void discard_runstate_area(struct vcpu *v)
+{
+    if ( v->runstate_guest_type == RUNSTATE_PADDR )
+        unmap_runstate_area(v);
+
+    v->runstate_guest_type = RUNSTATE_NONE;
+}
+
+static void discard_runstate_area_locked(struct vcpu *v)
+{
+    while ( xchg(&v->runstate_in_use, 1) );
+    discard_runstate_area(v);
+    xchg(&v->runstate_in_use, 0);
+}
+
 int domain_kill(struct domain *d)
 {
     int rc = 0;
@@ -734,7 +802,10 @@ int domain_kill(struct domain *d)
         if ( cpupool_move_domain(d, cpupool0) )
             return -ERESTART;
         for_each_vcpu ( d, v )
+        {
+            discard_runstate_area_locked(v);
             unmap_vcpu_info(v);
+        }
         d->is_dying = DOMDYING_dead;
         /* Mem event cleanup has to go here because the rings 
          * have to be put before we call put_domain. */
@@ -1188,7 +1259,7 @@ int domain_soft_reset(struct domain *d)
 
     for_each_vcpu ( d, v )
     {
-        set_xen_guest_handle(runstate_guest(v), NULL);
+        discard_runstate_area_locked(v);
         unmap_vcpu_info(v);
     }
 
@@ -1518,18 +1589,50 @@ long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
             break;
 
         rc = 0;
-        runstate_guest(v) = area.addr.h;
 
-        if ( v == current )
-        {
-            __copy_to_guest(runstate_guest(v), &v->runstate, 1);
-        }
-        else
+        while ( xchg(&v->runstate_in_use, 1) );
+
+        discard_runstate_area(v);
+
+        if ( !guest_handle_is_null(area.addr.h) )
         {
-            vcpu_runstate_get(v, &runstate);
-            __copy_to_guest(runstate_guest(v), &runstate, 1);
+            runstate_guest_virt(v) = area.addr.h;
+            v->runstate_guest_type = RUNSTATE_VADDR;
+
+            if ( v == current )
+            {
+                __copy_to_guest(runstate_guest_virt(v), &v->runstate, 1);
+            }
+            else
+            {
+                vcpu_runstate_get(v, &runstate);
+                __copy_to_guest(runstate_guest_virt(v), &runstate, 1);
+            }
         }
 
+        xchg(&v->runstate_in_use, 0);
+
+        break;
+    }
+
+    case VCPUOP_register_runstate_phys_memory_area:
+    {
+        struct vcpu_register_runstate_memory_area area;
+
+        rc = -EFAULT;
+        if ( copy_from_guest(&area, arg, 1) )
+             break;
+
+        while ( xchg(&v->runstate_in_use, 1) );
+
+        discard_runstate_area(v);
+
+        rc = map_runstate_area(v, &area);
+        if ( !rc )
+            v->runstate_guest_type = RUNSTATE_PADDR;
+
+        xchg(&v->runstate_in_use, 0);
+
         break;
     }
 
diff --git a/xen/include/public/vcpu.h b/xen/include/public/vcpu.h
index 3623af9..d7da4a3 100644
--- a/xen/include/public/vcpu.h
+++ b/xen/include/public/vcpu.h
@@ -235,6 +235,21 @@ struct vcpu_register_time_memory_area {
 typedef struct vcpu_register_time_memory_area vcpu_register_time_memory_area_t;
 DEFINE_XEN_GUEST_HANDLE(vcpu_register_time_memory_area_t);
 
+/*
+ * Register a shared memory area from which the guest may obtain its own
+ * runstate information without needing to execute a hypercall.
+ * Notes:
+ *  1. The registered address must be guest's physical address.
+ *  2. The registered runstate area should not cross page boundary.
+ *  3. Only one shared area may be registered per VCPU. The shared area is
+ *     updated by the hypervisor each time the VCPU is scheduled. Thus
+ *     runstate.state will always be RUNSTATE_running and
+ *     runstate.state_entry_time will indicate the system time at which the
+ *     VCPU was last scheduled to run.
+ * @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
+ */
+#define VCPUOP_register_runstate_phys_memory_area 14
+
 #endif /* __XEN_PUBLIC_VCPU_H__ */
 
 /*
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 2201fac..6c8de8f 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -163,17 +163,31 @@ struct vcpu
     void            *sched_priv;    /* scheduler-specific data */
 
     struct vcpu_runstate_info runstate;
+
+    enum {
+        RUNSTATE_NONE = 0,
+        RUNSTATE_PADDR = 1,
+        RUNSTATE_VADDR = 2,
+    } runstate_guest_type;
+
+    unsigned long runstate_in_use;
+
+    union
+    {
 #ifndef CONFIG_COMPAT
-# define runstate_guest(v) ((v)->runstate_guest)
-    XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
+# define runstate_guest_virt(v) ((v)->runstate_guest.virt)
+           XEN_GUEST_HANDLE(vcpu_runstate_info_t) virt; /* guest address */
 #else
-# define runstate_guest(v) ((v)->runstate_guest.native)
-    union {
-        XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
-        XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
-    } runstate_guest; /* guest address */
+# define runstate_guest_virt(v) ((v)->runstate_guest.virt.native)
+           union {
+               XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
+               XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
+           } virt; /* guest address */
 #endif
 
+        void*   phys;
+    } runstate_guest;
+
     /* last time when vCPU is scheduled out */
     uint64_t last_run_time;
 
-- 
2.7.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.