[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[xen stable-4.15] x86/shadow: defer releasing of PV's top-level shadow reference



commit 3a9a2901cc8b24f28dbdc6fb63f57006c77a1f47
Author:     Jan Beulich <JBeulich@xxxxxxxx>
AuthorDate: Wed Sep 20 10:36:06 2023 +0100
Commit:     Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
CommitDate: Wed Sep 20 10:36:10 2023 +0100

    x86/shadow: defer releasing of PV's top-level shadow reference
    
    sh_set_toplevel_shadow() re-pinning the top-level shadow we may be
    running on is not enough (and at the same time unnecessary when the
    shadow isn't what we're running on): That shadow becomes eligible for
    blowing away (from e.g. shadow_prealloc()) immediately after the
    paging lock was dropped. Yet it needs to remain valid until the actual
    page table switch occurred.
    
    Propagate up the call chain the shadow entry that needs releasing
    eventually, and carry out the release immediately after switching page
    tables. Handle update_cr3() failures by switching to idle pagetables.
    Note that various further uses of update_cr3() are HVM-only or only act
    on paused vCPU-s, in which case sh_set_toplevel_shadow() will not defer
    releasing of the reference.
    
    While changing the update_cr3() hook, also convert the "do_locking"
    parameter to boolean.
    
    This is CVE-2023-34322 / XSA-438.
    
    Reported-by: Tim Deegan <tim@xxxxxxx>
    Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
    Reviewed-by: George Dunlap <george.dunlap@xxxxxxxxx>
    (cherry picked from commit fb0ff49fe9f784bfee0370c2a3c5f20e39d7a1cb)
---
 xen/arch/x86/mm.c                | 27 +++++++++++++++-----
 xen/arch/x86/mm/hap/hap.c        |  4 ++-
 xen/arch/x86/mm/shadow/common.c  | 55 ++++++++++++++++++++++++++--------------
 xen/arch/x86/mm/shadow/multi.c   | 38 +++++++++++++++++----------
 xen/arch/x86/mm/shadow/none.c    |  3 ++-
 xen/arch/x86/mm/shadow/private.h | 14 +++++-----
 xen/arch/x86/pv/domain.c         | 25 ++++++++++++++++--
 xen/include/asm-x86/mm.h         |  2 +-
 xen/include/asm-x86/paging.h     |  6 ++---
 xen/include/asm-x86/shadow.h     |  8 ++++++
 10 files changed, 127 insertions(+), 55 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index c88dc749d4..44ac8cae76 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -564,15 +564,12 @@ void write_ptbase(struct vcpu *v)
  *
  * Update ref counts to shadow tables appropriately.
  */
-void update_cr3(struct vcpu *v)
+pagetable_t update_cr3(struct vcpu *v)
 {
     mfn_t cr3_mfn;
 
     if ( paging_mode_enabled(v->domain) )
-    {
-        paging_update_cr3(v, false);
-        return;
-    }
+        return paging_update_cr3(v, false);
 
     if ( !(v->arch.flags & TF_kernel_mode) )
         cr3_mfn = pagetable_get_mfn(v->arch.guest_table_user);
@@ -580,6 +577,8 @@ void update_cr3(struct vcpu *v)
         cr3_mfn = pagetable_get_mfn(v->arch.guest_table);
 
     make_cr3(v, cr3_mfn);
+
+    return pagetable_null();
 }
 
 static inline void set_tlbflush_timestamp(struct page_info *page)
@@ -3241,6 +3240,7 @@ int new_guest_cr3(mfn_t mfn)
     struct domain *d = curr->domain;
     int rc;
     mfn_t old_base_mfn;
+    pagetable_t old_shadow;
 
     if ( is_pv_32bit_domain(d) )
     {
@@ -3308,9 +3308,22 @@ int new_guest_cr3(mfn_t mfn)
     if ( !VM_ASSIST(d, m2p_strict) )
         fill_ro_mpt(mfn);
     curr->arch.guest_table = pagetable_from_mfn(mfn);
-    update_cr3(curr);
+    old_shadow = update_cr3(curr);
+
+    /*
+     * In shadow mode update_cr3() can fail, in which case here we're still
+     * running on the prior top-level shadow (which we're about to release).
+     * Switch to the idle page tables in such an event; the guest will have
+     * been crashed already.
+     */
+    if ( likely(!mfn_eq(pagetable_get_mfn(old_shadow),
+                        maddr_to_mfn(curr->arch.cr3 & ~X86_CR3_NOFLUSH))) )
+        write_ptbase(curr);
+    else
+        write_ptbase(idle_vcpu[curr->processor]);
 
-    write_ptbase(curr);
+    if ( !pagetable_is_null(old_shadow) )
+        shadow_put_top_level(d, old_shadow);
 
     if ( likely(mfn_x(old_base_mfn) != 0) )
     {
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 1f9a157a0c..fa479d3d97 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -728,10 +728,12 @@ static bool_t hap_invlpg(struct vcpu *v, unsigned long 
linear)
     return 1;
 }
 
-static void hap_update_cr3(struct vcpu *v, int do_locking, bool noflush)
+static pagetable_t hap_update_cr3(struct vcpu *v, bool do_locking, bool 
noflush)
 {
     v->arch.hvm.hw_cr[3] = v->arch.hvm.guest_cr[3];
     hvm_update_guest_cr3(v, noflush);
+
+    return pagetable_null();
 }
 
 /*
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index e73931573b..04ceca4a52 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -2641,13 +2641,13 @@ void shadow_update_paging_modes(struct vcpu *v)
 }
 
 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
-void sh_set_toplevel_shadow(struct vcpu *v,
-                            unsigned int slot,
-                            mfn_t gmfn,
-                            unsigned int root_type,
-                            mfn_t (*make_shadow)(struct vcpu *v,
-                                                 mfn_t gmfn,
-                                                 uint32_t shadow_type))
+pagetable_t sh_set_toplevel_shadow(struct vcpu *v,
+                                   unsigned int slot,
+                                   mfn_t gmfn,
+                                   unsigned int root_type,
+                                   mfn_t (*make_shadow)(struct vcpu *v,
+                                                        mfn_t gmfn,
+                                                        uint32_t shadow_type))
 {
     mfn_t smfn;
     pagetable_t old_entry, new_entry;
@@ -2704,20 +2704,37 @@ void sh_set_toplevel_shadow(struct vcpu *v,
                   mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry)));
     v->arch.paging.shadow.shadow_table[slot] = new_entry;
 
-    /* Decrement the refcount of the old contents of this slot */
-    if ( !pagetable_is_null(old_entry) )
+    /*
+     * Decrement the refcount of the old contents of this slot, unless
+     * we're still running on that shadow - in that case it'll need holding
+     * on to until the actual page table switch did occur.
+     */
+    if ( !pagetable_is_null(old_entry) && (v != current || !is_pv_domain(d)) )
     {
-        mfn_t old_smfn = pagetable_get_mfn(old_entry);
-        /* Need to repin the old toplevel shadow if it's been unpinned
-         * by shadow_prealloc(): in PV mode we're still running on this
-         * shadow and it's not safe to free it yet. */
-        if ( !mfn_to_page(old_smfn)->u.sh.pinned && !sh_pin(d, old_smfn) )
-        {
-            printk(XENLOG_G_ERR "can't re-pin %"PRI_mfn"\n", mfn_x(old_smfn));
-            domain_crash(d);
-        }
-        sh_put_ref(d, old_smfn, 0);
+        sh_put_ref(d, pagetable_get_mfn(old_entry), 0);
+        old_entry = pagetable_null();
     }
+
+    /*
+     * 2- and 3-level shadow mode is used for HVM only. Therefore we never run
+     * on such a shadow, so only call sites requesting an L4 shadow need to pay
+     * attention to the returned value.
+     */
+    ASSERT(pagetable_is_null(old_entry) || root_type == SH_type_l4_64_shadow);
+
+    return old_entry;
+}
+
+/*
+ * Helper invoked when releasing of a top-level shadow's reference was
+ * deferred in sh_set_toplevel_shadow() above.
+ */
+void shadow_put_top_level(struct domain *d, pagetable_t old_entry)
+{
+    ASSERT(!pagetable_is_null(old_entry));
+    paging_lock(d);
+    sh_put_ref(d, pagetable_get_mfn(old_entry), 0);
+    paging_unlock(d);
 }
 
 /**************************************************************************/
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 6a9f82d39c..35d47d6fbb 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -3604,8 +3604,8 @@ sh_detach_old_tables(struct vcpu *v)
     }
 }
 
-static void
-sh_update_cr3(struct vcpu *v, int do_locking, bool noflush)
+static pagetable_t
+sh_update_cr3(struct vcpu *v, bool do_locking, bool noflush)
 /* Updates vcpu->arch.cr3 after the guest has changed CR3.
  * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
  * if appropriate).
@@ -3619,6 +3619,7 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool 
noflush)
 {
     struct domain *d = v->domain;
     mfn_t gmfn;
+    pagetable_t old_entry = pagetable_null();
 #if GUEST_PAGING_LEVELS == 3 && defined(CONFIG_HVM)
     const guest_l3e_t *gl3e;
     unsigned int i, guest_idx;
@@ -3628,7 +3629,7 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool 
noflush)
     if ( is_pv_domain(d) && !v->is_initialised )
     {
         ASSERT(v->arch.cr3 == 0);
-        return;
+        return old_entry;
     }
 
     if ( do_locking ) paging_lock(v->domain);
@@ -3701,11 +3702,12 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool 
noflush)
 #if GUEST_PAGING_LEVELS == 4
     if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 )
         guest_flush_tlb_mask(d, d->dirty_cpumask);
-    sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow);
+    old_entry = sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow,
+                                       sh_make_shadow);
     if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) )
     {
         ASSERT(d->is_dying || d->is_shutting_down);
-        return;
+        return old_entry;
     }
     if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) )
     {
@@ -3751,26 +3753,32 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool 
noflush)
                 gl2gfn = guest_l3e_get_gfn(gl3e[i]);
                 gl2mfn = get_gfn_query_unlocked(d, gfn_x(gl2gfn), &p2mt);
                 if ( p2m_is_ram(p2mt) )
-                    sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3)
-                                           ? SH_type_l2h_shadow
-                                           : SH_type_l2_shadow,
-                                           sh_make_shadow);
+                    old_entry = sh_set_toplevel_shadow(v, i, gl2mfn,
+                                                       (i == 3
+                                                        ? SH_type_l2h_shadow
+                                                        : SH_type_l2_shadow),
+                                                       sh_make_shadow);
                 else
-                    sh_set_toplevel_shadow(v, i, INVALID_MFN, 0,
-                                           sh_make_shadow);
+                    old_entry = sh_set_toplevel_shadow(v, i, INVALID_MFN, 0,
+                                                       sh_make_shadow);
             }
             else
-                sh_set_toplevel_shadow(v, i, INVALID_MFN, 0, sh_make_shadow);
+                old_entry = sh_set_toplevel_shadow(v, i, INVALID_MFN, 0,
+                                                   sh_make_shadow);
+
+            ASSERT(pagetable_is_null(old_entry));
         }
     }
 #elif GUEST_PAGING_LEVELS == 2
     if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 )
         guest_flush_tlb_mask(d, d->dirty_cpumask);
-    sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, sh_make_shadow);
+    old_entry = sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow,
+                                       sh_make_shadow);
+    ASSERT(pagetable_is_null(old_entry));
     if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) )
     {
         ASSERT(d->is_dying || d->is_shutting_down);
-        return;
+        return old_entry;
     }
 #else
 #error This should never happen
@@ -3864,6 +3872,8 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool 
noflush)
 
     /* Release the lock, if we took it (otherwise it's the caller's problem) */
     if ( do_locking ) paging_unlock(v->domain);
+
+    return old_entry;
 }
 
 
diff --git a/xen/arch/x86/mm/shadow/none.c b/xen/arch/x86/mm/shadow/none.c
index 691269a59e..9b9c03ce7e 100644
--- a/xen/arch/x86/mm/shadow/none.c
+++ b/xen/arch/x86/mm/shadow/none.c
@@ -50,9 +50,10 @@ static unsigned long _gva_to_gfn(struct vcpu *v, struct 
p2m_domain *p2m,
     return gfn_x(INVALID_GFN);
 }
 
-static void _update_cr3(struct vcpu *v, int do_locking, bool noflush)
+static pagetable_t _update_cr3(struct vcpu *v, bool do_locking, bool noflush)
 {
     ASSERT_UNREACHABLE();
+    return pagetable_null();
 }
 
 static void _update_paging_modes(struct vcpu *v)
diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h
index 1be84fc951..c9064b294b 100644
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -360,13 +360,13 @@ mfn_t shadow_alloc(struct domain *d,
 void  shadow_free(struct domain *d, mfn_t smfn);
 
 /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */
-void sh_set_toplevel_shadow(struct vcpu *v,
-                            unsigned int slot,
-                            mfn_t gmfn,
-                            unsigned int root_type,
-                            mfn_t (*make_shadow)(struct vcpu *v,
-                                                 mfn_t gmfn,
-                                                 uint32_t shadow_type));
+pagetable_t sh_set_toplevel_shadow(struct vcpu *v,
+                                   unsigned int slot,
+                                   mfn_t gmfn,
+                                   unsigned int root_type,
+                                   mfn_t (*make_shadow)(struct vcpu *v,
+                                                        mfn_t gmfn,
+                                                        uint32_t shadow_type));
 
 /* Update the shadows in response to a pagetable write from Xen */
 int sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size);
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index 39a00c6ec2..d994a8cf36 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -408,10 +408,13 @@ bool __init xpti_pcid_enabled(void)
 
 static void _toggle_guest_pt(struct vcpu *v)
 {
+    bool guest_update;
+    pagetable_t old_shadow;
     unsigned long cr3;
 
     v->arch.flags ^= TF_kernel_mode;
-    update_cr3(v);
+    guest_update = v->arch.flags & TF_kernel_mode;
+    old_shadow = update_cr3(v);
 
     /*
      * Don't flush user global mappings from the TLB. Don't tick TLB clock.
@@ -420,13 +423,31 @@ static void _toggle_guest_pt(struct vcpu *v)
      * TLB flush (for just the incoming PCID), as the top level page table may
      * have changed behind our backs. To be on the safe side, suppress the
      * no-flush unconditionally in this case.
+     *
+     * Furthermore in shadow mode update_cr3() can fail, in which case here
+     * we're still running on the prior top-level shadow (which we're about
+     * to release). Switch to the idle page tables in such an event; the
+     * guest will have been crashed already.
      */
     cr3 = v->arch.cr3;
     if ( shadow_mode_enabled(v->domain) )
+    {
         cr3 &= ~X86_CR3_NOFLUSH;
+
+        if ( unlikely(mfn_eq(pagetable_get_mfn(old_shadow),
+                             maddr_to_mfn(cr3))) )
+        {
+            cr3 = idle_vcpu[v->processor]->arch.cr3;
+            /* Also suppress runstate/time area updates below. */
+            guest_update = false;
+        }
+    }
     write_cr3(cr3);
 
-    if ( !(v->arch.flags & TF_kernel_mode) )
+    if ( !pagetable_is_null(old_shadow) )
+        shadow_put_top_level(v->domain, old_shadow);
+
+    if ( !guest_update )
         return;
 
     if ( v->arch.pv.need_update_runstate_area && update_runstate_area(v) )
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 71dd28f126..cffd0d6425 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -569,7 +569,7 @@ void audit_domains(void);
 #endif
 
 void make_cr3(struct vcpu *v, mfn_t mfn);
-void update_cr3(struct vcpu *v);
+pagetable_t update_cr3(struct vcpu *v);
 int vcpu_destroy_pagetables(struct vcpu *);
 void *do_page_walk(struct vcpu *v, unsigned long addr);
 
diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
index 5ec508a351..5bcdbf93a7 100644
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -136,7 +136,7 @@ struct paging_mode {
                                             unsigned long cr3,
                                             paddr_t ga, uint32_t *pfec,
                                             unsigned int *page_order);
-    void          (*update_cr3            )(struct vcpu *v, int do_locking,
+    pagetable_t   (*update_cr3            )(struct vcpu *v, bool do_locking,
                                             bool noflush);
     void          (*update_paging_modes   )(struct vcpu *v);
     bool          (*flush_tlb             )(bool (*flush_vcpu)(void *ctxt,
@@ -309,9 +309,9 @@ static inline unsigned long paging_ga_to_gfn_cr3(struct 
vcpu *v,
 /* Update all the things that are derived from the guest's CR3.
  * Called when the guest changes CR3; the caller can then use v->arch.cr3
  * as the value to load into the host CR3 to schedule this vcpu */
-static inline void paging_update_cr3(struct vcpu *v, bool noflush)
+static inline pagetable_t paging_update_cr3(struct vcpu *v, bool noflush)
 {
-    paging_get_hostmode(v)->update_cr3(v, 1, noflush);
+    return paging_get_hostmode(v)->update_cr3(v, 1, noflush);
 }
 
 /* Update all the things that are derived from the guest's CR0/CR3/CR4.
diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h
index e25f9604d8..302ae97fc6 100644
--- a/xen/include/asm-x86/shadow.h
+++ b/xen/include/asm-x86/shadow.h
@@ -97,6 +97,9 @@ void shadow_blow_tables_per_domain(struct domain *d);
 int shadow_set_allocation(struct domain *d, unsigned int pages,
                           bool *preempted);
 
+/* Helper to invoke for deferred releasing of a top-level shadow's reference. 
*/
+void shadow_put_top_level(struct domain *d, pagetable_t old);
+
 #else /* !CONFIG_SHADOW_PAGING */
 
 #define shadow_vcpu_teardown(v) ASSERT(is_pv_vcpu(v))
@@ -118,6 +121,11 @@ static inline void shadow_prepare_page_type_change(struct 
domain *d,
 
 static inline void shadow_blow_tables_per_domain(struct domain *d) {}
 
+static inline void shadow_put_top_level(struct domain *d, pagetable_t old)
+{
+    ASSERT_UNREACHABLE();
+}
+
 static inline int shadow_domctl(struct domain *d,
                                 struct xen_domctl_shadow_op *sc,
                                 XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
--
generated by git-patchbot for /home/xen/git/xen.git#stable-4.15



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.