
[xen master] gnttab: replace mapkind()



commit 9781b51efde251efcc0291ddb1d9c7cefe2b2555
Author:     Jan Beulich <jbeulich@xxxxxxxx>
AuthorDate: Wed Aug 25 14:18:39 2021 +0200
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Wed Aug 25 14:18:39 2021 +0200

    gnttab: replace mapkind()
    
    mapkind() doesn't scale very well with larger maptrack entry counts,
    using a brute force linear search through all entries, with the only
    option of an early loop exit if a matching writable entry was found.
    Introduce a radix tree alongside the main maptrack table, thus
    allowing much faster MFN-based lookup. To avoid the need to actually
    allocate space for the individual nodes, encode the two counters in the
    node pointers themselves, thus limiting the number of permitted
    simultaneous r/o and r/w mappings of the same MFN to 2³¹-1 (64-bit) /
    2¹⁵-1 (32-bit) each.
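
    For illustration (editor's sketch, not part of the patch: a
    stand-alone, little-endian variant of the union added further down),
    the two counters pack into a single word like so:

        #include <stdio.h>
        #include <limits.h>

        #define BITS_PER_LONG ((int)(sizeof(unsigned long) * CHAR_BIT))

        /* Two counters in one word; the top two bits stay unused so the
         * value can later be shifted into a tagged "pointer". */
        union maptrack_node {
            struct {
                unsigned long rd : BITS_PER_LONG / 2 - 1; /* r/o mappings */
                unsigned long wr : BITS_PER_LONG / 2 - 1; /* r/w mappings */
            } cnt;
            unsigned long raw;
        };

        int main(void)
        {
            union maptrack_node node = { .cnt = { .rd = 3, .wr = 1 } };

            /* On 64-bit this prints raw=0x80000003: wr sits above rd. */
            printf("rd=%lu wr=%lu raw=%#lx\n",
                   (unsigned long)node.cnt.rd,
                   (unsigned long)node.cnt.wr, node.raw);
            return 0;
        }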
    
    To avoid enforcing an unnecessarily low bound on the number of
    simultaneous mappings of a single MFN, introduce
    radix_tree_{ulong_to_ptr,ptr_to_ulong} paralleling
    radix_tree_{int_to_ptr,ptr_to_int}.
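
    A quick stand-alone round trip through these helpers (editor's
    sketch; the bodies match the hunk at the bottom of the patch, with
    Xen's ASSERT() swapped for assert()):

        #include <assert.h>

        static inline void *radix_tree_ulong_to_ptr(unsigned long val)
        {
            unsigned long ptr = (val << 2) | 0x2;
            assert((ptr >> 2) == val);  /* top two bits must be clear */
            return (void *)ptr;
        }

        static inline unsigned long radix_tree_ptr_to_ulong(void *ptr)
        {
            assert(((unsigned long)ptr & 0x3) == 0x2);  /* verify the tag */
            return (unsigned long)ptr >> 2;
        }

        int main(void)
        {
            /* The 0x2 tag marks the slot as an encoded scalar, so the
             * radix tree never tries to dereference it as a node. */
            void *p = radix_tree_ulong_to_ptr(0x12345UL);
            assert(radix_tree_ptr_to_ulong(p) == 0x12345UL);
            return 0;
        }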
    
    As a consequence the locking requirements change as well: With the
    remote domain's active entries no longer being inspected, there's no
    need anymore to hold the remote domain's grant table lock. And since
    we're no longer iterating over the local domain's maptrack table, the
    lock in map_grant_ref() can also be dropped before the new maptrack
    entry actually gets populated.
    
    As a nice side effect this also reduces the number of IOMMU
    operations in unmap_common(): Previously we would have "established"
    a readable mapping whenever no writable entry was left anymore (but,
    of course, at least one readable one remained). Yet we only need to
    do this if we actually dropped the last writable entry, not if there
    was none to begin with.
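
    To make the new unmap rule concrete, here is an editor's sketch of
    the decision (illustrative names, not the hypervisor's code; rd/wr
    are the counts after dropping one reference):

        #include <stdbool.h>
        #include <stdio.h>

        enum iommu_action { IOMMU_NONE, IOMMU_DOWNGRADE_RO, IOMMU_UNMAP };

        static enum iommu_action on_unmap(unsigned long rd,
                                          unsigned long wr,
                                          bool dropped_writable)
        {
            if ( !rd && !wr )
                return IOMMU_UNMAP;         /* last mapping of the MFN gone */
            if ( dropped_writable && !wr )
                return IOMMU_DOWNGRADE_RO;  /* last r/w mapping gone */
            /* Previously a downgrade could happen here even when no r/w
             * mapping had existed in the first place. */
            return IOMMU_NONE;
        }

        int main(void)
        {
            printf("%d\n", on_unmap(1, 0, false)); /* 0: nothing to do */
            printf("%d\n", on_unmap(2, 0, true));  /* 1: downgrade to r/o */
            return 0;
        }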
    
    This is part of CVE-2021-28698 / XSA-380.
    
    Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
    Reviewed-by: Julien Grall <jgrall@xxxxxxxxxx>
---
 xen/common/grant_table.c     | 201 ++++++++++++++++++++++++-------------------
 xen/include/xen/radix-tree.h |  19 ++++
 2 files changed, 131 insertions(+), 89 deletions(-)

diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index 17cce0154a..76a78df405 100644
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -37,6 +37,7 @@
 #include <xen/iommu.h>
 #include <xen/paging.h>
 #include <xen/keyhandler.h>
+#include <xen/radix-tree.h>
 #include <xen/vmap.h>
 #include <xen/nospec.h>
 #include <xsm/xsm.h>
@@ -82,8 +83,13 @@ struct grant_table {
     grant_status_t       **status;
     /* Active grant table. */
     struct active_grant_entry **active;
-    /* Mapping tracking table per vcpu. */
+    /* Handle-indexed tracking table of mappings. */
     struct grant_mapping **maptrack;
+    /*
+     * MFN-indexed tracking tree of mappings, if needed.  Note that this is
+     * protected by @lock, not @maptrack_lock.
+     */
+    struct radix_tree_root maptrack_tree;
 
     /* Domain to which this struct grant_table belongs. */
     const struct domain *domain;
@@ -516,34 +522,6 @@ static int get_paged_frame(unsigned long gfn, mfn_t *mfn,
     return GNTST_okay;
 }
 
-static inline void
-double_gt_lock(struct grant_table *lgt, struct grant_table *rgt)
-{
-    /*
-     * See mapkind() for why the write lock is also required for the
-     * remote domain.
-     */
-    if ( lgt < rgt )
-    {
-        grant_write_lock(lgt);
-        grant_write_lock(rgt);
-    }
-    else
-    {
-        if ( lgt != rgt )
-            grant_write_lock(rgt);
-        grant_write_lock(lgt);
-    }
-}
-
-static inline void
-double_gt_unlock(struct grant_table *lgt, struct grant_table *rgt)
-{
-    grant_write_unlock(lgt);
-    if ( lgt != rgt )
-        grant_write_unlock(rgt);
-}
-
 #define INVALID_MAPTRACK_HANDLE UINT_MAX
 
 static inline grant_handle_t
@@ -970,41 +948,17 @@ static struct active_grant_entry *grant_map_exists(const struct domain *ld,
     return ERR_PTR(-EINVAL);
 }
 
-#define MAPKIND_READ 1
-#define MAPKIND_WRITE 2
-static unsigned int mapkind(
-    struct grant_table *lgt, const struct domain *rd, mfn_t mfn)
-{
-    struct grant_mapping *map;
-    grant_handle_t handle, limit = lgt->maptrack_limit;
-    unsigned int kind = 0;
-
-    /*
-     * Must have the local domain's grant table write lock when
-     * iterating over its maptrack entries.
-     */
-    ASSERT(percpu_rw_is_write_locked(&lgt->lock));
-    /*
-     * Must have the remote domain's grant table write lock while
-     * counting its active entries.
-     */
-    ASSERT(percpu_rw_is_write_locked(&rd->grant_table->lock));
-
-    smp_rmb();
-
-    for ( handle = 0; !(kind & MAPKIND_WRITE) && handle < limit; handle++ )
-    {
-        map = &maptrack_entry(lgt, handle);
-        if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) ||
-             map->domid != rd->domain_id )
-            continue;
-        if ( mfn_eq(_active_entry(rd->grant_table, map->ref).mfn, mfn) )
-            kind |= map->flags & GNTMAP_readonly ?
-                    MAPKIND_READ : MAPKIND_WRITE;
-    }
-
-    return kind;
-}
+union maptrack_node {
+    struct {
+        /* Radix tree slot pointers use two of the bits. */
+#ifdef __BIG_ENDIAN_BITFIELD
+        unsigned long    : 2;
+#endif
+        unsigned long rd : BITS_PER_LONG / 2 - 1;
+        unsigned long wr : BITS_PER_LONG / 2 - 1;
+    } cnt;
+    unsigned long raw;
+};
 
 static void
 map_grant_ref(
@@ -1023,7 +977,6 @@ map_grant_ref(
     struct grant_mapping *mt;
     grant_entry_header_t *shah;
     uint16_t *status;
-    bool_t need_iommu;
 
     ld = current->domain;
 
@@ -1244,31 +1197,75 @@ map_grant_ref(
      * as mem-sharing and IOMMU use are incompatible). The dom_io case would
      * need checking separately if we compared against owner here.
      */
-    need_iommu = ld != rd && gnttab_need_iommu_mapping(ld);
-    if ( need_iommu )
-    {
+    if ( ld != rd && gnttab_need_iommu_mapping(ld) )
+    {
+        union maptrack_node node = {
+            .cnt.rd = !!(op->flags & GNTMAP_readonly),
+            .cnt.wr = !(op->flags & GNTMAP_readonly),
+        };
+        int err;
+        void **slot = NULL;
         unsigned int kind;
 
-        double_gt_lock(lgt, rgt);
+        grant_write_lock(lgt);
+
+        err = radix_tree_insert(&lgt->maptrack_tree, mfn_x(mfn),
+                                radix_tree_ulong_to_ptr(node.raw));
+        if ( err == -EEXIST )
+        {
+            slot = radix_tree_lookup_slot(&lgt->maptrack_tree, mfn_x(mfn));
+            if ( likely(slot) )
+            {
+                node.raw = radix_tree_ptr_to_ulong(*slot);
+                err = -EBUSY;
+
+                /* Update node only when refcount doesn't overflow. */
+                if ( op->flags & GNTMAP_readonly ? ++node.cnt.rd
+                                                 : ++node.cnt.wr )
+                {
+                    radix_tree_replace_slot(slot,
+                                            radix_tree_ulong_to_ptr(node.raw));
+                    err = 0;
+                }
+            }
+            else
+                ASSERT_UNREACHABLE();
+        }
 
         /*
          * We're not translated, so we know that dfns and mfns are
          * the same things, so the IOMMU entry is always 1-to-1.
          */
-        kind = mapkind(lgt, rd, mfn);
-        if ( !(op->flags & GNTMAP_readonly) &&
-             !(kind & MAPKIND_WRITE) )
+        if ( !(op->flags & GNTMAP_readonly) && node.cnt.wr == 1 )
             kind = IOMMUF_readable | IOMMUF_writable;
-        else if ( !kind )
+        else if ( (op->flags & GNTMAP_readonly) &&
+                  node.cnt.rd == 1 && !node.cnt.wr )
             kind = IOMMUF_readable;
         else
             kind = 0;
-        if ( kind && iommu_legacy_map(ld, _dfn(mfn_x(mfn)), mfn, 1, kind) )
+        if ( err ||
+             (kind && iommu_legacy_map(ld, _dfn(mfn_x(mfn)), mfn, 1, kind)) )
         {
-            double_gt_unlock(lgt, rgt);
+            if ( !err )
+            {
+                if ( slot )
+                {
+                    op->flags & GNTMAP_readonly ? node.cnt.rd--
+                                                : node.cnt.wr--;
+                    radix_tree_replace_slot(slot,
+                                            radix_tree_ulong_to_ptr(node.raw));
+                }
+                else
+                    radix_tree_delete(&lgt->maptrack_tree, mfn_x(mfn));
+            }
+
             rc = GNTST_general_error;
-            goto undo_out;
         }
+
+        grant_write_unlock(lgt);
+
+        if ( rc != GNTST_okay )
+            goto undo_out;
     }
 
     TRACE_1D(TRC_MEM_PAGE_GRANT_MAP, op->dom);
@@ -1276,10 +1273,6 @@ map_grant_ref(
     /*
      * All maptrack entry users check mt->flags first before using the
      * other fields so just ensure the flags field is stored last.
-     *
-     * However, if gnttab_need_iommu_mapping() then this would race
-     * with a concurrent mapkind() call (on an unmap, for example)
-     * and a lock is required.
      */
     mt = &maptrack_entry(lgt, handle);
     mt->domid = op->dom;
@@ -1287,9 +1280,6 @@ map_grant_ref(
     smp_wmb();
     write_atomic(&mt->flags, op->flags);
 
-    if ( need_iommu )
-        double_gt_unlock(lgt, rgt);
-
     op->dev_bus_addr = mfn_to_maddr(mfn);
     op->handle       = handle;
     op->status       = GNTST_okay;
@@ -1497,19 +1487,34 @@ unmap_common(
     /* See the respective comment in map_grant_ref(). */
     if ( rc == GNTST_okay && ld != rd && gnttab_need_iommu_mapping(ld) )
     {
-        unsigned int kind;
+        void **slot;
+        union maptrack_node node;
         int err = 0;
 
-        double_gt_lock(lgt, rgt);
+        grant_write_lock(lgt);
+        slot = radix_tree_lookup_slot(&lgt->maptrack_tree, mfn_x(op->mfn));
+        node.raw = likely(slot) ? radix_tree_ptr_to_ulong(*slot) : 0;
+
+        /* Refcount must not underflow. */
+        if ( !(flags & GNTMAP_readonly ? node.cnt.rd--
+                                       : node.cnt.wr--) )
+            BUG();
 
-        kind = mapkind(lgt, rd, op->mfn);
-        if ( !kind )
+        if ( !node.raw )
             err = iommu_legacy_unmap(ld, _dfn(mfn_x(op->mfn)), 1);
-        else if ( !(kind & MAPKIND_WRITE) )
+        else if ( !(flags & GNTMAP_readonly) && !node.cnt.wr )
             err = iommu_legacy_map(ld, _dfn(mfn_x(op->mfn)), op->mfn, 1,
                                    IOMMUF_readable);
 
-        double_gt_unlock(lgt, rgt);
+        if ( err )
+            ;
+        else if ( !node.raw )
+            radix_tree_delete(&lgt->maptrack_tree, mfn_x(op->mfn));
+        else
+            radix_tree_replace_slot(slot,
+                                    radix_tree_ulong_to_ptr(node.raw));
+
+        grant_write_unlock(lgt);
 
         if ( err )
             rc = GNTST_general_error;
@@ -1956,6 +1961,8 @@ int grant_table_init(struct domain *d, int max_grant_frames,
         gt->maptrack = vzalloc(gt->max_maptrack_frames * sizeof(*gt->maptrack));
         if ( gt->maptrack == NULL )
             goto out;
+
+        radix_tree_init(&gt->maptrack_tree);
     }
 
     /* Shared grant table. */
@@ -3704,6 +3711,8 @@ int gnttab_release_mappings(struct domain *d)
 
     for ( handle = gt->maptrack_limit; handle; )
     {
+        mfn_t mfn;
+
         /*
          * Deal with full pages such that their freeing (in the body of the
          * if()) remains simple.
@@ -3801,17 +3810,31 @@ int gnttab_release_mappings(struct domain *d)
 
         reduce_status_for_pin(rd, act, status, map->flags & GNTMAP_readonly);
 
+        mfn = act->mfn;
+
         active_entry_release(act);
         grant_read_unlock(rgt);
 
         rcu_unlock_domain(rd);
 
         map->flags = 0;
+
+        /*
+         * This is excessive in that a single such call would suffice per
+         * mapped MFN (or none at all, if no entry was ever inserted). But it
+         * should be the common case for an MFN to be mapped just once, and
+         * this way we don't need to further maintain the counters. We also
+         * don't want to leave cleaning up of the tree as a whole to the end
+         * of the function, as this could take quite some time.
+         */
+        radix_tree_delete(&gt->maptrack_tree, mfn_x(mfn));
     }
 
     gt->maptrack_limit = 0;
     FREE_XENHEAP_PAGE(gt->maptrack[0]);
 
+    radix_tree_destroy(&gt->maptrack_tree, NULL);
+
     return 0;
 }
 
diff --git a/xen/include/xen/radix-tree.h b/xen/include/xen/radix-tree.h
index ec40cf1d9e..58c40312e6 100644
--- a/xen/include/xen/radix-tree.h
+++ b/xen/include/xen/radix-tree.h
@@ -190,6 +190,25 @@ static inline int radix_tree_ptr_to_int(void *ptr)
     return (int)((long)ptr >> 2);
 }
 
+/**
+ * radix_tree_{ulong_to_ptr,ptr_to_ulong}:
+ *
+ * Same for unsigned long values. Beware though that only BITS_PER_LONG-2
+ * bits are actually usable for the value.
+ */
+static inline void *radix_tree_ulong_to_ptr(unsigned long val)
+{
+    unsigned long ptr = (val << 2) | 0x2;
+    ASSERT((ptr >> 2) == val);
+    return (void *)ptr;
+}
+
+static inline unsigned long radix_tree_ptr_to_ulong(void *ptr)
+{
+    ASSERT(((unsigned long)ptr & 0x3) == 0x2);
+    return (unsigned long)ptr >> 2;
+}
+
 int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
--
generated by git-patchbot for /home/xen/git/xen.git#master