[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [xen stable-4.15] gnttab: replace mapkind()
commit 2f6ebcec023be0b0873a5e67ce3a0ab301dbac4f Author: Jan Beulich <jbeulich@xxxxxxxx> AuthorDate: Wed Aug 25 14:44:48 2021 +0200 Commit: Jan Beulich <jbeulich@xxxxxxxx> CommitDate: Wed Aug 25 14:44:48 2021 +0200 gnttab: replace mapkind() mapkind() doesn't scale very well with larger maptrack entry counts, using a brute force linear search through all entries, with the only option of an early loop exit if a matching writable entry was found. Introduce a radix tree alongside the main maptrack table, thus allowing much faster MFN-based lookup. To avoid the need to actually allocate space for the individual nodes, encode the two counters in the node pointers themselves, thus limiting the number of permitted simultaneous r/o and r/w mappings of the same MFN to 2³¹-1 (64-bit) / 2¹⁵-1 (32-bit) each. To avoid enforcing an unnecessarily low bound on the number of simultaneous mappings of a single MFN, introduce radix_tree_{ulong_to_ptr,ptr_to_ulong} paralleling radix_tree_{int_to_ptr,ptr_to_int}. As a consequence locking changes are also applicable: With there no longer being any inspection of the remote domain's active entries, there's also no need anymore to hold the remote domain's grant table lock. And since we're no longer iterating over the local domain's map track table, the lock in map_grant_ref() can also be dropped before the new maptrack entry actually gets populated. As a nice side effect this also reduces the number of IOMMU operations in unmap_common(): Previously we would have "established" a readable mapping whenever we didn't find a writable entry anymore (yet, of course, at least one readable one). But we only need to do this if we actually dropped the last writable entry, not if there were none already before. This is part of CVE-2021-28698 / XSA-380. 
Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> Reviewed-by: Julien Grall <jgrall@xxxxxxxxxx> master commit: 9781b51efde251efcc0291ddb1d9c7cefe2b2555 master date: 2021-08-25 14:18:39 +0200 --- xen/common/grant_table.c | 201 ++++++++++++++++++++++++------------------- xen/include/xen/radix-tree.h | 19 ++++ 2 files changed, 131 insertions(+), 89 deletions(-) diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c index e43ff7df4f..7422872dec 100644 --- a/xen/common/grant_table.c +++ b/xen/common/grant_table.c @@ -37,6 +37,7 @@ #include <xen/iommu.h> #include <xen/paging.h> #include <xen/keyhandler.h> +#include <xen/radix-tree.h> #include <xen/vmap.h> #include <xen/nospec.h> #include <xsm/xsm.h> @@ -82,8 +83,13 @@ struct grant_table { grant_status_t **status; /* Active grant table. */ struct active_grant_entry **active; - /* Mapping tracking table per vcpu. */ + /* Handle-indexed tracking table of mappings. */ struct grant_mapping **maptrack; + /* + * MFN-indexed tracking tree of mappings, if needed. Note that this is + * protected by @lock, not @maptrack_lock. + */ + struct radix_tree_root maptrack_tree; /* Domain to which this struct grant_table belongs. */ const struct domain *domain; @@ -516,34 +522,6 @@ static int get_paged_frame(unsigned long gfn, mfn_t *mfn, return GNTST_okay; } -static inline void -double_gt_lock(struct grant_table *lgt, struct grant_table *rgt) -{ - /* - * See mapkind() for why the write lock is also required for the - * remote domain. 
- */ - if ( lgt < rgt ) - { - grant_write_lock(lgt); - grant_write_lock(rgt); - } - else - { - if ( lgt != rgt ) - grant_write_lock(rgt); - grant_write_lock(lgt); - } -} - -static inline void -double_gt_unlock(struct grant_table *lgt, struct grant_table *rgt) -{ - grant_write_unlock(lgt); - if ( lgt != rgt ) - grant_write_unlock(rgt); -} - #define INVALID_MAPTRACK_HANDLE UINT_MAX static inline grant_handle_t @@ -982,41 +960,17 @@ static struct active_grant_entry *grant_map_exists(const struct domain *ld, return ERR_PTR(-EINVAL); } -#define MAPKIND_READ 1 -#define MAPKIND_WRITE 2 -static unsigned int mapkind( - struct grant_table *lgt, const struct domain *rd, mfn_t mfn) -{ - struct grant_mapping *map; - grant_handle_t handle, limit = lgt->maptrack_limit; - unsigned int kind = 0; - - /* - * Must have the local domain's grant table write lock when - * iterating over its maptrack entries. - */ - ASSERT(percpu_rw_is_write_locked(&lgt->lock)); - /* - * Must have the remote domain's grant table write lock while - * counting its active entries. - */ - ASSERT(percpu_rw_is_write_locked(&rd->grant_table->lock)); - - smp_rmb(); - - for ( handle = 0; !(kind & MAPKIND_WRITE) && handle < limit; handle++ ) - { - map = &maptrack_entry(lgt, handle); - if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) || - map->domid != rd->domain_id ) - continue; - if ( mfn_eq(_active_entry(rd->grant_table, map->ref).mfn, mfn) ) - kind |= map->flags & GNTMAP_readonly ? - MAPKIND_READ : MAPKIND_WRITE; - } - - return kind; -} +union maptrack_node { + struct { + /* Radix tree slot pointers use two of the bits. 
*/ +#ifdef __BIG_ENDIAN_BITFIELD + unsigned long : 2; +#endif + unsigned long rd : BITS_PER_LONG / 2 - 1; + unsigned long wr : BITS_PER_LONG / 2 - 1; + } cnt; + unsigned long raw; +}; static void map_grant_ref( @@ -1035,7 +989,6 @@ map_grant_ref( struct grant_mapping *mt; grant_entry_header_t *shah; uint16_t *status; - bool_t need_iommu; ld = current->domain; @@ -1256,31 +1209,75 @@ map_grant_ref( * as mem-sharing and IOMMU use are incompatible). The dom_io case would * need checking separately if we compared against owner here. */ - need_iommu = ld != rd && gnttab_need_iommu_mapping(ld); - if ( need_iommu ) - { + if ( ld != rd && gnttab_need_iommu_mapping(ld) ) + { + union maptrack_node node = { + .cnt.rd = !!(op->flags & GNTMAP_readonly), + .cnt.wr = !(op->flags & GNTMAP_readonly), + }; + int err; + void **slot = NULL; unsigned int kind; - double_gt_lock(lgt, rgt); + grant_write_lock(lgt); + + err = radix_tree_insert(&lgt->maptrack_tree, mfn_x(mfn), + radix_tree_ulong_to_ptr(node.raw)); + if ( err == -EEXIST ) + { + slot = radix_tree_lookup_slot(&lgt->maptrack_tree, mfn_x(mfn)); + if ( likely(slot) ) + { + node.raw = radix_tree_ptr_to_ulong(*slot); + err = -EBUSY; + + /* Update node only when refcount doesn't overflow. */ + if ( op->flags & GNTMAP_readonly ? ++node.cnt.rd + : ++node.cnt.wr ) + { + radix_tree_replace_slot(slot, + radix_tree_ulong_to_ptr(node.raw)); + err = 0; + } + } + else + ASSERT_UNREACHABLE(); + } /* * We're not translated, so we know that dfns and mfns are * the same things, so the IOMMU entry is always 1-to-1. 
*/ - kind = mapkind(lgt, rd, mfn); - if ( !(op->flags & GNTMAP_readonly) && - !(kind & MAPKIND_WRITE) ) + if ( !(op->flags & GNTMAP_readonly) && node.cnt.wr == 1 ) kind = IOMMUF_readable | IOMMUF_writable; - else if ( !kind ) + else if ( (op->flags & GNTMAP_readonly) && + node.cnt.rd == 1 && !node.cnt.wr ) kind = IOMMUF_readable; else kind = 0; - if ( kind && iommu_legacy_map(ld, _dfn(mfn_x(mfn)), mfn, 1, kind) ) + if ( err || + (kind && iommu_legacy_map(ld, _dfn(mfn_x(mfn)), mfn, 1, kind)) ) { - double_gt_unlock(lgt, rgt); + if ( !err ) + { + if ( slot ) + { + op->flags & GNTMAP_readonly ? node.cnt.rd-- + : node.cnt.wr--; + radix_tree_replace_slot(slot, + radix_tree_ulong_to_ptr(node.raw)); + } + else + radix_tree_delete(&lgt->maptrack_tree, mfn_x(mfn)); + } + rc = GNTST_general_error; - goto undo_out; } + + grant_write_unlock(lgt); + + if ( rc != GNTST_okay ) + goto undo_out; } TRACE_1D(TRC_MEM_PAGE_GRANT_MAP, op->dom); @@ -1288,10 +1285,6 @@ map_grant_ref( /* * All maptrack entry users check mt->flags first before using the * other fields so just ensure the flags field is stored last. - * - * However, if gnttab_need_iommu_mapping() then this would race - * with a concurrent mapkind() call (on an unmap, for example) - * and a lock is required. */ mt = &maptrack_entry(lgt, handle); mt->domid = op->dom; @@ -1299,9 +1292,6 @@ map_grant_ref( smp_wmb(); write_atomic(&mt->flags, op->flags); - if ( need_iommu ) - double_gt_unlock(lgt, rgt); - op->dev_bus_addr = mfn_to_maddr(mfn); op->handle = handle; op->status = GNTST_okay; @@ -1509,19 +1499,34 @@ unmap_common( /* See the respective comment in map_grant_ref(). */ if ( rc == GNTST_okay && ld != rd && gnttab_need_iommu_mapping(ld) ) { - unsigned int kind; + void **slot; + union maptrack_node node; int err = 0; - double_gt_lock(lgt, rgt); + grant_write_lock(lgt); + slot = radix_tree_lookup_slot(&lgt->maptrack_tree, mfn_x(op->mfn)); + node.raw = likely(slot) ? 
radix_tree_ptr_to_ulong(*slot) : 0; + + /* Refcount must not underflow. */ + if ( !(flags & GNTMAP_readonly ? node.cnt.rd-- + : node.cnt.wr--) ) + BUG(); - kind = mapkind(lgt, rd, op->mfn); - if ( !kind ) + if ( !node.raw ) err = iommu_legacy_unmap(ld, _dfn(mfn_x(op->mfn)), 1); - else if ( !(kind & MAPKIND_WRITE) ) + else if ( !(flags & GNTMAP_readonly) && !node.cnt.wr ) err = iommu_legacy_map(ld, _dfn(mfn_x(op->mfn)), op->mfn, 1, IOMMUF_readable); - double_gt_unlock(lgt, rgt); + if ( err ) + ; + else if ( !node.raw ) + radix_tree_delete(&lgt->maptrack_tree, mfn_x(op->mfn)); + else + radix_tree_replace_slot(slot, + radix_tree_ulong_to_ptr(node.raw)); + + grant_write_unlock(lgt); if ( err ) rc = GNTST_general_error; @@ -1968,6 +1973,8 @@ int grant_table_init(struct domain *d, int max_grant_frames, gt->maptrack = vzalloc(gt->max_maptrack_frames * sizeof(*gt->maptrack)); if ( gt->maptrack == NULL ) goto out; + + radix_tree_init(&gt->maptrack_tree); } /* Shared grant table. */ @@ -3716,6 +3723,8 @@ int gnttab_release_mappings(struct domain *d) for ( handle = gt->maptrack_limit; handle; ) { + mfn_t mfn; + /* * Deal with full pages such that their freeing (in the body of the * if()) remains simple. @@ -3813,17 +3822,31 @@ int gnttab_release_mappings(struct domain *d) reduce_status_for_pin(rd, act, status, map->flags & GNTMAP_readonly); + mfn = act->mfn; + active_entry_release(act); grant_read_unlock(rgt); rcu_unlock_domain(rd); map->flags = 0; + + /* + * This is excessive in that a single such call would suffice per + * mapped MFN (or none at all, if no entry was ever inserted). But it + * should be the common case for an MFN to be mapped just once, and + * this way we don't need to further maintain the counters. We also + * don't want to leave cleaning up of the tree as a whole to the end + * of the function, as this could take quite some time. 
+ */ + radix_tree_delete(&gt->maptrack_tree, mfn_x(mfn)); } gt->maptrack_limit = 0; FREE_XENHEAP_PAGE(gt->maptrack[0]); + radix_tree_destroy(&gt->maptrack_tree, NULL); + return 0; } diff --git a/xen/include/xen/radix-tree.h b/xen/include/xen/radix-tree.h index ec40cf1d9e..58c40312e6 100644 --- a/xen/include/xen/radix-tree.h +++ b/xen/include/xen/radix-tree.h @@ -190,6 +190,25 @@ static inline int radix_tree_ptr_to_int(void *ptr) return (int)((long)ptr >> 2); } +/** + * radix_tree_{ulong_to_ptr,ptr_to_ulong}: + * + * Same for unsigned long values. Beware though that only BITS_PER_LONG-2 + * bits are actually usable for the value. + */ +static inline void *radix_tree_ulong_to_ptr(unsigned long val) +{ + unsigned long ptr = (val << 2) | 0x2; + ASSERT((ptr >> 2) == val); + return (void *)ptr; +} + +static inline unsigned long radix_tree_ptr_to_ulong(void *ptr) +{ + ASSERT(((unsigned long)ptr & 0x3) == 0x2); + return (unsigned long)ptr >> 2; +} + int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); -- generated by git-patchbot for /home/xen/git/xen.git#stable-4.15
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |