[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH v5 2/2] gnttab: refactor locking for scalability
From: Matt Wilson <msw@xxxxxxxxxx> This patch refactors grant table locking. It splits the previous single spinlock per grant table into multiple locks. The heavily modified components of the grant table (the maptrack state and the active entries) are now protected by their own spinlocks. The remaining elements of the grant table are read-mostly, so the main grant table lock is modified to be a rwlock to improve concurrency. Workloads with high grant table operation rates, especially map/unmap operations as used by blkback/blkfront when persistent grants are not supported, show marked improvement with these changes. A domU with 24 VBDs in a streaming 2M write workload achieved 1,400 MB/sec before this change. Performance more than doubles with this patch, reaching 3,000 MB/sec before tuning and 3,600 MB/sec after adjusting event channel vCPU bindings. Signed-off-by: Matt Wilson <msw@xxxxxxxxxx> [chegger: ported to xen-staging, split into multiple commits] v5: * Fixed git rebase merge issue introduced in v4 (acquiring locking twice) * Change for ()-loop in grant_map_exists * Coding style fixes v4: * Addressed ASSERT()'s from Jan Beulich * Addressed locking issues in unmap_common pointed out by Jan Beulich v3: * Addressed gnttab_swap_grant_ref() comment from Andrew Cooper Signed-off-by: Christoph Egger <chegger@xxxxxxxxx> Cc: Anthony Liguori <aliguori@xxxxxxxxxx> Cc: Jan Beulich <jbeulich@xxxxxxxx> Cc: Keir Fraser <keir@xxxxxxx> --- docs/misc/grant-tables.txt | 21 +++++ xen/common/grant_table.c | 215 +++++++++++++++++++++++++++----------------- 2 files changed, 155 insertions(+), 81 deletions(-) diff --git a/docs/misc/grant-tables.txt b/docs/misc/grant-tables.txt index c9ae8f2..1ada018 100644 --- a/docs/misc/grant-tables.txt +++ b/docs/misc/grant-tables.txt @@ -63,6 +63,7 @@ is complete. act->domid : remote domain being granted rights act->frame : machine frame being granted act->pin : used to hold reference counts + act->lock : spinlock used to serialize access to active entry state Map tracking ~~~~~~~~~~~~ @@ -87,6 +88,8 @@ is complete. version, partially initialized active table pages, etc. grant_table->maptrack_lock : spinlock used to protect the maptrack state + active_grant_entry->lock : spinlock used to serialize modifications to + active entries The primary lock for the grant table is a read/write spinlock. All functions that access members of struct grant_table must acquire a @@ -102,6 +105,24 @@ is complete. state can be rapidly modified under some workloads, and the critical sections are very small, thus we use a spinlock to protect them. + Active entries are obtained by calling active_entry_acquire(gt, ref). + This function returns a pointer to the active entry after locking its + spinlock. The caller must hold the rwlock for the gt in question + before calling active_entry_acquire(). This is because the grant + table can be dynamically extended via gnttab_grow_table() while a + domain is running and must be fully initialized. Once all access to + the active entry is complete, release the lock by calling + active_entry_release(act). + + Summary of rules for locking: + active_entry_acquire() and active_entry_release() can only be + called when holding the relevant grant table's lock. I.e.: + read_lock(>->lock); + act = active_entry_acquire(gt, ref); + ... + active_entry_release(act); + read_unlock(>->lock); + ******************************************************************************** Granting a foreign domain access to frames diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c index f52a51d..94c9864 100644 --- a/xen/common/grant_table.c +++ b/xen/common/grant_table.c @@ -157,10 +157,13 @@ struct active_grant_entry { in the page. */ unsigned length:16; /* For sub-page grants, the length of the grant. */ + spinlock_t lock; /* lock to protect access of this entry. + see docs/misc/grant-tables.txt for + locking protocol */ }; #define ACGNT_PER_PAGE (PAGE_SIZE / sizeof(struct active_grant_entry)) -#define active_entry(t, e) \ +#define _active_entry(t, e) \ ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE]) static inline void gnttab_flush_tlb(const struct domain *d) @@ -188,6 +191,28 @@ nr_active_grant_frames(struct grant_table *gt) return num_act_frames_from_sha_frames(nr_grant_frames(gt)); } +static inline struct active_grant_entry * +active_entry_acquire(struct grant_table *t, grant_ref_t e) +{ + struct active_grant_entry *act; + + /* + * not perfect, but better than nothing for a debug build + * sanity check + */ + ASSERT(rw_is_locked(&t->lock)); + + act = &_active_entry(t, e); + spin_lock(&act->lock); + + return act; +} + +static inline void active_entry_release(struct active_grant_entry *act) +{ + spin_unlock(&act->lock); +} + /* Check if the page has been paged out, or needs unsharing. If rc == GNTST_okay, *page contains the page struct with a ref taken. Caller must do put_page(*page). @@ -228,30 +253,6 @@ static int __get_paged_frame(unsigned long gfn, unsigned long *frame, struct pag return rc; } -static inline void -double_maptrack_lock(struct grant_table *lgt, struct grant_table *rgt) -{ - if ( lgt < rgt ) - { - spin_lock(&lgt->maptrack_lock); - spin_lock(&rgt->maptrack_lock); - } - else - { - if ( lgt != rgt ) - spin_lock(&rgt->maptrack_lock); - spin_lock(&lgt->maptrack_lock); - } -} - -static inline void -double_maptrack_unlock(struct grant_table *lgt, struct grant_table *rgt) -{ - spin_unlock(&lgt->maptrack_lock); - if ( lgt != rgt ) - spin_unlock(&rgt->maptrack_lock); -} - static inline int __get_maptrack_handle( struct grant_table *t) @@ -316,7 +317,10 @@ get_maptrack_handle( return handle; } -/* Number of grant table entries. Caller must hold d's grant table lock. */ +/* + * Number of grant table entries. Caller must hold d's grant table + * read lock. + */ static unsigned int nr_grant_entries(struct grant_table *gt) { ASSERT(gt->gt_version != 0); @@ -505,26 +509,23 @@ static int grant_map_exists(const struct domain *ld, unsigned long mfn, unsigned int *ref_count) { - const struct active_grant_entry *act; + struct active_grant_entry *act; unsigned int ref, max_iter; ASSERT(rw_is_locked(&rgt->lock)); max_iter = min(*ref_count + (1 << GNTTABOP_CONTINUATION_ARG_SHIFT), nr_grant_entries(rgt)); - for ( ref = *ref_count; ref < max_iter; ref++ ) + for ( ref = *ref_count; ref < max_iter; active_entry_release(act), ref++ ) { - act = &active_entry(rgt, ref); + act = active_entry_acquire(rgt, ref); - if ( !act->pin ) - continue; - - if ( act->domid != ld->domain_id ) - continue; - - if ( act->frame != mfn ) + if ( !act->pin || + act->domid != ld->domain_id || + act->frame != mfn ) continue; + active_entry_release(act); return 0; } @@ -537,6 +538,13 @@ static int grant_map_exists(const struct domain *ld, return -EINVAL; } +/* + * Count the number of mapped ro or rw entries tracked in the left + * grant table given a provided mfn provided by a foreign domain. + * + * This function takes the maptrack lock from the left grant table and + * must be called with the right grant table's rwlock held. + */ static void mapcount( struct grant_table *lgt, struct domain *rd, unsigned long mfn, unsigned int *wrc, unsigned int *rdc) @@ -546,15 +554,32 @@ static void mapcount( *wrc = *rdc = 0; + /* + * N.B.: while taking the left side maptrack spinlock prevents + * any mapping changes, the right side active entries could be + * changing while we are counting. The caller has to hold the + * grant table write lock, or some other mechanism should be + * used to prevent concurrent changes during this operation. + * This is tricky because we can't promote a read lock into + * a write lock. + */ + ASSERT(rw_is_locked(&rd->grant_table->lock)); + spin_lock(&lgt->maptrack_lock); + for ( handle = 0; handle < lgt->maptrack_limit; handle++ ) { + struct active_grant_entry *act; + map = &maptrack_entry(lgt, handle); if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) || map->domid != rd->domain_id ) continue; - if ( active_entry(rd->grant_table, map->ref).frame == mfn ) + act = &_active_entry(rd->grant_table, map->ref); + if ( act->frame == mfn ) (map->flags & GNTMAP_readonly) ? (*rdc)++ : (*wrc)++; } + + spin_unlock(&lgt->maptrack_lock); } /* @@ -576,7 +601,6 @@ __gnttab_map_grant_ref( struct page_info *pg = NULL; int rc = GNTST_okay; u32 old_pin; - u32 act_pin; unsigned int cache_flags; struct active_grant_entry *act = NULL; struct grant_mapping *mt; @@ -639,7 +663,7 @@ __gnttab_map_grant_ref( if ( unlikely(op->ref >= nr_grant_entries(rgt))) PIN_FAIL(unlock_out, GNTST_bad_gntref, "Bad ref (%d).\n", op->ref); - act = &active_entry(rgt, op->ref); + act = active_entry_acquire(rgt, op->ref); shah = shared_entry_header(rgt, op->ref); if (rgt->gt_version == 1) { sha1 = &shared_entry_v1(rgt, op->ref); @@ -656,7 +680,7 @@ __gnttab_map_grant_ref( ((act->domid != ld->domain_id) || (act->pin & 0x80808080U) != 0 || (act->is_sub_page)) ) - PIN_FAIL(unlock_out, GNTST_general_error, + PIN_FAIL(act_release_out, GNTST_general_error, "Bad domain (%d != %d), or risk of counter overflow %08x, or subpage %d\n", act->domid, ld->domain_id, act->pin, act->is_sub_page); @@ -667,7 +691,7 @@ __gnttab_map_grant_ref( if ( (rc = _set_status(rgt->gt_version, ld->domain_id, op->flags & GNTMAP_readonly, 1, shah, act, status) ) != GNTST_okay ) - goto unlock_out; + goto act_release_out; if ( !act->pin ) { @@ -698,7 +722,6 @@ __gnttab_map_grant_ref( GNTPIN_hstr_inc : GNTPIN_hstw_inc; frame = act->frame; - act_pin = act->pin; cache_flags = (shah->flags & (GTF_PAT | GTF_PWT | GTF_PCD) ); @@ -776,8 +799,6 @@ __gnttab_map_grant_ref( goto undo_out; } - double_maptrack_lock(lgt, rgt); - if ( gnttab_need_iommu_mapping(ld) ) { unsigned int wrc, rdc; @@ -785,21 +806,20 @@ __gnttab_map_grant_ref( /* We're not translated, so we know that gmfns and mfns are the same things, so the IOMMU entry is always 1-to-1. */ mapcount(lgt, rd, frame, &wrc, &rdc); - if ( (act_pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) && + if ( (act->pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) && !(old_pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) ) { if ( wrc == 0 ) err = iommu_map_page(ld, frame, frame, IOMMUF_readable|IOMMUF_writable); } - else if ( act_pin && !old_pin ) + else if ( act->pin && !old_pin ) { if ( (wrc + rdc) == 0 ) err = iommu_map_page(ld, frame, frame, IOMMUF_readable); } if ( err ) { - double_maptrack_unlock(lgt, rgt); rc = GNTST_general_error; goto undo_out; } @@ -812,7 +832,7 @@ __gnttab_map_grant_ref( mt->ref = op->ref; mt->flags = op->flags; - double_maptrack_unlock(lgt, rgt); + active_entry_release(act); read_unlock(&rgt->lock); op->dev_bus_addr = (u64)frame << PAGE_SHIFT; @@ -836,10 +856,6 @@ __gnttab_map_grant_ref( put_page(pg); } - read_lock(&rgt->lock); - - act = &active_entry(rgt, op->ref); - if ( op->flags & GNTMAP_device_map ) act->pin -= (op->flags & GNTMAP_readonly) ? GNTPIN_devr_inc : GNTPIN_devw_inc; @@ -855,6 +871,9 @@ __gnttab_map_grant_ref( if ( !act->pin ) gnttab_clear_flag(_GTF_reading, status); + act_release_out: + active_entry_release(act); + unlock_out: read_unlock(&rgt->lock); op->status = rc; @@ -941,18 +960,17 @@ __gnttab_unmap_common( rgt = rd->grant_table; read_lock(&rgt->lock); - double_maptrack_lock(lgt, rgt); + act = active_entry_acquire(rgt, op->map->ref); op->flags = op->map->flags; if ( unlikely(!op->flags) || unlikely(op->map->domid != dom) ) { gdprintk(XENLOG_WARNING, "Unstable handle %u\n", op->handle); rc = GNTST_bad_handle; - goto unmap_out; + goto act_release_out; } op->rd = rd; - act = &active_entry(rgt, op->map->ref); if ( op->frame == 0 ) { @@ -961,7 +979,7 @@ __gnttab_unmap_common( else { if ( unlikely(op->frame != act->frame) ) - PIN_FAIL(unmap_out, GNTST_general_error, + PIN_FAIL(act_release_out, GNTST_general_error, "Bad frame number doesn't match gntref. (%lx != %lx)\n", op->frame, act->frame); if ( op->flags & GNTMAP_device_map ) @@ -980,7 +998,7 @@ __gnttab_unmap_common( if ( (rc = replace_grant_host_mapping(op->host_addr, op->frame, op->new_addr, op->flags)) < 0 ) - goto unmap_out; + goto act_release_out; ASSERT(act->pin & (GNTPIN_hstw_mask | GNTPIN_hstr_mask)); op->map->flags &= ~GNTMAP_host_map; @@ -1002,7 +1020,7 @@ __gnttab_unmap_common( if ( err ) { rc = GNTST_general_error; - goto unmap_out; + goto act_release_out; } } @@ -1010,8 +1028,8 @@ __gnttab_unmap_common( if ( !(op->flags & GNTMAP_readonly) ) gnttab_mark_dirty(rd, op->frame); - unmap_out: - double_maptrack_unlock(lgt, rgt); + act_release_out: + active_entry_release(act); read_unlock(&rgt->lock); op->status = rc; @@ -1046,9 +1064,9 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op) read_lock(&rgt->lock); if ( rgt->gt_version == 0 ) - goto unmap_out; + goto unlock_out; - act = &active_entry(rgt, op->map->ref); + act = active_entry_acquire(rgt, op->map->ref); sha = shared_entry_header(rgt, op->map->ref); if ( rgt->gt_version == 1 ) @@ -1062,7 +1080,7 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op) * Suggests that __gntab_unmap_common failed early and so * nothing further to do */ - goto unmap_out; + goto act_release_out; } pg = mfn_to_page(op->frame); @@ -1086,7 +1104,7 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op) * Suggests that __gntab_unmap_common failed in * replace_grant_host_mapping() so nothing further to do */ - goto unmap_out; + goto act_release_out; } if ( !is_iomem_page(op->frame) ) @@ -1107,8 +1125,12 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op) if ( act->pin == 0 ) gnttab_clear_flag(_GTF_reading, status); - unmap_out: + act_release_out: + active_entry_release(act); + + unlock_out: read_unlock(&rgt->lock); + if ( put_handle ) { op->map->flags = 0; @@ -1304,7 +1326,7 @@ gnttab_grow_table(struct domain *d, unsigned int req_nr_frames) /* d's grant table write lock must be held by the caller */ struct grant_table *gt = d->grant_table; - unsigned int i; + unsigned int i, j; ASSERT(req_nr_frames <= max_grant_frames); @@ -1319,6 +1341,8 @@ gnttab_grow_table(struct domain *d, unsigned int req_nr_frames) if ( (gt->active[i] = alloc_xenheap_page()) == NULL ) goto active_alloc_failed; clear_page(gt->active[i]); + for ( j = 0; j < ACGNT_PER_PAGE; j++ ) + spin_lock_init(>->active[i][j].lock); } /* Shared */ @@ -1813,7 +1837,7 @@ __release_grant_for_copy( read_lock(&rgt->lock); - act = &active_entry(rgt, gref); + act = active_entry_acquire(rgt, gref); sha = shared_entry_header(rgt, gref); r_frame = act->frame; @@ -1852,6 +1876,7 @@ __release_grant_for_copy( released_read = 1; } + active_entry_release(act); read_unlock(&rgt->lock); if ( td != rd ) @@ -1913,14 +1938,14 @@ __acquire_grant_for_copy( read_lock(&rgt->lock); if ( rgt->gt_version == 0 ) - PIN_FAIL(unlock_out, GNTST_general_error, + PIN_FAIL(gnt_unlock_out, GNTST_general_error, "remote grant table not ready\n"); if ( unlikely(gref >= nr_grant_entries(rgt)) ) - PIN_FAIL(unlock_out, GNTST_bad_gntref, + PIN_FAIL(gnt_unlock_out, GNTST_bad_gntref, "Bad grant reference %ld\n", gref); - act = &active_entry(rgt, gref); + act = active_entry_acquire(rgt, gref); shah = shared_entry_header(rgt, gref); if ( rgt->gt_version == 1 ) { @@ -1985,6 +2010,7 @@ __acquire_grant_for_copy( * the right table (if rd == td), so we have to drop the * lock here and reacquire */ + active_entry_release(act); read_unlock(&rgt->lock); rc = __acquire_grant_for_copy(td, trans_gref, rd->domain_id, @@ -1992,8 +2018,11 @@ __acquire_grant_for_copy( &trans_page_off, &trans_length, 0); read_lock(&rgt->lock); + act = active_entry_acquire(rgt, gref); + if ( rc != GNTST_okay ) { __fixup_status_for_copy_pin(act, status); + active_entry_release(act); read_unlock(&rgt->lock); rcu_unlock_domain(td); return rc; @@ -2007,6 +2036,7 @@ __acquire_grant_for_copy( { __fixup_status_for_copy_pin(act, status); rcu_unlock_domain(td); + active_entry_release(act); read_unlock(&rgt->lock); put_page(*page); return __acquire_grant_for_copy(rd, gref, ldom, readonly, @@ -2075,6 +2105,7 @@ __acquire_grant_for_copy( *length = act->length; *frame = act->frame; + active_entry_release(act); read_unlock(&rgt->lock); return rc; @@ -2087,6 +2118,9 @@ __acquire_grant_for_copy( gnttab_clear_flag(_GTF_reading, status); unlock_out: + active_entry_release(act); + + gnt_unlock_out: read_unlock(&rgt->lock); return rc; @@ -2413,16 +2447,18 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARAM(gnttab_set_version_t) uop) { for ( i = GNTTAB_NR_RESERVED_ENTRIES; i < nr_grant_entries(gt); i++ ) { - act = &active_entry(gt, i); + act = active_entry_acquire(gt, i); if ( act->pin != 0 ) { gdprintk(XENLOG_WARNING, "tried to change grant table version from %d to %d, but some grant entries still in use\n", gt->gt_version, op.version); + active_entry_release(act); res = -EBUSY; goto out_unlock; } + active_entry_release(act); } } @@ -2601,9 +2637,17 @@ __gnttab_swap_grant_ref(grant_ref_t ref_a, grant_ref_t ref_b) { struct domain *d = rcu_lock_current_domain(); struct grant_table *gt = d->grant_table; - struct active_grant_entry *act; + struct active_grant_entry *act_a = NULL; + struct active_grant_entry *act_b = NULL; s16 rc = GNTST_okay; + if ( ref_a == ref_b ) + /* + * noop, so avoid acquiring the same active entry + * twice which would case a deadlock. + */ + return rc; + write_lock(>->lock); /* Bounds check on the grant refs */ @@ -2612,12 +2656,12 @@ __gnttab_swap_grant_ref(grant_ref_t ref_a, grant_ref_t ref_b) if ( unlikely(ref_b >= nr_grant_entries(d->grant_table))) PIN_FAIL(out, GNTST_bad_gntref, "Bad ref-b (%d).\n", ref_b); - act = &active_entry(gt, ref_a); - if ( act->pin ) + act_a = active_entry_acquire(gt, ref_a); + if ( act_a->pin ) PIN_FAIL(out, GNTST_eagain, "ref a %ld busy\n", (long)ref_a); - act = &active_entry(gt, ref_b); - if ( act->pin ) + act_b = active_entry_acquire(gt, ref_b); + if ( act_b->pin ) PIN_FAIL(out, GNTST_eagain, "ref b %ld busy\n", (long)ref_b); if ( gt->gt_version == 1 ) @@ -2644,6 +2688,10 @@ __gnttab_swap_grant_ref(grant_ref_t ref_a, grant_ref_t ref_b) } out: + if ( act_b != NULL ) + active_entry_release(act_b); + if ( act_a != NULL ) + active_entry_release(act_a); write_unlock(>->lock); rcu_unlock_domain(d); @@ -2953,7 +3001,7 @@ grant_table_create( struct domain *d) { struct grant_table *t; - int i; + int i, j; if ( (t = xzalloc(struct grant_table)) == NULL ) goto no_mem_0; @@ -2973,6 +3021,8 @@ grant_table_create( if ( (t->active[i] = alloc_xenheap_page()) == NULL ) goto no_mem_2; clear_page(t->active[i]); + for ( j = 0; j < ACGNT_PER_PAGE; j++ ) + spin_lock_init(&t->active[i][j].lock); } /* Tracking of mapped foreign frames table */ @@ -3068,9 +3118,8 @@ gnttab_release_mappings( rgt = rd->grant_table; read_lock(&rgt->lock); - double_maptrack_lock(gt, rgt); - act = &active_entry(rgt, ref); + act = active_entry_acquire(rgt, ref); sha = shared_entry_header(rgt, ref); if (rgt->gt_version == 1) status = &sha->flags; @@ -3128,7 +3177,7 @@ gnttab_release_mappings( if ( act->pin == 0 ) gnttab_clear_flag(_GTF_reading, status); - double_maptrack_unlock(gt, rgt); + active_entry_release(act); read_unlock(&rgt->lock); rcu_unlock_domain(rd); @@ -3195,9 +3244,12 @@ static void gnttab_usage_print(struct domain *rd) uint16_t status; uint64_t frame; - act = &active_entry(gt, ref); + act = active_entry_acquire(gt, ref); if ( !act->pin ) + { + active_entry_release(act); continue; + } sha = shared_entry_header(gt, ref); @@ -3227,6 +3279,7 @@ static void gnttab_usage_print(struct domain *rd) printk("[%3d] %5d 0x%06lx 0x%08x %5d 0x%06"PRIx64" 0x%02x\n", ref, act->domid, act->frame, act->pin, sha->domid, frame, status); + active_entry_release(act); } out: -- 1.7.9.5 _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx http://lists.xen.org/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |