blktap: fix locking (again)

c/s 1090 wasn't really helping much - I was doing that under the wrong
impression that the zap_pte() hook would be called with the mmap_sem
held. That being wrong, none of the uses of mmap_sem here actually need
to be write ones (they only look up vma information), so convert them
all to read acquires/releases.

A new spinlock needs to be introduced to protect idx_map. It has to
nest inside mmap_sem, which requires some code movement.

--- a/drivers/xen/blktap/blktap.c
+++ b/drivers/xen/blktap/blktap.c
@@ -113,6 +113,7 @@ typedef struct tap_blkif {
 	pid_t pid;                    /*tapdisk process id                  */
 	enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace
 						  shutdown                   */
+	spinlock_t map_lock;          /*protects idx_map                    */
 	struct idx_map {
 		u16 mem, req;
 	} *idx_map;                   /*Record the user ring id to kern
@@ -295,7 +296,7 @@ static pte_t blktap_clear_pte(struct vm_
 	pte_t copy;
 	tap_blkif_t *info = NULL;
 	unsigned int seg, usr_idx, pending_idx, mmap_idx, count = 0;
-	unsigned long offset, uvstart = 0;
+	unsigned long offset;
 	struct page *pg;
 	struct grant_handle_pair *khandle;
 	struct gnttab_unmap_grant_ref unmap[2];
@@ -304,25 +305,28 @@ static pte_t blktap_clear_pte(struct vm_
 	 * If the address is before the start of the grant mapped region or
 	 * if vm_file is NULL (meaning mmap failed and we have nothing to do)
 	 */
-	if (vma->vm_file != NULL) {
+	if (vma->vm_file != NULL)
 		info = vma->vm_file->private_data;
-		uvstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
-	}
-	if (vma->vm_file == NULL || uvaddr < uvstart)
+	if (info == NULL || uvaddr < info->user_vstart)
 		return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
 					       ptep, is_fullmm);
 
-	/* TODO Should these be changed to if statements? */
-	BUG_ON(!info);
-	BUG_ON(!info->idx_map);
-
-	offset = (uvaddr - uvstart) >> PAGE_SHIFT;
+	offset = (uvaddr - info->user_vstart) >> PAGE_SHIFT;
 	usr_idx = OFFSET_TO_USR_IDX(offset);
 	seg = OFFSET_TO_SEG(offset);
 
+	spin_lock(&info->map_lock);
+
 	pending_idx = info->idx_map[usr_idx].req;
 	mmap_idx = info->idx_map[usr_idx].mem;
 
+	/* fast_flush_area() may already have cleared this entry */
+	if (mmap_idx == INVALID_MIDX) {
+		spin_unlock(&info->map_lock);
+		return ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
+					       is_fullmm);
+	}
+
 	pg = idx_to_page(mmap_idx, pending_idx, seg);
 	ClearPageReserved(pg);
 	info->foreign_map.map[offset + RING_PAGES] = NULL;
@@ -365,6 +369,8 @@ static pte_t blktap_clear_pte(struct vm_
 		BUG();
 	}
 
+	spin_unlock(&info->map_lock);
+
 	return copy;
 }
 
@@ -489,6 +495,7 @@ found:
 	}
 
 	info->minor = minor;
+	spin_lock_init(&info->map_lock);
 	/*
 	 * Make sure that we have a minor before others can
 	 * see us.
@@ -1029,25 +1036,30 @@ static void fast_flush_area(pending_req_
 			    unsigned int u_idx, tap_blkif_t *info)
 {
 	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
-	unsigned int i, mmap_idx, invcount = 0, locked = 0;
+	unsigned int i, mmap_idx, invcount = 0;
 	struct grant_handle_pair *khandle;
 	uint64_t ptep;
 	int ret;
 	unsigned long uvaddr;
 	struct mm_struct *mm = info->mm;
 
+	if (mm != NULL)
+		down_read(&mm->mmap_sem);
+
 	if (mm != NULL && xen_feature(XENFEAT_auto_translated_physmap)) {
-		down_write(&mm->mmap_sem);
+ slow:
 		blktap_zap_page_range(mm,
 				      MMAP_VADDR(info->user_vstart, u_idx, 0),
 				      req->nr_pages);
 		info->idx_map[u_idx].mem = INVALID_MIDX;
-		up_write(&mm->mmap_sem);
+		up_read(&mm->mmap_sem);
 		return;
 	}
 
 	mmap_idx = req->mem_idx;
+	spin_lock(&info->map_lock);
+
 	for (i = 0; i < req->nr_pages; i++) {
 		uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
@@ -1066,15 +1078,13 @@ static void fast_flush_area(pending_req_
 
 		if (mm != NULL && khandle->user != INVALID_GRANT_HANDLE) {
 			BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
-			if (!locked++)
-				down_write(&mm->mmap_sem);
 			if (create_lookup_pte_addr(
 				mm,
 				MMAP_VADDR(info->user_vstart, u_idx, i),
 				&ptep) !=0) {
-				up_write(&mm->mmap_sem);
+				spin_unlock(&info->map_lock);
 				WPRINTK("Couldn't get a pte addr!\n");
-				return;
+				goto slow;
 			}
 			gnttab_set_unmap_op(&unmap[invcount], ptep,
@@ -1091,19 +1101,11 @@ static void fast_flush_area(pending_req_
 		GNTTABOP_unmap_grant_ref, unmap, invcount);
 	BUG_ON(ret);
 
-	if (mm != NULL && !xen_feature(XENFEAT_auto_translated_physmap)) {
-		if (!locked++)
-			down_write(&mm->mmap_sem);
-		blktap_zap_page_range(mm,
-				      MMAP_VADDR(info->user_vstart, u_idx, 0),
-				      req->nr_pages);
-	}
-
-	if (!locked && mm != NULL)
-		down_write(&mm->mmap_sem);
 	info->idx_map[u_idx].mem = INVALID_MIDX;
+
+	spin_unlock(&info->map_lock);
 	if (mm != NULL)
-		up_write(&mm->mmap_sem);
+		up_read(&mm->mmap_sem);
 }
 
 /******************************************************************
@@ -1406,7 +1408,9 @@ static void dispatch_rw_block_io(blkif_t
 		goto fail_response;
 	/* Check we have space on user ring - should never fail. */
+	spin_lock(&info->map_lock);
 	usr_idx = GET_NEXT_REQ(info->idx_map);
+	spin_unlock(&info->map_lock);
 	if (usr_idx >= MAX_PENDING_REQS) {
 		WARN_ON(1);
 		goto fail_response;
 	}
@@ -1445,7 +1449,7 @@ static void dispatch_rw_block_io(blkif_t
 	op = 0;
 	mm = info->mm;
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
-		down_write(&mm->mmap_sem);
+		down_read(&mm->mmap_sem);
 	for (i = 0; i < nseg; i++) {
 		unsigned long uvaddr;
 		unsigned long kvaddr;
@@ -1462,9 +1466,9 @@ static void dispatch_rw_block_io(blkif_t
 
 		/* Now map it to user. */
 		ret = create_lookup_pte_addr(mm, uvaddr, &ptep);
 		if (ret) {
-			up_write(&mm->mmap_sem);
+			up_read(&mm->mmap_sem);
 			WPRINTK("Couldn't get a pte addr!\n");
-			goto fail_flush;
+			goto fail_response;
 		}
@@ -1478,12 +1482,15 @@ static void dispatch_rw_block_io(blkif_t
 				  req->seg[i].first_sect + 1);
 	}
 
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		down_read(&mm->mmap_sem);
+
+	spin_lock(&info->map_lock);
+
 	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
 	BUG_ON(ret);
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
-		up_write(&mm->mmap_sem);
-
 		for (i = 0; i < (nseg*2); i+=2) {
 			unsigned long uvaddr;
 			unsigned long offset;
@@ -1548,18 +1555,18 @@ static void dispatch_rw_block_io(blkif_t
 		}
 	}
 
+	/*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
+	info->idx_map[usr_idx].mem = mmap_idx;
+	info->idx_map[usr_idx].req = pending_idx;
+
+	spin_unlock(&info->map_lock);
+
 	if (ret)
 		goto fail_flush;
 
-	if (xen_feature(XENFEAT_auto_translated_physmap))
-		down_write(&mm->mmap_sem);
-	/* Mark mapped pages as reserved: */
-	for (i = 0; i < req->nr_segments; i++) {
-		struct page *pg;
-
-		pg = idx_to_page(mmap_idx, pending_idx, i);
-		SetPageReserved(pg);
-		if (xen_feature(XENFEAT_auto_translated_physmap)) {
+	if (xen_feature(XENFEAT_auto_translated_physmap)) {
+		for (i = 0; i < nseg; i++) {
+			struct page *pg = idx_to_page(mmap_idx, pending_idx, i);
 			unsigned long uvaddr = MMAP_VADDR(info->user_vstart,
 							  usr_idx, i);
 			if (vma && uvaddr >= vma->vm_end) {
@@ -1577,18 +1584,12 @@ static void dispatch_rw_block_io(blkif_t
 				continue;
 			}
 			ret = vm_insert_page(vma, uvaddr, pg);
-			if (ret) {
-				up_write(&mm->mmap_sem);
+			if (ret)
 				goto fail_flush;
-			}
 		}
 	}
 
-	if (xen_feature(XENFEAT_auto_translated_physmap))
-		up_write(&mm->mmap_sem);
-	/*record [mmap_idx,pending_idx] to [usr_idx] mapping*/
-	info->idx_map[usr_idx].mem = mmap_idx;
-	info->idx_map[usr_idx].req = pending_idx;
+	up_read(&mm->mmap_sem);
 
 	blkif_get(blkif);
 
 	/* Finally, write the request message to the user ring. */
@@ -1611,6 +1612,7 @@ static void dispatch_rw_block_io(blkif_t
 	return;
 
 fail_flush:
+	up_read(&mm->mmap_sem);
 	WPRINTK("Reached Fail_flush\n");
 	fast_flush_area(pending_req, pending_idx, usr_idx, info);
 fail_response:
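
A minimal C sketch of the lock nesting the description above mandates, for illustration only (not part of the patch): mmap_sem is only ever taken for read, and the new map_lock spinlock nests strictly inside it. The tap_blkif_sketch structure and record_idx_map() helper below are simplified, hypothetical stand-ins for the real tap_blkif_t and dispatch_rw_block_io() code.

/* Illustrative sketch -- simplified stand-ins, not the driver's real types. */
#include <linux/spinlock.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/types.h>

struct tap_blkif_sketch {
	struct mm_struct *mm;           /* user address space */
	spinlock_t map_lock;            /* protects idx_map   */
	struct idx_map { u16 mem, req; } *idx_map;
};

/*
 * Record a [mmap_idx, pending_idx] pair for a user-ring slot.  The
 * ordering mirrors the patched dispatch_rw_block_io(): take mmap_sem
 * for read (vma information is only looked up, never modified), then
 * take map_lock inside it to serialize idx_map updates against
 * blktap_clear_pte() and fast_flush_area().
 */
static void record_idx_map(struct tap_blkif_sketch *info, unsigned int usr_idx,
			   u16 mmap_idx, u16 pending_idx)
{
	down_read(&info->mm->mmap_sem);         /* outer: read, not write */

	spin_lock(&info->map_lock);             /* inner: idx_map guard   */
	info->idx_map[usr_idx].mem = mmap_idx;
	info->idx_map[usr_idx].req = pending_idx;
	spin_unlock(&info->map_lock);

	up_read(&info->mm->mmap_sem);
}

A spinlock rather than a sleeping lock fits here presumably because blktap_clear_pte() runs from the zap_pte() path with page-table locks held, where sleeping is not allowed.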