[Xen-changelog] [xen-unstable] amd iommu: Automatic page coalescing
# HG changeset patch
# User Wei Wang <wei.wang2@xxxxxxx>
# Date 1313503391 -3600
# Node ID 80e9fcdaef3673ca1294ceb4da3e49dda32ca318
# Parent  8d6edc3d26d26931f3732a2008fb4818bc7bab2d
amd iommu: Automatic page coalescing

This patch implements automatic page coalescing when a separate IO page
table is used.  It uses ignored bits in the IOMMU PDE to cache how many
entries at the next-lower page table level are suitable for coalescing,
and builds a superpage entry once all lower-level entries are contiguous.
This patch has been tested for several weeks, mainly with graphics
devices and 3DMark Vantage.

Signed-off-by: Wei Wang <wei.wang2@xxxxxxx>
---
diff -r 8d6edc3d26d2 -r 80e9fcdaef36 xen/drivers/passthrough/amd/iommu_map.c
--- a/xen/drivers/passthrough/amd/iommu_map.c	Sat Aug 13 10:14:58 2011 +0100
+++ b/xen/drivers/passthrough/amd/iommu_map.c	Tue Aug 16 15:03:11 2011 +0100
@@ -168,98 +168,59 @@
     AMD_IOMMU_DEBUG("Warning: ComWaitInt bit did not assert!\n");
 }
 
-static void clear_iommu_l1e_present(u64 l2e, unsigned long gfn)
+/* Given pfn and page table level, return pde index */
+static unsigned int pfn_to_pde_idx(unsigned long pfn, unsigned int level)
 {
-    u32 *l1e;
-    int offset;
-    void *l1_table;
+    unsigned int idx;
 
-    l1_table = map_domain_page(l2e >> PAGE_SHIFT);
-
-    offset = gfn & (~PTE_PER_TABLE_MASK);
-    l1e = (u32*)(l1_table + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE));
-
-    /* clear l1 entry */
-    l1e[0] = l1e[1] = 0;
-
-    unmap_domain_page(l1_table);
+    idx = pfn >> (PTE_PER_TABLE_SHIFT * (--level));
+    idx &= ~PTE_PER_TABLE_MASK;
+    return idx;
 }
 
-static int set_iommu_l1e_present(u64 l2e, unsigned long gfn,
-                                 u64 maddr, int iw, int ir)
+void clear_iommu_pte_present(unsigned long l1_mfn, unsigned long gfn)
 {
-    u64 addr_lo, addr_hi, maddr_old;
+    u64 *table, *pte;
+
+    table = map_domain_page(l1_mfn);
+    pte = table + pfn_to_pde_idx(gfn, IOMMU_PAGING_MODE_LEVEL_1);
+    *pte = 0;
+    unmap_domain_page(table);
+}
+
+static bool_t set_iommu_pde_present(u32 *pde, unsigned long next_mfn,
+                                    unsigned int next_level,
+                                    bool_t iw, bool_t ir)
+{
+    u64 addr_lo, addr_hi, maddr_old, maddr_next;
     u32 entry;
-    void *l1_table;
-    int offset;
-    u32 *l1e;
-    int need_flush = 0;
+    bool_t need_flush = 0;
 
-    l1_table = map_domain_page(l2e >> PAGE_SHIFT);
+    maddr_next = (u64)next_mfn << PAGE_SHIFT;
 
-    offset = gfn & (~PTE_PER_TABLE_MASK);
-    l1e = (u32*)((u8*)l1_table + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE));
-
-    addr_hi = get_field_from_reg_u32(l1e[1],
+    addr_hi = get_field_from_reg_u32(pde[1],
                                      IOMMU_PTE_ADDR_HIGH_MASK,
                                      IOMMU_PTE_ADDR_HIGH_SHIFT);
-    addr_lo = get_field_from_reg_u32(l1e[0],
+    addr_lo = get_field_from_reg_u32(pde[0],
                                      IOMMU_PTE_ADDR_LOW_MASK,
                                      IOMMU_PTE_ADDR_LOW_SHIFT);
 
-    maddr_old = ((addr_hi << 32) | addr_lo) << PAGE_SHIFT;
+    maddr_old = (addr_hi << 32) | (addr_lo << PAGE_SHIFT);
 
-    if ( maddr_old && (maddr_old != maddr) )
+    if ( maddr_old != maddr_next )
         need_flush = 1;
 
-    addr_lo = maddr & DMA_32BIT_MASK;
-    addr_hi = maddr >> 32;
-
-    set_field_in_reg_u32((u32)addr_hi, 0,
-                         IOMMU_PTE_ADDR_HIGH_MASK,
-                         IOMMU_PTE_ADDR_HIGH_SHIFT, &entry);
-    set_field_in_reg_u32(iw ? IOMMU_CONTROL_ENABLED :
-                         IOMMU_CONTROL_DISABLED, entry,
-                         IOMMU_PTE_IO_WRITE_PERMISSION_MASK,
-                         IOMMU_PTE_IO_WRITE_PERMISSION_SHIFT, &entry);
-    set_field_in_reg_u32(ir ? IOMMU_CONTROL_ENABLED :
-                         IOMMU_CONTROL_DISABLED, entry,
-                         IOMMU_PTE_IO_READ_PERMISSION_MASK,
-                         IOMMU_PTE_IO_READ_PERMISSION_SHIFT, &entry);
-    l1e[1] = entry;
-
-    set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0,
-                         IOMMU_PTE_ADDR_LOW_MASK,
-                         IOMMU_PTE_ADDR_LOW_SHIFT, &entry);
-    set_field_in_reg_u32(IOMMU_PAGING_MODE_LEVEL_0, entry,
-                         IOMMU_PTE_NEXT_LEVEL_MASK,
-                         IOMMU_PTE_NEXT_LEVEL_SHIFT, &entry);
-    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
-                         IOMMU_PTE_PRESENT_MASK,
-                         IOMMU_PTE_PRESENT_SHIFT, &entry);
-    l1e[0] = entry;
-
-    unmap_domain_page(l1_table);
-    return need_flush;
-}
-
-static void amd_iommu_set_page_directory_entry(u32 *pde,
-                                               u64 next_ptr, u8 next_level)
-{
-    u64 addr_lo, addr_hi;
-    u32 entry;
-
-    addr_lo = next_ptr & DMA_32BIT_MASK;
-    addr_hi = next_ptr >> 32;
+    addr_lo = maddr_next & DMA_32BIT_MASK;
+    addr_hi = maddr_next >> 32;
 
     /* enable read/write permissions,which will be enforced at the PTE */
     set_field_in_reg_u32((u32)addr_hi, 0,
                          IOMMU_PDE_ADDR_HIGH_MASK,
                          IOMMU_PDE_ADDR_HIGH_SHIFT, &entry);
-    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+    set_field_in_reg_u32(iw, entry,
                          IOMMU_PDE_IO_WRITE_PERMISSION_MASK,
                          IOMMU_PDE_IO_WRITE_PERMISSION_SHIFT, &entry);
-    set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry,
+    set_field_in_reg_u32(ir, entry,
                          IOMMU_PDE_IO_READ_PERMISSION_MASK,
                          IOMMU_PDE_IO_READ_PERMISSION_SHIFT, &entry);
     pde[1] = entry;
@@ -275,6 +236,26 @@
                          IOMMU_PDE_PRESENT_MASK,
                          IOMMU_PDE_PRESENT_SHIFT, &entry);
     pde[0] = entry;
+
+    return need_flush;
+}
+
+static bool_t set_iommu_pte_present(unsigned long pt_mfn, unsigned long gfn,
+                                    unsigned long next_mfn, int pde_level,
+                                    bool_t iw, bool_t ir)
+{
+    u64 *table;
+    u32 *pde;
+    bool_t need_flush = 0;
+
+    table = map_domain_page(pt_mfn);
+
+    pde = (u32*)(table + pfn_to_pde_idx(gfn, pde_level));
+
+    need_flush = set_iommu_pde_present(pde, next_mfn,
+                                       IOMMU_PAGING_MODE_LEVEL_0, iw, ir);
+    unmap_domain_page(table);
+    return need_flush;
 }
 
 void amd_iommu_set_root_page_table(
@@ -413,11 +394,18 @@
     return ptr;
 }
 
+static unsigned int iommu_next_level(u32 *entry)
+{
+    return get_field_from_reg_u32(entry[0],
+                                  IOMMU_PDE_NEXT_LEVEL_MASK,
+                                  IOMMU_PDE_NEXT_LEVEL_SHIFT);
+}
+
 static int amd_iommu_is_pte_present(u32 *entry)
 {
-    return (get_field_from_reg_u32(entry[0],
-                                   IOMMU_PDE_PRESENT_MASK,
-                                   IOMMU_PDE_PRESENT_SHIFT));
+    return get_field_from_reg_u32(entry[0],
+                                  IOMMU_PDE_PRESENT_MASK,
+                                  IOMMU_PDE_PRESENT_SHIFT);
 }
 
 void invalidate_dev_table_entry(struct amd_iommu *iommu,
@@ -439,54 +427,241 @@
     send_iommu_command(iommu, cmd);
 }
 
-static u64 iommu_l2e_from_pfn(struct page_info *table, int level,
-                              unsigned long io_pfn)
+/* For each pde, We use ignored bits (bit 1 - bit 8 and bit 63)
+ * to save pde count, pde count = 511 is a candidate of page coalescing.
+ */
+static unsigned int get_pde_count(u64 pde)
 {
-    unsigned long offset;
-    void *pde = NULL;
-    void *table_vaddr;
-    u64 next_table_maddr = 0;
-    unsigned int lowest = 1;
+    unsigned int count;
+    u64 upper_mask = 1ULL << 63 ;
+    u64 lower_mask = 0xFF << 1;
 
-    BUG_ON( table == NULL || level < lowest );
+    count = ((pde & upper_mask) >> 55) | ((pde & lower_mask) >> 1);
+    return count;
+}
 
-    if ( level == lowest )
-        return page_to_maddr(table);
+/* Convert pde count into iommu pte ignored bits */
+static void set_pde_count(u64 *pde, unsigned int count)
+{
+    u64 upper_mask = 1ULL << 8 ;
+    u64 lower_mask = 0xFF;
+    u64 pte_mask = (~(1ULL << 63)) & (~(0xFF << 1));
 
-    while ( level > lowest )
+    *pde &= pte_mask;
+    *pde |= ((count & upper_mask ) << 55) | ((count & lower_mask ) << 1);
+}
+
+/* Return 1, if pages are suitable for merging at merge_level.
+ * otherwise increase pde count if mfn is contigous with mfn - 1
+ */
+static int iommu_update_pde_count(struct domain *d, unsigned long pt_mfn,
+                                  unsigned long gfn, unsigned long mfn,
+                                  unsigned int merge_level)
+{
+    unsigned int pde_count, next_level;
+    unsigned long first_mfn;
+    u64 *table, *pde, *ntable;
+    u64 ntable_maddr, mask;
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    bool_t ok = 0;
+
+    ASSERT( spin_is_locked(&hd->mapping_lock) && pt_mfn );
+
+    next_level = merge_level - 1;
+
+    /* get pde at merge level */
+    table = map_domain_page(pt_mfn);
+    pde = table + pfn_to_pde_idx(gfn, merge_level);
+
+    /* get page table of next level */
+    ntable_maddr = amd_iommu_get_next_table_from_pte((u32*)pde);
+    ntable = map_domain_page(ntable_maddr >> PAGE_SHIFT);
+
+    /* get the first mfn of next level */
+    first_mfn = amd_iommu_get_next_table_from_pte((u32*)ntable) >> PAGE_SHIFT;
+
+    if ( first_mfn == 0 )
+        goto out;
+
+    mask = (1ULL<< (PTE_PER_TABLE_SHIFT * next_level)) - 1;
+
+    if ( ((first_mfn & mask) == 0) &&
+         (((gfn & mask) | first_mfn) == mfn) )
     {
-        offset = io_pfn >> ((PTE_PER_TABLE_SHIFT *
-                             (level - IOMMU_PAGING_MODE_LEVEL_1)));
-        offset &= ~PTE_PER_TABLE_MASK;
+        pde_count = get_pde_count(*pde);
 
-        table_vaddr = __map_domain_page(table);
-        pde = table_vaddr + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE);
-        next_table_maddr = amd_iommu_get_next_table_from_pte(pde);
+        if ( pde_count == (PTE_PER_TABLE_SIZE - 1) )
+            ok = 1;
+        else if ( pde_count < (PTE_PER_TABLE_SIZE - 1))
+        {
+            pde_count++;
+            set_pde_count(pde, pde_count);
+        }
+    }
 
-        if ( !amd_iommu_is_pte_present(pde) )
+    else
+        /* non-contiguous mapping */
+        set_pde_count(pde, 0);
+
+out:
+    unmap_domain_page(ntable);
+    unmap_domain_page(table);
+
+    return ok;
+}
+
+static int iommu_merge_pages(struct domain *d, unsigned long pt_mfn,
+                             unsigned long gfn, unsigned int flags,
+                             unsigned int merge_level)
+{
+    u64 *table, *pde, *ntable;
+    u64 ntable_mfn;
+    unsigned long first_mfn;
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+
+    ASSERT( spin_is_locked(&hd->mapping_lock) && pt_mfn );
+
+    table = map_domain_page(pt_mfn);
+    pde = table + pfn_to_pde_idx(gfn, merge_level);
+
+    /* get first mfn */
+    ntable_mfn = amd_iommu_get_next_table_from_pte((u32*)pde) >> PAGE_SHIFT;
+
+    if ( ntable_mfn == 0 )
+    {
+        unmap_domain_page(table);
+        return 1;
+    }
+
+    ntable = map_domain_page(ntable_mfn);
+    first_mfn = amd_iommu_get_next_table_from_pte((u32*)ntable) >> PAGE_SHIFT;
+
+    if ( first_mfn == 0 )
+    {
+        unmap_domain_page(ntable);
+        unmap_domain_page(table);
+        return 1;
+    }
+
+    /* setup super page mapping, next level = 0 */
+    set_iommu_pde_present((u32*)pde, first_mfn,
+                          IOMMU_PAGING_MODE_LEVEL_0,
+                          !!(flags & IOMMUF_writable),
+                          !!(flags & IOMMUF_readable));
+
+    amd_iommu_flush_all_pages(d);
+
+    unmap_domain_page(ntable);
+    unmap_domain_page(table);
+    return 0;
+}
+
+/* Walk io page tables and build level page tables if necessary
+ * {Re, un}mapping super page frames causes re-allocation of io
+ * page tables.
+ */
+static int iommu_pde_from_gfn(struct domain *d, unsigned long pfn,
+                              unsigned long pt_mfn[])
+{
+    u64 *pde, *next_table_vaddr;
+    unsigned long next_table_mfn;
+    unsigned int level;
+    struct page_info *table;
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+
+    table = hd->root_table;
+    level = hd->paging_mode;
+
+    BUG_ON( table == NULL || level < IOMMU_PAGING_MODE_LEVEL_1 ||
+            level > IOMMU_PAGING_MODE_LEVEL_6 );
+
+    next_table_mfn = page_to_mfn(table);
+
+    if ( level == IOMMU_PAGING_MODE_LEVEL_1 )
+    {
+        pt_mfn[level] = next_table_mfn;
+        return 0;
+    }
+
+    while ( level > IOMMU_PAGING_MODE_LEVEL_1 )
+    {
+        unsigned int next_level = level - 1;
+        pt_mfn[level] = next_table_mfn;
+
+        next_table_vaddr = map_domain_page(next_table_mfn);
+        pde = next_table_vaddr + pfn_to_pde_idx(pfn, level);
+
+        /* Here might be a super page frame */
+        next_table_mfn = amd_iommu_get_next_table_from_pte((uint32_t*)pde)
+                         >> PAGE_SHIFT;
+
+        /* Split super page frame into smaller pieces.*/
+        if ( amd_iommu_is_pte_present((u32*)pde) &&
+             (iommu_next_level((u32*)pde) == 0) &&
+             next_table_mfn != 0 )
         {
-            if ( next_table_maddr == 0 )
+            int i;
+            unsigned long mfn, gfn;
+            unsigned int page_sz;
+
+            page_sz = 1 << (PTE_PER_TABLE_SHIFT * (next_level - 1));
+            gfn = pfn & ~((1 << (PTE_PER_TABLE_SHIFT * next_level)) - 1);
+            mfn = next_table_mfn;
+
+            /* allocate lower level page table */
+            table = alloc_amd_iommu_pgtable();
+            if ( table == NULL )
+            {
+                AMD_IOMMU_DEBUG("Cannot allocate I/O page table\n");
+                unmap_domain_page(next_table_vaddr);
+                return 1;
+            }
+
+            next_table_mfn = page_to_mfn(table);
+            set_iommu_pde_present((u32*)pde, next_table_mfn, next_level,
+                                  !!IOMMUF_writable, !!IOMMUF_readable);
+
+            for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ )
+            {
+                set_iommu_pte_present(next_table_mfn, gfn, mfn, next_level,
+                                      !!IOMMUF_writable, !!IOMMUF_readable);
+                mfn += page_sz;
+                gfn += page_sz;
+            }
+
+            amd_iommu_flush_all_pages(d);
+        }
+
+        /* Install lower level page table for non-present entries */
+        else if ( !amd_iommu_is_pte_present((u32*)pde) )
+        {
+            if ( next_table_mfn == 0 )
             {
                 table = alloc_amd_iommu_pgtable();
                 if ( table == NULL )
                 {
-                    printk("AMD-Vi: Cannot allocate I/O page table\n");
-                    return 0;
+                    AMD_IOMMU_DEBUG("Cannot allocate I/O page table\n");
+                    unmap_domain_page(next_table_vaddr);
+                    return 1;
                 }
-                next_table_maddr = page_to_maddr(table);
-                amd_iommu_set_page_directory_entry(
-                    (u32 *)pde, next_table_maddr, level - 1);
+                next_table_mfn = page_to_mfn(table);
+                set_iommu_pde_present((u32*)pde, next_table_mfn, next_level,
+                                      !!IOMMUF_writable, !!IOMMUF_readable);
             }
             else /* should never reach here */
-                return 0;
+            {
+                unmap_domain_page(next_table_vaddr);
+                return 1;
+            }
         }
 
-        unmap_domain_page(table_vaddr);
-        table = maddr_to_page(next_table_maddr);
+        unmap_domain_page(next_table_vaddr);
         level--;
     }
 
-    return next_table_maddr;
+    /* mfn of level 1 page table */
+    pt_mfn[level] = next_table_mfn;
+    return 0;
 }
 
 static int update_paging_mode(struct domain *d, unsigned long gfn)
@@ -500,7 +675,7 @@
     struct page_info *new_root = NULL;
     struct page_info *old_root = NULL;
     void *new_root_vaddr;
-    u64 old_root_maddr;
+    unsigned long old_root_mfn;
     struct hvm_iommu *hd = domain_hvm_iommu(d);
 
     level = hd->paging_mode;
@@ -522,12 +697,13 @@
         }
 
         new_root_vaddr = __map_domain_page(new_root);
-        old_root_maddr = page_to_maddr(old_root);
-        amd_iommu_set_page_directory_entry((u32 *)new_root_vaddr,
-                                           old_root_maddr, level);
+        old_root_mfn = page_to_mfn(old_root);
+        set_iommu_pde_present(new_root_vaddr, old_root_mfn, level,
+                              !!IOMMUF_writable, !!IOMMUF_readable);
         level++;
         old_root = new_root;
         offset >>= PTE_PER_TABLE_SHIFT;
+        unmap_domain_page(new_root_vaddr);
     }
 
     if ( new_root != NULL )
@@ -575,15 +751,18 @@
 int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn,
                        unsigned int flags)
 {
-    u64 iommu_l2e;
-    int need_flush = 0;
+    bool_t need_flush = 0;
     struct hvm_iommu *hd = domain_hvm_iommu(d);
+    unsigned long pt_mfn[7];
+    unsigned int merge_level;
 
     BUG_ON( !hd->root_table );
 
     if ( iommu_hap_pt_share && is_hvm_domain(d) )
         return 0;
 
+    memset(pt_mfn, 0, sizeof(pt_mfn));
+
     spin_lock(&hd->mapping_lock);
 
     /* Since HVM domain is initialized with 2 level IO page table,
@@ -592,14 +771,14 @@
     {
         if ( update_paging_mode(d, gfn) )
         {
+            spin_unlock(&hd->mapping_lock);
             AMD_IOMMU_DEBUG("Update page mode failed gfn = %lx\n", gfn);
             domain_crash(d);
             return -EFAULT;
         }
     }
 
-    iommu_l2e = iommu_l2e_from_pfn(hd->root_table, hd->paging_mode, gfn);
-    if ( iommu_l2e == 0 )
+    if ( iommu_pde_from_gfn(d, gfn, pt_mfn) || (pt_mfn[1] == 0) )
     {
         spin_unlock(&hd->mapping_lock);
         AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx\n", gfn);
@@ -607,19 +786,48 @@
         return -EFAULT;
     }
 
-    need_flush = set_iommu_l1e_present(iommu_l2e, gfn, (u64)mfn << PAGE_SHIFT,
+    /* Install 4k mapping first */
+    need_flush = set_iommu_pte_present(pt_mfn[1], gfn, mfn,
+                                       IOMMU_PAGING_MODE_LEVEL_1,
                                        !!(flags & IOMMUF_writable),
                                        !!(flags & IOMMUF_readable));
 
-    if ( need_flush )
-        amd_iommu_flush_pages(d, gfn, 0);
+    /* Do not increase pde count if io mapping has not been changed */
+    if ( !need_flush )
+        goto out;
+
+    amd_iommu_flush_pages(d, gfn, 0);
+
+    for ( merge_level = IOMMU_PAGING_MODE_LEVEL_2;
+          merge_level <= hd->paging_mode; merge_level++ )
+    {
+        if ( pt_mfn[merge_level] == 0 )
+            break;
+        if ( !iommu_update_pde_count(d, pt_mfn[merge_level],
+                                     gfn, mfn, merge_level) )
+            break;
+        /* Deallocate lower level page table */
+        free_amd_iommu_pgtable(mfn_to_page(pt_mfn[merge_level - 1]));
+
+        if ( iommu_merge_pages(d, pt_mfn[merge_level], gfn,
+                               flags, merge_level) )
+        {
+            spin_unlock(&hd->mapping_lock);
+            AMD_IOMMU_DEBUG("Merge iommu page failed at level %d, "
+                            "gfn = %lx mfn = %lx\n", merge_level, gfn, mfn);
+            domain_crash(d);
+            return -EFAULT;
+        }
+    }
+
+out:
     spin_unlock(&hd->mapping_lock);
     return 0;
 }
 
 int amd_iommu_unmap_page(struct domain *d, unsigned long gfn)
 {
-    u64 iommu_l2e;
+    unsigned long pt_mfn[7];
     struct hvm_iommu *hd = domain_hvm_iommu(d);
 
     BUG_ON( !hd->root_table );
@@ -627,6 +835,8 @@
     if ( iommu_hap_pt_share && is_hvm_domain(d) )
         return 0;
 
+    memset(pt_mfn, 0, sizeof(pt_mfn));
+
     spin_lock(&hd->mapping_lock);
 
     /* Since HVM domain is initialized with 2 level IO page table,
@@ -635,15 +845,14 @@
     {
         if ( update_paging_mode(d, gfn) )
        {
+            spin_unlock(&hd->mapping_lock);
             AMD_IOMMU_DEBUG("Update page mode failed gfn = %lx\n", gfn);
             domain_crash(d);
             return -EFAULT;
         }
     }
 
-    iommu_l2e = iommu_l2e_from_pfn(hd->root_table, hd->paging_mode, gfn);
-
-    if ( iommu_l2e == 0 )
+    if ( iommu_pde_from_gfn(d, gfn, pt_mfn) || (pt_mfn[1] == 0) )
     {
         spin_unlock(&hd->mapping_lock);
         AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx\n", gfn);
@@ -652,7 +861,7 @@
     }
 
     /* mark PTE as 'page not present' */
-    clear_iommu_l1e_present(iommu_l2e, gfn);
+    clear_iommu_pte_present(pt_mfn[1], gfn);
     spin_unlock(&hd->mapping_lock);
 
     amd_iommu_flush_pages(d, gfn, 0);
diff -r 8d6edc3d26d2 -r 80e9fcdaef36 xen/drivers/passthrough/amd/pci_amd_iommu.c
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c	Sat Aug 13 10:14:58 2011 +0100
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c	Tue Aug 16 15:03:11 2011 +0100
@@ -237,7 +237,9 @@
              * XXX Should we really map all non-RAM (above 4G)? Minimally
              * a pfn_valid() check would seem desirable here.
              */
-            amd_iommu_map_page(d, pfn, pfn, IOMMUF_readable|IOMMUF_writable);
+            if ( mfn_valid(pfn) )
+                amd_iommu_map_page(d, pfn, pfn,
+                                   IOMMUF_readable|IOMMUF_writable);
         }
     }
 
@@ -333,7 +335,8 @@
 {
     void *table_vaddr, *pde;
     u64 next_table_maddr;
-    int index;
+    int index, next_level, present;
+    u32 *entry;
 
     table_vaddr = __map_domain_page(pg);
 
@@ -343,7 +346,18 @@
     {
         pde = table_vaddr + (index * IOMMU_PAGE_TABLE_ENTRY_SIZE);
         next_table_maddr = amd_iommu_get_next_table_from_pte(pde);
-        if ( next_table_maddr != 0 )
+        entry = (u32*)pde;
+
+        next_level = get_field_from_reg_u32(entry[0],
+                                            IOMMU_PDE_NEXT_LEVEL_MASK,
+                                            IOMMU_PDE_NEXT_LEVEL_SHIFT);
+
+        present = get_field_from_reg_u32(entry[0],
+                                         IOMMU_PDE_PRESENT_MASK,
+                                         IOMMU_PDE_PRESENT_SHIFT);
+
+        if ( (next_table_maddr != 0) && (next_level != 0) && present )
         {
             deallocate_next_page_table(
                 maddr_to_page(next_table_maddr), level - 1);

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
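The changeset description is terse about how the coalescing bookkeeping behaves
in practice, so a small standalone sketch follows.  It is an illustration only,
not code from this changeset: an ordinary user-space C program whose constants
and helpers (PTE_PER_TABLE_SHIFT, get_pde_count, set_pde_count,
update_pde_count) merely mirror the Xen names by analogy.  The idea it shows is
that a counter packed into a PDE's ignored bits (bits 1-8 plus bit 63) is
bumped each time a newly installed 4k mapping continues a physically
contiguous, superpage-aligned run, and a merge becomes possible once the
counter reaches 511.

/* Illustrative simulation of the ignored-bits counter; not Xen code. */
#include <stdio.h>
#include <stdint.h>

#define PTE_PER_TABLE_SHIFT 9
#define PTE_PER_TABLE_SIZE  (1u << PTE_PER_TABLE_SHIFT)   /* 512 entries */

/* The count lives in the PDE's ignored bits: low 8 bits in bits 1-8,
 * the 9th bit in bit 63, as in the patch's get_pde_count()/set_pde_count(). */
static unsigned int get_pde_count(uint64_t pde)
{
    return ((pde >> 55) & 0x100) | ((pde >> 1) & 0xFF);
}

static void set_pde_count(uint64_t *pde, unsigned int count)
{
    *pde &= ~((1ULL << 63) | (0xFFULL << 1));
    *pde |= ((uint64_t)(count & 0x100) << 55) | ((uint64_t)(count & 0xFF) << 1);
}

/* Mimics iommu_update_pde_count() for a level-2 PDE: the new 4k mapping
 * (gfn -> mfn) extends a contiguous run only if the run starts at a
 * superpage-aligned first_mfn and gfn's offset inside the 2M region lands
 * exactly on mfn.  Returns 1 when all 512 entries are contiguous. */
static int update_pde_count(uint64_t *pde, unsigned long first_mfn,
                            unsigned long gfn, unsigned long mfn)
{
    unsigned long mask = (1UL << PTE_PER_TABLE_SHIFT) - 1;
    unsigned int count;

    if ( ((first_mfn & mask) == 0) && (((gfn & mask) | first_mfn) == mfn) )
    {
        count = get_pde_count(*pde);
        if ( count == PTE_PER_TABLE_SIZE - 1 )
            return 1;                     /* candidate for merging */
        set_pde_count(pde, count + 1);
    }
    else
        set_pde_count(pde, 0);            /* run broken, start counting over */

    return 0;
}

int main(void)
{
    uint64_t pde = 0;
    unsigned long first_mfn = 0x40000;    /* 2M-aligned start of the run */
    unsigned long gfn;

    for ( gfn = 0x80000; gfn < 0x80000 + PTE_PER_TABLE_SIZE; gfn++ )
    {
        unsigned long mfn = first_mfn + (gfn & (PTE_PER_TABLE_SIZE - 1));

        if ( update_pde_count(&pde, first_mfn, gfn, mfn) )
        {
            printf("gfn %#lx: 512 contiguous entries, merge into a 2M page\n",
                   gfn);
            return 0;
        }
    }
    printf("no merge (count = %u)\n", get_pde_count(pde));
    return 0;
}

In the patch itself this check is performed by amd_iommu_map_page() once per
merge level (level 2 up to the domain's paging mode) after each 4k mapping;
when it succeeds, the now-redundant lower-level table is freed and
iommu_merge_pages() rewrites the PDE as a superpage entry with next level 0.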