
Re: [Xen-devel] [PATCH v3 8/9] libxc: rework of domain builder's page table handler



On 10/29/2015 01:48 PM, Wei Liu wrote:
On Tue, Oct 13, 2015 at 03:11:17PM +0200, Juergen Gross wrote:
In order to prepare a p2m list outside of the initial kernel mapping,
do a rework of the domain builder's page table handler. The goal is
to be able to use common helpers for page table allocation and setup,
both for the initial kernel page tables and for the page tables
mapping the p2m list. This is achieved by supporting multiple mapping
areas. The mapped virtual addresses of the single areas must not
overlap, while the page tables of a newly added area might already be
partially present. In particular, the top level page table exists
only once, of course.


Currently the number of mappings is restricted to 1 because the only
mapping for now is the initial mapping created by the toolstack. There
should be no behaviour change or guest visible change introduced.

If my understanding is correct, can you please add that to the commit
message?

Sure.

I'm currently thinking about changing this patch even further. While
doing similar work in grub-xen I found the page table building there
to be much more generic and compact. Instead of open coding the
different page table levels, everything is done there in a loop over
those levels. The main loop consists of only 33 lines (and that is
after adding support for multiple mapping areas)!
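
To give an idea of the principle, here is a minimal self-contained
sketch (not the actual grub-xen code; constants and names are mine,
hard coded for 64 bit x86) counting the page tables for all levels in
one loop:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define LVL_BITS    9   /* 512 entries per page table */
#define NR_LEVELS   4   /* 64 bit x86 */

int main(void)
{
    /* Example range: a 16 MiB kernel mapping. */
    uint64_t from = 0xffffffff80000000ULL;
    uint64_t to   = 0xffffffff80ffffffULL;
    int lvl, total = 0;

    for ( lvl = NR_LEVELS - 1; lvl >= 0; lvl-- )
    {
        /* Address bits covered by one table at this level: an L1
         * table covers 21 bits (2 MiB), an L4 table 48 bits. */
        int bits = PAGE_SHIFT + (lvl + 1) * LVL_BITS;
        uint64_t mask  = ((uint64_t)1 << bits) - 1;
        uint64_t start = from & ~mask;              /* round_down() */
        uint64_t end   = to | mask;                 /* round_up() */
        int tables = (int)(((end - start) >> bits) + 1);

        printf("L%d: %d table(s)\n", lvl + 1, tables);
        total += tables;
    }
    printf("total: %d page table page(s)\n", total);
    return 0;
}

The 32 bit variants would then differ only in the constants (modulo
the PAE special case visible in the patch below).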

What do you think?

Given this is a particularly thorny area, a second eye would be much
appreciated.

On the positive side: any bug in this code should be really easy to
spot, as the domain wouldn't be able to work reliably (this was at least
my experience when developing this patch and the related one in grub).

Some comments below.

Signed-off-by: Juergen Gross <jgross@xxxxxxxx>
---
  tools/libxc/xc_dom_x86.c | 404 ++++++++++++++++++++++++++++-------------------
  1 file changed, 240 insertions(+), 164 deletions(-)

diff --git a/tools/libxc/xc_dom_x86.c b/tools/libxc/xc_dom_x86.c
index c815e10..333ef6b 100644
--- a/tools/libxc/xc_dom_x86.c
+++ b/tools/libxc/xc_dom_x86.c
@@ -65,17 +65,27 @@
  #define NR_IOREQ_SERVER_PAGES 8
  #define ioreq_server_pfn(x) (special_pfn(0) - NR_IOREQ_SERVER_PAGES + (x))

-#define bits_to_mask(bits)       (((xen_vaddr_t)1 << (bits))-1)
+#define bits_to_mask(bits)       (((xen_vaddr_t)1 << (bits)) - 1)

Whitespace-only change here.

Oops, sorry.


  #define round_down(addr, mask)   ((addr) & ~(mask))
  #define round_up(addr, mask)     ((addr) | (mask))

-struct xc_dom_image_x86 {
-    /* initial page tables */
+struct xc_dom_x86_mapping_lvl {
+    xen_vaddr_t from;
+    xen_vaddr_t to;
+    xen_pfn_t pfn;
      unsigned int pgtables;
-    unsigned int pg_l4;
-    unsigned int pg_l3;
-    unsigned int pg_l2;
-    unsigned int pg_l1;
+};
+
+struct xc_dom_x86_mapping {
+    struct xc_dom_x86_mapping_lvl area;
+    struct xc_dom_x86_mapping_lvl lvls[4];
+    xen_pfn_t pfn_start;

This is unused throughout the patch and the next patch. You can delete it.

Indeed.


+};
+
+struct xc_dom_image_x86 {
+    unsigned n_mappings;
+#define MAPPING_MAX 1
+    struct xc_dom_x86_mapping maps[MAPPING_MAX];
  };

  /* get guest IO ABI protocol */
@@ -105,43 +115,107 @@ const char *xc_domain_get_native_protocol(xc_interface *xch,
      return protocol;
  }

-static unsigned long
-nr_page_tables(struct xc_dom_image *dom,
-               xen_vaddr_t start, xen_vaddr_t end, unsigned long bits)
+static void
+nr_page_tables(struct xc_dom_image *dom, int lvl,
+               xen_vaddr_t from, xen_vaddr_t to, unsigned long bits)
  {
      xen_vaddr_t mask = bits_to_mask(bits);
-    int tables;
+    struct xc_dom_image_x86 *domx86 = dom->arch_private;
+    struct xc_dom_x86_mapping *map = domx86->maps + domx86->n_mappings;
+    struct xc_dom_x86_mapping *map_cmp;
+    unsigned map_n;

      if ( bits == 0 )
-        return 0;  /* unused */
+        return;  /* unused */
+
+    if ( lvl == 3 && domx86->n_mappings != 0 )
+        return;  /* Top level page table already in first mapping. */

      if ( bits == (8 * sizeof(unsigned long)) )
      {
-        /* must be pgd, need one */
-        start = 0;
-        end = -1;
-        tables = 1;
+        /* 32 bit top level page table special case */
+        map->lvls[lvl].from = 0;
+        map->lvls[lvl].to = -1;
+        map->lvls[lvl].pgtables = 1;
+        goto done;
      }
-    else
+
+    from = round_down(from, mask);
+    to = round_up(to, mask);
+
+    for ( map_n = 0, map_cmp = domx86->maps; map_n < domx86->n_mappings;
+          map_n++, map_cmp++ )
      {
-        start = round_down(start, mask);
-        end = round_up(end, mask);
-        tables = ((end - start) >> bits) + 1;
+        if ( map_cmp->lvls[lvl].from == map_cmp->lvls[lvl].to )
+            continue;
+        if ( from >= map_cmp->lvls[lvl].from && to <= map_cmp->lvls[lvl].to )
+            return;  /* Area already completely covered on this level. */
+        if ( from >= map_cmp->lvls[lvl].from && from <= map_cmp->lvls[lvl].to )
+            from = map_cmp->lvls[lvl].to + 1;
+        if ( to >= map_cmp->lvls[lvl].from && to <= map_cmp->lvls[lvl].to )
+            to = map_cmp->lvls[lvl].from - 1;

Is it possible that a later mapping covers previous ones? How is that
handled?

I'm not sure I understand your question.

In case you are asking whether different mappings are allowed to overlap
in terms of virtual addresses: no. I haven't added any checking code to
ensure this. I can do it, if you want.
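
Something like the following would do, I think (an untested sketch;
it could be called from count_pgtables() before the new mapping is
filled in):

/* Untested sketch: reject a new area overlapping an already
 * registered mapping in terms of virtual addresses. */
static int check_mapping_overlap(struct xc_dom_image *dom,
                                 xen_vaddr_t from, xen_vaddr_t to)
{
    struct xc_dom_image_x86 *domx86 = dom->arch_private;
    struct xc_dom_x86_mapping *map_cmp;
    unsigned map_n;

    for ( map_n = 0, map_cmp = domx86->maps;
          map_n < domx86->n_mappings; map_n++, map_cmp++ )
    {
        if ( from <= map_cmp->area.to && to >= map_cmp->area.from )
        {
            xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                         "%s: new area overlaps mapping %u",
                         __FUNCTION__, map_n);
            return -EINVAL;
        }
    }

    return 0;
}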


      }

+    map->lvls[lvl].from = from;
+    map->lvls[lvl].to = to;
+    map->lvls[lvl].pgtables = ((to - from) >> bits) + 1;
+
+ done:
      DOMPRINTF("%s: 0x%016" PRIx64 "/%ld: 0x%016" PRIx64
-              " -> 0x%016" PRIx64 ", %d table(s)",
-              __FUNCTION__, mask, bits, start, end, tables);
-    return tables;
+              " -> 0x%016" PRIx64 ", %d table(s)", __FUNCTION__, mask, bits,
+              map->lvls[lvl].from, map->lvls[lvl].to, map->lvls[lvl].pgtables);
  }

-static int alloc_pgtables(struct xc_dom_image *dom, int pae,
-                          int l4_bits, int l3_bits, int l2_bits, int l1_bits)
+static int count_pgtables(struct xc_dom_image *dom, int pae, int bits[4],
+                          xen_vaddr_t from, xen_vaddr_t to, xen_pfn_t pfn)
+{
+    struct xc_dom_image_x86 *domx86 = dom->arch_private;
+    struct xc_dom_x86_mapping *map;
+    xen_pfn_t pfn_end;
+    int level;
+
+    if ( domx86->n_mappings == MAPPING_MAX )
+    {
+        xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
+                     "%s: too many mappings\n", __FUNCTION__);
+        return -ENOMEM;
+    }
+    map = domx86->maps + domx86->n_mappings;
+
+    pfn_end = pfn + ((to - from) >> PAGE_SHIFT_X86);
+    if ( pfn_end >= dom->p2m_size )
+    {
+        xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
+                     "%s: not enough memory for initial mapping (%#"PRIpfn" > 
%#"PRIpfn")",
+                     __FUNCTION__, pfn_end, dom->p2m_size);
+        return -ENOMEM;
+    }
+
+    memset(map, 0, sizeof(*map));
+
+    map->area.from = from;
+    map->area.to = to;
+    for (level = 3; level >= 0; level--)
+    {
+        map->lvls[level].pfn = pfn + map->area.pgtables;
+        nr_page_tables(dom, level, from, to, bits[level]);
+        if ( pae && to < 0xc0000000 && level == 1)
+        {
+            DOMPRINTF("%s: PAE: extra l2 page table for l3#3", __FUNCTION__);
+            map->lvls[level].pgtables++;
+        }
+        map->area.pgtables += map->lvls[level].pgtables;
+    }
+
+    return 0;
+}
+
+static int alloc_pgtables(struct xc_dom_image *dom, int pae, int bits[4])
  {
      int pages, extra_pages;
      xen_vaddr_t try_virt_end;
-    xen_pfn_t try_pfn_end;
      struct xc_dom_image_x86 *domx86 = dom->arch_private;
+    struct xc_dom_x86_mapping *map = domx86->maps + domx86->n_mappings;

      extra_pages = dom->alloc_bootstack ? 1 : 0;
      extra_pages += 128; /* 512kB padding */

Hmm... Not really related to this patch: Should this be derived from
PAGE_SIZE_X86 as well?

Probably, yes.
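
E.g. (untested, keeping the 512kB amount but expressing it in guest
pages):

    extra_pages += 512 * 1024 / PAGE_SIZE_X86; /* 512kB padding */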

The rest looks OK to me.


Thanks,

Juergen

