
Re: [Xen-devel] [PATCH 2 of 4] xenpaging: use wait queues


  • To: xen-devel@xxxxxxxxxxxxxxxxxxx, olaf@xxxxxxxxx
  • From: "Andres Lagar-Cavilla" <andres@xxxxxxxxxxxxxxxx>
  • Date: Thu, 1 Dec 2011 07:17:23 -0800
  • Delivery-date: Thu, 01 Dec 2011 15:18:04 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xensource.com>

> Date: Thu, 01 Dec 2011 12:09:18 +0100
> From: Olaf Hering <olaf@xxxxxxxxx>
> To: xen-devel@xxxxxxxxxxxxxxxxxxx
> Subject: [Xen-devel] [PATCH 2 of 4] xenpaging: use wait queues
> Message-ID: <8147822efdee65d1f5b9.1322737758@xxxxxxxxxxxx>
> Content-Type: text/plain; charset="us-ascii"
>
> # HG changeset patch
> # User Olaf Hering <olaf@xxxxxxxxx>
> # Date 1322737507 -3600
> # Node ID 8147822efdee65d1f5b94656ab2032aedb76979f
> # Parent  612f69531fd15cf59c58404f6e4762733a9a268c
> xenpaging: use wait queues
>
> Use a wait queue to put a guest vcpu to sleep while the requested gfn is
> in paging state. This adds missing p2m_mem_paging_populate() calls to
> some callers of the new get_gfn* variants, which would crash now
> because they get an invalid mfn. It also fixes guest crashes due to
> unexpected returns from do_memory_op because copy_to/from_guest ran into
> a paged gfn. Now those places will always get a valid mfn.
>
> Since each gfn could be requested by several guest vcpus at the same
> time, a queue of paged gfns is maintained. Each vcpu will be attached to
> that queue. Once p2m_mem_paging_resume() has restored the gfn, the waiting
> vcpus will resume execution.
>
> There is untested code in p2m_mem_paging_init_queue() to allow cpu
> hotplug. Since each vcpu may wait on a different gfn, there have to be as
> many queues as vcpus. But xl vcpu-set does not seem to work right now,
> so this code path can't be exercised yet.
>
>
> Signed-off-by: Olaf Hering <olaf@xxxxxxxxx>
>
> diff -r 612f69531fd1 -r 8147822efdee xen/arch/x86/hvm/hvm.c
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -454,6 +454,8 @@ int hvm_domain_initialise(struct domain
>      spin_lock_init(&d->arch.hvm_domain.irq_lock);
>      spin_lock_init(&d->arch.hvm_domain.uc_lock);
>
> +    spin_lock_init(&d->arch.hvm_domain.gfn_lock);
> +

Probably gfn_lock should be replaced with a more expressive name.
That notwithstanding, it seems this lock only gets used in the mm layer.
If so, it should be declared as an mm_lock_t in arch/x86/mm/mm-locks.h, and
made subject to the usual ordering constraints.

Both gfn_lock and the list of wait queues could be collapsed into a single
struct, so as to reduce pressure on the size of struct domain.
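For illustration, something along these lines is roughly what I have in mind
(a sketch only, with placeholder names; the mm_lock_t would still need its
declare_mm_lock()/ordering entry in mm-locks.h):

    /* Sketch only: placeholder names, not a proposed final interface. */
    struct p2m_mem_paging_waitq {
        mm_lock_t        lock;    /* ordered via mm-locks.h, replaces gfn_lock */
        struct list_head queues;  /* the per-vcpu p2m_mem_paging_queue entries */
        unsigned int     max;     /* number of queue entries allocated so far  */
    };

struct hvm_domain would then carry a single pointer to this, instead of a
spinlock plus a separate queue-head pointer.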

>      INIT_LIST_HEAD(&d->arch.hvm_domain.msixtbl_list);
>      spin_lock_init(&d->arch.hvm_domain.msixtbl_list_lock);
>
> diff -r 612f69531fd1 -r 8147822efdee xen/arch/x86/mm/p2m.c
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -30,6 +30,7 @@
>  #include <asm/p2m.h>
>  #include <asm/hvm/vmx/vmx.h> /* ept_p2m_init() */
>  #include <xen/iommu.h>
> +#include <xen/wait.h>
>  #include <asm/mem_event.h>
>  #include <public/mem_event.h>
>  #include <asm/mem_sharing.h>
> @@ -144,6 +145,182 @@ void p2m_change_entry_type_global(struct
>      p2m_unlock(p2m);
>  }
>
> +#ifdef __x86_64__
> +struct p2m_mem_paging_queue {
> +    struct list_head list;
> +    struct waitqueue_head wq;
> +    unsigned long gfn;
> +    unsigned short waiters;
> +    unsigned short woken;
> +    unsigned short index;
> +};
> +
> +struct p2m_mem_paging_queue_head {
> +    struct list_head list;
> +    unsigned int max;
> +};
> +
> +int p2m_mem_paging_init_queue(struct domain *d, unsigned int max)
> +{
> +    struct p2m_mem_paging_queue_head *h;
> +    struct p2m_mem_paging_queue *q;
> +    unsigned int i, nr;
> +    int ret = 0;
> +
> +    spin_lock(&d->arch.hvm_domain.gfn_lock);
> +
> +    if (!d->arch.hvm_domain.gfn_queue) {
> +        ret = -ENOMEM;
> +        h = xzalloc(struct p2m_mem_paging_queue_head);
> +        if (!h) {
> +            domain_crash(d);
> +            goto out;
> +        }
> +
> +        INIT_LIST_HEAD(&h->list);
> +        nr = max;
> +    } else {
> +        h = d->arch.hvm_domain.gfn_queue;
> +        if (max <= h->max)
> +            goto out;
> +        nr = max - h->max;
> +    }
> +
> +    ret = -ENOMEM;
> +    q = xzalloc_array(struct p2m_mem_paging_queue, nr);
> +    if (!q) {
> +        if (!d->arch.hvm_domain.gfn_queue)
> +            xfree(h);
> +        domain_crash(d);
> +        goto out;
> +    }
> +
> +    for (i = 0; i < nr; i++) {
> +        init_waitqueue_head(&q[i].wq);
> +        INIT_LIST_HEAD(&q[i].list);
> +        q[i].index = h->max + i + 1;
> +        list_add_tail(&q[i].list, &h->list);
> +    }
> +
> +    h->max = max;
> +    d->arch.hvm_domain.gfn_queue = h;
> +    ret = 0;
> +
> +out:
> +    spin_unlock(&d->arch.hvm_domain.gfn_lock);
> +    return ret;
> +}
> +
> +static struct p2m_mem_paging_queue *p2m_mem_paging_get_queue(struct
> domain *d, unsigned long gfn)
> +{
> +    struct p2m_mem_paging_queue_head *h;
> +    struct p2m_mem_paging_queue *q, *q_match, *q_free;
> +
> +    h = d->arch.hvm_domain.gfn_queue;
> +    q_match = q_free = NULL;
> +
> +    spin_lock(&d->arch.hvm_domain.gfn_lock);
> +
> +    list_for_each_entry(q, &h->list, list) {
"Hashing" the gfn ( i.e. gfn & ((1 << order_of_vcpus) - 1) ) and starting
the linear scan from there would shortcut the common case of finding a
queued gfn.

Checking the p2m type of the gfn first (is it on the page-in path at all?)
would make the search O(1) for the case in which the gfn is not queued.
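Purely as an illustration of what I mean, assuming the entries remain
addressable as the array they were allocated as, with the slot count rounded
up to a power of two:

    /* Sketch only: probe the array starting at a gfn-derived slot.  If
     * insertion uses the same probe order, a queued gfn is normally found
     * on the very first probe. */
    static struct p2m_mem_paging_queue *
    paging_queue_find(struct p2m_mem_paging_queue *q, unsigned int slots,
                      unsigned long gfn)
    {
        unsigned int start = gfn & (slots - 1);   /* slots is a power of two */
        unsigned int i;

        for ( i = 0; i < slots; i++ )
        {
            struct p2m_mem_paging_queue *e = &q[(start + i) & (slots - 1)];

            if ( e->waiters && e->gfn == gfn )
                return e;                    /* common case: immediate hit */
        }

        return NULL;        /* not queued; caller claims a free slot instead */
    }

The power-of-two slot count is only there to make the mask cheap; any
deterministic start slot derived from the gfn would do.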

> +        if (q->gfn == gfn) {
> +            q_match = q;
> +            break;
> +        }
> +        if (!q_free && !q->waiters)
> +            q_free = q;
> +    }
> +
> +    if (!q_match && q_free)
> +        q_match = q_free;
> +
> +    if (q_match) {
> +        if (q_match->woken)
> +            printk("wq woken for gfn %u:%u %lx %u %u %u\n",
> current->domain->domain_id, current->vcpu_id, gfn, q_match->index,
> q_match->woken, q_match->waiters);
> +        q_match->waiters++;
> +        q_match->gfn = gfn;
> +    }
> +
> +    if (!q_match)
> +        printk("No wq_get for gfn %u:%u %lx\n",
> current->domain->domain_id, current->vcpu_id, gfn);
> +
> +    spin_unlock(&d->arch.hvm_domain.gfn_lock);
> +    return q_match;
> +}
> +
> +static void p2m_mem_paging_put_queue(struct domain *d, struct
> p2m_mem_paging_queue *q_match)
> +{
> +    spin_lock(&d->arch.hvm_domain.gfn_lock);
> +
> +    if (q_match->waiters == 0)
> +        printk("wq_put no waiters, gfn %u:%u %lx %u\n",
> current->domain->domain_id, current->vcpu_id, q_match->gfn,
> q_match->woken);
> +    else if (--q_match->waiters == 0)
> +        q_match->gfn = q_match->woken = 0;;
> +
> +    spin_unlock(&d->arch.hvm_domain.gfn_lock);
> +}
> +
> +static void p2m_mem_paging_wake_queue(struct domain *d, unsigned long
> gfn)
> +{
> +    struct p2m_mem_paging_queue_head *h;
> +    struct p2m_mem_paging_queue *q, *q_match = NULL;
> +
> +    spin_lock(&d->arch.hvm_domain.gfn_lock);
> +
> +    h = d->arch.hvm_domain.gfn_queue;
Same comment here, regarding the cost of searching for the gfn.

> +    list_for_each_entry(q, &h->list, list) {
> +        if (q->gfn == gfn) {
> +            q_match = q;
> +            break;
> +        }
> +    }
> +    if (q_match) {
> +        if (q_match->woken || q_match->waiters == 0)
> +            printk("Wrong wake for gfn %u:%u %p %lx %u %u\n",
> current->domain->domain_id, current->vcpu_id, q_match, gfn,
> q_match->woken, q_match->waiters);
> +        q_match->woken++;
> +        wake_up_all(&q_match->wq);
> +    }
> +    spin_unlock(&d->arch.hvm_domain.gfn_lock);
> +}
> +
> +/* Returns 0 if the gfn is still paged */
> +static int p2m_mem_paging_get_entry(mfn_t *mfn,
> +               struct p2m_domain *p2m, unsigned long gfn,
> +               p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
> +               unsigned int *page_order)
> +{
> +    *mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order);
This will be called from the wake-up path (as the wait_event() condition),
and it will be racy against any concurrent p2m updates.
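E.g. something like this (a sketch only; it assumes the lock ordering lets
us take the p2m lock from within the wait_event() condition):

    /* Sketch only: do the lookup under the p2m lock so the wait condition
     * cannot race with a concurrent p2m update. */
    static int p2m_mem_paging_get_entry(mfn_t *mfn,
                   struct p2m_domain *p2m, unsigned long gfn,
                   p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
                   unsigned int *page_order)
    {
        int paged;

        p2m_lock(p2m);
        *mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order);
        paged = p2m_is_paging(*t);
        p2m_unlock(p2m);

        return !paged;
    }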

> +
> +    return p2m_is_paging(*t) ? 0 : 1;
> +}
> +
> +/* Go to sleep in case of guest access */
> +static void p2m_mem_paging_wait(mfn_t *mfn,
> +                    struct p2m_domain *p2m, unsigned long gfn,
> +                    p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
> +                    unsigned int *page_order)
> +{
> +    struct p2m_mem_paging_queue *pmpq;
> +
> +    /* Return p2mt as is in case of query */
> +    if ( q == p2m_query )
> +        return;
> +    /* Foreign domains can not go to sleep */
> +    if ( current->domain != p2m->domain )
> +        return;
> +
> +    pmpq = p2m_mem_paging_get_queue(p2m->domain, gfn);
> +    if ( !pmpq )
> +        return;
> +
> +    /* Populate the page once */
> +    if ( *t == p2m_ram_paging_out || *t == p2m_ram_paged )
> +        p2m_mem_paging_populate(p2m->domain, gfn);
> +
> +    wait_event(pmpq->wq, p2m_mem_paging_get_entry(mfn, p2m, gfn, t, a, q,
> page_order));
> +    p2m_mem_paging_put_queue(p2m->domain, pmpq);
> +}
> +#endif
> +
>  mfn_t get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
>                      p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
>                      unsigned int *page_order)
> @@ -161,6 +338,11 @@ mfn_t get_gfn_type_access(struct p2m_dom
>      mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order);
>
>  #ifdef __x86_64__
> +    if ( unlikely(p2m_is_paging(*t)) )
> +        p2m_mem_paging_wait(&mfn, p2m, gfn, t, a, q, page_order);
> +#endif
> +
> +#ifdef __x86_64__
>      if ( q == p2m_unshare && p2m_is_shared(*t) )
>      {
>          ASSERT(!p2m_is_nestedp2m(p2m));
> @@ -914,54 +1096,42 @@ void p2m_mem_paging_drop_page(struct dom
>  void p2m_mem_paging_populate(struct domain *d, unsigned long gfn)
>  {
>      struct vcpu *v = current;
> -    mem_event_request_t req;
> +    mem_event_request_t req = { .type = MEM_EVENT_TYPE_PAGING, .gfn = gfn
> };
>      p2m_type_t p2mt;
>      p2m_access_t a;
>      mfn_t mfn;
>      struct p2m_domain *p2m = p2m_get_hostp2m(d);
> +    int put_request = 0;
>
>      /* Check that there's space on the ring for this request */
>      if ( mem_event_check_ring(d, &d->mem_event->paging) )
>          return;
>
> -    memset(&req, 0, sizeof(req));
> -    req.type = MEM_EVENT_TYPE_PAGING;
> -
>      /* Fix p2m mapping */
>      p2m_lock(p2m);
>      mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, p2m_query, NULL);
> -    /* Allow only nominated or evicted pages to enter page-in path */
> -    if ( p2mt == p2m_ram_paging_out || p2mt == p2m_ram_paged )
> -    {
> -        /* Evict will fail now, tag this request for pager */
> -        if ( p2mt == p2m_ram_paging_out )
> -            req.flags |= MEM_EVENT_FLAG_EVICT_FAIL;
> +    /* Forward the state only if gfn is in page-out path */
> +    if ( p2mt == p2m_ram_paging_out || p2mt == p2m_ram_paged ) {
> +        /* Ignore foreign requests to allow mmap in pager */
> +        if ( mfn_valid(mfn) && p2mt == p2m_ram_paging_out && v->domain ==
> d ) {
> +            /* Restore gfn because it is needed by guest before evict */
> +            set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K,
> +                       paging_mode_log_dirty(d) ? p2m_ram_logdirty :
> p2m_ram_rw, a);
No event here to tell the pager to cancel its work?

You're not just adding wait queues, but also short-cutting the state
machine of paging, which, whilst delicate, works quite well right now. You
should definitely split that into two patches if possible.

And please keep in mind that there are pagers other than xenpaging, so
interface churn is a big headache for everyone, even when it is unavoidable.
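To make the first question above concrete: something along these lines (a
sketch only, reusing the existing MEM_EVENT_FLAG_EVICT_FAIL flag) would keep
telling the pager that its in-flight evict is going to fail, even in the
branch that restores the gfn for the guest. Whether the pager wants that
reported differently from a page-in is, of course, part of the interface
question:

    /* Sketch only: still notify the pager when its page-out is cancelled. */
    if ( mfn_valid(mfn) && p2mt == p2m_ram_paging_out && v->domain == d )
    {
        /* Restore the gfn for the guest before the evict completes ... */
        set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K,
                      paging_mode_log_dirty(d) ? p2m_ram_logdirty : p2m_ram_rw, a);
        /* ... but still tag a request so the pager can drop its pending work. */
        req.flags |= MEM_EVENT_FLAG_EVICT_FAIL;
        put_request = 1;
    }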

> +        } else {
> +            set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K,
> p2m_ram_paging_in_start, a);
> +            put_request = 1;
> +        }
> +        /* Evict will fail now, the pager has to try another gfn */
>
> -        set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K,
> p2m_ram_paging_in_start, a);
>          audit_p2m(p2m, 1);
>      }
>      p2m_unlock(p2m);
>
> -    /* Pause domain if request came from guest and gfn has paging type */
> -    if (  p2m_is_paging(p2mt) && v->domain == d )
> -    {
> -        vcpu_pause_nosync(v);
> -        req.flags |= MEM_EVENT_FLAG_VCPU_PAUSED;
> -    }
> -    /* No need to inform pager if the gfn is not in the page-out path */
> -    else if ( p2mt != p2m_ram_paging_out && p2mt != p2m_ram_paged )
> -    {
> -        /* gfn is already on its way back and vcpu is not paused */
> +    /* One request per gfn, guest vcpus go to sleep, foreigners try again
> */
> +    if ( put_request )
> +        mem_event_put_request(d, &d->mem_event->paging, &req);
> +    else
>          mem_event_put_req_producers(d, &d->mem_event->paging);
These (mem_event_put_req_producers, foreign_producers) are the kinds of
things that are really confusing on the ring-management side right now.
Thanks, good stuff
Andres
> -        return;
> -    }
> -
> -    /* Send request to pager */
> -    req.gfn = gfn;
> -    req.p2mt = p2mt;
> -    req.vcpu_id = v->vcpu_id;
> -
> -    mem_event_put_request(d, &d->mem_event->paging, &req);
>  }
>
>  /**
> @@ -1062,12 +1232,11 @@ void p2m_mem_paging_resume(struct domain
>          p2m_unlock(p2m);
>      }
>
> -    /* Unpause domain */
> -    if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED )
> -        vcpu_unpause(d->vcpu[rsp.vcpu_id]);
> -
>      /* Wake vcpus waiting for room in the ring */
>      mem_event_wake_requesters(&d->mem_event->paging);
> +
> +    /* Unpause all vcpus that were paused because the gfn was paged */
> +    p2m_mem_paging_wake_queue(d, rsp.gfn);
>  }
>
>  void p2m_mem_access_check(unsigned long gpa, bool_t gla_valid, unsigned
> long gla,
> diff -r 612f69531fd1 -r 8147822efdee xen/common/domctl.c
> --- a/xen/common/domctl.c
> +++ b/xen/common/domctl.c
> @@ -547,6 +547,9 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
>                  goto maxvcpu_out;
>          }
>
> +        if ( p2m_mem_paging_init_queue(d, max) )
> +            goto maxvcpu_out;
> +
>          ret = 0;
>
>      maxvcpu_out:
> diff -r 612f69531fd1 -r 8147822efdee xen/include/asm-x86/hvm/domain.h
> --- a/xen/include/asm-x86/hvm/domain.h
> +++ b/xen/include/asm-x86/hvm/domain.h
> @@ -91,6 +91,9 @@ struct hvm_domain {
>
>      struct viridian_domain viridian;
>
> +    spinlock_t                        gfn_lock;
> +    struct p2m_mem_paging_queue_head *gfn_queue;
> +
>      bool_t                 hap_enabled;
>      bool_t                 mem_sharing_enabled;
>      bool_t                 qemu_mapcache_invalidate;
> diff -r 612f69531fd1 -r 8147822efdee xen/include/asm-x86/p2m.h
> --- a/xen/include/asm-x86/p2m.h
> +++ b/xen/include/asm-x86/p2m.h
> @@ -468,6 +468,8 @@ p2m_pod_offline_or_broken_replace(struct
>  /* Modify p2m table for shared gfn */
>  int set_shared_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
>
> +/* Initialize per-gfn wait queue */
> +int p2m_mem_paging_init_queue(struct domain *d, unsigned int max);
>  /* Check if a nominated gfn is valid to be paged out */
>  int p2m_mem_paging_nominate(struct domain *d, unsigned long gfn);
>  /* Evict a frame */
>



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 

