[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [RFC XEN PATCH 03/16] xen/x86: add a hypercall XENPF_pmem_add to report host pmem regions



On Mon, Oct 10, 2016 at 08:32:22AM +0800, Haozhong Zhang wrote:
> Xen hypervisor does not include a pmem driver. Instead, it relies on the
> pmem driver in Dom0 to report the PFN ranges of the entire pmem region,
> its reserved area and data area via XENPF_pmem_add. The reserved area is
> used by Xen hypervisor to place the frame table and M2P table, and is
> disallowed to be accessed from Dom0 once it's reported.
> 
> Signed-off-by: Haozhong Zhang <haozhong.zhang@xxxxxxxxx>
> ---
> Cc: Jan Beulich <jbeulich@xxxxxxxx>
> Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
> Cc: Daniel De Graaf <dgdegra@xxxxxxxxxxxxx>
> ---
>  xen/arch/x86/Makefile             |   1 +
>  xen/arch/x86/platform_hypercall.c |   7 ++
>  xen/arch/x86/pmem.c               | 161 
> ++++++++++++++++++++++++++++++++++++++
>  xen/arch/x86/x86_64/mm.c          |  54 +++++++++++++
>  xen/include/asm-x86/mm.h          |   4 +
>  xen/include/public/platform.h     |  14 ++++
>  xen/include/xen/pmem.h            |  31 ++++++++
>  xen/xsm/flask/hooks.c             |   1 +
>  8 files changed, 273 insertions(+)
>  create mode 100644 xen/arch/x86/pmem.c
>  create mode 100644 xen/include/xen/pmem.h
> 
> diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
> index 931917d..9cf2da1 100644
> --- a/xen/arch/x86/Makefile
> +++ b/xen/arch/x86/Makefile
> @@ -67,6 +67,7 @@ obj-$(CONFIG_TBOOT) += tboot.o
>  obj-y += hpet.o
>  obj-y += vm_event.o
>  obj-y += xstate.o
> +obj-y += pmem.o

If possible please keep this alphabetical. Also I wonder if it makes
sense to have CONFIG_PMEM or such?

>  
>  x86_emulate.o: x86_emulate/x86_emulate.c x86_emulate/x86_emulate.h
>  
> diff --git a/xen/arch/x86/platform_hypercall.c 
> b/xen/arch/x86/platform_hypercall.c
> index 0879e19..c47eea4 100644
> --- a/xen/arch/x86/platform_hypercall.c
> +++ b/xen/arch/x86/platform_hypercall.c
> @@ -24,6 +24,7 @@
>  #include <xen/pmstat.h>
>  #include <xen/irq.h>
>  #include <xen/symbols.h>
> +#include <xen/pmem.h>
>  #include <asm/current.h>
>  #include <public/platform.h>
>  #include <acpi/cpufreq/processor_perf.h>
> @@ -822,6 +823,12 @@ ret_t 
> do_platform_op(XEN_GUEST_HANDLE_PARAM(xen_platform_op_t) u_xenpf_op)
>      }
>      break;
>  
> +    case XENPF_pmem_add:

Missing call to ret = xsm_resource_plug_core(XSM_HOOK);
or something similar .

> +        ret = pmem_add(op->u.pmem_add.spfn, op->u.pmem_add.epfn,
> +                       op->u.pmem_add.rsv_spfn, op->u.pmem_add.rsv_epfn,
> +                       op->u.pmem_add.data_spfn, op->u.pmem_add.data_epfn);
> +        break;
> +
>      default:
>          ret = -ENOSYS;
>          break;
> diff --git a/xen/arch/x86/pmem.c b/xen/arch/x86/pmem.c
> new file mode 100644
> index 0000000..70358ed
> --- /dev/null
> +++ b/xen/arch/x86/pmem.c
> @@ -0,0 +1,161 @@
> +/******************************************************************************
> + * arch/x86/pmem.c
> + *
> + * Copyright (c) 2016, Intel Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.

Hm, please consult Intel lawyers with what '(at your option)' what other
later versions they are comfortable with.

> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; If not, see <http://www.gnu.org/licenses/>.
> + *
> + * Author: Haozhong Zhang <haozhong.zhang@xxxxxxxxx>
> + */
> +
> +#include <xen/guest_access.h>
> +#include <xen/list.h>
> +#include <xen/spinlock.h>
> +#include <xen/pmem.h>

Since this is a new file could I ask you sort these alphabetically?

> +#include <xen/iocap.h>
> +#include <asm-x86/mm.h>
> +
> +/*
> + * All pmem regions reported from Dom0 are linked in pmem_list, which
> + * is proected by pmem_list_lock. Its entries are of type struct pmem

protected
> + * and sorted incrementally by field spa.
> + */
> +static DEFINE_SPINLOCK(pmem_list_lock);
> +static LIST_HEAD(pmem_list);
> +
> +struct pmem {
> +    struct list_head link;   /* link to pmem_list */
> +    unsigned long spfn;      /* start PFN of the whole pmem region */
> +    unsigned long epfn;      /* end PFN of the whole pmem region */
> +    unsigned long rsv_spfn;  /* start PFN of the reserved area */
> +    unsigned long rsv_epfn;  /* end PFN of the reserved area */
> +    unsigned long data_spfn; /* start PFN of the data area */
> +    unsigned long data_epfn; /* end PFN of the data area */

Why not just:
struct pmem {
        struct list_head link;
        struct xenpf_pmem_add pmem;
}

or such?

> +};
> +
> +static int is_included(unsigned long s1, unsigned long e1,

bool?
> +                       unsigned long s2, unsigned long e2)
> +{
> +    return s1 <= s2 && s2 < e2 && e2 <= e1;

Is the s2 < e2 necessary?

> +}
> +
> +static int is_overlaped(unsigned long s1, unsigned long e1,

overlapped and perhaps bool?

> +                        unsigned long s2, unsigned long e2)
> +{
> +    return (s1 <= s2 && s2 < e1) || (s2 < s1 && s1 < e2);
> +}
> +
> +static int check_reserved_size(unsigned long rsv_mfns, unsigned long 
> total_mfns)

bool?
> +{
> +    return rsv_mfns >=
> +        ((sizeof(struct page_info) * total_mfns) >> PAGE_SHIFT) +
> +        ((sizeof(*machine_to_phys_mapping) * total_mfns) >> PAGE_SHIFT);
> +}
> +
> +static int pmem_add_check(unsigned long spfn, unsigned long epfn,

bool?
> +                          unsigned long rsv_spfn, unsigned long rsv_epfn,
> +                          unsigned long data_spfn, unsigned long data_epfn)
> +{
> +    if ( spfn >= epfn || rsv_spfn >= rsv_epfn || data_spfn >= data_epfn )
> +        return 0;

Hm, I think it ought to be possible to have no rsv area..?

> +
> +    if ( !is_included(spfn, epfn, rsv_spfn, rsv_epfn) ||
> +         !is_included(spfn, epfn, data_spfn, data_epfn) )
> +        return 0;
> +
> +    if ( is_overlaped(rsv_spfn, rsv_epfn, data_spfn, data_epfn) )
> +        return 0;
> +
> +    if ( !check_reserved_size(rsv_epfn - rsv_spfn, epfn - spfn) )
> +        return 0;
> +
> +    return 1;
> +}
> +
> +static int pmem_list_add(unsigned long spfn, unsigned long epfn,
> +                         unsigned long rsv_spfn, unsigned long rsv_epfn,
> +                         unsigned long data_spfn, unsigned long data_epfn)
> +{
> +    struct list_head *cur;
> +    struct pmem *new_pmem;
> +    int ret = 0;
> +
> +    spin_lock(&pmem_list_lock);
> +
> +    list_for_each_prev(cur, &pmem_list)
> +    {
> +        struct pmem *cur_pmem = list_entry(cur, struct pmem, link);
> +        unsigned long cur_spfn = cur_pmem->spfn;
> +        unsigned long cur_epfn = cur_pmem->epfn;
> +
> +        if ( (cur_spfn <= spfn && spfn < cur_epfn) ||
> +             (spfn <= cur_spfn && cur_spfn < epfn) )
> +        {
> +            ret = -EINVAL;
> +            goto out;
> +        }
> +
> +        if ( cur_spfn < spfn )
> +            break;
> +    }
> +
> +    new_pmem = xmalloc(struct pmem);
> +    if ( !new_pmem )
> +    {
> +        ret = -ENOMEM;
> +        goto out;
> +    }
> +    new_pmem->spfn      = spfn;
> +    new_pmem->epfn      = epfn;
> +    new_pmem->rsv_spfn  = rsv_spfn;
> +    new_pmem->rsv_epfn  = rsv_epfn;
> +    new_pmem->data_spfn = data_spfn;
> +    new_pmem->data_epfn = data_epfn;
> +    list_add(&new_pmem->link, cur);
> +
> + out:
> +    spin_unlock(&pmem_list_lock);
> +    return ret;
> +}
> +
> +int pmem_add(unsigned long spfn, unsigned long epfn,
> +             unsigned long rsv_spfn, unsigned long rsv_epfn,
> +             unsigned long data_spfn, unsigned long data_epfn)
> +{
> +    int ret;
> +
> +    if ( !pmem_add_check(spfn, epfn, rsv_spfn, rsv_epfn, data_spfn, 
> data_epfn) )
> +        return -EINVAL;
> +
> +    ret = pmem_setup(spfn, epfn, rsv_spfn, rsv_epfn, data_spfn, data_epfn);
> +    if ( ret )
> +        goto out;
> +
> +    ret = iomem_deny_access(current->domain, rsv_spfn, rsv_epfn);
> +    if ( ret )
> +        goto out;
> +
> +    ret = pmem_list_add(spfn, epfn, rsv_spfn, rsv_epfn, data_spfn, 
> data_epfn);
> +    if ( ret )
> +        goto out;
> +
> +    printk(XENLOG_INFO
> +           "pmem: pfns     0x%lx - 0x%lx\n"
> +           "      reserved 0x%lx - 0x%lx\n"
> +           "      data     0x%lx - 0x%lx\n",
> +           spfn, epfn, rsv_spfn, rsv_epfn, data_spfn, data_epfn);
> +
> + out:
> +    return ret;
> +}
> diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
> index 5c0f527..b1f92f6 100644
> --- a/xen/arch/x86/x86_64/mm.c
> +++ b/xen/arch/x86/x86_64/mm.c
> @@ -1474,6 +1474,60 @@ destroy_frametable:
>      return ret;
>  }
>  
> +int pmem_setup(unsigned long spfn, unsigned long epfn,
> +               unsigned long rsv_spfn, unsigned long rsv_epfn,
> +               unsigned long data_spfn, unsigned long data_epfn)
> +{
> +    unsigned old_max = max_page, old_total = total_pages;
> +    struct mem_hotadd_info info =
> +        { .spfn = spfn, .epfn = epfn, .cur = spfn };
> +    struct mem_hotadd_info rsv_info =
> +        { .spfn = rsv_spfn, .epfn = rsv_epfn, .cur = rsv_spfn };
> +    int ret;
> +    unsigned long i;
> +    struct page_info *pg;
> +
> +    if ( !mem_hotadd_check(spfn, epfn) )
> +        return -EINVAL;
> +
> +    ret = extend_frame_table(&info, &rsv_info);

Aah, that is why you needed this extra piece.

> +    if ( ret )
> +        goto destroy_frametable;
> +
> +    if ( max_page < epfn )
> +    {
> +        max_page = epfn;
> +        max_pdx = pfn_to_pdx(max_page - 1) + 1;
> +    }
> +    total_pages += epfn - spfn;
> +
> +    set_pdx_range(spfn, epfn);
> +    ret = setup_m2p_table(&info, &rsv_info);
> +    if ( ret )
> +        goto destroy_m2p;
> +
> +    share_hotadd_m2p_table(&info);
> +
> +    for ( i = spfn; i < epfn; i++ )
> +    {
> +        pg = mfn_to_page(i);
> +        pg->count_info = (rsv_spfn <= i && i < rsv_info.cur) ?
> +                         PGC_state_inuse : PGC_state_free;
> +    }
> +

What about iommu_map_page calls to make it possible for dom0 to
do DMA operations to this area?

> +    return 0;
> +
> +destroy_m2p:
> +    destroy_m2p_mapping(&info);
> +    max_page = old_max;
> +    total_pages = old_total;
> +    max_pdx = pfn_to_pdx(max_page - 1) + 1;
> +destroy_frametable:
> +    cleanup_frame_table(&info);
> +
> +    return ret;
> +}
> +
>  #include "compat/mm.c"
>  
>  /*
> diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
> index b781495..e31f1c8 100644
> --- a/xen/include/asm-x86/mm.h
> +++ b/xen/include/asm-x86/mm.h
> @@ -597,4 +597,8 @@ typedef struct mm_rwlock {
>  
>  extern const char zero_page[];
>  
> +int pmem_setup(unsigned long spfn, unsigned long epfn,
> +               unsigned long rsv_spfn, unsigned long rsv_epfn,
> +               unsigned long data_spfn, unsigned long data_epfn);
> +
>  #endif /* __ASM_X86_MM_H__ */
> diff --git a/xen/include/public/platform.h b/xen/include/public/platform.h
> index 1e6a6ce..c7e7cce 100644
> --- a/xen/include/public/platform.h
> +++ b/xen/include/public/platform.h
> @@ -608,6 +608,19 @@ struct xenpf_symdata {
>  typedef struct xenpf_symdata xenpf_symdata_t;
>  DEFINE_XEN_GUEST_HANDLE(xenpf_symdata_t);
>  
> +#define XENPF_pmem_add     64
> +struct xenpf_pmem_add {
> +    /* IN variables */
> +    uint64_t spfn;      /* start PFN of the whole pmem region */
> +    uint64_t epfn;      /* end PFN of the whole pmem region */
> +    uint64_t rsv_spfn;  /* start PFN of the reserved area within the region 
> */
> +    uint64_t rsv_epfn;  /* end PFN of the reserved area within the region */

Could you include (perhaps above the hypercall) an explanation
what 'reserved' and 'data' is? And can these values be zero? Say spfn ==
data_spfn  and epfn == data_epfn?

> +    uint64_t data_spfn; /* start PFN of the data area within the region */
> +    uint64_t data_epfn; /* end PFN of the data area within the region */
> +};
> +typedef struct xenpf_pmem_add xenpf_pmem_add_t;
> +DEFINE_XEN_GUEST_HANDLE(xenpf_pmem_add_t);
> +
>  /*
>   * ` enum neg_errnoval
>   * ` HYPERVISOR_platform_op(const struct xen_platform_op*);
> @@ -638,6 +651,7 @@ struct xen_platform_op {
>          struct xenpf_core_parking      core_parking;
>          struct xenpf_resource_op       resource_op;
>          struct xenpf_symdata           symdata;
> +        struct xenpf_pmem_add          pmem_add;
>          uint8_t                        pad[128];
>      } u;
>  };
> diff --git a/xen/include/xen/pmem.h b/xen/include/xen/pmem.h
> new file mode 100644
> index 0000000..a670ab8
> --- /dev/null
> +++ b/xen/include/xen/pmem.h
> @@ -0,0 +1,31 @@
> +/*
> + * xen/include/xen/pmem.h
> + *
> + * Copyright (c) 2016, Intel Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.

This '(at your option)' is for Intel lawyers to decide on. Could you
make sure you include what version they would be comfortable with?

> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; If not, see <http://www.gnu.org/licenses/>.
> + *
> + * Author: Haozhong Zhang <haozhong.zhang@xxxxxxxxx>
> + */
> +
> +#ifndef __XEN_PMEM_H__
> +#define __XEN_PMEM_H__
> +
> +#include <xen/types.h>
> +
> +int pmem_add(unsigned long spfn, unsigned long epfn,
> +             unsigned long rsv_spfn, unsigned long rsv_epfn,
> +             unsigned long data_spfn, unsigned long data_epfn);
> +
> +#endif /* __XEN_PMEM_H__ */
> diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c
> index 177c11f..948a161 100644
> --- a/xen/xsm/flask/hooks.c
> +++ b/xen/xsm/flask/hooks.c
> @@ -1360,6 +1360,7 @@ static int flask_platform_op(uint32_t op)
>      case XENPF_cpu_offline:
>      case XENPF_cpu_hotadd:
>      case XENPF_mem_hotadd:
> +    case XENPF_pmem_add:

Thanks for looking at XSM, but I think you missed the comment above:

/* These operations have their own XSM hooks */

which means that platform_hypercall.c should have an call to
xsm_resource_plug_core(XSM_HOOK) call. Or something equivalant.

>          return 0;
>  #endif
>  
> -- 
> 2.10.1
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxx
> https://lists.xen.org/xen-devel

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.