
Re: [PATCH 1/4] xen: Introduce non-broken hypercalls for the paging mempool size



On Thu, 17 Nov 2022, Andrew Cooper wrote:
> The existing XEN_DOMCTL_SHADOW_OP_{GET,SET}_ALLOCATION have problems:
> 
>  * All set_allocation() flavours have an overflow-before-widen bug when
>    calculating "sc->mb << (20 - PAGE_SHIFT)" (sketched below this list).
>  * All flavours have a granularity of 1M.  This was tolerable when the size of
>    the pool could only be set at the same granularity, but is broken now that
>    ARM has a 16-page stopgap allocation in use.
>  * All get_allocation() flavours round up, and in particular turn 0 into 1,
>    meaning the get op returns junk before a successful set op.
>  * The x86 flavours reject the hypercalls before the VM has vCPUs allocated,
>    despite the pool size being a domain property.
>  * Even the hypercall names are long-obsolete.
> 
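[ For reference, a standalone sketch of the overflow-before-widen bug class
  named in the first bullet -- illustrative values, not the exact Xen code.
  With 4KiB pages the shift is 8 bits and is evaluated in 32-bit arithmetic,
  so large mb values wrap before the result is widened:

      #include <inttypes.h>
      #include <stdio.h>

      #define PAGE_SHIFT 12

      int main(void)
      {
          uint32_t mb = 1u << 24;  /* hypothetical 16TiB pool request */

          /* Shift evaluated at 32 bits: wraps to 0, and only then widens. */
          uint64_t buggy = mb << (20 - PAGE_SHIFT);

          /* Widen first, then shift: yields the intended page count. */
          uint64_t fixed = (uint64_t)mb << (20 - PAGE_SHIFT);

          printf("buggy=%" PRIu64 " fixed=%" PRIu64 "\n", buggy, fixed);
          return 0;
      }
]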
> Implement a better interface, which can be first used to unit test the
> behaviour, and subsequently correct a broken implementation.  The old
> interface will be retired in due course.
> 
> The unit of bytes (as opposed to pages) is a deliberate API/ABI improvement to
> more easily support multiple page granularities.
> 
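[ A worked example of why bytes help, assuming 4KiB pages: ARM's 16-page
  stopgap pool is 16 * 4096 = 65536 bytes, which the byte-based interface
  expresses exactly, while the old 1MiB-granularity ops would round it up
  to a full megabyte. ]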
> This is part of XSA-409 / CVE-2022-33747.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
> Release-acked-by: Henry Wang <Henry.Wang@xxxxxxx>

ARM side:

Reviewed-by: Stefano Stabellini <sstabellini@xxxxxxxxxx>

> ---
> CC: Jan Beulich <JBeulich@xxxxxxxx>
> CC: Roger Pau Monné <roger.pau@xxxxxxxxxx>
> CC: Wei Liu <wl@xxxxxxx>
> CC: Stefano Stabellini <sstabellini@xxxxxxxxxx>
> CC: Julien Grall <julien@xxxxxxx>
> CC: Volodymyr Babchuk <Volodymyr_Babchuk@xxxxxxxx>
> CC: Bertrand Marquis <bertrand.marquis@xxxxxxx>
> CC: Henry Wang <Henry.Wang@xxxxxxx>
> CC: Anthony PERARD <anthony.perard@xxxxxxxxxx>
> 
> v2:
>  * s/p2m/paging/
>  * Fix overflow-before-widen in ARM's arch_get_p2m_mempool_size()
>  * Fix overflow-before-widen in both {hap,shadow}_get_allocation_bytes()
>  * Leave a TODO about x86/PV, drop assertion.
>  * Check for long->int truncation in x86's arch_set_paging_mempool_size()
> 
> Future TODOs:
>  * x86 shadow still rounds up.  This is buggy as it's a simultaneous equation
>    with tot_pages, which varies over time with ballooning.
>  * x86 PV is weird.  No toolstack interaction with the shadow pool size
>    exists, but the "shadow" pool does come into existence when logdirty (or
>    pv-l1tf) is first enabled.
>  * The shadow+hap logic is in desperate need of deduping.
> ---
>  tools/include/xenctrl.h           |  3 +++
>  tools/libs/ctrl/xc_domain.c       | 29 ++++++++++++++++++++++++++
>  xen/arch/arm/p2m.c                | 26 +++++++++++++++++++++++
>  xen/arch/x86/include/asm/hap.h    |  1 +
>  xen/arch/x86/include/asm/shadow.h |  4 ++++
>  xen/arch/x86/mm/hap/hap.c         | 11 ++++++++++
>  xen/arch/x86/mm/paging.c          | 43 +++++++++++++++++++++++++++++++++++++++
>  xen/arch/x86/mm/shadow/common.c   | 11 ++++++++++
>  xen/common/domctl.c               | 14 +++++++++++++
>  xen/include/public/domctl.h       | 24 +++++++++++++++++++++-
>  xen/include/xen/domain.h          |  3 +++
>  11 files changed, 168 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/include/xenctrl.h b/tools/include/xenctrl.h
> index 0c8b4c3aa7a5..23037874d3d5 100644
> --- a/tools/include/xenctrl.h
> +++ b/tools/include/xenctrl.h
> @@ -893,6 +893,9 @@ long long xc_logdirty_control(xc_interface *xch,
>                                unsigned int mode,
>                                xc_shadow_op_stats_t *stats);
>  
> +int xc_get_paging_mempool_size(xc_interface *xch, uint32_t domid, uint64_t *size);
> +int xc_set_paging_mempool_size(xc_interface *xch, uint32_t domid, uint64_t size);
> +
>  int xc_sched_credit_domain_set(xc_interface *xch,
>                                 uint32_t domid,
>                                 struct xen_domctl_sched_credit *sdom);
> diff --git a/tools/libs/ctrl/xc_domain.c b/tools/libs/ctrl/xc_domain.c
> index 14c0420c35be..e939d0715739 100644
> --- a/tools/libs/ctrl/xc_domain.c
> +++ b/tools/libs/ctrl/xc_domain.c
> @@ -706,6 +706,35 @@ long long xc_logdirty_control(xc_interface *xch,
>      return (rc == 0) ? domctl.u.shadow_op.pages : rc;
>  }
>  
> +int xc_get_paging_mempool_size(xc_interface *xch, uint32_t domid, uint64_t *size)
> +{
> +    int rc;
> +    struct xen_domctl domctl = {
> +        .cmd         = XEN_DOMCTL_get_paging_mempool_size,
> +        .domain      = domid,
> +    };
> +
> +    rc = do_domctl(xch, &domctl);
> +    if ( rc )
> +        return rc;
> +
> +    *size = domctl.u.paging_mempool.size;
> +    return 0;
> +}
> +
> +int xc_set_paging_mempool_size(xc_interface *xch, uint32_t domid, uint64_t size)
> +{
> +    struct xen_domctl domctl = {
> +        .cmd         = XEN_DOMCTL_set_paging_mempool_size,
> +        .domain      = domid,
> +        .u.paging_mempool = {
> +            .size = size,
> +        },
> +    };
> +
> +    return do_domctl(xch, &domctl);
> +}
> +
>  int xc_domain_setmaxmem(xc_interface *xch,
>                          uint32_t domid,
>                          uint64_t max_memkb)
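
[ A minimal caller sketch for the two new wrappers above -- the xch handle,
  domid, and error handling are illustrative scaffolding, not part of the
  patch:

      uint64_t size;
      int rc = xc_get_paging_mempool_size(xch, domid, &size);

      if ( !rc )
          /* Grow the pool by 1MiB; sizes are in bytes. */
          rc = xc_set_paging_mempool_size(xch, domid, size + (1ULL << 20));
]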
> diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
> index 94d3b60b1387..8c1972e58227 100644
> --- a/xen/arch/arm/p2m.c
> +++ b/xen/arch/arm/p2m.c
> @@ -100,6 +100,13 @@ unsigned int p2m_get_allocation(struct domain *d)
>      return ROUNDUP(nr_pages, 1 << (20 - PAGE_SHIFT)) >> (20 - PAGE_SHIFT);
>  }
>  
> +/* Return the size of the pool, in bytes. */
> +int arch_get_paging_mempool_size(struct domain *d, uint64_t *size)
> +{
> +    *size = (uint64_t)ACCESS_ONCE(d->arch.paging.p2m_total_pages) << PAGE_SHIFT;
> +    return 0;
> +}
> +
>  /*
>   * Set the pool of pages to the required number of pages.
>   * Returns 0 for success, non-zero for failure.
> @@ -157,6 +164,25 @@ int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted)
>      return 0;
>  }
>  
> +int arch_set_paging_mempool_size(struct domain *d, uint64_t size)
> +{
> +    unsigned long pages = size >> PAGE_SHIFT;
> +    bool preempted = false;
> +    int rc;
> +
> +    if ( (size & ~PAGE_MASK) ||          /* Non page-sized request? */
> +         pages != (size >> PAGE_SHIFT) ) /* 32-bit overflow? */
> +        return -EINVAL;
> +
> +    spin_lock(&d->arch.paging.lock);
> +    rc = p2m_set_allocation(d, pages, &preempted);
> +    spin_unlock(&d->arch.paging.lock);
> +
> +    ASSERT(preempted == (rc == -ERESTART));
> +
> +    return rc;
> +}
> +
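[ The "32-bit overflow" guard above merits a worked example: on arm32,
  unsigned long is 32 bits, so a request of e.g. size = 1ULL << 44 (16TiB)
  makes the assignment pages = size >> PAGE_SHIFT truncate 2^32 to 0; the
  recomputed 64-bit size >> PAGE_SHIFT then differs, and the request is
  rejected with -EINVAL rather than silently emptying the pool. ]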
>  int p2m_teardown_allocation(struct domain *d)
>  {
>      int ret = 0;
> diff --git a/xen/arch/x86/include/asm/hap.h b/xen/arch/x86/include/asm/hap.h
> index 90dece29deca..14d2f212dab9 100644
> --- a/xen/arch/x86/include/asm/hap.h
> +++ b/xen/arch/x86/include/asm/hap.h
> @@ -47,6 +47,7 @@ int   hap_track_dirty_vram(struct domain *d,
>  extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
>  int hap_set_allocation(struct domain *d, unsigned int pages, bool *preempted);
>  unsigned int hap_get_allocation(struct domain *d);
> +int hap_get_allocation_bytes(struct domain *d, uint64_t *size);
>  
>  #endif /* XEN_HAP_H */
>  
> diff --git a/xen/arch/x86/include/asm/shadow.h b/xen/arch/x86/include/asm/shadow.h
> index 1365fe480518..dad876d29499 100644
> --- a/xen/arch/x86/include/asm/shadow.h
> +++ b/xen/arch/x86/include/asm/shadow.h
> @@ -97,6 +97,8 @@ void shadow_blow_tables_per_domain(struct domain *d);
>  int shadow_set_allocation(struct domain *d, unsigned int pages,
>                            bool *preempted);
>  
> +int shadow_get_allocation_bytes(struct domain *d, uint64_t *size);
> +
>  #else /* !CONFIG_SHADOW_PAGING */
>  
>  #define shadow_vcpu_teardown(v) ASSERT(is_pv_vcpu(v))
> @@ -108,6 +110,8 @@ int shadow_set_allocation(struct domain *d, unsigned int pages,
>      ({ ASSERT_UNREACHABLE(); -EOPNOTSUPP; })
>  #define shadow_set_allocation(d, pages, preempted) \
>      ({ ASSERT_UNREACHABLE(); -EOPNOTSUPP; })
> +#define shadow_get_allocation_bytes(d, size) \
> +    ({ ASSERT_UNREACHABLE(); -EOPNOTSUPP; })
>  
>  static inline void sh_remove_shadows(struct domain *d, mfn_t gmfn,
>                                       int fast, int all) {}
> diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
> index f809ea9aa6ae..0fc1b1d9aced 100644
> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -345,6 +345,17 @@ unsigned int hap_get_allocation(struct domain *d)
>              + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
>  }
>  
> +int hap_get_allocation_bytes(struct domain *d, uint64_t *size)
> +{
> +    unsigned long pages = d->arch.paging.hap.total_pages;
> +
> +    pages += d->arch.paging.hap.p2m_pages;
> +
> +    *size = pages << PAGE_SHIFT;
> +
> +    return 0;
> +}
> +
>  /* Set the pool of pages to the required number of pages.
>   * Returns 0 for success, non-zero for failure. */
>  int hap_set_allocation(struct domain *d, unsigned int pages, bool *preempted)
> diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
> index 3a355eee9ca3..8d579fa9a3e8 100644
> --- a/xen/arch/x86/mm/paging.c
> +++ b/xen/arch/x86/mm/paging.c
> @@ -977,6 +977,49 @@ int __init paging_set_allocation(struct domain *d, unsigned int pages,
>  }
>  #endif
>  
> +int arch_get_paging_mempool_size(struct domain *d, uint64_t *size)
> +{
> +    int rc;
> +
> +    if ( is_pv_domain(d) )                 /* TODO: Relax in due course */
> +        return -EOPNOTSUPP;
> +
> +    if ( hap_enabled(d) )
> +        rc = hap_get_allocation_bytes(d, size);
> +    else
> +        rc = shadow_get_allocation_bytes(d, size);
> +
> +    return rc;
> +}
> +
> +int arch_set_paging_mempool_size(struct domain *d, uint64_t size)
> +{
> +    unsigned long pages = size >> PAGE_SHIFT;
> +    bool preempted = false;
> +    int rc;
> +
> +    if ( is_pv_domain(d) )                 /* TODO: Relax in due course */
> +        return -EOPNOTSUPP;
> +
> +    if ( size & ~PAGE_MASK ||              /* Non page-sized request? */
> +         pages != (unsigned int)pages )    /* Overflow $X_set_allocation()? */
> +        return -EINVAL;
> +
> +    paging_lock(d);
> +    if ( hap_enabled(d) )
> +        rc = hap_set_allocation(d, pages, &preempted);
> +    else
> +        rc = shadow_set_allocation(d, pages, &preempted);
> +    paging_unlock(d);
> +
> +    /*
> +     * TODO: Adjust $X_set_allocation() so this is true.
> +    ASSERT(preempted == (rc == -ERESTART));
> +     */
> +
> +    return preempted ? -ERESTART : rc;
> +}
> +
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
> index badfd53c6b23..a8404f97f668 100644
> --- a/xen/arch/x86/mm/shadow/common.c
> +++ b/xen/arch/x86/mm/shadow/common.c
> @@ -1427,6 +1427,17 @@ static unsigned int shadow_get_allocation(struct domain *d)
>              + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
>  }
>  
> +int shadow_get_allocation_bytes(struct domain *d, uint64_t *size)
> +{
> +    unsigned long pages = d->arch.paging.shadow.total_pages;
> +
> +    pages += d->arch.paging.shadow.p2m_pages;
> +
> +    *size = pages << PAGE_SHIFT;
> +
> +    return 0;
> +}
> +
>  /**************************************************************************/
>  /* Hash table for storing the guest->shadow mappings.
>   * The table itself is an array of pointers to shadows; the shadows are then
> diff --git a/xen/common/domctl.c b/xen/common/domctl.c
> index 69fb9abd346f..ad71ad8a4cc5 100644
> --- a/xen/common/domctl.c
> +++ b/xen/common/domctl.c
> @@ -874,6 +874,20 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>          ret = iommu_do_domctl(op, d, u_domctl);
>          break;
>  
> +    case XEN_DOMCTL_get_paging_mempool_size:
> +        ret = arch_get_paging_mempool_size(d, &op->u.paging_mempool.size);
> +        if ( !ret )
> +            copyback = 1;
> +        break;
> +
> +    case XEN_DOMCTL_set_paging_mempool_size:
> +        ret = arch_set_paging_mempool_size(d, op->u.paging_mempool.size);
> +
> +        if ( ret == -ERESTART )
> +            ret = hypercall_create_continuation(
> +                __HYPERVISOR_domctl, "h", u_domctl);
> +        break;
> +
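[ Note the -ERESTART handling above: when the set operation is preempted
  mid-way, the common handler wraps the domctl in a hypercall continuation,
  so a large pool resize is transparently re-entered rather than surfacing
  a partial failure to the toolstack. ]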
>      default:
>          ret = arch_do_domctl(op, d, u_domctl);
>          break;
> diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
> index b2ae839c3632..d4072761791a 100644
> --- a/xen/include/public/domctl.h
> +++ b/xen/include/public/domctl.h
> @@ -214,7 +214,10 @@ struct xen_domctl_getpageframeinfo3 {
>   /* Return the bitmap but do not modify internal copy. */
>  #define XEN_DOMCTL_SHADOW_OP_PEEK        12
>  
> -/* Memory allocation accessors. */
> +/*
> + * Memory allocation accessors.  These APIs are broken and will be removed.
> + * Use XEN_DOMCTL_{get,set}_paging_mempool_size instead.
> + */
>  #define XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION   30
>  #define XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION   31
>  
> @@ -946,6 +949,22 @@ struct xen_domctl_cacheflush {
>      xen_pfn_t start_pfn, nr_pfns;
>  };
>  
> +/*
> + * XEN_DOMCTL_get_paging_mempool_size / XEN_DOMCTL_set_paging_mempool_size.
> + *
> + * Get or set the paging memory pool size.  The size is in bytes.
> + *
> + * This is a dedicated pool of memory for Xen to use while managing the guest,
> + * typically containing pagetables.  As such, there is an implementation
> + * specific minimum granularity.
> + *
> + * The set operation can fail mid-way through the request (e.g. Xen running
> + * out of memory, no free memory to reclaim from the pool, etc.).
> + */
> +struct xen_domctl_paging_mempool {
> +    uint64_aligned_t size; /* IN/OUT.  Size in bytes. */
> +};
> +
>  #if defined(__i386__) || defined(__x86_64__)
>  struct xen_domctl_vcpu_msr {
>      uint32_t         index;
> @@ -1274,6 +1293,8 @@ struct xen_domctl {
>  #define XEN_DOMCTL_get_cpu_policy                82
>  #define XEN_DOMCTL_set_cpu_policy                83
>  #define XEN_DOMCTL_vmtrace_op                    84
> +#define XEN_DOMCTL_get_paging_mempool_size       85
> +#define XEN_DOMCTL_set_paging_mempool_size       86
>  #define XEN_DOMCTL_gdbsx_guestmemio            1000
>  #define XEN_DOMCTL_gdbsx_pausevcpu             1001
>  #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
> @@ -1335,6 +1356,7 @@ struct xen_domctl {
>          struct xen_domctl_psr_alloc         psr_alloc;
>          struct xen_domctl_vuart_op          vuart_op;
>          struct xen_domctl_vmtrace_op        vmtrace_op;
> +        struct xen_domctl_paging_mempool    paging_mempool;
>          uint8_t                             pad[128];
>      } u;
>  };
> diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
> index 2c8116afba27..0de9cbc1696d 100644
> --- a/xen/include/xen/domain.h
> +++ b/xen/include/xen/domain.h
> @@ -98,6 +98,9 @@ void arch_get_info_guest(struct vcpu *, vcpu_guest_context_u);
>  int arch_initialise_vcpu(struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg);
>  int default_initialise_vcpu(struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg);
>  
> +int arch_get_paging_mempool_size(struct domain *d, uint64_t *size /* bytes */);
> +int arch_set_paging_mempool_size(struct domain *d, uint64_t size /* bytes */);
> +
>  int domain_relinquish_resources(struct domain *d);
>  
>  void dump_pageframe_info(struct domain *d);
> -- 
> 2.11.0
> 

 

