
Re: [Xen-devel] [PATCH v5 1/8] xen: vnuma topology and subop hypercalls



>>> On 03.06.14 at 06:53, <ufimtseva@xxxxxxxxx> wrote:
> +int vnuma_init_zero_topology(struct domain *d)
> +{
> +    d->vnuma.vmemrange[0].end = d->vnuma.vmemrange[d->vnuma.nr_vnodes - 1].end;
> +    d->vnuma.vdistance[0] = 10;
> +    memset(d->vnuma.vnode_to_pnode, NUMA_NO_NODE, d->vnuma.nr_vnodes);

Here you're relying on specific characteristics of NUMA_NO_NODE; you
shouldn't be using memset() in cases like this.
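
Something along these lines (just a sketch, using the names from the quoted
patch) avoids depending on the value's byte pattern altogether:

    unsigned int i;

    /*
     * Assign per element rather than per byte: memset() only happens to
     * work for values whose repeated byte pattern equals the value, and
     * the length passed above isn't a byte count anyway.
     */
    for ( i = 0; i < d->vnuma.nr_vnodes; i++ )
        d->vnuma.vnode_to_pnode[i] = NUMA_NO_NODE;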

> +    memset(d->vnuma.vcpu_to_vnode, 0, d->max_vcpus);
> +    d->vnuma.nr_vnodes = 1;

Also, considering this, is there any point at all in setting any but the
first array element of d->vnuma.vnode_to_pnode[]?

> +    return 0;
> +}

And in the end the function has no caller anyway, so one can't even
judge whether e.g. d->vnuma.nr_vnodes is reliably > 0 on entry.

> @@ -888,6 +889,89 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>      }
>      break;
>  
> +    case XEN_DOMCTL_setvnumainfo:
> +    {
> +        unsigned int dist_size, nr_vnodes;
> +
> +        ret = -EINVAL;
> +
> +        /* If number of vnodes was set before, skip */
> +        if ( d->vnuma.nr_vnodes > 0 )
> +            break;
> +
> +        nr_vnodes = op->u.vnuma.nr_vnodes;
> +        if ( nr_vnodes == 0 )
> +            goto setvnumainfo_out;
> +
> +        if ( nr_vnodes > (UINT_MAX / nr_vnodes) )
> +            goto setvnumainfo_out;
> +
> +        ret = -EFAULT;
> +        if ( guest_handle_is_null(op->u.vnuma.vdistance)     ||
> +             guest_handle_is_null(op->u.vnuma.vmemrange)     ||
> +             guest_handle_is_null(op->u.vnuma.vcpu_to_vnode) ||
> +             guest_handle_is_null(op->u.vnuma.vnode_to_pnode) )
> +            goto setvnumainfo_out;
> +
> +        dist_size = nr_vnodes * nr_vnodes;
> +
> +        d->vnuma.vdistance = xmalloc_array(unsigned int, dist_size);

I think XSA-77 was issued between the previous iteration of this series
and this one. With that advisory in mind the question here is - how big
of an allocation can this become? I.e. I'm afraid the restriction enforced
above on nr_vnodes isn't enough. A reasonable thing to do might be
to make sure none of the allocation sizes would exceed PAGE_SIZE, at
least for now (any bigger allocation - if needed in the future - would
then need to be split).
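
Just to illustrate the kind of bound I have in mind (a sketch only - the
exact limit, and where exactly to apply it, is of course up for discussion):

    /*
     * Keep each per-node allocation within a page. The vnode_to_pnode
     * check also bounds nr_vnodes such that squaring it below can't
     * overflow.
     */
    if ( nr_vnodes == 0 ||
         nr_vnodes > PAGE_SIZE / sizeof(*d->vnuma.vnode_to_pnode) ||
         nr_vnodes > PAGE_SIZE / sizeof(*d->vnuma.vmemrange) ||
         nr_vnodes * nr_vnodes > PAGE_SIZE / sizeof(*d->vnuma.vdistance) )
        goto setvnumainfo_out;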

> +        d->vnuma.vmemrange = xmalloc_array(vmemrange_t, nr_vnodes);
> +        d->vnuma.vcpu_to_vnode = xmalloc_array(unsigned int, d->max_vcpus);

While not an unbounded allocation, I assume you realize that the
current worst case is an order-3 page allocation here, which is
prone to fail after reasonably long uptime of a system. It may
nevertheless be okay for starters, since only domains with more
than 1k vCPU-s would be affected, and that seems tolerable for
the moment.

> +        d->vnuma.vnode_to_pnode = xmalloc_array(unsigned int, nr_vnodes);
> +
> +        if ( d->vnuma.vdistance == NULL ||
> +             d->vnuma.vmemrange == NULL ||
> +             d->vnuma.vcpu_to_vnode == NULL ||
> +             d->vnuma.vnode_to_pnode == NULL )
> +        {
> +            ret = -ENOMEM;
> +            goto setvnumainfo_nomem;

This is the only use of this label, i.e. the error handling code could be
moved right here. Also, if you use goto here, then please be
consistent about where you set "ret".
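
I.e. something like this (sketch), with "ret" then set right where it is
used, matching the other error paths:

    if ( d->vnuma.vdistance == NULL ||
         d->vnuma.vmemrange == NULL ||
         d->vnuma.vcpu_to_vnode == NULL ||
         d->vnuma.vnode_to_pnode == NULL )
    {
        /*
         * Free whatever did get allocated (xfree(NULL) is fine), and
         * clear the pointers so later teardown won't free them again.
         */
        xfree(d->vnuma.vdistance);
        xfree(d->vnuma.vmemrange);
        xfree(d->vnuma.vcpu_to_vnode);
        xfree(d->vnuma.vnode_to_pnode);
        d->vnuma.vdistance = NULL;
        d->vnuma.vmemrange = NULL;
        d->vnuma.vcpu_to_vnode = NULL;
        d->vnuma.vnode_to_pnode = NULL;
        ret = -ENOMEM;
        break;
    }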

> +        }
> +
> +        if ( unlikely(__copy_from_guest(d->vnuma.vdistance,
> +                                    op->u.vnuma.vdistance,
> +                                    dist_size)) )
> +            goto setvnumainfo_out;
> +        if ( unlikely(__copy_from_guest(d->vnuma.vmemrange,
> +                                    op->u.vnuma.vmemrange,
> +                                    nr_vnodes)) )
> +            goto setvnumainfo_out;
> +        if ( unlikely(__copy_from_guest(d->vnuma.vcpu_to_vnode,
> +                                    op->u.vnuma.vcpu_to_vnode,
> +                                    d->max_vcpus)) )
> +            goto setvnumainfo_out;
> +        if ( unlikely(__copy_from_guest(d->vnuma.vnode_to_pnode,
> +                                    op->u.vnuma.vnode_to_pnode,
> +                                    nr_vnodes)) )
> +            goto setvnumainfo_out;

I'm relatively certain I commented on this earlier on: None of these
__copy_from_guest() uses are legitimate, you always need copy_from_guest()
here - the double-underscore variants skip the guest address range check,
and are only acceptable after guest_handle_okay() (or an earlier full copy
covering the same range) has validated the handle.

> +
> +        /* Everything is good, lets set the number of vnodes */
> +        d->vnuma.nr_vnodes = nr_vnodes;
> +
> +        ret = 0;
> +        break;
> +
> + setvnumainfo_out:
> +        /* On failure, set one vNUMA node */
> +        d->vnuma.vmemrange[0].end = d->vnuma.vmemrange[d->vnuma.nr_vnodes - 1].end;
> +        d->vnuma.vdistance[0] = 10;
> +        memset(d->vnuma.vnode_to_pnode, NUMA_NO_NODE, d->vnuma.nr_vnodes);
> +        memset(d->vnuma.vcpu_to_vnode, 0, d->max_vcpus);
> +        d->vnuma.nr_vnodes = 1;

Isn't this an open coded instance of vnuma_init_zero_topology()?
Furthermore, earlier code rejects d->vnuma.nr_vnodes > 0, i.e. the
array access above is underflowing the array. Plus
d->vnuma.vmemrange[] is in an unknown state, so if you want to
restore previous state you need to do this differently.

> +        ret = 0;

And this is not a success path - according to the goto-s above you
mean -EFAULT here.

> +        break;
> +
> + setvnumainfo_nomem:
> +        /* The only case where we set number of vnodes to 0 */
> +        d->vnuma.nr_vnodes = 0;
> +        xfree(d->vnuma.vmemrange);
> +        xfree(d->vnuma.vdistance);
> +        xfree(d->vnuma.vnode_to_pnode);
> +        xfree(d->vnuma.vcpu_to_vnode);

And this is open coded vnuma_destroy().
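
Simply calling that helper here would avoid the duplication - roughly
(sketch, assuming the struct vnuma_info layout from this patch):

    static void vnuma_destroy(struct vnuma_info *vnuma)
    {
        vnuma->nr_vnodes = 0;
        xfree(vnuma->vmemrange);
        xfree(vnuma->vdistance);
        xfree(vnuma->vnode_to_pnode);
        xfree(vnuma->vcpu_to_vnode);
        vnuma->vmemrange = NULL;
        vnuma->vdistance = NULL;
        vnuma->vnode_to_pnode = NULL;
        vnuma->vcpu_to_vnode = NULL;
    }

which the setvnumainfo_nomem path could then invoke as
vnuma_destroy(&d->vnuma).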

> --- a/xen/common/memory.c
> +++ b/xen/common/memory.c
> @@ -963,6 +963,73 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
>  
>          break;
>  
> +    case XENMEM_get_vnuma_info:
> +    {
> +        struct vnuma_topology_info guest_topo;
> +        struct domain *d;
> +
> +        if ( copy_from_guest(&guest_topo, arg, 1) )
> +            return -EFAULT;
> +        if ( (d = rcu_lock_domain_by_any_id(guest_topo.domid)) == NULL )
> +            return -ESRCH;
> +
> +        if ( d->vnuma.nr_vnodes == 0 ) {

Coding style still (the opening brace belongs on its own line).

> +            rc = -EOPNOTSUPP;
> +            goto vnumainfo_out;
> +        }
> +
> +        rc = -EOPNOTSUPP;
> +        /*
> +         * Guest may have different kernel configuration for
> +         * number of cpus/nodes. It informs about them via hypercall.
> +         */
> +        if ( guest_topo.nr_vnodes < d->vnuma.nr_vnodes ||
> +            guest_topo.nr_vcpus < d->max_vcpus )
> +            goto vnumainfo_out;

So the "no vNUMA" and "insufficient space" cases are completely
indistinguishable for the guest if you use the same error value for
them. Perhaps the latter should be -ENOBUFS?
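
I.e. (sketch):

    if ( d->vnuma.nr_vnodes == 0 )
    {
        rc = -EOPNOTSUPP;
        goto vnumainfo_out;
    }

    /*
     * The guest may be configured for fewer CPUs/nodes than we have;
     * report too-small buffers distinctly from "no vNUMA at all".
     */
    if ( guest_topo.nr_vnodes < d->vnuma.nr_vnodes ||
         guest_topo.nr_vcpus < d->max_vcpus )
    {
        rc = -ENOBUFS;
        goto vnumainfo_out;
    }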

> +
> +        rc = -EFAULT;
> +
> +        if ( guest_handle_is_null(guest_topo.vmemrange.h)    ||
> +             guest_handle_is_null(guest_topo.vdistance.h)    ||
> +             guest_handle_is_null(guest_topo.vcpu_to_vnode.h) )
> +            goto vnumainfo_out;
> +
> +        /*
> +         * Take a failure path if out of guest allocated memory for topology.
> +         * No partial copying.
> +         */
> +        guest_topo.nr_vnodes = d->vnuma.nr_vnodes;
> +
> +        if ( __copy_to_guest(guest_topo.vmemrange.h,
> +                                d->vnuma.vmemrange,
> +                                d->vnuma.nr_vnodes) != 0 )
> +            goto vnumainfo_out;
> +
> +        if ( __copy_to_guest(guest_topo.vdistance.h,
> +                                d->vnuma.vdistance,
> +                                d->vnuma.nr_vnodes * d->vnuma.nr_vnodes) != 0 )
> +            goto vnumainfo_out;
> +
> +        if ( __copy_to_guest(guest_topo.vcpu_to_vnode.h,
> +                                d->vnuma.vcpu_to_vnode,
> +                                d->max_vcpus) != 0 )
> +            goto vnumainfo_out;

Again you mean copy_to_guest() everywhere here.

Furthermore, how will the caller know the number of entries filled
by the last of them? You only tell it about the actual number of
virtual nodes above... Or actually, you only _mean_ to tell it about
that number - I don't see you copying that field back out (that
would be a place where the __ version of a guest-copy function
would be correctly used, due to the earlier copy_from_guest() on
the same address range).
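
I.e. something along these lines (sketch; it assumes the usual
vnuma_topology_info_t handle typedef exists for this structure):

    guest_topo.nr_vnodes = d->vnuma.nr_vnodes;
    guest_topo.nr_vcpus = d->max_vcpus;

    /*
     * rc is still -EFAULT here; the __ variants are fine for these two
     * fields since the earlier copy_from_guest() covered this range.
     */
    if ( __copy_field_to_guest(guest_handle_cast(arg, vnuma_topology_info_t),
                               &guest_topo, nr_vnodes) ||
         __copy_field_to_guest(guest_handle_cast(arg, vnuma_topology_info_t),
                               &guest_topo, nr_vcpus) )
        goto vnumainfo_out;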

And finally, how is the consistency of data here ensured against
a racing XEN_DOMCTL_setvnumainfo?

> +
> +        rc = 0;
> +
> + vnumainfo_out:
> +        if ( rc != 0 )
> +            /*
> +             * In case of failure to set vNUMA topology for guest,
> +             * leave everything as it is, print error only. Tools will
> +             * show for domain vnuma topology, but wont be seen in guest.
> +             */
> +            gdprintk(XENLOG_INFO, "vNUMA: failed to copy topology info to guest.\n");

No debugging printk()-s like this in non-RFC patches please. And the
comment is talking about "set" too, while we're in "get" here.

> +struct xen_domctl_vnuma {
> +    uint32_t nr_vnodes;
> +    uint32_t __pad;

Please avoid double underscores or anything else violating C name
space rules in public headers (any existing badness not being an
excuse).

> --- a/xen/include/public/memory.h
> +++ b/xen/include/public/memory.h
> @@ -354,6 +354,20 @@ struct xen_pod_target {
>  };
>  typedef struct xen_pod_target xen_pod_target_t;
>  
> +/*
> + * XENMEM_get_vnuma_info used by caller to get
> + * vNUMA topology constructed for particular domain.
> + *
> + * The data exchanged is presented by vnuma_topology_info.
> + */
> +#define XENMEM_get_vnuma_info               26
> +
> +/*
> + * XENMEM_get_vnuma_pnode used by guest to determine
> + * the physical node of the specified vnode.
> + */
> +/*#define XENMEM_get_vnuma_pnode              27*/

What's this? And if it was enabled, what would its use be? I thought
that we agreed that the guest shouldn't know of physical NUMA
information.

> --- /dev/null
> +++ b/xen/include/public/vnuma.h

And I know I commented on this one - the information being exposed
through a mem-op, the respective definitions should be put in memory.h.

> +struct vnuma_topology_info {
> +    /* IN */
> +    domid_t domid;
> +    /* IN/OUT */
> +    unsigned int nr_vnodes;
> +    unsigned int nr_vcpus;

Interestingly here you even document nr_vcpus as also being an
output of the hypercall.

> +    /* OUT */
> +    union {
> +        XEN_GUEST_HANDLE(uint) h;
> +        uint64_t    _pad;

Same remark as above regarding C name space violation.

> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -444,6 +444,7 @@ struct domain
>      nodemask_t node_affinity;
>      unsigned int last_alloc_node;
>      spinlock_t node_affinity_lock;
> +    struct vnuma_info vnuma;

This being put nicely in a sub-structure of course raises the question
whether, in order to restrict struct domain growth, this wouldn't better
be a pointer.
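
I.e. (sketch):

    struct vnuma_info *vnuma;

with the structure then allocated (e.g. via xzalloc(struct vnuma_info)) on
the first XEN_DOMCTL_setvnumainfo and freed together with the arrays during
domain destruction.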

Jan
