Re: [Xen-devel] [PATCH 4 of 8] xen: allow for explicitly specifying node-affinity
On Fri, Oct 5, 2012 at 3:08 PM, Dario Faggioli
<dario.faggioli@xxxxxxxxxx> wrote:
> Make it possible to pass the node-affinity of a domain to the hypervisor
> from the upper layers, instead of always having it computed automatically.
>
> Note that this also required generalizing the Flask hooks for setting
> and getting the affinity, so that they now deal with both vcpu and
> node affinity.
>
> Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx>
>
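A quick orientation for anyone skimming the series: the core of the
hypervisor side is domain_set_node_affinity() in xen/common/domain.c
below. Its convention is that an empty node mask is rejected with
-EINVAL, a full mask is treated as "reset to automatic node-affinity",
and anything else becomes the domain's explicit affinity. The following
standalone mock illustrates just that convention; it is not Xen code,
node masks are modelled as plain 64-bit words, and every mock_* name is
made up for illustration.

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

#define MOCK_NR_NODES  8
#define MOCK_FULL_MASK ((UINT64_C(1) << MOCK_NR_NODES) - 1)

struct mock_domain {
    uint64_t node_affinity;   /* bit n set => affine to node n */
    int auto_node_affinity;   /* 1 => recomputed from vcpu affinities */
};

static int mock_set_node_affinity(struct mock_domain *d, uint64_t affinity)
{
    /* Being affine to no node at all is rejected */
    if ( affinity == 0 )
        return -EINVAL;

    /* A full mask is taken as "reset to automatic node-affinity" */
    if ( affinity == MOCK_FULL_MASK )
    {
        d->auto_node_affinity = 1;
        return 0;
    }

    /* Otherwise the mask becomes the explicit node-affinity */
    d->auto_node_affinity = 0;
    d->node_affinity = affinity;
    return 0;
}

int main(void)
{
    struct mock_domain d = { MOCK_FULL_MASK, 1 };

    mock_set_node_affinity(&d, 0x3);            /* nodes 0 and 1 */
    printf("explicit: auto=%d mask=%#llx\n",
           d.auto_node_affinity, (unsigned long long)d.node_affinity);

    mock_set_node_affinity(&d, MOCK_FULL_MASK); /* back to automatic */
    printf("reset:    auto=%d\n", d.auto_node_affinity);

    return 0;
}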
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -222,6 +222,7 @@ struct domain *domain_create(
>
> spin_lock_init(&d->node_affinity_lock);
> d->node_affinity = NODE_MASK_ALL;
> + d->auto_node_affinity = 1;
>
> spin_lock_init(&d->shutdown_lock);
> d->shutdown_code = -1;
> @@ -362,11 +363,26 @@ void domain_update_node_affinity(struct
> cpumask_or(cpumask, cpumask, online_affinity);
> }
>
> - for_each_online_node ( node )
> - if ( cpumask_intersects(&node_to_cpumask(node), cpumask) )
> - node_set(node, nodemask);
> + if ( d->auto_node_affinity )
> + {
> + /* Node-affinity is automatically computed from all vcpu-affinities */
> + for_each_online_node ( node )
> + if ( cpumask_intersects(&node_to_cpumask(node), cpumask) )
> + node_set(node, nodemask);
>
> - d->node_affinity = nodemask;
> + d->node_affinity = nodemask;
> + }
> + else
> + {
> + /* Node-affinity is provided by someone else; just filter out nodes
> + * whose cpus are all either offline or outside every vcpu's affinity. */
> + for_each_node_mask ( node, d->node_affinity )
> + if ( !cpumask_intersects(&node_to_cpumask(node), cpumask) )
> + node_clear(node, d->node_affinity);
> + }
> +
> + sched_set_node_affinity(d, &d->node_affinity);
> +
> spin_unlock(&d->node_affinity_lock);
>
> free_cpumask_var(online_affinity);
> @@ -374,6 +390,36 @@ void domain_update_node_affinity(struct
> }
>
>
> +int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity)
> +{
> + /* Being affine with no nodes is just wrong */
> + if ( nodes_empty(*affinity) )
> + return -EINVAL;
> +
> + spin_lock(&d->node_affinity_lock);
> +
> + /*
> + * Being/becoming explicitly affine to all nodes is not particularly
> + * useful. Let's take it as the `reset node affinity` command.
> + */
> + if ( nodes_full(*affinity) )
> + {
> + d->auto_node_affinity = 1;
> + goto out;
> + }
> +
> + d->auto_node_affinity = 0;
> + d->node_affinity = *affinity;
> +
> +out:
> + spin_unlock(&d->node_affinity_lock);
> +
> + domain_update_node_affinity(d);
> +
> + return 0;
> +}
> +
> +
> struct domain *get_domain_by_id(domid_t dom)
> {
> struct domain *d;
> diff --git a/xen/common/domctl.c b/xen/common/domctl.c
> --- a/xen/common/domctl.c
> +++ b/xen/common/domctl.c
> @@ -642,6 +642,40 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
> }
> break;
>
> + case XEN_DOMCTL_setnodeaffinity:
> + case XEN_DOMCTL_getnodeaffinity:
> + {
> + domid_t dom = op->domain;
> + struct domain *d = rcu_lock_domain_by_id(dom);
> +
> + ret = -ESRCH;
> + if ( d == NULL )
> + break;
> +
> + ret = xsm_nodeaffinity(op->cmd, d);
> + if ( ret )
> + goto nodeaffinity_out;
> +
> + if ( op->cmd == XEN_DOMCTL_setnodeaffinity )
> + {
> + nodemask_t new_affinity;
> +
> + ret = xenctl_bitmap_to_nodemask(&new_affinity,
> + &op->u.nodeaffinity.nodemap);
> + if ( !ret )
> + ret = domain_set_node_affinity(d, &new_affinity);
> + }
> + else
> + {
> + ret = nodemask_to_xenctl_bitmap(&op->u.nodeaffinity.nodemap,
> + &d->node_affinity);
> + }
> +
> + nodeaffinity_out:
> + rcu_unlock_domain(d);
> + }
> + break;
> +
> case XEN_DOMCTL_setvcpuaffinity:
> case XEN_DOMCTL_getvcpuaffinity:
> {
> diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
> --- a/xen/common/keyhandler.c
> +++ b/xen/common/keyhandler.c
> @@ -217,6 +217,14 @@ static void cpuset_print(char *set, int
> *set++ = '\0';
> }
>
> +static void nodeset_print(char *set, int size, const nodemask_t *mask)
> +{
> + *set++ = '[';
> + set += nodelist_scnprintf(set, size-2, mask);
> + *set++ = ']';
> + *set++ = '\0';
> +}
> +
> static void periodic_timer_print(char *str, int size, uint64_t period)
> {
> if ( period == 0 )
> @@ -272,6 +280,9 @@ static void dump_domains(unsigned char k
>
> dump_pageframe_info(d);
>
> + nodeset_print(tmpstr, sizeof(tmpstr), &d->node_affinity);
> + printk("NODE affinity for domain %d: %s\n", d->domain_id, tmpstr);
> +
> printk("VCPU information and callbacks for domain %u:\n",
> d->domain_id);
> for_each_vcpu ( d, v )
> diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
> --- a/xen/common/sched_credit.c
> +++ b/xen/common/sched_credit.c
> @@ -238,6 +238,33 @@ static inline void
> list_del_init(&svc->runq_elem);
> }
>
> +/*
> + * Translates the node-affinity mask into a cpumask, so that it can be
> + * used during actual scheduling. The resulting cpumask contains all the
> + * cpus of all the nodes set in the original node-affinity mask.
> + *
> + * Note that any serialization needed to access the mask safely is
> + * entirely the responsibility of the caller of this function/hook.
> + */
> +static void csched_set_node_affinity(
> + const struct scheduler *ops,
> + struct domain *d,
> + nodemask_t *mask)
> +{
> + struct csched_dom *sdom;
> + int node;
> +
> + /* Skip idle domain since it doesn't even have a node_affinity_cpumask */
> + if ( unlikely(is_idle_domain(d)) )
> + return;
> +
> + sdom = CSCHED_DOM(d);
> + cpumask_clear(sdom->node_affinity_cpumask);
> + for_each_node_mask( node, *mask )
> + cpumask_or(sdom->node_affinity_cpumask, sdom->node_affinity_cpumask,
> + &node_to_cpumask(node));
> +}
> +
> #define for_each_csched_balance_step(__step) \
> for ( (__step) = CSCHED_BALANCE_LAST; (__step) >= 0; (__step)-- )
>
> @@ -260,7 +287,8 @@ csched_balance_cpumask(const struct vcpu
> struct domain *d = vc->domain;
> struct csched_dom *sdom = CSCHED_DOM(d);
>
> - if ( cpumask_full(sdom->node_affinity_cpumask) )
> + if ( cpumask_full(sdom->node_affinity_cpumask) ||
> + d->auto_node_affinity == 1 )
> return -1;
>
> cpumask_and(mask, sdom->node_affinity_cpumask, vc->cpu_affinity);
> @@ -1786,6 +1814,8 @@ const struct scheduler sched_credit_def
> .adjust = csched_dom_cntl,
> .adjust_global = csched_sys_cntl,
>
> + .set_node_affinity = csched_set_node_affinity,
> +
> .pick_cpu = csched_cpu_pick,
> .do_schedule = csched_schedule,
>
> diff --git a/xen/common/schedule.c b/xen/common/schedule.c
> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -588,6 +588,11 @@ int cpu_disable_scheduler(unsigned int c
> return ret;
> }
>
> +void sched_set_node_affinity(struct domain *d, nodemask_t *mask)
> +{
> + SCHED_OP(DOM2OP(d), set_node_affinity, d, mask);
> +}
> +
> int vcpu_set_affinity(struct vcpu *v, const cpumask_t *affinity)
> {
> cpumask_t online_affinity;
> diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
> --- a/xen/include/public/domctl.h
> +++ b/xen/include/public/domctl.h
> @@ -279,6 +279,16 @@ typedef struct xen_domctl_getvcpuinfo xe
> DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvcpuinfo_t);
>
>
> +/* Get/set the NUMA node(s) with which the guest has affinity. */
> +/* XEN_DOMCTL_setnodeaffinity */
> +/* XEN_DOMCTL_getnodeaffinity */
> +struct xen_domctl_nodeaffinity {
> + struct xenctl_bitmap nodemap;/* IN */
> +};
> +typedef struct xen_domctl_nodeaffinity xen_domctl_nodeaffinity_t;
> +DEFINE_XEN_GUEST_HANDLE(xen_domctl_nodeaffinity_t);
> +
> +
> /* Get/set which physical cpus a vcpu can execute on. */
> /* XEN_DOMCTL_setvcpuaffinity */
> /* XEN_DOMCTL_getvcpuaffinity */
> @@ -900,6 +910,8 @@ struct xen_domctl {
> #define XEN_DOMCTL_set_access_required 64
> #define XEN_DOMCTL_audit_p2m 65
> #define XEN_DOMCTL_set_virq_handler 66
> +#define XEN_DOMCTL_setnodeaffinity 67
> +#define XEN_DOMCTL_getnodeaffinity 68
> #define XEN_DOMCTL_gdbsx_guestmemio 1000
> #define XEN_DOMCTL_gdbsx_pausevcpu 1001
> #define XEN_DOMCTL_gdbsx_unpausevcpu 1002
> @@ -913,6 +925,7 @@ struct xen_domctl {
> struct xen_domctl_getpageframeinfo getpageframeinfo;
> struct xen_domctl_getpageframeinfo2 getpageframeinfo2;
> struct xen_domctl_getpageframeinfo3 getpageframeinfo3;
> + struct xen_domctl_nodeaffinity nodeaffinity;
> struct xen_domctl_vcpuaffinity vcpuaffinity;
> struct xen_domctl_shadow_op shadow_op;
> struct xen_domctl_max_mem max_mem;
> diff --git a/xen/include/xen/nodemask.h b/xen/include/xen/nodemask.h
> --- a/xen/include/xen/nodemask.h
> +++ b/xen/include/xen/nodemask.h
> @@ -8,8 +8,9 @@
> * See detailed comments in the file linux/bitmap.h describing the
> * data type on which these nodemasks are based.
> *
> - * For details of nodemask_scnprintf() and nodemask_parse(),
> - * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c.
> + * For details of nodemask_scnprintf(), nodelist_scnprintf() and
> + * nodemask_parse(), see bitmap_scnprintf() and bitmap_parse()
> + * in lib/bitmap.c.
> *
> * The available nodemask operations are:
> *
> @@ -48,6 +49,7 @@
> * unsigned long *nodes_addr(mask) Array of unsigned long's in mask
> *
> * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing
> + * int nodelist_scnprintf(buf, len, mask) Format nodemask as a list for printing
> * int nodemask_parse(ubuf, ulen, mask) Parse ascii string as nodemask
> *
> * for_each_node_mask(node, mask) for-loop node over mask
> @@ -280,6 +282,14 @@ static inline int __first_unset_node(con
>
> #define nodes_addr(src) ((src).bits)
>
> +#define nodelist_scnprintf(buf, len, src) \
> + __nodelist_scnprintf((buf), (len), (src), MAX_NUMNODES)
> +static inline int __nodelist_scnprintf(char *buf, int len,
> + const nodemask_t *srcp, int nbits)
> +{
> + return bitmap_scnlistprintf(buf, len, srcp->bits, nbits);
> +}
> +
> #if 0
> #define nodemask_scnprintf(buf, len, src) \
> __nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES)
> diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h
> --- a/xen/include/xen/sched-if.h
> +++ b/xen/include/xen/sched-if.h
> @@ -182,6 +182,8 @@ struct scheduler {
> struct xen_domctl_scheduler_op *);
> int (*adjust_global) (const struct scheduler *,
> struct xen_sysctl_scheduler_op *);
> + void (*set_node_affinity) (const struct scheduler *,
> + struct domain *, nodemask_t *);
> void (*dump_settings) (const struct scheduler *);
> void (*dump_cpu_state) (const struct scheduler *, int);
>
> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -346,8 +346,12 @@ struct domain
> /* Various mem_events */
> struct mem_event_per_domain *mem_event;
>
> - /* Currently computed from union of all vcpu cpu-affinity masks. */
> + /*
> + * Can be specified by the user. If that is not the case, it is
> + * computed from the union of all the vcpu cpu-affinity masks.
> + */
> nodemask_t node_affinity;
> + int auto_node_affinity;
> unsigned int last_alloc_node;
> spinlock_t node_affinity_lock;
> };
> @@ -416,6 +420,7 @@ static inline void get_knownalive_domain
> ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED));
> }
>
> +int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity);
> void domain_update_node_affinity(struct domain *d);
>
> struct domain *domain_create(
> @@ -519,6 +524,7 @@ void sched_destroy_domain(struct domain
> int sched_move_domain(struct domain *d, struct cpupool *c);
> long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *);
> long sched_adjust_global(struct xen_sysctl_scheduler_op *);
> +void sched_set_node_affinity(struct domain *, nodemask_t *);
> int sched_id(void);
> void sched_tick_suspend(void);
> void sched_tick_resume(void);
> diff --git a/xen/include/xsm/xsm.h b/xen/include/xsm/xsm.h
> --- a/xen/include/xsm/xsm.h
> +++ b/xen/include/xsm/xsm.h
> @@ -56,6 +56,7 @@ struct xsm_operations {
> int (*domain_create) (struct domain *d, u32 ssidref);
> int (*max_vcpus) (struct domain *d);
> int (*destroydomain) (struct domain *d);
> + int (*nodeaffinity) (int cmd, struct domain *d);
> int (*vcpuaffinity) (int cmd, struct domain *d);
> int (*scheduler) (struct domain *d);
> int (*getdomaininfo) (struct domain *d);
> @@ -229,6 +230,11 @@ static inline int xsm_destroydomain (str
> return xsm_call(destroydomain(d));
> }
>
> +static inline int xsm_nodeaffinity (int cmd, struct domain *d)
> +{
> + return xsm_call(nodeaffinity(cmd, d));
> +}
> +
> static inline int xsm_vcpuaffinity (int cmd, struct domain *d)
> {
> return xsm_call(vcpuaffinity(cmd, d));
> diff --git a/xen/xsm/dummy.c b/xen/xsm/dummy.c
> --- a/xen/xsm/dummy.c
> +++ b/xen/xsm/dummy.c
> @@ -634,6 +634,7 @@ void xsm_fixup_ops (struct xsm_operation
> set_to_dummy_if_null(ops, domain_create);
> set_to_dummy_if_null(ops, max_vcpus);
> set_to_dummy_if_null(ops, destroydomain);
> + set_to_dummy_if_null(ops, nodeaffinity);
> set_to_dummy_if_null(ops, vcpuaffinity);
> set_to_dummy_if_null(ops, scheduler);
> set_to_dummy_if_null(ops, getdomaininfo);
> diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c
> --- a/xen/xsm/flask/hooks.c
> +++ b/xen/xsm/flask/hooks.c
> @@ -521,17 +521,19 @@ static int flask_destroydomain(struct do
> DOMAIN__DESTROY);
> }
>
> -static int flask_vcpuaffinity(int cmd, struct domain *d)
> +static int flask_affinity(int cmd, struct domain *d)
> {
> u32 perm;
>
> switch ( cmd )
> {
> case XEN_DOMCTL_setvcpuaffinity:
> - perm = DOMAIN__SETVCPUAFFINITY;
> + case XEN_DOMCTL_setnodeaffinity:
> + perm = DOMAIN__SETAFFINITY;
> break;
> case XEN_DOMCTL_getvcpuaffinity:
> - perm = DOMAIN__GETVCPUAFFINITY;
> + case XEN_DOMCTL_getnodeaffinity:
> + perm = DOMAIN__GETAFFINITY;
> break;
> default:
> return -EPERM;
> @@ -1473,7 +1475,8 @@ static struct xsm_operations flask_ops =
> .domain_create = flask_domain_create,
> .max_vcpus = flask_max_vcpus,
> .destroydomain = flask_destroydomain,
> - .vcpuaffinity = flask_vcpuaffinity,
> + .nodeaffinity = flask_affinity,
> + .vcpuaffinity = flask_affinity,
> .scheduler = flask_scheduler,
> .getdomaininfo = flask_getdomaininfo,
> .getvcpucontext = flask_getvcpucontext,
> diff --git a/xen/xsm/flask/include/av_perm_to_string.h b/xen/xsm/flask/include/av_perm_to_string.h
> --- a/xen/xsm/flask/include/av_perm_to_string.h
> +++ b/xen/xsm/flask/include/av_perm_to_string.h
> @@ -37,8 +37,8 @@
> S_(SECCLASS_DOMAIN, DOMAIN__TRANSITION, "transition")
> S_(SECCLASS_DOMAIN, DOMAIN__MAX_VCPUS, "max_vcpus")
> S_(SECCLASS_DOMAIN, DOMAIN__DESTROY, "destroy")
> - S_(SECCLASS_DOMAIN, DOMAIN__SETVCPUAFFINITY, "setvcpuaffinity")
> - S_(SECCLASS_DOMAIN, DOMAIN__GETVCPUAFFINITY, "getvcpuaffinity")
> + S_(SECCLASS_DOMAIN, DOMAIN__SETAFFINITY, "setaffinity")
> + S_(SECCLASS_DOMAIN, DOMAIN__GETAFFINITY, "getaffinity")
The top of this file says, "This file is automatically generated. Do
not edit." I didn't see any files that might have been modified to
effect these changes -- did I miss them? Or is the comment a lie? Or
should you find that file and edit it instead? :-)
> S_(SECCLASS_DOMAIN, DOMAIN__SCHEDULER, "scheduler")
> S_(SECCLASS_DOMAIN, DOMAIN__GETDOMAININFO, "getdomaininfo")
> S_(SECCLASS_DOMAIN, DOMAIN__GETVCPUINFO, "getvcpuinfo")
> diff --git a/xen/xsm/flask/include/av_permissions.h b/xen/xsm/flask/include/av_permissions.h
> --- a/xen/xsm/flask/include/av_permissions.h
> +++ b/xen/xsm/flask/include/av_permissions.h
> @@ -38,8 +38,8 @@
> #define DOMAIN__TRANSITION 0x00000020UL
> #define DOMAIN__MAX_VCPUS 0x00000040UL
> #define DOMAIN__DESTROY 0x00000080UL
> -#define DOMAIN__SETVCPUAFFINITY 0x00000100UL
> -#define DOMAIN__GETVCPUAFFINITY 0x00000200UL
> +#define DOMAIN__SETAFFINITY 0x00000100UL
> +#define DOMAIN__GETAFFINITY 0x00000200UL
Same thing here.
Other than that, looks good!
-George
> #define DOMAIN__SCHEDULER 0x00000400UL
> #define DOMAIN__GETDOMAININFO 0x00000800UL
> #define DOMAIN__GETVCPUINFO 0x00001000UL
>
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel