[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH v2 6/8] x86: get per domain CQM information



On 21/11/13 07:20, dongxiao.xu@xxxxxxxxx wrote:
> From: Dongxiao Xu <dongxiao.xu@xxxxxxxxx>
>
> Retrive CQM information for certain domain, which reflects the L3 cache
> occupancy for a socket.
>
> Signed-off-by: Jiongxi Li <jiongxi.li@xxxxxxxxx>
> Signed-off-by: Dongxiao Xu <dongxiao.xu@xxxxxxxxx>
> ---
>  xen/arch/x86/pqos.c             |   57 ++++++++++++++++++++++++++++++++++
>  xen/arch/x86/sysctl.c           |   64 
> +++++++++++++++++++++++++++++++++++++++
>  xen/include/asm-x86/msr-index.h |    4 +++
>  xen/include/asm-x86/pqos.h      |   14 +++++++++
>  xen/include/public/domctl.h     |   14 +++++++++
>  xen/include/public/sysctl.h     |   15 +++++++++
>  6 files changed, 168 insertions(+)
>
> diff --git a/xen/arch/x86/pqos.c b/xen/arch/x86/pqos.c
> index e294799..d95784c 100644
> --- a/xen/arch/x86/pqos.c
> +++ b/xen/arch/x86/pqos.c
> @@ -19,14 +19,30 @@
>   * Place - Suite 330, Boston, MA 02111-1307 USA.
>   */
>  #include <asm/processor.h>
> +#include <asm/msr.h>
>  #include <xen/init.h>
>  #include <xen/spinlock.h>
>  #include <xen/sched.h>
> +#include <public/domctl.h>
>  #include <asm/pqos.h>
>  
>  static bool_t pqos_enabled = 1;
>  boolean_param("pqos", pqos_enabled);
>  
> +static void read_qm_data(void *arg)
> +{
> +    struct qm_element *qm_element = arg;
> +
> +    wrmsr(MSR_IA32_QOSEVTSEL, qm_element->evtid, qm_element->rmid);
> +    rdmsrl(MSR_IA32_QMC, qm_element->qm_data);
> +}
> +
> +static void get_generic_qm_info(struct qm_element *qm_element)
> +{
> +    unsigned int cpu = qm_element->cpu;
> +    on_selected_cpus(cpumask_of(cpu), read_qm_data, qm_element, 1);
> +}
> +
>  unsigned int cqm_res_count = 0;
>  unsigned int cqm_upscaling_factor = 0;
>  bool_t cqm_enabled = 0;
> @@ -86,6 +102,23 @@ bool_t system_supports_cqm(void)
>      return cqm_enabled;
>  }
>  
> +unsigned int get_cqm_count(void)
> +{
> +    return cqm_res_count;
> +}
> +
> +unsigned int get_cqm_avail(void)
> +{
> +    unsigned int cqm_avail = 0;
> +    int i;

unsigned int please.  If cqm_res_count has its top bit set, the
following loop may never terminate.

> +
> +    for ( i = 0; i < cqm_res_count; i++ )
> +        if ( !cqm_res_array[i].inuse )
> +            cqm_avail++;
> +
> +    return cqm_avail;
> +}
> +
>  int alloc_cqm_rmid(struct domain *d)
>  {
>      int rmid, rc = 0;
> @@ -137,6 +170,30 @@ void free_cqm_rmid(struct domain *d)
>      d->arch.pqos_cqm_rmid = 0;
>  }
>  
> +void get_cqm_info(uint32_t rmid, cpumask_t cpu_cqmdata_map,

A cpumask_t is already quite large, and will get larger in the future. 
Pass by pointer please.

> +                  struct xen_domctl_getdomcqminfo *info)
> +{
> +    struct qm_element element;
> +    unsigned int cpu, i;
> +
> +    for_each_cpu ( cpu, &cpu_cqmdata_map )
> +    {
> +        element.cpu = cpu;
> +        element.rmid = rmid;
> +        element.evtid = QOS_MONITOR_EVTID_L3;
> +
> +        get_generic_qm_info(&element);
> +
> +        i = cpu_to_socket(cpu);

cpu_to_socket() can return BAD_APICID, which is then used unchecked as an
index into socket_cqmdata[] below.

> +        info->socket_cqmdata[i].valid =
> +            (element.qm_data & IA32_QM_CTR_ERROR_MASK) ? 0 : 1;

info->socket_cqmdata[i].valid = !(element.qm_data & IA32_QM_CTR_ERROR_MASK);

> +        if ( info->socket_cqmdata[i].valid )
> +            info->socket_cqmdata[i].l3c_occupancy = element.qm_data * 
> cqm_upscaling_factor;
> +        else
> +            info->socket_cqmdata[i].l3c_occupancy = 0;
> +    }
> +}
> +
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/arch/x86/sysctl.c b/xen/arch/x86/sysctl.c
> index 15d4b91..d631769 100644
> --- a/xen/arch/x86/sysctl.c
> +++ b/xen/arch/x86/sysctl.c
> @@ -28,6 +28,7 @@
>  #include <xen/nodemask.h>
>  #include <xen/cpu.h>
>  #include <xsm/xsm.h>
> +#include <asm/pqos.h>
>  
>  #define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
>  
> @@ -101,6 +102,69 @@ long arch_do_sysctl(
>      }
>      break;
>  
> +    case XEN_SYSCTL_getdomcqminfolist:
> +    {

This whole hypercall makes me somewhat uneasy.

> +        struct domain *d;
> +        struct xen_domctl_getdomcqminfo info;
> +        uint32_t resource_count;
> +        uint32_t resource_avail;
> +        uint32_t num_domains = 0;
> +        cpumask_t cpu_cqmdata_map;
> +        DECLARE_BITMAP(sockets, QOS_MAX_SOCKETS);
> +        unsigned int cpu;
> +
> +        if ( !system_supports_cqm() )
> +        {
> +            ret = -ENODEV;
> +            break;
> +        }
> +
> +        resource_count = get_cqm_count();
> +        resource_avail = get_cqm_avail();
> +
> +        cpumask_clear(&cpu_cqmdata_map);
> +        bitmap_zero(sockets, QOS_MAX_SOCKETS);
> +        for_each_online_cpu(cpu)
> +        {
> +            int i = cpu_to_socket(cpu);
> +            if ( test_and_set_bit(i, sockets) )
> +                continue;
> +            cpumask_set_cpu(cpu, &cpu_cqmdata_map);
> +        }

What is this doing? It appears to be finding the first cpu on each socket.

> +
> +        rcu_read_lock(&domlist_read_lock);
> +        for_each_domain ( d )
> +        {
> +            if ( d->domain_id < sysctl->u.getdomaininfolist.first_domain )
> +                continue;
> +            if ( num_domains == sysctl->u.getdomaininfolist.max_domains )
> +                break;
> +            if ( d->arch.pqos_cqm_rmid <= 0 )
> +                continue;

Is there any case where pqos_cqm_rmid can be negative? alloc_cqm_rmid()
never assigns a negative number now in v2, in which case
d->arch.pqos_cqm_rmid can probably be unsigned (and the related int rmids
can be similarly promoted to unsigned).

> +            memset(&info, 0, sizeof(struct xen_domctl_getdomcqminfo));
> +            info.domain = d->domain_id;
> +            get_cqm_info(d->arch.pqos_cqm_rmid, cpu_cqmdata_map, &info);

So for a domain the hypercall's caller is interested in, we get its rmid,
and ask get_cqm_info() to individually IPI one cpu from each socket to
fill in the info field?

The IPIs are quite expensive, and this system will currently monopolise
the first cpu on each socket.

> +
> +            if ( copy_to_guest_offset(sysctl->u.getdomcqminfolist.buffer,
> +                                      num_domains, &info, 1) )
> +            {
> +                ret = -EFAULT;
> +                break;
> +            }
> +
> +            num_domains++;

So this loop is primarily bounded by the number of domains, where each
domain with a valid rmid will result in a spate of IPIs?

This looks like it needs hypercall continuation logic.

Also, how well does this interact with updating the rmid assignment?

> +        }
> +        rcu_read_unlock(&domlist_read_lock);
> +
> +        sysctl->u.getdomcqminfolist.num_domains = num_domains;
> +        sysctl->u.getdomcqminfolist.resource_count = resource_count;
> +        sysctl->u.getdomcqminfolist.resource_avail = resource_avail;
> +
> +        if ( copy_to_guest(u_sysctl, sysctl, 1) )
> +            ret = -EFAULT;
> +    }
> +    break;

break should be inside the brace.

> +
>      default:
>          ret = -ENOSYS;
>          break;
> diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
> index e597a28..46ef165 100644
> --- a/xen/include/asm-x86/msr-index.h
> +++ b/xen/include/asm-x86/msr-index.h
> @@ -488,4 +488,8 @@
>  /* Geode defined MSRs */
>  #define MSR_GEODE_BUSCONT_CONF0              0x00001900
>  
> +/* Platform QoS register */
> +#define MSR_IA32_QOSEVTSEL             0x00000c8d
> +#define MSR_IA32_QMC                   0x00000c8e
> +
>  #endif /* __ASM_MSR_INDEX_H */
> diff --git a/xen/include/asm-x86/pqos.h b/xen/include/asm-x86/pqos.h
> index 88de139..5c86c5d 100644
> --- a/xen/include/asm-x86/pqos.h
> +++ b/xen/include/asm-x86/pqos.h
> @@ -27,15 +27,29 @@
>  /* QoS Monitoring Event ID */
>  #define QOS_MONITOR_EVTID_L3           0x1
>  
> +/* IA32_QM_CTR */
> +#define IA32_QM_CTR_ERROR_MASK         (0x3ul << 62)
> +
>  struct cqm_res_struct {
>      domid_t  domain_id;
>      bool_t   inuse;
>  };
>  
> +struct qm_element {
> +    uint64_t  qm_data;
> +    uint32_t  cpu;
> +    uint32_t  rmid;
> +    uint8_t   evtid;
> +};
> +
>  void init_platform_qos(void);
>  
>  bool_t system_supports_cqm(void);
>  int alloc_cqm_rmid(struct domain *);
>  void free_cqm_rmid(struct domain *);
> +unsigned int get_cqm_count(void);
> +unsigned int get_cqm_avail(void);
> +void get_cqm_info(uint32_t rmid, cpumask_t cpu_cqmdata_map,
> +                  struct xen_domctl_getdomcqminfo *info);
>  
>  #endif
> diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
> index f5d7062..fe8b37f 100644
> --- a/xen/include/public/domctl.h
> +++ b/xen/include/public/domctl.h
> @@ -883,6 +883,20 @@ struct xen_domctl_qos_resource {
>  typedef struct xen_domctl_qos_resource xen_domctl_qos_resource_t;
>  DEFINE_XEN_GUEST_HANDLE(xen_domctl_qos_resource_t);
>  
> +struct xen_socket_cqmdata {
> +    uint64_t l3c_occupancy;
> +    uint8_t  valid;
> +};
> +
> +struct xen_domctl_getdomcqminfo {
> +    /* OUT variables. */
> +    domid_t  domain;
> +#define QOS_MAX_SOCKETS    128

Baking this into the ABI seems short-sighted, and in this specific case
looks like it will blow the 128-byte union size in a domctl structure.

The toolstack should be able to find the number of sockets on the
system, and provide a GUEST_HANDLE to an array of socket_cqmdata's of
the appropriate length.

> +    struct xen_socket_cqmdata socket_cqmdata[QOS_MAX_SOCKETS];
> +};
> +typedef struct xen_domctl_getdomcqminfo xen_domctl_getdomcqminfo_t;
> +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomcqminfo_t);
> +
>  struct xen_domctl {
>      uint32_t cmd;
>  #define XEN_DOMCTL_createdomain                   1
> diff --git a/xen/include/public/sysctl.h b/xen/include/public/sysctl.h
> index 8437d31..0def306 100644
> --- a/xen/include/public/sysctl.h
> +++ b/xen/include/public/sysctl.h
> @@ -632,6 +632,19 @@ struct xen_sysctl_coverage_op {
>  typedef struct xen_sysctl_coverage_op xen_sysctl_coverage_op_t;
>  DEFINE_XEN_GUEST_HANDLE(xen_sysctl_coverage_op_t);
>  
> +/* XEN_SYSCTL_getdomcqminfolist */
> +struct xen_sysctl_getdomcqminfolist {
> +    /* IN variables. */
> +    domid_t               first_domain;
> +    uint32_t              max_domains;
> +    XEN_GUEST_HANDLE_64(xen_domctl_getdomcqminfo_t) buffer;
> +    /* OUT variables. */
> +    uint32_t              num_domains;

num_domains and max_domains can be folded together as both an in and an
out parameter.  Also, "max_domains" is confusingly close to "max_domain"
at a glance.

~Andrew

> +    uint32_t              resource_count;
> +    uint32_t              resource_avail;
> +};
> +typedef struct xen_sysctl_getdomcqminfolist xen_sysctl_getdomcqminfolist_t;
> +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getdomcqminfolist_t);
>  
>  struct xen_sysctl {
>      uint32_t cmd;
> @@ -654,6 +667,7 @@ struct xen_sysctl {
>  #define XEN_SYSCTL_cpupool_op                    18
>  #define XEN_SYSCTL_scheduler_op                  19
>  #define XEN_SYSCTL_coverage_op                   20
> +#define XEN_SYSCTL_getdomcqminfolist             21
>      uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
>      union {
>          struct xen_sysctl_readconsole       readconsole;
> @@ -675,6 +689,7 @@ struct xen_sysctl {
>          struct xen_sysctl_cpupool_op        cpupool_op;
>          struct xen_sysctl_scheduler_op      scheduler_op;
>          struct xen_sysctl_coverage_op       coverage_op;
> +        struct xen_sysctl_getdomcqminfolist getdomcqminfolist;
>          uint8_t                             pad[128];
>      } u;
>  };


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.