
Re: [Xen-devel] [PATCH v2 6/8] x86: get per domain CQM information



> -----Original Message-----
> From: Andrew Cooper [mailto:andrew.cooper3@xxxxxxxxxx]
> Sent: Thursday, November 21, 2013 10:09 PM
> To: Xu, Dongxiao
> Cc: xen-devel@xxxxxxxxxxxxx
> Subject: Re: [Xen-devel] [PATCH v2 6/8] x86: get per domain CQM information
> 
> On 21/11/13 07:20, dongxiao.xu@xxxxxxxxx wrote:
> > From: Dongxiao Xu <dongxiao.xu@xxxxxxxxx>
> >
> > Retrieve CQM information for a certain domain, which reflects the L3 cache
> > occupancy for a socket.
> >
> > Signed-off-by: Jiongxi Li <jiongxi.li@xxxxxxxxx>
> > Signed-off-by: Dongxiao Xu <dongxiao.xu@xxxxxxxxx>
> > ---
> >  xen/arch/x86/pqos.c             |   57 ++++++++++++++++++++++++++++++++++
> >  xen/arch/x86/sysctl.c           |   64 +++++++++++++++++++++++++++++++++++++++
> >  xen/include/asm-x86/msr-index.h |    4 +++
> >  xen/include/asm-x86/pqos.h      |   14 +++++++++
> >  xen/include/public/domctl.h     |   14 +++++++++
> >  xen/include/public/sysctl.h     |   15 +++++++++
> >  6 files changed, 168 insertions(+)
> >
> > diff --git a/xen/arch/x86/pqos.c b/xen/arch/x86/pqos.c
> > index e294799..d95784c 100644
> > --- a/xen/arch/x86/pqos.c
> > +++ b/xen/arch/x86/pqos.c
> > @@ -19,14 +19,30 @@
> >   * Place - Suite 330, Boston, MA 02111-1307 USA.
> >   */
> >  #include <asm/processor.h>
> > +#include <asm/msr.h>
> >  #include <xen/init.h>
> >  #include <xen/spinlock.h>
> >  #include <xen/sched.h>
> > +#include <public/domctl.h>
> >  #include <asm/pqos.h>
> >
> >  static bool_t pqos_enabled = 1;
> >  boolean_param("pqos", pqos_enabled);
> >
> > +static void read_qm_data(void *arg)
> > +{
> > +    struct qm_element *qm_element = arg;
> > +
> > +    wrmsr(MSR_IA32_QOSEVTSEL, qm_element->evtid, qm_element->rmid);
> > +    rdmsrl(MSR_IA32_QMC, qm_element->qm_data);
> > +}
> > +
> > +static void get_generic_qm_info(struct qm_element *qm_element)
> > +{
> > +    unsigned int cpu = qm_element->cpu;
> > +    on_selected_cpus(cpumask_of(cpu), read_qm_data, qm_element, 1);
> > +}
> > +
> >  unsigned int cqm_res_count = 0;
> >  unsigned int cqm_upscaling_factor = 0;
> >  bool_t cqm_enabled = 0;
> > @@ -86,6 +102,23 @@ bool_t system_supports_cqm(void)
> >      return cqm_enabled;
> >  }
> >
> > +unsigned int get_cqm_count(void)
> > +{
> > +    return cqm_res_count;
> > +}
> > +
> > +unsigned int get_cqm_avail(void)
> > +{
> > +    unsigned int cqm_avail = 0;
> > +    int i;
> 
> unsigned int please.  If cqm_res_count has its top bit set, the
> following loop may never terminate.

OK.
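
For example, just a sketch of the corrected loop:

unsigned int get_cqm_avail(void)
{
    unsigned int cqm_avail = 0;
    unsigned int i;

    for ( i = 0; i < cqm_res_count; i++ )
        if ( !cqm_res_array[i].inuse )
            cqm_avail++;

    return cqm_avail;
}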

> 
> > +
> > +    for ( i = 0; i < cqm_res_count; i++ )
> > +        if ( !cqm_res_array[i].inuse )
> > +            cqm_avail++;
> > +
> > +    return cqm_avail;
> > +}
> > +
> >  int alloc_cqm_rmid(struct domain *d)
> >  {
> >      int rmid, rc = 0;
> > @@ -137,6 +170,30 @@ void free_cqm_rmid(struct domain *d)
> >      d->arch.pqos_cqm_rmid = 0;
> >  }
> >
> > +void get_cqm_info(uint32_t rmid, cpumask_t cpu_cqmdata_map,
> 
> A cpumask_t is already quite large, and will get larger in the future.
> Pass by pointer please.

OK.
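
i.e. something along the lines of:

void get_cqm_info(uint32_t rmid, const cpumask_t *cpu_cqmdata_map,
                  struct xen_domctl_getdomcqminfo *info);

with the caller passing &cpu_cqmdata_map (the const is just a suggestion here).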

> 
> > +                  struct xen_domctl_getdomcqminfo *info)
> > +{
> > +    struct qm_element element;
> > +    unsigned int cpu, i;
> > +
> > +    for_each_cpu ( cpu, &cpu_cqmdata_map )
> > +    {
> > +        element.cpu = cpu;
> > +        element.rmid = rmid;
> > +        element.evtid = QOS_MONITOR_EVTID_L3;
> > +
> > +        get_generic_qm_info(&element);
> > +
> > +        i = cpu_to_socket(cpu);
> 
> cpu_to_socket() can return BAD_APICID.

OK, will check the return value for correctness.
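
Roughly (sketch; the upper bound check is there because i is used to index
socket_cqmdata[]):

        i = cpu_to_socket(cpu);
        if ( i == BAD_APICID || i >= QOS_MAX_SOCKETS )
            continue;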

> 
> > +        info->socket_cqmdata[i].valid =
> > +            (element.qm_data & IA32_QM_CTR_ERROR_MASK) ? 0 : 1;
> 
> info->socket_cqmdata[i].valid = !(element.qm_data &
> IA32_QM_CTR_ERROR_MASK);

OK.

> 
> > +        if ( info->socket_cqmdata[i].valid )
> > +            info->socket_cqmdata[i].l3c_occupancy = element.qm_data * cqm_upscaling_factor;
> > +        else
> > +            info->socket_cqmdata[i].l3c_occupancy = 0;
> > +    }
> > +}
> > +
> >  /*
> >   * Local variables:
> >   * mode: C
> > diff --git a/xen/arch/x86/sysctl.c b/xen/arch/x86/sysctl.c
> > index 15d4b91..d631769 100644
> > --- a/xen/arch/x86/sysctl.c
> > +++ b/xen/arch/x86/sysctl.c
> > @@ -28,6 +28,7 @@
> >  #include <xen/nodemask.h>
> >  #include <xen/cpu.h>
> >  #include <xsm/xsm.h>
> > +#include <asm/pqos.h>
> >
> >  #define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
> >
> > @@ -101,6 +102,69 @@ long arch_do_sysctl(
> >      }
> >      break;
> >
> > +    case XEN_SYSCTL_getdomcqminfolist:
> > +    {
> 
> This whole hypercall makes me somewhat uneasy.
> 
> > +        struct domain *d;
> > +        struct xen_domctl_getdomcqminfo info;
> > +        uint32_t resource_count;
> > +        uint32_t resource_avail;
> > +        uint32_t num_domains = 0;
> > +        cpumask_t cpu_cqmdata_map;
> > +        DECLARE_BITMAP(sockets, QOS_MAX_SOCKETS);
> > +        unsigned int cpu;
> > +
> > +        if ( !system_supports_cqm() )
> > +        {
> > +            ret = -ENODEV;
> > +            break;
> > +        }
> > +
> > +        resource_count = get_cqm_count();
> > +        resource_avail = get_cqm_avail();
> > +
> > +        cpumask_clear(&cpu_cqmdata_map);
> > +        bitmap_zero(sockets, QOS_MAX_SOCKETS);
> > +        for_each_online_cpu(cpu)
> > +        {
> > +            int i = cpu_to_socket(cpu);
> > +            if ( test_and_set_bit(i, sockets) )
> > +                continue;
> > +            cpumask_set_cpu(cpu, &cpu_cqmdata_map);
> > +        }
> 
> What is this doing? It appears to be finding the first cpu on each socket.

Yes, the CQM info is per-socket, so just one CPU within the socket is enough to 
get/set the data.

> 
> > +
> > +        rcu_read_lock(&domlist_read_lock);
> > +        for_each_domain ( d )
> > +        {
> > +            if ( d->domain_id < sysctl->u.getdomaininfolist.first_domain )
> > +                continue;
> > +            if ( num_domains == sysctl->u.getdomaininfolist.max_domains )
> > +                break;
> > +            if ( d->arch.pqos_cqm_rmid <= 0 )
> > +                continue;
> 
> Is there any case where pqos_cqm_rmid can be negative? alloc_cqm_rmid()
> never assigns a negative number now in v2, in which case
> d->arch.pqos_cqm_rmid can probably be unsigned (and related int rmid's
> can be similarly promoted to unsigned)

Yes, it should have been changed in the v2 patch, sorry.

> 
> > +            memset(&info, 0, sizeof(struct xen_domctl_getdomcqminfo));
> > +            info.domain = d->domain_id;
> > +            get_cqm_info(d->arch.pqos_cqm_rmid, cpu_cqmdata_map, &info);
> 
> So for a domain the hypercallee is interested in, we get its rmid, and
> ask get_cqm_info() to individually IPI each one cpu from a socket to
> fill in the info field?
> 
> The IPIs are quite expensive, and this system will currently monopolise
> the first cpu on each socket.

Since the L3 cache usage is tracked per socket, the current solution is:
 - Find the first CPU of each socket and mark it in cpu_cqmdata_map.
 - get_cqm_info() issues an IPI to each CPU marked in cpu_cqmdata_map to get the per-socket data.

Do you have a proposed solution for this scenario?

> 
> > +
> > +            if ( copy_to_guest_offset(sysctl->u.getdomcqminfolist.buffer,
> > +                                      num_domains, &info, 1) )
> > +            {
> > +                ret = -EFAULT;
> > +                break;
> > +            }
> > +
> > +            num_domains++;
> 
> So this loop is primarily bounded by the number of domains, where each
> domain with a valid rmid will result in a spate of IPI?
> 
> This looks like it needs hypercall continuation logic.

OK, what about a preemption check after every domain?
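
Roughly like this, perhaps (untested sketch; the entries already written and
num_domains would also need to be carried across the continuation):

            if ( hypercall_preempt_check() )
            {
                sysctl->u.getdomcqminfolist.first_domain = d->domain_id + 1;
                if ( copy_to_guest(u_sysctl, sysctl, 1) )
                    ret = -EFAULT;
                else
                    ret = hypercall_create_continuation(
                        __HYPERVISOR_sysctl, "h", u_sysctl);
                break;
            }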

> 
> Also, how well does this intersect with updating the rmid assignment?

It needs to acquire cqm_lock to serialise concurrent access to the CQM-related 
data.

> 
> > +        }
> > +        rcu_read_unlock(&domlist_read_lock);
> > +
> > +        sysctl->u.getdomcqminfolist.num_domains = num_domains;
> > +        sysctl->u.getdomcqminfolist.resource_count = resource_count;
> > +        sysctl->u.getdomcqminfolist.resource_avail = resource_avail;
> > +
> > +        if ( copy_to_guest(u_sysctl, sysctl, 1) )
> > +            ret = -EFAULT;
> > +    }
> > +    break;
> 
> break should be inside the brace.

I referred to the other cases in arch/x86/sysctl.c and arch/x86/domctl.c, and the 
"break" is mostly placed outside the brace...

> 
> > +
> >      default:
> >          ret = -ENOSYS;
> >          break;
> > diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
> > index e597a28..46ef165 100644
> > --- a/xen/include/asm-x86/msr-index.h
> > +++ b/xen/include/asm-x86/msr-index.h
> > @@ -488,4 +488,8 @@
> >  /* Geode defined MSRs */
> >  #define MSR_GEODE_BUSCONT_CONF0            0x00001900
> >
> > +/* Platform QoS register */
> > +#define MSR_IA32_QOSEVTSEL             0x00000c8d
> > +#define MSR_IA32_QMC                   0x00000c8e
> > +
> >  #endif /* __ASM_MSR_INDEX_H */
> > diff --git a/xen/include/asm-x86/pqos.h b/xen/include/asm-x86/pqos.h
> > index 88de139..5c86c5d 100644
> > --- a/xen/include/asm-x86/pqos.h
> > +++ b/xen/include/asm-x86/pqos.h
> > @@ -27,15 +27,29 @@
> >  /* QoS Monitoring Event ID */
> >  #define QOS_MONITOR_EVTID_L3           0x1
> >
> > +/* IA32_QM_CTR */
> > +#define IA32_QM_CTR_ERROR_MASK         (0x3ul << 62)
> > +
> >  struct cqm_res_struct {
> >      domid_t  domain_id;
> >      bool_t   inuse;
> >  };
> >
> > +struct qm_element {
> > +    uint64_t  qm_data;
> > +    uint32_t  cpu;
> > +    uint32_t  rmid;
> > +    uint8_t   evtid;
> > +};
> > +
> >  void init_platform_qos(void);
> >
> >  bool_t system_supports_cqm(void);
> >  int alloc_cqm_rmid(struct domain *);
> >  void free_cqm_rmid(struct domain *);
> > +unsigned int get_cqm_count(void);
> > +unsigned int get_cqm_avail(void);
> > +void get_cqm_info(uint32_t rmid, cpumask_t cpu_cqmdata_map,
> > +                  struct xen_domctl_getdomcqminfo *info);
> >
> >  #endif
> > diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
> > index f5d7062..fe8b37f 100644
> > --- a/xen/include/public/domctl.h
> > +++ b/xen/include/public/domctl.h
> > @@ -883,6 +883,20 @@ struct xen_domctl_qos_resource {
> >  typedef struct xen_domctl_qos_resource xen_domctl_qos_resource_t;
> >  DEFINE_XEN_GUEST_HANDLE(xen_domctl_qos_resource_t);
> >
> > +struct xen_socket_cqmdata {
> > +    uint64_t l3c_occupancy;
> > +    uint8_t  valid;
> > +};
> > +
> > +struct xen_domctl_getdomcqminfo {
> > +    /* OUT variables. */
> > +    domid_t  domain;
> > +#define QOS_MAX_SOCKETS    128
> 
> Baking this into the ABI seems short sighted, and in this specific case
> looks to blow the 128 byte union size in a domctl structure.

Oh, thanks for catching this, it exceeds 128 bytes.

> 
> The toolstack should be able to find the number of sockets on the
> system, and provide a GUEST_HANDLE to an array of socket_cqmdata's of
> the appropriate length.

OK, will pass the structure in from the toolstack side.
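
i.e. roughly (sketch only; field names and the way the toolstack learns the
socket count are placeholders):

struct xen_socket_cqmdata {
    uint64_t l3c_occupancy;
    uint8_t  valid;
};
typedef struct xen_socket_cqmdata xen_socket_cqmdata_t;
DEFINE_XEN_GUEST_HANDLE(xen_socket_cqmdata_t);

struct xen_domctl_getdomcqminfo {
    /* OUT variables. */
    domid_t  domain;
    /* IN: number of elements the socket_cqmdata array can hold;
     * sized by the toolstack to the number of sockets it finds. */
    uint32_t num_sockets;
    XEN_GUEST_HANDLE_64(xen_socket_cqmdata_t) socket_cqmdata;
};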

> 
> > +    struct xen_socket_cqmdata socket_cqmdata[QOS_MAX_SOCKETS];
> > +};
> > +typedef struct xen_domctl_getdomcqminfo xen_domctl_getdomcqminfo_t;
> > +DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomcqminfo_t);
> > +
> >  struct xen_domctl {
> >      uint32_t cmd;
> >  #define XEN_DOMCTL_createdomain                   1
> > diff --git a/xen/include/public/sysctl.h b/xen/include/public/sysctl.h
> > index 8437d31..0def306 100644
> > --- a/xen/include/public/sysctl.h
> > +++ b/xen/include/public/sysctl.h
> > @@ -632,6 +632,19 @@ struct xen_sysctl_coverage_op {
> >  typedef struct xen_sysctl_coverage_op xen_sysctl_coverage_op_t;
> >  DEFINE_XEN_GUEST_HANDLE(xen_sysctl_coverage_op_t);
> >
> > +/* XEN_SYSCTL_getdomcqminfolist */
> > +struct xen_sysctl_getdomcqminfolist {
> > +    /* IN variables. */
> > +    domid_t               first_domain;
> > +    uint32_t              max_domains;
> > +    XEN_GUEST_HANDLE_64(xen_domctl_getdomcqminfo_t) buffer;
> > +    /* OUT variables. */
> > +    uint32_t              num_domains;
> 
> num_domains and max_domains can be folded together as both an in and an
> out parameter.  Also, "max_domains" is confusingly close to "max_domain"
> at a glance.

OK.
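
e.g. (sketch):

struct xen_sysctl_getdomcqminfolist {
    /* IN variables. */
    domid_t               first_domain;
    XEN_GUEST_HANDLE_64(xen_domctl_getdomcqminfo_t) buffer;
    /* IN: size of buffer in entries; OUT: number of entries written. */
    uint32_t              num_domains;
    /* OUT variables. */
    uint32_t              resource_count;
    uint32_t              resource_avail;
};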

Thanks,
Dongxiao

> 
> ~Andrew
> 
> > +    uint32_t              resource_count;
> > +    uint32_t              resource_avail;
> > +};
> > +typedef struct xen_sysctl_getdomcqminfolist xen_sysctl_getdomcqminfolist_t;
> > +DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getdomcqminfolist_t);
> >
> >  struct xen_sysctl {
> >      uint32_t cmd;
> > @@ -654,6 +667,7 @@ struct xen_sysctl {
> >  #define XEN_SYSCTL_cpupool_op                    18
> >  #define XEN_SYSCTL_scheduler_op                  19
> >  #define XEN_SYSCTL_coverage_op                   20
> > +#define XEN_SYSCTL_getdomcqminfolist             21
> >      uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
> >      union {
> >          struct xen_sysctl_readconsole       readconsole;
> > @@ -675,6 +689,7 @@ struct xen_sysctl {
> >          struct xen_sysctl_cpupool_op        cpupool_op;
> >          struct xen_sysctl_scheduler_op      scheduler_op;
> >          struct xen_sysctl_coverage_op       coverage_op;
> > +        struct xen_sysctl_getdomcqminfolist getdomcqminfolist;
> >          uint8_t                             pad[128];
> >      } u;
> >  };


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

