[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH v2] MCE: Fix race condition in mctelem_reserve



Sorry, I just return from long Chinese Spring Festival vacation. I will review 
it ASAP.

Thanks,
Jinsong

Jan Beulich wrote:
>>>> On 14.02.14 at 17:23, Frediano Ziglio <frediano.ziglio@xxxxxxxxxx>
>>>> wrote: Ping 
> 
> And this is clearly not the first ping. Guys - you had over 3 weeks
> time to respond to this. Please!
> 
> Jan
> 
>> On Fri, 2014-01-31 at 13:37 +0000, Frediano Ziglio wrote:
>>> Ping
>>> 
>>> On Wed, 2014-01-22 at 17:17 +0000, Frediano Ziglio wrote:
>>>> From 49b37906afef0981f318064f4cb53a3602bca50a Mon Sep 17 00:00:00
>>>> 2001 From: Frediano Ziglio <frediano.ziglio@xxxxxxxxxx>
>>>> Date: Wed, 22 Jan 2014 10:48:50 +0000
>>>> Subject: [PATCH] MCE: Fix race condition in mctelem_reserve
>>>> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8
>>>> Content-Transfer-Encoding: 8bit
>>>> 
>>>> These lines (in mctelem_reserve)
>>>> 
>>>>         newhead = oldhead->mcte_next;
>>>>         if (cmpxchgptr(freelp, oldhead, newhead) == oldhead) {
>>>> 
>>>> are racy. After you read the newhead pointer it can happen that
>>>> another flow (thread or recursive invocation) change all the list
>>>> but set head with same value. So oldhead is the same as *freelp
>>>> but you are setting 
>>>> a new head that could point to whatever element (even already
>>>> used). 
>>>> 
>>>> This patch use instead a bit array and atomic bit operations.
>>>> 
>>>> Signed-off-by: Frediano Ziglio <frediano.ziglio@xxxxxxxxxx> ---
>>>>  xen/arch/x86/cpu/mcheck/mctelem.c |   81
>> ++++++++++++++-----------------------
>>>>  1 file changed, 30 insertions(+), 51 deletions(-)
>>>> 
>>>> Changes from v1:
>>>> - Use bitmap to allow any number of items to be used;
>>>> - Use a single bitmap to simplify reserve loop;
>>>> - Remove HOME flags as was not used anymore.
>>>> 
>>>> diff --git a/xen/arch/x86/cpu/mcheck/mctelem.c
>> b/xen/arch/x86/cpu/mcheck/mctelem.c
>>>> index 895ce1a..ed8e8d2 100644
>>>> --- a/xen/arch/x86/cpu/mcheck/mctelem.c
>>>> +++ b/xen/arch/x86/cpu/mcheck/mctelem.c
>>>> @@ -37,24 +37,19 @@ struct mctelem_ent {
>>>>    void *mcte_data;                /* corresponding data payload */  };
>>>> 
>>>> -#define   MCTE_F_HOME_URGENT              0x0001U /* free to urgent 
>>>> freelist */
>>>> -#define   MCTE_F_HOME_NONURGENT           0x0002U /* free to nonurgent
>>>> freelist */ 
>>>> -#define   MCTE_F_CLASS_URGENT             0x0004U /* in use - urgent 
>>>> errors */
>>>> -#define   MCTE_F_CLASS_NONURGENT          0x0008U /* in use - nonurgent
>>>> errors */ +#define MCTE_F_CLASS_URGENT             0x0001U /* in use - 
>>>> urgent
>>>> errors */ +#define MCTE_F_CLASS_NONURGENT          0x0002U /* in use -
>>>>  nonurgent errors */ #define       MCTE_F_STATE_FREE               0x0010U 
>>>> /* on a
>>>>  freelist */ #define       MCTE_F_STATE_UNCOMMITTED        0x0020U /* 
>>>> reserved;
>>>>  on no list */ #define     MCTE_F_STATE_COMMITTED          0x0040U /* on a
>>>>  committed list */ #define MCTE_F_STATE_PROCESSING         0x0080U /* on
>>>> a processing list */ 
>>>> 
>>>> -#define   MCTE_F_MASK_HOME        (MCTE_F_HOME_URGENT |
>>>>  MCTE_F_HOME_NONURGENT)
>>>>  #define   MCTE_F_MASK_CLASS       (MCTE_F_CLASS_URGENT |
>>>>                            MCTE_F_CLASS_NONURGENT)
>>>>                            #define MCTE_F_MASK_STATE       
>>>> (MCTE_F_STATE_FREE | \
>>>>                            MCTE_F_STATE_UNCOMMITTED | \ 
>>>> MCTE_F_STATE_COMMITTED | \
>>>> MCTE_F_STATE_PROCESSING) 
>>>> 
>>>> -#define   MCTE_HOME(tep) ((tep)->mcte_flags & MCTE_F_MASK_HOME) -
>>>>  #define   MCTE_CLASS(tep) ((tep)->mcte_flags & MCTE_F_MASK_CLASS)
>>>>  #define   MCTE_SET_CLASS(tep, new) do { \
>>>>      (tep)->mcte_flags &= ~MCTE_F_MASK_CLASS; \
>>>> @@ -69,6 +64,8 @@ struct mctelem_ent {
>>>>  #define   MC_URGENT_NENT          10
>>>>  #define   MC_NONURGENT_NENT       20
>>>> 
>>>> +#define MC_NENT (MC_URGENT_NENT + MC_NONURGENT_NENT) +
>>>>  #define   MC_NCLASSES             (MC_NONURGENT + 1)
>>>> 
>>>>  #define   COOKIE2MCTE(c)          ((struct mctelem_ent *)(c))
>>>> @@ -77,11 +74,9 @@ struct mctelem_ent {
>>>>  static struct mc_telem_ctl {
>>>>    /* Linked lists that thread the array members together.          *
>>>> -   * The free lists are singly-linked via mcte_next, and we
>>>> allocate 
>>>> -   * from them by atomically unlinking an element from the head.
>>>> -   * Consumed entries are returned to the head of the free list.
>>>> -   * When an entry is reserved off the free list it is not linked
>>>> -   * on any list until it is committed or dismissed.
>>>> +   * The free lists is a bit array where bit 1 means free.
>>>> +   * This as element number is quite small and is easy to
>>>> +   * atomically allocate that way.
>>>>     *
>>>>     * The committed list grows at the head and we do not maintain a
>>>>     * tail pointer; insertions are performed atomically.  The head
>>>> @@ -101,7 +96,7 @@ static struct mc_telem_ctl {
>>>>     * we can lock it for updates.  The head of the processing list
>>>>     * always has the oldest telemetry, and we append (as above)
>>>>     * at the tail of the processing list. */
>>>> -  struct mctelem_ent *mctc_free[MC_NCLASSES];
>>>> +  DECLARE_BITMAP(mctc_free, MC_NENT);
>>>>    struct mctelem_ent *mctc_committed[MC_NCLASSES];
>>>>    struct mctelem_ent *mctc_processing_head[MC_NCLASSES];
>>>>    struct mctelem_ent *mctc_processing_tail[MC_NCLASSES];
>>>> @@ -207,14 +202,14 @@ int mctelem_has_deferred(unsigned int cpu)  
>>>>  */ static void mctelem_free(struct mctelem_ent *tep)
>>>>  {
>>>> -  mctelem_class_t target = MCTE_HOME(tep) == MCTE_F_HOME_URGENT ?
>>>> -      MC_URGENT : MC_NONURGENT;
>>>> -
>>>>    BUG_ON(tep->mcte_refcnt != 0);
>>>>    BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE);
>>>> 
>>>>    tep->mcte_prev = NULL;
>>>> -  mctelem_xchg_head(&mctctl.mctc_free[target], &tep->mcte_next,
>>>> tep); +    tep->mcte_next = NULL; +
>>>> +  /* set free in array */
>>>> +  set_bit(tep - mctctl.mctc_elems, mctctl.mctc_free);  }
>>>> 
>>>>  /* Increment the reference count of an entry that is not linked
>>>> on to @@ -274,34 +269,25 @@ void mctelem_init(int reqdatasz)       }
>>>> 
>>>>    if ((mctctl.mctc_elems = xmalloc_array(struct mctelem_ent,
>>>> -      MC_URGENT_NENT + MC_NONURGENT_NENT)) == NULL ||
>>>> -      (datarr = xmalloc_bytes((MC_URGENT_NENT + MC_NONURGENT_NENT)
>>>> * 
>>>> -      datasz)) == NULL) {
>>>> +      MC_NENT)) == NULL ||
>>>> +      (datarr = xmalloc_bytes(MC_NENT * datasz)) == NULL) {               
>>>> if
>>>>                    (mctctl.mctc_elems) xfree(mctctl.mctc_elems);
>>>>            printk("Allocations for MCA telemetry failed\n");               
>>>> return;
>>>>    }
>>>> 
>>>> -  for (i = 0; i < MC_URGENT_NENT + MC_NONURGENT_NENT; i++) {
>>>> -          struct mctelem_ent *tep, **tepp;
>>>> +  for (i = 0; i < MC_NENT; i++) {
>>>> +          struct mctelem_ent *tep;
>>>> 
>>>>            tep = mctctl.mctc_elems + i;
>>>>            tep->mcte_flags = MCTE_F_STATE_FREE;
>>>>            tep->mcte_refcnt = 0;
>>>>            tep->mcte_data = datarr + i * datasz;
>>>> 
>>>> -          if (i < MC_URGENT_NENT) {
>>>> -                  tepp = &mctctl.mctc_free[MC_URGENT];
>>>> -                  tep->mcte_flags |= MCTE_F_HOME_URGENT;
>>>> -          } else {
>>>> -                  tepp = &mctctl.mctc_free[MC_NONURGENT];
>>>> -                  tep->mcte_flags |= MCTE_F_HOME_NONURGENT;
>>>> -          }
>>>> -
>>>> -          tep->mcte_next = *tepp;
>>>> +          __set_bit(i, mctctl.mctc_free);
>>>> +          tep->mcte_next = NULL;
>>>>            tep->mcte_prev = NULL;
>>>> -          *tepp = tep;
>>>>    }
>>>>  }
>>>> 
>>>> @@ -310,32 +296,25 @@ static int mctelem_drop_count;
>>>> 
>>>>  /* Reserve a telemetry entry, or return NULL if none available.
>>>>   * If we return an entry then the caller must subsequently call
>>>> exactly one of - * mctelem_unreserve or mctelem_commit for that
>>>> entry. + * mctelem_dismiss or mctelem_commit for that entry.   */
>>>>  mctelem_cookie_t mctelem_reserve(mctelem_class_t which)  {
>>>> -  struct mctelem_ent **freelp;
>>>> -  struct mctelem_ent *oldhead, *newhead;
>>>> -  mctelem_class_t target = (which == MC_URGENT) ?
>>>> -      MC_URGENT : MC_NONURGENT;
>>>> +  unsigned bit;
>>>> +  unsigned start_bit = (which == MC_URGENT) ? 0 : MC_URGENT_NENT;
>>>> 
>>>> -  freelp = &mctctl.mctc_free[target];
>>>>    for (;;) {
>>>> -          if ((oldhead = *freelp) == NULL) {
>>>> -                  if (which == MC_URGENT && target == MC_URGENT) {
>>>> -                          /* raid the non-urgent freelist */
>>>> -                          target = MC_NONURGENT;
>>>> -                          freelp = &mctctl.mctc_free[target];
>>>> -                          continue;
>>>> -                  } else {
>>>> -                          mctelem_drop_count++;
>>>> -                          return (NULL);
>>>> -                  }
>>>> +          bit = find_next_bit(mctctl.mctc_free, MC_NENT, start_bit); +
>>>> +          if (bit >= MC_NENT) {
>>>> +                  mctelem_drop_count++;
>>>> +                  return (NULL);
>>>>            }
>>>> 
>>>> -          newhead = oldhead->mcte_next;
>>>> -          if (cmpxchgptr(freelp, oldhead, newhead) == oldhead) {
>>>> -                  struct mctelem_ent *tep = oldhead;
>>>> +          /* try to allocate, atomically clear free bit */
>>>> +          if (test_and_clear_bit(bit, mctctl.mctc_free)) {
>>>> +                  /* return element we got */
>>>> +                  struct mctelem_ent *tep = mctctl.mctc_elems + bit;
>>>> 
>>>>                    mctelem_hold(tep);
>>>>                    MCTE_TRANSITION_STATE(tep, FREE, UNCOMMITTED);


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.