[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH 7] MCA: simplify mce actoin, and some update of mem err handler for future SRAR extension


  • To: "Liu, Jinsong" <jinsong.liu@xxxxxxxxx>, Jan Beulich <JBeulich@xxxxxxxxxx>, "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>
  • From: Keir Fraser <keir@xxxxxxx>
  • Date: Sat, 14 May 2011 11:01:01 +0100
  • Cc: "Jiang, Yunhong" <yunhong.jiang@xxxxxxxxx>, "Li, Xin" <xin.li@xxxxxxxxx>
  • Delivery-date: Sat, 14 May 2011 03:03:25 -0700
  • Domainkey-signature: a=rsa-sha1; c=nofws; d=gmail.com; s=gamma; h=sender:user-agent:date:subject:from:to:cc:message-id:thread-topic :thread-index:in-reply-to:mime-version:content-type :content-transfer-encoding; b=Z26U7mbM0zWig9KwAqep9Hxap09r1HiKMKhex6XxdYSS67eKnqAwfLW3d0350jE5fD sbEhOE/2PrRwdO0TxXi2YVf62H3XKeUZAG6BSaFSK0CTucF8eMlYdcnhxCDcyJbPFIWd ycdrKetuzLBL81TJ4qtn/h//iIjEoC2rkNzLo=
  • List-id: Xen developer discussion <xen-devel.lists.xensource.com>
  • Thread-index: AcwNclEwEvLQfpHVSmSzkx8mJ5h67gECANbgAChTZIYAAIyFbw==
  • Thread-topic: [Xen-devel] [PATCH 7] MCA: simplify mce actoin, and some update of mem err handler for future SRAR extension

Ah no, I found it! I'll take a look at it on Monday.

 -- Keir

On 14/05/2011 10:45, "Keir Fraser" <keir@xxxxxxx> wrote:

> I don't seem to have this one in my queue. You should resend.
> 
>  -- Keir
> 
> On 13/05/2011 15:34, "Liu, Jinsong" <jinsong.liu@xxxxxxxxx> wrote:
> 
>> Jan and Keir,
>> 
>> Any comments?
>> 
>> Thanks
>> Jinsong
>> 
>> 
>> Liu, Jinsong wrote:
>>> MCA: simplify mce actoin, and some update of mem err handler for
>>> future SRAR extension
>>> 
>>> This patch simplify mce_action(). Basically the 2 set of mce status
>>> flags MCA_... and MCER_... are highly functional similar. This patch
>>> remove the redundant middle-level flag MCA_..., and its related data
>>> structure and function, so that mce_action() logic is more clean and
>>> simpler.    
>>> This patch also update mem err handler. intel_memerr_dhandler() will
>>> be commonly used by SRAO and SRAR, so for the extension of future
>>> SRAR case, this patch remove the default fail flag from
>>> intel_memerr_dhandler() outside to SRAO/SRAR level, since only
>>> SRAO/SRAR handler itself has the knowledge of how to handle the
>>> failure when mem err handler fail.
>>> 
>>> Signed-off-by: Liu, Jinsong <jinsong.liu@xxxxxxxxx>
>>> 
>>> diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/mce_intel.c
>>> --- a/xen/arch/x86/cpu/mcheck/mce_intel.c Sun May 08 13:39:40 2011
>>> +0800 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c Sun May 08 18:15:13
>>> 2011 +0800 @@ -172,23 +172,14 @@ static unsigned int __read_mostly
>>>  mce_dh static unsigned int __read_mostly mce_dhandler_num;
>>>  static unsigned int __read_mostly mce_uhandler_num;
>>> 
>>> -enum mce_result
>>> -{
>>> -    MCER_NOERROR,
>>> -    MCER_RECOVERED,
>>> -    /* Not recoverd, but can continue */
>>> -    MCER_CONTINUE,
>>> -    MCER_RESET,
>>> -};
>>> -
>>>  /* Maybe called in MCE context, no lock, no printk */
>>>  static enum mce_result mce_action(struct cpu_user_regs *regs,
>>>                        mctelem_cookie_t mctc)
>>>  {
>>>      struct mc_info *local_mi;
>>> -    enum mce_result ret = MCER_NOERROR;
>>> +    enum mce_result bank_result = MCER_NOERROR;
>>> +    enum mce_result worst_result = MCER_NOERROR;
>>>      struct mcinfo_common *mic = NULL;
>>> -    struct mca_handle_result mca_res;
>>>      struct mca_binfo binfo;
>>>      const struct mca_error_handler *handlers = mce_dhandlers;
>>>      unsigned int i, handler_num = mce_dhandler_num;
>>> @@ -217,7 +208,7 @@ static enum mce_result mce_action(struct
>>>      /* Processing bank information */
>>>      x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
>>> 
>>> -    for ( ; ret != MCER_RESET && mic && mic->size;
>>> +    for ( ; bank_result != MCER_RESET && mic && mic->size;
>>>            mic = x86_mcinfo_next(mic) )
>>>      {
>>>          if (mic->type != MC_TYPE_BANK) {
>>> @@ -225,34 +216,20 @@ static enum mce_result mce_action(struct
>>>          }
>>>          binfo.mib = (struct mcinfo_bank*)mic;
>>>          binfo.bank = binfo.mib->mc_bank;
>>> -        memset(&mca_res, 0x0f, sizeof(mca_res));
>>> +        bank_result = MCER_NOERROR;
>>>          for ( i = 0; i < handler_num; i++ ) {
>>>              if (handlers[i].owned_error(binfo.mib->mc_status))
>>>              {
>>> -                handlers[i].recovery_handler(&binfo, &mca_res);
>>> -
>>> -                if (mca_res.result & MCA_OWNER)
>>> -                    binfo.mib->mc_domid = mca_res.owner;
>>> -
>>> -                if (mca_res.result == MCA_NEED_RESET)
>>> -                    ret = MCER_RESET;
>>> -                else if (mca_res.result == MCA_RECOVERED)
>>> -                {
>>> -                    if (ret < MCER_RECOVERED)
>>> -                        ret = MCER_RECOVERED;
>>> -                }
>>> -                else if (mca_res.result == MCA_NO_ACTION)
>>> -                {
>>> -                    if (ret < MCER_CONTINUE)
>>> -                        ret = MCER_CONTINUE;
>>> -                }
>>> +                handlers[i].recovery_handler(&binfo, &bank_result);
>>> +                if (worst_result < bank_result)
>>> +                    worst_result = bank_result;
>>>                  break;
>>>              }
>>>          }
>>>          ASSERT(i != handler_num);
>>>      }
>>> 
>>> -    return ret;
>>> +    return worst_result;
>>>  }
>>> 
>>>  /*
>>> @@ -608,7 +585,7 @@ struct mcinfo_recovery *mci_add_pageoff_
>>> 
>>>  static void intel_memerr_dhandler(
>>>               struct mca_binfo *binfo,
>>> -             struct mca_handle_result *result)
>>> +             enum mce_result *result)
>>>  {
>>>      struct mcinfo_bank *bank = binfo->mib;
>>>      struct mcinfo_global *global = binfo->mig;
>>> @@ -618,7 +595,6 @@ static void intel_memerr_dhandler(
>>>      uint64_t mc_status, mc_misc;
>>> 
>>>      mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
>>> -    result->result = MCA_NEED_RESET;
>>> 
>>>      mc_status = bank->mc_status;
>>>      mc_misc = bank->mc_misc;
>>> @@ -626,7 +602,6 @@ static void intel_memerr_dhandler(
>>>          !(mc_status & MCi_STATUS_MISCV) ||
>>>          ((mc_misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
>>>      {
>>> -        result->result |= MCA_NO_ACTION;
>>>          dprintk(XENLOG_WARNING,
>>>              "No physical address provided for memory error\n");
>>>          return;
>>> @@ -644,23 +619,19 @@ static void intel_memerr_dhandler(
>>> 
>>>      /* This is free page */
>>>      if (status & PG_OFFLINE_OFFLINED)
>>> -        result->result = MCA_RECOVERED;
>>> +        *result = MCER_RECOVERED;
>>>      else if (status & PG_OFFLINE_PENDING) {
>>>          /* This page has owner */
>>>          if (status & PG_OFFLINE_OWNED) {
>>> -            result->result |= MCA_OWNER;
>>> -            result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
>>> +            bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
>>>              mce_printk(MCE_QUIET, "MCE: This error page is ownded"
>>> -              " by DOM %d\n", result->owner);
>>> -            /* Fill vMCE# injection and vMCE# MSR virtualization "
>>> -             * "related data */
>>> -            bank->mc_domid = result->owner;
>>> +              " by DOM %d\n", bank->mc_domid);
>>>              /* XXX: Cannot handle shared pages yet
>>>               * (this should identify all domains and gfn mapping to
>>>               *  the mfn in question) */
>>> -            BUG_ON( result->owner == DOMID_COW );
>>> -            if ( result->owner != DOMID_XEN ) {
>>> -                d = get_domain_by_id(result->owner);
>>> +            BUG_ON( bank->mc_domid == DOMID_COW );
>>> +            if ( bank->mc_domid != DOMID_XEN ) {
>>> +                d = get_domain_by_id(bank->mc_domid);
>>>                  ASSERT(d);
>>>                  gfn = get_gpfn_from_mfn((bank->mc_addr) >>
>>> PAGE_SHIFT); 
>>> 
>>> @@ -683,7 +654,7 @@ static void intel_memerr_dhandler(
>>>                        global->mc_gstatus) == -1 )
>>>                  {
>>>                      mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d
>>> " -                      "failed\n", result->owner);
>>> +                      "failed\n", bank->mc_domid);
>>>                      goto vmce_failed;
>>>                  }
>>> 
>>> @@ -699,7 +670,7 @@ static void intel_memerr_dhandler(
>>>                   * For xen, it has contained the error and finished
>>>                   * its own recovery job.
>>>                   */
>>> -                result->result = MCA_RECOVERED;
>>> +                *result = MCER_RECOVERED;
>>>                  put_domain(d);
>>> 
>>>                  return;
>>> @@ -718,11 +689,13 @@ static int intel_srao_check(uint64_t sta
>>> 
>>>  static void intel_srao_dhandler(
>>>               struct mca_binfo *binfo,
>>> -             struct mca_handle_result *result)
>>> +             enum mce_result *result)
>>>  {
>>>      uint64_t status = binfo->mib->mc_status;
>>> 
>>>      /* For unknown srao error code, no action required */
>>> +    *result = MCER_CONTINUE;
>>> +
>>>      if ( status & MCi_STATUS_VAL )
>>>      {
>>>          switch ( status & INTEL_MCCOD_MASK )
>>> @@ -744,7 +717,7 @@ static int intel_default_check(uint64_t
>>> 
>>>  static void intel_default_mce_dhandler(
>>>               struct mca_binfo *binfo,
>>> -             struct mca_handle_result *result)
>>> +             enum mce_result *result)
>>>  {
>>>      uint64_t status = binfo->mib->mc_status;
>>>      enum intel_mce_type type;
>>> @@ -752,9 +725,9 @@ static void intel_default_mce_dhandler(
>>>      type = intel_check_mce_type(status);
>>> 
>>>      if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
>>> -        result->result = MCA_NEED_RESET;
>>> -    else if (type == intel_mce_ucr_srao)
>>> -        result->result = MCA_NO_ACTION;
>>> +        *result = MCER_RESET;
>>> +    else
>>> +        *result = MCER_CONTINUE;
>>>  }
>>> 
>>>  static const struct mca_error_handler intel_mce_dhandlers[] = {
>>> @@ -764,7 +737,7 @@ static const struct mca_error_handler in
>>> 
>>>  static void intel_default_mce_uhandler(
>>>               struct mca_binfo *binfo,
>>> -             struct mca_handle_result *result)
>>> +             enum mce_result *result)
>>>  {
>>>      uint64_t status = binfo->mib->mc_status;
>>>      enum intel_mce_type type;
>>> @@ -776,10 +749,10 @@ static void intel_default_mce_uhandler(
>>>      /* Panic if no handler for SRAR error */
>>>      case intel_mce_ucr_srar:
>>>      case intel_mce_fatal:
>>> -        result->result = MCA_NEED_RESET;
>>> +        *result = MCER_RESET;
>>>          break;
>>>      default:
>>> -        result->result = MCA_NO_ACTION;
>>> +        *result = MCER_CONTINUE;
>>>          break;
>>>      }
>>>  }
>>> diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/x86_mca.h
>>> --- a/xen/arch/x86/cpu/mcheck/x86_mca.h Sun May 08 13:39:40 2011 +0800
>>> +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Sun May 08 18:15:13 2011 +0800
>>> @@ -124,36 +124,6 @@ void mcabanks_free(struct mca_banks *ban
>>>  void mcabanks_free(struct mca_banks *banks);
>>>  extern struct mca_banks *mca_allbanks;
>>> 
>>> -/* Below interfaces are defined for MCA internal processing:
>>> - * a. pre_handler will be called early in MCA ISR context, mainly
>>> for early 
>>> - *    need_reset detection for avoiding log missing. Also, it is
>>> used to judge 
>>> - *    impacted DOMAIN if possible.
>>> - * b. mca_error_handler is actually a (error_action_index,
>>> - *    recovery_hanlder pointer) pair. The defined recovery_handler
>>> - *    performs the actual recovery operations such as page_offline,
>>> cpu_offline 
>>> - *    in softIRQ context when the per_bank MCA error matching the
>>> corresponding 
>>> - *    mca_code index. If pre_handler can't judge the impacted domain,
>>> - *    recovery_handler must figure it out.
>>> -*/
>>> -
>>> -/* MCA error has been recovered successfully by the recovery action*/
>>> -#define MCA_RECOVERED (0x1 << 0)
>>> -/* MCA error impact the specified DOMAIN in owner field below */
>>> -#define MCA_OWNER (0x1 << 1)
>>> -/* MCA error can't be recovered and need reset */
>>> -#define MCA_NEED_RESET (0x1 << 2)
>>> -/* MCA error did not have any action yet */
>>> -#define MCA_NO_ACTION (0x1 << 3)
>>> -
>>> -struct mca_handle_result
>>> -{
>>> -    uint32_t result;
>>> -    /* Used one result & MCA_OWNER */
>>> -    domid_t owner;
>>> -    /* Used by mca_error_handler, result & MCA_RECOVRED */
>>> -    struct recovery_action *action;
>>> -};
>>> -
>>>  /*Keep bank so that we can get staus even if mib is NULL */
>>>  struct mca_binfo {
>>>      int bank;
>>> @@ -163,8 +133,14 @@ struct mca_binfo {
>>>      struct cpu_user_regs *regs;
>>>  };
>>> 
>>> -extern void (*mca_prehandler)( struct cpu_user_regs *regs,
>>> -                        struct mca_handle_result *result);
>>> +enum mce_result
>>> +{
>>> +    MCER_NOERROR,
>>> +    MCER_RECOVERED,
>>> +    /* Not recoverd, but can continue */
>>> +    MCER_CONTINUE,
>>> +    MCER_RESET,
>>> +};
>>> 
>>>  struct mca_error_handler
>>>  {
>>> @@ -175,7 +151,7 @@ struct mca_error_handler
>>>      */
>>>      int (*owned_error)(uint64_t status);
>>>      void (*recovery_handler)(struct mca_binfo *binfo,
>>> -                    struct mca_handle_result *result);
>>> +                    enum mce_result *result);
>>>  };
>>> 
>>>  /* Global variables */
>> 
> 
> 



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.