[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

RE: [Xen-devel] [PATCH 7] MCA: simplify mce actoin, and some update of mem err handler for future SRAR extension



OK, thanks :)

Jinsong

Keir Fraser wrote:
> Ah no, I found it! I'll take a look at it on Monday.
> 
>  -- Keir
> 
> On 14/05/2011 10:45, "Keir Fraser" <keir@xxxxxxx> wrote:
> 
>> I don't seem to have this one in my queue. You should resend.
>> 
>>  -- Keir
>> 
>> On 13/05/2011 15:34, "Liu, Jinsong" <jinsong.liu@xxxxxxxxx> wrote:
>> 
>>> Jan and Keir,
>>> 
>>> Any comments?
>>> 
>>> Thanks
>>> Jinsong
>>> 
>>> 
>>> Liu, Jinsong wrote:
>>>> MCA: simplify mce actoin, and some update of mem err handler for
>>>> future SRAR extension 
>>>> 
>>>> This patch simplify mce_action(). Basically the 2 set of mce status
>>>> flags MCA_... and MCER_... are highly functional similar. This
>>>> patch remove the redundant middle-level flag MCA_..., and its
>>>> related data structure and function, so that mce_action() logic is
>>>> more clean and simpler. This patch also update mem err handler.
>>>> intel_memerr_dhandler() will be commonly used by SRAO and SRAR, so
>>>> for the extension of future SRAR case, this patch remove the
>>>> default fail flag from intel_memerr_dhandler() outside to
>>>> SRAO/SRAR level, since only SRAO/SRAR handler itself has the
>>>> knowledge of how to handle the failure when mem err handler fail.
>>>> 
>>>> Signed-off-by: Liu, Jinsong <jinsong.liu@xxxxxxxxx>
>>>> 
>>>> diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/mce_intel.c
>>>> --- a/xen/arch/x86/cpu/mcheck/mce_intel.c Sun May 08 13:39:40 2011
>>>> +0800 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c Sun May 08 18:15:13
>>>> 2011 +0800 @@ -172,23 +172,14 @@ static unsigned int __read_mostly
>>>>  mce_dh static unsigned int __read_mostly mce_dhandler_num;
>>>>  static unsigned int __read_mostly mce_uhandler_num;
>>>> 
>>>> -enum mce_result
>>>> -{
>>>> -    MCER_NOERROR,
>>>> -    MCER_RECOVERED,
>>>> -    /* Not recoverd, but can continue */
>>>> -    MCER_CONTINUE,
>>>> -    MCER_RESET,
>>>> -};
>>>> -
>>>>  /* Maybe called in MCE context, no lock, no printk */
>>>>  static enum mce_result mce_action(struct cpu_user_regs *regs,
>>>>                        mctelem_cookie_t mctc)
>>>>  {
>>>>      struct mc_info *local_mi;
>>>> -    enum mce_result ret = MCER_NOERROR;
>>>> +    enum mce_result bank_result = MCER_NOERROR;
>>>> +    enum mce_result worst_result = MCER_NOERROR;
>>>>      struct mcinfo_common *mic = NULL;
>>>> -    struct mca_handle_result mca_res;
>>>>      struct mca_binfo binfo;
>>>>      const struct mca_error_handler *handlers = mce_dhandlers;
>>>>      unsigned int i, handler_num = mce_dhandler_num;
>>>> @@ -217,7 +208,7 @@ static enum mce_result mce_action(struct
>>>>      /* Processing bank information */
>>>>      x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
>>>> 
>>>> -    for ( ; ret != MCER_RESET && mic && mic->size;
>>>> +    for ( ; bank_result != MCER_RESET && mic && mic->size;
>>>>            mic = x86_mcinfo_next(mic) )
>>>>      {
>>>>          if (mic->type != MC_TYPE_BANK) {
>>>> @@ -225,34 +216,20 @@ static enum mce_result mce_action(struct    
>>>>          } binfo.mib = (struct mcinfo_bank*)mic;
>>>>          binfo.bank = binfo.mib->mc_bank;
>>>> -        memset(&mca_res, 0x0f, sizeof(mca_res));
>>>> +        bank_result = MCER_NOERROR;
>>>>          for ( i = 0; i < handler_num; i++ ) {
>>>>              if (handlers[i].owned_error(binfo.mib->mc_status))   
>>>> { 
>>>> -                handlers[i].recovery_handler(&binfo, &mca_res); -
>>>> -                if (mca_res.result & MCA_OWNER)
>>>> -                    binfo.mib->mc_domid = mca_res.owner; -
>>>> -                if (mca_res.result == MCA_NEED_RESET)
>>>> -                    ret = MCER_RESET;
>>>> -                else if (mca_res.result == MCA_RECOVERED)
>>>> -                {
>>>> -                    if (ret < MCER_RECOVERED)
>>>> -                        ret = MCER_RECOVERED;
>>>> -                }
>>>> -                else if (mca_res.result == MCA_NO_ACTION)
>>>> -                {
>>>> -                    if (ret < MCER_CONTINUE)
>>>> -                        ret = MCER_CONTINUE;
>>>> -                }
>>>> +                handlers[i].recovery_handler(&binfo,
>>>> &bank_result); +                if (worst_result < bank_result)
>>>> +                    worst_result = bank_result;
>>>>                  break;
>>>>              }
>>>>          }
>>>>          ASSERT(i != handler_num);
>>>>      }
>>>> 
>>>> -    return ret;
>>>> +    return worst_result;
>>>>  }
>>>> 
>>>>  /*
>>>> @@ -608,7 +585,7 @@ struct mcinfo_recovery *mci_add_pageoff_
>>>> 
>>>>  static void intel_memerr_dhandler(
>>>>               struct mca_binfo *binfo,
>>>> -             struct mca_handle_result *result)
>>>> +             enum mce_result *result)
>>>>  {
>>>>      struct mcinfo_bank *bank = binfo->mib;
>>>>      struct mcinfo_global *global = binfo->mig;
>>>> @@ -618,7 +595,6 @@ static void intel_memerr_dhandler(
>>>>      uint64_t mc_status, mc_misc;
>>>> 
>>>>      mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
>>>> -    result->result = MCA_NEED_RESET;
>>>> 
>>>>      mc_status = bank->mc_status;
>>>>      mc_misc = bank->mc_misc;
>>>> @@ -626,7 +602,6 @@ static void intel_memerr_dhandler(
>>>>          !(mc_status & MCi_STATUS_MISCV) ||
>>>>          ((mc_misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
>>>> { -        result->result |= MCA_NO_ACTION;
>>>>          dprintk(XENLOG_WARNING,
>>>>              "No physical address provided for memory error\n");  
>>>> return; @@ -644,23 +619,19 @@ static void intel_memerr_dhandler(
>>>> 
>>>>      /* This is free page */
>>>>      if (status & PG_OFFLINE_OFFLINED)
>>>> -        result->result = MCA_RECOVERED;
>>>> +        *result = MCER_RECOVERED;
>>>>      else if (status & PG_OFFLINE_PENDING) {
>>>>          /* This page has owner */
>>>>          if (status & PG_OFFLINE_OWNED) {
>>>> -            result->result |= MCA_OWNER;
>>>> -            result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
>>>> +            bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
>>>>              mce_printk(MCE_QUIET, "MCE: This error page is ownded"
>>>> -              " by DOM %d\n", result->owner);
>>>> -            /* Fill vMCE# injection and vMCE# MSR virtualization "
>>>> -             * "related data */
>>>> -            bank->mc_domid = result->owner;
>>>> +              " by DOM %d\n", bank->mc_domid);
>>>>              /* XXX: Cannot handle shared pages yet
>>>>               * (this should identify all domains and gfn mapping
>>>> to 
>>>>               *  the mfn in question) */
>>>> -            BUG_ON( result->owner == DOMID_COW );
>>>> -            if ( result->owner != DOMID_XEN ) {
>>>> -                d = get_domain_by_id(result->owner);
>>>> +            BUG_ON( bank->mc_domid == DOMID_COW );
>>>> +            if ( bank->mc_domid != DOMID_XEN ) {
>>>> +                d = get_domain_by_id(bank->mc_domid);            
>>>>                  ASSERT(d); gfn =
>>>> get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT); 
>>>> 
>>>> @@ -683,7 +654,7 @@ static void intel_memerr_dhandler(
>>>>                        global->mc_gstatus) == -1 )
>>>>                  {
>>>>                      mce_printk(MCE_QUIET, "Fill vMCE# data for
>>>> DOM%d " -                      "failed\n", result->owner);
>>>> +                      "failed\n", bank->mc_domid);
>>>>                      goto vmce_failed;
>>>>                  }
>>>> 
>>>> @@ -699,7 +670,7 @@ static void intel_memerr_dhandler(
>>>>                   * For xen, it has contained the error and
>>>> finished 
>>>>                   * its own recovery job.
>>>>                   */
>>>> -                result->result = MCA_RECOVERED;
>>>> +                *result = MCER_RECOVERED;
>>>>                  put_domain(d);
>>>> 
>>>>                  return;
>>>> @@ -718,11 +689,13 @@ static int intel_srao_check(uint64_t sta
>>>> 
>>>>  static void intel_srao_dhandler(
>>>>               struct mca_binfo *binfo,
>>>> -             struct mca_handle_result *result)
>>>> +             enum mce_result *result)
>>>>  {
>>>>      uint64_t status = binfo->mib->mc_status;
>>>> 
>>>>      /* For unknown srao error code, no action required */ +   
>>>> *result = MCER_CONTINUE; +
>>>>      if ( status & MCi_STATUS_VAL )
>>>>      {
>>>>          switch ( status & INTEL_MCCOD_MASK )
>>>> @@ -744,7 +717,7 @@ static int intel_default_check(uint64_t
>>>> 
>>>>  static void intel_default_mce_dhandler(
>>>>               struct mca_binfo *binfo,
>>>> -             struct mca_handle_result *result)
>>>> +             enum mce_result *result)
>>>>  {
>>>>      uint64_t status = binfo->mib->mc_status;
>>>>      enum intel_mce_type type;
>>>> @@ -752,9 +725,9 @@ static void intel_default_mce_dhandler(
>>>>      type = intel_check_mce_type(status);
>>>> 
>>>>      if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
>>>> -        result->result = MCA_NEED_RESET;
>>>> -    else if (type == intel_mce_ucr_srao)
>>>> -        result->result = MCA_NO_ACTION;
>>>> +        *result = MCER_RESET;
>>>> +    else
>>>> +        *result = MCER_CONTINUE;
>>>>  }
>>>> 
>>>>  static const struct mca_error_handler intel_mce_dhandlers[] = {
>>>> @@ -764,7 +737,7 @@ static const struct mca_error_handler in
>>>> 
>>>>  static void intel_default_mce_uhandler(
>>>>               struct mca_binfo *binfo,
>>>> -             struct mca_handle_result *result)
>>>> +             enum mce_result *result)
>>>>  {
>>>>      uint64_t status = binfo->mib->mc_status;
>>>>      enum intel_mce_type type;
>>>> @@ -776,10 +749,10 @@ static void intel_default_mce_uhandler(
>>>>      /* Panic if no handler for SRAR error */
>>>>      case intel_mce_ucr_srar:
>>>>      case intel_mce_fatal:
>>>> -        result->result = MCA_NEED_RESET;
>>>> +        *result = MCER_RESET;
>>>>          break;
>>>>      default:
>>>> -        result->result = MCA_NO_ACTION;
>>>> +        *result = MCER_CONTINUE;
>>>>          break;
>>>>      }
>>>>  }
>>>> diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/x86_mca.h
>>>> --- a/xen/arch/x86/cpu/mcheck/x86_mca.h Sun May 08 13:39:40 2011
>>>> +0800 +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Sun May 08 18:15:13
>>>> 2011 +0800 @@ -124,36 +124,6 @@ void mcabanks_free(struct
>>>>  mca_banks *ban void mcabanks_free(struct mca_banks *banks);
>>>>  extern struct mca_banks *mca_allbanks;
>>>> 
>>>> -/* Below interfaces are defined for MCA internal processing:
>>>> - * a. pre_handler will be called early in MCA ISR context, mainly
>>>> for early 
>>>> - *    need_reset detection for avoiding log missing. Also, it is
>>>> used to judge 
>>>> - *    impacted DOMAIN if possible.
>>>> - * b. mca_error_handler is actually a (error_action_index,
>>>> - *    recovery_hanlder pointer) pair. The defined recovery_handler
>>>> - *    performs the actual recovery operations such as
>>>> page_offline, cpu_offline 
>>>> - *    in softIRQ context when the per_bank MCA error matching the
>>>> corresponding 
>>>> - *    mca_code index. If pre_handler can't judge the impacted
>>>> domain, 
>>>> - *    recovery_handler must figure it out.
>>>> -*/
>>>> -
>>>> -/* MCA error has been recovered successfully by the recovery
>>>> action*/ 
>>>> -#define MCA_RECOVERED (0x1 << 0)
>>>> -/* MCA error impact the specified DOMAIN in owner field below */
>>>> -#define MCA_OWNER (0x1 << 1)
>>>> -/* MCA error can't be recovered and need reset */
>>>> -#define MCA_NEED_RESET (0x1 << 2)
>>>> -/* MCA error did not have any action yet */
>>>> -#define MCA_NO_ACTION (0x1 << 3)
>>>> -
>>>> -struct mca_handle_result
>>>> -{
>>>> -    uint32_t result;
>>>> -    /* Used one result & MCA_OWNER */
>>>> -    domid_t owner;
>>>> -    /* Used by mca_error_handler, result & MCA_RECOVRED */
>>>> -    struct recovery_action *action;
>>>> -};
>>>> -
>>>>  /*Keep bank so that we can get staus even if mib is NULL */ 
>>>>      struct mca_binfo { int bank;
>>>> @@ -163,8 +133,14 @@ struct mca_binfo {
>>>>      struct cpu_user_regs *regs;
>>>>  };
>>>> 
>>>> -extern void (*mca_prehandler)( struct cpu_user_regs *regs,
>>>> -                        struct mca_handle_result *result); +enum
>>>> mce_result +{
>>>> +    MCER_NOERROR,
>>>> +    MCER_RECOVERED,
>>>> +    /* Not recoverd, but can continue */
>>>> +    MCER_CONTINUE,
>>>> +    MCER_RESET,
>>>> +};
>>>> 
>>>>  struct mca_error_handler
>>>>  {
>>>> @@ -175,7 +151,7 @@ struct mca_error_handler
>>>>      */
>>>>      int (*owned_error)(uint64_t status);
>>>>      void (*recovery_handler)(struct mca_binfo *binfo,
>>>> -                    struct mca_handle_result *result);
>>>> +                    enum mce_result *result);
>>>>  };
>>>> 
>>>>  /* Global variables */


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.