[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 1/2] x86/microcode: Synchronize late microcode loading



From: Gao Chao <chao.gao@xxxxxxxxx>

This patch is to backport microcode improvement patches from linux
kernel. Below are the original patches description:

    commit a5321aec6412b20b5ad15db2d6b916c05349dbff
    Author: Ashok Raj <ashok.raj@xxxxxxxxx>
    Date:   Wed Feb 28 11:28:46 2018 +0100

        x86/microcode: Synchronize late microcode loading

        Original idea by Ashok, completely rewritten by Borislav.

        Before you read any further: the early loading method is still the
        preferred one and you should always do that. The following patch is
        improving the late loading mechanism for long running jobs and cloud use
        cases.

        Gather all cores and serialize the microcode update on them by doing it
        one-by-one to make the late update process as reliable as possible and
        avoid potential issues caused by the microcode update.

        [ Borislav: Rewrite completely. ]

        Co-developed-by: Borislav Petkov <bp@xxxxxxx>
        Signed-off-by: Ashok Raj <ashok.raj@xxxxxxxxx>
        Signed-off-by: Borislav Petkov <bp@xxxxxxx>
        Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
        Tested-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
        Tested-by: Ashok Raj <ashok.raj@xxxxxxxxx>
        Reviewed-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
        Cc: Arjan Van De Ven <arjan.van.de.ven@xxxxxxxxx>
        Link: https://lkml.kernel.org/r/20180228102846.13447-8-bp@xxxxxxxxx

    commit bb8c13d61a629276a162c1d2b1a20a815cbcfbb7
    Author: Borislav Petkov <bp@xxxxxxx>
    Date:   Wed Mar 14 19:36:15 2018 +0100

        x86/microcode: Fix CPU synchronization routine

        Emanuel reported an issue with a hang during microcode update because my
        dumb idea to use one atomic synchronization variable for both rendezvous
        - before and after update - was simply bollocks:

          microcode: microcode_reload_late: late_cpus: 4
          microcode: __reload_late: cpu 2 entered
          microcode: __reload_late: cpu 1 entered
          microcode: __reload_late: cpu 3 entered
          microcode: __reload_late: cpu 0 entered
          microcode: __reload_late: cpu 1 left
          microcode: Timeout while waiting for CPUs rendezvous, remaining: 1

        CPU1 above would finish, leave and the others will still spin waiting 
for
        it to join.

        So do two synchronization atomics instead, which makes the code a lot 
more
        straightforward.

        Also, since the update is serialized and it also takes quite some time 
per
        microcode engine, increase the exit timeout by the number of CPUs on the
        system.

        That's ok because the moment all CPUs are done, that timeout will be cut
        short.

        Furthermore, panic when some of the CPUs timeout when returning from a
        microcode update: we can't allow a system with not all cores updated.

        Also, as an optimization, do not do the exit sync if microcode wasn't
        updated.

        Reported-by: Emanuel Czirai <xftroxgpx@xxxxxxxxxxxxxx>
        Signed-off-by: Borislav Petkov <bp@xxxxxxx>
        Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
        Tested-by: Emanuel Czirai <xftroxgpx@xxxxxxxxxxxxxx>
        Tested-by: Ashok Raj <ashok.raj@xxxxxxxxx>
        Tested-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
        Link: https://lkml.kernel.org/r/20180314183615.17629-2-bp@xxxxxxxxx

This patch is also in accord with Andrew's suggestion,
"Rendezvous all online cpus in an IPI to apply the patch, and keep the
processors in until all have completed the patch.", in [1].

[1]:https://wiki.xenproject.org/wiki/XenParavirtOps/microcode_update#Run_time_microcode_updates

Signed-off-by: Chao Gao <chao.gao@xxxxxxxxx>
Cc: Kevin Tian <kevin.tian@xxxxxxxxx>
Cc: Jun Nakajima <jun.nakajima@xxxxxxxxx>
Cc: Ashok Raj <ashok.raj@xxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
---
 xen/arch/x86/microcode.c | 89 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 73 insertions(+), 16 deletions(-)

diff --git a/xen/arch/x86/microcode.c b/xen/arch/x86/microcode.c
index 4163f50..94c1ca2 100644
--- a/xen/arch/x86/microcode.c
+++ b/xen/arch/x86/microcode.c
@@ -22,6 +22,7 @@
  */
 
 #include <xen/cpu.h>
+#include <xen/cpumask.h>
 #include <xen/lib.h>
 #include <xen/kernel.h>
 #include <xen/init.h>
@@ -30,15 +31,20 @@
 #include <xen/smp.h>
 #include <xen/softirq.h>
 #include <xen/spinlock.h>
+#include <xen/stop_machine.h>
 #include <xen/tasklet.h>
 #include <xen/guest_access.h>
 #include <xen/earlycpio.h>
+#include <xen/watchdog.h>
 
+#include <asm/delay.h>
 #include <asm/msr.h>
 #include <asm/processor.h>
 #include <asm/setup.h>
 #include <asm/microcode.h>
 
+#define USEC_PER_SEC 1000000UL
+
 static module_t __initdata ucode_mod;
 static signed int __initdata ucode_mod_idx;
 static bool_t __initdata ucode_mod_forced;
@@ -187,7 +193,7 @@ static DEFINE_SPINLOCK(microcode_mutex);
 DEFINE_PER_CPU(struct ucode_cpu_info, ucode_cpu_info);
 
 struct microcode_info {
-    unsigned int cpu;
+    atomic_t cpu_in, cpu_out;
     uint32_t buffer_size;
     int error;
     char buffer[1];
@@ -281,24 +287,52 @@ static int microcode_update_cpu(const void *buf, size_t 
size)
     return err;
 }
 
-static long do_microcode_update(void *_info)
+static int __wait_for_cpus(atomic_t *cnt, int timeout)
 {
-    struct microcode_info *info = _info;
-    int error;
+    int cpus = num_online_cpus();
 
-    BUG_ON(info->cpu != smp_processor_id());
+    atomic_inc(cnt);
 
-    error = microcode_update_cpu(info->buffer, info->buffer_size);
-    if ( error )
-        info->error = error;
+    while (atomic_read(cnt) != cpus)
+    {
+        if ( timeout <= 0 )
+        {
+            printk("Timeout when waiting for CPUs calling in\n");
+            return -1;
+        }
+        udelay(1);
+        timeout--;
+    }
 
-    info->cpu = cpumask_next(info->cpu, &cpu_online_map);
-    if ( info->cpu < nr_cpu_ids )
-        return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
+    return 0;
+}
 
-    error = info->error;
-    xfree(info);
-    return error;
+static int do_microcode_update(void *_info)
+{
+    struct microcode_info *info = _info;
+    int error, ret = 0;
+
+    error = __wait_for_cpus(&info->cpu_in, USEC_PER_SEC);
+    if ( error )
+    {
+        ret = -EBUSY;
+        return ret;
+    }
+
+    error = microcode_update_cpu(info->buffer, info->buffer_size);
+    if ( error && !ret )
+        ret = error;
+    /*
+     * Increase the wait timeout to a safe value here since we're serializing
+     * the microcode update and that could take a while on a large number of
+     * CPUs. And that is fine as the *actual* timeout will be determined by
+     * the last CPU finished updating and thus cut short
+     */
+    error = __wait_for_cpus(&info->cpu_out, USEC_PER_SEC * num_online_cpus());
+    if (error)
+        panic("Timeout when finishing updating microcode");
+    info->error = ret;
+    return ret;
 }
 
 int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void) buf, unsigned long len)
@@ -325,7 +359,6 @@ int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void) 
buf, unsigned long len)
 
     info->buffer_size = len;
     info->error = 0;
-    info->cpu = cpumask_first(&cpu_online_map);
 
     if ( microcode_ops->start_update )
     {
@@ -337,7 +370,31 @@ int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void) 
buf, unsigned long len)
         }
     }
 
-    return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
+    atomic_set(&info->cpu_in, 0);
+    atomic_set(&info->cpu_out, 0);
+
+    /*
+     * We intend to disable interrupt for long time, which may lead to
+     * watchdog timeout. Disable watchdog temporally.
+     */
+    watchdog_disable();
+    /*
+     * Late loading dance. Why the heavy-handed stop_machine effort?
+     *
+     * -HT siblings must be idle and not execute other code while the other
+     *  sibling is loading microcode in order to avoid any negative
+     *  interactions cause by the loading.
+     *
+     * -In addition, microcode update on the cores must be serialized until
+     *  this requirement can be relaxed in the feature. Right now, this is
+     *  conservative and good.
+     */
+    stop_machine_run(do_microcode_update, info, NR_CPUS);
+    watchdog_enable();
+
+    ret = info->error;
+    xfree(info);
+    return ret;
 }
 
 static int __init microcode_init(void)
-- 
1.8.3.1


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.