
[Xen-devel] [PATCH 4/5] x86/alternative: Implement NMI/#MC-safe patching



Patching code which is being executed is problematic, because it is
impossible to arrange an atomic update of the instruction stream outside of
a few corner cases.  Furthermore, we have no feasible way to prevent
execution of the NMI and #MC exception handlers, yet those handlers contain
patch points.

Use a breakpoint to capture execution which hits an in-progress patch, and
have the exception handler take care of completing the update safely.

See the code comments for exact algorithm details.

Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
CC: Jan Beulich <JBeulich@xxxxxxxx>
CC: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
CC: Ross Lagerwall <ross.lagerwall@xxxxxxxxxx>
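
For review convenience: the core update is the classic int3-based
cross-modification sequence.  A minimal self-contained sketch of it follows
(wmb() and poke_site() are names invented for this sketch only; the real
code uses smp_wmb() and text_poke_live() as in the hunks below):

    #include <stdint.h>
    #include <string.h>

    #define INT3 0xcc                 /* one-byte breakpoint opcode */

    /* Compiler barrier standing in for smp_wmb(); x86 stores are ordered. */
    static inline void wmb(void) { __asm__ volatile ( "" ::: "memory" ); }

    /* Rewrite a live patch site of @len bytes at @addr with @opcode. */
    static void poke_site(uint8_t *addr, const uint8_t *opcode, size_t len)
    {
        /* 1) Plant a breakpoint over the first byte: any CPU fetching the
         *    site mid-update now traps rather than executing a torn mix of
         *    old and new bytes. */
        addr[0] = INT3;
        wmb();

        /* 2) Copy the remaining len - 1 bytes into place.  They are
         *    unreachable while the leading int3 is present. */
        memcpy(addr + 1, opcode + 1, len - 1);
        wmb();

        /* 3) Restore the real first byte.  This single-byte (and therefore
         *    atomic) store makes the whole new instruction live at once. */
        addr[0] = opcode[0];
    }

A CPU which hits the int3 between steps 1 and 3 lands in the exception
handler, which (per the entry.S changes) calls text_poke_live() to re-run
this idempotent sequence and then rewinds %rip to retry the completed
instruction.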
---
 xen/arch/x86/alternative.c      | 104 +++++++++++++++++++++++++++++++++++++++-
 xen/arch/x86/traps.c            |   2 +
 xen/arch/x86/x86_64/entry.S     |  34 ++++++++++++-
 xen/include/asm-x86/processor.h |   2 +
 4 files changed, 139 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/alternative.c b/xen/arch/x86/alternative.c
index 0b309c3..40bfaad 100644
--- a/xen/arch/x86/alternative.c
+++ b/xen/arch/x86/alternative.c
@@ -131,6 +131,84 @@ void init_or_livepatch add_nops(void *insns, unsigned int len)
     }
 }
 
+static init_or_livepatch_data struct live_poke_info {
+    uint8_t *addr;
+    const uint8_t *opcode;
+    size_t len;
+    unsigned int cpu;
+} live_poke_info;
+
+/*
+ * text_poke_live - Update the live .text area, in an interrupt-safe way.
+ *
+ * !!! WARNING - Reentrantly called from the int3 exception handler !!!
+ *
+ * All patching data are passed via the live_poke_info structure.  @regs is
+ * non-NULL if entering from an exception handler.
+ *
+ * Returns a boolean indicating whether the transient breakpoint was hit.
+ */
+bool init_or_livepatch text_poke_live(const struct cpu_user_regs *regs)
+{
+    struct live_poke_info *i = &live_poke_info;
+
+    if ( unlikely(i->cpu != smp_processor_id()) )
+    {
+        ASSERT(regs);
+
+        /*
+         * We hit a breakpoint, on a CPU which was not performing the
+         * patching.  This is only expected to be possible for the NMI/#MC
+         * paths, and even then, only if we hit the tiny race window below
+         * while patching in the NMI/#MC handlers.
+         *
+         * We can't safely evaluate whether we hit a transient breakpoint
+         * because i->cpu has likely completed the patch and moved on to the
+         * next patch site.
+         *
+         * Go to sleep for a bit and synchronise the pipeline as we are now in
+         * a cross-modifying scenario.
+         */
+        cpu_relax();
+        cpuid_eax(0);
+
+        /*
+         * Presume that we hit the transient breakpoint, as we can't confirm
+         * whether we did or not.  We don't expect there to be any other
+         * breakpoints to hit, but if we did hit another one, then in the
+         * worst case we will only livelock until i->cpu has finished all of
+         * its patching.
+         */
+        return true;
+    }
+
+    /*
+     * We are the CPU performing the patching, and might have ended up here by
+     * hitting a breakpoint.
+     *
+     * Either way, we need to complete this particular patch to make forward
+     * progress.  This logic is safe even if executed recursively in the
+     * breakpoint handler; the worst that will happen when normal execution
+     * resumes is that we will rewrite the same bytes a second time.
+     */
+
+    /* First, insert a breakpoint to prevent execution of the patch site. */
+    i->addr[0] = 0xcc;
+    smp_wmb();
+    /* Second, copy the remaining instructions into place. */
+    memcpy(i->addr + 1, i->opcode + 1, i->len - 1);
+    smp_wmb();
+    /* Third, replace the breakpoint with the real instruction byte. */
+    i->addr[0] = i->opcode[0];
+
+    /*
+     * Inform our caller whether we hit the transient breakpoint (in which
+     * case we iret normally, having already finished the patch), or whether
+     * we need to continue further into the debug trap handler.
+     */
+    return regs && regs->rip == (unsigned long)&i->addr[1];
+}
+
 /*
  * text_poke - Update instructions on a live kernel or non-executed code.
  * @addr: address to modify
@@ -153,7 +231,31 @@ void init_or_livepatch add_nops(void *insns, unsigned int len)
 void init_or_livepatch noinline
 text_poke(void *addr, const void *opcode, size_t len, bool live)
 {
-    memcpy(addr, opcode, len);
+    if ( !live || len == 1 )
+    {
+        /*
+         * If we know *addr can't be executed, or we are patching a single
+         * byte, it is safe to use a straight memcpy().
+         */
+        memcpy(addr, opcode, len);
+    }
+    else
+    {
+        /*
+         * If not, arrange safe patching via the use of breakpoints.  The
+         * ordering of actions here is between this CPU's stores and the
+         * instruction fetch of the breakpoint exception handler on any CPU.
+         */
+        live_poke_info = (struct live_poke_info){
+            addr, opcode, len, smp_processor_id()
+        };
+        smp_wmb();
+        active_text_patching = true;
+        smp_wmb();
+        text_poke_live(NULL);
+        smp_wmb();
+        active_text_patching = false;
+    }
 }
 
 /*
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index a3e8f0c..6bea963 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -119,6 +119,8 @@ boolean_param("ler", opt_ler);
 #define stack_words_per_line 4
 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
 
+bool active_text_patching;
+
 static void show_code(const struct cpu_user_regs *regs)
 {
     unsigned char insns_before[8] = {}, insns_after[16] = {};
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index a5a6702..56f52c7 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -530,7 +530,10 @@ ENTRY(page_fault)
         movl  $TRAP_page_fault,4(%rsp)
 /* No special register assumptions. */
 GLOBAL(handle_exception)
-        SAVE_ALL CLAC
+        SAVE_ALL
+
+handle_exception_gprs_saved:
+        ASM_CLAC
 
         GET_STACK_END(14)
 
@@ -686,9 +689,36 @@ ENTRY(debug)
         jmp   handle_exception
 
 ENTRY(int3)
+        /* For patching safety, there must not be any alternatives here. */
         pushq $0
         movl  $TRAP_int3,4(%rsp)
-        jmp   handle_exception
+
+        /* If there is no patching active, continue normally.  */
+        cmpb  $1, active_text_patching(%rip)
+        jne   handle_exception
+
+        /*
+         * We hit a debug trap, but not necessarily the one from active
+         * patching.  Let text_poke_live() work out what to do.
+         */
+        SAVE_ALL
+        mov   %rsp, %rdi
+        call  text_poke_live
+
+        /*
+         * Does text_poke_live() think we hit the transient debug trap?  If
+         * not, continue down the normal int3 handler.
+         */
+        cmp   $0, %eax
+        je    handle_exception_gprs_saved
+
+        /*
+         * We think we hit the transient debug trap.  text_poke_live() has
+         * probably completed the patching, so rewind the instruction pointer
+         * and try again.
+         */
+        subq  $1, UREGS_rip(%rsp)
+        jmp   restore_all_xen
 
 ENTRY(overflow)
         pushq $0
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index e8c2f02..5433157 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -439,6 +439,8 @@ extern idt_entry_t *idt_tables[];
 DECLARE_PER_CPU(struct tss_struct, init_tss);
 DECLARE_PER_CPU(root_pgentry_t *, root_pgt);
 
+extern bool active_text_patching;
+
 extern void init_int80_direct_trap(struct vcpu *v);
 
 extern void write_ptbase(struct vcpu *v);
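
For reference, the ENTRY(int3) changes above amount to roughly the
following C logic (an illustrative rendering only; int3_entry is an
invented name, and the real check is done in assembly precisely so that no
alternative patch points are executed before it):

    void int3_entry(struct cpu_user_regs *regs)
    {
        if ( active_text_patching && text_poke_live(regs) )
        {
            /*
             * We hit the transient breakpoint and the patch is complete:
             * step %rip back over the int3 and re-execute the now-complete
             * instruction.
             */
            regs->rip -= 1;
            return;                   /* i.e. restore_all_xen */
        }

        /* No patching in flight, or some other breakpoint entirely. */
        do_int3(regs);                /* i.e. the handle_exception path */
    }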
-- 
2.1.4

