Re: [Xen-devel] [PATCH RFC v2 1/4] x86/mm: Shadow and p2m changes for PV mem_access
>>No, at least not about unspecified hypothetical ones. But again - a
>>vague statement like you gave, without any kind of quantification of
>>the imposed overhead, isn't going to be good enough a judgment.
>>After all pausing a domain can be quite problematic for its performance
>>if that happens reasonably frequently. Otoh I admit that the user of
>>your new mechanism has a certain level of control over the impact via
>>the number of pages (s)he wants to write-protect. So yes, perhaps it
>>isn't going to be too bad as long as the hackery you need to do isn't.
>
>I just wanted to give an update on where I stand so as not to leave this
>thread hanging. As I was working through pausing and unpausing the domain
>during Xen writes to guest memory, I found a spot where the write happens
>outside of __copy_to_user_ll() and __put_user_size(). It occurs in
>create_bounce_frame(), where Xen writes to the guest stack to set up an
>exception frame. At that moment, if the guest stack is marked read-only, we
>end up in the same situation as with the copy-to-guest functions, but with no
>obvious place to revert the page table entry back to read-only. I am at the
>moment looking for a spot where I can do this.
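To restate the problem in code terms: create_bounce_frame() in
xen/arch/x86/x86_64/entry.S builds the exception frame by storing directly onto
the guest kernel stack. A rough, illustrative C equivalent (only a sketch of the
idea, for a trap taken in guest kernel mode; the real code is assembly with
exception-table fixups, and the function name below is made up for illustration):

    /*
     * Illustrative sketch only. Each store below is a direct Xen write to
     * guest memory, so it faults in the shadow code if a mem_access listener
     * has write-protected the guest stack page.
     */
    static void create_bounce_frame_sketch(struct cpu_user_regs *regs,
                                           unsigned long handler_rip)
    {
        unsigned long __user *frame = (unsigned long __user *)regs->rsp;

        /* Frame is pushed from high to low addresses on the guest stack. */
        put_user(regs->ss,     frame - 1);   /* saved stack segment */
        put_user(regs->rsp,    frame - 2);   /* saved stack pointer */
        put_user(regs->rflags, frame - 3);   /* saved flags */
        put_user(regs->cs,     frame - 4);   /* saved code segment */
        put_user(regs->rip,    frame - 5);   /* saved instruction pointer */
        frame -= 5;

        /* Resume the guest at its registered exception handler. */
        regs->rip = handler_rip;
        regs->rsp = (unsigned long)frame;
    }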
I have a solution for the create_bounce_frame() issue I described above. Please
find below a proof-of-concept patch that includes pausing and unpausing the
domain during Xen writes to guest memory. It is on top of the patch that used
CR0.WP, to highlight the difference. Please take a look and let me know if this
solution is acceptable.
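To summarize the flow the patch implements (condensed from the hunks below;
not a standalone snippet, and the helpers and fields are the ones added by the
patch):

    /*
     * 1. In sh_page_fault(): a Xen-originated write (RIP within the Xen
     *    range) hits a page the mem_access listener write-protected, while
     *    the guest PTE itself is RW.
     */
    domain_pause_nosync(d);                          /* keep other vCPUs out */
    v->arch.pv_vcpu.mfn_access_reset = mfn_x(gmfn);  /* remember the MFN     */
    v->arch.pv_vcpu.mfn_access_reset_req = 1;
    shadow_set_access(mfn_to_page(gmfn), p2m_access_rw); /* let the write through */

    /*
     * 2. At the end of __copy_to_user_ll(), __put_user_size() and
     *    create_bounce_frame(): undo step 1.
     */
    if ( unlikely(current->arch.pv_vcpu.mfn_access_reset_req) )
        p2m_mem_access_reset_mfn_entry();  /* restore default access, remove
                                              stale shadow mappings, flush the
                                              TLB and unpause the domain */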
PS: I do realize that whatever I do to create_bounce_frame() will have to be
reflected in the compat version. If this is the correct approach, I will do the
same there too.
Thanks,
Aravindh
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 49d8545..758f7f8 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -734,6 +734,9 @@ int arch_set_info_guest(
if ( ((c(ldt_base) & (PAGE_SIZE - 1)) != 0) ||
(c(ldt_ents) > 8192) )
return -EINVAL;
+
+ v->arch.pv_vcpu.mfn_access_reset_req = 0;
+ v->arch.pv_vcpu.mfn_access_reset = INVALID_MFN;
}
else if ( is_pvh_vcpu(v) )
{
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index d4473c1..6e6b7f8 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -1168,9 +1168,13 @@ int __init construct_dom0(
COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
}
- /* Pages that are part of page tables must be read only. */
if ( is_pv_domain(d) )
+ {
+ v->arch.pv_vcpu.mfn_access_reset_req = 0;
+ v->arch.pv_vcpu.mfn_access_reset = INVALID_MFN;
+ /* Pages that are part of page tables must be read only. */
mark_pv_pt_pages_rdonly(d, l4start, vpt_start, nr_pt_pages);
+ }
/* Mask all upcalls... */
for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
diff --git a/xen/arch/x86/mm/p2m-ma.c b/xen/arch/x86/mm/p2m-ma.c
index d8ad12c..ad37db0 100644
--- a/xen/arch/x86/mm/p2m-ma.c
+++ b/xen/arch/x86/mm/p2m-ma.c
@@ -27,6 +27,8 @@
#include "mm-locks.h"
/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
#undef mfn_to_page
#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
@@ -125,6 +127,34 @@ p2m_mem_access_get_entry(struct p2m_domain *p2m, unsigned long gfn,
return mfn;
}
+void p2m_mem_access_reset_mfn_entry(void)
+{
+ mfn_t mfn = _mfn(current->arch.pv_vcpu.mfn_access_reset);
+ struct page_info *page;
+ struct domain *d = current->domain;
+
+ if ( unlikely(!mfn_valid(mfn)) )
+ return;
+
+ page = mfn_to_page(mfn);
+ if ( page_get_owner(page) != d )
+ return;
+
+ ASSERT(!paging_locked_by_me(d));
+ paging_lock(d);
+
+ shadow_set_access(page, p2m_get_hostp2m(d)->default_access);
+
+ if ( sh_remove_all_mappings(current, mfn) )
+ flush_tlb_mask(d->domain_dirty_cpumask);
+
+ current->arch.pv_vcpu.mfn_access_reset = INVALID_MFN;
+ current->arch.pv_vcpu.mfn_access_reset_req = 0;
+
+ paging_unlock(d);
+ domain_unpause(d);
+}
+
/* Reset the set_entry and get_entry function pointers */
void p2m_mem_access_reset(struct p2m_domain *p2m)
{
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index 9aacd8e..5f02948 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -1132,8 +1132,36 @@ int shadow_write_guest_entry(struct vcpu *v, intpte_t *p,
* appropriately. Returns 0 if we page-faulted, 1 for success. */
{
int failed;
+ unsigned long mfn_access_reset_saved = INVALID_MFN;
paging_lock(v->domain);
+
+ /*
+ * If a mem_access listener is present for a PV guest and is listening for
+ * write violations, we want to allow Xen writes to guest memory to go
+ * through. To allow this we are setting PTE.RW for the MFN in question and
+ * resetting this after the write has gone through. The resetting is kicked
+ * off at the end of the guest access functions __copy_to_user_ll() and
+ * __put_user_size() if mfn_access_reset_req is set. Since these functions
+ * are also called by the shadow code when setting PTE.RW or calling
+ * sh_remove_all_mappings(), we temporarily set mfn_access_reset_req to 0 to
+ * prevent p2m_mem_access_reset_mfn_entry() from firing.
+ */
+ if ( unlikely(mem_event_check_ring(&v->domain->mem_event->access)) &&
+ is_pv_domain(v->domain) && v->arch.pv_vcpu.mfn_access_reset_req )
+ {
+ mfn_access_reset_saved = v->arch.pv_vcpu.mfn_access_reset;
+ v->arch.pv_vcpu.mfn_access_reset = INVALID_MFN;
+ v->arch.pv_vcpu.mfn_access_reset_req = 0;
+ }
+
failed = __copy_to_user(p, &new, sizeof(new));
+
+ if ( unlikely(mfn_valid(_mfn(mfn_access_reset_saved))) )
+ {
+ v->arch.pv_vcpu.mfn_access_reset = mfn_access_reset_saved;
+ v->arch.pv_vcpu.mfn_access_reset_req = 1;
+ }
+
if ( failed != sizeof(new) )
sh_validate_guest_entry(v, gmfn, p, sizeof(new));
paging_unlock(v->domain);
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index db30396..bcf8fdf 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -795,7 +795,7 @@ static inline void safe_write_entry(void *dst, void *src)
static inline void
-shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
+shadow_write_entries(struct vcpu *v, void *d, void *s, int entries, mfn_t mfn)
/* This function does the actual writes to shadow pages.
* It must not be called directly, since it doesn't do the bookkeeping
* that shadow_set_l*e() functions do. */
@@ -804,6 +804,26 @@ shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
shadow_l1e_t *src = s;
void *map = NULL;
int i;
+ unsigned long mfn_access_reset_saved = INVALID_MFN;
+
+ /*
+ * If a mem_access listener is present for a PV guest and is listening for
+ * write violations, we want to allow Xen writes to guest memory to go
+ * through. To allow this we are setting PTE.RW for the MFN in question and
+ * resetting this after the write has gone through. The resetting is kicked
+ * off at the end of the guest access functions __copy_to_user_ll() and
+ * __put_user_size() if mfn_access_reset_req is set. Since these functions
+ * are also called by the shadow code when setting PTE.RW or calling
+ * sh_remove_all_mappings(), we temporarily set mfn_access_reset_req to 0 to
+ * prevent p2m_mem_access_reset_mfn_entry() from firing.
+ */
+ if ( unlikely(mem_event_check_ring(&v->domain->mem_event->access)) &&
+ v->arch.pv_vcpu.mfn_access_reset_req && is_pv_domain(v->domain) )
+ {
+ mfn_access_reset_saved = v->arch.pv_vcpu.mfn_access_reset;
+ v->arch.pv_vcpu.mfn_access_reset = INVALID_MFN;
+ v->arch.pv_vcpu.mfn_access_reset_req = 0;
+ }
/* Because we mirror access rights at all levels in the shadow, an
* l2 (or higher) entry with the RW bit cleared will leave us with
@@ -817,6 +837,12 @@ shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
}
+ if ( unlikely(mfn_valid(_mfn(mfn_access_reset_saved))) &&
+ is_pv_domain(v->domain) )
+ {
+ v->arch.pv_vcpu.mfn_access_reset = mfn_access_reset_saved;
+ v->arch.pv_vcpu.mfn_access_reset_req = 1;
+ }
for ( i = 0; i < entries; i++ )
safe_write_entry(dst++, src++);
@@ -924,7 +950,7 @@ static int shadow_set_l4e(struct vcpu *v,
}
/* Write the new entry */
- shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
+ shadow_write_entries(v, sl4e, &new_sl4e, 1, sl4mfn);
flags |= SHADOW_SET_CHANGED;
if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
@@ -969,7 +995,7 @@ static int shadow_set_l3e(struct vcpu *v,
}
/* Write the new entry */
- shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
+ shadow_write_entries(v, sl3e, &new_sl3e, 1, sl3mfn);
flags |= SHADOW_SET_CHANGED;
if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )
@@ -1051,9 +1077,9 @@ static int shadow_set_l2e(struct vcpu *v,
/* Write the new entry */
#if GUEST_PAGING_LEVELS == 2
- shadow_write_entries(sl2e, &pair, 2, sl2mfn);
+ shadow_write_entries(v, sl2e, &pair, 2, sl2mfn);
#else /* normal case */
- shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
+ shadow_write_entries(v, sl2e, &new_sl2e, 1, sl2mfn);
#endif
flags |= SHADOW_SET_CHANGED;
@@ -1218,7 +1244,7 @@ static int shadow_set_l1e(struct vcpu *v,
}
/* Write the new entry */
- shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
+ shadow_write_entries(v, sl1e, &new_sl1e, 1, sl1mfn);
flags |= SHADOW_SET_CHANGED;
if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT)
@@ -3061,23 +3087,24 @@ static int sh_page_fault(struct vcpu *v,
/*
* Do not police writes to guest memory from the Xen hypervisor.
- * This keeps PV mem_access on par with HVM. Turn off CR0.WP here to
- * allow the write to go through if the guest has marked the page as
- * writable. Turn it back on in the guest access functions
- * __copy_to_user / __put_user_size() after the write is completed.
+ * This keeps PV mem_access on par with HVM. Pause the guest and
+ * mark the access entry as RW here to allow the write to go through
+ * if the guest has marked the page as writable. Unpause the guest
+ * and set the access value back to the default at the end of
+ * __copy_to_user, __put_user_size() and create_bounce_frame() after
+ * the write is completed. The guest is paused to prevent other
+ * VCPUs from writing to this page during this window.
*/
if ( violation && access_w &&
regs->eip >= XEN_VIRT_START && regs->eip <= XEN_VIRT_END )
{
- unsigned long cr0 = read_cr0();
-
violation = 0;
- if ( cr0 & X86_CR0_WP &&
- guest_l1e_get_flags(gw.l1e) & _PAGE_RW )
+ if ( guest_l1e_get_flags(gw.l1e) & _PAGE_RW )
{
- cr0 &= ~X86_CR0_WP;
- write_cr0(cr0);
- v->arch.pv_vcpu.need_cr0_wp_set = 1;
+ domain_pause_nosync(d);
+ v->arch.pv_vcpu.mfn_access_reset = mfn_x(gmfn);
+ v->arch.pv_vcpu.mfn_access_reset_req = 1;
+ shadow_set_access(mfn_to_page(gmfn), p2m_access_rw);
}
}
diff --git a/xen/arch/x86/usercopy.c b/xen/arch/x86/usercopy.c
index eecf429..a0d11a9 100644
--- a/xen/arch/x86/usercopy.c
+++ b/xen/arch/x86/usercopy.c
@@ -8,6 +8,7 @@
#include <xen/lib.h>
#include <xen/sched.h>
+#include <asm/p2m.h>
#include <asm/uaccess.h>
unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned n)
@@ -47,16 +48,12 @@ unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned n)
/*
* A mem_access listener was present and Xen tried to write to guest memory.
- * To allow this write to go through without an event being sent to the
- * listener or the pagetable entry being modified, we disabled CR0.WP in the
- * shadow pagefault handler. We are enabling it back here again.
+ * To allow this write to go through we modified the PTE in the shadow page
+ * fault handler. We are resetting the access permission of the page that
+ * was written to its default value here.
*/
- if ( unlikely(current->arch.pv_vcpu.need_cr0_wp_set) )
- {
- write_cr0(read_cr0() | X86_CR0_WP);
- current->arch.pv_vcpu.need_cr0_wp_set = 0;
- }
-
+ if ( unlikely(current->arch.pv_vcpu.mfn_access_reset_req) )
+ p2m_mem_access_reset_mfn_entry();
return __n;
}
diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
index 3994f4d..97ea966 100644
--- a/xen/arch/x86/x86_64/asm-offsets.c
+++ b/xen/arch/x86/x86_64/asm-offsets.c
@@ -86,6 +86,7 @@ void __dummy__(void)
OFFSET(VCPU_trap_ctxt, struct vcpu, arch.pv_vcpu.trap_ctxt);
OFFSET(VCPU_kernel_sp, struct vcpu, arch.pv_vcpu.kernel_sp);
OFFSET(VCPU_kernel_ss, struct vcpu, arch.pv_vcpu.kernel_ss);
+ OFFSET(VCPU_mfn_access_reset_req, struct vcpu, arch.pv_vcpu.mfn_access_reset_req);
OFFSET(VCPU_guest_context_flags, struct vcpu, arch.vgc_flags);
OFFSET(VCPU_nmi_pending, struct vcpu, nmi_pending);
OFFSET(VCPU_mce_pending, struct vcpu, mce_pending);
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index a3ed216..27fc97f 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -441,6 +441,12 @@ UNLIKELY_START(z, create_bounce_frame_bad_bounce_ip)
jmp asm_domain_crash_synchronous /* Does not return */
__UNLIKELY_END(create_bounce_frame_bad_bounce_ip)
movq %rax,UREGS_rip+8(%rsp)
+ cmpb $1, VCPU_mfn_access_reset_req(%rbx)
+ je 2f
+ ret
+2: SAVE_ALL
+ call p2m_mem_access_reset_mfn_entry
+ RESTORE_ALL
ret
_ASM_EXTABLE(.Lft2, dom_crash_sync_extable)
_ASM_EXTABLE(.Lft3, dom_crash_sync_extable)
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index cf2ae2a..fa58444 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -385,10 +385,13 @@ struct pv_vcpu
struct vcpu_time_info pending_system_time;
/*
- * Flag that tracks if CR0.WP needs to be set after a Xen write to guest
- * memory when a PV domain has a mem_access listener attached to it.
+ * Track if a mfn's access permission needs to be reset after a Xen write to
+ * guest memory when a PV domain has a mem_access listener attached to it.
+ * The boolean is used to allow for easy checking for this condition in
+ * create_bounce_frame().
*/
- bool_t need_cr0_wp_set;
+ bool_t mfn_access_reset_req;
+ unsigned long mfn_access_reset;
};
struct arch_vcpu
diff --git a/xen/include/asm-x86/uaccess.h b/xen/include/asm-x86/uaccess.h
index 947470d..80c7a78 100644
--- a/xen/include/asm-x86/uaccess.h
+++ b/xen/include/asm-x86/uaccess.h
@@ -21,6 +21,8 @@ unsigned long __copy_from_user_ll(void *to, const void *from, unsigned n);
extern long __get_user_bad(void);
extern void __put_user_bad(void);
+extern void p2m_mem_access_reset_mfn_entry(void);
+
/**
* get_user: - Get a simple variable from user space.
* @x: Variable to store result.
diff --git a/xen/include/asm-x86/x86_64/uaccess.h b/xen/include/asm-x86/x86_64/uaccess.h
index 6d13ec6..111e4ae 100644
--- a/xen/include/asm-x86/x86_64/uaccess.h
+++ b/xen/include/asm-x86/x86_64/uaccess.h
@@ -67,11 +67,8 @@ do { \
case 8: __put_user_asm(x,ptr,retval,"q","","ir",errret);break; \
default: __put_user_bad(); \
} \
- if ( unlikely(current->arch.pv_vcpu.need_cr0_wp_set) ) \
- { \
- write_cr0(read_cr0() | X86_CR0_WP); \
- current->arch.pv_vcpu.need_cr0_wp_set = 0; \
- } \
+ if ( unlikely(current->arch.pv_vcpu.mfn_access_reset_req) ) \
+ p2m_mem_access_reset_mfn_entry(); \
} while (0)
#define __get_user_size(x,ptr,size,retval,errret) \