
Re: [Xen-devel] PVH: libxl expertise...



On Wed, 14 Nov 2012 09:58:30 +0000
Ian Campbell <Ian.Campbell@xxxxxxxxxx> wrote:

> On Wed, 2012-11-14 at 01:31 +0000, Mukesh Rathor wrote:
> > On Tue, 13 Nov 2012 17:21:38 -0800
> > Mukesh Rathor <mukesh.rathor@xxxxxxxxxx> wrote:
> > 
> > > 
> Did you say that if you use the raw ELF file it works?

You mean vmlinux? I couldn't; it's 133MB and my boot partition is only 99MB.

> BTW for the purposes of parallelising things up you could post the
> hypervisor side part of the series stuff + a patch to force enable PVH
> in libxc (obviously the last not to be applied) first then we can be
> reviewing the h/v side things while simultaneously figuring out how
> the tools side fits together.

Ok. I've been procrastinating as the patch is full of debug code, is rather
big, and would take a lot of time to split up into separate patches. If you
don't mind a very raw/debug version, all in one, here it is (built on c/s
26124 + my KDB patch):
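
(Side note for anyone wanting to try the xl side of this: the xl_cmdimpl.c hunk
below keys PVH off a new "pvh" boolean in the guest config and bails out unless
"hap" is also enabled. A minimal PV guest config sketch for that -- the kernel
path and sizes are just placeholders -- would be:

    kernel = "/boot/vmlinuz-pvh"
    memory = 1024
    pvh    = 1
    hap    = 1

Nothing else in the config changes; the ci_pvh/bi_pvh plumbing below carries
the flag down through libxl into libxc.)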



diff -r 8b0762504037 tools/debugger/gdbsx/xg/xg_main.c
--- a/tools/debugger/gdbsx/xg/xg_main.c Wed Oct 31 16:08:55 2012 -0700
+++ b/tools/debugger/gdbsx/xg/xg_main.c Wed Nov 14 11:03:24 2012 -0800
@@ -80,6 +80,7 @@ int xgtrc_on = 0;
 struct xen_domctl domctl;         /* just use a global domctl */
 
 static int     _hvm_guest;        /* hvm guest? 32bit HVMs have 64bit context */
+static int     _pvh_guest;        /* PV guest in HVM container */
 static domid_t _dom_id;           /* guest domid */
 static int     _max_vcpu_id;      /* thus max_vcpu_id+1 VCPUs */
 static int     _dom0_fd;          /* fd of /dev/privcmd */
@@ -308,6 +309,7 @@ xg_attach(int domid, int guest_bitness)
 
     _max_vcpu_id = domctl.u.getdomaininfo.max_vcpu_id;
     _hvm_guest = (domctl.u.getdomaininfo.flags & XEN_DOMINF_hvm_guest);
+    _pvh_guest = (domctl.u.getdomaininfo.flags & XEN_DOMINF_pvh_guest);
     return _max_vcpu_id;
 }
 
@@ -368,7 +370,7 @@ _change_TF(vcpuid_t which_vcpu, int gues
     int sz = sizeof(anyc);
 
     /* first try the MTF for hvm guest. otherwise do manually */
-    if (_hvm_guest) {
+    if (_hvm_guest || _pvh_guest) {
         domctl.u.debug_op.vcpu = which_vcpu;
         domctl.u.debug_op.op = setit ? XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON :
                                        XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF;
diff -r 8b0762504037 tools/libxc/Makefile
--- a/tools/libxc/Makefile      Wed Oct 31 16:08:55 2012 -0700
+++ b/tools/libxc/Makefile      Wed Nov 14 11:03:24 2012 -0800
@@ -73,7 +73,7 @@ OSDEP_SRCS-y                 += xenctrl_
 -include $(XEN_TARGET_ARCH)/Makefile
 
 CFLAGS   += -Werror -Wmissing-prototypes
-CFLAGS   += -I. $(CFLAGS_xeninclude)
+CFLAGS   += -I. $(CFLAGS_xeninclude) -g -O0
 
 # Needed for posix_fadvise64() in xc_linux.c
 CFLAGS-$(CONFIG_Linux) += -D_GNU_SOURCE
diff -r 8b0762504037 tools/libxc/xc_dom_x86.c
--- a/tools/libxc/xc_dom_x86.c  Wed Oct 31 16:08:55 2012 -0700
+++ b/tools/libxc/xc_dom_x86.c  Wed Nov 14 11:03:24 2012 -0800
@@ -386,7 +386,8 @@ static int setup_pgtables_x86_64(struct 
         pgpfn = (addr - dom->parms.virt_base) >> PAGE_SHIFT_X86;
         l1tab[l1off] =
             pfn_to_paddr(xc_dom_p2m_guest(dom, pgpfn)) | L1_PROT;
-        if ( (addr >= dom->pgtables_seg.vstart) && 
+        if ( !(dom->flags & SIF_IS_PVH)      &&
+             (addr >= dom->pgtables_seg.vstart) && 
              (addr < dom->pgtables_seg.vend) )
             l1tab[l1off] &= ~_PAGE_RW; /* page tables are r/o */
         if ( l1off == (L1_PAGETABLE_ENTRIES_X86_64 - 1) )
@@ -716,7 +717,7 @@ int arch_setup_meminit(struct xc_dom_ima
     rc = x86_compat(dom->xch, dom->guest_domid, dom->guest_type);
     if ( rc )
         return rc;
-    if ( xc_dom_feature_translated(dom) )
+    if ( xc_dom_feature_translated(dom) && !(dom->flags & SIF_IS_PVH) )
     {
         dom->shadow_enabled = 1;
         rc = x86_shadow(dom->xch, dom->guest_domid);
@@ -830,7 +831,7 @@ int arch_setup_bootlate(struct xc_dom_im
         }
 
         /* Map grant table frames into guest physmap. */
-        for ( i = 0; ; i++ )
+        for ( i = 0; !(dom->flags & SIF_IS_PVH); i++ )
         {
             rc = xc_domain_add_to_physmap(dom->xch, dom->guest_domid,
                                           XENMAPSPACE_grant_table,
diff -r 8b0762504037 tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c   Wed Oct 31 16:08:55 2012 -0700
+++ b/tools/libxc/xc_domain_restore.c   Wed Nov 14 11:03:24 2012 -0800
@@ -1933,15 +1933,15 @@ int xc_domain_restore(xc_interface *xch,
             munmap(start_info, PAGE_SIZE);
         }
         /* Uncanonicalise each GDT frame number. */
-        if ( GET_FIELD(ctxt, gdt_ents) > 8192 )
+        if ( GET_FIELD(ctxt, u.pv.gdt_ents) > 8192 )
         {
             ERROR("GDT entry count out of range");
             goto out;
         }
 
-        for ( j = 0; (512*j) < GET_FIELD(ctxt, gdt_ents); j++ )
+        for ( j = 0; (512*j) < GET_FIELD(ctxt, u.pv.gdt_ents); j++ )
         {
-            pfn = GET_FIELD(ctxt, gdt_frames[j]);
+            pfn = GET_FIELD(ctxt, u.pv.gdt_frames[j]);
             if ( (pfn >= dinfo->p2m_size) ||
                  (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
             {
@@ -1949,7 +1949,7 @@ int xc_domain_restore(xc_interface *xch,
                       j, (unsigned long)pfn);
                 goto out;
             }
-            SET_FIELD(ctxt, gdt_frames[j], ctx->p2m[pfn]);
+            SET_FIELD(ctxt, u.pv.gdt_frames[j], ctx->p2m[pfn]);
         }
         /* Uncanonicalise the page table base pointer. */
         pfn = UNFOLD_CR3(GET_FIELD(ctxt, ctrlreg[3]));
diff -r 8b0762504037 tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c      Wed Oct 31 16:08:55 2012 -0700
+++ b/tools/libxc/xc_domain_save.c      Wed Nov 14 11:03:24 2012 -0800
@@ -1889,15 +1889,15 @@ int xc_domain_save(xc_interface *xch, in
         }
 
         /* Canonicalise each GDT frame number. */
-        for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents); j++ )
+        for ( j = 0; (512*j) < GET_FIELD(&ctxt, u.pv.gdt_ents); j++ )
         {
-            mfn = GET_FIELD(&ctxt, gdt_frames[j]);
+            mfn = GET_FIELD(&ctxt, u.pv.gdt_frames[j]);
             if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
             {
                 ERROR("GDT frame is not in range of pseudophys map");
                 goto out;
             }
-            SET_FIELD(&ctxt, gdt_frames[j], mfn_to_pfn(mfn));
+            SET_FIELD(&ctxt, u.pv.gdt_frames[j], mfn_to_pfn(mfn));
         }
 
         /* Canonicalise the page table base pointer. */
diff -r 8b0762504037 tools/libxl/Makefile
--- a/tools/libxl/Makefile      Wed Oct 31 16:08:55 2012 -0700
+++ b/tools/libxl/Makefile      Wed Nov 14 11:03:24 2012 -0800
@@ -13,7 +13,7 @@ XLUMINOR = 1
 
 CFLAGS += -Werror -Wno-format-zero-length -Wmissing-declarations \
        -Wno-declaration-after-statement -Wformat-nonliteral
-CFLAGS += -I. -fPIC
+CFLAGS += -I. -fPIC -O0
 
 ifeq ($(CONFIG_Linux),y)
 LIBUUID_LIBS += -luuid
diff -r 8b0762504037 tools/libxl/libxl_create.c
--- a/tools/libxl/libxl_create.c        Wed Oct 31 16:08:55 2012 -0700
+++ b/tools/libxl/libxl_create.c        Wed Nov 14 11:03:24 2012 -0800
@@ -409,6 +409,8 @@ int libxl__domain_make(libxl__gc *gc, li
         flags |= XEN_DOMCTL_CDF_hvm_guest;
         flags |= libxl_defbool_val(info->hap) ? XEN_DOMCTL_CDF_hap : 0;
         flags |= libxl_defbool_val(info->oos) ? 0 : XEN_DOMCTL_CDF_oos_off;
+    } else if ( libxl_defbool_val(info->ci_pvh) ) {
+        flags |= (XEN_DOMCTL_CDF_pvh_guest | XEN_DOMCTL_CDF_hap);
     }
     *domid = -1;
 
diff -r 8b0762504037 tools/libxl/libxl_dom.c
--- a/tools/libxl/libxl_dom.c   Wed Oct 31 16:08:55 2012 -0700
+++ b/tools/libxl/libxl_dom.c   Wed Nov 14 11:03:24 2012 -0800
@@ -270,7 +270,8 @@ int libxl__build_pre(libxl__gc *gc, uint
     if (rtc_timeoffset)
         xc_domain_set_time_offset(ctx->xch, domid, rtc_timeoffset);
 
-    if (info->type == LIBXL_DOMAIN_TYPE_HVM) {
+    if (info->type == LIBXL_DOMAIN_TYPE_HVM ||
+        libxl_defbool_val(info->bi_pvh) ) {
         unsigned long shadow;
         shadow = (info->shadow_memkb + 1023) / 1024;
         xc_shadow_control(ctx->xch, domid, XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION, NULL, 0, &shadow, 0, NULL);
@@ -368,9 +369,15 @@ int libxl__build_pv(libxl__gc *gc, uint3
     struct xc_dom_image *dom;
     int ret;
     int flags = 0;
+    int is_pvh = libxl_defbool_val(info->bi_pvh);
 
     xc_dom_loginit(ctx->xch);
 
+    if (is_pvh) {
+        printf("info..features:%s\n", info->u.pv.features);
+        info->u.pv.features = strdup("auto_translated_physmap");
+    }
+
     dom = xc_dom_allocate(ctx->xch, state->pv_cmdline, info->u.pv.features);
     if (!dom) {
         LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "xc_dom_allocate failed");
@@ -408,6 +415,7 @@ int libxl__build_pv(libxl__gc *gc, uint3
         }
     }
 
+    flags |= is_pvh ? SIF_IS_PVH : 0;
     dom->flags = flags;
     dom->console_evtchn = state->console_port;
     dom->console_domid = state->console_domid;
@@ -438,7 +446,8 @@ int libxl__build_pv(libxl__gc *gc, uint3
         LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "xc_dom_boot_image failed");
         goto out;
     }
-    if ( (ret = xc_dom_gnttab_init(dom)) != 0 ) {
+    /* PVH sets up its own grant during boot via hvm mechanisms */
+    if ( !is_pvh && (ret = xc_dom_gnttab_init(dom)) != 0 ) {
         LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "xc_dom_gnttab_init failed");
         goto out;
     }
diff -r 8b0762504037 tools/libxl/libxl_types.idl
--- a/tools/libxl/libxl_types.idl       Wed Oct 31 16:08:55 2012 -0700
+++ b/tools/libxl/libxl_types.idl       Wed Nov 14 11:03:24 2012 -0800
@@ -243,6 +243,7 @@ libxl_domain_create_info = Struct("domai
     ("platformdata", libxl_key_value_list),
     ("poolid",       uint32),
     ("run_hotplug_scripts",libxl_defbool),
+    ("ci_pvh",       libxl_defbool),
     ], dir=DIR_IN)
 
 MemKB = UInt(64, init_val = "LIBXL_MEMKB_DEFAULT")
@@ -339,6 +340,7 @@ libxl_domain_build_info = Struct("domain
                                       ])),
                  ("invalid", Struct(None, [])),
                  ], keyvar_init_val = "LIBXL_DOMAIN_TYPE_INVALID")),
+    ("bi_pvh",       libxl_defbool),
     ], dir=DIR_IN
 )
 
diff -r 8b0762504037 tools/libxl/xl_cmdimpl.c
--- a/tools/libxl/xl_cmdimpl.c  Wed Oct 31 16:08:55 2012 -0700
+++ b/tools/libxl/xl_cmdimpl.c  Wed Nov 14 11:03:24 2012 -0800
@@ -615,8 +615,18 @@ static void parse_config_data(const char
         !strncmp(buf, "hvm", strlen(buf)))
         c_info->type = LIBXL_DOMAIN_TYPE_HVM;
 
+    libxl_defbool_setdefault(&c_info->ci_pvh, false);
+    libxl_defbool_setdefault(&c_info->hap, false);
+    xlu_cfg_get_defbool(config, "pvh", &c_info->ci_pvh, 0);
     xlu_cfg_get_defbool(config, "hap", &c_info->hap, 0);
 
+    if (libxl_defbool_val(c_info->ci_pvh) &&
+        !libxl_defbool_val(c_info->hap)) {
+
+        fprintf(stderr, "hap is required for PVH domain\n");
+        exit(1);
+    }
+
     if (xlu_cfg_replace_string (config, "name", &c_info->name, 0)) {
         fprintf(stderr, "Domain name must be specified.\n");
         exit(1);
@@ -916,6 +926,7 @@ static void parse_config_data(const char
 
         b_info->u.pv.cmdline = cmdline;
         xlu_cfg_replace_string (config, "ramdisk", &b_info->u.pv.ramdisk, 0);
+        libxl_defbool_set(&b_info->bi_pvh, libxl_defbool_val(c_info->ci_pvh));
         break;
     }
     default:
diff -r 8b0762504037 tools/xenstore/xenstored_domain.c
--- a/tools/xenstore/xenstored_domain.c Wed Oct 31 16:08:55 2012 -0700
+++ b/tools/xenstore/xenstored_domain.c Wed Nov 14 11:03:24 2012 -0800
@@ -578,11 +578,13 @@ static int close_xc_handle(void *_handle
        return 0;
 }
 
+#if 0
 static int close_xcg_handle(void *_handle)
 {
        xc_gnttab_close(*(xc_gnttab **)_handle);
        return 0;
 }
+#endif
 
 /* Returns the implicit path of a connection (only domains have this) */
 const char *get_implicit_path(const struct connection *conn)
@@ -639,12 +641,14 @@ void domain_init(void)
        if (!xcg_handle)
                barf_perror("Failed to allocate domain gnttab handle");
 
+*xcg_handle = NULL;
+#if 0
        *xcg_handle = xc_gnttab_open(NULL, 0);
        if (*xcg_handle < 0)
                xprintf("WARNING: Failed to open connection to gnttab\n");
        else
                talloc_set_destructor(xcg_handle, close_xcg_handle);
-
+#endif
        xce_handle = xc_evtchn_open(NULL, 0);
 
        if (xce_handle == NULL)
diff -r 8b0762504037 xen/arch/x86/debug.c
--- a/xen/arch/x86/debug.c      Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/debug.c      Wed Nov 14 11:03:24 2012 -0800
@@ -59,7 +59,7 @@ dbg_hvm_va2mfn(dbgva_t vaddr, struct dom
         return INVALID_MFN;
     }
 
-    mfn = mfn_x(get_gfn(dp, *gfn, &gfntype)); 
+    mfn = mfn_x(get_gfn_query_unlocked(dp, *gfn, &gfntype)); 
     if ( p2m_is_readonly(gfntype) && toaddr )
     {
         DBGP2("kdb:p2m_is_readonly: gfntype:%x\n", gfntype);
@@ -153,7 +153,7 @@ dbg_rw_guest_mem(dbgva_t addr, dbgbyte_t
 
         pagecnt = min_t(long, PAGE_SIZE - (addr & ~PAGE_MASK), len);
 
-        mfn = (dp->is_hvm
+        mfn = (is_hvm_or_pvh_domain(dp)
                ? dbg_hvm_va2mfn(addr, dp, toaddr, &gfn)
                : dbg_pv_va2mfn(addr, dp, pgd3));
         if ( mfn == INVALID_MFN ) 
@@ -173,8 +173,6 @@ dbg_rw_guest_mem(dbgva_t addr, dbgbyte_t
         }
 
         unmap_domain_page(va);
-        if ( gfn != INVALID_GFN )
-            put_gfn(dp, gfn);
 
         addr += pagecnt;
         buf += pagecnt;
diff -r 8b0762504037 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/domain.c     Wed Nov 14 11:03:24 2012 -0800
@@ -410,7 +410,7 @@ int vcpu_initialise(struct vcpu *v)
 
     vmce_init_vcpu(v);
 
-    if ( is_hvm_domain(d) )
+    if ( is_hvm_or_pvh_domain(d) )
     {
         rc = hvm_vcpu_initialise(v);
         goto done;
@@ -476,7 +476,7 @@ void vcpu_destroy(struct vcpu *v)
 
     vcpu_destroy_fpu(v);
 
-    if ( is_hvm_vcpu(v) )
+    if ( is_hvm_or_pvh_vcpu(v) )
         hvm_vcpu_destroy(v);
     else if ( standalone_trap_ctxt(v) )
         free_xenheap_page(v->arch.pv_vcpu.trap_ctxt);
@@ -489,7 +489,7 @@ int arch_domain_create(struct domain *d,
     int rc = -ENOMEM;
 
     d->arch.hvm_domain.hap_enabled =
-        is_hvm_domain(d) &&
+        is_hvm_or_pvh_domain(d) &&
         hvm_funcs.hap_supported &&
         (domcr_flags & DOMCRF_hap);
     d->arch.hvm_domain.mem_sharing_enabled = 0;
@@ -539,7 +539,7 @@ int arch_domain_create(struct domain *d,
                             __PAGE_HYPERVISOR);
 
     HYPERVISOR_COMPAT_VIRT_START(d) =
-        is_hvm_domain(d) ? ~0u : __HYPERVISOR_COMPAT_VIRT_START;
+        is_hvm_or_pvh_domain(d) ? ~0u : __HYPERVISOR_COMPAT_VIRT_START;
 
     if ( (rc = paging_domain_init(d, domcr_flags)) != 0 )
         goto fail;
@@ -581,7 +581,7 @@ int arch_domain_create(struct domain *d,
             goto fail;
     }
 
-    if ( is_hvm_domain(d) )
+    if ( is_hvm_or_pvh_domain(d) )
     {
         if ( (rc = hvm_domain_initialise(d)) != 0 )
         {
@@ -593,9 +593,9 @@ int arch_domain_create(struct domain *d,
     {
         /* 64-bit PV guest by default. */
         d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
-
+    }
+    if ( !is_hvm_domain(d) )
         spin_lock_init(&d->arch.pv_domain.e820_lock);
-    }
 
     /* initialize default tsc behavior in case tools don't */
     tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0);
@@ -622,9 +622,10 @@ void arch_domain_destroy(struct domain *
 {
     unsigned int i;
 
-    if ( is_hvm_domain(d) )
+    if ( is_hvm_or_pvh_domain(d) )
         hvm_domain_destroy(d);
-    else
+
+    if ( !is_hvm_domain(d) )
         xfree(d->arch.pv_domain.e820);
 
     free_domain_pirqs(d);
@@ -683,6 +684,9 @@ int arch_set_info_guest(
     unsigned int i;
     int rc = 0, compat;
 
+    if ( is_pvh_vcpu(v) && is_pvh_vcpu(current) )
+        return -EINVAL;
+
     /* The context is a compat-mode one if the target domain is compat-mode;
      * we expect the tools to DTRT even in compat-mode callers. */
     compat = is_pv_32on64_domain(d);
@@ -690,7 +694,7 @@ int arch_set_info_guest(
 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
     flags = c(flags);
 
-    if ( !is_hvm_vcpu(v) )
+    if ( !is_hvm_or_pvh_vcpu(v) )
     {
         if ( !compat )
         {
@@ -743,7 +747,7 @@ int arch_set_info_guest(
     v->fpu_initialised = !!(flags & VGCF_I387_VALID);
 
     v->arch.flags &= ~TF_kernel_mode;
-    if ( (flags & VGCF_in_kernel) || is_hvm_vcpu(v)/*???*/ )
+    if ( (flags & VGCF_in_kernel) || is_hvm_or_pvh_vcpu(v)/*???*/ )
         v->arch.flags |= TF_kernel_mode;
 
     v->arch.vgc_flags = flags;
@@ -752,7 +756,7 @@ int arch_set_info_guest(
     if ( !compat )
     {
         memcpy(&v->arch.user_regs, &c.nat->user_regs, sizeof(c.nat->user_regs));
-        if ( !is_hvm_vcpu(v) )
+        if ( !is_hvm_or_pvh_vcpu(v) )
             memcpy(v->arch.pv_vcpu.trap_ctxt, c.nat->trap_ctxt,
                    sizeof(c.nat->trap_ctxt));
     }
@@ -768,10 +772,13 @@ int arch_set_info_guest(
 
     v->arch.user_regs.eflags |= 2;
 
-    if ( is_hvm_vcpu(v) )
+    if ( is_hvm_or_pvh_vcpu(v) )
     {
         hvm_set_info_guest(v);
-        goto out;
+        if ( is_hvm_vcpu(v) || v->is_initialised )
+            goto out;
+        else
+            goto pvh_skip_pv;
     }
 
     init_int80_direct_trap(v);
@@ -780,7 +787,8 @@ int arch_set_info_guest(
     v->arch.pv_vcpu.iopl = (v->arch.user_regs.eflags >> 12) & 3;
     v->arch.user_regs.eflags &= ~X86_EFLAGS_IOPL;
 
-    /* Ensure real hardware interrupts are enabled. */
+    /* Ensure real hardware interrupts are enabled. FYI: PVH may not have 
+     * IDT set on all vcpus so don't enable IF for it yet. */
     v->arch.user_regs.eflags |= X86_EFLAGS_IF;
 
     if ( !v->is_initialised )
@@ -811,8 +819,8 @@ int arch_set_info_guest(
         }
 
         for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames); ++i )
-            fail |= v->arch.pv_vcpu.gdt_frames[i] != c(gdt_frames[i]);
-        fail |= v->arch.pv_vcpu.gdt_ents != c(gdt_ents);
+            fail |= v->arch.pv_vcpu.gdt_frames[i] != c(u.pv.gdt_frames[i]);
+        fail |= v->arch.pv_vcpu.gdt_ents != c(u.pv.gdt_ents);
 
         fail |= v->arch.pv_vcpu.ldt_base != c(ldt_base);
         fail |= v->arch.pv_vcpu.ldt_ents != c(ldt_ents);
@@ -861,21 +869,22 @@ int arch_set_info_guest(
         d->vm_assist = c(vm_assist);
 
     if ( !compat )
-        rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
+        rc = (int)set_gdt(v, c.nat->u.pv.gdt_frames, c.nat->u.pv.gdt_ents);
     else
     {
         unsigned long gdt_frames[ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames)];
-        unsigned int n = (c.cmp->gdt_ents + 511) / 512;
+        unsigned int n = (c.cmp->u.pv.gdt_ents + 511) / 512;
 
         if ( n > ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames) )
             return -EINVAL;
         for ( i = 0; i < n; ++i )
-            gdt_frames[i] = c.cmp->gdt_frames[i];
-        rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents);
+            gdt_frames[i] = c.cmp->u.pv.gdt_frames[i];
+        rc = (int)set_gdt(v, gdt_frames, c.cmp->u.pv.gdt_ents);
     }
     if ( rc != 0 )
         return rc;
 
+pvh_skip_pv:
     if ( !compat )
     {
         cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[3]);
@@ -883,19 +892,26 @@ int arch_set_info_guest(
 
         if ( !cr3_page )
         {
-            destroy_gdt(v);
+            if ( !is_pvh_vcpu(v) )
+                destroy_gdt(v);
             return -EINVAL;
         }
         if ( !paging_mode_refcounts(d)
              && !get_page_type(cr3_page, PGT_base_page_table) )
         {
             put_page(cr3_page);
-            destroy_gdt(v);
+            if ( !is_pvh_vcpu(v) )
+                destroy_gdt(v);
             return -EINVAL;
         }
 
+        if ( is_pvh_vcpu(v) ) {
+            v->arch.cr3 = page_to_mfn(cr3_page);
+            v->arch.hvm_vcpu.guest_cr[3] = c.nat->ctrlreg[3];
+        }
+
         v->arch.guest_table = pagetable_from_page(cr3_page);
-        if ( c.nat->ctrlreg[1] )
+        if ( c.nat->ctrlreg[1] && !is_pvh_vcpu(v) )
         {
             cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[1]);
             cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC);
@@ -920,7 +936,8 @@ int arch_set_info_guest(
         }
         else if ( !(flags & VGCF_in_kernel) )
         {
-            destroy_gdt(v);
+            if ( !is_pvh_vcpu(v) )
+                destroy_gdt(v);
             return -EINVAL;
         }
     }
@@ -961,6 +978,13 @@ int arch_set_info_guest(
 
     update_cr3(v);
 
+    if ( is_pvh_vcpu(v) )
+    {
+        /* guest is bringing up non-boot SMP vcpu */
+        if ( (rc=hvm_pvh_set_vcpu_info(v, c.nat)) != 0 )
+            return rc;
+    }
+
  out:
     if ( flags & VGCF_online )
         clear_bit(_VPF_down, &v->pause_flags);
@@ -991,16 +1015,21 @@ void arch_vcpu_reset(struct vcpu *v)
 static void
 unmap_vcpu_info(struct vcpu *v)
 {
-    unsigned long mfn;
+    unsigned long mfn, *mfnp;
 
-    if ( v->arch.pv_vcpu.vcpu_info_mfn == INVALID_MFN )
+    if ( is_pvh_vcpu(v) )
+        mfnp = &v->arch.hvm_vcpu.hv_pvh.pvh_vcpu_info_mfn;
+    else
+        mfnp = &v->arch.pv_vcpu.vcpu_info_mfn;
+
+    mfn = *mfnp;
+    if ( mfn == INVALID_MFN )
         return;
 
-    mfn = v->arch.pv_vcpu.vcpu_info_mfn;
     unmap_domain_page_global(v->vcpu_info);
 
     v->vcpu_info = &dummy_vcpu_info;
-    v->arch.pv_vcpu.vcpu_info_mfn = INVALID_MFN;
+    *mfnp = INVALID_MFN;
 
     put_page_and_type(mfn_to_page(mfn));
 }
@@ -1011,7 +1040,7 @@ unmap_vcpu_info(struct vcpu *v)
  * of memory, and it sets a pending event to make sure that a pending
  * event doesn't get missed.
  */
-static int
+static noinline int
 map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset)
 {
     struct domain *d = v->domain;
@@ -1019,11 +1048,17 @@ map_vcpu_info(struct vcpu *v, unsigned l
     vcpu_info_t *new_info;
     struct page_info *page;
     int i;
+    unsigned long *mfnp;
+
+    if ( is_pvh_vcpu(v) )
+        mfnp = &v->arch.hvm_vcpu.hv_pvh.pvh_vcpu_info_mfn;
+    else
+        mfnp = &v->arch.pv_vcpu.vcpu_info_mfn;
 
     if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) )
         return -EINVAL;
 
-    if ( v->arch.pv_vcpu.vcpu_info_mfn != INVALID_MFN )
+    if ( *mfnp != INVALID_MFN )
         return -EINVAL;
 
     /* Run this command on yourself or on other offline VCPUS. */
@@ -1060,7 +1095,7 @@ map_vcpu_info(struct vcpu *v, unsigned l
     }
 
     v->vcpu_info = new_info;
-    v->arch.pv_vcpu.vcpu_info_mfn = page_to_mfn(page);
+    *mfnp = page_to_mfn(page);
 
     /* Set new vcpu_info pointer /before/ setting pending flags. */
     wmb();
@@ -1370,10 +1405,10 @@ static void save_segments(struct vcpu *v
     struct cpu_user_regs *regs = &v->arch.user_regs;
     unsigned int dirty_segment_mask = 0;
 
-    regs->ds = read_segment_register(ds);
-    regs->es = read_segment_register(es);
-    regs->fs = read_segment_register(fs);
-    regs->gs = read_segment_register(gs);
+    regs->ds = read_segment_register(v, regs, ds);
+    regs->es = read_segment_register(v, regs, es);
+    regs->fs = read_segment_register(v, regs, fs);
+    regs->gs = read_segment_register(v, regs, gs);
 
     if ( regs->ds )
         dirty_segment_mask |= DIRTY_DS;
@@ -1466,7 +1501,7 @@ static void update_runstate_area(struct 
 
 static inline int need_full_gdt(struct vcpu *v)
 {
-    return (!is_hvm_vcpu(v) && !is_idle_vcpu(v));
+    return (!is_hvm_or_pvh_vcpu(v) && !is_idle_vcpu(v));
 }
 
 static void __context_switch(void)
@@ -1593,7 +1628,7 @@ void context_switch(struct vcpu *prev, s
         /* Re-enable interrupts before restoring state which may fault. */
         local_irq_enable();
 
-        if ( !is_hvm_vcpu(next) )
+        if ( !is_hvm_or_pvh_vcpu(next) )
         {
             load_LDT(next);
             load_segments(next);
@@ -2034,7 +2069,7 @@ int domain_relinquish_resources(struct d
         for_each_vcpu ( d, v )
             vcpu_destroy_pagetables(v);
 
-        if ( !is_hvm_domain(d) )
+        if ( !is_hvm_or_pvh_domain(d) )
         {
             for_each_vcpu ( d, v )
             {
@@ -2045,6 +2080,7 @@ int domain_relinquish_resources(struct d
                  */
                 destroy_gdt(v);
 
+/**** PVH: unmap vcpu info here ??????????????? */
                 unmap_vcpu_info(v);
             }
 
@@ -2109,7 +2145,7 @@ int domain_relinquish_resources(struct d
         BUG();
     }
 
-    if ( is_hvm_domain(d) )
+    if ( is_hvm_or_pvh_domain(d) )
         hvm_domain_relinquish_resources(d);
 
     return 0;
@@ -2190,7 +2226,7 @@ void vcpu_mark_events_pending(struct vcp
     if ( already_pending )
         return;
 
-    if ( is_hvm_vcpu(v) )
+    if ( is_hvm_or_pvh_vcpu(v) )
         hvm_assert_evtchn_irq(v);
     else
         vcpu_kick(v);
diff -r 8b0762504037 xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c       Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/domain_build.c       Wed Nov 14 11:03:24 2012 -0800
@@ -35,6 +35,8 @@
 #include <asm/setup.h>
 #include <asm/bzimage.h> /* for bzimage_parse */
 #include <asm/io_apic.h>
+#include <asm/hap.h>
+#include <asm/debugger.h>
 
 #include <public/version.h>
 
@@ -307,6 +309,31 @@ static void __init process_dom0_ioports_
     }
 }
 
+static noinline __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
+                                    unsigned long mfn, unsigned long vphysmap_s)
+{
+    if (d->is_pvh) {
+        int rc = guest_physmap_add_page(d, pfn, mfn, 0);
+        BUG_ON(rc);             /* for now while PVH feature is experimental */
+       return;
+    }
+    if ( !is_pv_32on64_domain(d) )
+        ((unsigned long *)vphysmap_s)[pfn] = mfn;
+    else
+        ((unsigned int *)vphysmap_s)[pfn] = mfn;
+
+    set_gpfn_from_mfn(mfn, pfn);
+}
+
+static __init void noinline copy_pvh(char *dest, char *src, int bytes)
+{
+    int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src, bytes, 0, 1, 0);
+    if (rem) {
+        printk("Failed to copy to dom0. len:%d rem:%d\n", bytes, rem);
+        domain_crash_synchronous();
+    }
+}
+
 int __init construct_dom0(
     struct domain *d,
     const module_t *image, unsigned long image_headroom,
@@ -314,6 +341,7 @@ int __init construct_dom0(
     void *(*bootstrap_map)(const module_t *),
     char *cmdline)
 {
+    char si_buf[PAGE_SIZE], tmp_buf[PAGE_SIZE];
     int i, cpu, rc, compatible, compat32, order, machine;
     struct cpu_user_regs *regs;
     unsigned long pfn, mfn;
@@ -322,7 +350,7 @@ int __init construct_dom0(
     unsigned long alloc_spfn;
     unsigned long alloc_epfn;
     unsigned long initrd_pfn = -1, initrd_mfn = 0;
-    unsigned long count;
+    unsigned long count, shared_info_pfn_addr = 0;
     struct page_info *page = NULL;
     start_info_t *si;
     struct vcpu *v = d->vcpu[0];
@@ -480,6 +508,15 @@ int __init construct_dom0(
     vstartinfo_end   = (vstartinfo_start +
                         sizeof(struct start_info) +
                         sizeof(struct dom0_vga_console_info));
+
+    if ( is_pvh_domain(d) ) {
+        kdbp("MUK: vstartinfo_end:%lx\n", vstartinfo_end);
+       shared_info_pfn_addr = round_pgup(vstartinfo_end) - v_start;
+       vstartinfo_end   += PAGE_SIZE;
+        kdbp("MUK: adjusted vstartinfo_end:%lx shared_info_pfn:%lx\n", 
+              vstartinfo_end, shared_info_pfn_addr);
+    }
+
     vpt_start        = round_pgup(vstartinfo_end);
     for ( nr_pt_pages = 2; ; nr_pt_pages++ )
     {
@@ -621,20 +658,30 @@ int __init construct_dom0(
         maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
         l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
     }
-    copy_page(l4tab, idle_pg_table);
-    l4tab[0] = l4e_empty(); /* zap trampoline mapping */
-    l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
-        l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
-    l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
-        l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
-    v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
-    if ( is_pv_32on64_domain(d) )
-        v->arch.guest_table_user = v->arch.guest_table;
+    if ( is_pvh_domain(d) ) 
+    {
+        v->arch.guest_table = pagetable_from_paddr(vpt_start - v_start);
+        pfn = 0;
+    } else {
+        copy_page(l4tab, idle_pg_table);
+        l4tab[0] = l4e_empty(); /* zap trampoline mapping */
+        l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
+            l4e_from_paddr(__pa(l4start), __PAGE_HYPERVISOR);
+        l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] =
+            l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
+            v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
+        if ( is_pv_32on64_domain(d) )
+            v->arch.guest_table_user = v->arch.guest_table;
+        pfn = alloc_spfn;
+    }
 
     l4tab += l4_table_offset(v_start);
-    pfn = alloc_spfn;
     for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
     {
+        /* initrd chunk's mfns are separate, so we need to adjust for them */
+        signed long hyb_adj = is_pvh_domain(d) ?
+                            (-alloc_spfn + PFN_UP(initrd_len))<<PAGE_SHIFT : 0;
+
         if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
         {
             maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
@@ -661,16 +708,17 @@ int __init construct_dom0(
                     clear_page(l3tab);
                     if ( count == 0 )
                         l3tab += l3_table_offset(v_start);
-                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
+                    *l4tab = l4e_from_paddr(__pa(l3start) + hyb_adj, L4_PROT);
                     l4tab++;
                 }
-                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
+                *l3tab = l3e_from_paddr(__pa(l2start) + hyb_adj, L3_PROT);
                 l3tab++;
             }
-            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
+            *l2tab = l2e_from_paddr(__pa(l1start) + hyb_adj, L2_PROT);
             l2tab++;
         }
-        if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
+        if ( is_pvh_domain(d) ||
+             count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
             mfn = pfn++;
         else
             mfn = initrd_mfn++;
@@ -678,6 +726,9 @@ int __init construct_dom0(
                                     L1_PROT : COMPAT_L1_PROT));
         l1tab++;
 
+        if ( is_pvh_domain(d) )
+            continue;
+
         page = mfn_to_page(mfn);
         if ( (page->u.inuse.type_info == 0) &&
              !get_page_and_type(page, d, PGT_writable_page) )
@@ -714,7 +765,7 @@ int __init construct_dom0(
     l2tab += l2_table_offset(vpt_start);
     l1start = l1tab = l2e_to_l1e(*l2tab);
     l1tab += l1_table_offset(vpt_start);
-    for ( count = 0; count < nr_pt_pages; count++ ) 
+    for ( count = 0; count < nr_pt_pages && !is_pvh_domain(d); count++ ) 
     {
         l1e_remove_flags(*l1tab, _PAGE_RW);
         page = mfn_to_page(l1e_get_pfn(*l1tab));
@@ -758,6 +809,11 @@ int __init construct_dom0(
         (void)alloc_vcpu(d, i, cpu);
     }
 
+    if ( is_pvh_domain(d) )
+    {
+        v->arch.cr3 = v->arch.hvm_vcpu.guest_cr[3] =
+                        (pagetable_get_pfn(v->arch.guest_table)) << PAGE_SHIFT;
+    }
     /* Set up CR3 value for write_ptbase */
     if ( paging_mode_enabled(d) )
         paging_update_paging_modes(v);
@@ -767,34 +823,12 @@ int __init construct_dom0(
     /* We run on dom0's page tables for the final part of the build process. */
     write_ptbase(v);
 
-    /* Copy the OS image and free temporary buffer. */
-    elf.dest = (void*)vkern_start;
-    rc = elf_load_binary(&elf);
-    if ( rc < 0 )
-    {
-        printk("Failed to load the kernel binary\n");
-        return rc;
-    }
-    bootstrap_map(NULL);
+    /* Set up start info area. */
+    if ( is_pvh_domain(d) )
+        si = (start_info_t *)si_buf;
+    else
+        si = (start_info_t *)vstartinfo_start;
 
-    if ( UNSET_ADDR != parms.virt_hypercall )
-    {
-        if ( (parms.virt_hypercall < v_start) ||
-             (parms.virt_hypercall >= v_end) )
-        {
-            write_ptbase(current);
-            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
-            return -1;
-        }
-        hypercall_page_initialise(
-            d, (void *)(unsigned long)parms.virt_hypercall);
-    }
-
-    /* Free temporary buffers. */
-    discard_initial_images();
-
-    /* Set up start info area. */
-    si = (start_info_t *)vstartinfo_start;
     clear_page(si);
     si->nr_pages = nr_pages;
 
@@ -803,6 +837,8 @@ int __init construct_dom0(
     si->flags        = SIF_PRIVILEGED | SIF_INITDOMAIN;
     if ( !vinitrd_start && initrd_len )
         si->flags   |= SIF_MOD_START_PFN;
+    if ( is_pvh_domain(d) )
+        si->flags   |= SIF_IS_PVH;
     si->flags       |= (xen_processor_pmbits << 8) & SIF_PM_MASK;
     si->pt_base      = vpt_start;
     si->nr_pt_frames = nr_pt_pages;
@@ -812,7 +848,7 @@ int __init construct_dom0(
 
     count = d->tot_pages;
     /* Set up the phys->machine table if not part of the initial mapping. */
-    if ( parms.p2m_base != UNSET_ADDR )
+    if ( parms.p2m_base != UNSET_ADDR && !is_pvh_domain(d) )
     {
         unsigned long va = vphysmap_start;
 
@@ -911,6 +947,11 @@ int __init construct_dom0(
             panic("Not enough RAM for DOM0 P->M table.\n");
     }
 
+    if (is_pvh_domain(d) )
+    {
+        hap_set_pvh_alloc_for_dom0(d, nr_pages);
+        kdbp("MUK: Count is:%d 0x%x\n", count, count);
+    }
     /* Write the phys->machine and machine->phys table entries. */
     for ( pfn = 0; pfn < count; pfn++ )
     {
@@ -927,11 +968,8 @@ int __init construct_dom0(
         if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) )
             mfn = alloc_epfn - (pfn - REVERSE_START);
 #endif
-        if ( !is_pv_32on64_domain(d) )
-            ((unsigned long *)vphysmap_start)[pfn] = mfn;
-        else
-            ((unsigned int *)vphysmap_start)[pfn] = mfn;
-        set_gpfn_from_mfn(mfn, pfn);
+        dom0_update_physmap(d, pfn, mfn, vphysmap_start);
+
         if (!(pfn & 0xfffff))
             process_pending_softirqs();
     }
@@ -947,8 +985,8 @@ int __init construct_dom0(
             if ( !page->u.inuse.type_info &&
                  !get_page_and_type(page, d, PGT_writable_page) )
                 BUG();
-            ((unsigned long *)vphysmap_start)[pfn] = mfn;
-            set_gpfn_from_mfn(mfn, pfn);
+            
+            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
             ++pfn;
             if (!(pfn & 0xfffff))
                 process_pending_softirqs();
@@ -968,11 +1006,7 @@ int __init construct_dom0(
 #ifndef NDEBUG
 #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
 #endif
-            if ( !is_pv_32on64_domain(d) )
-                ((unsigned long *)vphysmap_start)[pfn] = mfn;
-            else
-                ((unsigned int *)vphysmap_start)[pfn] = mfn;
-            set_gpfn_from_mfn(mfn, pfn);
+            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
 #undef pfn
             page++; pfn++;
             if (!(pfn & 0xfffff))
@@ -980,6 +1014,35 @@ int __init construct_dom0(
         }
     }
 
+    /* Copy the OS image and free temporary buffer. */
+    elf.dest = (void*)vkern_start;
+    rc = elf_load_binary(&elf, is_pvh_domain(d) );
+    if ( rc < 0 )
+    {
+        printk("Failed to load the kernel binary\n");
+        return rc;
+    }
+    bootstrap_map(NULL);
+
+    if ( UNSET_ADDR != parms.virt_hypercall )
+    {
+        void *addr = is_pvh_domain(d) ? tmp_buf : (void *)parms.virt_hypercall;
+
+        if ( (parms.virt_hypercall < v_start) ||
+             (parms.virt_hypercall >= v_end) )
+        {
+            write_ptbase(current);
+            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
+            return -1;
+        }
+        hypercall_page_initialise(d, addr);
+        if ( is_pvh_domain(d) )
+            copy_pvh((void *)parms.virt_hypercall, tmp_buf, PAGE_SIZE);
+    }
+
+    /* Free temporary buffers. */
+    discard_initial_images();
+
     if ( initrd_len != 0 )
     {
         si->mod_start = vinitrd_start ?: initrd_pfn;
@@ -995,6 +1058,15 @@ int __init construct_dom0(
         si->console.dom0.info_off  = sizeof(struct start_info);
         si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
     }
+    if ( is_pvh_domain(d) ) {
+        unsigned long mfn = virt_to_mfn(d->shared_info);
+        unsigned long pfn = shared_info_pfn_addr>>PAGE_SHIFT;
+        si->shared_info = shared_info_pfn_addr;
+        dom0_update_physmap(d, pfn, mfn, 0);
+
+        copy_pvh((char *)vstartinfo_start, si_buf, PAGE_SIZE);
+    }
+    kdbp("MUK: si->shared_info:%lx\n", si->shared_info);
 
     if ( is_pv_32on64_domain(d) )
         xlat_start_info(si, XLAT_start_info_console_dom0);
@@ -1007,6 +1079,7 @@ int __init construct_dom0(
     v->is_initialised = 1;
     clear_bit(_VPF_down, &v->pause_flags);
 
+/* PVH NOTE: some of these are ignored */
     /*
      * Initial register values:
      *  DS,ES,FS,GS = FLAT_KERNEL_DS
@@ -1025,12 +1098,16 @@ int __init construct_dom0(
     regs->eip = parms.virt_entry;
     regs->esp = vstack_end;
     regs->esi = vstartinfo_start;
-    regs->eflags = X86_EFLAGS_IF;
+    regs->eflags = X86_EFLAGS_IF | 0x2;
 
-    if ( opt_dom0_shadow )
+    if ( opt_dom0_shadow ) {
+        if ( is_pvh_domain(d) ) {
+            printk("Invalid option dom0_shadow for PVH\n");
+            return -EINVAL;
+        }
         if ( paging_enable(d, PG_SH_enable) == 0 ) 
             paging_update_paging_modes(v);
-
+    }
     if ( supervisor_mode_kernel )
     {
         v->arch.pv_vcpu.kernel_ss &= ~3;
diff -r 8b0762504037 xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c     Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/domctl.c     Wed Nov 14 11:03:24 2012 -0800
@@ -46,6 +46,77 @@ static int gdbsx_guest_mem_io(
     return (iop->remain ? -EFAULT : 0);
 }
 
+long domctl_memory_mapping(struct domain *d, unsigned long gfn,
+                           unsigned long mfn, unsigned long nr_mfns,
+                           int add_map)
+{
+    int i;
+    long ret = -EINVAL;
+
+    if ( (mfn + nr_mfns - 1) < mfn || /* wrap? */
+         ((mfn | (mfn + nr_mfns - 1)) >> (paddr_bits - PAGE_SHIFT)) ||
+         (gfn + nr_mfns - 1) < gfn ) /* wrap? */
+        return ret;
+
+    ret = -EPERM;
+    if ( !IS_PRIV(current->domain) &&
+         !iomem_access_permitted(current->domain, mfn, mfn + nr_mfns - 1) )
+        return ret;
+
+/* TBD: common code with do_physdev_op() */
+
+    ret = xsm_iomem_permission(d, mfn, mfn + nr_mfns - 1, add_map);
+    if ( ret )
+        return ret;
+
+    if ( add_map )
+    {
+        if ( !is_pvh_domain(d) )   /* PVH maps lots and lots */
+            printk(XENLOG_G_INFO
+                   "memory_map:add: dom%d gfn=%lx mfn=%lx nr=%lx\n",
+                   d->domain_id, gfn, mfn, nr_mfns);
+
+        ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1);
+        if ( !ret && paging_mode_translate(d) )
+        {
+            for ( i = 0; !ret && i < nr_mfns; i++ )
+                if ( !set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i)) )
+                    ret = -EIO;
+            if ( ret )
+            {
+                printk(XENLOG_G_WARNING
+                       "memory_map:fail: dom%d gfn=%lx mfn=%lx\n",
+                       d->domain_id, gfn + i, mfn + i);
+                while ( i-- )
+                    clear_mmio_p2m_entry(d, gfn + i);
+                if ( iomem_deny_access(d, mfn, mfn + nr_mfns - 1) &&
+                     IS_PRIV(current->domain) )
+                    printk(XENLOG_ERR
+                           "memory_map: failed to deny dom%d access to [%lx,%lx]\n",
+                           d->domain_id, mfn, mfn + nr_mfns - 1);
+            }
+        }
+    } else {
+        if ( !is_pvh_domain(d) )   /* PVH unmaps lots and lots */
+            printk(XENLOG_G_INFO
+                   "memory_map:remove: dom%d gfn=%lx mfn=%lx nr=%lx\n",
+                   d->domain_id, gfn, mfn, nr_mfns);
+
+        if ( paging_mode_translate(d) )
+            for ( i = 0; i < nr_mfns; i++ )
+                add_map |= !clear_mmio_p2m_entry(d, gfn + i);
+        ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1);
+        if ( !ret && add_map )
+            ret = -EIO;
+        if ( ret && IS_PRIV(current->domain) )
+            printk(XENLOG_ERR
+                   "memory_map: error %ld %s dom%d access to [%lx,%lx]\n",
+                   ret, add_map ? "removing" : "denying", d->domain_id,
+                   mfn, mfn + nr_mfns - 1);
+    }
+    return ret;
+}
+
 long arch_do_domctl(
     struct xen_domctl *domctl,
     XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
@@ -825,75 +896,13 @@ long arch_do_domctl(
         unsigned long mfn = domctl->u.memory_mapping.first_mfn;
         unsigned long nr_mfns = domctl->u.memory_mapping.nr_mfns;
         int add = domctl->u.memory_mapping.add_mapping;
-        unsigned long i;
 
-        ret = -EINVAL;
-        if ( (mfn + nr_mfns - 1) < mfn || /* wrap? */
-             ((mfn | (mfn + nr_mfns - 1)) >> (paddr_bits - PAGE_SHIFT)) ||
-             (gfn + nr_mfns - 1) < gfn ) /* wrap? */
-            break;
-
-        ret = -EPERM;
-        if ( !IS_PRIV(current->domain) &&
-             !iomem_access_permitted(current->domain, mfn, mfn + nr_mfns - 1) )
-            break;
 
         ret = -ESRCH;
         if ( unlikely((d = rcu_lock_domain_by_id(domctl->domain)) == NULL) )
             break;
 
-        ret = xsm_iomem_permission(d, mfn, mfn + nr_mfns - 1, add);
-        if ( ret ) {
-            rcu_unlock_domain(d);
-            break;
-        }
-
-        if ( add )
-        {
-            printk(XENLOG_G_INFO
-                   "memory_map:add: dom%d gfn=%lx mfn=%lx nr=%lx\n",
-                   d->domain_id, gfn, mfn, nr_mfns);
-
-            ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1);
-            if ( !ret && paging_mode_translate(d) )
-            {
-                for ( i = 0; !ret && i < nr_mfns; i++ )
-                    if ( !set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i)) )
-                        ret = -EIO;
-                if ( ret )
-                {
-                    printk(XENLOG_G_WARNING
-                           "memory_map:fail: dom%d gfn=%lx mfn=%lx\n",
-                           d->domain_id, gfn + i, mfn + i);
-                    while ( i-- )
-                        clear_mmio_p2m_entry(d, gfn + i);
-                    if ( iomem_deny_access(d, mfn, mfn + nr_mfns - 1) &&
-                         IS_PRIV(current->domain) )
-                        printk(XENLOG_ERR
-                               "memory_map: failed to deny dom%d access to [%lx,%lx]\n",
-                               d->domain_id, mfn, mfn + nr_mfns - 1);
-                }
-            }
-        }
-        else
-        {
-            printk(XENLOG_G_INFO
-                   "memory_map:remove: dom%d gfn=%lx mfn=%lx nr=%lx\n",
-                   d->domain_id, gfn, mfn, nr_mfns);
-
-            if ( paging_mode_translate(d) )
-                for ( i = 0; i < nr_mfns; i++ )
-                    add |= !clear_mmio_p2m_entry(d, gfn + i);
-            ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1);
-            if ( !ret && add )
-                ret = -EIO;
-            if ( ret && IS_PRIV(current->domain) )
-                printk(XENLOG_ERR
-                       "memory_map: error %ld %s dom%d access to [%lx,%lx]\n",
-                       ret, add ? "removing" : "denying", d->domain_id,
-                       mfn, mfn + nr_mfns - 1);
-        }
-
+        ret = domctl_memory_mapping(d, gfn, mfn, nr_mfns, add);
         rcu_unlock_domain(d);
     }
     break;
@@ -1646,17 +1655,21 @@ void arch_get_info_guest(struct vcpu *v,
             c.nat->gs_base_kernel = hvm_get_shadow_gs_base(v);
         }
     }
+    else if ( is_pvh_vcpu(v) )
+    {
+        printk("PVH: FIXME: arch_get_info_guest()\n");
+    }
     else
     {
         c(ldt_base = v->arch.pv_vcpu.ldt_base);
         c(ldt_ents = v->arch.pv_vcpu.ldt_ents);
         for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames); ++i )
-            c(gdt_frames[i] = v->arch.pv_vcpu.gdt_frames[i]);
-        BUILD_BUG_ON(ARRAY_SIZE(c.nat->gdt_frames) !=
-                     ARRAY_SIZE(c.cmp->gdt_frames));
-        for ( ; i < ARRAY_SIZE(c.nat->gdt_frames); ++i )
-            c(gdt_frames[i] = 0);
-        c(gdt_ents = v->arch.pv_vcpu.gdt_ents);
+            c(u.pv.gdt_frames[i] = v->arch.pv_vcpu.gdt_frames[i]);
+        BUILD_BUG_ON(ARRAY_SIZE(c.nat->u.pv.gdt_frames) !=
+                     ARRAY_SIZE(c.cmp->u.pv.gdt_frames));
+        for ( ; i < ARRAY_SIZE(c.nat->u.pv.gdt_frames); ++i )
+            c(u.pv.gdt_frames[i] = 0);
+        c(u.pv.gdt_ents = v->arch.pv_vcpu.gdt_ents);
         c(kernel_ss = v->arch.pv_vcpu.kernel_ss);
         c(kernel_sp = v->arch.pv_vcpu.kernel_sp);
         for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.ctrlreg); ++i )
diff -r 8b0762504037 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/hvm/hvm.c    Wed Nov 14 11:03:24 2012 -0800
@@ -331,6 +331,9 @@ void hvm_do_resume(struct vcpu *v)
 {
     ioreq_t *p;
 
+    if ( is_pvh_vcpu(v) )
+        return;
+
     pt_restore_timer(v);
 
     check_wakeup_from_wait();
@@ -499,6 +502,29 @@ static int hvm_print_line(
     return X86EMUL_OKAY;
 }
 
+static noinline int hvm_pvh_dom_initialise(struct domain *d)
+{
+    int rc;
+
+    if (!d->arch.hvm_domain.hap_enabled)
+        return -EINVAL;
+
+    spin_lock_init(&d->arch.hvm_domain.irq_lock);
+    hvm_init_guest_time(d);
+
+    hvm_init_cacheattr_region_list(d);
+
+    if ( (rc=paging_enable(d, PG_refcounts|PG_translate|PG_external)) != 0 )
+        goto fail1;
+
+    if ( (rc = hvm_funcs.domain_initialise(d)) == 0 )
+        return 0;
+
+fail1:
+    hvm_destroy_cacheattr_region_list(d);
+    return rc;
+}
+
 int hvm_domain_initialise(struct domain *d)
 {
     int rc;
@@ -509,6 +535,8 @@ int hvm_domain_initialise(struct domain 
                  "on a non-VT/AMDV platform.\n");
         return -EINVAL;
     }
+    if ( is_pvh_domain(d) )
+        return hvm_pvh_dom_initialise(d);
 
     spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
     spin_lock_init(&d->arch.hvm_domain.irq_lock);
@@ -572,6 +600,11 @@ int hvm_domain_initialise(struct domain 
 
 void hvm_domain_relinquish_resources(struct domain *d)
 {
+    if ( is_pvh_domain(d) ) 
+    {
+        pit_deinit(d);
+        return;
+    }
     if ( hvm_funcs.nhvm_domain_relinquish_resources )
         hvm_funcs.nhvm_domain_relinquish_resources(d);
 
@@ -597,10 +630,16 @@ void hvm_domain_relinquish_resources(str
 void hvm_domain_destroy(struct domain *d)
 {
     hvm_funcs.domain_destroy(d);
+    hvm_destroy_cacheattr_region_list(d);
+
+    if ( is_pvh_domain(d) ) {
+        printk("PVH:skippoing stuff in hvm_domain_destroy(). reexamine me\n");
+        return;
+    }
+
     rtc_deinit(d);
     stdvga_deinit(d);
     vioapic_deinit(d);
-    hvm_destroy_cacheattr_region_list(d);
 }
 
 static int hvm_save_tsc_adjust(struct domain *d, hvm_domain_context_t *h)
@@ -1054,12 +1093,44 @@ static int __init __hvm_register_CPU_XSA
 }
 __initcall(__hvm_register_CPU_XSAVE_save_and_restore);
 
+static noinline int hvm_pvh_vcpu_initialise(struct vcpu *v)
+{
+    int rc;
+
+    if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 )
+        return rc;
+
+    softirq_tasklet_init( &v->arch.hvm_vcpu.assert_evtchn_irq_tasklet,
+                          (void(*)(unsigned long))hvm_assert_evtchn_irq,
+                          (unsigned long)v );
+
+    v->arch.hvm_vcpu.hv_pvh.pvh_vcpu_info_mfn = INVALID_MFN;
+    v->arch.user_regs.eflags = 2;
+    v->arch.hvm_vcpu.inject_trap.vector = -1;
+
+    if ( (rc=hvm_vcpu_cacheattr_init(v)) != 0 ) {
+        hvm_funcs.vcpu_destroy(v);
+        return rc;
+    }
+
+    /* during domain shutdown: pvh_vmx_vmexit_handler->emulate_privileged_op
+     * -> guest_io_read -> pv_pit_handler -> handle_speaker_io -> _spin_lock
+     *  so we call pit_init to initialize the spin lock */
+    if ( v->vcpu_id == 0 )
+        pit_init(v, cpu_khz);
+
+    return 0;
+}
+
 int hvm_vcpu_initialise(struct vcpu *v)
 {
     int rc;
 
     hvm_asid_flush_vcpu(v);
 
+    if ( is_pvh_vcpu(v) )
+        return hvm_pvh_vcpu_initialise(v);
+
     if ( (rc = vlapic_init(v)) != 0 )
         goto fail1;
 
@@ -1147,11 +1218,14 @@ void hvm_vcpu_destroy(struct vcpu *v)
 
     free_compat_arg_xlat(v);
 
+if ( is_pvh_vcpu(v) )
+    printk("PVH: fixme: ignoring tasklet kill in hvm_vcpu_destroy(). \n");
+else {
     tasklet_kill(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet);
     hvm_vcpu_cacheattr_destroy(v);
     vlapic_destroy(v);
+}
     hvm_funcs.vcpu_destroy(v);
-
     /* Event channel is already freed by evtchn_destroy(). */
     /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/
 }
diff -r 8b0762504037 xen/arch/x86/hvm/irq.c
--- a/xen/arch/x86/hvm/irq.c    Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/hvm/irq.c    Wed Nov 14 11:03:24 2012 -0800
@@ -405,6 +405,9 @@ struct hvm_intack hvm_vcpu_has_pending_i
          && vcpu_info(v, evtchn_upcall_pending) )
         return hvm_intack_vector(plat->irq.callback_via.vector);
 
+    if ( is_pvh_vcpu(v) )
+        return hvm_intack_none;
+
     if ( vlapic_accept_pic_intr(v) && plat->vpic[0].int_output )
         return hvm_intack_pic(0);
 
diff -r 8b0762504037 xen/arch/x86/hvm/mtrr.c
--- a/xen/arch/x86/hvm/mtrr.c   Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/hvm/mtrr.c   Wed Nov 14 11:03:24 2012 -0800
@@ -553,6 +553,10 @@ int32_t hvm_get_mem_pinned_cacheattr(
 
     *type = 0;
 
+    if ( is_pvh_domain(d) ) {
+        printk("PVH: fixme: hvm_get_mem_pinned_cacheattr(). \n");
+        return 0;
+    }
     if ( !is_hvm_domain(d) )
         return 0;
 
@@ -578,6 +582,11 @@ int32_t hvm_set_mem_pinned_cacheattr(
 {
     struct hvm_mem_pinned_cacheattr_range *range;
 
+    if ( is_pvh_domain(d) ) {
+        printk("PVH: fixme: hvm_set_mem_pinned_cacheattr()\n");
+        return 0;
+    }
+
     if ( !((type == PAT_TYPE_UNCACHABLE) ||
            (type == PAT_TYPE_WRCOMB) ||
            (type == PAT_TYPE_WRTHROUGH) ||
@@ -606,6 +615,12 @@ static int hvm_save_mtrr_msr(struct doma
     struct vcpu *v;
     struct hvm_hw_mtrr hw_mtrr;
     struct mtrr_state *mtrr_state;
+
+    if ( is_pvh_domain(d) ) {
+        printk("PVH: fixme: hvm_save_mtrr_msr()\n");
+        return 0;
+    }
+
     /* save mtrr&pat */
     for_each_vcpu(d, v)
     {
@@ -644,6 +659,10 @@ static int hvm_load_mtrr_msr(struct doma
     struct mtrr_state *mtrr_state;
     struct hvm_hw_mtrr hw_mtrr;
 
+    if ( is_pvh_domain(d) ) {
+        printk("PVH: fixme: hvm_load_mtrr_msr()\n");
+        return 0;
+    }
     vcpuid = hvm_load_instance(h);
     if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
     {
@@ -693,6 +712,14 @@ uint8_t epte_get_entry_emt(struct domain
          ((d->vcpu == NULL) || ((v = d->vcpu[0]) == NULL)) )
         return MTRR_TYPE_WRBACK;
 
+    /* PVH TBD: FIXME: this needs to be studied, figure what need to be done
+     * for PVH */
+    if ( is_pvh_domain(d) ) {
+        if (direct_mmio)
+            return MTRR_TYPE_UNCACHABLE;
+        return MTRR_TYPE_WRBACK;
+    }
+
     if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] )
         return MTRR_TYPE_WRBACK;
 
diff -r 8b0762504037 xen/arch/x86/hvm/svm/vmcb.c
--- a/xen/arch/x86/hvm/svm/vmcb.c       Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/hvm/svm/vmcb.c       Wed Nov 14 11:03:24 2012 -0800
@@ -326,7 +326,7 @@ void kdb_dump_vmcb(domid_t did, int vid)
 
     rcu_read_lock(&domlist_read_lock);
     for_each_domain (dp) {
-        if (!is_hvm_or_hyb_domain(dp) || dp->is_dying)
+        if (!is_hvm_or_pvh_domain(dp) || dp->is_dying)
             continue;
         if (did != 0 && did != dp->domain_id)
             continue;
diff -r 8b0762504037 xen/arch/x86/hvm/vmx/Makefile
--- a/xen/arch/x86/hvm/vmx/Makefile     Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/hvm/vmx/Makefile     Wed Nov 14 11:03:24 2012 -0800
@@ -5,3 +5,4 @@ obj-y += vmcs.o
 obj-y += vmx.o
 obj-y += vpmu_core2.o
 obj-y += vvmx.o
+obj-y += vmx_pvh.o
diff -r 8b0762504037 xen/arch/x86/hvm/vmx/intr.c
--- a/xen/arch/x86/hvm/vmx/intr.c       Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/hvm/vmx/intr.c       Wed Nov 14 11:03:24 2012 -0800
@@ -216,15 +216,16 @@ void vmx_intr_assist(void)
         return;
     }
 
-    /* Crank the handle on interrupt state. */
-    pt_vector = pt_update_irq(v);
+    if ( !is_pvh_vcpu(v) )
+        /* Crank the handle on interrupt state. */
+        pt_vector = pt_update_irq(v);
 
     do {
         intack = hvm_vcpu_has_pending_irq(v);
         if ( likely(intack.source == hvm_intsrc_none) )
             goto out;
 
-        if ( unlikely(nvmx_intr_intercept(v, intack)) )
+        if ( !is_pvh_vcpu(v) && unlikely(nvmx_intr_intercept(v, intack)) )
             goto out;
 
         intblk = hvm_interrupt_blocked(v, intack);
diff -r 8b0762504037 xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c       Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/hvm/vmx/vmcs.c       Wed Nov 14 11:03:24 2012 -0800
@@ -606,7 +606,7 @@ void vmx_vmcs_exit(struct vcpu *v)
     {
         /* Don't confuse vmx_do_resume (for @v or @current!) */
         vmx_clear_vmcs(v);
-        if ( is_hvm_vcpu(current) )
+        if ( is_hvm_or_pvh_vcpu(current) )
             vmx_load_vmcs(current);
 
         spin_unlock(&v->arch.hvm_vmx.vmcs_lock);
@@ -697,6 +697,306 @@ void vmx_vmcs_switch(struct vmcs_struct 
     spin_unlock(&vmx->vmcs_lock);
 }
 
+static noinline int pvh_construct_vmcs(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    uint16_t sysenter_cs;
+    unsigned long sysenter_eip;
+    u32 vmexit_ctl = vmx_vmexit_control;
+    u32 vmentry_ctl = vmx_vmentry_control;
+    u64 u64val = -1;
+
+    vmx_vmcs_enter(v);
+
+    /* VMCS controls. */
+    vmx_pin_based_exec_control &= ~PIN_BASED_VIRTUAL_NMIS;
+    __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
+
+    v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
+
+    /* if ( v->domain->arch.vtsc ) */
+    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING;
+v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING;
+
+    if ( !paging_mode_hap(d) )
+    {
+        printk("ERROR: HAP is required to run PV in HVM container\n");
+        goto out;
+    }
+    v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING |
+                                      CPU_BASED_CR3_LOAD_EXITING |
+                                      CPU_BASED_CR3_STORE_EXITING);
+    v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
+#if 0
+    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_ACTIVATE_IO_BITMAP; /* ??? */
+#endif
+    v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_MSR_BITMAP;
+    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_TPR_SHADOW;
+    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+
+    kdbp("MUK: writing proc based exec controls:%x\n", 
+                 v->arch.hvm_vmx.exec_control);
+    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+
+    /* I/O access bitmap. */
+    __vmwrite(IO_BITMAP_A, virt_to_maddr((char *)hvm_io_bitmap + 0));
+    __vmwrite(IO_BITMAP_B, virt_to_maddr((char *)hvm_io_bitmap + PAGE_SIZE));
+
+    /* MSR access bitmap. */
+    if ( cpu_has_vmx_msr_bitmap )
+    {
+        unsigned long *msr_bitmap = alloc_xenheap_page();
+        int msr_type = MSR_TYPE_R | MSR_TYPE_W;
+
+        if ( msr_bitmap == NULL )
+            goto out;
+
+        memset(msr_bitmap, ~0, PAGE_SIZE);
+        v->arch.hvm_vmx.msr_bitmap = msr_bitmap;
+        __vmwrite(MSR_BITMAP, virt_to_maddr(msr_bitmap));
+
+        vmx_disable_intercept_for_msr(v, MSR_FS_BASE, msr_type);
+        vmx_disable_intercept_for_msr(v, MSR_GS_BASE, msr_type);
+        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, msr_type);
+        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, msr_type);
+        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, msr_type);
+        vmx_disable_intercept_for_msr(v, MSR_SHADOW_GS_BASE, msr_type);
+
+        /* pure hvm doesn't do this. safe? see: long_mode_do_msr_write() */
+#if 0
+        vmx_disable_intercept_for_msr(v, MSR_STAR);
+        vmx_disable_intercept_for_msr(v, MSR_LSTAR);
+        vmx_disable_intercept_for_msr(v, MSR_CSTAR);
+        vmx_disable_intercept_for_msr(v, MSR_SYSCALL_MASK);
+#endif
+        kdbp("MUK: disabled intercepts for few msrs\n");
+
+    } else {
+        kdbp("MUK: CPU does NOT have msr bitmap\n");
+        for (;;) cpu_relax();
+    }
+
+    if ( !cpu_has_vmx_vpid ) {
+        printk("ERROR: VPID support is required to run PV in HVM container\n");
+        goto out;
+    }
+
+    v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control;
+
+    if ( cpu_has_vmx_secondary_exec_control ) {
+        v->arch.hvm_vmx.secondary_exec_control &= ~0x4FF; /* turn off all */
+#if 0
+        v->arch.hvm_vmx.secondary_exec_control &=
+                                       ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+        v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_RDTSCP;
+
+        v->arch.hvm_vmx.secondary_exec_control &=
+                                             ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+#endif
+        v->arch.hvm_vmx.secondary_exec_control |=
+                                              SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+        v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_VPID;
+
+        if ( paging_mode_hap(d) )
+            v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_EPT;
+
+        kdbp("MUK: muk_construct_vmcs: sec exec:0x%x\n",
+                                        
v->arch.hvm_vmx.secondary_exec_control);
+        __vmwrite(SECONDARY_VM_EXEC_CONTROL,
+                  v->arch.hvm_vmx.secondary_exec_control);
+    } else {
+        printk("ERROR: NO Secondary Exec control\n");
+        goto out;
+    }
+
+    /* __vmwrite(VIRTUAL_PROCESSOR_ID, v->arch.hvm_vcpu.asid); */
+
+    if ( !paging_mode_hap(d) )
+        vmexit_ctl &= ~(VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_LOAD_HOST_PAT);
+    __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl);
+
+    #define VM_ENTRY_LOAD_DEBUG_CTLS 0x4
+    #define VM_ENTRY_LOAD_EFER 0x8000
+    #define GUEST_EFER 0x2806        /* see page 23-20 */
+    #define GUEST_EFER_HIGH 0x2807   /* see page 23-20 */
+    vmentry_ctl &= ~VM_ENTRY_LOAD_DEBUG_CTLS;
+    vmentry_ctl &= ~VM_ENTRY_LOAD_EFER;
+    vmentry_ctl &= ~VM_ENTRY_SMM;
+    vmentry_ctl &= ~VM_ENTRY_DEACT_DUAL_MONITOR;
+    vmentry_ctl |= VM_ENTRY_IA32E_MODE;
+    if ( !paging_mode_hap(d) )
+        vmentry_ctl &= ~VM_ENTRY_LOAD_GUEST_PAT;
+    kdbp("MUK:muk_construct_vmcs(). vmentry_ctl:0x%x\n", vmentry_ctl);
+    __vmwrite(VM_ENTRY_CONTROLS, vmentry_ctl);
+
+    /* MSR intercepts. */
+    __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
+    __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
+    __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
+
+    /* Host data selectors. */
+    __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
+    __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
+    __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS);
+    __vmwrite(HOST_FS_SELECTOR, 0);
+    __vmwrite(HOST_GS_SELECTOR, 0);
+    __vmwrite(HOST_FS_BASE, 0);
+    __vmwrite(HOST_GS_BASE, 0);
+
+    vmx_set_host_env(v);
+
+    /* Host control registers. */
+    v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS;
+    __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
+    __vmwrite(HOST_CR4, mmu_cr4_features|(cpu_has_xsave ? X86_CR4_OSXSAVE : 0));
+
+    /* Host CS:RIP. */
+    __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS);
+    __vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler);
+
+    /* Host SYSENTER CS:RIP. */
+    rdmsrl(MSR_IA32_SYSENTER_CS, sysenter_cs);
+    __vmwrite(HOST_SYSENTER_CS, sysenter_cs);
+    rdmsrl(MSR_IA32_SYSENTER_EIP, sysenter_eip);
+    __vmwrite(HOST_SYSENTER_EIP, sysenter_eip);
+
+    __vmwrite(VM_ENTRY_INTR_INFO, 0);
+
+    __vmwrite(CR3_TARGET_COUNT, 0);
+
+    __vmwrite(GUEST_ACTIVITY_STATE, 0);
+
+    /* Set default guest context values here. Some of these are then overwritten
+     * in vmx_pvh_set_vcpu_info() by the guest itself during vcpu bringup. */
+    __vmwrite(GUEST_CS_BASE, 0);
+    __vmwrite(GUEST_CS_LIMIT, ~0u);
+    __vmwrite(GUEST_CS_AR_BYTES, 0xa09b); /* CS.L == 1 */
+    __vmwrite(GUEST_CS_SELECTOR, 0x10);
+
+    __vmwrite(GUEST_DS_BASE, 0);
+    __vmwrite(GUEST_DS_LIMIT, ~0u);
+    __vmwrite(GUEST_DS_AR_BYTES, 0xc093);
+    __vmwrite(GUEST_DS_SELECTOR, 0x18);
+
+    __vmwrite(GUEST_SS_BASE, 0);         /* use same seg as DS */
+    __vmwrite(GUEST_SS_LIMIT, ~0u);
+    __vmwrite(GUEST_SS_AR_BYTES, 0xc093);
+    __vmwrite(GUEST_SS_SELECTOR, 0x18);
+
+    __vmwrite(GUEST_ES_SELECTOR, 0);
+    __vmwrite(GUEST_FS_SELECTOR, 0);
+    __vmwrite(GUEST_GS_SELECTOR, 0);
+
+    /* Guest segment bases. */
+    __vmwrite(GUEST_ES_BASE, 0);
+    __vmwrite(GUEST_FS_BASE, 0);
+    __vmwrite(GUEST_GS_BASE, 0);
+
+    /* Guest segment limits. */
+    __vmwrite(GUEST_ES_LIMIT, ~0u);
+    __vmwrite(GUEST_FS_LIMIT, ~0u);
+    __vmwrite(GUEST_GS_LIMIT, ~0u);
+
+    /* Guest segment AR bytes. */
+    __vmwrite(GUEST_ES_AR_BYTES, 0xc093); /* read/write, accessed */
+    __vmwrite(GUEST_FS_AR_BYTES, 0xc093);
+    __vmwrite(GUEST_GS_AR_BYTES, 0xc093);
+
+    /* Guest GDT. */
+    __vmwrite(GUEST_GDTR_BASE, 0);
+    __vmwrite(GUEST_GDTR_LIMIT, 0);
+
+    /* Guest LDT. */
+    __vmwrite(GUEST_LDTR_AR_BYTES, 0x82); /* LDT */
+    __vmwrite(GUEST_LDTR_SELECTOR, 0);
+    __vmwrite(GUEST_LDTR_BASE, 0);
+    __vmwrite(GUEST_LDTR_LIMIT, 0);
+
+    /* Guest TSS. */
+    __vmwrite(GUEST_TR_AR_BYTES, 0x8b); /* 32-bit TSS (busy) */
+    __vmwrite(GUEST_TR_BASE, 0);
+    __vmwrite(GUEST_TR_LIMIT, 0xff);
+
+    __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
+    __vmwrite(GUEST_DR7, 0);
+    __vmwrite(VMCS_LINK_POINTER, ~0UL);
+
+    __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
+    __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+
+    v->arch.hvm_vmx.exception_bitmap = 
+                                   HVM_TRAP_MASK     | (1 << TRAP_debug) | 
+                                   (1U << TRAP_int3) | (1U << TRAP_no_device);
+    __vmwrite(EXCEPTION_BITMAP, v->arch.hvm_vmx.exception_bitmap);
+
+#if 0
+    __vmwrite(EXCEPTION_BITMAP,
+              HVM_TRAP_MASK | (1<<TRAP_debug) | (1<<TRAP_gp_fault) |
+              (1U<<TRAP_int3) | (1U << TRAP_page_fault)|(1U << TRAP_no_device));
+#endif
+    __vmwrite(TSC_OFFSET, 0);
+
+#if 0
+    v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PG | X86_CR0_PE | X86_CR0_ET;
+    hvm_update_guest_cr(v, 0);
+
+    v->arch.hvm_vcpu.guest_cr[4] = 0;
+    hvm_update_guest_cr(v, 4);
+#endif
+
+#if 0
+    u64val = X86_CR0_PG | X86_CR0_PE | X86_CR0_ET | X86_CR0_TS |
+             X86_CR0_NE | X86_CR0_WP;
+#endif
+    /* make sure to set WP bit so rdonly pages are not written from CPL 0 */
+    u64val = X86_CR0_PG | X86_CR0_NE | X86_CR0_PE | X86_CR0_WP;
+    __vmwrite(GUEST_CR0, u64val);
+    __vmwrite(CR0_READ_SHADOW, u64val);
+    v->arch.hvm_vcpu.hw_cr[0] = v->arch.hvm_vcpu.guest_cr[0] = u64val;
+
+    u64val = X86_CR4_PAE | X86_CR4_VMXE;
+    __vmwrite(GUEST_CR4, u64val);
+    __vmwrite(CR4_READ_SHADOW, u64val);
+    v->arch.hvm_vcpu.guest_cr[4] = u64val;
+
+    __vmwrite(CR0_GUEST_HOST_MASK, ~0UL);
+    __vmwrite(CR4_GUEST_HOST_MASK, ~0UL);
+
+     v->arch.hvm_vmx.vmx_realmode = 0;
+
+    if ( paging_mode_hap(d) )
+    {
+        __vmwrite(EPT_POINTER, d->arch.hvm_domain.vmx.ept_control.eptp);
+#ifdef __i386__
+        __vmwrite(EPT_POINTER_HIGH,
+                  d->arch.hvm_domain.vmx.ept_control.eptp >> 32);
+#endif
+    }
+
+    if ( cpu_has_vmx_pat && paging_mode_hap(d) )
+    {
+        u64 host_pat, guest_pat;
+
+        rdmsrl(MSR_IA32_CR_PAT, host_pat);
+        guest_pat = MSR_IA32_CR_PAT_RESET;
+
+        __vmwrite(HOST_PAT, host_pat);
+        __vmwrite(GUEST_PAT, guest_pat);
+#ifdef __i386__
+JUNK
+        __vmwrite(HOST_PAT_HIGH, host_pat >> 32);
+        __vmwrite(GUEST_PAT_HIGH, guest_pat >> 32);
+#endif
+    }
+out:
+    vmx_vmcs_exit(v);
+#if 0
+    paging_update_paging_modes(v); /* will update HOST & GUEST_CR3 as reqd */
+#endif
+    return 0;
+}
+
 static int construct_vmcs(struct vcpu *v)
 {
     struct domain *d = v->domain;
@@ -705,6 +1005,9 @@ static int construct_vmcs(struct vcpu *v
     u32 vmexit_ctl = vmx_vmexit_control;
     u32 vmentry_ctl = vmx_vmentry_control;
 
+    if ( is_pvh_vcpu(v) )
+        return pvh_construct_vmcs(v);
+
     vmx_vmcs_enter(v);
 
     /* VMCS controls. */
@@ -1143,9 +1446,11 @@ void vmx_do_resume(struct vcpu *v)
 
         vmx_clear_vmcs(v);
         vmx_load_vmcs(v);
-        hvm_migrate_timers(v);
-        hvm_migrate_pirqs(v);
-        vmx_set_host_env(v);
+        if ( !is_pvh_vcpu(v) ) {
+            hvm_migrate_timers(v);
+            hvm_migrate_pirqs(v);
+            vmx_set_host_env(v);
+        }
         /*
          * Both n1 VMCS and n2 VMCS need to update the host environment after 
          * VCPU migration. The environment of current VMCS is updated in place,
@@ -1163,6 +1468,9 @@ void vmx_do_resume(struct vcpu *v)
         __vmwrite(GUEST_DR7, 0);
 #endif
 
+    if ( is_pvh_vcpu(v) )
+        goto skip_inteLdbgr;   /* PVH supports gdbsx and gdb inside PVH */
+
     debug_state = v->domain->debugger_attached
                   || v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_INT3]
                   || v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP];
@@ -1173,18 +1481,11 @@ void vmx_do_resume(struct vcpu *v)
         vmx_update_debug_state(v);
     }
 
+skip_inteLdbgr:
     hvm_do_resume(v);
     reset_stack_and_jump(vmx_asm_do_vmentry);
 }
 
-static unsigned long vmr(unsigned long field)
-{
-    int rc;
-    unsigned long val;
-    val = __vmread_safe(field, &rc);
-    return rc ? 0 : val;
-}
-
 static void vmx_dump_sel(char *name, uint32_t selector)
 {
     uint32_t sel, attr, limit;
@@ -1346,7 +1647,7 @@ static void vmcs_dump(unsigned char ch)
 
     for_each_domain ( d )
     {
-        if ( !is_hvm_domain(d) )
+        if ( !is_hvm_or_pvh_domain(d) )
             continue;
         printk("\n>>> Domain %d <<<\n", d->domain_id);
         for_each_vcpu ( d, v )
@@ -1531,7 +1832,7 @@ void kdb_curr_cpu_flush_vmcs(void)
     /* looks like we got one. unfortunately, current_vmcs points to vmcs 
      * and not VCPU, so we gotta search the entire list... */
     for_each_domain (dp) {
-        if ( !(is_hvm_or_hyb_domain(dp)) || dp->is_dying)
+        if ( !(is_hvm_or_pvh_domain(dp)) || dp->is_dying)
             continue;
         for_each_vcpu (dp, vp) {
             if ( vp->arch.hvm_vmx.vmcs == cvp ) {
@@ -1562,7 +1863,7 @@ void kdb_dump_vmcs(domid_t did, int vid)
     __vmptrst(&addr);
 
     for_each_domain (dp) {
-        if ( !(is_hvm_or_hyb_domain(dp)) || dp->is_dying)
+        if ( !(is_hvm_or_pvh_domain(dp)) || dp->is_dying)
             continue;
         if (did != 0 && did != dp->domain_id)
             continue;
diff -r 8b0762504037 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Wed Nov 14 11:03:24 2012 -0800
@@ -55,6 +55,7 @@
 #include <asm/debugger.h>
 #include <asm/apic.h>
 #include <asm/hvm/nestedhvm.h>
+#include <asm/hvm/vmx/pvh.h>
 
 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
 
@@ -92,6 +93,9 @@ static int vmx_domain_initialise(struct 
     if ( !zalloc_cpumask_var(&d->arch.hvm_domain.vmx.ept_synced) )
         return -ENOMEM;
 
+    if ( is_pvh_domain(d) )
+        return 0;
+
     if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 )
     {
         free_cpumask_var(d->arch.hvm_domain.vmx.ept_synced);
@@ -129,6 +133,12 @@ static int vmx_vcpu_initialise(struct vc
 
     vpmu_initialise(v);
 
+    if (is_pvh_vcpu(v) ) 
+    {
+        /* this for hvm_long_mode_enabled(v) */
+        v->arch.hvm_vcpu.guest_efer = EFER_SCE | EFER_LMA | EFER_LME;
+        return 0;
+    }
     vmx_install_vlapic_mapping(v);
 
     /* %eax == 1 signals full real-mode support to the guest loader. */
@@ -366,6 +376,10 @@ static int vmx_guest_x86_mode(struct vcp
 {
     unsigned int cs_ar_bytes;
 
+/* PVH: fixme : look at vmcs.cs?? or just assume 8 ?? */
+if ( is_pvh_vcpu(v) )
+    return 8;
+
     if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
         return 0;
     if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
@@ -593,7 +607,7 @@ static int vmx_load_vmcs_ctxt(struct vcp
     return 0;
 }
 
-static void vmx_fpu_enter(struct vcpu *v)
+void vmx_fpu_enter(struct vcpu *v)
 {
     vcpu_restore_fpu_lazy(v);
     v->arch.hvm_vmx.exception_bitmap &= ~(1u << TRAP_no_device);
@@ -1051,6 +1065,25 @@ static void vmx_update_host_cr3(struct v
     vmx_vmcs_exit(v);
 }
 
+static noinline void vmx_update_pvh_cr(struct vcpu *v, unsigned int cr)
+{
+    vmx_vmcs_enter(v);
+    switch ( cr )
+    {
+        case 3:
+            __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.guest_cr[3]);
+kdbp("pvh: d:%d update cr3:%lx\n", v->domain->domain_id, 
v->arch.hvm_vcpu.guest_cr[3]);
+            hvm_asid_flush_vcpu(v);
+            break;
+
+        default:
+            printk("pvh: d%d v%d unexpected cr%d rip:%lx update\n", 
+                   v->domain->domain_id, v->vcpu_id, cr, __vmread(GUEST_RIP));
+            kdb_trap_immed(KDB_TRAP_NONFATAL);
+    }
+    vmx_vmcs_exit(v);
+}
+
 void vmx_update_debug_state(struct vcpu *v)
 {
     unsigned long mask;
@@ -1070,6 +1103,11 @@ void vmx_update_debug_state(struct vcpu 
 
 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
 {
+    if ( is_pvh_vcpu(v) ) {
+        vmx_update_pvh_cr(v, cr);
+        return;
+    }
+
     vmx_vmcs_enter(v);
 
     switch ( cr )
@@ -1267,7 +1305,7 @@ static int nvmx_vmexit_trap(struct vcpu 
     return NESTEDHVM_VMEXIT_DONE;
 }
 
-static void __vmx_inject_exception(int trap, int type, int error_code)
+static noinline void __vmx_inject_exception(int trap, int type, int error_code)
 {
     unsigned long intr_fields;
     struct vcpu *curr = current;
@@ -1511,7 +1549,10 @@ static struct hvm_function_table __read_
     .nhvm_intr_blocked    = nvmx_intr_blocked,
     .nhvm_domain_relinquish_resources = nvmx_domain_relinquish_resources,
     .update_eoi_exit_bitmap = vmx_update_eoi_exit_bitmap,
-    .virtual_intr_delivery_enabled = vmx_virtual_intr_delivery_enabled
+    .virtual_intr_delivery_enabled = vmx_virtual_intr_delivery_enabled,
+    .pvh_update_cr3       = vmx_pvh_update_cr3,
+    .pvh_set_vcpu_info    = vmx_pvh_set_vcpu_info,
+    .pvh_read_descriptor  = vmx_pvh_read_descriptor
 };
 
 struct hvm_function_table * __init start_vmx(void)
@@ -1543,19 +1584,7 @@ struct hvm_function_table * __init start
     return &vmx_function_table;
 }
 
-/*
- * Not all cases receive valid value in the VM-exit instruction length field.
- * Callers must know what they're doing!
- */
-static int get_instruction_length(void)
-{
-    int len;
-    len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
-    BUG_ON((len < 1) || (len > 15));
-    return len;
-}
-
-static void update_guest_eip(void)
+void update_guest_eip(void)
 {
     struct cpu_user_regs *regs = guest_cpu_user_regs();
     unsigned long x;
@@ -1633,8 +1662,8 @@ static void vmx_do_cpuid(struct cpu_user
     regs->edx = edx;
 }
 
-static void vmx_dr_access(unsigned long exit_qualification,
-                          struct cpu_user_regs *regs)
+void vmx_dr_access(unsigned long exit_qualification,
+                   struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
 
@@ -2009,7 +2038,7 @@ gp_fault:
     return X86EMUL_EXCEPTION;
 }
 
-static void vmx_do_extint(struct cpu_user_regs *regs)
+void vmx_do_extint(struct cpu_user_regs *regs)
 {
     unsigned int vector;
 
@@ -2260,6 +2289,14 @@ void vmx_vmexit_handler(struct cpu_user_
 
     perfc_incra(vmexits, exit_reason);
 
+    if ( is_pvh_vcpu(v) ) {
+        if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
+            return vmx_failed_vmentry(exit_reason, regs);
+
+        pvh_vmx_vmexit_handler(regs);
+        return;
+    }
+
     /* Handle the interrupt we missed before allowing any more in. */
     switch ( (uint16_t)exit_reason )
     {
diff -r 8b0762504037 xen/arch/x86/hvm/vmx/vmx_pvh.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/vmx/vmx_pvh.c    Wed Nov 14 11:03:24 2012 -0800
@@ -0,0 +1,1109 @@
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/irq.h>
+#include <xen/softirq.h>
+#include <xen/domain_page.h>
+#include <xen/hypercall.h>
+#include <xen/guest_access.h>
+#include <xen/perfc.h>
+#include <asm/current.h>
+#include <asm/io.h>
+#include <asm/regs.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/types.h>
+#include <asm/debugreg.h>
+#include <asm/msr.h>
+#include <asm/spinlock.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <asm/traps.h>
+#include <asm/mem_sharing.h>
+#include <asm/hvm/emulate.h>
+#include <asm/hvm/hvm.h>
+#include <asm/hvm/support.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <asm/hvm/vmx/vmcs.h>
+#include <public/sched.h>
+#include <public/hvm/ioreq.h>
+#include <asm/hvm/vpic.h>
+#include <asm/hvm/vlapic.h>
+#include <asm/x86_emulate.h>
+#include <asm/hvm/vpt.h>
+#include <public/hvm/save.h>
+#include <asm/hvm/trace.h>
+#include <asm/xenoprof.h>
+#include <asm/debugger.h>
+
+#if 0
+enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
+extern enum handler_return long_mode_do_msr_read(struct cpu_user_regs *);
+extern enum handler_return long_mode_do_msr_write(struct cpu_user_regs *);
+#endif
+
+volatile int mukprint=0, mukspin=1;
+#define dbgp0(...) dprintk(XENLOG_ERR, __VA_ARGS__);
+#define dbgp1(...) {(mukprint==1) ? kdbp(__VA_ARGS__):0;}
+#define dbgp2(...) {(mukprint==2) ? kdbp(__VA_ARGS__):0;}
+
+
+static void read_vmcs_selectors(struct cpu_user_regs *regs)
+{
+    regs->cs = vmr(GUEST_CS_SELECTOR);
+    regs->ss = vmr(GUEST_SS_SELECTOR);
+    regs->ds = vmr(GUEST_DS_SELECTOR);
+    regs->es = vmr(GUEST_ES_SELECTOR);
+    regs->gs = vmr(GUEST_GS_SELECTOR);
+    regs->fs = vmr(GUEST_FS_SELECTOR);
+}
+
+/* returns : 0 success */
+static noinline int vmxit_msr_read(struct cpu_user_regs *regs)
+{
+    int rc=1;
+
+    u64 msr_content = 0;
+    switch (regs->ecx)
+    {
+        case MSR_IA32_MISC_ENABLE:
+        {
+            rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
+            msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
+                           MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
+            break;
+        }
+        default:
+        {
+            rdmsrl(regs->ecx, msr_content);
+            break;
+        }
+    }
+    regs->eax = (uint32_t)msr_content;
+    regs->edx = (uint32_t)(msr_content >> 32);
+    update_guest_eip();
+    rc = 0;
+
+#if 0
+    rc = (long_mode_do_msr_read(regs) == HNDL_done) ? 0 : 1;
+
+    if ( hvm_msr_read_intercept(regs) == X86EMUL_OKAY ) {
+        update_guest_eip();
+        rc = 0;
+    }
+#endif
+
+    dbgp1("msr read c:%lx a:%lx d:%lx RIP:%lx RSP:%lx\n", regs->ecx, 
regs->eax, 
+          regs->edx, vmr(GUEST_RIP), vmr(GUEST_RSP));
+    return rc;
+}
+
+/* for now just scratch the cpu since nothing else will run on it. eventually
+ * we need to save and restore these MSRs 
+ * returns : 0 success */
+static noinline int vmxit_msr_write(struct cpu_user_regs *regs)
+{
+    uint64_t msr_content = (uint32_t)regs->eax | ((uint64_t)regs->edx << 32);
+    int rc=1;
+#if 0
+    wrmsr(regs->ecx, regs->eax, regs->edx);
+
+    rc = (long_mode_do_msr_write(regs) == HNDL_done) ? 0 : 1;
+    return rc;
+#endif
+
+    dbgp1("MUK: msr write:0x%lx. eax:0x%lx edx:0x%lx\n", regs->ecx, 
+          regs->eax,regs->edx);
+    if ( hvm_msr_write_intercept(regs->ecx, msr_content) == X86EMUL_OKAY ) {
+        update_guest_eip();
+        rc = 0;
+    }
+    return rc;
+}
+
+/* rc == 0: handled the MTF vmexit */
+static noinline int vmxit_mtf(struct cpu_user_regs *regs)
+{
+    struct vcpu *vp = current;
+    int rc=1, ss=vp->arch.hvm_vcpu.single_step; 
+
+    dbgp2("\n");
+    vp->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
+    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vp->arch.hvm_vmx.exec_control);
+    vp->arch.hvm_vcpu.single_step = 0;
+
+    /* kdb will set hvm_vcpu.single_step again if ss command */
+    if (kdb_handle_trap_entry(TRAP_debug, regs)) { /* TBD: ifdef KDB */
+        rc = 0;
+    } else if ( vp->domain->debugger_attached && ss ) {
+        domain_pause_for_debugger();
+        rc = 0;
+    }
+    return rc;
+}
+
+static noinline int vmxit_int3(struct cpu_user_regs *regs)
+{
+    struct vcpu *vp = current;
+
+    if ( vp->domain->debugger_attached ) {
+        update_guest_eip();
+
+        if (kdb_handle_trap_entry(TRAP_int3, regs))
+            return 0;
+        else 
+           /* gdbsx or another debugger. Never pause dom0 */
+            if ( vp->domain->domain_id != 0 && guest_kernel_mode(vp, regs) )
+            {
+                kdbp("[%d]MUK: domain pause for 
debugger\n",smp_processor_id());
+                current->arch.gdbsx_vcpu_event = TRAP_int3;
+                domain_pause_for_debugger();
+            }
+
+    } else {  /* User mode exception. Inject into the guest */
+
+        /* int ilen = get_instruction_length(); */
+        struct hvm_trap trap_info = { 
+                        .vector = TRAP_int3, 
+                        .type = X86_EVENTTYPE_SW_EXCEPTION,
+                        .error_code = HVM_DELIVER_NO_ERROR_CODE, 
+                        .insn_len = get_instruction_length() 
+        };
+        hvm_inject_trap(&trap_info);
+    }
+    return 0;
+}
+
+volatile int mukprintpf;
+/* rc == 0: handled the exception or NMI */
+static noinline int vmxit_exception(struct cpu_user_regs *regs)
+{
+    unsigned int vector = (__vmread(VM_EXIT_INTR_INFO)) & INTR_INFO_VECTOR_MASK;
+    int rc=1; 
+    struct vcpu *vp = current;
+
+    dbgp2(" EXCPT: vec:%d cs:%x r.IP:%lx\n", vector, vmr(GUEST_CS_SELECTOR), 
+          regs->eip);
+
+    if (vector == TRAP_debug) {
+        unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
+        write_debugreg(6, exit_qualification | 0xffff0ff0);
+        regs->rip = vmr(GUEST_RIP); 
+        regs->rsp = vmr(GUEST_RSP);
+        rc = 0;
+        if (kdb_handle_trap_entry(vector, regs)) /* TBD: ifdef KDB */
+            return 0;
+        else {
+           /* gdbsx or another debugger */
+            if ( vp->domain->domain_id != 0 &&    /* never pause dom0 */
+                 guest_kernel_mode(vp, regs) &&  vp->domain->debugger_attached )
+            {
+                domain_pause_for_debugger();
+            } else {
+                hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);
+            }
+        }
+    } 
+    if (vector == TRAP_int3) {
+        rc = vmxit_int3(regs);
+    } 
+
+    if (vector == TRAP_invalid_op) {
+#if 0
+kdbp("MUK: invalid op. injection exception. rip:%lx\n", (ulong)vmr(GUEST_RIP));
+kdb_trap_immed(KDB_TRAP_NONFATAL);
+        hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
+        rc = 0;
+#endif
+        if ( guest_kernel_mode(vp, regs) || emulate_forced_invalid_op(regs)==0 )
+        {
+            hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
+            rc = 0;
+        }
+
+    }
+    if (vector == TRAP_no_device) {
+        hvm_funcs.fpu_dirty_intercept();  /* calls vmx_fpu_dirty_intercept */
+        rc = 0;
+    }
+
+    if (vector == TRAP_gp_fault) {
+        regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
+        kdbp("MUK: inject GP: errcode:0x%04x RIP:%016lx RSP:%016lx\n", 
+             regs->error_code, (ulong)vmr(GUEST_RIP), 
+             (ulong)vmr(GUEST_RSP));
+
+        kdb_trap_immed(KDB_TRAP_NONFATAL);
+        /* hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code); */
+        rc = 1;
+    }
+
+    if (vector == TRAP_page_fault) {
+        kdbp("Unexpected vector page_fault\n");
+        kdb_trap_immed(KDB_TRAP_NONFATAL);
+#if 0
+        extern int fixup_page_fault(unsigned long , struct cpu_user_regs *);
+        ulong eflags_sav = regs->eflags;
+        unsigned long va = __vmread(EXIT_QUALIFICATION);
+
+        regs->error_code = __vmread(VM_EXIT_INTR_ERROR_CODE);
+
+        if (mukprintpf)
+            kdbp("MUK:PF va:%016lx errcode:0x%04x RIP:%016lx RSP:%016lx", 
+                 va, regs->error_code, (ulong)vmr(GUEST_RIP), 
+                 (ulong)vmr(GUEST_RSP));
+
+        regs->eflags |= X86_EFLAGS_IF;
+        if (fixup_page_fault(va, regs) == 0) {
+            if (mukprintpf)
+                kdbp(" NOT ");
+            current->arch.hvm_vcpu.guest_cr[2] = va;
+            hvm_inject_hw_exception(TRAP_page_fault, regs->error_code);
+        }
+        regs->eflags = eflags_sav;
+        if (mukprintpf)
+            kdbp(" fixedup\n");
+        rc = 0;
+#endif
+    }
+
+    /* TBD: call do_guest_trap() here */
+    if (rc)
+        kdbp("MUK: Unhandled trap vector:%d\n", vector);
+    return rc;
+}
+
+static noinline int vmxit_invlpg(void)
+{
+    ulong vaddr = __vmread(EXIT_QUALIFICATION);
+
+    update_guest_eip();
+    vpid_sync_vcpu_gva(current, vaddr);
+    return 0;
+}
+
+#if 0
+static noinline void native_cpuid(struct cpu_user_regs *regs);
+{
+    uint32_t a, b, c, d;
+
+    a = regs->eax;
+    b = regs->ebx;
+    c = regs->ecx;
+    d = regs->edx;
+
+    asm ( "cpuid"
+          : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
+          : "0" (a), "1" (b), "2" (c), "3" (d) );
+
+    regs->eax = a;
+    regs->ebx = b;
+    regs->ecx = c;
+    regs->edx = d;
+}
+#endif
+
+static noinline int pvh_grant_table_op(
+              unsigned int cmd, XEN_GUEST_HANDLE(void) uop, unsigned int count)
+{
+    switch (cmd)
+    {
+        case GNTTABOP_map_grant_ref:
+        case GNTTABOP_unmap_grant_ref:
+        case GNTTABOP_setup_table:
+        case GNTTABOP_copy:
+        case GNTTABOP_query_size:
+        case GNTTABOP_set_version:
+            return do_grant_table_op(cmd, uop, count);
+    }
+kdbp("MUK: hcall: pvh_grant_table_op: unhandled grant op:%d\n", cmd);
+kdb_trap_immed(KDB_TRAP_NONFATAL);
+    return -ENOSYS;
+}
+
+static long pvh_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE(void) arg)
+{
+    long rc = -ENOSYS;
+
+    switch ( cmd )
+    {
+        case VCPUOP_register_runstate_memory_area:
+        case VCPUOP_get_runstate_info:
+        case VCPUOP_set_periodic_timer:
+        case VCPUOP_stop_periodic_timer:
+        case VCPUOP_set_singleshot_timer:
+        case VCPUOP_stop_singleshot_timer:
+        case VCPUOP_is_up:
+        case VCPUOP_up:
+        case VCPUOP_initialise:
+            rc = do_vcpu_op(cmd, vcpuid, arg);
+
+            /* pvh boot vcpu setting context for bringing up smp vcpu */
+            if (cmd == VCPUOP_initialise)
+                vmx_vmcs_enter(current);
+    }
+    return rc;
+}
+
+static long pvh_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
+{
+    switch ( cmd )
+    {
+        case PHYSDEVOP_map_pirq:
+        case PHYSDEVOP_unmap_pirq:
+        case PHYSDEVOP_eoi:
+        case PHYSDEVOP_irq_status_query:
+        case PHYSDEVOP_get_free_pirq:
+#if 0
+        case PHYSDEVOP_set_iopl:
+#endif
+            return do_physdev_op(cmd, arg);
+
+        default:
+            if ( IS_PRIV(current->domain) )
+                return do_physdev_op(cmd, arg);
+    }
+    return -ENOSYS;
+}
+
+static noinline long do_pvh_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
+{
+    long rc = -EINVAL;
+    struct xen_hvm_param harg;
+    struct domain *d;
+
+    if ( copy_from_guest(&harg, arg, 1) )
+        return -EFAULT;
+
+    rc = rcu_lock_target_domain_by_id(harg.domid, &d);
+    if ( rc != 0 )
+        return rc;
+
+    if (is_hvm_domain(d)) {
+        /* pvh dom0 is building an hvm guest */
+        rcu_unlock_domain(d);
+       return do_hvm_op(op, arg);  
+    }
+
+    rc = -ENOSYS;
+    if (op == HVMOP_set_param) {
+        if (harg.index == HVM_PARAM_CALLBACK_IRQ) {
+            struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
+            uint64_t via = harg.value;
+            uint8_t via_type = (uint8_t)(via >> 56) + 1;
+
+            if (via_type == HVMIRQ_callback_vector) {
+                hvm_irq->callback_via_type = HVMIRQ_callback_vector;
+                hvm_irq->callback_via.vector = (uint8_t)via;
+                rc = 0;
+            }
+        }
+    }
+    return rc;
+}
+
+typedef unsigned long pvh_hypercall_t(
+    unsigned long, unsigned long, unsigned long, unsigned long, unsigned long,
+    unsigned long);
+
+int hcall_a[NR_hypercalls];
+
+static pvh_hypercall_t *pvh_hypercall64_table[NR_hypercalls] = {
+    [__HYPERVISOR_platform_op]     = (pvh_hypercall_t *)do_platform_op,
+    [__HYPERVISOR_memory_op]       = (pvh_hypercall_t *)do_memory_op,
+    /* [__HYPERVISOR_set_timer_op]     = (pvh_hypercall_t *)do_set_timer_op, */
+    [__HYPERVISOR_xen_version]     = (pvh_hypercall_t *)do_xen_version,
+    [__HYPERVISOR_console_io]      = (pvh_hypercall_t *)do_console_io,
+    [__HYPERVISOR_grant_table_op]  = (pvh_hypercall_t *)pvh_grant_table_op,
+    [__HYPERVISOR_vcpu_op]         = (pvh_hypercall_t *)pvh_vcpu_op,
+    [__HYPERVISOR_mmuext_op]       = (pvh_hypercall_t *)do_mmuext_op,
+    [__HYPERVISOR_xsm_op]          = (pvh_hypercall_t *)do_xsm_op,
+    [__HYPERVISOR_sched_op]        = (pvh_hypercall_t *)do_sched_op,
+    [__HYPERVISOR_event_channel_op]= (pvh_hypercall_t *)do_event_channel_op,
+    [__HYPERVISOR_physdev_op]      = (pvh_hypercall_t *)pvh_physdev_op,
+    [__HYPERVISOR_hvm_op]          = (pvh_hypercall_t *)do_pvh_hvm_op,
+    [__HYPERVISOR_sysctl]          = (pvh_hypercall_t *)do_sysctl,
+    [__HYPERVISOR_domctl]          = (pvh_hypercall_t *)do_domctl
+};
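+
+/* Guest-side sketch (an assumption added for reference, not used by Xen
+ * itself, and the helper name is made up): the dispatch in vmxit_vmcall()
+ * below expects the 64-bit PV calling convention -- hypercall number in rax,
+ * up to six args in rdi/rsi/rdx/r10/r8/r9 -- with the trap into Xen done via
+ * vmcall rather than syscall. */
+#if 0
+static inline long pvh_hypercall2(unsigned int nr, unsigned long a1,
+                                  unsigned long a2)
+{
+    long ret;
+
+    asm volatile ( "vmcall"
+                   : "=a" (ret)
+                   : "a" ((unsigned long)nr), "D" (a1), "S" (a2)
+                   : "memory" );
+    return ret;
+}
+#endif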
+
+/* TBD: Do we need to worry about this and slow things down in this path? */
+static int pvh_long_mode_enabled(void)
+{
+    return 1;
+}
+
+/* Check if hypercall is valid 
+ * Returns: 0 if hcall is not valid with eax set to the errno to ret to guest
+ */
+static noinline int hcall_valid(struct cpu_user_regs *regs)
+{
+    struct segment_register sreg;
+
+    if (!pvh_long_mode_enabled()) {
+        printk("PVH Error: Expected long mode set\n");
+        return 1;
+    }
+    hvm_get_segment_register(current, x86_seg_ss, &sreg);
+    if ( unlikely(sreg.attr.fields.dpl == 3) ) {
+        regs->eax = -EPERM;
+        return 0;
+    }
+
+    /* domU's are not allowed following hcalls */
+    if ( !IS_PRIV(current->domain) &&
+         (regs->eax == __HYPERVISOR_xsm_op ||
+          regs->eax == __HYPERVISOR_platform_op ||
+#if 0
+          regs->eax == __HYPERVISOR_mmuext_op ||    /* for privcmd mmap */
+#endif
+          regs->eax == __HYPERVISOR_domctl) ) {     /* for privcmd mmap */
+
+        regs->eax = -EPERM;
+        return 0;
+    }
+    return 1;
+}
+volatile int mukdbg178=1;
+static noinline int vmxit_vmcall(struct cpu_user_regs *regs)
+{
+    uint32_t hnum = regs->eax;
+
+    if (hnum >= NR_hypercalls || pvh_hypercall64_table[hnum] ==NULL) 
+    {
+        kdbp("MUK: UnImplemented HCALL:%d. ret -ENOSYS to guest\n", regs->eax);
+if (mukdbg178)
+    kdb_trap_immed(KDB_TRAP_NONFATAL);
+        regs->eax = -ENOSYS;
+        update_guest_eip();
+        return HVM_HCALL_completed;
+    }
+
+    dbgp2("vmxit_vmcall: hcall eax:$%ld\n", regs->eax);
+    if (regs->eax == __HYPERVISOR_sched_op && regs->rdi == SCHEDOP_shutdown) {
+        kdbp("MUK: FIXME: SCHEDOP_shutdown hcall\n");
+        regs->eax = -ENOSYS;
+        update_guest_eip();
+        domain_crash_synchronous();
+        return HVM_HCALL_completed;
+    }
+
+    if ( !hcall_valid(regs) ) {
+kdbp("!hcall_valid(). hcall:%d\n", hnum);
+if (mukdbg178)
+    kdb_trap_immed(KDB_TRAP_NONFATAL);
+        return HVM_HCALL_completed;
+    }
+
+
+hcall_a[hnum]++;
+current->arch.hvm_vcpu.hcall_preempted = 0; /* TBD */
+/* search for this and do it. PV method will not work */
+
+    regs->rax = pvh_hypercall64_table[hnum](regs->rdi, regs->rsi, regs->rdx,
+                                            regs->r10, regs->r8, regs->r9);
+
+    if ( !current->arch.hvm_vcpu.hcall_preempted )
+        update_guest_eip();
+    else
+        kdbp("Hcall :%d preempted\n", hnum);
+         
+
+    return HVM_HCALL_completed;
+}
+
+static noinline uint64_t *get_gpr_ptr(struct cpu_user_regs *regs, uint gpr)
+{
+    switch (gpr)
+    {
+        case VMX_CONTROL_REG_ACCESS_GPR_EAX:
+            return &regs->eax;
+        case VMX_CONTROL_REG_ACCESS_GPR_ECX:
+            return &regs->ecx;
+        case VMX_CONTROL_REG_ACCESS_GPR_EDX:
+            return &regs->edx;
+        case VMX_CONTROL_REG_ACCESS_GPR_EBX:
+            return &regs->ebx;
+        case VMX_CONTROL_REG_ACCESS_GPR_ESP:
+            return &regs->esp;
+        case VMX_CONTROL_REG_ACCESS_GPR_EBP:
+            return &regs->ebp;
+        case VMX_CONTROL_REG_ACCESS_GPR_ESI:
+            return &regs->esi;
+        case VMX_CONTROL_REG_ACCESS_GPR_EDI:
+            return &regs->edi;
+        case VMX_CONTROL_REG_ACCESS_GPR_R8:
+            return &regs->r8;
+        case VMX_CONTROL_REG_ACCESS_GPR_R9:
+            return &regs->r9;
+        case VMX_CONTROL_REG_ACCESS_GPR_R10:
+            return &regs->r10;
+        case VMX_CONTROL_REG_ACCESS_GPR_R11:
+            return &regs->r11;
+        case VMX_CONTROL_REG_ACCESS_GPR_R12:
+            return &regs->r12;
+        case VMX_CONTROL_REG_ACCESS_GPR_R13:
+            return &regs->r13;
+        case VMX_CONTROL_REG_ACCESS_GPR_R14:
+            return &regs->r14;
+        case VMX_CONTROL_REG_ACCESS_GPR_R15:
+            return &regs->r15;
+        default:
+            return NULL;
+    }
+}
+/* rc == 0: success */
+static noinline int access_cr0(struct cpu_user_regs *regs, uint acc_typ, 
+                               uint64_t *regp)
+{
+    struct vcpu *vp = current;
+
+    if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
+    {
+        unsigned long new_cr0 = *regp;
+        unsigned long old_cr0 = __vmread(GUEST_CR0);
+
+        dbgp2("MUK:writing to CR0. RIP:%lx val:0x%lx\n", vmr(GUEST_RIP),*regp);
+        if ( (u32)new_cr0 != new_cr0 )
+        {
+            HVM_DBG_LOG(DBG_LEVEL_1, 
+                        "Guest setting upper 32 bits in CR0: %lx", new_cr0);
+            return 1;
+        }
+
+        new_cr0 &= ~HVM_CR0_GUEST_RESERVED_BITS;
+        /* ET is reserved and should be always be 1. */
+        new_cr0 |= X86_CR0_ET;
+
+        /* pvh cannot change to real mode */
+        if ( (new_cr0 & (X86_CR0_PE|X86_CR0_PG)) != (X86_CR0_PG|X86_CR0_PE) ) {
+            kdbp("Guest attempting to turn off PE/PG. CR0:%lx\n", new_cr0);
+            return 1;
+        }
+        /* TS going from 1 to 0 */
+        if ( (old_cr0 & X86_CR0_TS) && ((new_cr0 & X86_CR0_TS)==0) )
+            vmx_fpu_enter(vp);
+
+        vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = new_cr0;
+        __vmwrite(GUEST_CR0, new_cr0);
+        __vmwrite(CR0_READ_SHADOW, new_cr0);
+    } else {
+        *regp = __vmread(GUEST_CR0);
+    } 
+    return 0;
+}
+
+#if 0
+volatile int mukdbgcr3;
+/* rc == 0: success */
+static noinline int access_cr3(struct cpu_user_regs *regs, uint acc_typ, 
+                               uint64_t *regp)
+{
+    if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR ) {
+        unsigned long new_cr3 = *regp;
+
+        __vmwrite(GUEST_CR3, new_cr3);
+        vpid_sync_all();
+        if (mukdbgcr3)
+            kdbp("MUK: mov to cr3:%016lx cs:%lx RIP:%016lx\n", 
new_cr3,regs->cs,
+                 vmr(GUEST_RIP));
+    } else {
+        *regp = __vmread(GUEST_CR3);
+    }
+
+    return 0;
+}
+#endif
+
+/* rc == 0: success */
+static noinline int access_cr4(struct cpu_user_regs *regs, uint acc_typ, 
+                               uint64_t *regp)
+{
+    if (acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
+    {
+        u64 old_cr4 = __vmread(GUEST_CR4);
+        /* kdbp("MUK:writing to CR4. val:0x%lx\n", *regp); */
+
+        if ( (old_cr4 ^ (*regp)) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
+            vpid_sync_all();
+
+        /* pvh_verify_cr4_wr(*regp)); */
+        __vmwrite(GUEST_CR4, *regp);
+    } else {
+        *regp = __vmread(GUEST_CR4);
+        kdbp("MUK: read cr4. val:0x%lx\n", *regp);
+    } 
+    return 0;
+}
+
+/* rc == 0: success */
+static noinline int vmxit_cr_access(struct cpu_user_regs *regs)
+{
+    unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
+    uint acc_typ = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
+    int cr, rc = 1;
+
+    switch ( acc_typ )
+    {
+        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
+        case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
+        {
+            uint gpr = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
+            uint64_t *regp = get_gpr_ptr(regs, gpr);
+            cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
+
+            if (regp == NULL)
+                break;
+
+            /* pl don't embed switch statements */
+            if (cr == 0)
+                rc = access_cr0(regs, acc_typ, regp);
+            else if (cr == 3) {
+                printk("PVH: d%d: unexpected cr3 access vmexit. rip:%lx\n", 
+                      current->domain->domain_id, vmr(GUEST_RIP));
+                domain_crash_synchronous();
+            } else if (cr == 4) 
+                rc = access_cr4(regs, acc_typ, regp);
+
+            if (rc == 0)
+                update_guest_eip();
+            break;
+        }
+        case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
+        {
+#if 0
+            unsigned long cr0 = __vmread(GUEST_CR0);
+            cr0 &= ~X86_CR0_TS;
+#endif
+            struct vcpu *vp = current;
+            unsigned long cr0 = vp->arch.hvm_vcpu.guest_cr[0] & ~X86_CR0_TS;
+            vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = cr0;
+            vmx_fpu_enter(vp);
+            __vmwrite(GUEST_CR0, cr0);
+            __vmwrite(CR0_READ_SHADOW, cr0);
+            update_guest_eip();
+            rc = 0;
+        }
+    }
+    return rc;
+}
+
+static int noinline vmxit_io_instr(struct cpu_user_regs *regs)
+{
+    int curr_lvl;
+    int requested = (regs->rflags >> 12) & 3;
+
+    read_vmcs_selectors(regs);
+    curr_lvl = regs->cs & 3;
+
+    if (requested >= curr_lvl && emulate_privileged_op(regs)) 
+        return 0;
+
+    kdbp("MUK: io instr about to inject gpf. req:%x curr:%x\n", requested, 
+         curr_lvl);
+    kdb_trap_immed(KDB_TRAP_NONFATAL);
+    hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code);
+
+    return 0;
+}
+
+static noinline int pvh_ept_handle_violation(unsigned long qualification, paddr_t gpa)
+{
+    unsigned long gla, gfn = gpa >> PAGE_SHIFT;
+    p2m_type_t p2mt;
+    mfn_t mfn = get_gfn_query_unlocked(current->domain, gfn, &p2mt);
+
+    gdprintk(XENLOG_ERR, "Dom:%d EPT violation %#lx (%c%c%c/%c%c%c), "
+             "gpa %#"PRIpaddr", mfn %#lx, type %i.\n",
+             current->domain->domain_id, qualification, 
+             (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
+             (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
+             (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
+             (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
+             (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
+             (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
+             gpa, mfn_x(mfn), p2mt);
+             
+    ept_walk_table(current->domain, gfn);
+
+    if ( qualification & EPT_GLA_VALID )
+    {
+        gla = __vmread(GUEST_LINEAR_ADDRESS);
+        gdprintk(XENLOG_ERR, " --- GLA %#lx\n", gla);
+    }
+#if 0
+    if (vmr(GUEST_CS_SELECTOR) & 3 == 3) {
+       kdbp("EPT Violation in user mode. inject GP\n");
+        hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code);
+    } else
+#endif
+        hvm_inject_hw_exception(TRAP_gp_fault, 0);
+        kdb_trap_immed(KDB_TRAP_NONFATAL);
+
+    return 0;
+}
+
+#if 0
+/* emulate write_cr3(read_cr3()) in guest. */
+static noinline int vmxit_invvpid(void)
+{
+    hvm_asid_flush_vcpu(current);
+    return 0;
+}
+#endif
+volatile int mukcpuiddbg=0;
+static noinline void pvh_cpuid(struct cpu_user_regs *regs)
+{
+    unsigned int eax, ebx, ecx, edx;
+
+    if (mukcpuiddbg)
+        kdbp("ucpuid: rax:%lx bx:%lx cx:%lx dx:%lx\n", regs->rax, regs->rbx,
+             regs->rcx, regs->rdx);
+
+    asm volatile ( "cpuid"
+              : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
+              : "0" (regs->eax), "2" (regs->rcx) );
+
+    regs->rax = eax; regs->rbx = ebx; regs->rcx = ecx; regs->rdx = edx;
+    if (mukcpuiddbg)
+        kdbp("done: rax:%lx bx:%lx cx:%lx dx:%lx\n", regs->rax, regs->rbx,
+             regs->rcx, regs->rdx);
+}
+
+volatile int mukprdr7, mukprtsc=1, mukcrashit;
+void pvh_vmx_vmexit_handler(struct cpu_user_regs *regs)
+{
+    unsigned long exit_qualification;
+    unsigned int vector, exit_reason = __vmread(VM_EXIT_REASON);
+    int rc=0, ccpu = smp_processor_id();
+    struct vcpu *vp = current;
+
+#if defined(XEN_KDB_CONFIG)
+{
+    if (kdb_dr7)
+        write_debugreg(7, kdb_dr7);
+}
+#endif
+    if (mukprdr7) 
+        kdbp("MUK: vmexit dr0:%lx 7:%lx vmcs.7:%lx\n", read_debugreg(0),
+              read_debugreg(7), __vmread(GUEST_DR7));
+
+    dbgp1("MUK:[%d]left VMCS exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx 
CR0:%lx\n", 
+          ccpu, exit_reason, vmr(GUEST_RIP), vmr(GUEST_RSP), regs->rflags, 
+          vmr(GUEST_CR0));
+
+    /* for guest_kernel_mode in fixup_page_fault() */
+    regs->cs = vmr(GUEST_CS_SELECTOR); 
+
+    switch ( (uint16_t)exit_reason )
+    {
+        case EXIT_REASON_EXCEPTION_NMI:
+        case EXIT_REASON_EXTERNAL_INTERRUPT:
+        case EXIT_REASON_MCE_DURING_VMENTRY:
+            break;
+        default:
+            local_irq_enable();
+    }
+
+    switch ( (uint16_t)exit_reason )
+    {
+        case EXIT_REASON_EXCEPTION_NMI:      /* 0 */
+            rc = vmxit_exception(regs);
+            break;
+            
+        case EXIT_REASON_EXTERNAL_INTERRUPT: /* 1 */
+        {
+            vector = __vmread(VM_EXIT_INTR_INFO);
+            vector &= INTR_INFO_VECTOR_MASK;
+#if 0
+            dbgp2("MUK: [%d] exit vmcs reas:%d vec:%d cr0:0x%016lx\n", ccpu, 
+                 exit_reason, vector, vmr(GUEST_CR0));
+#endif
+            vmx_do_extint(regs);
+            break;
+        }
+
+        case EXIT_REASON_TRIPLE_FAULT:  /* 2 */
+        {
+#if 0
+            static int once;
+     if (!once)
+     kdbp("MUK:[%d]left VMCS exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx CR0:%lx\n",
+          ccpu, exit_reason, vmr(GUEST_RIP), vmr(GUEST_RSP), regs->rflags, 
+          vmr(GUEST_CR0));
+            once = 1;
+            hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code);
+            rc = 0;
+#endif
+     kdbp("MUK:Triple Flt:[%d]exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx 
CR3:%lx\n",
+          ccpu, exit_reason, vmr(GUEST_RIP), vmr(GUEST_RSP), regs->rflags, 
+          vmr(GUEST_CR3));
+
+    if ( paging_mode_hap(vp->domain) && hvm_paging_enabled(vp) )
+        vp->arch.hvm_vcpu.guest_cr[3] = vp->arch.hvm_vcpu.hw_cr[3] =
+                                                           __vmread(GUEST_CR3);
+            kdb_trap_immed(KDB_TRAP_NONFATAL);
+            rc = 1;
+            break;
+        }
+        case EXIT_REASON_PENDING_VIRT_INTR:  /* 7 */
+        {
+            struct vcpu *v = current;
+            /* Disable the interrupt window. */
+            v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+            __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+            break;
+        }
+
+        case EXIT_REASON_CPUID:              /* 10 */
+        {
+            if ( guest_kernel_mode(vp, regs) ) {
+                pv_cpuid(regs);
+
+                /* Because we are setting CR4.OSFXSR to 0, we need to disable
+                 * this because, during boot, user process "init" (which doesn't
+                 * do cpuid), will do 'pxor xmm0,xmm0' and cause #UD. For now 
+                 * disable this. HVM doesn't allow setting of CR4.OSFXSR.
+                 * TBD/FIXME: this and also look at CR4.OSXSAVE */
+
+                __clear_bit(X86_FEATURE_FXSR, &regs->edx);
+            } else
+                pvh_cpuid(regs);
+
+             /* TBD/FIXME: investigate and fix the XSAVE/MMX/FPU stuff */
+
+            update_guest_eip();
+            dbgp2("cpuid:%d RIP:%lx\n", regs->eax, vmr(GUEST_RIP));
+            break;
+        }
+
+        case EXIT_REASON_HLT:             /* 12 */
+       {
+            update_guest_eip();
+            hvm_hlt(regs->eflags);
+           break;
+#if 0
+            XEN_GUEST_HANDLE(void) p = {NULL};
+           /* hvm_event_pending ?? see hvm_hlt() */
+           do_sched_op(SCHEDOP_block, p);
+#endif
+       }
+
+        case EXIT_REASON_INVLPG:             /* 14 */
+            rc = vmxit_invlpg();
+            break;
+
+        case EXIT_REASON_RDTSC:              /* 16 */
+        {
+#if 0
+            uint64_t tsc;
+            int ilen=get_instruction_length();
+            rdtscll(tsc);
+            regs->eax = (uint32_t)tsc;
+            regs->edx = (uint32_t)(tsc >> 32);
+            rdtsc(regs->eax, regs->edx);
+if (mukprtsc)
+    kdbp(" RDTSC: eax:%lx edx:%lx\n", regs->eax, regs->edx);
+            update_guest_eip();
+#endif
+            rc = 1;
+            break;
+        }
+
+        case EXIT_REASON_VMCALL:             /* 18 */
+            rc = vmxit_vmcall(regs);
+            break;
+
+        case EXIT_REASON_CR_ACCESS:          /* 28 */
+            rc = vmxit_cr_access(regs);
+            break;
+
+        case EXIT_REASON_DR_ACCESS:          /* 29 */
+        {
+            exit_qualification = __vmread(EXIT_QUALIFICATION);
+#if defined(XEN_KDB_CONFIG)
+            update_guest_eip();
+            break;
+#endif
+            vmx_dr_access(exit_qualification, regs);
+            break;
+        }
+
+        case EXIT_REASON_IO_INSTRUCTION:
+            vmxit_io_instr(regs);
+            break;
+
+        case EXIT_REASON_MSR_READ:           /* 31 */
+            rc = vmxit_msr_read(regs);
+            break;
+
+        case EXIT_REASON_MSR_WRITE:          /* 32 */
+            rc = vmxit_msr_write(regs);
+            break;
+
+        case EXIT_REASON_MONITOR_TRAP_FLAG:  /* 37 */
+            rc = vmxit_mtf(regs);
+            break;
+
+        case EXIT_REASON_EPT_VIOLATION:
+        {
+            paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
+            exit_qualification = __vmread(EXIT_QUALIFICATION);
+            rc = pvh_ept_handle_violation(exit_qualification, gpa);
+            break;
+        }
+#if 0
+        case EXIT_REASON_INVVPID:            /* 53 */
+            rc = vmxit_invvpid();
+            break;
+#endif
+        default: 
+            rc = 1;
+            kdbp("Unexpected exit reason:%d 0x%x\n", exit_reason, exit_reason);
+    }
+    if (rc) {
+        exit_qualification = __vmread(EXIT_QUALIFICATION);
+        kdbp("MUK: [%d] exit_reas:%d 0x%lx qual:%ld 0x%lx cr0:0x%016lx\n", 
+             ccpu, exit_reason, exit_reason, exit_qualification,
+             exit_qualification, vmr(GUEST_CR0));
+        kdbp("MUK: [%d] RIP:%lx RSP:%lx\n", ccpu, 
+             vmr(GUEST_RIP), vmr(GUEST_RSP));
+        domain_crash_synchronous();
+    }
+
+if (mukcrashit)
+    domain_crash_synchronous();
+
+    /*dbgp("MUK: will enter vmcs: cs:%x ss:%x\n", vmr(GUEST_CS_SELECTOR),
+         vmr(GUEST_SS_SELECTOR)); */
+
+    dbgp1("MUK: will enter vmcs:RIP:%lx RSP:%lx cr0:%lx eflags:%lx\n", 
+          vmr(GUEST_RIP), vmr(GUEST_RSP), vmr(GUEST_CR0), regs->rflags);
+
+    if (mukprdr7) 
+        kdbp("MUK: vmexit dr0:%lx 7:%lx vmcs.7:%lx\n", read_debugreg(0),
+              read_debugreg(7), __vmread(GUEST_DR7));
+
+}
+
+void pvh_flush_tlb(void)
+{
+    vpid_sync_all();
+}
+
+void pvh_do_invlpg(ulong addr)
+{
+    /* vpid_sync_all(); */
+    vpid_sync_vcpu_gva(current, addr);
+}
+
+/* 
+ * Sets info for non boot vcpu. VCPU 0 context is set by library which needs 
+ * to be modified to send
+ * correct selectors and gs_base. For now, we use this for nonboot vcpu 
+ * in which case the call comes from the kernel cpu_initialize_context().
+ */
+int vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context *ctxtp)
+{
+kdbp("MUK: bringup pvh vcpu: cs:%x ds:%x es:%x ss:%x gsbase:%lx\n", 
+     ctxtp->user_regs.cs, ctxtp->user_regs.ds, ctxtp->user_regs.es,
+     ctxtp->user_regs.ss, ctxtp->gs_base_kernel);
+kdbp("MUK: bringup pvh vcpu: gdtr base:%lx sz:%lx gsusr:%lx\n", 
+ctxtp->u.pvh.gdtaddr, ctxtp->u.pvh.gdtsz, ctxtp->gs_base_user);
+
+    if (v->vcpu_id == 0)
+        return 0;
+
+    vmx_vmcs_enter(v);
+    __vmwrite(GUEST_GDTR_BASE, ctxtp->u.pvh.gdtaddr);
+    __vmwrite(GUEST_GDTR_LIMIT, ctxtp->u.pvh.gdtsz);
+    __vmwrite(GUEST_GS_BASE, ctxtp->gs_base_user);
+
+    __vmwrite(GUEST_CS_SELECTOR, ctxtp->user_regs.cs);
+    __vmwrite(GUEST_DS_SELECTOR, ctxtp->user_regs.ds);
+    __vmwrite(GUEST_ES_SELECTOR, ctxtp->user_regs.es);
+    __vmwrite(GUEST_SS_SELECTOR, ctxtp->user_regs.ss);
+    __vmwrite(GUEST_GS_SELECTOR, ctxtp->user_regs.gs);
+
+    if ( vmx_add_guest_msr(MSR_SHADOW_GS_BASE) )
+        return -EINVAL;
+
+    vmx_write_guest_msr(MSR_SHADOW_GS_BASE, ctxtp->gs_base_kernel);
+
+    vmx_vmcs_exit(v);
+    return 0;
+}
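+
+/* A hypothetical guest-side sketch (helper name and arguments are made up)
+ * of what cpu_initialize_context() is assumed to pass down for a secondary
+ * vcpu; the selector values mirror the defaults set in pvh_construct_vmcs(),
+ * everything else is a placeholder. */
+#if 0
+static void pvh_fill_ap_ctxt(struct vcpu_guest_context *ctxt,
+                             unsigned long gdt_base, unsigned int gdt_limit,
+                             unsigned long kernel_gs_base)
+{
+    memset(ctxt, 0, sizeof(*ctxt));
+    ctxt->user_regs.cs   = 0x10;            /* 64-bit kernel code */
+    ctxt->user_regs.ds   = 0x18;            /* flat data */
+    ctxt->user_regs.es   = 0x18;
+    ctxt->user_regs.ss   = 0x18;
+    ctxt->u.pvh.gdtaddr  = gdt_base;
+    ctxt->u.pvh.gdtsz    = gdt_limit;
+    ctxt->gs_base_kernel = kernel_gs_base;  /* ends up in MSR_SHADOW_GS_BASE */
+}
+#endif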
+
+void vmx_pvh_update_cr3(struct vcpu *v)
+{
+kdbp("MUK: in vmx_pvh_update_cr3\n");
+kdb_trap_immed(KDB_TRAP_NONFATAL);
+    vmx_vmcs_enter(v);
+    __vmwrite(GUEST_CR3, v->arch.cr3);
+    __vmwrite(HOST_CR3, v->arch.cr3);
+
+    vpid_sync_all();
+    /* hvm_asid_flush_vcpu(v); ????????????? */
+    vmx_vmcs_exit(v);
+}
+
+
+int vmx_pvh_read_descriptor(unsigned int sel, const struct vcpu *v,
+                            const struct cpu_user_regs *regs,
+                            unsigned long *base, unsigned long *limit,
+                            unsigned int *ar)
+{
+    unsigned int tmp_ar = 0;
+    BUG_ON(v!=current);
+    BUG_ON(!is_pvh_vcpu(v));
+
+    if (sel == (unsigned int)regs->cs) {
+        *base = vmr(GUEST_CS_BASE);
+        *limit = vmr(GUEST_CS_LIMIT);
+        tmp_ar = vmr(GUEST_CS_AR_BYTES); 
+    } else if (sel == (unsigned int)regs->ds) {
+        *base = vmr(GUEST_DS_BASE);
+        *limit = vmr(GUEST_DS_LIMIT);
+        tmp_ar = vmr(GUEST_DS_AR_BYTES); 
+    } else if (sel == (unsigned int)regs->ss) {
+        *base = vmr(GUEST_SS_BASE);
+        *limit = vmr(GUEST_SS_LIMIT);
+        tmp_ar = vmr(GUEST_SS_AR_BYTES); 
+    } else if (sel == (unsigned int)regs->gs) {
+        *base = vmr(GUEST_GS_BASE);
+        *limit = vmr(GUEST_GS_LIMIT);
+        tmp_ar = vmr(GUEST_GS_AR_BYTES); 
+    } else if (sel == (unsigned int)regs->fs) {
+        *base = vmr(GUEST_FS_BASE);
+        *limit = vmr(GUEST_FS_LIMIT);
+        tmp_ar = vmr(GUEST_FS_AR_BYTES); 
+    } else if (sel == (unsigned int)regs->es) {
+        *base = vmr(GUEST_ES_BASE);
+        *limit = vmr(GUEST_ES_LIMIT);
+        tmp_ar = vmr(GUEST_ES_AR_BYTES); 
+    } else {
+        kdbp("Unmatche segment selector:%d\n", sel);
+        kdb_trap_immed(KDB_TRAP_NONFATAL);
+    }
+
+    if (tmp_ar & X86_SEG_AR_CS_LM_ACTIVE) {           /* x86 mess!! */
+        *base = 0UL;
+        *limit = ~0UL;
+    }
+    /* Fixup ar so that it looks the same as in native mode */
+    *ar = (tmp_ar << 8);
+    return 1;
+}
+
+int pvh_segment_bad(unsigned int ar, struct vcpu *v)
+{
+    int rc;
+
+    BUG_ON(v!=current);
+    BUG_ON(!is_pvh_vcpu(v));
+    rc = !(ar & X86_SEG_AR_DESC_TYPE) || !(ar & X86_SEG_AR_SEG_PRESENT) ||
+         !(ar & X86_SEG_AR_SEG_TYPE_CODE);
+    return rc;
+}
+
diff -r 8b0762504037 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/mm.c Wed Nov 14 11:03:24 2012 -0800
@@ -2608,7 +2608,7 @@ static struct domain *get_pg_owner(domid
         goto out;
     }
 
-    if ( unlikely(paging_mode_translate(curr)) )
+    if ( !is_pvh_domain(curr) && unlikely(paging_mode_translate(curr)) )
     {
         MEM_LOG("Cannot mix foreign mappings with translated domains");
         goto out;
@@ -3667,7 +3667,7 @@ static int destroy_grant_va_mapping(
     return replace_grant_va_mapping(addr, frame, l1e_empty(), v);
 }
 
-static int create_grant_p2m_mapping(uint64_t addr, unsigned long frame,
+static noinline int create_grant_p2m_mapping(uint64_t addr, unsigned long frame,
                                     unsigned int flags,
                                     unsigned int cache_flags)
 {
@@ -3742,6 +3742,22 @@ static int replace_grant_p2m_mapping(
     }
     guest_physmap_remove_page(d, gfn, frame, PAGE_ORDER_4K);
 
+    if (is_pvh_domain(d)) {
+        struct page_info *page = alloc_domheap_page(d, 0);
+
+       if (page == NULL) {
+           gdprintk(XENLOG_ERR, "Unable to alloc domheap page\n");
+#if 0
+            /* the guest will crash anyways with ept fault when gfn accessed */
+           domain_crash_synchronous();
+#endif
+       }
+       if (guest_physmap_add_page(d, gfn, page_to_mfn(page), 0) != 0) {
+           gdprintk(XENLOG_ERR, "Unable to add mfn to replace grant\n");
+           /* domain_crash_synchronous(); */
+            return GNTST_general_error;
+        }
+    }
     put_gfn(d, gfn);
     return GNTST_okay;
 }
@@ -4143,7 +4159,7 @@ long do_update_descriptor(u64 pa, u64 de
     page = get_page_from_gfn(dom, gmfn, NULL, P2M_ALLOC);
     if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
          !page ||
-         !check_descriptor(dom, &d) )
+         (!is_pvh_domain(dom) && !check_descriptor(dom, &d)) )
     {
         if ( page )
             put_page(page);
@@ -4217,9 +4233,106 @@ static int handle_iomem_range(unsigned l
     return 0;
 }
 
-static int xenmem_add_to_physmap_once(
-    struct domain *d,
-    const struct xen_add_to_physmap *xatp)
+/* add frames from foreign domain to current domain physmap. Similar to 
+ * XENMAPSPACE_gmfn but the frame is foreign being mapped into current,
+ * and is not removed from foreign domain. 
+ * Usage: libxl on pvh dom0 creating guest and doing privcmd_ioctl_mmap
+ * Returns: 0 ==> success
+ */
+static long noinline xenmem_add_foreign_to_pmap( domid_t foreign_domid, 
+                                        unsigned long fgmfn, unsigned long gpfn)
+{
+    unsigned long rc=0, failed=0, prev_mfn, mfn = 0;
+    struct domain *fdom, *currd = current->domain;
+    p2m_type_t p2mt, p2mt_prev;
+
+    if ( (fdom = get_pg_owner(foreign_domid)) == NULL ) {
+        kdb_trap_immed(KDB_TRAP_NONFATAL);
+        return -EPERM;
+    }
+failed = 0;
+
+    mfn = mfn_x(get_gfn_query_unlocked(fdom, fgmfn, &p2mt));
+
+    /* qemu, running on PVH dom0, mapping hvm domain's pages during domain 
+     * creation, doesn't have mfns in the HAP table */
+    if ( !mfn_valid(mfn) && p2m_is_mmio(p2mt) ) {
+
+        if (!is_hvm_domain(fdom)) {
+            printk("mmio type for non-hvm domain. fd:%d fgmfn:%lx gpfn:%lx\n",
+                   foreign_domid, fgmfn, gpfn);
+            kdb_trap_immed(KDB_TRAP_NONFATAL);
+        }
+        mfn = fgmfn;
+    }
+
+    if ( !p2m_is_valid(p2mt) ) {
+        put_pg_owner(fdom);
+        return -EINVAL;
+    }
+#if 0
+    if (paging_mode_external(fdom)) {
+        /* Foreign mappings into guests in shadow external mode don't       
+         * contribute to writeable mapping refcounts.  (This allows the
+         * qemu-dm helper process in dom0 to map the domain's memory without
+         * messing up the count of "real" writable mappings.) */
+        if (get_page_from_pagenr(mfn, fdom) == 0)
+            failed = 1;
+    } else {
+        if (get_page_and_type_from_pagenr(mfn, PGT_writable_page, fdom, 0, 0))
+            failed = 1;
+    }
+    if (failed) {
+        put_pg_owner(fdom);
+        return -EINVAL;
+    }
+#endif
+    /* Remove previously mapped page if it was present. */
+    prev_mfn = mfn_x(get_gfn_query_unlocked(currd, gpfn, &p2mt_prev));
+    if ( mfn_valid(prev_mfn) )
+    {
+        if ( is_xen_heap_mfn(prev_mfn) )
+            /* Xen heap frames are simply unhooked from this phys slot */
+            guest_physmap_remove_page(currd, gpfn, prev_mfn, 0);
+        else
+            /* Normal domain memory is freed, to avoid leaking memory. */
+            guest_remove_page(currd, gpfn);
+    }
+/* I CAN PROB remove the mmio because all io space is mapped upfront via
+ * domctl_memory_mapping */
+    /* Map at new location. */
+    /* Can't use guest_physmap_add_page() because it will update the m2p
+     * table so mfn ---> gpfn in dom0 and not gpfn of domU.
+     */
+    if ( p2m_is_mmio(p2mt) ) {
+        if (set_mmio_p2m_entry(currd, gpfn, mfn) == 0) {
+            kdbp("guest_physmap_add_page failed. gpfn:%lx mfn:%lx 
fgmfn:%lx\n", 
+                 gpfn, mfn, fgmfn);
+            kdb_trap_immed(KDB_TRAP_NONFATAL);
+            rc = -EINVAL;
+        }
+        goto out;
+    }
+    if (set_foreign_p2m_entry(currd, gpfn, mfn) == 0) {
+#if 0
+        if (paging_mode_external(fdom))
+            put_page(mfn_to_page(mfn));
+        else
+            put_page_and_type(mfn_to_page(mfn));
+#endif
+        kdbp("guest_physmap_add_page failed1. gpfn:%lx mfn:%lx fgmfn:%lx\n", 
+             gpfn, mfn, fgmfn);
+        kdb_trap_immed(KDB_TRAP_NONFATAL);
+        rc = -EINVAL;
+    }
+out:
+    put_pg_owner(fdom);
+    return rc;
+}
+
+static noinline int xenmem_add_to_physmap_once(
+    struct domain *d, uint16_t xatp_space, domid_t foreign_domid,
+    unsigned long xatp_idx, unsigned long xatp_gpfn)
 {
     struct page_info *page = NULL;
     unsigned long gfn = 0; /* gcc ... */
@@ -4227,10 +4340,10 @@ static int xenmem_add_to_physmap_once(
     int rc;
     p2m_type_t p2mt;
 
-    switch ( xatp->space )
+    switch ( xatp_space )
     {
         case XENMAPSPACE_shared_info:
-            if ( xatp->idx == 0 )
+            if ( xatp_idx == 0 )
                 mfn = virt_to_mfn(d->shared_info);
             break;
         case XENMAPSPACE_grant_table:
@@ -4239,9 +4352,9 @@ static int xenmem_add_to_physmap_once(
             if ( d->grant_table->gt_version == 0 )
                 d->grant_table->gt_version = 1;
 
-            idx = xatp->idx;
+            idx = xatp_idx;
             if ( d->grant_table->gt_version == 2 &&
-                 (xatp->idx & XENMAPIDX_grant_table_status) )
+                 (xatp_idx & XENMAPIDX_grant_table_status) )
             {
                 idx &= ~XENMAPIDX_grant_table_status;
                 if ( idx < nr_status_frames(d->grant_table) )
@@ -4263,9 +4376,9 @@ static int xenmem_add_to_physmap_once(
         case XENMAPSPACE_gmfn:
         {
             p2m_type_t p2mt;
-            gfn = xatp->idx;
-
-            idx = mfn_x(get_gfn_unshare(d, xatp->idx, &p2mt));
+            gfn = xatp_idx;
+
+            idx = mfn_x(get_gfn_unshare(d, xatp_idx, &p2mt));
             /* If the page is still shared, exit early */
             if ( p2m_is_shared(p2mt) )
             {
@@ -4278,6 +4391,13 @@ static int xenmem_add_to_physmap_once(
             page = mfn_to_page(mfn);
             break;
         }
+
+        case XENMAPSPACE_gmfn_foreign:
+        {
+            rc = xenmem_add_foreign_to_pmap(foreign_domid, xatp_idx, xatp_gpfn);
+            return rc;
+        }
+
         default:
             break;
     }
@@ -4286,8 +4406,8 @@ static int xenmem_add_to_physmap_once(
     {
         if ( page )
             put_page(page);
-        if ( xatp->space == XENMAPSPACE_gmfn ||
-             xatp->space == XENMAPSPACE_gmfn_range )
+        if ( xatp_space == XENMAPSPACE_gmfn ||
+             xatp_space == XENMAPSPACE_gmfn_range )
             put_gfn(d, gfn);
         return -EINVAL;
     }
@@ -4298,41 +4418,41 @@ static int xenmem_add_to_physmap_once(
         put_page(page);
 
     /* Remove previously mapped page if it was present. */
-    prev_mfn = mfn_x(get_gfn(d, xatp->gpfn, &p2mt));
+    prev_mfn = mfn_x(get_gfn(d, xatp_gpfn, &p2mt));
     if ( mfn_valid(prev_mfn) )
     {
         if ( is_xen_heap_mfn(prev_mfn) )
             /* Xen heap frames are simply unhooked from this phys slot. */
-            guest_physmap_remove_page(d, xatp->gpfn, prev_mfn, PAGE_ORDER_4K);
+            guest_physmap_remove_page(d, xatp_gpfn, prev_mfn, PAGE_ORDER_4K);
         else
             /* Normal domain memory is freed, to avoid leaking memory. */
-            guest_remove_page(d, xatp->gpfn);
+            guest_remove_page(d, xatp_gpfn);
     }
     /* In the XENMAPSPACE_gmfn case we still hold a ref on the old page. */
-    put_gfn(d, xatp->gpfn);
+    put_gfn(d, xatp_gpfn);
 
     /* Unmap from old location, if any. */
     gpfn = get_gpfn_from_mfn(mfn);
     ASSERT( gpfn != SHARED_M2P_ENTRY );
-    if ( xatp->space == XENMAPSPACE_gmfn ||
-         xatp->space == XENMAPSPACE_gmfn_range )
+    if ( xatp_space == XENMAPSPACE_gmfn ||
+         xatp_space == XENMAPSPACE_gmfn_range )
         ASSERT( gpfn == gfn );
     if ( gpfn != INVALID_M2P_ENTRY )
         guest_physmap_remove_page(d, gpfn, mfn, PAGE_ORDER_4K);
 
     /* Map at new location. */
-    rc = guest_physmap_add_page(d, xatp->gpfn, mfn, PAGE_ORDER_4K);
+    rc = guest_physmap_add_page(d, xatp_gpfn, mfn, PAGE_ORDER_4K);
 
     /* In the XENMAPSPACE_gmfn, we took a ref of the gfn at the top */
-    if ( xatp->space == XENMAPSPACE_gmfn ||
-         xatp->space == XENMAPSPACE_gmfn_range )
+    if ( xatp_space == XENMAPSPACE_gmfn ||
+         xatp_space == XENMAPSPACE_gmfn_range )
         put_gfn(d, gfn);
     domain_unlock(d);
 
     return rc;
 }
 
-static int xenmem_add_to_physmap(struct domain *d,
+static noinline int xenmem_add_to_physmap(struct domain *d,
                                  struct xen_add_to_physmap *xatp)
 {
     struct xen_add_to_physmap start_xatp;
@@ -4346,7 +4466,8 @@ static int xenmem_add_to_physmap(struct 
         start_xatp = *xatp;
         while ( xatp->size > 0 )
         {
-            rc = xenmem_add_to_physmap_once(d, xatp);
+            rc = xenmem_add_to_physmap_once(d, xatp->space, -1,
+                                            xatp->idx, xatp->gpfn);
             if ( rc < 0 )
                 return rc;
 
@@ -4372,7 +4493,50 @@ static int xenmem_add_to_physmap(struct 
         return rc;
     }
 
-    return xenmem_add_to_physmap_once(d, xatp);
+    return xenmem_add_to_physmap_once(d, xatp->space, -1,
+                                      xatp->idx, xatp->gpfn);
+}
+
+static noinline int xenmem_add_to_physmap_range(struct domain *d,
+                                       struct xen_add_to_physmap_range *xatpr)
+{
+    int rc;
+
+    /* Process entries in reverse order to allow continuations */
+    while ( xatpr->size > 0 )
+    {
+        xen_ulong_t idx;
+        xen_pfn_t gpfn;
+
+        rc = copy_from_guest_offset(&idx, xatpr->idxs, xatpr->size-1, 1);
+        if ( rc < 0 )
+            goto out;
+
+        rc = copy_from_guest_offset(&gpfn, xatpr->gpfns, xatpr->size-1, 1);
+        if ( rc < 0 )
+            goto out;
+
+        rc = xenmem_add_to_physmap_once(d, xatpr->space, xatpr->foreign_domid,
+                                        idx, gpfn);
+
+       if (rc)
+            goto out;
+
+        xatpr->size--;
+
+        /* Check for continuation if it's not the last iteration */
+        if ( xatpr->size > 0 && hypercall_preempt_check() )
+        {
+            rc = -EAGAIN;
+            goto out;
+        }
+    }
+
+    rc = 0;
+
+out:
+    return rc;
+
 }
 
 long arch_memory_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg)
@@ -4389,6 +4553,10 @@ long arch_memory_op(int op, XEN_GUEST_HA
         if ( copy_from_guest(&xatp, arg, 1) )
             return -EFAULT;
 
+        /* This one is only supported for add_to_physmap_range */
+        if ( xatp.space == XENMAPSPACE_gmfn_foreign )
+            return -EINVAL;
+
         rc = rcu_lock_target_domain_by_id(xatp.domid, &d);
         if ( rc != 0 )
             return rc;
@@ -4416,6 +4584,32 @@ long arch_memory_op(int op, XEN_GUEST_HA
         return rc;
     }
 
+    case XENMEM_add_to_physmap_range:
+    {
+        struct xen_add_to_physmap_range xatpr;
+        struct domain *d;
+
+        if ( copy_from_guest(&xatpr, arg, 1) )
+            return -EFAULT;
+
+        rc = rcu_lock_target_domain_by_id(xatpr.domid, &d);
+        if ( rc != 0 )
+            return rc;
+
+        rc = xenmem_add_to_physmap_range(d, &xatpr);
+
+        rcu_unlock_domain(d);
+
+        if ( rc && copy_to_guest(arg, &xatpr, 1) )
+            rc = -EFAULT;
+
+        if ( rc == -EAGAIN )
+            rc = hypercall_create_continuation(
+                __HYPERVISOR_memory_op, "ih", op, arg);
+
+        return rc;
+    }
+
     case XENMEM_set_memory_map:
     {
         struct xen_foreign_memory_map fmap;
diff -r 8b0762504037 xen/arch/x86/mm/hap/hap.c
--- a/xen/arch/x86/mm/hap/hap.c Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/mm/hap/hap.c Wed Nov 14 11:03:24 2012 -0800
@@ -600,6 +600,20 @@ int hap_domctl(struct domain *d, xen_dom
     }
 }
 
+/* Resize hap table. Copied from: libxl_get_required_shadow_memory() */
+void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages)
+{
+    int rc;
+    unsigned long memkb = num_pages * (PAGE_SIZE / 1024);
+
+    memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));
+    num_pages = ((memkb+1023)/1024) << (20 - PAGE_SHIFT);
+    paging_lock(d);
+    rc = hap_set_allocation(d, num_pages, NULL);
+    paging_unlock(d);
+    BUG_ON(rc);
+}
+
 static const struct paging_mode hap_paging_real_mode;
 static const struct paging_mode hap_paging_protected_mode;
 static const struct paging_mode hap_paging_pae_mode;
@@ -659,7 +673,8 @@ static void hap_update_cr3(struct vcpu *
 const struct paging_mode *
 hap_paging_get_mode(struct vcpu *v)
 {
-    return !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
+    return is_pvh_vcpu(v) ? &hap_paging_long_mode :
+        !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
         hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
         hvm_pae_enabled(v)       ? &hap_paging_pae_mode  :
                                    &hap_paging_protected_mode;
diff -r 8b0762504037 xen/arch/x86/mm/p2m-ept.c
--- a/xen/arch/x86/mm/p2m-ept.c Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/mm/p2m-ept.c Wed Nov 14 11:03:24 2012 -0800
@@ -75,6 +75,7 @@ static void ept_p2m_type_to_flags(ept_en
             entry->w = 0;
             break;
         case p2m_grant_map_rw:
+        case p2m_map_foreign:
             entry->r = entry->w = 1;
             entry->x = 0;
             break;
@@ -428,7 +429,7 @@ ept_set_entry(struct p2m_domain *p2m, un
     }
 
     /* Track the highest gfn for which we have ever had a valid mapping */
-    if ( p2mt != p2m_invalid &&
+    if ( p2mt != p2m_invalid && p2mt != p2m_mmio_dm &&
          (gfn + (1UL << order) - 1 > p2m->max_mapped_pfn) )
         p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
 
@@ -462,6 +463,9 @@ out:
             }
             else
             {
+
+if (p2m->domain->domain_id == 0)   /******* PVH : FIXME */
+    goto skip;
                 if ( order > 0 )
                 {
                     for ( i = 0; i < (1 << order); i++ )
@@ -472,7 +476,7 @@ out:
             }
         }
     }
-
+skip:
     /* Release the old intermediate tables, if any.  This has to be the
        last thing we do, after the ept_sync_domain() and removal
        from the iommu tables, so as to avoid a potential
diff -r 8b0762504037 xen/arch/x86/mm/p2m-pt.c
--- a/xen/arch/x86/mm/p2m-pt.c  Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/mm/p2m-pt.c  Wed Nov 14 11:03:24 2012 -0800
@@ -89,6 +89,7 @@ static unsigned long p2m_type_to_flags(p
     case p2m_ram_rw:
         return flags | P2M_BASE_FLAGS | _PAGE_RW;
     case p2m_grant_map_rw:
+    case p2m_map_foreign:
         return flags | P2M_BASE_FLAGS | _PAGE_RW | _PAGE_NX_BIT;
     case p2m_mmio_direct:
         if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
@@ -429,7 +430,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
     }
 
     /* Track the highest gfn for which we have ever had a valid mapping */
-    if ( p2mt != p2m_invalid
+    if ( p2mt != p2m_invalid && p2mt != p2m_mmio_dm
          && (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) )
         p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
 
diff -r 8b0762504037 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c     Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/mm/p2m.c     Wed Nov 14 11:03:24 2012 -0800
@@ -488,7 +488,7 @@ p2m_remove_page(struct p2m_domain *p2m, 
         for ( i = 0; i < (1UL << page_order); i++ )
         {
             mfn_return = p2m->get_entry(p2m, gfn + i, &t, &a, 0, NULL);
-            if ( !p2m_is_grant(t) && !p2m_is_shared(t) )
+            if ( !p2m_is_grant(t) && !p2m_is_shared(t) && !p2m_is_foreign(t) )
                 set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY);
             ASSERT( !p2m_is_valid(t) || mfn + i == mfn_x(mfn_return) );
         }
@@ -584,6 +584,11 @@ guest_physmap_add_entry(struct domain *d
         {
             ASSERT(mfn_valid(omfn));
             set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+
+            /* Because PVH domU uses kmalloc for grant pfn, we need to save
+             * and restore the old mfn */
+             if (is_pvh_domain(d) && p2m_is_grant(t))
+                 free_domheap_page(mfn_to_page(omfn));
         }
         else if ( ot == p2m_populate_on_demand )
         {
@@ -715,7 +720,33 @@ void p2m_change_type_range(struct domain
     p2m_unlock(p2m);
 }
 
+/* Returns: 1 for success, 0 for failure */
+int set_foreign_p2m_entry(struct domain *domp, unsigned long gfn, mfn_t mfn)
+{
+    int rc = 0;
+    p2m_type_t ot;
+    mfn_t omfn;
+    struct p2m_domain *p2m = p2m_get_hostp2m(domp);
 
+    if ( !paging_mode_translate(domp) )
+        return 0;
+
+    omfn = get_gfn_query_unlocked(domp, gfn, &ot);
+    if (mfn_valid(omfn)) {
+        gdprintk(XENLOG_ERR, "Already mapped mfn %lx at gfn:%lx\n", omfn, gfn);
+        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+    }
+
+    P2M_DEBUG("set foreign %lx %lx\n", gfn, mfn_x(mfn));
+    p2m_lock(p2m);
+    rc = set_p2m_entry(p2m, gfn, mfn, 0, p2m_map_foreign, p2m->default_access);
+    p2m_unlock(p2m);
+    if ( rc == 0 )
+        gdprintk(XENLOG_ERR,
+            "set_foreign_p2m_entry: set_p2m_entry failed! gfn:%lx mfn=%08lx\n",
+            gfn, mfn_x(get_gfn_query(domp, gfn, &ot)));
+    return rc;
+}
 
 int
 set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
diff -r 8b0762504037 xen/arch/x86/msi.c
--- a/xen/arch/x86/msi.c        Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/msi.c        Wed Nov 14 11:03:24 2012 -0800
@@ -766,6 +766,9 @@ static int msix_capability_init(struct p
         WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, dev->msix_pba.first,
                                         dev->msix_pba.last));
 
+/* PVH: fixme: not a clue what to do here :) */
+if (is_pvh_domain(dev->domain) && dev->domain->domain_id != 0)
+{
         if ( rangeset_add_range(mmio_ro_ranges, dev->msix_table.first,
                                 dev->msix_table.last) )
             WARN();
@@ -793,6 +796,7 @@ static int msix_capability_init(struct p
                 /* XXX How to deal with existing mappings? */
             }
         }
+}
     }
     WARN_ON(dev->msix_nr_entries != nr_entries);
     WARN_ON(dev->msix_table.first != (table_paddr >> PAGE_SHIFT));
diff -r 8b0762504037 xen/arch/x86/physdev.c
--- a/xen/arch/x86/physdev.c    Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/physdev.c    Wed Nov 14 11:03:24 2012 -0800
@@ -732,6 +732,25 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
         break;
     }
 
+    case PHYSDEVOP_pvh_map_iomem : {
+
+       struct physdev_map_iomem iomem;
+        struct domain *d = current->domain;
+
+        ret = -EPERM;
+        if ( !IS_PRIV(d) || !is_pvh_domain(d))
+            break;
+        d = rcu_lock_current_domain();
+        
+        ret = -EFAULT;
+        if ( copy_from_guest(&iomem, arg, 1) != 0 )
+            break;
+
+       ret = domctl_memory_mapping(d, iomem.first_gfn, iomem.first_mfn, 
+                                   iomem.nr_mfns, iomem.add_mapping);
+        break;
+    }
+
     default:
         ret = -ENOSYS;
         break;
diff -r 8b0762504037 xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c      Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/setup.c      Wed Nov 14 11:03:24 2012 -0800
@@ -54,6 +54,9 @@ int opt_earlykdb=0;
 boolean_param("earlykdb", opt_earlykdb);
 #endif
 
+int opt_dom0hyb=0;
+boolean_param("dom0hyb", opt_dom0hyb);
+
 /* opt_nosmp: If true, secondary processors are ignored. */
 static bool_t __initdata opt_nosmp;
 boolean_param("nosmp", opt_nosmp);
@@ -542,7 +545,7 @@ void __init __start_xen(unsigned long mb
 {
     char *memmap_type = NULL;
     char *cmdline, *kextra, *loader;
-    unsigned int initrdidx;
+    unsigned int initrdidx, domcr_flags = 0;
     multiboot_info_t *mbi = __va(mbi_p);
     module_t *mod = (module_t *)__va(mbi->mods_addr);
     unsigned long nr_pages, modules_headroom, *module_map;
@@ -1260,7 +1263,9 @@ void __init __start_xen(unsigned long mb
         panic("Could not protect TXT memory regions\n");
 
     /* Create initial domain 0. */
-    dom0 = domain_create(0, DOMCRF_s3_integrity, 0);
+    domcr_flags = (opt_dom0hyb ? DOMCRF_pvh | DOMCRF_hap : 0);
+    domcr_flags |= DOMCRF_s3_integrity;
+    dom0 = domain_create(0, domcr_flags, 0);
     if ( IS_ERR(dom0) || (alloc_dom0_vcpu0() == NULL) )
         panic("Error creating domain 0\n");
 
diff -r 8b0762504037 xen/arch/x86/time.c
--- a/xen/arch/x86/time.c       Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/time.c       Wed Nov 14 11:03:24 2012 -0800
@@ -1923,7 +1923,7 @@ void tsc_set_info(struct domain *d,
         break;
     }
     d->arch.incarnation = incarnation + 1;
-    if ( is_hvm_domain(d) )
+    if ( is_hvm_or_pvh_domain(d) )
         hvm_set_rdtsc_exiting(d, d->arch.vtsc);
 }
 
diff -r 8b0762504037 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/traps.c      Wed Nov 14 11:03:24 2012 -0800
@@ -723,7 +723,7 @@ int cpuid_hypervisor_leaves( uint32_t id
     return 1;
 }
 
-static void pv_cpuid(struct cpu_user_regs *regs)
+void pv_cpuid(struct cpu_user_regs *regs)
 {
     uint32_t a, b, c, d;
 
@@ -900,7 +900,7 @@ static int emulate_invalid_rdtscp(struct
     return EXCRET_fault_fixed;
 }
 
-static int emulate_forced_invalid_op(struct cpu_user_regs *regs)
+int emulate_forced_invalid_op(struct cpu_user_regs *regs)
 {
     char sig[5], instr[2];
     unsigned long eip, rc;
@@ -910,6 +910,10 @@ static int emulate_forced_invalid_op(str
     /* Check for forced emulation signature: ud2 ; .ascii "xen". */
     if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 )
     {
+        /* PVH: fixme: hmm... what do we do for PVH? */
+        if ( is_pvh_vcpu(current) )
+            return 0;
+
         propagate_page_fault(eip + sizeof(sig) - rc, 0);
         return EXCRET_fault_fixed;
     }
@@ -920,6 +924,10 @@ static int emulate_forced_invalid_op(str
     /* We only emulate CPUID. */
     if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 )
     {
+        /* PVH: fixme: hmm... what do we do for PVH? */
+        if ( is_pvh_vcpu(current) )
+            return 0;
+
         propagate_page_fault(eip + sizeof(instr) - rc, 0);
         return EXCRET_fault_fixed;
     }
@@ -1448,6 +1456,9 @@ static int read_descriptor(unsigned int 
 {
     struct desc_struct desc;
 
+    if ( is_pvh_vcpu(v) )
+        return hvm_pvh_read_descriptor(sel, v, regs, base, limit, ar);
+
     if ( !vm86_mode(regs) )
     {
         if ( sel < 4)
@@ -1566,6 +1577,10 @@ static int guest_io_okay(
     int user_mode = !(v->arch.flags & TF_kernel_mode);
 #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
 
+    /* for PVH we check this in vmexit for EXIT_REASON_IO_INSTRUCTION */
+    if (is_pvh_vcpu(v))
+        return 1;
+
     if ( !vm86_mode(regs) &&
          (v->arch.pv_vcpu.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) )
         return 1;
@@ -1811,14 +1826,14 @@ static inline uint64_t guest_misc_enable
         _ptr = (unsigned int)_ptr;                                          \
     if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) )   \
         goto fail;                                                          \
-    if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 )       \
+    if ( (_rc = raw_copy_from_guest(&_x, (type *)_ptr, sizeof(_x))) != 0 )  \
     {                                                                       \
         propagate_page_fault(_ptr + sizeof(_x) - _rc, 0);                   \
         goto skip;                                                          \
     }                                                                       \
     (eip) += sizeof(_x); _x; })
 
-#define read_sreg(regs, sr) read_segment_register(sr)
+#define read_sreg(vcpu, regs, sr) read_segment_register(vcpu, regs, sr)
 
 static int is_cpufreq_controller(struct domain *d)
 {
@@ -1828,7 +1843,7 @@ static int is_cpufreq_controller(struct 
 
 #include "x86_64/mmconfig.h"
 
-static int emulate_privileged_op(struct cpu_user_regs *regs)
+int emulate_privileged_op(struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
     unsigned long *reg, eip = regs->eip;
@@ -1864,7 +1879,7 @@ static int emulate_privileged_op(struct 
         goto fail;
 
     /* emulating only opcodes not allowing SS to be default */
-    data_sel = read_sreg(regs, ds);
+    data_sel = read_sreg(v, regs, ds);
 
     /* Legacy prefixes. */
     for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
@@ -1882,17 +1897,17 @@ static int emulate_privileged_op(struct 
             data_sel = regs->cs;
             continue;
         case 0x3e: /* DS override */
-            data_sel = read_sreg(regs, ds);
+            data_sel = read_sreg(v, regs, ds);
             continue;
         case 0x26: /* ES override */
-            data_sel = read_sreg(regs, es);
+            data_sel = read_sreg(v, regs, es);
             continue;
         case 0x64: /* FS override */
-            data_sel = read_sreg(regs, fs);
+            data_sel = read_sreg(v, regs, fs);
             lm_ovr = lm_seg_fs;
             continue;
         case 0x65: /* GS override */
-            data_sel = read_sreg(regs, gs);
+            data_sel = read_sreg(v, regs, gs);
             lm_ovr = lm_seg_gs;
             continue;
         case 0x36: /* SS override */
@@ -1939,7 +1954,7 @@ static int emulate_privileged_op(struct 
 
         if ( !(opcode & 2) )
         {
-            data_sel = read_sreg(regs, es);
+            data_sel = read_sreg(v, regs, es);
             lm_ovr = lm_seg_none;
         }
 
@@ -2668,22 +2683,22 @@ static void emulate_gate_op(struct cpu_u
             ASSERT(opnd_sel);
             continue;
         case 0x3e: /* DS override */
-            opnd_sel = read_sreg(regs, ds);
+            opnd_sel = read_sreg(v, regs, ds);
             if ( !opnd_sel )
                 opnd_sel = dpl;
             continue;
         case 0x26: /* ES override */
-            opnd_sel = read_sreg(regs, es);
+            opnd_sel = read_sreg(v, regs, es);
             if ( !opnd_sel )
                 opnd_sel = dpl;
             continue;
         case 0x64: /* FS override */
-            opnd_sel = read_sreg(regs, fs);
+            opnd_sel = read_sreg(v, regs, fs);
             if ( !opnd_sel )
                 opnd_sel = dpl;
             continue;
         case 0x65: /* GS override */
-            opnd_sel = read_sreg(regs, gs);
+            opnd_sel = read_sreg(v, regs, gs);
             if ( !opnd_sel )
                 opnd_sel = dpl;
             continue;
@@ -2736,7 +2751,7 @@ static void emulate_gate_op(struct cpu_u
                             switch ( modrm & 7 )
                             {
                             default:
-                                opnd_sel = read_sreg(regs, ds);
+                                opnd_sel = read_sreg(v, regs, ds);
                                 break;
                             case 4: case 5:
                                 opnd_sel = regs->ss;
@@ -2764,7 +2779,7 @@ static void emulate_gate_op(struct cpu_u
                             break;
                         }
                         if ( !opnd_sel )
-                            opnd_sel = read_sreg(regs, ds);
+                            opnd_sel = read_sreg(v, regs, ds);
                         switch ( modrm & 7 )
                         {
                         case 0: case 2: case 4:
diff -r 8b0762504037 xen/arch/x86/x86_64/traps.c
--- a/xen/arch/x86/x86_64/traps.c       Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/arch/x86/x86_64/traps.c       Wed Nov 14 11:03:24 2012 -0800
@@ -127,10 +127,10 @@ void show_registers(struct cpu_user_regs
         fault_crs[0] = read_cr0();
         fault_crs[3] = read_cr3();
         fault_crs[4] = read_cr4();
-        fault_regs.ds = read_segment_register(ds);
-        fault_regs.es = read_segment_register(es);
-        fault_regs.fs = read_segment_register(fs);
-        fault_regs.gs = read_segment_register(gs);
+        fault_regs.ds = read_segment_register(v, regs, ds);
+        fault_regs.es = read_segment_register(v, regs, es);
+        fault_regs.fs = read_segment_register(v, regs, fs);
+        fault_regs.gs = read_segment_register(v, regs, gs);
     }
 
     print_xen_info();
@@ -624,7 +624,7 @@ static void hypercall_page_initialise_ri
 void hypercall_page_initialise(struct domain *d, void *hypercall_page)
 {
     memset(hypercall_page, 0xCC, PAGE_SIZE);
-    if ( is_hvm_domain(d) )
+    if ( is_hvm_or_pvh_domain(d) )
         hvm_hypercall_page_initialise(d, hypercall_page);
     else if ( !is_pv_32bit_domain(d) )
         hypercall_page_initialise_ring3_kernel(hypercall_page);
diff -r 8b0762504037 xen/common/domain.c
--- a/xen/common/domain.c       Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/common/domain.c       Wed Nov 14 11:03:24 2012 -0800
@@ -232,6 +232,14 @@ struct domain *domain_create(
 
     if ( domcr_flags & DOMCRF_hvm )
         d->is_hvm = 1;
+    else if ( domcr_flags & DOMCRF_pvh ) {
+        d->is_pvh = 1;
+        if ( !(domcr_flags & DOMCRF_hap) ) {
+            printk("PVH guest must have HAP on\n");
+            goto fail;
+        } else
+            printk("Yeay... PVH guest. domid:%d\n", domid);
+    }
 
     if ( domid == 0 )
     {
diff -r 8b0762504037 xen/common/domctl.c
--- a/xen/common/domctl.c       Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/common/domctl.c       Wed Nov 14 11:03:24 2012 -0800
@@ -149,6 +149,8 @@ void getdomaininfo(struct domain *d, str
 
     if ( is_hvm_domain(d) )
         info->flags |= XEN_DOMINF_hvm_guest;
+    else if ( is_pvh_domain(d) )
+        info->flags |= XEN_DOMINF_pvh_guest;
 
     xsm_security_domaininfo(d, info);
 
@@ -419,6 +421,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
         if ( supervisor_mode_kernel ||
              (op->u.createdomain.flags &
              ~(XEN_DOMCTL_CDF_hvm_guest | XEN_DOMCTL_CDF_hap |
+               XEN_DOMCTL_CDF_pvh_guest |
                XEN_DOMCTL_CDF_s3_integrity | XEN_DOMCTL_CDF_oos_off)) )
             break;
 
@@ -449,6 +452,8 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
         domcr_flags = 0;
         if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_hvm_guest )
             domcr_flags |= DOMCRF_hvm;
+        if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_pvh_guest )
+            domcr_flags |= DOMCRF_pvh;
         if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_hap )
             domcr_flags |= DOMCRF_hap;
         if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_s3_integrity )
diff -r 8b0762504037 xen/common/grant_table.c
--- a/xen/common/grant_table.c  Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/common/grant_table.c  Wed Nov 14 11:03:24 2012 -0800
@@ -529,7 +529,7 @@ static void mapcount(
  * addr is _either_ a host virtual address, or the address of the pte to
  * update, as indicated by the GNTMAP_contains_pte flag.
  */
-static void
+static noinline void
 __gnttab_map_grant_ref(
     struct gnttab_map_grant_ref *op)
 {
@@ -745,7 +745,7 @@ __gnttab_map_grant_ref(
 
     double_gt_lock(lgt, rgt);
 
-    if ( !is_hvm_domain(ld) && need_iommu(ld) )
+    if ( !is_hvm_or_pvh_domain(ld) && need_iommu(ld) )
     {
         unsigned int wrc, rdc;
         int err = 0;
@@ -956,7 +956,7 @@ __gnttab_unmap_common(
             act->pin -= GNTPIN_hstw_inc;
     }
 
-    if ( !is_hvm_domain(ld) && need_iommu(ld) )
+    if ( !is_hvm_or_pvh_domain(ld) && need_iommu(ld) )
     {
         unsigned int wrc, rdc;
         int err = 0;
diff -r 8b0762504037 xen/common/kernel.c
--- a/xen/common/kernel.c       Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/common/kernel.c       Wed Nov 14 11:03:24 2012 -0800
@@ -289,7 +289,11 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDL
             if ( current->domain == dom0 )
                 fi.submap |= 1U << XENFEAT_dom0;
 #ifdef CONFIG_X86
-            if ( !is_hvm_vcpu(current) )
+            if ( is_pvh_vcpu(current) )
+                fi.submap |= (1U << XENFEAT_hvm_safe_pvclock) |
+                             (1U << XENFEAT_supervisor_mode_kernel) |
+                             (1U << XENFEAT_hvm_callback_vector);
+            else if ( !is_hvm_vcpu(current) )
                 fi.submap |= (1U << XENFEAT_mmu_pt_update_preserve_ad) |
                              (1U << XENFEAT_highmem_assist) |
                              (1U << XENFEAT_gnttab_map_avail_bits);
diff -r 8b0762504037 xen/common/libelf/libelf-loader.c
--- a/xen/common/libelf/libelf-loader.c Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/common/libelf/libelf-loader.c Wed Nov 14 11:03:24 2012 -0800
@@ -17,6 +17,10 @@
  */
 
 #include "libelf-private.h"
+#ifdef __XEN__
+#include <public/xen.h>
+#include <asm/debugger.h>
+#endif
 
 /* ------------------------------------------------------------------------ */
 
@@ -108,7 +112,8 @@ void elf_set_log(struct elf_binary *elf,
     elf->verbose = verbose;
 }
 
-static int elf_load_image(void *dst, const void *src, uint64_t filesz, uint64_t memsz)
+static int elf_load_image(void *dst, const void *src, uint64_t filesz, 
+                          uint64_t memsz, int not_used)
 {
     memcpy(dst, src, filesz);
     memset(dst + filesz, 0, memsz - filesz);
@@ -122,11 +127,35 @@ void elf_set_verbose(struct elf_binary *
     elf->verbose = 1;
 }
 
-static int elf_load_image(void *dst, const void *src, uint64_t filesz, uint64_t memsz)
+static int elf_load_image(void *dst, const void *src, uint64_t filesz, 
+                          uint64_t memsz, int is_pvh_dom0)
 {
     int rc;
     if ( filesz > ULONG_MAX || memsz > ULONG_MAX )
         return -1;
+
+    /* raw_copy_to_guest -> copy_to_user_hvm -> __hvm_copy needs curr to
+     * point to the hvm/pvh vcpu. Hence for PVH dom0 we can't use that. For now
+     * just use dbg_rw_mem(). FIXME: this is kinda slow, enhance to copy
+     * more than one byte at a time. */
+    if ( is_pvh_dom0 ) 
+    {
+        int j, rem; 
+        rem = dbg_rw_mem((dbgva_t)dst, (dbgbyte_t *)src, (int)filesz, 0, 1, 0);
+        if ( rem ) {
+            printk("Failed to copy elf binary. len:%ld rem:%d\n", filesz, rem);
+            return -1;
+        }
+        for (j=0; j < memsz - filesz; j++) {
+            unsigned char zero=0;
+            rem = dbg_rw_mem((dbgva_t)(dst+filesz+j), &zero, 1, 0, 1, 0);
+            if (rem) {
+                kdbp("Failed to copy to: %lx rem:%d\n", dst+filesz+j, rem);
+                return -1;
+            }
+        }
+        return 0;
+    }
     rc = raw_copy_to_guest(dst, src, filesz);
     if ( rc != 0 )
         return -1;
@@ -260,7 +289,9 @@ void elf_parse_binary(struct elf_binary 
             __FUNCTION__, elf->pstart, elf->pend);
 }
 
-int elf_load_binary(struct elf_binary *elf)
+/* This function is called for dom0 and also from the libraries when
+ * building guests */
+static int _elf_load_binary(struct elf_binary *elf, int is_pvh_dom0)
 {
     const elf_phdr *phdr;
     uint64_t i, count, paddr, offset, filesz, memsz;
@@ -279,7 +310,8 @@ int elf_load_binary(struct elf_binary *e
         dest = elf_get_ptr(elf, paddr);
         elf_msg(elf, "%s: phdr %" PRIu64 " at 0x%p -> 0x%p\n",
                 __func__, i, dest, dest + filesz);
-        if ( elf_load_image(dest, elf->image + offset, filesz, memsz) != 0 )
+        if ( elf_load_image(dest, elf->image + offset, filesz, memsz, 
+                            is_pvh_dom0) != 0 )
             return -1;
     }
 
@@ -287,6 +319,18 @@ int elf_load_binary(struct elf_binary *e
     return 0;
 }
 
+#ifdef __XEN__
+int elf_load_binary(struct elf_binary *elf, int is_pvh_dom0)
+{
+    return _elf_load_binary(elf, is_pvh_dom0);
+}
+#else
+int elf_load_binary(struct elf_binary *elf)
+{
+    return _elf_load_binary(elf, 0);
+}
+#endif
+
 void *elf_get_ptr(struct elf_binary *elf, unsigned long addr)
 {
     return elf->dest + addr - elf->pstart;
diff -r 8b0762504037 xen/common/memory.c
--- a/xen/common/memory.c       Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/common/memory.c       Wed Nov 14 11:03:24 2012 -0800
@@ -653,6 +653,8 @@ long do_memory_op(unsigned long cmd, XEN
         struct xen_remove_from_physmap xrfp;
         struct page_info *page;
         struct domain *d;
+        p2m_type_t p2mt;
+        int is_curr_pvh = is_pvh_vcpu(current);
 
         if ( copy_from_guest(&xrfp, arg, 1) )
             return -EFAULT;
@@ -669,14 +671,22 @@ long do_memory_op(unsigned long cmd, XEN
 
         domain_lock(d);
 
-        page = get_page_from_gfn(d, xrfp.gpfn, NULL, P2M_ALLOC);
-        if ( page )
+        page = get_page_from_gfn(d, xrfp.gpfn, &p2mt, P2M_ALLOC);
+        if ( page || 
+             (is_curr_pvh && (p2m_is_mmio(p2mt) || p2m_is_foreign(p2mt))) )
         {
-            guest_physmap_remove_page(d, xrfp.gpfn, page_to_mfn(page), 0);
-            put_page(page);
+            unsigned long argmfn = page ? page_to_mfn(page) : INVALID_MFN;
+            guest_physmap_remove_page(d, xrfp.gpfn, argmfn, 0);
+            if (page)
+                put_page(page);
         }
-        else
+        else 
+        {
+            if ( is_curr_pvh )
+                gdprintk(XENLOG_WARNING, "%s: Domain:%u gmfn:%lx invalid\n",
+                         __FUNCTION__, current->domain->domain_id, xrfp.gpfn);
             rc = -ENOENT;
+        }
 
         domain_unlock(d);
 
diff -r 8b0762504037 xen/drivers/passthrough/iommu.c
--- a/xen/drivers/passthrough/iommu.c   Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/drivers/passthrough/iommu.c   Wed Nov 14 11:03:24 2012 -0800
@@ -120,15 +120,25 @@ int iommu_domain_init(struct domain *d)
     return hd->platform_ops->init(d);
 }
 
+static inline void check_dom0_pvh_reqs(struct domain *d)
+{
+    if (!iommu_enabled || iommu_passthrough)
+        panic("For pvh dom0, iommu must be enabled, dom0-passthrough must "
+              "not be enabled \n");
+}
+
 void __init iommu_dom0_init(struct domain *d)
 {
     struct hvm_iommu *hd = domain_hvm_iommu(d);
 
+    if ( is_pvh_domain(d) )
+        check_dom0_pvh_reqs(d);
+
     if ( !iommu_enabled )
         return;
 
     register_keyhandler('o', &iommu_p2m_table);
-    d->need_iommu = !!iommu_dom0_strict;
+    d->need_iommu = is_pvh_domain(d) || !!iommu_dom0_strict;
     if ( need_iommu(d) )
     {
         struct page_info *page;
@@ -141,7 +151,10 @@ void __init iommu_dom0_init(struct domai
                  ((page->u.inuse.type_info & PGT_type_mask)
                   == PGT_writable_page) )
                 mapping |= IOMMUF_writable;
-            hd->platform_ops->map_page(d, mfn, mfn, mapping);
+            if ( is_pvh_domain(d) )
+                hd->platform_ops->map_page(d, mfn_to_gfn(d, mfn), mfn, mapping);
+            else
+                hd->platform_ops->map_page(d, mfn, mfn, mapping);
             if ( !(i++ & 0xfffff) )
                 process_pending_softirqs();
         }
diff -r 8b0762504037 xen/include/asm-x86/desc.h
--- a/xen/include/asm-x86/desc.h        Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/asm-x86/desc.h        Wed Nov 14 11:03:24 2012 -0800
@@ -38,7 +38,8 @@
 
 #ifndef __ASSEMBLY__
 
-#define GUEST_KERNEL_RPL(d) (is_pv_32bit_domain(d) ? 1 : 3)
+#define GUEST_KERNEL_RPL(d) (is_pvh_domain(d) ? 0 : \
+                                                is_pv_32bit_domain(d) ? 1 : 3)
 
 /* Fix up the RPL of a guest segment selector. */
 #define __fixup_guest_selector(d, sel)                             \
diff -r 8b0762504037 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/asm-x86/domain.h      Wed Nov 14 11:03:24 2012 -0800
@@ -16,7 +16,7 @@
 #define is_pv_32on64_domain(d) (is_pv_32bit_domain(d))
 #define is_pv_32on64_vcpu(v)   (is_pv_32on64_domain((v)->domain))
 
-#define is_hvm_pv_evtchn_domain(d) (is_hvm_domain(d) && \
+#define is_hvm_pv_evtchn_domain(d) (is_hvm_or_pvh_domain(d) && \
         d->arch.hvm_domain.irq.callback_via_type == HVMIRQ_callback_vector)
 #define is_hvm_pv_evtchn_vcpu(v) (is_hvm_pv_evtchn_domain(v->domain))
 
@@ -252,10 +252,10 @@ struct arch_domain
 
     struct list_head pdev_list;
 
-    union {
-        struct pv_domain pv_domain;
-        struct hvm_domain hvm_domain;
-    };
+/* PVH fixme: pvh uses fields from both pv and hvm, so separate the union
+ * for now. make a separate pvh struct and put in HVM */
+    struct pv_domain pv_domain;
+    struct hvm_domain hvm_domain;
 
     struct paging_domain paging;
     struct p2m_domain *p2m;
@@ -396,10 +396,14 @@ struct arch_vcpu
     void (*ctxt_switch_to) (struct vcpu *);
 
     /* Virtual Machine Extensions */
+#if 0
     union {
+#endif
         struct pv_vcpu pv_vcpu;
         struct hvm_vcpu hvm_vcpu;
+#if 0
     };
+#endif
 
     /*
      * Every domain has a L1 pagetable of its own. Per-domain mappings
diff -r 8b0762504037 xen/include/asm-x86/event.h
--- a/xen/include/asm-x86/event.h       Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/asm-x86/event.h       Wed Nov 14 11:03:24 2012 -0800
@@ -18,7 +18,7 @@ int hvm_local_events_need_delivery(struc
 static inline int local_events_need_delivery(void)
 {
     struct vcpu *v = current;
-    return (is_hvm_vcpu(v) ? hvm_local_events_need_delivery(v) :
+    return (is_hvm_or_pvh_vcpu(v) ? hvm_local_events_need_delivery(v) :
             (vcpu_info(v, evtchn_upcall_pending) &&
              !vcpu_info(v, evtchn_upcall_mask)));
 }
diff -r 8b0762504037 xen/include/asm-x86/guest_access.h
--- a/xen/include/asm-x86/guest_access.h        Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/asm-x86/guest_access.h        Wed Nov 14 11:03:24 2012 -0800
@@ -14,27 +14,27 @@
 
 /* Raw access functions: no type checking. */
 #define raw_copy_to_guest(dst, src, len)        \
-    (is_hvm_vcpu(current) ?                     \
+    (is_hvm_or_pvh_vcpu(current) ?                     \
      copy_to_user_hvm((dst), (src), (len)) :    \
      copy_to_user((dst), (src), (len)))
 #define raw_copy_from_guest(dst, src, len)      \
-    (is_hvm_vcpu(current) ?                     \
+    (is_hvm_or_pvh_vcpu(current) ?                     \
      copy_from_user_hvm((dst), (src), (len)) :  \
      copy_from_user((dst), (src), (len)))
 #define raw_clear_guest(dst,  len)              \
-    (is_hvm_vcpu(current) ?                     \
+    (is_hvm_or_pvh_vcpu(current) ?                     \
      clear_user_hvm((dst), (len)) :             \
      clear_user((dst), (len)))
 #define __raw_copy_to_guest(dst, src, len)      \
-    (is_hvm_vcpu(current) ?                     \
+    (is_hvm_or_pvh_vcpu(current) ?                     \
      copy_to_user_hvm((dst), (src), (len)) :    \
      __copy_to_user((dst), (src), (len)))
 #define __raw_copy_from_guest(dst, src, len)    \
-    (is_hvm_vcpu(current) ?                     \
+    (is_hvm_or_pvh_vcpu(current) ?                     \
      copy_from_user_hvm((dst), (src), (len)) :  \
      __copy_from_user((dst), (src), (len)))
 #define __raw_clear_guest(dst,  len)            \
-    (is_hvm_vcpu(current) ?                     \
+    (is_hvm_or_pvh_vcpu(current) ?                     \
      clear_user_hvm((dst), (len)) :             \
      clear_user((dst), (len)))
 
diff -r 8b0762504037 xen/include/asm-x86/hap.h
--- a/xen/include/asm-x86/hap.h Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/asm-x86/hap.h Wed Nov 14 11:03:24 2012 -0800
@@ -63,6 +63,7 @@ int   hap_track_dirty_vram(struct domain
                            XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
 
 extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
+void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages);
 
 #endif /* XEN_HAP_H */
 
diff -r 8b0762504037 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h     Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/asm-x86/hvm/hvm.h     Wed Nov 14 11:03:24 2012 -0800
@@ -183,6 +183,13 @@ struct hvm_function_table {
     /* Virtual interrupt delivery */
     void (*update_eoi_exit_bitmap)(struct vcpu *v, u8 vector, u8 trig);
     int (*virtual_intr_delivery_enabled)(void);
+
+    /* PVH functions */
+    void (*pvh_update_cr3)(struct vcpu *v);
+    int (*pvh_set_vcpu_info)(struct vcpu *v, struct vcpu_guest_context *ctxtp);
+    int (*pvh_read_descriptor)(unsigned int sel, const struct vcpu *v,
+                         const struct cpu_user_regs *regs, unsigned long *base,
+                         unsigned long *limit, unsigned int *ar);
 };
 
 extern struct hvm_function_table hvm_funcs;
@@ -316,6 +323,24 @@ static inline unsigned long hvm_get_shad
     return hvm_funcs.get_shadow_gs_base(v);
 }
 
+static inline void hvm_pvh_update_cr3(struct vcpu *v)
+{
+    hvm_funcs.pvh_update_cr3(v);
+}
+
+static inline int hvm_pvh_set_vcpu_info(struct vcpu *v, 
+                                        struct vcpu_guest_context *ctxtp)
+{
+    return hvm_funcs.pvh_set_vcpu_info(v, ctxtp);
+}
+
+static inline int hvm_pvh_read_descriptor(unsigned int sel, 
+               const struct vcpu *v, const struct cpu_user_regs *regs, 
+               unsigned long *base, unsigned long *limit, unsigned int *ar)
+{
+    return hvm_funcs.pvh_read_descriptor(sel, v, regs, base, limit, ar);
+}
+
 #define is_viridian_domain(_d)                                             \
  (is_hvm_domain(_d) && ((_d)->arch.hvm_domain.params[HVM_PARAM_VIRIDIAN]))
 
diff -r 8b0762504037 xen/include/asm-x86/hvm/vcpu.h
--- a/xen/include/asm-x86/hvm/vcpu.h    Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/asm-x86/hvm/vcpu.h    Wed Nov 14 11:03:24 2012 -0800
@@ -104,6 +104,17 @@ struct nestedvcpu {
 
 #define vcpu_nestedhvm(v) ((v)->arch.hvm_vcpu.nvcpu)
 
+struct pvh_hvm_vcpu_ext
+{
+    /* I/O-port access bitmap. */
+    XEN_GUEST_HANDLE(uint8) iobmp;  /* Guest kernel vaddr of the bitmap. */
+    unsigned int pvh_iobmp_limit;   /* Number of ports in the bitmap. */
+    unsigned int pvh_iopl;          /* Current IOPL for this VCPU. */
+
+    /* Guest-specified relocation of vcpu_info. */
+    unsigned long pvh_vcpu_info_mfn;
+};
+
 struct hvm_vcpu {
     /* Guest control-register and EFER values, just as the guest sees them. */
     unsigned long       guest_cr[5];
@@ -170,6 +181,8 @@ struct hvm_vcpu {
     struct hvm_trap     inject_trap;
 
     struct viridian_vcpu viridian;
+
+    struct pvh_hvm_vcpu_ext hv_pvh;
 };
 
 #endif /* __ASM_X86_HVM_VCPU_H__ */
diff -r 8b0762504037 xen/include/asm-x86/hvm/vmx/pvh.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-x86/hvm/vmx/pvh.h Wed Nov 14 11:03:24 2012 -0800
@@ -0,0 +1,11 @@
+#ifndef __ASM_X86_HVM_VMX_PVH_H__
+#define __ASM_X86_HVM_VMX_PVH_H__
+    
+void vmx_pvh_update_cr3(struct vcpu *v);
+int vmx_pvh_set_vcpu_info(struct vcpu *v, struct vcpu_guest_context *ctxtp);
+int vmx_pvh_read_descriptor(unsigned int sel, const struct vcpu *v,
+                         const struct cpu_user_regs *regs, unsigned long *base,
+                         unsigned long *limit, unsigned int *ar);
+#endif /* __ASM_X86_HVM_VMX_PVH_H__ */
+
+
diff -r 8b0762504037 xen/include/asm-x86/hvm/vmx/vmcs.h
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h        Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h        Wed Nov 14 11:03:24 2012 -0800
@@ -413,6 +413,7 @@ int vmx_write_guest_msr(u32 msr, u64 val
 int vmx_add_guest_msr(u32 msr);
 int vmx_add_host_load_msr(u32 msr);
 void vmx_vmcs_switch(struct vmcs_struct *from, struct vmcs_struct *to);
+void vmx_fpu_enter(struct vcpu *v);
 void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector);
 void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 vector);
 
diff -r 8b0762504037 xen/include/asm-x86/hvm/vmx/vmx.h
--- a/xen/include/asm-x86/hvm/vmx/vmx.h Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h Wed Nov 14 11:03:24 2012 -0800
@@ -156,11 +156,28 @@ void vmx_update_cpu_exec_control(struct 
 # define VMX_CONTROL_REG_ACCESS_TYPE_LMSW        3
  /* 10:8 - general purpose register operand */
 #define VMX_CONTROL_REG_ACCESS_GPR(eq)  (((eq) >> 8) & 0xf)
+#define VMX_CONTROL_REG_ACCESS_GPR_EAX  0
+#define VMX_CONTROL_REG_ACCESS_GPR_ECX  1
+#define VMX_CONTROL_REG_ACCESS_GPR_EDX  2
+#define VMX_CONTROL_REG_ACCESS_GPR_EBX  3
+#define VMX_CONTROL_REG_ACCESS_GPR_ESP  4
+#define VMX_CONTROL_REG_ACCESS_GPR_EBP  5
+#define VMX_CONTROL_REG_ACCESS_GPR_ESI  6
+#define VMX_CONTROL_REG_ACCESS_GPR_EDI  7
+#define VMX_CONTROL_REG_ACCESS_GPR_R8   8
+#define VMX_CONTROL_REG_ACCESS_GPR_R9   9
+#define VMX_CONTROL_REG_ACCESS_GPR_R10  10
+#define VMX_CONTROL_REG_ACCESS_GPR_R11  11
+#define VMX_CONTROL_REG_ACCESS_GPR_R12  12
+#define VMX_CONTROL_REG_ACCESS_GPR_R13  13
+#define VMX_CONTROL_REG_ACCESS_GPR_R14  14
+#define VMX_CONTROL_REG_ACCESS_GPR_R15  15
 
 /*
  * Access Rights
  */
 #define X86_SEG_AR_SEG_TYPE     0xf        /* 3:0, segment type */
+#define X86_SEG_AR_SEG_TYPE_CODE (1u << 3) /* code (vs data) segment */
 #define X86_SEG_AR_DESC_TYPE    (1u << 4)  /* 4, descriptor type */
 #define X86_SEG_AR_DPL          0x60       /* 6:5, descriptor privilege level */
 #define X86_SEG_AR_SEG_PRESENT  (1u << 7)  /* 7, segment present */
@@ -389,6 +406,26 @@ static inline int __vmxon(u64 addr)
     return rc;
 }
 
+static inline unsigned long vmr(unsigned long field)
+{
+    int rc;
+    unsigned long val;
+    val = __vmread_safe(field, &rc);
+    return rc ? 0 : val;
+}
+
+/*
+ * Not all cases receive valid value in the VM-exit instruction length field.
+ * Callers must know what they're doing!
+ */
+static inline int get_instruction_length(void)
+{
+    int len;
+    len = __vmread(VM_EXIT_INSTRUCTION_LEN); /* Safe: callers audited */
+    BUG_ON((len < 1) || (len > 15));
+    return len;
+}
+
 void vmx_get_segment_register(struct vcpu *, enum x86_segment,
                               struct segment_register *);
 void vmx_inject_extint(int trap);
@@ -398,6 +435,11 @@ void ept_p2m_init(struct p2m_domain *p2m
 void ept_walk_table(struct domain *d, unsigned long gfn);
 void setup_ept_dump(void);
 
+void update_guest_eip(void);
+void pvh_vmx_vmexit_handler(struct cpu_user_regs *regs);
+void vmx_dr_access(unsigned long exit_qualification, struct cpu_user_regs *regs);
+void vmx_do_extint(struct cpu_user_regs *regs);
+
 /* EPT violation qualifications definitions */
 #define _EPT_READ_VIOLATION         0
 #define EPT_READ_VIOLATION          (1UL<<_EPT_READ_VIOLATION)
diff -r 8b0762504037 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/asm-x86/p2m.h Wed Nov 14 11:03:24 2012 -0800
@@ -70,6 +70,7 @@ typedef enum {
     p2m_ram_paging_in = 11,       /* Memory that is being paged in */
     p2m_ram_shared = 12,          /* Shared or sharable memory */
     p2m_ram_broken = 13,          /* Broken page, access cause domain crash */
+    p2m_map_foreign  = 14,        /* ram pages from foreign domain */
 } p2m_type_t;
 
 /*
@@ -180,6 +181,7 @@ typedef unsigned int p2m_query_t;
 #define p2m_is_sharable(_t) (p2m_to_mask(_t) & P2M_SHARABLE_TYPES)
 #define p2m_is_shared(_t)   (p2m_to_mask(_t) & P2M_SHARED_TYPES)
 #define p2m_is_broken(_t)   (p2m_to_mask(_t) & P2M_BROKEN_TYPES)
+#define p2m_is_foreign(_t)  (p2m_to_mask(_t) & p2m_to_mask(p2m_map_foreign))
 
 /* Per-p2m-table state */
 struct p2m_domain {
@@ -506,6 +508,8 @@ p2m_type_t p2m_change_type(struct domain
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
 int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn);
 
+/* Set foreign mfn in the current guest's p2m table (for pvh dom0) */
+int set_foreign_p2m_entry(struct domain *domp, unsigned long gfn, mfn_t mfn);
 
 /* 
  * Populate-on-demand
diff -r 8b0762504037 xen/include/asm-x86/processor.h
--- a/xen/include/asm-x86/processor.h   Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/asm-x86/processor.h   Wed Nov 14 11:03:24 2012 -0800
@@ -545,12 +545,14 @@ extern int hypercall(void);
 
 int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
           uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
+void pv_cpuid(struct cpu_user_regs *regs);
 int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val);
 int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val);
 
 void microcode_set_module(unsigned int);
 int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void), unsigned long len);
 int microcode_resume_cpu(int cpu);
+int emulate_forced_invalid_op(struct cpu_user_regs *regs);
 
 #endif /* !__ASSEMBLY__ */
 
diff -r 8b0762504037 xen/include/asm-x86/system.h
--- a/xen/include/asm-x86/system.h      Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/asm-x86/system.h      Wed Nov 14 11:03:24 2012 -0800
@@ -4,9 +4,15 @@
 #include <xen/lib.h>
 #include <asm/bitops.h>
 
-#define read_segment_register(name)                             \
+/* We need the vcpu argument because during a context switch from pure pv to pvh,
+ * in save_segments(), current has already been updated to next and no longer
+ * points to the pure pv vcpu. Btw, for pvh we update regs->selectors on each vmexit */
+#define read_segment_register(vcpu, regs, name)                 \
 ({  u16 __sel;                                                  \
-    asm volatile ( "movw %%" STR(name) ",%0" : "=r" (__sel) );  \
+    if (is_pvh_vcpu(vcpu))                                      \
+        __sel = regs->name;                                     \
+    else                                                         \
+        asm volatile ( "movw %%" STR(name) ",%0" : "=r" (__sel) );  \
     __sel;                                                      \
 })
 
diff -r 8b0762504037 xen/include/asm-x86/traps.h
--- a/xen/include/asm-x86/traps.h       Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/asm-x86/traps.h       Wed Nov 14 11:03:24 2012 -0800
@@ -48,5 +48,6 @@ extern int guest_has_trap_callback(struc
  */
 extern int send_guest_trap(struct domain *d, uint16_t vcpuid,
                                unsigned int trap_nr);
+int emulate_privileged_op(struct cpu_user_regs *regs);
 
 #endif /* ASM_TRAP_H */
diff -r 8b0762504037 xen/include/asm-x86/x86_64/regs.h
--- a/xen/include/asm-x86/x86_64/regs.h Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/asm-x86/x86_64/regs.h Wed Nov 14 11:03:24 2012 -0800
@@ -11,9 +11,10 @@
 #define ring_3(r)    (((r)->cs & 3) == 3)
 
 #define guest_kernel_mode(v, r)                                 \
+   ( is_pvh_vcpu(v) ? ({BUG_ON(v!=current); ring_0(r);}) :  \
     (!is_pv_32bit_vcpu(v) ?                                     \
      (ring_3(r) && ((v)->arch.flags & TF_kernel_mode)) :        \
-     (ring_1(r)))
+     (ring_1(r))) )
 
 #define permit_softint(dpl, v, r) \
     ((dpl) >= (guest_kernel_mode(v, r) ? 1 : 3))
diff -r 8b0762504037 xen/include/public/arch-x86/xen.h
--- a/xen/include/public/arch-x86/xen.h Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/public/arch-x86/xen.h Wed Nov 14 11:03:24 2012 -0800
@@ -157,7 +157,16 @@ struct vcpu_guest_context {
     struct cpu_user_regs user_regs;         /* User-level CPU registers     */
     struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
     unsigned long ldt_base, ldt_ents;       /* LDT (linear address, # ents) */
-    unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
+    union {
+        struct {
+            /* GDT (machine frames, # ents) */
+            unsigned long gdt_frames[16], gdt_ents;
+        } pv;
+        struct {
+            /* PVH: GDTR addr and size */   
+            unsigned long gdtaddr, gdtsz;
+        } pvh;
+    } u;
     unsigned long kernel_ss, kernel_sp;     /* Virtual TSS (only SS1/SP1)   */
     /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
     unsigned long ctrlreg[8];               /* CR0-CR7 (control registers)  */
diff -r 8b0762504037 xen/include/public/domctl.h
--- a/xen/include/public/domctl.h       Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/public/domctl.h       Wed Nov 14 11:03:24 2012 -0800
@@ -59,6 +59,10 @@ struct xen_domctl_createdomain {
  /* Disable out-of-sync shadow page tables? */
 #define _XEN_DOMCTL_CDF_oos_off       3
 #define XEN_DOMCTL_CDF_oos_off        (1U<<_XEN_DOMCTL_CDF_oos_off)
+ /* Is this a PV guest in HVM container, aka, a pvh guest? */
+ #define _XEN_DOMCTL_CDF_pvh_guest    4
+ #define XEN_DOMCTL_CDF_pvh_guest     (1U<<_XEN_DOMCTL_CDF_pvh_guest)
+
     uint32_t flags;
 };
 typedef struct xen_domctl_createdomain xen_domctl_createdomain_t;
@@ -89,6 +93,10 @@ struct xen_domctl_getdomaininfo {
  /* Being debugged.  */
 #define _XEN_DOMINF_debugged  6
 #define XEN_DOMINF_debugged   (1U<<_XEN_DOMINF_debugged)
+ /* domain is PVH */
+#define _XEN_DOMINF_pvh_guest 7
+#define XEN_DOMINF_pvh_guest   (1U<<_XEN_DOMINF_pvh_guest)
+
  /* XEN_DOMINF_shutdown guest-supplied code.  */
 #define XEN_DOMINF_shutdownmask 255
 #define XEN_DOMINF_shutdownshift 16
diff -r 8b0762504037 xen/include/public/physdev.h
--- a/xen/include/public/physdev.h      Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/public/physdev.h      Wed Nov 14 11:03:24 2012 -0800
@@ -330,6 +330,19 @@ struct physdev_dbgp_op {
 typedef struct physdev_dbgp_op physdev_dbgp_op_t;
 DEFINE_XEN_GUEST_HANDLE(physdev_dbgp_op_t);
 
+ 
+#define PHYSDEVOP_pvh_map_iomem        30
+struct physdev_map_iomem {
+    /* IN */
+    unsigned long first_gfn;
+    unsigned long first_mfn;
+    unsigned int nr_mfns;
+    unsigned int add_mapping;        /* 1 == add mapping;  0 == unmap */
+
+};
+typedef struct physdev_map_iomem physdev_map_iomem_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_map_iomem_t);
+
 /*
  * Notify that some PIRQ-bound event channels have been unmasked.
  * ** This command is obsolete since interface version 0x00030202 and is **
diff -r 8b0762504037 xen/include/public/xen.h
--- a/xen/include/public/xen.h  Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/public/xen.h  Wed Nov 14 11:03:24 2012 -0800
@@ -723,6 +723,7 @@ typedef struct start_info start_info_t;
 #define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
 #define SIF_MULTIBOOT_MOD (1<<2)  /* Is mod_start a multiboot module? */
 #define SIF_MOD_START_PFN (1<<3)  /* Is mod_start a PFN? */
+#define SIF_IS_PVH        (1<<4)  /* Is it a PVH guest? */
 #define SIF_PM_MASK       (0xFF<<8) /* reserve 1 byte for xen-pm options */
 
 /*
diff -r 8b0762504037 xen/include/xen/domain.h
--- a/xen/include/xen/domain.h  Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/xen/domain.h  Wed Nov 14 11:03:24 2012 -0800
@@ -86,4 +86,7 @@ extern unsigned int xen_processor_pmbits
 
 extern bool_t opt_dom0_vcpus_pin;
 
+extern long domctl_memory_mapping(struct domain *d, unsigned long gfn,
+                    unsigned long mfn, unsigned long nr_mfns, int add_map);
+
 #endif /* __XEN_DOMAIN_H__ */
diff -r 8b0762504037 xen/include/xen/lib.h
--- a/xen/include/xen/lib.h     Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/xen/lib.h     Wed Nov 14 11:03:24 2012 -0800
@@ -45,6 +45,10 @@ do {                                    
 #define ASSERT(p) do { if ( 0 && (p) ); } while (0)
 #endif
 
+/* While PVH feature is experimental, make it an unconditional assert */
+#define PVH_ASSERT(p) \
+    do { if ( unlikely(!(p)) ) assert_failed(#p); } while (0)
+
 #define ABS(_x) ({                              \
     typeof(_x) __x = (_x);                      \
     (__x < 0) ? -__x : __x;                     \
diff -r 8b0762504037 xen/include/xen/libelf.h
--- a/xen/include/xen/libelf.h  Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/xen/libelf.h  Wed Nov 14 11:03:24 2012 -0800
@@ -192,13 +192,14 @@ int elf_phdr_is_loadable(struct elf_bina
 int elf_init(struct elf_binary *elf, const char *image, size_t size);
 #ifdef __XEN__
 void elf_set_verbose(struct elf_binary *elf);
+int elf_load_binary(struct elf_binary *elf, int is_pvh_dom0);
 #else
 void elf_set_log(struct elf_binary *elf, elf_log_callback*,
                  void *log_caller_pointer, int verbose);
+int elf_load_binary(struct elf_binary *elf);
 #endif
 
 void elf_parse_binary(struct elf_binary *elf);
-int elf_load_binary(struct elf_binary *elf);
 
 void *elf_get_ptr(struct elf_binary *elf, unsigned long addr);
 uint64_t elf_lookup_addr(struct elf_binary *elf, const char *symbol);
diff -r 8b0762504037 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/include/xen/sched.h   Wed Nov 14 11:03:24 2012 -0800
@@ -230,6 +230,9 @@ struct mem_event_per_domain
     struct mem_event_domain access;
 };
 
+/* PVH: a PV guest running in an HVM container. is_hvm is false for it,
+ *      but it uses a few of the HVM data structs.
+ */
 struct domain
 {
     domid_t          domain_id;
@@ -278,6 +281,7 @@ struct domain
 
     /* Is this an HVM guest? */
     bool_t           is_hvm;
+    bool_t           is_pvh;      /* see above for description */
 #ifdef HAS_PASSTHROUGH
     /* Does this guest need iommu mappings? */
     bool_t           need_iommu;
@@ -449,6 +453,10 @@ struct domain *domain_create(
  /* DOMCRF_oos_off: dont use out-of-sync optimization for shadow page tables */
 #define _DOMCRF_oos_off         4
 #define DOMCRF_oos_off          (1U<<_DOMCRF_oos_off)
+ /* DOMCRF_pvh: Create PV domain in HVM container */
+#define _DOMCRF_pvh            5
+#define DOMCRF_pvh             (1U<<_DOMCRF_pvh)
+
 
 /*
  * rcu_lock_domain_by_id() is more efficient than get_domain_by_id().
@@ -716,8 +724,12 @@ void watchdog_domain_destroy(struct doma
 
 #define is_hvm_domain(d) ((d)->is_hvm)
 #define is_hvm_vcpu(v)   (is_hvm_domain(v->domain))
+#define is_pvh_domain(d) ((d)->is_pvh)
+#define is_pvh_vcpu(v)   (is_pvh_domain(v->domain))
 #define is_pinned_vcpu(v) ((v)->domain->is_pinned || \
                            cpumask_weight((v)->cpu_affinity) == 1)
+#define is_hvm_or_pvh_domain(d) (is_hvm_domain(d) || is_pvh_domain(d))
+#define is_hvm_or_pvh_vcpu(v) (is_hvm_or_pvh_domain(v->domain))
 #ifdef HAS_PASSTHROUGH
 #define need_iommu(d)    ((d)->need_iommu)
 #else
diff -r 8b0762504037 xen/kdb/include/kdb_extern.h
--- a/xen/kdb/include/kdb_extern.h      Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/kdb/include/kdb_extern.h      Wed Nov 14 11:03:24 2012 -0800
@@ -59,8 +59,5 @@ static inline void __vmptrst(u64 *addr)
 }
 
 extern void mukchk(unsigned long);
-#define is_hvm_or_hyb_domain is_hvm_domain
-#define is_hvm_or_hyb_vcpu is_hvm_vcpu
-
 
 #endif  /* _KDB_EXTERN_H */
diff -r 8b0762504037 xen/kdb/kdb_cmds.c
--- a/xen/kdb/kdb_cmds.c        Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/kdb/kdb_cmds.c        Wed Nov 14 11:03:24 2012 -0800
@@ -317,7 +317,7 @@ kdb_guest_bitness(domid_t domid)
 
     if (is_idle_domain(dp))
         retval = HYPSZ;
-    else if (is_hvm_or_hyb_domain(dp))
+    else if (is_hvm_or_pvh_domain(dp))
         retval = (hvm_long_mode_enabled(dp->vcpu[0])) ? HYPSZ : 32;
     else 
         retval = is_pv_32bit_domain(dp) ? 32 : HYPSZ;
@@ -977,7 +977,7 @@ kdb_cmdf_ss(int argc, const char **argv,
         kdbp("kdb: Failed to read byte at: %lx\n", regs->KDBIP);
         return KDB_CPU_MAIN_KDB;
     }
-    if (guest_mode(regs) && is_hvm_or_hyb_vcpu(current)) {
+    if (guest_mode(regs) && is_hvm_or_pvh_vcpu(current)) {
         dp->debugger_attached = 1;  /* see svm_do_resume/vmx_do_ */
         current->arch.hvm_vcpu.single_step = 1;
     } else
@@ -1016,7 +1016,7 @@ kdb_cmdf_ni(int argc, const char **argv,
         return KDB_CPU_MAIN_KDB;
 
     kdb_sbpa[i].bp_ni = 1;
-    if (guest_mode(regs) && is_hvm_or_hyb_vcpu(current))
+    if (guest_mode(regs) && is_hvm_or_pvh_vcpu(current))
         current->arch.hvm_vcpu.single_step = 0;
     else
         regs->eflags &= ~X86_EFLAGS_TF;
@@ -1052,7 +1052,7 @@ kdb_cmdf_ssb(int argc, const char **argv
         kdbp("%s: regs not available\n", __FUNCTION__);
         return KDB_CPU_MAIN_KDB;
     }
-    if (is_hvm_or_hyb_vcpu(current)) 
+    if (is_hvm_or_pvh_vcpu(current)) 
         current->domain->debugger_attached = 1;        /* vmx/svm_do_resume()*/
 
     regs->eflags |= X86_EFLAGS_TF;
@@ -1655,7 +1655,7 @@ kdb_set_bp(domid_t domid, kdbva_t addr, 
         return KDBMAXSBP;
     }
     /* make sure swbp reporting is enabled in the vmcb/vmcs */
-    if (is_hvm_or_hyb_domain(kdb_domid2ptr(domid))) {
+    if (is_hvm_or_pvh_domain(kdb_domid2ptr(domid))) {
         struct domain *dp = kdb_domid2ptr(domid);
         dp->debugger_attached = 1;              /* see svm_do_resume/vmx_do_ */
         KDBGP("debugger_attached set. domid:%d\n", domid);
@@ -1708,7 +1708,7 @@ kdb_cmdf_bp(int argc, const char **argv,
     if (domidstrp && !kdb_str2domid(domidstrp, &domid, 1)) {
         return kdb_usgf_bp();
     }
-    if (argc > 3 && is_hvm_or_hyb_domain(kdb_domid2ptr(domid))) {
+    if (argc > 3 && is_hvm_or_pvh_domain(kdb_domid2ptr(domid))) {
         kdbp("HVM domain not supported yet for conditional bp\n");
         return KDB_CPU_MAIN_KDB;
     }
@@ -1756,7 +1756,7 @@ kdb_cmdf_btp(int argc, const char **argv
     argsidx = 2;                   /* assume 3rd arg is not domid */
     if (argc > 3 && kdb_str2domid(argv[2], &domid, 0)) {
 
-        if (is_hvm_or_hyb_domain(kdb_domid2ptr(domid))) {
+        if (is_hvm_or_pvh_domain(kdb_domid2ptr(domid))) {
             kdbp("HVM domains are not currently supprted\n");
             return KDB_CPU_MAIN_KDB;
         } else
@@ -1889,6 +1889,73 @@ kdb_cmdf_wc(int argc, const char **argv,
     return KDB_CPU_MAIN_KDB;
 }
 
+static void
+kdb_display_hvm_vcpu(struct vcpu *vp)
+{
+    struct hvm_vcpu *hvp;
+    struct vlapic *vlp;
+    struct hvm_io_op *ioop;
+
+    hvp = &vp->arch.hvm_vcpu;
+    vlp = &hvp->vlapic;
+    kdbp("vcpu:%lx id:%d domid:%d\n", vp, vp->vcpu_id, vp->domain->domain_id);
+
+    if (is_pvh_vcpu(vp)) {
+        struct pvh_hvm_vcpu_ext *hp = &hvp->hv_pvh;
+        kdbp("    &pvh_ext:%p limit:%x iopl:%x vcpu_info_mfn:%lx\n",
+             hp, hp->pvh_iobmp_limit, hp->pvh_iopl, hp->pvh_vcpu_info_mfn);
+    }
+
+    ioop = NULL;   /* compiler warning */
+    kdbp("    &hvm_vcpu:%lx  guest_efer:"KDBFL"\n", hvp, hvp->guest_efer);
+    kdbp("      guest_cr: [0]:"KDBFL" [1]:"KDBFL" [2]:"KDBFL"\n", 
+         hvp->guest_cr[0], hvp->guest_cr[1],hvp->guest_cr[2]);
+    kdbp("                [3]:"KDBFL" [4]:"KDBFL"\n", hvp->guest_cr[3],
+         hvp->guest_cr[4]);
+    kdbp("      hw_cr: [0]:"KDBFL" [1]:"KDBFL" [2]:"KDBFL"\n", hvp->hw_cr[0],
+         hvp->hw_cr[1], hvp->hw_cr[2]);
+    kdbp("              [3]:"KDBFL" [4]:"KDBFL"\n", hvp->hw_cr[3], 
+         hvp->hw_cr[4]);
+
+    kdbp("      VLAPIC: base msr:"KDBF64" dis:%x tmrdiv:%x\n", 
+         vlp->hw.apic_base_msr, vlp->hw.disabled, vlp->hw.timer_divisor);
+    kdbp("          regs:%p regs_page:%p\n", vlp->regs, vlp->regs_page);
+    kdbp("          periodic time:\n"); 
+    kdb_prnt_periodic_time(&vlp->pt);
+
+    kdbp("      xen_port:%x flag_dr_dirty:%x dbg_st_latch:%x\n", hvp->xen_port,
+         hvp->flag_dr_dirty, hvp->debug_state_latch);
+
+    if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+
+        struct arch_vmx_struct *vxp = &hvp->u.vmx;
+        kdbp("      &vmx: %p vmcs:%lx active_cpu:%x launched:%x\n", vxp, 
+             vxp->vmcs, vxp->active_cpu, vxp->launched);
+#if XEN_VERSION != 4               /* xen 3.x.x */
+        kdbp("        exec_ctrl:%x vpid:$%d\n", vxp->exec_control, vxp->vpid);
+#endif
+        kdbp("        host_cr0: "KDBFL" vmx: {realm:%x emulate:%x}\n",
+             vxp->host_cr0, vxp->vmx_realmode, vxp->vmx_emulate);
+
+#ifdef __x86_64__
+        kdbp("        &msr_state:%p exception_bitmap:%lx\n", &vxp->msr_state,
+             vxp->exception_bitmap);
+#endif
+    } else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+        struct arch_svm_struct *svp = &hvp->u.svm;
+#if XEN_VERSION != 4               /* xen 3.x.x */
+        kdbp("  &svm: vmcb:%lx pa:"KDBF64" asid:"KDBF64"\n", svp, svp->vmcb,
+             svp->vmcb_pa, svp->asid_generation);
+#endif
+        kdbp("    msrpm:%p lnch_core:%x vmcb_sync:%x\n", svp->msrpm, 
+             svp->launch_core, svp->vmcb_in_sync);
+    }
+    kdbp("      cachemode:%x io: {state: %x data: "KDBFL"}\n", hvp->cache_mode,
+         hvp->hvm_io.io_state, hvp->hvm_io.io_data);
+    kdbp("      mmio: {gva: "KDBFL" gpfn: "KDBFL"}\n", hvp->hvm_io.mmio_gva,
+         hvp->hvm_io.mmio_gpfn);
+}
+
 /* display struct hvm_vcpu{} in struct vcpu.arch{} */
 static kdb_cpu_cmd_t
 kdb_usgf_vcpuh(void)
@@ -1900,71 +1967,17 @@ static kdb_cpu_cmd_t
 kdb_cmdf_vcpuh(int argc, const char **argv, struct cpu_user_regs *regs)
 {
     struct vcpu *vp;
-    struct hvm_vcpu *hvp;
-    struct hvm_io_op *ioop;
-    struct vlapic *vlp;
 
     if (argc < 2 || *argv[1] == '?') 
         return kdb_usgf_vcpuh();
 
     if (!kdb_str2ulong(argv[1], (ulong *)&vp) || !kdb_vcpu_valid(vp) ||
-        !is_hvm_or_hyb_vcpu(vp)) {
+        !is_hvm_or_pvh_vcpu(vp)) {
 
         kdbp("kdb: Bad VCPU: %s\n", argv[1]);
         return KDB_CPU_MAIN_KDB;
     }
-
-    hvp = &vp->arch.hvm_vcpu;
-    vlp = &hvp->vlapic;
-    kdbp("vcpu:%lx id:%d domid:%d\n", vp, vp->vcpu_id, vp->domain->domain_id);
-
-    ioop = NULL;   /* compiler warning */
-    kdbp("&hvm_vcpu:%lx  guest_efer:"KDBFL"\n", hvp, hvp->guest_efer);
-    kdbp("  guest_cr: [0]:"KDBFL" [1]:"KDBFL" [2]:"KDBFL"\n", hvp->guest_cr[0],
-         hvp->guest_cr[1],hvp->guest_cr[2]);
-    kdbp("            [3]:"KDBFL" [4]:"KDBFL"\n", hvp->guest_cr[3],
-         hvp->guest_cr[4]);
-    kdbp("  hw_cr: [0]:"KDBFL" [1]:"KDBFL" [2]:"KDBFL"\n", hvp->hw_cr[0],
-         hvp->hw_cr[1], hvp->hw_cr[2]);
-    kdbp("          [3]:"KDBFL" [4]:"KDBFL"\n", hvp->hw_cr[3], hvp->hw_cr[4]);
-
-    kdbp("  VLAPIC: base msr:"KDBF64" dis:%x tmrdiv:%x\n", 
-         vlp->hw.apic_base_msr, vlp->hw.disabled, vlp->hw.timer_divisor);
-    kdbp("          regs:%p regs_page:%p\n", vlp->regs, vlp->regs_page);
-    kdbp("          periodic time:\n"); 
-    kdb_prnt_periodic_time(&vlp->pt);
-
-    kdbp("  xen_port:%x flag_dr_dirty:%x dbg_st_latch:%x\n", hvp->xen_port,
-         hvp->flag_dr_dirty, hvp->debug_state_latch);
-
-    if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
-
-        struct arch_vmx_struct *vxp = &hvp->u.vmx;
-        kdbp("  &vmx: %p vmcs:%lx active_cpu:%x launched:%x\n", vxp, 
vxp->vmcs, 
-             vxp->active_cpu, vxp->launched);
-#if XEN_VERSION != 4               /* xen 3.x.x */
-        kdbp("    exec_ctrl:%x vpid:$%d\n", vxp->exec_control, vxp->vpid);
-#endif
-        kdbp("    host_cr0: "KDBFL" vmx: {realm:%x emulate:%x}\n",
-             vxp->host_cr0, vxp->vmx_realmode, vxp->vmx_emulate);
-
-#ifdef __x86_64__
-        kdbp("    &msr_state:%p\n", &vxp->msr_state);
-#endif
-    } else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
-        struct arch_svm_struct *svp = &hvp->u.svm;
-#if XEN_VERSION != 4               /* xen 3.x.x */
-        kdbp("  &svm: vmcb:%lx pa:"KDBF64" asid:"KDBF64"\n", svp, svp->vmcb,
-             svp->vmcb_pa, svp->asid_generation);
-#endif
-        kdbp("    msrpm:%p lnch_core:%x vmcb_sync:%x\n", svp->msrpm, 
-             svp->launch_core, svp->vmcb_in_sync);
-    }
-    kdbp("  cachemode:%x io: {state: %x data: "KDBFL"}\n", hvp->cache_mode,
-         hvp->hvm_io.io_state, hvp->hvm_io.io_data);
-    kdbp("  mmio: {gva: "KDBFL" gpfn: "KDBFL"}\n", hvp->hvm_io.mmio_gva,
-         hvp->hvm_io.mmio_gpfn);
-
+    kdb_display_hvm_vcpu(vp);
     return KDB_CPU_MAIN_KDB;
 }
 
@@ -2094,7 +2107,7 @@ kdb_display_vcpu(struct vcpu *vp)
     kdbp("  arch info: (%p)\n", &vp->arch);
     kdbp("    guest_context: VGCF_ flags:%lx", 
          vp->arch.vgc_flags); /* VGCF_in_kernel */
-    if (is_hvm_or_hyb_vcpu(vp))
+    if (is_hvm_or_pvh_vcpu(vp))
         kdbp("    (HVM guest: IP, SP, EFLAGS may be stale)");
     kdbp("\n");
     kdb_print_uregs(&vp->arch.user_regs);
@@ -2102,7 +2115,11 @@ kdb_display_vcpu(struct vcpu *vp)
     for (i=0; i < 8; i=i+4)
         kdbp("          %016lx %016lx %016lx %016lx\n", avp->debugreg[i], 
              avp->debugreg[i+1], avp->debugreg[i+2], avp->debugreg[i+3]);
-    kdb_display_pv_vcpu(vp);
+
+    if (is_hvm_or_pvh_vcpu(vp))
+        kdb_display_hvm_vcpu(vp);
+    else
+        kdb_display_pv_vcpu(vp);
 
     kdbp("    TF_flags: %016lx  guest_table: %016lx cr3:%016lx\n", 
          vp->arch.flags, vp->arch.guest_table.pfn, avp->cr3); 
@@ -2291,6 +2308,21 @@ static void kdb_pr_vtsc_info(struct arch
          ap->vtsc_kerncount, ap->vtsc_usercount);
 }
 
+static void kdb_print_p2mlock(struct domain *dp)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(dp);
+    mm_rwlock_t *lp = p2m ? &p2m->lock : NULL;
+
+    if (lp == NULL) {
+        kdbp("    p2m lock ptr is null\n");
+        return;
+    }
+    kdbp("    p2m lockval: %x unlock_level:%x recurse_count:%x locker 
cpu:%x\n",
+         lp->lock.raw.lock, lp->unlock_level, lp->recurse_count,
+         lp->locker);
+    kdbp("    p2m locker_function:%s\n", lp->locker_function);
+}
+
 /* display one domain info */
 static void
 kdb_display_dom(struct domain *dp)
@@ -2332,8 +2364,8 @@ kdb_display_dom(struct domain *dp)
         kdbp("    mapcnt:");
         kdb_print_spin_lock("mapcnt: lk:", &gp->lock, "\n");
     }
-    kdbp("  hvm:%d priv:%d need_iommu:%d dbg:%d dying:%d paused:%d\n",
-         dp->is_hvm, dp->is_privileged, dp->need_iommu,
+    kdbp("  hvm:%d pvh:%d priv:%d need_iommu:%d dbg:%d dying:%d paused:%d\n",
+         dp->is_hvm, dp->is_pvh, dp->is_privileged, dp->need_iommu,
          dp->debugger_attached, dp->is_dying, dp->is_paused_by_controller);
     kdb_print_spin_lock("  shutdown: lk:", &dp->shutdown_lock, "\n");
     kdbp("  shutn:%d shut:%d code:%d \n", dp->is_shutting_down,
@@ -2351,13 +2383,14 @@ kdb_display_dom(struct domain *dp)
     kdbp("    pt_pages:0x%p ", ap->mm_perdomain_pt_pages);
     kdbp("    l2:0x%p l3:0x%p\n", ap->mm_perdomain_l2, ap->mm_perdomain_l3);
     kdbp("    ioport:0x%p &hvm_dom:0x%p\n", ap->ioport_caps, &ap->hvm_domain);
-    if (is_hvm_or_hyb_domain(dp))
+    if (is_hvm_or_pvh_domain(dp))
         kdb_prnt_hvm_dom_info(dp);
 
     kdbp("    &pging_dom:%p mode: %lx", &ap->paging, ap->paging.mode); 
     kdb_pr_dom_pg_modes(dp);
     kdbp("    p2m ptr:%p  pages:{%p, %p}\n", ap->p2m, ap->p2m->pages.next,
          KDB_PGLLE(ap->p2m->pages));
+    kdb_print_p2mlock(dp);
     kdbp("       max_mapped_pfn:"KDBFL, ap->p2m->max_mapped_pfn);
 #if XEN_SUBVERSION > 0 && XEN_VERSION == 4              /* xen 4.1 and above */
     kdbp("  phys_table:%p\n", ap->p2m->phys_table.pfn);
@@ -2932,7 +2965,7 @@ kdb_cmdf_mmu(int argc, const char **argv
     return KDB_CPU_MAIN_KDB;
 }
 
-/* for HVM/HYB guests, go thru EPT. For PV guest we need to go to the btree. 
+/* for HVM/PVH guests, go thru EPT. For PV guest we need to go to the btree. 
  * btree: pfn_to_mfn_frame_list_list is root that points (has mfns of) upto 16
  * pages (call 'em l2 nodes) that contain mfns of guest p2m table pages 
  * NOTE: num of entries in a p2m page is same as num of entries in l2 node */
@@ -2985,7 +3018,7 @@ kdb_gpfn2mfn(struct domain *dp, ulong gp
        *typep = -1;
         return mfn;
     } else
-        return get_gfn_query(dp, gpfn, typep);
+        return mfn_x(get_gfn_query_unlocked(dp, gpfn, typep));
 
     return INVALID_MFN;
 }
@@ -3001,8 +3034,8 @@ static kdb_cpu_cmd_t
 kdb_cmdf_p2m(int argc, const char **argv, struct cpu_user_regs *regs)
 {
     struct domain *dp;
-    ulong gpfn, mfn;
-    p2m_type_t p2mtype;
+    ulong gpfn, mfn=0xdeadbeef;
+    p2m_type_t p2mtype = -1;
 
     if (argc < 3                                   ||
         (dp=kdb_strdomid2ptr(argv[1], 1)) == NULL  ||
@@ -3015,6 +3048,7 @@ kdb_cmdf_p2m(int argc, const char **argv
         kdbp("p2m[%lx] == %lx type:%d/0x%x\n", gpfn, mfn, p2mtype, p2mtype);
     else 
         kdbp("p2m[%lx] == %lx type:N/A(PV)\n", gpfn, mfn);
+
     return KDB_CPU_MAIN_KDB;
 }
 
@@ -3556,6 +3590,9 @@ kdb_cmdf_info(int argc, const char **arg
             kdbp("|_x2apic");
     kdbp("\n\n");
     kdbp("CC:");
+#if defined(CONFIG_X86_64)
+        kdbp(" CONFIG_X86_64");
+#endif
 #if defined(CONFIG_COMPAT)
         kdbp(" CONFIG_COMPAT");
 #endif
diff -r 8b0762504037 xen/kdb/kdbmain.c
--- a/xen/kdb/kdbmain.c Wed Oct 31 16:08:55 2012 -0700
+++ b/xen/kdb/kdbmain.c Wed Nov 14 11:03:24 2012 -0800
@@ -53,10 +53,6 @@ static inline unsigned int kdb_firstbit(
     return (unsigned int)val;
 }
 
-void noinline mukchk(unsigned long ul)
-{
-}
-
 static void 
 kdb_dbg_prnt_ctrps(char *label, int ccpu)
 {
@@ -179,11 +175,6 @@ kdb_begin_session(void)
     }
 }
 
-int noinline mukid(void)
-{
-    return smp_processor_id();
-}
-
 static void
 kdb_smp_unpause_cpus(int ccpu)
 {
@@ -238,8 +229,7 @@ kdb_end_session(int ccpu, struct cpu_use
     watchdog_enable();
     KDBGP("end_session:ccpu:%d\n", ccpu);
 }
-volatile int mukwpprnt;
-unsigned long mukaddr1 = 0xffffffff81243ae7, mukaddr2 = 0xffffffff8100986e;
+
 /* 
  * check if we entered kdb because of DB trap. If yes, then check if
  * we caused it or someone else.
@@ -276,14 +266,6 @@ kdb_check_dbtrap(kdb_reason_t *reasp, in
             }
         } else if (! kdb_check_watchpoints(regs)) {
             rc = 0;                        /* hyp must handle it */
-        } else {
-            if (regs->rip==mukaddr1 || regs->rip==mukaddr2)
-            {
-                if (mukwpprnt)
-                    kdbp("MUK: ignoring wp:%lx\n", regs->rip);
-                kdb_end_session(ccpu, regs);
-                rc = 1;
-            } 
         }
     }
     return rc;
@@ -382,7 +364,7 @@ kdbmain(kdb_reason_t reason, struct cpu_
             }
         } else if (rc == 2) {        /* one of ours but condition not met */
                 kdb_begin_session();
-                if (guest_mode(regs) && is_hvm_or_hyb_vcpu(current))
+                if (guest_mode(regs) && is_hvm_or_pvh_vcpu(current))
                     current->arch.hvm_vcpu.single_step = 1;
                 else
                     regs->eflags |= X86_EFLAGS_TF;  
@@ -422,7 +404,7 @@ kdbmain(kdb_reason_t reason, struct cpu_
             if (!cpumask_empty(&kdb_cpu_traps)) {
                 /* execute current instruction without 0xcc */
                 kdb_dbg_prnt_ctrps("nempty:", ccpu);
-                if (guest_mode(regs) && is_hvm_or_hyb_vcpu(current))
+                if (guest_mode(regs) && is_hvm_or_pvh_vcpu(current))
                     current->arch.hvm_vcpu.single_step = 1;
                 else
                     regs->eflags |= X86_EFLAGS_TF;  
@@ -439,7 +421,7 @@ kdbmain(kdb_reason_t reason, struct cpu_
         if (kdb_swbp_exists()) {
             if (reason == KDB_REASON_BPEXCP) {
                 /* do delayed install */
-                if (guest_mode(regs) && is_hvm_or_hyb_vcpu(current))
+                if (guest_mode(regs) && is_hvm_or_pvh_vcpu(current))
                     current->arch.hvm_vcpu.single_step = 1;
                 else
                     regs->eflags |= X86_EFLAGS_TF;  
@@ -538,7 +520,7 @@ kdb_handle_trap_entry(int vector, struct
             kdb_trap_immed_reason = 0;
             rc = kdb_keyboard(regs);
         } else {                         /* ss/ni/delayed install... */
-            if (guest_mode(regs) && is_hvm_or_hyb_vcpu(current))
+            if (guest_mode(regs) && is_hvm_or_pvh_vcpu(current))
                 current->arch.hvm_vcpu.single_step = 0;
             rc = kdbmain(KDB_REASON_DBEXCP, regs); 
         }
@@ -755,3 +737,8 @@ kdb_trcp(void)
     kdbp(" [most recent]: %016lx   trcidx: 0x%x\n", &trca[i], trcidx);
 }
 
+volatile int muklkdbg;
+void noinline mukchk(unsigned long ul)
+{
+}
+
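
For anyone skimming the interface additions above, here is a minimal,
purely illustrative sketch (not part of the patch) of how a
hypervisor-side caller could combine the new is_pvh_domain() predicate
with the domctl_memory_mapping() prototype exported via xen/domain.h in
one of the hunks above. The helper name pvh_map_iomem_range() and its
error handling are invented for this sketch, and the includes assume the
usual hypervisor headers:

/* Illustrative sketch only -- NOT part of the patch above. */
#include <xen/sched.h>     /* struct domain, is_pvh_domain() per this patch */
#include <xen/domain.h>    /* domctl_memory_mapping() prototype             */
#include <xen/errno.h>

static long pvh_map_iomem_range(struct domain *d, unsigned long gfn,
                                unsigned long mfn, unsigned long nr_mfns)
{
    /* Only meaningful for a PV guest running in an HVM container. */
    if ( !is_pvh_domain(d) )
        return -EINVAL;

    /* add_map == 1 asks for the range to be mapped; 0 would unmap it. */
    return domctl_memory_mapping(d, gfn, mfn, nr_mfns, 1);
}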

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel
