[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v2 4/7] x86/vmx: add do_vmtrace_op



On Fri, Jun 19, 2020 at 01:41:03AM +0200, Michał Leszczyński wrote:
> Provide an interface for privileged domains to manage
> external IPT monitoring. Guest IPT state will be preserved
> across vmentry/vmexit using ipt_state structure.

Thanks! I have some comments below, some of them are cosmetic coding
style issues. Sorry the Xen coding style is quite different from other
projects, and it takes a bit to get used to.

> Signed-off-by: Michal Leszczynski <michal.leszczynski@xxxxxxx>
> ---
>  xen/arch/x86/hvm/hvm.c             | 167 +++++++++++++++++++++++++++++
>  xen/arch/x86/hvm/vmx/vmx.c         |  24 +++++
>  xen/arch/x86/mm.c                  |  37 +++++++
>  xen/common/domain.c                |   1 +
>  xen/include/asm-x86/hvm/vmx/vmcs.h |  16 +++
>  xen/include/public/hvm/hvm_op.h    |  23 ++++
>  xen/include/public/hvm/params.h    |   5 +-
>  xen/include/public/memory.h        |   1 +
>  xen/include/xen/sched.h            |   3 +
>  9 files changed, 276 insertions(+), 1 deletion(-)
> 
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index 5bb47583b3..145ad053d2 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -1612,6 +1612,24 @@ int hvm_vcpu_initialise(struct vcpu *v)
>      return rc;
>  }
>  
> +void hvm_vmtrace_destroy(struct vcpu *v)
> +{
> +    unsigned int i;
> +    struct page_info *pg;
> +    struct ipt_state *ipt = v->arch.hvm.vmx.ipt_state;
> +    mfn_t buf_mfn = ipt->output_base >> PAGE_SHIFT;

Does this build? I think you are missing a _mfn(...) here?

> +    size_t buf_size = ipt->output_mask.size + 1;
> +
> +    xfree(ipt);
> +    v->arch.hvm.vmx.ipt_state = NULL;
> +
> +    for ( i = 0; i < (buf_size >> PAGE_SHIFT); i++ )
> +    {
> +        pg = mfn_to_page(_mfn(mfn_add(buf_mfn, i)));
> +        free_domheap_page(pg);

There's no need for the loop, just use free_domheap_pages and free the
whole allocated area at once. Also you can use maddr_to_page to get
the page from output_base.

> +    }
> +}
> +
>  void hvm_vcpu_destroy(struct vcpu *v)
>  {
>      viridian_vcpu_deinit(v);
> @@ -1631,6 +1649,8 @@ void hvm_vcpu_destroy(struct vcpu *v)
>      vlapic_destroy(v);
>  
>      hvm_vcpu_cacheattr_destroy(v);
> +
> +    hvm_vmtrace_destroy(v);
>  }
>  
>  void hvm_vcpu_down(struct vcpu *v)
> @@ -4066,6 +4086,51 @@ static int hvmop_set_evtchn_upcall_vector(
>      return 0;
>  }
>  
> +static int hvm_set_vmtrace_pt_size(struct domain *d, uint64_t value)

No need for the hvm_ prefix since it's a local function, and then I
would name it:

vmtrace_alloc_buffers(struct domain *d, uint64_t size)

I would name the variable size, so it's purpose it's clearer.

> +{
> +    void *buf;
> +    unsigned int buf_order;
> +    struct page_info *pg;
> +    struct ipt_state *ipt;
> +    struct vcpu *v;
> +
> +    if ( value < PAGE_SIZE ||
> +         value > GB(4) ||
> +         ( value & (value - 1) ) ) {

Extra spaces, and no need to place each condition on a new line as
long as you don't exceed the 80 character limit. Also thee opening
brace should be on a newline:

    if ( value < PAGE_SIZE || value > GB(4) || (value & (value - 1)) )
    {


> +        /* we don't accept trace buffer size smaller than single page
> +         * and the upper bound is defined as 4GB in the specification */

Comment style is wrong, according to CODING_STYLE this should be:

        /*
         * We don't accept trace buffer size smaller than single page
         * and the upper bound is defined as 4GB in the specification.
         */

Since you mention the limitations of the buffer size, I would also
mention that size must be a power of 2.

> +        return -EINVAL;
> +    }
> +
> +    for_each_vcpu ( d, v )
> +    {
> +        buf_order = get_order_from_bytes(value);

There's no need for the buf_order variable, just place the call inside
alloc_domheap_pages.

> +        pg = alloc_domheap_pages(d, buf_order, MEMF_no_refcount);

You can define the variable here:

struct page_info *pg = alloc_domheap_pages(d, get_order_from_bytes(value),
                                           MEMF_no_refcount);

ipt variable can also be defined here to reduce it's scope.

> +
> +        if ( !pg )
> +            return -EFAULT;

-ENOMEM

> +
> +        buf = page_to_virt(pg);

Why do you need buf?

You seem to get the virtual address of the page(s) just to use it in
virt_to_mfn, but you can directly use page_to_maddr and remove the buf
variable (and the shift done below).

> +
> +        if ( vmx_add_host_load_msr(v, MSR_RTIT_CTL, 0) )
> +            return -EFAULT;

You leak the pg allocation here...

> +
> +        ipt = xmalloc(struct ipt_state);

If you use xzalloc you can avoid setting the fields to 0 below.

> +
> +        if ( !ipt )
> +            return -EFAULT;

... and here in the failure paths. This should be -ENOMEM also.

> +
> +        ipt->output_base = virt_to_mfn(buf) << PAGE_SHIFT;
> +        ipt->output_mask.raw = value - 1;
> +        ipt->status = 0;
> +        ipt->active = 0;
> +
> +        v->arch.hvm.vmx.ipt_state = ipt;
> +    }
> +
> +    return 0;
> +}
> +
>  static int hvm_allow_set_param(struct domain *d,
>                                 uint32_t index,
>                                 uint64_t new_value)
> @@ -4127,6 +4192,7 @@ static int hvm_allow_set_param(struct domain *d,
>      case HVM_PARAM_NR_IOREQ_SERVER_PAGES:
>      case HVM_PARAM_ALTP2M:
>      case HVM_PARAM_MCA_CAP:
> +    case HVM_PARAM_VMTRACE_PT_SIZE:
>          if ( value != 0 && new_value != value )
>              rc = -EEXIST;
>          break;
> @@ -4328,6 +4394,9 @@ static int hvm_set_param(struct domain *d, uint32_t 
> index, uint64_t value)
>      case HVM_PARAM_MCA_CAP:
>          rc = vmce_enable_mca_cap(d, value);
>          break;
> +    case HVM_PARAM_VMTRACE_PT_SIZE:
> +        rc = hvm_set_vmtrace_pt_size(d, value);
> +        break;
>      }
>  
>      if ( !rc )
> @@ -4949,6 +5018,100 @@ static int compat_altp2m_op(
>      return rc;
>  }
>  
> +static int do_vmtrace_op(XEN_GUEST_HANDLE_PARAM(void) arg)
> +{
> +    struct xen_hvm_vmtrace_op a;
> +    struct domain *d;
> +    int rc;
> +    struct vcpu *v;
> +    struct ipt_state *ipt;
> +
> +    if ( !hvm_pt_supported() )
> +        return -EOPNOTSUPP;
> +
> +    if ( copy_from_guest(&a, arg, 1) )
> +        return -EFAULT;
> +
> +    if ( a.version != HVMOP_VMTRACE_INTERFACE_VERSION )
> +        return -EINVAL;
> +
> +    d = rcu_lock_domain_by_any_id(a.domain);
> +    spin_lock(&d->vmtrace_lock);

This must be moved after you have checked d is not NULL.

> +
> +    if ( d == NULL )
> +        return -ESRCH;
> +
> +    if ( !is_hvm_domain(d) )

You don't need to hold vmtrace_lock to check this...

> +    {
> +        rc = -EOPNOTSUPP;
> +        goto out;
> +    }
> +
> +    domain_pause(d);

Do you really need to pause the whole domain, or just the vcpu you are
avoid to modify the parameters?

Also for HVMOP_vmtrace_ipt_get_offset there's no need to pause
anything?

> +
> +    if ( a.vcpu >= d->max_vcpus )

... neither this. I think you can reduce the locked section a bit.

> +    {
> +        rc = -EINVAL;
> +        goto out;
> +    }
> +
> +    v = d->vcpu[a.vcpu];
> +    ipt = v->arch.hvm.vmx.ipt_state;
> +
> +    if ( !ipt )
> +    {
> +        /*
> +      * PT must be first initialized upon domain creation.
> +      */

You have a couple of hard tab in the lines above. Also single line
comments are /* ... */.

> +        rc = -EINVAL;
> +        goto out;
> +    }
> +
> +    switch ( a.cmd )
> +    {
> +    case HVMOP_vmtrace_ipt_enable:
> +        if ( vmx_add_guest_msr(v, MSR_RTIT_CTL,
> +                               RTIT_CTL_TRACEEN | RTIT_CTL_OS |
> +                               RTIT_CTL_USR | RTIT_CTL_BRANCH_EN) )
> +        {
> +            rc = -EFAULT;

vmx_add_guest_msr already returns a valid error code, please don't
replace it with -EFAULT unconditionally (here and below).

> +            goto out;
> +        }
> +
> +        ipt->active = 1;
> +        break;

Please add an empty newline after break;.

> +    case HVMOP_vmtrace_ipt_disable:
> +        if ( vmx_add_guest_msr(v, MSR_RTIT_CTL, 0) )
> +        {
> +            rc = -EFAULT;
> +            goto out;
> +        }
> +
> +        ipt->active = 0;
> +        break;
> +    case HVMOP_vmtrace_ipt_get_offset:
> +        a.offset = ipt->output_mask.offset;
> +        break;
> +    default:
> +        rc = -EOPNOTSUPP;
> +        goto out;
> +    }
> +
> +    rc = -EFAULT;
> +    if ( __copy_to_guest(arg, &a, 1) )
> +      goto out;
> +    rc = 0;
> +
> + out:
> +    domain_unpause(d);
> +    spin_unlock(&d->vmtrace_lock);
> +    rcu_unlock_domain(d);
> +
> +    return rc;
> +}
> +
> +DEFINE_XEN_GUEST_HANDLE(compat_hvm_vmtrace_op_t);
> +
>  static int hvmop_get_mem_type(
>      XEN_GUEST_HANDLE_PARAM(xen_hvm_get_mem_type_t) arg)
>  {
> @@ -5101,6 +5264,10 @@ long do_hvm_op(unsigned long op, 
> XEN_GUEST_HANDLE_PARAM(void) arg)
>          rc = current->hcall_compat ? compat_altp2m_op(arg) : 
> do_altp2m_op(arg);
>          break;
>  
> +    case HVMOP_vmtrace:
> +        rc = do_vmtrace_op(arg);
> +        break;
> +
>      default:
>      {
>          gdprintk(XENLOG_DEBUG, "Bad HVM op %ld.\n", op);
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index ab19d9424e..51f0046483 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -508,11 +508,25 @@ static void vmx_restore_host_msrs(void)
>  
>  static void vmx_save_guest_msrs(struct vcpu *v)
>  {
> +    uint64_t rtit_ctl;
> +
>      /*
>       * We cannot cache SHADOW_GS_BASE while the VCPU runs, as it can
>       * be updated at any time via SWAPGS, which we cannot trap.
>       */
>      v->arch.hvm.vmx.shadow_gs = rdgsshadow();
> +
> +    if ( unlikely(v->arch.hvm.vmx.ipt_state && 
> v->arch.hvm.vmx.ipt_state->active) )
> +    {
> +        smp_rmb();

If this barrier is required I think it warrants a comment about why
it's needed.

> +        rdmsrl(MSR_RTIT_CTL, rtit_ctl);
> +
> +        if ( rtit_ctl & RTIT_CTL_TRACEEN )
> +            BUG();

BUG_ON(rtit_ctl & RTIT_CTL_TRACEEN);

> +
> +        rdmsrl(MSR_RTIT_STATUS, v->arch.hvm.vmx.ipt_state->status);
> +        rdmsrl(MSR_RTIT_OUTPUT_MASK, 
> v->arch.hvm.vmx.ipt_state->output_mask.raw);
> +    }
>  }
>  
>  static void vmx_restore_guest_msrs(struct vcpu *v)
> @@ -524,6 +538,16 @@ static void vmx_restore_guest_msrs(struct vcpu *v)
>  
>      if ( cpu_has_msr_tsc_aux )
>          wrmsr_tsc_aux(v->arch.msrs->tsc_aux);
> +
> +    if ( unlikely(v->arch.hvm.vmx.ipt_state && 
> v->arch.hvm.vmx.ipt_state->active) )

Line is too long.

> +    {
> +        wrmsrl(MSR_RTIT_OUTPUT_BASE,
> +            v->arch.hvm.vmx.ipt_state->output_base);
> +        wrmsrl(MSR_RTIT_OUTPUT_MASK,
> +            v->arch.hvm.vmx.ipt_state->output_mask.raw);
> +        wrmsrl(MSR_RTIT_STATUS,
> +            v->arch.hvm.vmx.ipt_state->status);

Can you please align to the start of the parentheses:

        wrmsrl(MSR_RTIT_STATUS,
               v->arch.hvm.vmx.ipt_state->status);

> +    }
>  }
>  
>  void vmx_update_cpu_exec_control(struct vcpu *v)
> diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
> index e376fc7e8f..e4658dc27f 100644
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -4624,6 +4624,43 @@ int arch_acquire_resource(struct domain *d, unsigned 
> int type,
>          }
>          break;
>      }
> +
> +    case XENMEM_resource_vmtrace_buf:
> +    {
> +        mfn_t mfn;
> +        unsigned int i;
> +        struct ipt_state *ipt;
> +
> +        if ( id >= d->max_vcpus )
> +        {
> +            rc = -EINVAL;

You can just set rc = -EINVAL at the top and then avoid setting it on
every error case.

> +            break;
> +        }
> +
> +        ipt = d->vcpu[id]->arch.hvm.vmx.ipt_state;
> +
> +        if ( !ipt )
> +        {
> +            rc = -EINVAL;
> +            break;
> +        }
> +
> +        mfn = mfn_x(ipt->output_base >> PAGE_SHIFT);
> +
> +        if (nr_frames > ( ( ipt->output_mask.size + 1 ) >> PAGE_SHIFT ))

Spaces are wrongly positioned. ie:

if ( frame + nr_frames > ((ipt->output_mask.size + 1) >> PAGE_SHIFT) )

> +        {
> +            rc = -EINVAL;
> +            break;
> +        }
> +
> +        rc = 0;
> +        for ( i = 0; i < nr_frames; i++ )

I think you should init i to the the 'frame' field from
xen_mem_acquire_resource, since it's possible to select the offset you
want to map from a resource. You will also need to take it into
account, because IMO that's where I would store the last processed
frame when dealing with continuations.

> +        {
> +            mfn_list[i] = mfn_add(mfn, i);
> +        }
> +
> +        break;
> +    }
>  #endif
>  
>      default:
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 7cc9526139..6f6458cd5b 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -414,6 +414,7 @@ struct domain *domain_create(domid_t domid,
>      d->shutdown_code = SHUTDOWN_CODE_INVALID;
>  
>      spin_lock_init(&d->pbuf_lock);
> +    spin_lock_init(&d->vmtrace_lock);
>  
>      rwlock_init(&d->vnuma_rwlock);
>  
> diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h 
> b/xen/include/asm-x86/hvm/vmx/vmcs.h
> index 4c81093aba..9fc4653777 100644
> --- a/xen/include/asm-x86/hvm/vmx/vmcs.h
> +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
> @@ -104,6 +104,19 @@ struct pi_blocking_vcpu {
>      spinlock_t           *lock;
>  };
>  
> +struct ipt_state {
> +    uint64_t active;

Could this be a boolean?

> +    uint64_t status;
> +    uint64_t output_base;
> +    union {
> +        uint64_t raw;
> +        struct {
> +            uint32_t size;
> +            uint32_t offset;
> +        };
> +    } output_mask;
> +};
> +
>  struct vmx_vcpu {
>      /* Physical address of VMCS. */
>      paddr_t              vmcs_pa;
> @@ -186,6 +199,9 @@ struct vmx_vcpu {
>       * pCPU and wakeup the related vCPU.
>       */
>      struct pi_blocking_vcpu pi_blocking;
> +
> +    /* State of Intel Processor Trace feature */
> +    struct ipt_state     *ipt_state;
>  };
>  
>  int vmx_create_vmcs(struct vcpu *v);
> diff --git a/xen/include/public/hvm/hvm_op.h b/xen/include/public/hvm/hvm_op.h
> index 870ec52060..8cd0b6ea49 100644
> --- a/xen/include/public/hvm/hvm_op.h
> +++ b/xen/include/public/hvm/hvm_op.h
> @@ -382,6 +382,29 @@ struct xen_hvm_altp2m_op {
>  typedef struct xen_hvm_altp2m_op xen_hvm_altp2m_op_t;
>  DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_op_t);
>  
> +/* HVMOP_vmtrace: Perform VM tracing related operation */
> +#define HVMOP_vmtrace 26
> +
> +#define HVMOP_VMTRACE_INTERFACE_VERSION 0x00000001
> +
> +struct xen_hvm_vmtrace_op {
> +    /* IN variable */
> +    uint32_t version;   /* HVMOP_VMTRACE_INTERFACE_VERSION */
> +    uint32_t cmd;
> +/* Enable/disable external vmtrace for given domain */
> +#define HVMOP_vmtrace_ipt_enable      1
> +#define HVMOP_vmtrace_ipt_disable     2
> +#define HVMOP_vmtrace_ipt_get_offset  3
> +    domid_t domain;
> +    uint32_t vcpu;
> +    uint64_t size;
> +
> +    /* OUT variable */
> +    uint64_t offset;
> +};
> +typedef struct xen_hvm_vmtrace_op xen_hvm_vmtrace_op_t;
> +DEFINE_XEN_GUEST_HANDLE(xen_hvm_vmtrace_op_t);
> +
>  #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
>  
>  /*
> diff --git a/xen/include/public/hvm/params.h b/xen/include/public/hvm/params.h
> index 0a91bfa749..adbc7e5d08 100644
> --- a/xen/include/public/hvm/params.h
> +++ b/xen/include/public/hvm/params.h
> @@ -300,6 +300,9 @@
>  #define XEN_HVM_MCA_CAP_LMCE   (xen_mk_ullong(1) << 0)
>  #define XEN_HVM_MCA_CAP_MASK   XEN_HVM_MCA_CAP_LMCE
>  
> -#define HVM_NR_PARAMS 39
> +/* VM trace capabilities */
> +#define HVM_PARAM_VMTRACE_PT_SIZE 39

I think it was recommended that IPT should be set at domain creation,
but using a HVM param still allows you to init this after the domain
has been created. I think it might be best to just add a new field to
xen_domctl_createdomain, as it seems like the interface could also be
helpful for other arches?

Thanks, Roger.



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.