
Re: [Xen-devel] [PATCH v4 6/9] tools/libxc: x86 pv save implementation



On Wed, 2014-04-30 at 19:36 +0100, Andrew Cooper wrote:

The previous "common" patch seemed to have a fair bit of x86 pv save
code in it, and there seems to be a fair bit of common code here. I
suppose the split was pretty artificial to make the patches manageable?

> @@ -121,6 +123,9 @@ struct context
>      };
>  };
>  
> +/* Saves an x86 PV domain. */

I should hope so!

> +int save_x86_pv(struct context *ctx);
> +
>  /*
>   * Write the image and domain headers to the stream.
>   * (to eventually make static in save.c)
> @@ -137,6 +142,21 @@ struct record
>      void *data;
>  };
>  
> +/* Gets a field from an *_any union */

Unless the #undefs which were above (before I trimmed them) relate to
patch #2/9, they should go immediately before the redefinitions IMHO.
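i.e. put

    #undef GET_FIELD
    #undef SET_FIELD

immediately before the corresponding #defines below.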

> +#define GET_FIELD(_c, _p, _f)                   \
> +    ({ (_c)->x86_pv.width == 8 ?                \
> +            (_p)->x64._f:                       \
> +            (_p)->x32._f;                       \
> +    })                                          \
> +
> +/* Gets a field from an *_any union */
> +#define SET_FIELD(_c, _p, _f, _v)               \
> +    ({ if ( (_c)->x86_pv.width == 8 )           \
> +            (_p)->x64._f = (_v);                \
> +        else                                    \
> +            (_p)->x32._f = (_v);                \
> +    })
> +
>  /*
>   * Writes a split record to the stream, applying correct padding where
>   * appropriate.  It is common when sending records containing blobs from Xen

> +    /* MFNs of the batch pfns */
> +    mfns = malloc(nr_pfns * sizeof *mfns);
> +    /* Types of the batch pfns */
> +    types = malloc(nr_pfns * sizeof *types);
> +    /* Errors from attempting to map the mfns */
> +    errors = malloc(nr_pfns * sizeof *errors);
> +    /* Pointers to page data to send.  Might be from mapped mfns or local allocations */
> +    guest_data = calloc(nr_pfns, sizeof *guest_data);
> +    /* Pointers to locally allocated pages.  Probably not all used, but need freeing */
> +    local_pages = calloc(nr_pfns, sizeof *local_pages);

Wouldn't it be better to preallocate (most of) these for the entire
save/restore than to do it for each batch?
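i.e. hang them off struct context and allocate them once for the whole
save, something like (sketch only; the member names here are invented):

    /* Once, during setup: */
    ctx->batch_mfns   = malloc(MAX_BATCH_SIZE * sizeof(*ctx->batch_mfns));
    ctx->batch_types  = malloc(MAX_BATCH_SIZE * sizeof(*ctx->batch_types));
    ctx->batch_errors = malloc(MAX_BATCH_SIZE * sizeof(*ctx->batch_errors));

    /* Per batch, only clear the arrays which rely on being zeroed: */
    memset(ctx->batch_guest_data, 0, nr_pfns * sizeof(*ctx->batch_guest_data));
    memset(ctx->batch_local_pages, 0, nr_pfns * sizeof(*ctx->batch_local_pages));

(with the corresponding frees moved to teardown, of course).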

> +    for ( i = 0; i < nr_pfns; ++i )
> +    {
> +        types[i] = mfns[i] = ctx->ops.pfn_to_gfn(ctx, ctx->batch_pfns[i]);
> +
> +        /* Likely a ballooned page */
> +        if ( mfns[i] == INVALID_MFN )
> +            set_bit(ctx->batch_pfns[i], ctx->deferred_pages);

deferred_pages doesn't need to grow dynamically like that bit map in a
previous patch, does it?

> +    hdr.count = nr_pfns;
> +    s = nr_pfns * sizeof *rec_pfns;

Brackets around sizeof would really help here.
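i.e.

    s = nr_pfns * sizeof(*rec_pfns);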

> +
> +
> +    rec_pfns = malloc(s);

Given the calculation of s is this a candidate for calloc?
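i.e. (sketch):

    rec_pfns = calloc(nr_pfns, sizeof(*rec_pfns));

which also sidesteps the sizeof bracketing issue above.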

> +    if ( !rec_pfns )
> +    {
> +        ERROR("Unable to allocate memory for page data pfn list");
> +        goto err;
> +    }
> +
> +    for ( i = 0; i < nr_pfns; ++i )
> +        rec_pfns[i] = ((uint64_t)(types[i]) << 32) | ctx->batch_pfns[i];

Could this not have a more structured type? (I noticed this in the
decode bit of the previous patch too).
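e.g. something like (rough sketch, struct name invented; this assumes the
32/32 split used here and a little-endian layout matching the current
ad-hoc encoding, so it would really want defining in the stream format
header rather than open-coding):

    struct rec_page_type_entry
    {
        uint32_t pfn;
        uint32_t type;
    };

    rec_pfns[i].pfn  = ctx->batch_pfns[i];
    rec_pfns[i].type = types[i];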

> +
> +    /*        header +          pfns data           +        page data */
> +    rec_size = 4 + 4 + (s) + (nr_pages * PAGE_SIZE);

Can these 4's not be sizeof something? Also the comment looks like it
wants to align with something, but is failing.
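e.g. (sketch, assuming the two 4s are for the count and reserved fields
written below):

    rec_size = sizeof(rec_count) + sizeof(rec_res) + s + (nr_pages * PAGE_SIZE);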

> +    rec_count = nr_pfns;
> +
> +    if ( write_exact(ctx->fd, &rec_type, sizeof(uint32_t)) ||
> +         write_exact(ctx->fd, &rec_size, sizeof(uint32_t)) ||

Are these not a struct rhdr?
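i.e. something like (sketch; I'm guessing at the header struct's field
names):

    struct rhdr rhdr = { .type = rec_type, .length = rec_size };

    if ( write_exact(ctx->fd, &rhdr, sizeof(rhdr)) ||
         write_exact(ctx->fd, &rec_count, sizeof(rec_count)) ||
         write_exact(ctx->fd, &rec_res, sizeof(rec_res)) )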

> +         write_exact(ctx->fd, &rec_count, sizeof(uint32_t)) ||
> +         write_exact(ctx->fd, &rec_res, sizeof(uint32_t)) )
> +    {
> +        PERROR("Failed to write page_type header to stream");
> +        goto err;
> +    }
> +
> +    if ( write_exact(ctx->fd, rec_pfns, s) )
> +    {
> +        PERROR("Failed to write page_type header to stream");
> +        goto err;
> +    }
> +
> +
> +    for ( i = 0; i < nr_pfns; ++i )

Extra braces are useful for clarity in these situations.


> +static int flush_batch(struct context *ctx)
> +{
> +    int rc = 0;
> +
> +    if ( ctx->nr_batch_pfns == 0 )
> +        return rc;
> +
> +    rc = write_batch(ctx);
> +
> +    if ( !rc )
> +    {
> +        /* Valgrind sanity check */

What now?

> +        free(ctx->batch_pfns);
> +        ctx->batch_pfns = malloc(MAX_BATCH_SIZE * sizeof *ctx->batch_pfns);
> +        rc = !ctx->batch_pfns;
> +    }
> +
> +
> +    for ( x = 0, pages_written = 0; x < max_iter ; ++x )
> +    {
> +        DPRINTF("Iteration %u", x);

Something in this series should be using xc_report_progress_* (I'm not
sure this is the right place, but I didn't see it used elsewhere yet).
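e.g. (sketch; "total" being whatever the overall page count is called in
this code):

    xc_report_progress_step(xch, pages_written, total);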

>  int xc_domain_save2(xc_interface *xch, int io_fd, uint32_t dom, uint32_t max_iters,
>                      uint32_t max_factor, uint32_t flags,
>                      struct save_callbacks* callbacks, int hvm,
>                      unsigned long vm_generationid_addr)
>  {
> +    struct context ctx =
> +        {

Coding style.
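i.e. presumably the braces shouldn't gain an extra level of indentation:

    struct context ctx =
    {
        .xch = xch,
        .fd = io_fd,
    };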

> +            .xch = xch,
> +            .fd = io_fd,
> +        };
> +
> +    /* Older GCC cant initialise anonymous unions */

How old?


> +static int map_p2m(struct context *ctx)
> +{
> [...]

> +    fll_mfn = GET_FIELD(ctx, ctx->x86_pv.shinfo, arch.pfn_to_mfn_frame_list_list);
> +    if ( !fll_mfn )
> +        IPRINTF("Waiting for domain to set up its p2m frame list list");
> +
> +    while ( tries-- && !fll_mfn )
> +    {
> +        usleep(10000);
> +        fll_mfn = GET_FIELD(ctx, ctx->x86_pv.shinfo,
> +                            arch.pfn_to_mfn_frame_list_list);
> +    }
> +
> +    if ( !fll_mfn )
> +    {
> +        ERROR("Timed out waiting for p2m frame list list to be updated");
> +        goto err;
> +    }

This (preexisting) ugliness is there to "support" running migrate
immediately after starting a guest, when it hasn't got its act together
yet?

I wonder if we can get away with refusing to migrate under those
circumstances.

> +static int write_one_vcpu_basic(struct context *ctx, uint32_t id)
> +{
> +    xc_interface *xch = ctx->xch;
> +    xen_pfn_t mfn, pfn;
> +    unsigned i;
> +    int rc = -1;
> +    vcpu_guest_context_any_t vcpu;
> +    struct rec_x86_pv_vcpu vhdr = { .vcpu_id = id };
> +    struct record rec =
> +    {
> +        .type = REC_TYPE_x86_pv_vcpu_basic,
> +        .length = sizeof vhdr,
> +        .data = &vhdr,
> +    };
> +
> +    if ( xc_vcpu_getcontext(xch, ctx->domid, id, &vcpu) )
> +    {
> +        PERROR("Failed to get vcpu%u context", id);
> +        goto err;
> +    }
> +
> +    /* Vcpu 0 is special: Convert the suspend record to a PFN */

The "suspend record" is the guest's start_info_t I think? Would be
clearer to say that I think (and perhaps allude to it being the magic
3rd parameter to the hypercall.)
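i.e. perhaps something like:

    /*
     * Vcpu0's suspend record (the guest's start_info_t, whose frame was
     * passed as the third argument to the suspend hypercall): convert
     * it to a PFN.
     */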

> +    /* 64bit guests: Convert CR1 (guest pagetables) to PFN */
> +    if ( ctx->x86_pv.levels == 4 && vcpu.x64.ctrlreg[1] )
> +    {
> +        mfn = vcpu.x64.ctrlreg[1] >> PAGE_SHIFT;
> +        if ( !mfn_in_pseudophysmap(ctx, mfn) )
> +        {
> +            ERROR("Bad MFN for vcpu%u's cr1", id);
> +            pseudophysmap_walk(ctx, mfn);
> +            errno = ERANGE;
> +            goto err;
> +        }
> +
> +        pfn = mfn_to_pfn(ctx, mfn);
> +        vcpu.x64.ctrlreg[1] = 1 | ((uint64_t)pfn << PAGE_SHIFT);
> +    }
> +
> +    if ( ctx->x86_pv.width == 8 )

The use of x86_pv.levels vs x86_pv.width to determine the type of guest
is rather inconsistent. .width would seem to be the right one to use
almost all of the time, I think.

> +        rc = write_split_record(ctx, &rec, &vcpu, sizeof vcpu.x64);
> +    else
> +        rc = write_split_record(ctx, &rec, &vcpu, sizeof vcpu.x32);
> +
> +    if ( rc )
> +        goto err;
> +
> +    DPRINTF("Writing vcpu%u basic context", id);
> +    rc = 0;
> + err:
> +
> +    return rc;
> +}
> +
> +static int write_one_vcpu_extended(struct context *ctx, uint32_t id)
> +{
> +    xc_interface *xch = ctx->xch;
> +    int rc;
> +    struct rec_x86_pv_vcpu vhdr = { .vcpu_id = id };
> +    struct record rec =
> +    {
> +        .type = REC_TYPE_x86_pv_vcpu_extended,
> +        .length = sizeof vhdr,
> +        .data = &vhdr,
> +    };

Does this pattern repeat a lot? DECLARE_REC(x86_pv_vcpu_extended, vhdr)
perhaps?
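i.e. something like (rough sketch, only for the vcpu records which share
this header):

    #define DECLARE_VCPU_REC(_type, _rec, _vhdr, _id)               \
        struct rec_x86_pv_vcpu _vhdr = { .vcpu_id = (_id) };        \
        struct record _rec =                                        \
        {                                                           \
            .type = REC_TYPE_##_type,                               \
            .length = sizeof(_vhdr),                                \
            .data = &(_vhdr),                                       \
        }

used as DECLARE_VCPU_REC(x86_pv_vcpu_extended, rec, vhdr, id);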

> +    struct xen_domctl domctl =
> +    {
> +        .cmd = XEN_DOMCTL_get_ext_vcpucontext,
> +        .domain = ctx->domid,
> +        .u.ext_vcpucontext.vcpu = id,
> +    };

This wants to use DECLARE_DOMCTL I think (at least that seems to be what
everywhere else in libxc does).
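i.e.:

    DECLARE_DOMCTL;

    domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
    domctl.domain = ctx->domid;
    domctl.u.ext_vcpucontext.vcpu = id;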

> +static int write_all_vcpu_information(struct context *ctx)
> +{
> +    xc_interface *xch = ctx->xch;
> +    xc_vcpuinfo_t vinfo;
> +    unsigned int i;
> +    int rc;
> +
> +    for ( i = 0; i <= ctx->dominfo.max_vcpu_id; ++i )
> +    {
> +        rc = xc_vcpu_getinfo(xch, ctx->domid, i, &vinfo);
> +        if ( rc )
> +        {
> +            PERROR("Failed to get vcpu%u information", i);
> +            return rc;
> +        }
> +
> +        if ( !vinfo.online )
> +        {
> +            DPRINTF("vcpu%u offline - skipping", i);
> +            continue;
> +        }
> +
> +        rc = write_one_vcpu_basic(ctx, i) ?:
> +            write_one_vcpu_extended(ctx, i) ?:
> +            write_one_vcpu_xsave(ctx, i);

I'd prefer the more verbose one-at-a-time version.
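i.e. (sketch):

    rc = write_one_vcpu_basic(ctx, i);
    if ( rc )
        return rc;

    rc = write_one_vcpu_extended(ctx, i);
    if ( rc )
        return rc;

    rc = write_one_vcpu_xsave(ctx, i);
    if ( rc )
        return rc;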

> +    /* Map various structures */
> +    rc = x86_pv_map_m2p(ctx) ?: map_shinfo(ctx) ?: map_p2m(ctx);

One at a time please.
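i.e. (sketch):

    rc = x86_pv_map_m2p(ctx);
    if ( rc )
        goto err;

    rc = map_shinfo(ctx);
    if ( rc )
        goto err;

    rc = map_p2m(ctx);
    if ( rc )
        goto err;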

> +    if ( rc )
> +        goto err;
> +

Ian.

