[Xen-changelog] [xen-unstable] merge with xen-unstable.hg
# HG changeset patch
# User awilliam@xxxxxxxxxxx
# Node ID fbc0e953732ef78292d9e87ff6dd7f3432ddd014
# Parent  7f67c15e2c917dc52a3f8acc0fdb79a63b894b15
# Parent  73c73fb8875c331b8c0e6ed0317c8d71b83cdda2
merge with xen-unstable.hg
---
 tools/security/python/xensec_tools/acm_getdecision | 55
 extras/mini-os/events.c | 12
 extras/mini-os/include/xenbus.h | 28
 extras/mini-os/kernel.c | 23
 extras/mini-os/xenbus/xenbus.c | 202 +
 linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c | 5
 linux-2.6-xen-sparse/arch/i386/mm/init-xen.c | 2
 linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c | 142
 linux-2.6-xen-sparse/arch/x86_64/mm/pageattr-xen.c | 6
 linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c | 27
 linux-2.6-xen-sparse/drivers/xen/core/gnttab.c | 15
 linux-2.6-xen-sparse/drivers/xen/core/skbuff.c | 11
 linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c | 68
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu.h | 4
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu_context.h | 11
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/setup_arch_post.h | 4
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu.h | 4
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu_context.h | 12
 linux-2.6-xen-sparse/include/xen/gnttab.h | 1
 linux-2.6-xen-sparse/kernel/fork.c | 1619 ++++++++++
 tools/console/daemon/io.c | 66
 tools/console/daemon/utils.c | 26
 tools/console/daemon/utils.h | 3
 tools/ioemu/sdl.c | 9
 tools/ioemu/target-i386-dm/helper2.c | 32
 tools/libxc/xc_elf.h | 3
 tools/libxc/xc_linux.c | 118
 tools/libxc/xc_linux_restore.c | 22
 tools/libxc/xc_load_elf.c | 32
 tools/libxc/xenctrl.h | 54
 tools/python/xen/util/security.py | 41
 tools/python/xen/xm/addlabel.py | 2
 tools/python/xen/xm/create.py | 2
 tools/python/xen/xm/main.py | 3
 tools/security/Makefile | 2
 tools/security/python/xensec_gen/cgi-bin/policy.cgi | 2
 tools/security/secpol_xml2bin.c | 6
 tools/xenmon/xenbaked.c | 55
 tools/xenstat/libxenstat/src/xenstat.c | 23
 tools/xenstore/fake_libxc.c | 4
 tools/xenstore/xenstored_core.c | 13
 tools/xenstore/xenstored_domain.c | 79
 tools/xm-test/tests/block-integrity/01_block_device_read_verify.py | 4
 tools/xm-test/tests/block-integrity/02_block_device_write_verify.py | 4
 xen/arch/x86/traps.c | 2
 xen/common/event_channel.c | 14
 xen/include/asm-ia64/event.h | 8
 xen/include/asm-x86/event.h | 8
 xen/include/xen/elf.h | 2
 49 files changed, 2417 insertions(+), 473 deletions(-)

diff -r 7f67c15e2c91 -r fbc0e953732e extras/mini-os/events.c
--- a/extras/mini-os/events.c   Thu Jun 15 10:02:53 2006 -0600
+++ b/extras/mini-os/events.c   Thu Jun 15 10:23:57 2006 -0600
@@ -35,24 +35,29 @@ int do_event(u32 port, struct pt_regs *r
     ev_action_t *action;
     if (port >= NR_EVS) {
         printk("Port number too large: %d\n", port);
-        return 0;
+        goto out;
     }

     action = &ev_actions[port];
     action->count++;

     if (!action->handler)
+    {
+        printk("Spurious event on port %d\n", port);
         goto out;
+    }

     if (action->status & EVS_DISABLED)
+    {
+        printk("Event on port %d disabled\n", port);
         goto out;
+    }

     /* call the handler */
     action->handler(port, regs);
-
-    clear_evtchn(port);
 out:
+    clear_evtchn(port);
     return 1;
 }

@@ -135,6 +140,7 @@ void init_events(void)
     {
         ev_actions[i].status = EVS_DISABLED;
         ev_actions[i].handler = default_handler;
+        mask_evtchn(i);
     }
 }

diff -r 7f67c15e2c91 -r fbc0e953732e extras/mini-os/include/xenbus.h
--- a/extras/mini-os/include/xenbus.h   Thu Jun 15 10:02:53 2006 -0600
+++ b/extras/mini-os/include/xenbus.h   Thu Jun 15 10:23:57 2006 -0600
@@ -1,6 +1,34 @@
 #ifndef XENBUS_H__
 #define XENBUS_H__
+/* Initialize the XenBus system.
*/ void init_xenbus(void); +/* Read the value associated with a path. Returns a malloc'd error + string on failure and sets *value to NULL. On success, *value is + set to a malloc'd copy of the value. */ +char *xenbus_read(const char *path, char **value); + +/* Associates a value with a path. Returns a malloc'd error string on + failure. */ +char *xenbus_write(const char *path, const char *value); + +/* Removes the value associated with a path. Returns a malloc'd error + string on failure. */ +char *xenbus_rm(const char *path); + +/* List the contents of a directory. Returns a malloc'd error string + on failure and sets *contents to NULL. On success, *contents is + set to a malloc'd array of pointers to malloc'd strings. The array + is NULL terminated. May block. */ +char *xenbus_ls(const char *prefix, char ***contents); + +/* Reads permissions associated with a path. Returns a malloc'd error + string on failure and sets *value to NULL. On success, *value is + set to a malloc'd copy of the value. */ +char *xenbus_get_perms(const char *path, char **value); + +/* Sets the permissions associated with a path. Returns a malloc'd + error string on failure. */ +char *xenbus_set_perms(const char *path, domid_t dom, char perm); #endif /* XENBUS_H__ */ diff -r 7f67c15e2c91 -r fbc0e953732e extras/mini-os/kernel.c --- a/extras/mini-os/kernel.c Thu Jun 15 10:02:53 2006 -0600 +++ b/extras/mini-os/kernel.c Thu Jun 15 10:23:57 2006 -0600 @@ -82,17 +82,6 @@ static shared_info_t *map_shared_info(un } -void test_xenbus(void); - -/* Do initialisation from a thread once the scheduler's available */ -static void init_xs(void *ign) -{ - init_xenbus(); - - test_xenbus(); -} - - u8 xen_features[XENFEAT_NR_SUBMAPS * 32]; void setup_xen_features(void) @@ -111,10 +100,18 @@ void setup_xen_features(void) } } +void test_xenbus(void); + +void xenbus_tester(void *p) +{ + test_xenbus(); +} + /* This should be overridden by the application we are linked against. */ __attribute__((weak)) int app_main(start_info_t *si) { printk("Dummy main: start_info=%p\n", si); + create_thread("xenbus_tester", xenbus_tester, si); return 0; } @@ -183,8 +180,8 @@ void start_kernel(start_info_t *si) /* Init scheduler. */ init_sched(); - /* Init XenBus from a separate thread */ - create_thread("init_xs", init_xs, NULL); + /* Init XenBus */ + init_xenbus(); /* Call (possibly overridden) app_main() */ app_main(&start_info); diff -r 7f67c15e2c91 -r fbc0e953732e extras/mini-os/xenbus/xenbus.c --- a/extras/mini-os/xenbus/xenbus.c Thu Jun 15 10:02:53 2006 -0600 +++ b/extras/mini-os/xenbus/xenbus.c Thu Jun 15 10:23:57 2006 -0600 @@ -3,11 +3,12 @@ * (C) 2006 - Cambridge University **************************************************************************** * - * File: mm.c + * File: xenbus.c * Author: Steven Smith (sos22@xxxxxxxxx) * Changes: Grzegorz Milos (gm281@xxxxxxxxx) + * Changes: John D. Ramsdell * - * Date: Mar 2006, chages Aug 2005 + * Date: Jun 2006, chages Aug 2005 * * Environment: Xen Minimal OS * Description: Minimal implementation of xenbus @@ -167,6 +168,7 @@ void init_xenbus(void) void init_xenbus(void) { int err; + printk("Initialising xenbus\n"); DEBUG("init_xenbus called.\n"); xenstore_buf = mfn_to_virt(start_info.store_mfn); create_thread("xenstore", xenbus_thread_func, NULL); @@ -262,15 +264,15 @@ static void xb_write(int type, int req_i /* Send a mesasge to xenbus, in the same fashion as xb_write, and block waiting for a reply. The reply is malloced and should be freed by the caller. 
*/ -static void *xenbus_msg_reply(int type, +static struct xsd_sockmsg * +xenbus_msg_reply(int type, int trans, struct write_req *io, int nr_reqs) { int id; DEFINE_WAIT(w); - void *rep; - struct xsd_sockmsg *repmsg; + struct xsd_sockmsg *rep; id = allocate_xenbus_id(); add_waiter(w, req_info[id].waitq); @@ -281,12 +283,26 @@ static void *xenbus_msg_reply(int type, wake(current); rep = req_info[id].reply; - repmsg = rep; - BUG_ON(repmsg->req_id != id); + BUG_ON(rep->req_id != id); release_xenbus_id(id); - return rep; } + +static char *errmsg(struct xsd_sockmsg *rep) +{ + if (!rep) { + char msg[] = "No reply"; + size_t len = strlen(msg) + 1; + return memcpy(malloc(len), msg, len); + } + if (rep->type != XS_ERROR) + return NULL; + char *res = malloc(rep->len + 1); + memcpy(res, rep + 1, rep->len); + res[rep->len] = 0; + free(rep); + return res; +} /* Send a debug message to xenbus. Can block. */ static void xenbus_debug_msg(const char *msg) @@ -296,27 +312,29 @@ static void xenbus_debug_msg(const char { "print", sizeof("print") }, { msg, len }, { "", 1 }}; - void *reply; - struct xsd_sockmsg *repmsg; - - reply = xenbus_msg_reply(XS_DEBUG, 0, req, 3); - repmsg = reply; + struct xsd_sockmsg *reply; + + reply = xenbus_msg_reply(XS_DEBUG, 0, req, ARRAY_SIZE(req)); DEBUG("Got a reply, type %d, id %d, len %d.\n", - repmsg->type, repmsg->req_id, repmsg->len); + reply->type, reply->req_id, reply->len); } /* List the contents of a directory. Returns a malloc()ed array of pointers to malloc()ed strings. The array is NULL terminated. May block. */ -static char **xenbus_ls(const char *pre) -{ - void *reply; - struct xsd_sockmsg *repmsg; +char *xenbus_ls(const char *pre, char ***contents) +{ + struct xsd_sockmsg *reply, *repmsg; struct write_req req[] = { { pre, strlen(pre)+1 } }; int nr_elems, x, i; char **res; - repmsg = xenbus_msg_reply(XS_DIRECTORY, 0, req, 1); + repmsg = xenbus_msg_reply(XS_DIRECTORY, 0, req, ARRAY_SIZE(req)); + char *msg = errmsg(repmsg); + if (msg) { + *contents = NULL; + return msg; + } reply = repmsg + 1; for (x = nr_elems = 0; x < repmsg->len; x++) nr_elems += (((char *)reply)[x] == 0); @@ -329,20 +347,91 @@ static char **xenbus_ls(const char *pre) } res[i] = NULL; free(repmsg); - return res; -} - -static char *xenbus_read(const char *path) -{ - struct write_req req[] = { {path, strlen(path) + 1}}; + *contents = res; + return NULL; +} + +char *xenbus_read(const char *path, char **value) +{ + struct write_req req[] = { {path, strlen(path) + 1} }; struct xsd_sockmsg *rep; char *res; - rep = xenbus_msg_reply(XS_READ, 0, req, 1); + rep = xenbus_msg_reply(XS_READ, 0, req, ARRAY_SIZE(req)); + char *msg = errmsg(rep); + if (msg) { + *value = NULL; + return msg; + } res = malloc(rep->len + 1); memcpy(res, rep + 1, rep->len); res[rep->len] = 0; free(rep); - return res; + *value = res; + return NULL; +} + +char *xenbus_write(const char *path, const char *value) +{ + struct write_req req[] = { + {path, strlen(path) + 1}, + {value, strlen(value) + 1}, + }; + struct xsd_sockmsg *rep; + rep = xenbus_msg_reply(XS_WRITE, 0, req, ARRAY_SIZE(req)); + char *msg = errmsg(rep); + if (msg) + return msg; + free(rep); + return NULL; +} + +char *xenbus_rm(const char *path) +{ + struct write_req req[] = { {path, strlen(path) + 1} }; + struct xsd_sockmsg *rep; + rep = xenbus_msg_reply(XS_RM, 0, req, ARRAY_SIZE(req)); + char *msg = errmsg(rep); + if (msg) + return msg; + free(rep); + return NULL; +} + +char *xenbus_get_perms(const char *path, char **value) +{ + struct write_req req[] = { {path, 
strlen(path) + 1} }; + struct xsd_sockmsg *rep; + char *res; + rep = xenbus_msg_reply(XS_GET_PERMS, 0, req, ARRAY_SIZE(req)); + char *msg = errmsg(rep); + if (msg) { + *value = NULL; + return msg; + } + res = malloc(rep->len + 1); + memcpy(res, rep + 1, rep->len); + res[rep->len] = 0; + free(rep); + *value = res; + return NULL; +} + +#define PERM_MAX_SIZE 32 +char *xenbus_set_perms(const char *path, domid_t dom, char perm) +{ + char value[PERM_MAX_SIZE]; + snprintf(value, PERM_MAX_SIZE, "%c%hu", perm, dom); + struct write_req req[] = { + {path, strlen(path) + 1}, + {value, strlen(value) + 1}, + }; + struct xsd_sockmsg *rep; + rep = xenbus_msg_reply(XS_SET_PERMS, 0, req, ARRAY_SIZE(req)); + char *msg = errmsg(rep); + if (msg) + return msg; + free(rep); + return NULL; } static void do_ls_test(const char *pre) @@ -351,7 +440,12 @@ static void do_ls_test(const char *pre) int x; DEBUG("ls %s...\n", pre); - dirs = xenbus_ls(pre); + char *msg = xenbus_ls(pre, &dirs); + if (msg) { + DEBUG("Error in xenbus ls: %s\n", msg); + free(msg); + return; + } for (x = 0; dirs[x]; x++) { DEBUG("ls %s[%d] -> %s\n", pre, x, dirs[x]); @@ -364,9 +458,38 @@ static void do_read_test(const char *pat { char *res; DEBUG("Read %s...\n", path); - res = xenbus_read(path); + char *msg = xenbus_read(path, &res); + if (msg) { + DEBUG("Error in xenbus read: %s\n", msg); + free(msg); + return; + } DEBUG("Read %s -> %s.\n", path, res); free(res); +} + +static void do_write_test(const char *path, const char *val) +{ + DEBUG("Write %s to %s...\n", val, path); + char *msg = xenbus_write(path, val); + if (msg) { + DEBUG("Result %s\n", msg); + free(msg); + } else { + DEBUG("Success.\n"); + } +} + +static void do_rm_test(const char *path) +{ + DEBUG("rm %s...\n", path); + char *msg = xenbus_rm(path); + if (msg) { + DEBUG("Result %s\n", msg); + free(msg); + } else { + DEBUG("Success.\n"); + } } /* Simple testing thing */ @@ -383,5 +506,22 @@ void test_xenbus(void) DEBUG("Doing read test.\n"); do_read_test("device/vif/0/mac"); do_read_test("device/vif/0/backend"); - printk("Xenbus initialised.\n"); -} + + DEBUG("Doing write test.\n"); + do_write_test("device/vif/0/flibble", "flobble"); + do_read_test("device/vif/0/flibble"); + do_write_test("device/vif/0/flibble", "widget"); + do_read_test("device/vif/0/flibble"); + + DEBUG("Doing rm test.\n"); + do_rm_test("device/vif/0/flibble"); + do_read_test("device/vif/0/flibble"); + DEBUG("(Should have said ENOENT)\n"); +} + +/* + * Local variables: + * mode: C + * c-basic-offset: 4 + * End: + */ diff -r 7f67c15e2c91 -r fbc0e953732e linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c --- a/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c Thu Jun 15 10:02:53 2006 -0600 +++ b/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c Thu Jun 15 10:23:57 2006 -0600 @@ -133,6 +133,7 @@ void xen_tlb_flush(void) op.cmd = MMUEXT_TLB_FLUSH_LOCAL; BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } +EXPORT_SYMBOL(xen_tlb_flush); void xen_invlpg(unsigned long ptr) { @@ -141,6 +142,7 @@ void xen_invlpg(unsigned long ptr) op.arg1.linear_addr = ptr & PAGE_MASK; BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } +EXPORT_SYMBOL(xen_invlpg); #ifdef CONFIG_SMP @@ -363,7 +365,8 @@ void xen_destroy_contiguous_region(unsig }; set_xen_guest_handle(reservation.extent_start, &frame); - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_feature(XENFEAT_auto_translated_physmap) || + !test_bit(__pa(vstart) >> PAGE_SHIFT, contiguous_bitmap)) return; scrub_pages(vstart, 1 << order); diff -r 7f67c15e2c91 -r 
fbc0e953732e linux-2.6-xen-sparse/arch/i386/mm/init-xen.c --- a/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c Thu Jun 15 10:02:53 2006 -0600 +++ b/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c Thu Jun 15 10:23:57 2006 -0600 @@ -763,7 +763,7 @@ void __init pgtable_cache_init(void) #endif 0, pgd_ctor, - pgd_dtor); + PTRS_PER_PMD == 1 ? pgd_dtor : NULL); if (!pgd_cache) panic("pgtable_cache_init(): Cannot create pgd cache"); } diff -r 7f67c15e2c91 -r fbc0e953732e linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c --- a/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c Thu Jun 15 10:02:53 2006 -0600 +++ b/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c Thu Jun 15 10:23:57 2006 -0600 @@ -300,11 +300,6 @@ void pgd_ctor(void *pgd, kmem_cache_t *c unsigned long flags; if (PTRS_PER_PMD > 1) { - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) { - int rc = xen_create_contiguous_region( - (unsigned long)pgd, 0, 32); - BUG_ON(rc); - } if (HAVE_SHARED_KERNEL_PMD) clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, @@ -320,69 +315,105 @@ void pgd_ctor(void *pgd, kmem_cache_t *c } } +/* never called when PTRS_PER_PMD > 1 */ void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) { unsigned long flags; /* can be called from interrupt context */ - if (PTRS_PER_PMD > 1) { - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) - xen_destroy_contiguous_region((unsigned long)pgd, 0); - } else { - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); - - pgd_test_and_unpin(pgd); - } + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_del(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); + + pgd_test_and_unpin(pgd); } pgd_t *pgd_alloc(struct mm_struct *mm) { int i; pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); + pmd_t **pmd; + unsigned long flags; pgd_test_and_unpin(pgd); if (PTRS_PER_PMD == 1 || !pgd) return pgd; - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); - if (!pmd) - goto out_oom; - set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); - } - - if (!HAVE_SHARED_KERNEL_PMD) { - unsigned long flags; - - for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { + if (HAVE_SHARED_KERNEL_PMD) { + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); if (!pmd) goto out_oom; set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); } - - spin_lock_irqsave(&pgd_lock, flags); - for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { - unsigned long v = (unsigned long)i << PGDIR_SHIFT; - pgd_t *kpgd = pgd_offset_k(v); - pud_t *kpud = pud_offset(kpgd, v); - pmd_t *kpmd = pmd_offset(kpud, v); - pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); - memcpy(pmd, kpmd, PAGE_SIZE); - make_lowmem_page_readonly( - pmd, XENFEAT_writable_page_tables); + return pgd; + } + + /* + * We can race save/restore (if we sleep during a GFP_KERNEL memory + * allocation). We therefore store virtual addresses of pmds as they + * do not change across save/restore, and poke the machine addresses + * into the pgdir under the pgd_lock. + */ + pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL); + if (!pmd) { + kmem_cache_free(pgd_cache, pgd); + return NULL; + } + + /* Allocate pmds, remember virtual addresses. */ + for (i = 0; i < PTRS_PER_PGD; ++i) { + pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + if (!pmd[i]) + goto out_oom; + } + + spin_lock_irqsave(&pgd_lock, flags); + + /* Protect against save/restore: move below 4GB under pgd_lock. 
*/ + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) { + int rc = xen_create_contiguous_region( + (unsigned long)pgd, 0, 32); + if (rc) { + spin_unlock_irqrestore(&pgd_lock, flags); + goto out_oom; } - pgd_list_add(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); - } + } + + /* Copy kernel pmd contents and write-protect the new pmds. */ + for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { + unsigned long v = (unsigned long)i << PGDIR_SHIFT; + pgd_t *kpgd = pgd_offset_k(v); + pud_t *kpud = pud_offset(kpgd, v); + pmd_t *kpmd = pmd_offset(kpud, v); + memcpy(pmd[i], kpmd, PAGE_SIZE); + make_lowmem_page_readonly( + pmd[i], XENFEAT_writable_page_tables); + } + + /* It is safe to poke machine addresses of pmds under the pmd_lock. */ + for (i = 0; i < PTRS_PER_PGD; i++) + set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i]))); + + /* Ensure this pgd gets picked up and pinned on save/restore. */ + pgd_list_add(pgd); + + spin_unlock_irqrestore(&pgd_lock, flags); + + kfree(pmd); return pgd; out_oom: - for (i--; i >= 0; i--) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + if (HAVE_SHARED_KERNEL_PMD) { + for (i--; i >= 0; i--) + kmem_cache_free(pmd_cache, + (void *)__va(pgd_val(pgd[i])-1)); + } else { + for (i--; i >= 0; i--) + kmem_cache_free(pmd_cache, pmd[i]); + kfree(pmd); + } kmem_cache_free(pgd_cache, pgd); return NULL; } @@ -391,6 +422,14 @@ void pgd_free(pgd_t *pgd) { int i; + /* + * After this the pgd should not be pinned for the duration of this + * function's execution. We should never sleep and thus never race: + * 1. User pmds will not become write-protected under our feet due + * to a concurrent mm_pin_all(). + * 2. The machine addresses in PGD entries will not become invalid + * due to a concurrent save/restore. + */ pgd_test_and_unpin(pgd); /* in the PAE case user pgd entries are overwritten before usage */ @@ -399,11 +438,13 @@ void pgd_free(pgd_t *pgd) pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); kmem_cache_free(pmd_cache, pmd); } + if (!HAVE_SHARED_KERNEL_PMD) { unsigned long flags; spin_lock_irqsave(&pgd_lock, flags); pgd_list_del(pgd); spin_unlock_irqrestore(&pgd_lock, flags); + for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); make_lowmem_page_writable( @@ -411,8 +452,13 @@ void pgd_free(pgd_t *pgd) memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); kmem_cache_free(pmd_cache, pmd); } + + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) + xen_destroy_contiguous_region( + (unsigned long)pgd, 0); } } + /* in the non-PAE case, free_pgtables() clears user pgd entries */ kmem_cache_free(pgd_cache, pgd); } @@ -588,7 +634,7 @@ void mm_pin(struct mm_struct *mm) void mm_pin(struct mm_struct *mm) { if (xen_feature(XENFEAT_writable_page_tables)) - return; + return; spin_lock(&mm->page_table_lock); __pgd_pin(mm->pgd); spin_unlock(&mm->page_table_lock); @@ -597,7 +643,7 @@ void mm_unpin(struct mm_struct *mm) void mm_unpin(struct mm_struct *mm) { if (xen_feature(XENFEAT_writable_page_tables)) - return; + return; spin_lock(&mm->page_table_lock); __pgd_unpin(mm->pgd); spin_unlock(&mm->page_table_lock); @@ -607,11 +653,17 @@ void mm_pin_all(void) { struct page *page; if (xen_feature(XENFEAT_writable_page_tables)) - return; + return; for (page = pgd_list; page; page = (struct page *)page->index) { if (!test_bit(PG_pinned, &page->flags)) __pgd_pin((pgd_t *)page_address(page)); } +} + +void _arch_dup_mmap(struct mm_struct *mm) +{ + if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags)) + mm_pin(mm); } void _arch_exit_mmap(struct mm_struct *mm) diff -r 
7f67c15e2c91 -r fbc0e953732e linux-2.6-xen-sparse/arch/x86_64/mm/pageattr-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/mm/pageattr-xen.c Thu Jun 15 10:02:53 2006 -0600 +++ b/linux-2.6-xen-sparse/arch/x86_64/mm/pageattr-xen.c Thu Jun 15 10:23:57 2006 -0600 @@ -130,6 +130,12 @@ void mm_pin_all(void) context.unpinned)); } +void _arch_dup_mmap(struct mm_struct *mm) +{ + if (!mm->context.pinned) + mm_pin(mm); +} + void _arch_exit_mmap(struct mm_struct *mm) { struct task_struct *tsk = current; diff -r 7f67c15e2c91 -r fbc0e953732e linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c Thu Jun 15 10:02:53 2006 -0600 +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c Thu Jun 15 10:23:57 2006 -0600 @@ -342,8 +342,20 @@ static void blkfront_closing(struct xenb static void blkfront_closing(struct xenbus_device *dev) { struct blkfront_info *info = dev->dev.driver_data; + unsigned long flags; DPRINTK("blkfront_closing: %s removed\n", dev->nodename); + + if (info->rq == NULL) + return; + + spin_lock_irqsave(&blkif_io_lock, flags); + /* No more blkif_request(). */ + blk_stop_queue(info->rq); + /* No more gnttab callback work. */ + gnttab_cancel_free_callback(&info->callback); + flush_scheduled_work(); + spin_unlock_irqrestore(&blkif_io_lock, flags); xlvbd_del(info); @@ -407,7 +419,8 @@ static void blkif_restart_queue(void *ar { struct blkfront_info *info = (struct blkfront_info *)arg; spin_lock_irq(&blkif_io_lock); - kick_pending_request_queues(info); + if (info->connected == BLKIF_STATE_CONNECTED) + kick_pending_request_queues(info); spin_unlock_irq(&blkif_io_lock); } @@ -695,6 +708,12 @@ static void blkif_free(struct blkfront_i spin_lock_irq(&blkif_io_lock); info->connected = suspend ? BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; + /* No more blkif_request(). */ + if (info->rq) + blk_stop_queue(info->rq); + /* No more gnttab callback work. */ + gnttab_cancel_free_callback(&info->callback); + flush_scheduled_work(); spin_unlock_irq(&blkif_io_lock); /* Free resources associated with old device channel. 
*/ @@ -768,17 +787,17 @@ static void blkif_recover(struct blkfron (void)xenbus_switch_state(info->xbdev, XenbusStateConnected); + spin_lock_irq(&blkif_io_lock); + /* Now safe for us to use the shared ring */ - spin_lock_irq(&blkif_io_lock); info->connected = BLKIF_STATE_CONNECTED; - spin_unlock_irq(&blkif_io_lock); /* Send off requeued requests */ flush_requests(info); /* Kick any other new requests queued since we resumed */ - spin_lock_irq(&blkif_io_lock); kick_pending_request_queues(info); + spin_unlock_irq(&blkif_io_lock); } diff -r 7f67c15e2c91 -r fbc0e953732e linux-2.6-xen-sparse/drivers/xen/core/gnttab.c --- a/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c Thu Jun 15 10:02:53 2006 -0600 +++ b/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c Thu Jun 15 10:23:57 2006 -0600 @@ -334,6 +334,21 @@ out: } EXPORT_SYMBOL_GPL(gnttab_request_free_callback); +void gnttab_cancel_free_callback(struct gnttab_free_callback *callback) +{ + struct gnttab_free_callback **pcb; + unsigned long flags; + + spin_lock_irqsave(&gnttab_list_lock, flags); + for (pcb = &gnttab_free_callback_list; *pcb; pcb = &(*pcb)->next) { + if (*pcb == callback) { + *pcb = callback->next; + break; + } + } + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} + #ifndef __ia64__ static int map_pte_fn(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) diff -r 7f67c15e2c91 -r fbc0e953732e linux-2.6-xen-sparse/drivers/xen/core/skbuff.c --- a/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c Thu Jun 15 10:02:53 2006 -0600 +++ b/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c Thu Jun 15 10:23:57 2006 -0600 @@ -121,8 +121,15 @@ static int __init skbuff_init(void) for (order = 0; order <= MAX_SKBUFF_ORDER; order++) { size = PAGE_SIZE << order; sprintf(name[order], "xen-skb-%lu", size); - skbuff_order_cachep[order] = kmem_cache_create( - name[order], size, size, 0, skbuff_ctor, skbuff_dtor); + if (is_running_on_xen() && + (xen_start_info->flags & SIF_PRIVILEGED)) + skbuff_order_cachep[order] = kmem_cache_create( + name[order], size, size, 0, + skbuff_ctor, skbuff_dtor); + else + skbuff_order_cachep[order] = kmem_cache_create( + name[order], size, size, 0, NULL, NULL); + } skbuff_cachep = skbuff_order_cachep[0]; diff -r 7f67c15e2c91 -r fbc0e953732e linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c --- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Thu Jun 15 10:02:53 2006 -0600 +++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Thu Jun 15 10:23:57 2006 -0600 @@ -1072,68 +1072,39 @@ static void xennet_set_features(struct n static void network_connect(struct net_device *dev) { - struct netfront_info *np; + struct netfront_info *np = netdev_priv(dev); int i, requeue_idx; - struct netif_tx_request *tx; struct sk_buff *skb; xennet_set_features(dev); - np = netdev_priv(dev); spin_lock_irq(&np->tx_lock); spin_lock(&np->rx_lock); - /* Recovery procedure: */ - /* - * Step 1: Rebuild the RX and TX ring contents. - * NB. We could just free the queued TX packets now but we hope - * that sending them out might do some good. We have to rebuild - * the RX ring because some of our pages are currently flipped out - * so we can't just free the RX skbs. - * NB2. Freelist index entries are always going to be less than + * Recovery procedure: + * NB. Freelist index entries are always going to be less than * PAGE_OFFSET, whereas pointers to skbs will always be equal or - * greater than PAGE_OFFSET: we use this property to distinguish - * them. - */ - - /* - * Rebuild the TX buffer freelist and the TX ring itself. - * NB. 
This reorders packets. We could keep more private state - * to avoid this but maybe it doesn't matter so much given the - * interface has been down. - */ + * greater than PAGE_OFFSET: we use this property to distinguish + * them. + */ + + /* Step 1: Discard all pending TX packet fragments. */ for (requeue_idx = 0, i = 1; i <= NET_TX_RING_SIZE; i++) { if ((unsigned long)np->tx_skbs[i] < PAGE_OFFSET) continue; skb = np->tx_skbs[i]; - - tx = RING_GET_REQUEST(&np->tx, requeue_idx); - requeue_idx++; - - tx->id = i; - gnttab_grant_foreign_access_ref( - np->grant_tx_ref[i], np->xbdev->otherend_id, - virt_to_mfn(np->tx_skbs[i]->data), - GNTMAP_readonly); - tx->gref = np->grant_tx_ref[i]; - tx->offset = (unsigned long)skb->data & ~PAGE_MASK; - tx->size = skb->len; - tx->flags = 0; - if (skb->ip_summed == CHECKSUM_HW) /* local packet? */ - tx->flags |= NETTXF_csum_blank | NETTXF_data_validated; - if (skb->proto_data_valid) /* remote but checksummed? */ - tx->flags |= NETTXF_data_validated; - - np->stats.tx_bytes += skb->len; - np->stats.tx_packets++; - } - - np->tx.req_prod_pvt = requeue_idx; - RING_PUSH_REQUESTS(&np->tx); - - /* Rebuild the RX buffer freelist and the RX ring itself. */ + gnttab_end_foreign_access_ref( + np->grant_tx_ref[i], GNTMAP_readonly); + gnttab_release_grant_reference( + &np->gref_tx_head, np->grant_tx_ref[i]); + np->grant_tx_ref[i] = GRANT_INVALID_REF; + add_id_to_freelist(np->tx_skbs, i); + dev_kfree_skb_irq(skb); + } + + /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */ for (requeue_idx = 0, i = 1; i <= NET_RX_RING_SIZE; i++) { if ((unsigned long)np->rx_skbs[i] < PAGE_OFFSET) continue; @@ -1150,7 +1121,7 @@ static void network_connect(struct net_d RING_PUSH_REQUESTS(&np->rx); /* - * Step 2: All public and private state should now be sane. Get + * Step 3: All public and private state should now be sane. Get * ready to start sending and receiving packets and give the driver * domain a kick because we've probably just requeued some * packets. 
@@ -1158,6 +1129,7 @@ static void network_connect(struct net_d netif_carrier_on(dev); notify_remote_via_irq(np->irq); network_tx_buf_gc(dev); + network_alloc_rx_buffers(dev); spin_unlock(&np->rx_lock); spin_unlock_irq(&np->tx_lock); diff -r 7f67c15e2c91 -r fbc0e953732e linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu.h Thu Jun 15 10:02:53 2006 -0600 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu.h Thu Jun 15 10:23:57 2006 -0600 @@ -18,4 +18,8 @@ extern void _arch_exit_mmap(struct mm_st extern void _arch_exit_mmap(struct mm_struct *mm); #define arch_exit_mmap(_mm) _arch_exit_mmap(_mm) +/* kernel/fork.c:dup_mmap hook */ +extern void _arch_dup_mmap(struct mm_struct *mm); +#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm)) + #endif diff -r 7f67c15e2c91 -r fbc0e953732e linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu_context.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu_context.h Thu Jun 15 10:02:53 2006 -0600 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu_context.h Thu Jun 15 10:23:57 2006 -0600 @@ -51,8 +51,7 @@ static inline void switch_mm(struct mm_s struct mmuext_op _op[2], *op = _op; if (likely(prev != next)) { - if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags)) - mm_pin(next); + BUG_ON(!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags)); /* stop flush ipis for the previous mm */ cpu_clear(cpu, prev->cpu_vm_mask); @@ -99,7 +98,11 @@ static inline void switch_mm(struct mm_s #define deactivate_mm(tsk, mm) \ asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0)) -#define activate_mm(prev, next) \ - switch_mm((prev),(next),NULL) +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) +{ + if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags)) + mm_pin(next); + switch_mm(prev, next, NULL); +} #endif diff -r 7f67c15e2c91 -r fbc0e953732e linux-2.6-xen-sparse/include/asm-i386/mach-xen/setup_arch_post.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/setup_arch_post.h Thu Jun 15 10:02:53 2006 -0600 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/setup_arch_post.h Thu Jun 15 10:23:57 2006 -0600 @@ -25,9 +25,9 @@ static char * __init machine_specific_me if ( rc == -ENOSYS ) { memmap.nr_entries = 1; map[0].addr = 0ULL; - map[0].size = xen_start_info->nr_pages << PAGE_SHIFT; + map[0].size = PFN_PHYS(xen_start_info->nr_pages); /* 8MB slack (to balance backend allocations). 
*/ - map[0].size += 8 << 20; + map[0].size += 8ULL << 20; map[0].type = E820_RAM; rc = 0; } diff -r 7f67c15e2c91 -r fbc0e953732e linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu.h --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu.h Thu Jun 15 10:02:53 2006 -0600 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu.h Thu Jun 15 10:23:57 2006 -0600 @@ -28,6 +28,10 @@ extern spinlock_t mm_unpinned_lock; /* mm/memory.c:exit_mmap hook */ extern void _arch_exit_mmap(struct mm_struct *mm); #define arch_exit_mmap(_mm) _arch_exit_mmap(_mm) + +/* kernel/fork.c:dup_mmap hook */ +extern void _arch_dup_mmap(struct mm_struct *mm); +#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm)) #endif #endif diff -r 7f67c15e2c91 -r fbc0e953732e linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu_context.h --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu_context.h Thu Jun 15 10:02:53 2006 -0600 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu_context.h Thu Jun 15 10:23:57 2006 -0600 @@ -73,8 +73,7 @@ static inline void switch_mm(struct mm_s struct mmuext_op _op[3], *op = _op; if (likely(prev != next)) { - if (!next->context.pinned) - mm_pin(next); + BUG_ON(!next->context.pinned); /* stop flush ipis for the previous mm */ clear_bit(cpu, &prev->cpu_vm_mask); @@ -127,8 +126,11 @@ static inline void switch_mm(struct mm_s asm volatile("movl %0,%%fs"::"r"(0)); \ } while(0) -#define activate_mm(prev, next) do { \ - switch_mm((prev),(next),NULL); \ -} while (0) +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) +{ + if (!next->context.pinned) + mm_pin(next); + switch_mm(prev, next, NULL); +} #endif diff -r 7f67c15e2c91 -r fbc0e953732e linux-2.6-xen-sparse/include/xen/gnttab.h --- a/linux-2.6-xen-sparse/include/xen/gnttab.h Thu Jun 15 10:02:53 2006 -0600 +++ b/linux-2.6-xen-sparse/include/xen/gnttab.h Thu Jun 15 10:23:57 2006 -0600 @@ -100,6 +100,7 @@ void gnttab_release_grant_reference(gran void gnttab_request_free_callback(struct gnttab_free_callback *callback, void (*fn)(void *), void *arg, u16 count); +void gnttab_cancel_free_callback(struct gnttab_free_callback *callback); void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, unsigned long frame, int readonly); diff -r 7f67c15e2c91 -r fbc0e953732e tools/console/daemon/io.c --- a/tools/console/daemon/io.c Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/console/daemon/io.c Thu Jun 15 10:23:57 2006 -0600 @@ -24,8 +24,8 @@ #include "io.h" #include <xenctrl.h> #include <xs.h> -#include <xen/linux/evtchn.h> #include <xen/io/console.h> +#include <xenctrl.h> #include <malloc.h> #include <stdlib.h> @@ -36,7 +36,6 @@ #include <unistd.h> #include <termios.h> #include <stdarg.h> -#include <sys/ioctl.h> #include <sys/mman.h> #define MAX(a, b) (((a) > (b)) ? 
(a) : (b)) @@ -64,18 +63,11 @@ struct domain char *conspath; int ring_ref; evtchn_port_t local_port; - int evtchn_fd; + int xce_handle; struct xencons_interface *interface; }; static struct domain *dom_head; - -static void evtchn_notify(struct domain *dom) -{ - struct ioctl_evtchn_notify notify; - notify.port = dom->local_port; - (void)ioctl(dom->evtchn_fd, IOCTL_EVTCHN_NOTIFY, ¬ify); -} static void buffer_append(struct domain *dom) { @@ -106,7 +98,7 @@ static void buffer_append(struct domain mb(); intf->out_cons = cons; - evtchn_notify(dom); + xc_evtchn_notify(dom->xce_handle, dom->local_port); if (buffer->max_capacity && buffer->size > buffer->max_capacity) { @@ -234,7 +226,6 @@ static int domain_create_ring(struct dom static int domain_create_ring(struct domain *dom) { int err, remote_port, ring_ref, rc; - struct ioctl_evtchn_bind_interdomain bind; err = xs_gather(xs, dom->conspath, "ring-ref", "%u", &ring_ref, @@ -258,24 +249,24 @@ static int domain_create_ring(struct dom } dom->local_port = -1; - if (dom->evtchn_fd != -1) - close(dom->evtchn_fd); + if (dom->xce_handle != -1) + xc_evtchn_close(dom->xce_handle); /* Opening evtchn independently for each console is a bit * wasteful, but that's how the code is structured... */ - dom->evtchn_fd = open("/dev/xen/evtchn", O_RDWR); - if (dom->evtchn_fd == -1) { + dom->xce_handle = xc_evtchn_open(); + if (dom->xce_handle == -1) { err = errno; goto out; } - bind.remote_domain = dom->domid; - bind.remote_port = remote_port; - rc = ioctl(dom->evtchn_fd, IOCTL_EVTCHN_BIND_INTERDOMAIN, &bind); + rc = xc_evtchn_bind_interdomain(dom->xce_handle, + dom->domid, remote_port); + if (rc == -1) { err = errno; - close(dom->evtchn_fd); - dom->evtchn_fd = -1; + xc_evtchn_close(dom->xce_handle); + dom->xce_handle = -1; goto out; } dom->local_port = rc; @@ -285,8 +276,8 @@ static int domain_create_ring(struct dom if (dom->tty_fd == -1) { err = errno; - close(dom->evtchn_fd); - dom->evtchn_fd = -1; + xc_evtchn_close(dom->xce_handle); + dom->xce_handle = -1; dom->local_port = -1; goto out; } @@ -344,7 +335,7 @@ static struct domain *create_domain(int dom->ring_ref = -1; dom->local_port = -1; dom->interface = NULL; - dom->evtchn_fd = -1; + dom->xce_handle = -1; if (!watch_domain(dom, true)) goto out; @@ -409,9 +400,9 @@ static void shutdown_domain(struct domai if (d->interface != NULL) munmap(d->interface, getpagesize()); d->interface = NULL; - if (d->evtchn_fd != -1) - close(d->evtchn_fd); - d->evtchn_fd = -1; + if (d->xce_handle != -1) + xc_evtchn_close(d->xce_handle); + d->xce_handle = -1; cleanup_domain(d); } @@ -483,7 +474,7 @@ static void handle_tty_read(struct domai } wmb(); intf->in_prod = prod; - evtchn_notify(dom); + xc_evtchn_notify(dom->xce_handle, dom->local_port); } else { close(dom->tty_fd); dom->tty_fd = -1; @@ -516,14 +507,14 @@ static void handle_tty_write(struct doma static void handle_ring_read(struct domain *dom) { - evtchn_port_t v; - - if (!read_sync(dom->evtchn_fd, &v, sizeof(v))) + evtchn_port_t port; + + if ((port = xc_evtchn_pending(dom->xce_handle)) == -1) return; buffer_append(dom); - (void)write_sync(dom->evtchn_fd, &v, sizeof(v)); + (void)xc_evtchn_unmask(dom->xce_handle, port); } static void handle_xs(void) @@ -566,9 +557,10 @@ void handle_io(void) max_fd = MAX(xs_fileno(xs), max_fd); for (d = dom_head; d; d = d->next) { - if (d->evtchn_fd != -1) { - FD_SET(d->evtchn_fd, &readfds); - max_fd = MAX(d->evtchn_fd, max_fd); + if (d->xce_handle != -1) { + int evtchn_fd = xc_evtchn_fd(d->xce_handle); + FD_SET(evtchn_fd, &readfds); + max_fd 
= MAX(evtchn_fd, max_fd); } if (d->tty_fd != -1) { @@ -588,8 +580,8 @@ void handle_io(void) for (d = dom_head; d; d = n) { n = d->next; - if (d->evtchn_fd != -1 && - FD_ISSET(d->evtchn_fd, &readfds)) + if (d->xce_handle != -1 && + FD_ISSET(xc_evtchn_fd(d->xce_handle), &readfds)) handle_ring_read(d); if (d->tty_fd != -1) { diff -r 7f67c15e2c91 -r fbc0e953732e tools/console/daemon/utils.c --- a/tools/console/daemon/utils.c Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/console/daemon/utils.c Thu Jun 15 10:23:57 2006 -0600 @@ -38,32 +38,6 @@ struct xs_handle *xs; int xc; - -bool _read_write_sync(int fd, void *data, size_t size, bool do_read) -{ - size_t offset = 0; - ssize_t len; - - while (offset < size) { - if (do_read) { - len = read(fd, data + offset, size - offset); - } else { - len = write(fd, data + offset, size - offset); - } - - if (len < 1) { - if (len == -1 && (errno == EAGAIN || errno == EINTR)) { - continue; - } else { - return false; - } - } else { - offset += len; - } - } - - return true; -} static void child_exit(int sig) { diff -r 7f67c15e2c91 -r fbc0e953732e tools/console/daemon/utils.h --- a/tools/console/daemon/utils.h Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/console/daemon/utils.h Thu Jun 15 10:23:57 2006 -0600 @@ -29,9 +29,6 @@ void daemonize(const char *pidfile); bool xen_setup(void); -#define read_sync(fd, buffer, size) _read_write_sync(fd, buffer, size, true) -#define write_sync(fd, buffer, size) _read_write_sync(fd, buffer, size, false) -bool _read_write_sync(int fd, void *data, size_t size, bool do_read); extern struct xs_handle *xs; extern int xc; diff -r 7f67c15e2c91 -r fbc0e953732e tools/ioemu/sdl.c --- a/tools/ioemu/sdl.c Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/ioemu/sdl.c Thu Jun 15 10:23:57 2006 -0600 @@ -376,13 +376,18 @@ static void sdl_update_caption(void) static void sdl_hide_cursor(void) { - SDL_SetCursor(sdl_cursor_hidden); + if (kbd_mouse_is_absolute()) { + SDL_ShowCursor(1); + SDL_SetCursor(sdl_cursor_hidden); + } else { + SDL_ShowCursor(0); + } } static void sdl_show_cursor(void) { if (!kbd_mouse_is_absolute()) { - SDL_SetCursor(sdl_cursor_normal); + SDL_ShowCursor(1); } } diff -r 7f67c15e2c91 -r fbc0e953732e tools/ioemu/target-i386-dm/helper2.c --- a/tools/ioemu/target-i386-dm/helper2.c Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/ioemu/target-i386-dm/helper2.c Thu Jun 15 10:23:57 2006 -0600 @@ -47,11 +47,9 @@ #include <limits.h> #include <fcntl.h> -#include <sys/ioctl.h> #include <xenctrl.h> #include <xen/hvm/ioreq.h> -#include <xen/linux/evtchn.h> #include "cpu.h" #include "exec-all.h" @@ -123,7 +121,7 @@ target_ulong cpu_get_phys_page_debug(CPU } //the evtchn fd for polling -int evtchn_fd = -1; +int xce_handle = -1; //which vcpu we are serving int send_vcpu = 0; @@ -170,11 +168,10 @@ static ioreq_t* __cpu_get_ioreq(int vcpu //retval--the number of ioreq packet static ioreq_t* cpu_get_ioreq(void) { - int i, rc; + int i; evtchn_port_t port; - rc = read(evtchn_fd, &port, sizeof(port)); - if ( rc == sizeof(port) ) { + if ( (port = xc_evtchn_pending(xce_handle)) != -1 ) { for ( i = 0; i < vcpus; i++ ) if ( shared_page->vcpu_iodata[i].dm_eport == port ) break; @@ -184,8 +181,7 @@ static ioreq_t* cpu_get_ioreq(void) exit(1); } - // unmask the wanted port again - write(evtchn_fd, &port, sizeof(port)); + xc_evtchn_unmask(xce_handle, port); //get the io packet from shared memory send_vcpu = i; @@ -436,6 +432,7 @@ int main_loop(void) extern int shutdown_requested; CPUState *env = global_env; int retval; + int evtchn_fd = xc_evtchn_fd(xce_handle); extern void 
main_loop_wait(int); /* Watch stdin (fd 0) to see when it has input. */ @@ -475,11 +472,9 @@ int main_loop(void) main_loop_wait(0); if (env->send_event) { - struct ioctl_evtchn_notify notify; - env->send_event = 0; - notify.port = shared_page->vcpu_iodata[send_vcpu].dm_eport; - (void)ioctl(evtchn_fd, IOCTL_EVTCHN_NOTIFY, ¬ify); + (void)xc_evtchn_notify(xce_handle, + shared_page->vcpu_iodata[send_vcpu].dm_eport); } } destroy_hvm_domain(); @@ -511,7 +506,6 @@ CPUState * cpu_init() CPUState * cpu_init() { CPUX86State *env; - struct ioctl_evtchn_bind_interdomain bind; int i, rc; cpu_exec_init(); @@ -523,21 +517,19 @@ CPUState * cpu_init() cpu_single_env = env; - if (evtchn_fd != -1)//the evtchn has been opened by another cpu object + if (xce_handle != -1)//the evtchn has been opened by another cpu object return NULL; - //use nonblock reading not polling, may change in future. - evtchn_fd = open("/dev/xen/evtchn", O_RDWR|O_NONBLOCK); - if (evtchn_fd == -1) { + xce_handle = xc_evtchn_open(); + if (xce_handle == -1) { fprintf(logfile, "open evtchn device error %d\n", errno); return NULL; } /* FIXME: how about if we overflow the page here? */ - bind.remote_domain = domid; for ( i = 0; i < vcpus; i++ ) { - bind.remote_port = shared_page->vcpu_iodata[i].vp_eport; - rc = ioctl(evtchn_fd, IOCTL_EVTCHN_BIND_INTERDOMAIN, &bind); + rc = xc_evtchn_bind_interdomain(xce_handle, domid, + shared_page->vcpu_iodata[i].vp_eport); if ( rc == -1 ) { fprintf(logfile, "bind interdomain ioctl error %d\n", errno); return NULL; diff -r 7f67c15e2c91 -r fbc0e953732e tools/libxc/xc_elf.h --- a/tools/libxc/xc_elf.h Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/libxc/xc_elf.h Thu Jun 15 10:23:57 2006 -0600 @@ -170,13 +170,14 @@ typedef struct { #define EM_PARISC 15 /* HPPA */ #define EM_SPARC32PLUS 18 /* Enhanced instruction set SPARC */ #define EM_PPC 20 /* PowerPC */ +#define EM_PPC64 21 /* PowerPC 64-bit */ #define EM_ARM 40 /* Advanced RISC Machines ARM */ #define EM_ALPHA 41 /* DEC ALPHA */ #define EM_SPARCV9 43 /* SPARC version 9 */ #define EM_ALPHA_EXP 0x9026 /* DEC ALPHA */ +#define EM_IA_64 50 /* Intel Merced */ #define EM_X86_64 62 /* AMD x86-64 architecture */ #define EM_VAX 75 /* DEC VAX */ -#define EM_NUM 15 /* number of machine types */ /* Version */ #define EV_NONE 0 /* Invalid */ diff -r 7f67c15e2c91 -r fbc0e953732e tools/libxc/xc_linux.c --- a/tools/libxc/xc_linux.c Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/libxc/xc_linux.c Thu Jun 15 10:23:57 2006 -0600 @@ -103,6 +103,124 @@ int do_xen_hypercall(int xc_handle, priv (unsigned long)hypercall); } +#define EVTCHN_DEV_NAME "/dev/xen/evtchn" +#define EVTCHN_DEV_MAJOR 10 +#define EVTCHN_DEV_MINOR 201 + +int xc_evtchn_open(void) +{ + struct stat st; + int fd; + + /* Make sure any existing device file links to correct device. 
*/ + if ((lstat(EVTCHN_DEV_NAME, &st) != 0) || !S_ISCHR(st.st_mode) || + (st.st_rdev != makedev(EVTCHN_DEV_MAJOR, EVTCHN_DEV_MINOR))) + (void)unlink(EVTCHN_DEV_NAME); + +reopen: + if ( (fd = open(EVTCHN_DEV_NAME, O_RDWR)) == -1 ) + { + if ( (errno == ENOENT) && + ((mkdir("/dev/xen", 0755) == 0) || (errno == EEXIST)) && + (mknod(EVTCHN_DEV_NAME, S_IFCHR|0600, + makedev(EVTCHN_DEV_MAJOR, EVTCHN_DEV_MINOR)) == 0) ) + goto reopen; + + PERROR("Could not open event channel interface"); + return -1; + } + + return fd; +} + +int xc_evtchn_close(int xce_handle) +{ + return close(xce_handle); +} + +int xc_evtchn_fd(int xce_handle) +{ + return xce_handle; +} + +int xc_evtchn_notify(int xce_handle, evtchn_port_t port) +{ + struct ioctl_evtchn_notify notify; + + notify.port = port; + + return ioctl(xce_handle, IOCTL_EVTCHN_NOTIFY, ¬ify); +} + +evtchn_port_t xc_evtchn_bind_interdomain(int xce_handle, int domid, + evtchn_port_t remote_port) +{ + struct ioctl_evtchn_bind_interdomain bind; + + bind.remote_domain = domid; + bind.remote_port = remote_port; + + return ioctl(xce_handle, IOCTL_EVTCHN_BIND_INTERDOMAIN, &bind); +} + +int xc_evtchn_unbind(int xce_handle, evtchn_port_t port) +{ + struct ioctl_evtchn_unbind unbind; + + unbind.port = port; + + return ioctl(xce_handle, IOCTL_EVTCHN_UNBIND, &unbind); +} + +evtchn_port_t xc_evtchn_bind_virq(int xce_handle, unsigned int virq) +{ + struct ioctl_evtchn_bind_virq bind; + + bind.virq = virq; + + return ioctl(xce_handle, IOCTL_EVTCHN_BIND_VIRQ, &bind); +} + +static int dorw(int fd, char *data, size_t size, int do_write) +{ + size_t offset = 0; + ssize_t len; + + while ( offset < size ) + { + if (do_write) + len = write(fd, data + offset, size - offset); + else + len = read(fd, data + offset, size - offset); + + if ( len == -1 ) + { + if ( errno == EINTR ) + continue; + return -1; + } + + offset += len; + } + + return 0; +} + +evtchn_port_t xc_evtchn_pending(int xce_handle) +{ + evtchn_port_t port; + + if ( dorw(xce_handle, (char *)&port, sizeof(port), 0) == -1 ) + return -1; + + return port; +} + +int xc_evtchn_unmask(int xce_handle, evtchn_port_t port) +{ + return dorw(xce_handle, (char *)&port, sizeof(port), 1); +} + /* * Local variables: * mode: C diff -r 7f67c15e2c91 -r fbc0e953732e tools/libxc/xc_linux_restore.c --- a/tools/libxc/xc_linux_restore.c Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/libxc/xc_linux_restore.c Thu Jun 15 10:23:57 2006 -0600 @@ -456,6 +456,15 @@ int xc_linux_restore(int xc_handle, int n+= j; /* crude stats */ } + /* + * Ensure we flush all machphys updates before potential PAE-specific + * reallocations below. + */ + if (xc_finish_mmu_updates(xc_handle, mmu)) { + ERR("Error doing finish_mmu_updates()"); + goto out; + } + DPRINTF("Received all pages (%d races)\n", nraces); if ((pt_levels == 3) && !pae_extended_cr3) { @@ -550,14 +559,11 @@ int xc_linux_restore(int xc_handle, int } } - } - - - if (xc_finish_mmu_updates(xc_handle, mmu)) { - ERR("Error doing finish_mmu_updates()"); - goto out; - } - + if (xc_finish_mmu_updates(xc_handle, mmu)) { + ERR("Error doing finish_mmu_updates()"); + goto out; + } + } /* * Pin page tables. 
Do this after writing to them as otherwise Xen diff -r 7f67c15e2c91 -r fbc0e953732e tools/libxc/xc_load_elf.c --- a/tools/libxc/xc_load_elf.c Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/libxc/xc_load_elf.c Thu Jun 15 10:23:57 2006 -0600 @@ -21,6 +21,24 @@ loadelfsymtab( loadelfsymtab( const char *image, int xch, uint32_t dom, xen_pfn_t *parray, struct domain_setup_info *dsi); + +/* + * Elf header attributes we require for each supported host platform. + * These are checked in parseelfimage(). + */ +#if defined(__ia64__) +#define ELFCLASS ELFCLASS64 +#define ELFDATA ELFDATA2LSB +#define ELFMACHINE EM_IA_64 +#elif defined(__i386__) +#define ELFCLASS ELFCLASS32 +#define ELFDATA ELFDATA2LSB +#define ELFMACHINE EM_386 +#elif defined(__x86_64__) +#define ELFCLASS ELFCLASS64 +#define ELFDATA ELFDATA2LSB +#define ELFMACHINE EM_X86_64 +#endif int probe_elf(const char *image, unsigned long image_size, @@ -61,16 +79,10 @@ static int parseelfimage(const char *ima return -EINVAL; } - if ( -#if defined(__i386__) - (ehdr->e_ident[EI_CLASS] != ELFCLASS32) || - (ehdr->e_machine != EM_386) || -#elif defined(__x86_64__) - (ehdr->e_ident[EI_CLASS] != ELFCLASS64) || - (ehdr->e_machine != EM_X86_64) || -#endif - (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) || - (ehdr->e_type != ET_EXEC) ) + if ( (ehdr->e_ident[EI_CLASS] != ELFCLASS) || + (ehdr->e_machine != ELFMACHINE) || + (ehdr->e_ident[EI_DATA] != ELFDATA) || + (ehdr->e_type != ET_EXEC) ) { ERROR("Kernel not a Xen-compatible Elf image."); return -EINVAL; diff -r 7f67c15e2c91 -r fbc0e953732e tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/libxc/xenctrl.h Thu Jun 15 10:23:57 2006 -0600 @@ -604,4 +604,58 @@ int xc_finish_mmu_updates(int xc_handle, int xc_acm_op(int xc_handle, int cmd, void *arg, size_t arg_size); +/* + * Return a handle to the event channel driver, or -1 on failure, in which case + * errno will be set appropriately. + */ +int xc_evtchn_open(void); + +/* + * Close a handle previously allocated with xc_evtchn_open(). + */ +int xc_evtchn_close(int xce_handle); + +/* + * Return an fd that can be select()ed on for further calls to + * xc_evtchn_pending(). + */ +int xc_evtchn_fd(int xce_handle); + +/* + * Notify the given event channel. Returns -1 on failure, in which case + * errno will be set appropriately. + */ +int xc_evtchn_notify(int xce_handle, evtchn_port_t port); + +/* + * Returns a new event port bound to the remote port for the given domain ID, + * or -1 on failure, in which case errno will be set appropriately. + */ +evtchn_port_t xc_evtchn_bind_interdomain(int xce_handle, int domid, + evtchn_port_t remote_port); + +/* + * Unbind the given event channel. Returns -1 on failure, in which case errno + * will be set appropriately. + */ +int xc_evtchn_unbind(int xce_handle, evtchn_port_t port); + +/* + * Bind an event channel to the given VIRQ. Returns the event channel bound to + * the VIRQ, or -1 on failure, in which case errno will be set appropriately. + */ +evtchn_port_t xc_evtchn_bind_virq(int xce_handle, unsigned int virq); + +/* + * Return the next event channel to become pending, or -1 on failure, in which + * case errno will be set appropriately. + */ +evtchn_port_t xc_evtchn_pending(int xce_handle); + +/* + * Unmask the given event channel. Returns -1 on failure, in which case errno + * will be set appropriately. 
+ */ +int xc_evtchn_unmask(int xce_handle, evtchn_port_t port); + #endif diff -r 7f67c15e2c91 -r fbc0e953732e tools/python/xen/util/security.py --- a/tools/python/xen/util/security.py Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/python/xen/util/security.py Thu Jun 15 10:23:57 2006 -0600 @@ -52,7 +52,8 @@ binary_name_re = re.compile(".*[chwall|s binary_name_re = re.compile(".*[chwall|ste|chwall_ste].*\.bin", re.IGNORECASE) policy_name_re = re.compile(".*[chwall|ste|chwall_ste].*", re.IGNORECASE) - +#other global variables +NULL_SSIDREF = 0 log = logging.getLogger("xend.util.security") @@ -255,6 +256,8 @@ def ssidref2label(ssidref_var): #2. get labelnames for both ssidref parts pri_ssid = ssidref & 0xffff sec_ssid = ssidref >> 16 + pri_null_ssid = NULL_SSIDREF & 0xffff + sec_null_ssid = NULL_SSIDREF >> 16 pri_labels = [] sec_labels = [] labels = [] @@ -270,7 +273,11 @@ def ssidref2label(ssidref_var): f.close() #3. get the label that is in both lists (combination must be a single label) - if secondary == "NULL": + if (primary == "CHWALL") and (pri_ssid == pri_null_ssid) and (sec_ssid != sec_null_ssid): + labels = sec_labels + elif (secondary == "CHWALL") and (pri_ssid != pri_null_ssid) and (sec_ssid == sec_null_ssid): + labels = pri_labels + elif secondary == "NULL": labels = pri_labels else: for i in pri_labels: @@ -285,7 +292,7 @@ def ssidref2label(ssidref_var): -def label2ssidref(labelname, policyname): +def label2ssidref(labelname, policyname, type): """ returns ssidref corresponding to labelname; maps current policy to default directory @@ -293,6 +300,14 @@ def label2ssidref(labelname, policyname) if policyname in ['NULL', 'INACTIVE', 'DEFAULT']: err("Cannot translate labels for \'" + policyname + "\' policy.") + + allowed_types = ['ANY'] + if type == 'dom': + allowed_types.append('VM') + elif type == 'res': + allowed_types.append('RES') + else: + err("Invalid type. Must specify 'dom' or 'res'.") (primary, secondary, f, pol_exists) = getmapfile(policyname) @@ -303,11 +318,15 @@ def label2ssidref(labelname, policyname) l = line.split() if (len(l) < 5) or (l[0] != "LABEL->SSID"): continue - if primary and (l[2] == primary) and (l[3] == labelname): + if primary and (l[1] in allowed_types) and (l[2] == primary) and (l[3] == labelname): pri_ssid.append(int(l[4], 16)) - if secondary and (l[2] == secondary) and (l[3] == labelname): + if secondary and (l[1] in allowed_types) and (l[2] == secondary) and (l[3] == labelname): sec_ssid.append(int(l[4], 16)) f.close() + if (type == 'res') and (primary == "CHWALL") and (len(pri_ssid) == 0): + pri_ssid.append(NULL_SSIDREF) + elif (type == 'res') and (secondary == "CHWALL") and (len(sec_ssid) == 0): + sec_ssid.append(NULL_SSIDREF) #3. sanity check and composition of ssidref if (len(pri_ssid) == 0) or ((len(sec_ssid) == 0) and (secondary != "NULL")): @@ -360,7 +379,7 @@ def refresh_ssidref(config): err("Policy \'" + policyname + "\' in label does not match active policy \'" + active_policy +"\'!") - new_ssidref = label2ssidref(labelname, policyname) + new_ssidref = label2ssidref(labelname, policyname, 'dom') if not new_ssidref: err("SSIDREF refresh failed!") @@ -409,7 +428,7 @@ def get_decision(arg1, arg2): enables domains to retrieve access control decisions from the hypervisor Access Control Module. 
IN: args format = ['domid', id] or ['ssidref', ssidref] - or ['access_control', ['policy', policy], ['label', label]] + or ['access_control', ['policy', policy], ['label', label], ['type', type]] """ if not on(): @@ -417,14 +436,14 @@ def get_decision(arg1, arg2): #translate labels before calling low-level function if arg1[0] == 'access_control': - if (arg1[1][0] != 'policy') or (arg1[2][0] != 'label') : + if (arg1[1][0] != 'policy') or (arg1[2][0] != 'label') or (arg1[3][0] != 'type'): err("Argument type not supported.") - ssidref = label2ssidref(arg1[2][1], arg1[1][1]) + ssidref = label2ssidref(arg1[2][1], arg1[1][1], arg1[3][1]) arg1 = ['ssidref', str(ssidref)] if arg2[0] == 'access_control': - if (arg2[1][0] != 'policy') or (arg2[2][0] != 'label') : + if (arg2[1][0] != 'policy') or (arg2[2][0] != 'label') or (arg2[3][0] != 'type'): err("Argument type not supported.") - ssidref = label2ssidref(arg2[2][1], arg2[1][1]) + ssidref = label2ssidref(arg2[2][1], arg2[1][1], arg2[3][1]) arg2 = ['ssidref', str(ssidref)] # accept only int or string types for domid and ssidref diff -r 7f67c15e2c91 -r fbc0e953732e tools/python/xen/xm/addlabel.py --- a/tools/python/xen/xm/addlabel.py Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/python/xen/xm/addlabel.py Thu Jun 15 10:23:57 2006 -0600 @@ -50,7 +50,7 @@ def main(argv): err("No active policy. Policy must be specified in command line.") #sanity checks: make sure this label can be instantiated later on - ssidref = label2ssidref(label, policyref) + ssidref = label2ssidref(label, policyref, 'dom') new_label = "access_control = ['policy=%s,label=%s']\n" % (policyref, label) if not os.path.isfile(configfile): diff -r 7f67c15e2c91 -r fbc0e953732e tools/python/xen/xm/create.py --- a/tools/python/xen/xm/create.py Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/python/xen/xm/create.py Thu Jun 15 10:23:57 2006 -0600 @@ -541,7 +541,7 @@ def configure_security(config, vals): if sxp.child_value(config, 'ssidref'): err("ERROR: SSIDREF and access_control are mutually exclusive but both specified!") #else calculate ssidre from label - ssidref = security.label2ssidref(label, policy) + ssidref = security.label2ssidref(label, policy, 'dom') if not ssidref : err("ERROR calculating ssidref from access_control.") security_label = ['security', [ config_access_control, ['ssidref' , ssidref ] ] ] diff -r 7f67c15e2c91 -r fbc0e953732e tools/python/xen/xm/main.py --- a/tools/python/xen/xm/main.py Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/python/xen/xm/main.py Thu Jun 15 10:23:57 2006 -0600 @@ -1193,6 +1193,9 @@ def main(argv=sys.argv): else: print >>sys.stderr, "Error: %s" % ex.faultString sys.exit(1) + except (ValueError, OverflowError): + err("Invalid argument.") + usage(argv[1]) except: print "Unexpected error:", sys.exc_info()[0] print diff -r 7f67c15e2c91 -r fbc0e953732e tools/security/Makefile --- a/tools/security/Makefile Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/security/Makefile Thu Jun 15 10:23:57 2006 -0600 @@ -33,7 +33,7 @@ OBJS_XML2BIN := $(patsubst %.c,%.o,$(fil ACM_INST_TOOLS = xensec_tool xensec_xml2bin xensec_gen ACM_OBJS = $(OBJS_TOOL) $(OBJS_XML2BIN) $(OBJS_GETD) -ACM_SCRIPTS = python/xensec_tools/acm_getlabel python/xensec_tools/acm_getdecision +ACM_SCRIPTS = python/xensec_tools/acm_getlabel ACM_CONFIG_DIR = /etc/xen/acm-security ACM_POLICY_DIR = $(ACM_CONFIG_DIR)/policies diff -r 7f67c15e2c91 -r fbc0e953732e tools/security/python/xensec_gen/cgi-bin/policy.cgi --- a/tools/security/python/xensec_gen/cgi-bin/policy.cgi Thu Jun 15 10:02:53 2006 -0600 +++ 
b/tools/security/python/xensec_gen/cgi-bin/policy.cgi Thu Jun 15 10:23:57 2006 -0600 @@ -406,7 +406,7 @@ def parsePolicyXml( ): msg = msg + 'Please validate the Policy file used.' formatXmlError( msg ) - allCSMTypes[csName][1] = csMemberList + allCSMTypes[csName][1] = csMemberList if pOrder != '': formPolicyOrder[1] = pOrder diff -r 7f67c15e2c91 -r fbc0e953732e tools/security/secpol_xml2bin.c --- a/tools/security/secpol_xml2bin.c Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/security/secpol_xml2bin.c Thu Jun 15 10:23:57 2006 -0600 @@ -44,6 +44,8 @@ #define DEBUG 0 +#define NULL_LABEL_NAME "__NULL_LABEL__" + /* primary / secondary policy component setting */ enum policycomponent { CHWALL, STE, NULLPOLICY } primary = NULLPOLICY, secondary = NULLPOLICY; @@ -467,7 +469,7 @@ int init_ssid_queues(void) return -ENOMEM; /* default chwall ssid */ - default_ssid_chwall->name = "DEFAULT"; + default_ssid_chwall->name = NULL_LABEL_NAME; default_ssid_chwall->num = max_chwall_ssids++; default_ssid_chwall->is_ref = 0; default_ssid_chwall->type = ANY; @@ -484,7 +486,7 @@ int init_ssid_queues(void) max_chwall_labels++; /* default ste ssid */ - default_ssid_ste->name = "DEFAULT"; + default_ssid_ste->name = NULL_LABEL_NAME; default_ssid_ste->num = max_ste_ssids++; default_ssid_ste->is_ref = 0; default_ssid_ste->type = ANY; diff -r 7f67c15e2c91 -r fbc0e953732e tools/xenmon/xenbaked.c --- a/tools/xenmon/xenbaked.c Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/xenmon/xenbaked.c Thu Jun 15 10:23:57 2006 -0600 @@ -33,9 +33,6 @@ #include <stdlib.h> #include <stdio.h> #include <sys/mman.h> -#include <sys/stat.h> -#include <sys/types.h> -#include <sys/ioctl.h> #include <fcntl.h> #include <unistd.h> #include <errno.h> @@ -45,7 +42,6 @@ #include <xen/xen.h> #include <string.h> #include <sys/select.h> -#include <xen/linux/evtchn.h> #define PERROR(_m, _a...) \ do { \ @@ -256,51 +252,29 @@ void log_event(int event_id) stat_map[0].event_count++; // other } -#define EVTCHN_DEV_NAME "/dev/xen/evtchn" -#define EVTCHN_DEV_MAJOR 10 -#define EVTCHN_DEV_MINOR 201 - int virq_port; -int eventchn_fd = -1; +int xce_handle = -1; /* Returns the event channel handle. */ /* Stolen from xenstore code */ int eventchn_init(void) { - struct stat st; - struct ioctl_evtchn_bind_virq bind; int rc; // to revert to old way: if (0) return -1; - /* Make sure any existing device file links to correct device. 
*/ - if ((lstat(EVTCHN_DEV_NAME, &st) != 0) || !S_ISCHR(st.st_mode) || - (st.st_rdev != makedev(EVTCHN_DEV_MAJOR, EVTCHN_DEV_MINOR))) - (void)unlink(EVTCHN_DEV_NAME); - - reopen: - eventchn_fd = open(EVTCHN_DEV_NAME, O_NONBLOCK|O_RDWR); - if (eventchn_fd == -1) { - if ((errno == ENOENT) && - ((mkdir("/dev/xen", 0755) == 0) || (errno == EEXIST)) && - (mknod(EVTCHN_DEV_NAME, S_IFCHR|0600, - makedev(EVTCHN_DEV_MAJOR, EVTCHN_DEV_MINOR)) == 0)) - goto reopen; - return -errno; - } - - if (eventchn_fd < 0) + xce_handle = xc_evtchn_open(); + + if (xce_handle < 0) perror("Failed to open evtchn device"); - bind.virq = VIRQ_TBUF; - rc = ioctl(eventchn_fd, IOCTL_EVTCHN_BIND_VIRQ, &bind); - if (rc == -1) + if ((rc = xc_evtchn_bind_virq(xce_handle, VIRQ_TBUF)) == -1) perror("Failed to bind to domain exception virq port"); virq_port = rc; - return eventchn_fd; + return xce_handle; } void wait_for_event(void) @@ -309,27 +283,30 @@ void wait_for_event(void) fd_set inset; evtchn_port_t port; struct timeval tv; + int evtchn_fd; - if (eventchn_fd < 0) { + if (xce_handle < 0) { nanosleep(&opts.poll_sleep, NULL); return; } + evtchn_fd = xc_evtchn_fd(xce_handle); + FD_ZERO(&inset); - FD_SET(eventchn_fd, &inset); + FD_SET(evtchn_fd, &inset); tv.tv_sec = 1; tv.tv_usec = 0; // tv = millis_to_timespec(&opts.poll_sleep); - ret = select(eventchn_fd+1, &inset, NULL, NULL, &tv); + ret = select(evtchn_fd+1, &inset, NULL, NULL, &tv); - if ( (ret == 1) && FD_ISSET(eventchn_fd, &inset)) { - if (read(eventchn_fd, &port, sizeof(port)) != sizeof(port)) + if ( (ret == 1) && FD_ISSET(evtchn_fd, &inset)) { + if ((port = xc_evtchn_pending(xce_handle)) == -1) perror("Failed to read from event fd"); // if (port == virq_port) // printf("got the event I was looking for\r\n"); - - if (write(eventchn_fd, &port, sizeof(port)) != sizeof(port)) + + if (xc_evtchn_unmask(xce_handle, port) == -1) perror("Failed to write to event fd"); } } diff -r 7f67c15e2c91 -r fbc0e953732e tools/xenstat/libxenstat/src/xenstat.c --- a/tools/xenstat/libxenstat/src/xenstat.c Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/xenstat/libxenstat/src/xenstat.c Thu Jun 15 10:23:57 2006 -0600 @@ -223,18 +223,20 @@ xenstat_node *xenstat_get_node(xenstat_h num_domains = 0; do { - xenstat_domain *domain; + xenstat_domain *domain, *tmp; new_domains = xc_domain_getinfolist(handle->xc_handle, num_domains, DOMAIN_CHUNK_SIZE, domaininfo); - node->domains = realloc(node->domains, - (num_domains + new_domains) - * sizeof(xenstat_domain)); - if (node->domains == NULL) { + tmp = realloc(node->domains, + (num_domains + new_domains) + * sizeof(xenstat_domain)); + if (tmp == NULL) { + free(node->domains); free(node); return NULL; } + node->domains = tmp; domain = node->domains + num_domains; @@ -582,11 +584,14 @@ static int xenstat_collect_networks(xens domain->num_networks = 1; domain->networks = malloc(sizeof(xenstat_network)); } else { + struct xenstat_network *tmp; domain->num_networks++; - domain->networks = - realloc(domain->networks, - domain->num_networks * - sizeof(xenstat_network)); + tmp = realloc(domain->networks, + domain->num_networks * + sizeof(xenstat_network)); + if (tmp == NULL) + free(domain->networks); + domain->networks = tmp; } if (domain->networks == NULL) return 0; diff -r 7f67c15e2c91 -r fbc0e953732e tools/xenstore/fake_libxc.c --- a/tools/xenstore/fake_libxc.c Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/xenstore/fake_libxc.c Thu Jun 15 10:23:57 2006 -0600 @@ -37,7 +37,7 @@ static evtchn_port_t port; static evtchn_port_t port; /* The event channel maps to a 
signal, shared page to an mmapped file. */ -void evtchn_notify(int local_port) +void xc_evtchn_notify(int xce_handle, int local_port) { assert(local_port == port); if (kill(xs_test_pid, SIGUSR2) != 0) @@ -124,7 +124,7 @@ void fake_ack_event(void) signal(SIGUSR2, send_to_fd); } -int fake_open_eventchn(void) +int xc_evtchn_open(void) { int fds[2]; diff -r 7f67c15e2c91 -r fbc0e953732e tools/xenstore/xenstored_core.c --- a/tools/xenstore/xenstored_core.c Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/xenstore/xenstored_core.c Thu Jun 15 10:23:57 2006 -0600 @@ -54,7 +54,7 @@ #include "hashtable.h" -extern int eventchn_fd; /* in xenstored_domain.c */ +extern int xce_handle; /* in xenstored_domain.c */ static bool verbose = false; LIST_HEAD(connections); @@ -353,8 +353,11 @@ static int initialize_set(fd_set *inset, set_fd(sock, inset, &max); set_fd(ro_sock, inset, &max); - set_fd(eventchn_fd, inset, &max); set_fd(reopen_log_pipe[0], inset, &max); + + if (xce_handle != -1) + set_fd(xc_evtchn_fd(xce_handle), inset, &max); + list_for_each_entry(i, &connections, list) { if (i->domain) continue; @@ -1769,6 +1772,7 @@ int main(int argc, char *argv[]) bool outputpid = false; bool no_domain_init = false; const char *pidfile = NULL; + int evtchn_fd = -1; while ((opt = getopt_long(argc, argv, "DE:F:HNPS:T:RLVW:", options, NULL)) != -1) { @@ -1907,6 +1911,9 @@ int main(int argc, char *argv[]) signal(SIGUSR1, stop_failtest); #endif + if (xce_handle != -1) + evtchn_fd = xc_evtchn_fd(xce_handle); + /* Get ready to listen to the tools. */ max = initialize_set(&inset, &outset, *sock, *ro_sock); @@ -1934,7 +1941,7 @@ int main(int argc, char *argv[]) if (FD_ISSET(*ro_sock, &inset)) accept_connection(*ro_sock, false); - if (eventchn_fd > 0 && FD_ISSET(eventchn_fd, &inset)) + if (evtchn_fd != -1 && FD_ISSET(evtchn_fd, &inset)) handle_event(); list_for_each_entry(i, &connections, list) { diff -r 7f67c15e2c91 -r fbc0e953732e tools/xenstore/xenstored_domain.c --- a/tools/xenstore/xenstored_domain.c Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/xenstore/xenstored_domain.c Thu Jun 15 10:23:57 2006 -0600 @@ -18,15 +18,10 @@ */ #include <stdio.h> -#include <linux/ioctl.h> -#include <sys/ioctl.h> #include <sys/mman.h> #include <unistd.h> #include <stdlib.h> #include <stdarg.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> //#define DEBUG #include "utils.h" @@ -37,12 +32,11 @@ #include "xenstored_test.h" #include <xenctrl.h> -#include <xen/sys/evtchn.h> static int *xc_handle; static evtchn_port_t virq_port; -int eventchn_fd = -1; +int xce_handle = -1; struct domain { @@ -82,19 +76,6 @@ struct domain }; static LIST_HEAD(domains); - -#ifndef TESTING -static void evtchn_notify(int port) -{ - int rc; - - struct ioctl_evtchn_notify notify; - notify.port = port; - rc = ioctl(eventchn_fd, IOCTL_EVTCHN_NOTIFY, ¬ify); -} -#else -extern void evtchn_notify(int port); -#endif /* FIXME: Mark connection as broken (close it?) when this happens. 
*/ static bool check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod) @@ -146,7 +127,7 @@ static int writechn(struct connection *c mb(); intf->rsp_prod += len; - evtchn_notify(conn->domain->port); + xc_evtchn_notify(xce_handle, conn->domain->port); return len; } @@ -176,7 +157,7 @@ static int readchn(struct connection *co mb(); intf->req_cons += len; - evtchn_notify(conn->domain->port); + xc_evtchn_notify(xce_handle, conn->domain->port); return len; } @@ -184,13 +165,11 @@ static int destroy_domain(void *_domain) static int destroy_domain(void *_domain) { struct domain *domain = _domain; - struct ioctl_evtchn_unbind unbind; list_del(&domain->list); if (domain->port) { - unbind.port = domain->port; - if (ioctl(eventchn_fd, IOCTL_EVTCHN_UNBIND, &unbind) == -1) + if (xc_evtchn_unbind(xce_handle, domain->port) == -1) eprintf("> Unbinding port %i failed!\n", domain->port); } @@ -231,14 +210,14 @@ void handle_event(void) { evtchn_port_t port; - if (read(eventchn_fd, &port, sizeof(port)) != sizeof(port)) + if ((port = xc_evtchn_pending(xce_handle)) == -1) barf_perror("Failed to read from event fd"); if (port == virq_port) domain_cleanup(); #ifndef TESTING - if (write(eventchn_fd, &port, sizeof(port)) != sizeof(port)) + if (xc_evtchn_unmask(xce_handle, port) == -1) barf_perror("Failed to write to event fd"); #endif } @@ -269,7 +248,6 @@ static struct domain *new_domain(void *c int port) { struct domain *domain; - struct ioctl_evtchn_bind_interdomain bind; int rc; @@ -283,9 +261,7 @@ static struct domain *new_domain(void *c talloc_set_destructor(domain, destroy_domain); /* Tell kernel we're interested in this event. */ - bind.remote_domain = domid; - bind.remote_port = port; - rc = ioctl(eventchn_fd, IOCTL_EVTCHN_BIND_INTERDOMAIN, &bind); + rc = xc_evtchn_bind_interdomain(xce_handle, domid, port); if (rc == -1) return NULL; domain->port = rc; @@ -490,23 +466,14 @@ static int dom0_init(void) talloc_steal(dom0->conn, dom0); - evtchn_notify(dom0->port); + xc_evtchn_notify(xce_handle, dom0->port); return 0; } - - - -#define EVTCHN_DEV_NAME "/dev/xen/evtchn" -#define EVTCHN_DEV_MAJOR 10 -#define EVTCHN_DEV_MINOR 201 - /* Returns the event channel handle. */ int domain_init(void) { - struct stat st; - struct ioctl_evtchn_bind_virq bind; int rc; xc_handle = talloc(talloc_autofree_context(), int); @@ -519,39 +486,19 @@ int domain_init(void) talloc_set_destructor(xc_handle, close_xc_handle); -#ifdef TESTING - eventchn_fd = fake_open_eventchn(); - (void)&st; -#else - /* Make sure any existing device file links to correct device. 
*/ - if ((lstat(EVTCHN_DEV_NAME, &st) != 0) || !S_ISCHR(st.st_mode) || - (st.st_rdev != makedev(EVTCHN_DEV_MAJOR, EVTCHN_DEV_MINOR))) - (void)unlink(EVTCHN_DEV_NAME); - - reopen: - eventchn_fd = open(EVTCHN_DEV_NAME, O_NONBLOCK|O_RDWR); - if (eventchn_fd == -1) { - if ((errno == ENOENT) && - ((mkdir("/dev/xen", 0755) == 0) || (errno == EEXIST)) && - (mknod(EVTCHN_DEV_NAME, S_IFCHR|0600, - makedev(EVTCHN_DEV_MAJOR, EVTCHN_DEV_MINOR)) == 0)) - goto reopen; - return -errno; - } -#endif - if (eventchn_fd < 0) + xce_handle = xc_evtchn_open(); + + if (xce_handle < 0) barf_perror("Failed to open evtchn device"); if (dom0_init() != 0) barf_perror("Failed to initialize dom0 state"); - bind.virq = VIRQ_DOM_EXC; - rc = ioctl(eventchn_fd, IOCTL_EVTCHN_BIND_VIRQ, &bind); - if (rc == -1) + if ((rc = xc_evtchn_bind_virq(xce_handle, VIRQ_DOM_EXC)) == -1) barf_perror("Failed to bind to domain exception virq port"); virq_port = rc; - return eventchn_fd; + return xce_handle; } void domain_entry_inc(struct connection *conn) diff -r 7f67c15e2c91 -r fbc0e953732e tools/xm-test/tests/block-integrity/01_block_device_read_verify.py --- a/tools/xm-test/tests/block-integrity/01_block_device_read_verify.py Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/xm-test/tests/block-integrity/01_block_device_read_verify.py Thu Jun 15 10:23:57 2006 -0600 @@ -31,7 +31,7 @@ traceCommand("cat /dev/urandom > /dev/ra s, o = traceCommand("md5sum /dev/ram1") -dom0_md5sum_match = re.search(r"^[\dA-Fa-f]{32}", o) +dom0_md5sum_match = re.search(r"^[\dA-Fa-f]{32}", o, re.M) block_attach(domain, "phy:ram1", "hda1") @@ -40,7 +40,7 @@ except ConsoleError, e: except ConsoleError, e: FAIL(str(e)) -domU_md5sum_match = re.search(r"^[\dA-Fa-f]{32}", run["output"]) +domU_md5sum_match = re.search(r"^[\dA-Fa-f]{32}", run["output"], re.M) domain.closeConsole() diff -r 7f67c15e2c91 -r fbc0e953732e tools/xm-test/tests/block-integrity/02_block_device_write_verify.py --- a/tools/xm-test/tests/block-integrity/02_block_device_write_verify.py Thu Jun 15 10:02:53 2006 -0600 +++ b/tools/xm-test/tests/block-integrity/02_block_device_write_verify.py Thu Jun 15 10:23:57 2006 -0600 @@ -37,7 +37,7 @@ except ConsoleError, e: except ConsoleError, e: FAIL(str(e)) -domU_md5sum_match = re.search(r"^[\dA-Fa-f]{32}", run["output"]) +domU_md5sum_match = re.search(r"^[\dA-Fa-f]{32}", run["output"], re.M) domain.closeConsole() @@ -45,7 +45,7 @@ domain.stop() s, o = traceCommand("md5sum /dev/ram1") -dom0_md5sum_match = re.search(r"^[\dA-Fa-f]{32}", o) +dom0_md5sum_match = re.search(r"^[\dA-Fa-f]{32}", o, re.M) if domU_md5sum_match == None: FAIL("Failed to get md5sum of data written in domU.") diff -r 7f67c15e2c91 -r fbc0e953732e xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Thu Jun 15 10:02:53 2006 -0600 +++ b/xen/arch/x86/traps.c Thu Jun 15 10:23:57 2006 -0600 @@ -1279,7 +1279,7 @@ static void nmi_softirq(void) static void nmi_softirq(void) { /* Only used to defer wakeup of dom0,vcpu0 to a safe (non-NMI) context. 
*/ - evtchn_notify(dom0->vcpu[0]); + vcpu_kick(dom0->vcpu[0]); } static void nmi_dom0_report(unsigned int reason_idx) diff -r 7f67c15e2c91 -r fbc0e953732e xen/common/event_channel.c --- a/xen/common/event_channel.c Thu Jun 15 10:02:53 2006 -0600 +++ b/xen/common/event_channel.c Thu Jun 15 10:23:57 2006 -0600 @@ -493,10 +493,9 @@ void evtchn_set_pending(struct vcpu *v, if ( !test_bit (port, s->evtchn_mask) && !test_and_set_bit(port / BITS_PER_LONG, - &v->vcpu_info->evtchn_pending_sel) && - !test_and_set_bit(0, &v->vcpu_info->evtchn_upcall_pending) ) - { - evtchn_notify(v); + &v->vcpu_info->evtchn_pending_sel) ) + { + vcpu_mark_events_pending(v); } /* Check if some VCPU might be polling for this event. */ @@ -682,10 +681,9 @@ static long evtchn_unmask(evtchn_unmask_ if ( test_and_clear_bit(port, s->evtchn_mask) && test_bit (port, s->evtchn_pending) && !test_and_set_bit (port / BITS_PER_LONG, - &v->vcpu_info->evtchn_pending_sel) && - !test_and_set_bit (0, &v->vcpu_info->evtchn_upcall_pending) ) - { - evtchn_notify(v); + &v->vcpu_info->evtchn_pending_sel) ) + { + vcpu_mark_events_pending(v); } spin_unlock(&d->evtchn_lock); diff -r 7f67c15e2c91 -r fbc0e953732e xen/include/asm-ia64/event.h --- a/xen/include/asm-ia64/event.h Thu Jun 15 10:02:53 2006 -0600 +++ b/xen/include/asm-ia64/event.h Thu Jun 15 10:23:57 2006 -0600 @@ -12,7 +12,7 @@ #include <public/arch-ia64.h> #include <asm/vcpu.h> -static inline void evtchn_notify(struct vcpu *v) +static inline void vcpu_kick(struct vcpu *v) { /* * NB1. 'vcpu_flags' and 'processor' must be checked /after/ update of @@ -30,6 +30,12 @@ static inline void evtchn_notify(struct if(!VMX_DOMAIN(v) && !v->arch.event_callback_ip) vcpu_pend_interrupt(v, v->domain->shared_info->arch.evtchn_vector); +} + +static inline void vcpu_mark_events_pending(struct vcpu *v) +{ + if ( !test_and_set_bit(0, &v->vcpu_info->evtchn_upcall_pending) ) + vcpu_kick(v); } /* Note: Bitwise operations result in fast code with no branches. */ diff -r 7f67c15e2c91 -r fbc0e953732e xen/include/asm-x86/event.h --- a/xen/include/asm-x86/event.h Thu Jun 15 10:02:53 2006 -0600 +++ b/xen/include/asm-x86/event.h Thu Jun 15 10:23:57 2006 -0600 @@ -9,7 +9,7 @@ #ifndef __ASM_EVENT_H__ #define __ASM_EVENT_H__ -static inline void evtchn_notify(struct vcpu *v) +static inline void vcpu_kick(struct vcpu *v) { /* * NB1. 
'vcpu_flags' and 'processor' must be checked /after/ update of @@ -24,6 +24,12 @@ static inline void evtchn_notify(struct vcpu_unblock(v); if ( running ) smp_send_event_check_cpu(v->processor); +} + +static inline void vcpu_mark_events_pending(struct vcpu *v) +{ + if ( !test_and_set_bit(0, &v->vcpu_info->evtchn_upcall_pending) ) + vcpu_kick(v); } static inline int local_events_need_delivery(void) diff -r 7f67c15e2c91 -r fbc0e953732e xen/include/xen/elf.h --- a/xen/include/xen/elf.h Thu Jun 15 10:02:53 2006 -0600 +++ b/xen/include/xen/elf.h Thu Jun 15 10:23:57 2006 -0600 @@ -178,9 +178,9 @@ typedef struct { #define EM_ALPHA 41 /* DEC ALPHA */ #define EM_SPARCV9 43 /* SPARC version 9 */ #define EM_ALPHA_EXP 0x9026 /* DEC ALPHA */ +#define EM_IA_64 50 /* Intel Merced */ #define EM_X86_64 62 /* AMD x86-64 architecture */ #define EM_VAX 75 /* DEC VAX */ -#define EM_NUM 15 /* number of machine types */ /* Version */ #define EV_NONE 0 /* Invalid */ diff -r 7f67c15e2c91 -r fbc0e953732e linux-2.6-xen-sparse/kernel/fork.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/linux-2.6-xen-sparse/kernel/fork.c Thu Jun 15 10:23:57 2006 -0600 @@ -0,0 +1,1619 @@ +/* + * linux/kernel/fork.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * 'fork.c' contains the help-routines for the 'fork' system call + * (see also entry.S and others). + * Fork is rather simple, once you get the hang of it, but the memory + * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' + */ + +#include <linux/config.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/unistd.h> +#include <linux/smp_lock.h> +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/completion.h> +#include <linux/namespace.h> +#include <linux/personality.h> +#include <linux/mempolicy.h> +#include <linux/sem.h> +#include <linux/file.h> +#include <linux/key.h> +#include <linux/binfmts.h> +#include <linux/mman.h> +#include <linux/fs.h> +#include <linux/capability.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/security.h> +#include <linux/swap.h> +#include <linux/syscalls.h> +#include <linux/jiffies.h> +#include <linux/futex.h> +#include <linux/rcupdate.h> +#include <linux/ptrace.h> +#include <linux/mount.h> +#include <linux/audit.h> +#include <linux/profile.h> +#include <linux/rmap.h> +#include <linux/acct.h> +#include <linux/cn_proc.h> + +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/uaccess.h> +#include <asm/mmu_context.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> + +/* + * Protected counters by write_lock_irq(&tasklist_lock) + */ +unsigned long total_forks; /* Handle normal Linux uptimes. */ +int nr_threads; /* The idle threads do not count.. 
*/ + +int max_threads; /* tunable limit on nr_threads */ + +DEFINE_PER_CPU(unsigned long, process_counts) = 0; + + __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ + +EXPORT_SYMBOL(tasklist_lock); + +int nr_processes(void) +{ + int cpu; + int total = 0; + + for_each_online_cpu(cpu) + total += per_cpu(process_counts, cpu); + + return total; +} + +#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR +# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) +# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) +static kmem_cache_t *task_struct_cachep; +#endif + +/* SLAB cache for signal_struct structures (tsk->signal) */ +kmem_cache_t *signal_cachep; + +/* SLAB cache for sighand_struct structures (tsk->sighand) */ +kmem_cache_t *sighand_cachep; + +/* SLAB cache for files_struct structures (tsk->files) */ +kmem_cache_t *files_cachep; + +/* SLAB cache for fs_struct structures (tsk->fs) */ +kmem_cache_t *fs_cachep; + +/* SLAB cache for vm_area_struct structures */ +kmem_cache_t *vm_area_cachep; + +/* SLAB cache for mm_struct structures (tsk->mm) */ +static kmem_cache_t *mm_cachep; + +void free_task(struct task_struct *tsk) +{ + free_thread_info(tsk->thread_info); + free_task_struct(tsk); +} +EXPORT_SYMBOL(free_task); + +void __put_task_struct_cb(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); + WARN_ON(atomic_read(&tsk->usage)); + WARN_ON(tsk == current); + + if (unlikely(tsk->audit_context)) + audit_free(tsk); + security_task_free(tsk); + free_uid(tsk->user); + put_group_info(tsk->group_info); + + if (!profile_handoff_task(tsk)) + free_task(tsk); +} + +void __init fork_init(unsigned long mempages) +{ +#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR +#ifndef ARCH_MIN_TASKALIGN +#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES +#endif + /* create a slab on which task_structs can be allocated */ + task_struct_cachep = + kmem_cache_create("task_struct", sizeof(struct task_struct), + ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL); +#endif + + /* + * The default maximum number of threads is set to a safe + * value: the thread structures can take up at most half + * of memory. 
+ */ + max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE); + + /* + * we need to allow at least 20 threads to boot a system + */ + if(max_threads < 20) + max_threads = 20; + + init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; + init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; + init_task.signal->rlim[RLIMIT_SIGPENDING] = + init_task.signal->rlim[RLIMIT_NPROC]; +} + +static struct task_struct *dup_task_struct(struct task_struct *orig) +{ + struct task_struct *tsk; + struct thread_info *ti; + + prepare_to_copy(orig); + + tsk = alloc_task_struct(); + if (!tsk) + return NULL; + + ti = alloc_thread_info(tsk); + if (!ti) { + free_task_struct(tsk); + return NULL; + } + + *tsk = *orig; + tsk->thread_info = ti; + setup_thread_stack(tsk, orig); + + /* One for us, one for whoever does the "release_task()" (usually parent) */ + atomic_set(&tsk->usage,2); + atomic_set(&tsk->fs_excl, 0); + return tsk; +} + +#ifdef CONFIG_MMU +static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + struct vm_area_struct *mpnt, *tmp, **pprev; + struct rb_node **rb_link, *rb_parent; + int retval; + unsigned long charge; + struct mempolicy *pol; + + down_write(&oldmm->mmap_sem); + flush_cache_mm(oldmm); + down_write(&mm->mmap_sem); + + mm->locked_vm = 0; + mm->mmap = NULL; + mm->mmap_cache = NULL; + mm->free_area_cache = oldmm->mmap_base; + mm->cached_hole_size = ~0UL; + mm->map_count = 0; + cpus_clear(mm->cpu_vm_mask); + mm->mm_rb = RB_ROOT; + rb_link = &mm->mm_rb.rb_node; + rb_parent = NULL; + pprev = &mm->mmap; + + for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { + struct file *file; + + if (mpnt->vm_flags & VM_DONTCOPY) { + long pages = vma_pages(mpnt); + mm->total_vm -= pages; + vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, + -pages); + continue; + } + charge = 0; + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + if (security_vm_enough_memory(len)) + goto fail_nomem; + charge = len; + } + tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!tmp) + goto fail_nomem; + *tmp = *mpnt; + pol = mpol_copy(vma_policy(mpnt)); + retval = PTR_ERR(pol); + if (IS_ERR(pol)) + goto fail_nomem_policy; + vma_set_policy(tmp, pol); + tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_mm = mm; + tmp->vm_next = NULL; + anon_vma_link(tmp); + file = tmp->vm_file; + if (file) { + struct inode *inode = file->f_dentry->d_inode; + get_file(file); + if (tmp->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + + /* insert tmp into the share list, just after mpnt */ + spin_lock(&file->f_mapping->i_mmap_lock); + tmp->vm_truncate_count = mpnt->vm_truncate_count; + flush_dcache_mmap_lock(file->f_mapping); + vma_prio_tree_add(tmp, mpnt); + flush_dcache_mmap_unlock(file->f_mapping); + spin_unlock(&file->f_mapping->i_mmap_lock); + } + + /* + * Link in the new vma and copy the page table entries. 
+ */ + *pprev = tmp; + pprev = &tmp->vm_next; + + __vma_link_rb(mm, tmp, rb_link, rb_parent); + rb_link = &tmp->vm_rb.rb_right; + rb_parent = &tmp->vm_rb; + + mm->map_count++; + retval = copy_page_range(mm, oldmm, mpnt); + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + + if (retval) + goto out; + } +#ifdef arch_dup_mmap + arch_dup_mmap(mm, oldmm); +#endif + retval = 0; +out: + up_write(&mm->mmap_sem); + flush_tlb_mm(oldmm); + up_write(&oldmm->mmap_sem); + return retval; +fail_nomem_policy: + kmem_cache_free(vm_area_cachep, tmp); +fail_nomem: + retval = -ENOMEM; + vm_unacct_memory(charge); + goto out; +} + +static inline int mm_alloc_pgd(struct mm_struct * mm) +{ + mm->pgd = pgd_alloc(mm); + if (unlikely(!mm->pgd)) + return -ENOMEM; + return 0; +} + +static inline void mm_free_pgd(struct mm_struct * mm) +{ + pgd_free(mm->pgd); +} +#else +#define dup_mmap(mm, oldmm) (0) +#define mm_alloc_pgd(mm) (0) +#define mm_free_pgd(mm) +#endif /* CONFIG_MMU */ + + __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); + +#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) +#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) + +#include <linux/init_task.h> + +static struct mm_struct * mm_init(struct mm_struct * mm) +{ + atomic_set(&mm->mm_users, 1); + atomic_set(&mm->mm_count, 1); + init_rwsem(&mm->mmap_sem); + INIT_LIST_HEAD(&mm->mmlist); + mm->core_waiters = 0; + mm->nr_ptes = 0; + set_mm_counter(mm, file_rss, 0); + set_mm_counter(mm, anon_rss, 0); + spin_lock_init(&mm->page_table_lock); + rwlock_init(&mm->ioctx_list_lock); + mm->ioctx_list = NULL; + mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->cached_hole_size = ~0UL; + + if (likely(!mm_alloc_pgd(mm))) { + mm->def_flags = 0; + return mm; + } + free_mm(mm); + return NULL; +} + +/* + * Allocate and initialize an mm_struct. + */ +struct mm_struct * mm_alloc(void) +{ + struct mm_struct * mm; + + mm = allocate_mm(); + if (mm) { + memset(mm, 0, sizeof(*mm)); + mm = mm_init(mm); + } + return mm; +} + +/* + * Called when the last reference to the mm + * is dropped: either by a lazy thread or by + * mmput. Free the page directory and the mm. + */ +void fastcall __mmdrop(struct mm_struct *mm) +{ + BUG_ON(mm == &init_mm); + mm_free_pgd(mm); + destroy_context(mm); + free_mm(mm); +} + +/* + * Decrement the use count and release all resources for an mm. + */ +void mmput(struct mm_struct *mm) +{ + if (atomic_dec_and_test(&mm->mm_users)) { + exit_aio(mm); + exit_mmap(mm); + if (!list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + list_del(&mm->mmlist); + spin_unlock(&mmlist_lock); + } + put_swap_token(mm); + mmdrop(mm); + } +} +EXPORT_SYMBOL_GPL(mmput); + +/** + * get_task_mm - acquire a reference to the task's mm + * + * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning + * this kernel workthread has transiently adopted a user mm with use_mm, + * to do its AIO) is not set and if so returns a reference to it, after + * bumping up the use count. User must release the mm via mmput() + * after use. Typically used by /proc and ptrace. + */ +struct mm_struct *get_task_mm(struct task_struct *task) +{ + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) { + if (task->flags & PF_BORROWED_MM) + mm = NULL; + else + atomic_inc(&mm->mm_users); + } + task_unlock(task); + return mm; +} +EXPORT_SYMBOL_GPL(get_task_mm); + +/* Please note the differences between mmput and mm_release. + * mmput is called whenever we stop holding onto a mm_struct, + * error success whatever. 
+ * + * mm_release is called after a mm_struct has been removed + * from the current process. + * + * This difference is important for error handling, when we + * only half set up a mm_struct for a new process and need to restore + * the old one. Because we mmput the new mm_struct before + * restoring the old one. . . + * Eric Biederman 10 January 1998 + */ +void mm_release(struct task_struct *tsk, struct mm_struct *mm) +{ + struct completion *vfork_done = tsk->vfork_done; + + /* Get rid of any cached register state */ + deactivate_mm(tsk, mm); + + /* notify parent sleeping on vfork() */ + if (vfork_done) { + tsk->vfork_done = NULL; + complete(vfork_done); + } + if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) { + u32 __user * tidptr = tsk->clear_child_tid; + tsk->clear_child_tid = NULL; + + /* + * We don't check the error code - if userspace has + * not set up a proper pointer then tough luck. + */ + put_user(0, tidptr); + sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0); + } +} + +/* + * Allocate a new mm structure and copy contents from the + * mm structure of the passed in task structure. + */ +static struct mm_struct *dup_mm(struct task_struct *tsk) +{ + struct mm_struct *mm, *oldmm = current->mm; + int err; + + if (!oldmm) + return NULL; + + mm = allocate_mm(); + if (!mm) + goto fail_nomem; + + memcpy(mm, oldmm, sizeof(*mm)); + + if (!mm_init(mm)) + goto fail_nomem; + + if (init_new_context(tsk, mm)) + goto fail_nocontext; + + err = dup_mmap(mm, oldmm); + if (err) + goto free_pt; + + mm->hiwater_rss = get_mm_rss(mm); + mm->hiwater_vm = mm->total_vm; + + return mm; + +free_pt: + mmput(mm); + +fail_nomem: + return NULL; + +fail_nocontext: + /* + * If init_new_context() failed, we cannot use mmput() to free the mm + * because it calls destroy_context() + */ + mm_free_pgd(mm); + free_mm(mm); + return NULL; +} + +static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) +{ + struct mm_struct * mm, *oldmm; + int retval; + + tsk->min_flt = tsk->maj_flt = 0; + tsk->nvcsw = tsk->nivcsw = 0; + + tsk->mm = NULL; + tsk->active_mm = NULL; + + /* + * Are we cloning a kernel thread? + * + * We need to steal a active VM for that.. 
+ */ + oldmm = current->mm; + if (!oldmm) + return 0; + + if (clone_flags & CLONE_VM) { + atomic_inc(&oldmm->mm_users); + mm = oldmm; + goto good_mm; + } + + retval = -ENOMEM; + mm = dup_mm(tsk); + if (!mm) + goto fail_nomem; + +good_mm: + tsk->mm = mm; + tsk->active_mm = mm; + return 0; + +fail_nomem: + return retval; +} + +static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old) +{ + struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); + /* We don't need to lock fs - think why ;-) */ + if (fs) { + atomic_set(&fs->count, 1); + rwlock_init(&fs->lock); + fs->umask = old->umask; + read_lock(&old->lock); + fs->rootmnt = mntget(old->rootmnt); + fs->root = dget(old->root); + fs->pwdmnt = mntget(old->pwdmnt); + fs->pwd = dget(old->pwd); + if (old->altroot) { + fs->altrootmnt = mntget(old->altrootmnt); + fs->altroot = dget(old->altroot); + } else { + fs->altrootmnt = NULL; + fs->altroot = NULL; + } + read_unlock(&old->lock); + } + return fs; +} + +struct fs_struct *copy_fs_struct(struct fs_struct *old) +{ + return __copy_fs_struct(old); +} + +EXPORT_SYMBOL_GPL(copy_fs_struct); + +static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) +{ + if (clone_flags & CLONE_FS) { + atomic_inc(¤t->fs->count); + return 0; + } + tsk->fs = __copy_fs_struct(current->fs); + if (!tsk->fs) + return -ENOMEM; + return 0; +} + +static int count_open_files(struct fdtable *fdt) +{ + int size = fdt->max_fdset; + int i; + + /* Find the last open fd */ + for (i = size/(8*sizeof(long)); i > 0; ) { + if (fdt->open_fds->fds_bits[--i]) + break; + } + i = (i+1) * 8 * sizeof(long); + return i; +} + +static struct files_struct *alloc_files(void) +{ + struct files_struct *newf; + struct fdtable *fdt; + + newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); + if (!newf) + goto out; + + atomic_set(&newf->count, 1); + + spin_lock_init(&newf->file_lock); + fdt = &newf->fdtab; + fdt->next_fd = 0; + fdt->max_fds = NR_OPEN_DEFAULT; + fdt->max_fdset = __FD_SETSIZE; + fdt->close_on_exec = &newf->close_on_exec_init; + fdt->open_fds = &newf->open_fds_init; + fdt->fd = &newf->fd_array[0]; + INIT_RCU_HEAD(&fdt->rcu); + fdt->free_files = NULL; + fdt->next = NULL; + rcu_assign_pointer(newf->fdt, fdt); +out: + return newf; +} + +/* + * Allocate a new files structure and copy contents from the + * passed in files structure. + */ +static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) +{ + struct files_struct *newf; + struct file **old_fds, **new_fds; + int open_files, size, i, expand; + struct fdtable *old_fdt, *new_fdt; + + newf = alloc_files(); + if (!newf) + goto out; + + spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); + new_fdt = files_fdtable(newf); + size = old_fdt->max_fdset; + open_files = count_open_files(old_fdt); + expand = 0; + + /* + * Check whether we need to allocate a larger fd array or fd set. + * Note: we're not a clone task, so the open count won't change. + */ + if (open_files > new_fdt->max_fdset) { + new_fdt->max_fdset = 0; + expand = 1; + } + if (open_files > new_fdt->max_fds) { + new_fdt->max_fds = 0; + expand = 1; + } + + /* if the old fdset gets grown now, we'll only copy up to "size" fds */ + if (expand) { + spin_unlock(&oldf->file_lock); + spin_lock(&newf->file_lock); + *errorp = expand_files(newf, open_files-1); + spin_unlock(&newf->file_lock); + if (*errorp < 0) + goto out_release; + new_fdt = files_fdtable(newf); + /* + * Reacquire the oldf lock and a pointer to its fd table + * who knows it may have a new bigger fd table. 
We need + * the latest pointer. + */ + spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); + } + + old_fds = old_fdt->fd; + new_fds = new_fdt->fd; + + memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); + memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); + + for (i = open_files; i != 0; i--) { + struct file *f = *old_fds++; + if (f) { + get_file(f); + } else { + /* + * The fd may be claimed in the fd bitmap but not yet + * instantiated in the files array if a sibling thread + * is partway through open(). So make sure that this + * fd is available to the new process. + */ + FD_CLR(open_files - i, new_fdt->open_fds); + } + rcu_assign_pointer(*new_fds++, f); + } + spin_unlock(&oldf->file_lock); + + /* compute the remainder to be cleared */ + size = (new_fdt->max_fds - open_files) * sizeof(struct file *); + + /* This is long word aligned thus could use a optimized version */ + memset(new_fds, 0, size); + + if (new_fdt->max_fdset > open_files) { + int left = (new_fdt->max_fdset-open_files)/8; + int start = open_files / (8 * sizeof(unsigned long)); + + memset(&new_fdt->open_fds->fds_bits[start], 0, left); + memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); + } + +out: + return newf; + +out_release: + free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); + free_fdset (new_fdt->open_fds, new_fdt->max_fdset); + free_fd_array(new_fdt->fd, new_fdt->max_fds); + kmem_cache_free(files_cachep, newf); + return NULL; +} + +static int copy_files(unsigned long clone_flags, struct task_struct * tsk) +{ + struct files_struct *oldf, *newf; + int error = 0; + + /* + * A background process may not have any files ... + */ + oldf = current->files; + if (!oldf) + goto out; + + if (clone_flags & CLONE_FILES) { + atomic_inc(&oldf->count); + goto out; + } + + /* + * Note: we may be using current for both targets (See exec.c) + * This works because we cache current->files (old) as oldf. Don't + * break this. + */ + tsk->files = NULL; + error = -ENOMEM; + newf = dup_fd(oldf, &error); + if (!newf) + goto out; + + tsk->files = newf; + error = 0; +out: + return error; +} + +/* + * Helper to unshare the files of the current task. + * We don't want to expose copy_files internals to + * the exec layer of the kernel. 
+ */ + +int unshare_files(void) +{ + struct files_struct *files = current->files; + int rc; + + if(!files) + BUG(); + + /* This can race but the race causes us to copy when we don't + need to and drop the copy */ + if(atomic_read(&files->count) == 1) + { + atomic_inc(&files->count); + return 0; + } + rc = copy_files(0, current); + if(rc) + current->files = files; + return rc; +} + +EXPORT_SYMBOL(unshare_files); + +void sighand_free_cb(struct rcu_head *rhp) +{ + struct sighand_struct *sp; + + sp = container_of(rhp, struct sighand_struct, rcu); + kmem_cache_free(sighand_cachep, sp); +} + +static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) +{ + struct sighand_struct *sig; + + if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { + atomic_inc(¤t->sighand->count); + return 0; + } + sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); + rcu_assign_pointer(tsk->sighand, sig); + if (!sig) + return -ENOMEM; + spin_lock_init(&sig->siglock); + atomic_set(&sig->count, 1); + memcpy(sig->action, current->sighand->action, sizeof(sig->action)); + return 0; +} + +static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) +{ + struct signal_struct *sig; + int ret; + + if (clone_flags & CLONE_THREAD) { + atomic_inc(¤t->signal->count); + atomic_inc(¤t->signal->live); + return 0; + } + sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); + tsk->signal = sig; + if (!sig) + return -ENOMEM; + + ret = copy_thread_group_keys(tsk); + if (ret < 0) { + kmem_cache_free(signal_cachep, sig); + return ret; + } + + atomic_set(&sig->count, 1); + atomic_set(&sig->live, 1); + init_waitqueue_head(&sig->wait_chldexit); + sig->flags = 0; + sig->group_exit_code = 0; + sig->group_exit_task = NULL; + sig->group_stop_count = 0; + sig->curr_target = NULL; + init_sigpending(&sig->shared_pending); + INIT_LIST_HEAD(&sig->posix_timers); + + hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); + sig->it_real_incr.tv64 = 0; + sig->real_timer.function = it_real_fn; + sig->real_timer.data = tsk; + + sig->it_virt_expires = cputime_zero; + sig->it_virt_incr = cputime_zero; + sig->it_prof_expires = cputime_zero; + sig->it_prof_incr = cputime_zero; + + sig->leader = 0; /* session leadership doesn't inherit */ + sig->tty_old_pgrp = 0; + + sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; + sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; + sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; + sig->sched_time = 0; + INIT_LIST_HEAD(&sig->cpu_timers[0]); + INIT_LIST_HEAD(&sig->cpu_timers[1]); + INIT_LIST_HEAD(&sig->cpu_timers[2]); + + task_lock(current->group_leader); + memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); + task_unlock(current->group_leader); + + if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { + /* + * New sole thread in the process gets an expiry time + * of the whole CPU time limit. + */ + tsk->it_prof_expires = + secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); + } + + return 0; +} + +static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) +{ + unsigned long new_flags = p->flags; + + new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE); + new_flags |= PF_FORKNOEXEC; + if (!(clone_flags & CLONE_PTRACE)) + p->ptrace = 0; + p->flags = new_flags; +} + +asmlinkage long sys_set_tid_address(int __user *tidptr) +{ + current->clear_child_tid = tidptr; + + return current->pid; +} + +/* + * This creates a new process as a copy of the old one, + * but does not actually start it yet. 
+ * + * It copies the registers, and all the appropriate + * parts of the process environment (as per the clone + * flags). The actual kick-off is left to the caller. + */ +static task_t *copy_process(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr, + int pid) +{ + int retval; + struct task_struct *p = NULL; + + if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) + return ERR_PTR(-EINVAL); + + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. + */ + if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) + return ERR_PTR(-EINVAL); + + /* + * Shared signal handlers imply shared VM. By way of the above, + * thread groups also imply shared VM. Blocking this case allows + * for various simplifications in other code. + */ + if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) + return ERR_PTR(-EINVAL); + + retval = security_task_create(clone_flags); + if (retval) + goto fork_out; + + retval = -ENOMEM; + p = dup_task_struct(current); + if (!p) + goto fork_out; + + retval = -EAGAIN; + if (atomic_read(&p->user->processes) >= + p->signal->rlim[RLIMIT_NPROC].rlim_cur) { + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && + p->user != &root_user) + goto bad_fork_free; + } + + atomic_inc(&p->user->__count); + atomic_inc(&p->user->processes); + get_group_info(p->group_info); + + /* + * If multiple threads are within copy_process(), then this check + * triggers too late. This doesn't hurt, the check is only there + * to stop root fork bombs. + */ + if (nr_threads >= max_threads) + goto bad_fork_cleanup_count; + + if (!try_module_get(task_thread_info(p)->exec_domain->module)) + goto bad_fork_cleanup_count; + + if (p->binfmt && !try_module_get(p->binfmt->module)) + goto bad_fork_cleanup_put_domain; + + p->did_exec = 0; + copy_flags(clone_flags, p); + p->pid = pid; + retval = -EFAULT; + if (clone_flags & CLONE_PARENT_SETTID) + if (put_user(p->pid, parent_tidptr)) + goto bad_fork_cleanup; + + p->proc_dentry = NULL; + + INIT_LIST_HEAD(&p->children); + INIT_LIST_HEAD(&p->sibling); + p->vfork_done = NULL; + spin_lock_init(&p->alloc_lock); + spin_lock_init(&p->proc_lock); + + clear_tsk_thread_flag(p, TIF_SIGPENDING); + init_sigpending(&p->pending); + + p->utime = cputime_zero; + p->stime = cputime_zero; + p->sched_time = 0; + p->rchar = 0; /* I/O counter: bytes read */ + p->wchar = 0; /* I/O counter: bytes written */ + p->syscr = 0; /* I/O counter: read syscalls */ + p->syscw = 0; /* I/O counter: write syscalls */ + acct_clear_integrals(p); + + p->it_virt_expires = cputime_zero; + p->it_prof_expires = cputime_zero; + p->it_sched_expires = 0; + INIT_LIST_HEAD(&p->cpu_timers[0]); + INIT_LIST_HEAD(&p->cpu_timers[1]); + INIT_LIST_HEAD(&p->cpu_timers[2]); + + p->lock_depth = -1; /* -1 = no lock */ + do_posix_clock_monotonic_gettime(&p->start_time); + p->security = NULL; + p->io_context = NULL; + p->io_wait = NULL; + p->audit_context = NULL; + cpuset_fork(p); +#ifdef CONFIG_NUMA + p->mempolicy = mpol_copy(p->mempolicy); + if (IS_ERR(p->mempolicy)) { + retval = PTR_ERR(p->mempolicy); + p->mempolicy = NULL; + goto bad_fork_cleanup_cpuset; + } +#endif + +#ifdef CONFIG_DEBUG_MUTEXES + p->blocked_on = NULL; /* not blocked yet */ +#endif + + p->tgid = p->pid; + if (clone_flags & CLONE_THREAD) + p->tgid = current->tgid; + + if ((retval = security_task_alloc(p))) + goto 
bad_fork_cleanup_policy; + if ((retval = audit_alloc(p))) + goto bad_fork_cleanup_security; + /* copy all the process information */ + if ((retval = copy_semundo(clone_flags, p))) + goto bad_fork_cleanup_audit; + if ((retval = copy_files(clone_flags, p))) + goto bad_fork_cleanup_semundo; + if ((retval = copy_fs(clone_flags, p))) + goto bad_fork_cleanup_files; + if ((retval = copy_sighand(clone_flags, p))) + goto bad_fork_cleanup_fs; + if ((retval = copy_signal(clone_flags, p))) + goto bad_fork_cleanup_sighand; + if ((retval = copy_mm(clone_flags, p))) + goto bad_fork_cleanup_signal; + if ((retval = copy_keys(clone_flags, p))) + goto bad_fork_cleanup_mm; + if ((retval = copy_namespace(clone_flags, p))) + goto bad_fork_cleanup_keys; + retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); + if (retval) + goto bad_fork_cleanup_namespace; + + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; + /* + * Clear TID on mm_release()? + */ + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; + + /* + * sigaltstack should be cleared when sharing the same VM + */ + if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) + p->sas_ss_sp = p->sas_ss_size = 0; + + /* + * Syscall tracing should be turned off in the child regardless + * of CLONE_PTRACE. + */ + clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); +#ifdef TIF_SYSCALL_EMU + clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); +#endif + + /* Our parent execution domain becomes current domain + These must match for thread signalling to apply */ + + p->parent_exec_id = p->self_exec_id; + + /* ok, now we should be set up.. */ + p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); + p->pdeath_signal = 0; + p->exit_state = 0; + + /* + * Ok, make it visible to the rest of the system. + * We dont wake it up yet. + */ + p->group_leader = p; + INIT_LIST_HEAD(&p->ptrace_children); + INIT_LIST_HEAD(&p->ptrace_list); + + /* Perform scheduler related setup. Assign this task to a CPU. */ + sched_fork(p, clone_flags); + + /* Need tasklist lock for parent etc handling! */ + write_lock_irq(&tasklist_lock); + + /* + * The task hasn't been attached yet, so its cpus_allowed mask will + * not be changed, nor will its assigned CPU. + * + * The cpus_allowed mask of the parent may have changed after it was + * copied first time - so re-copy it here, then check the child's CPU + * to ensure it is on a valid CPU (and if not, just force it back to + * parent's CPU). This avoids alot of nasty races. + */ + p->cpus_allowed = current->cpus_allowed; + if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || + !cpu_online(task_cpu(p)))) + set_task_cpu(p, smp_processor_id()); + + /* + * Check for pending SIGKILL! The new thread should not be allowed + * to slip out of an OOM kill. (or normal SIGKILL.) + */ + if (sigismember(¤t->pending.signal, SIGKILL)) { + write_unlock_irq(&tasklist_lock); + retval = -EINTR; + goto bad_fork_cleanup_namespace; + } + + /* CLONE_PARENT re-uses the old parent */ + if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) + p->real_parent = current->real_parent; + else + p->real_parent = current; + p->parent = p->real_parent; + + spin_lock(¤t->sighand->siglock); + if (clone_flags & CLONE_THREAD) { + /* + * Important: if an exit-all has been started then + * do not create this new thread - the whole thread + * group is supposed to exit anyway. 
+ */ + if (current->signal->flags & SIGNAL_GROUP_EXIT) { + spin_unlock(¤t->sighand->siglock); + write_unlock_irq(&tasklist_lock); + retval = -EAGAIN; + goto bad_fork_cleanup_namespace; + } + p->group_leader = current->group_leader; + + if (current->signal->group_stop_count > 0) { + /* + * There is an all-stop in progress for the group. + * We ourselves will stop as soon as we check signals. + * Make the new thread part of that group stop too. + */ + current->signal->group_stop_count++; + set_tsk_thread_flag(p, TIF_SIGPENDING); + } + + if (!cputime_eq(current->signal->it_virt_expires, + cputime_zero) || + !cputime_eq(current->signal->it_prof_expires, + cputime_zero) || + current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY || + !list_empty(¤t->signal->cpu_timers[0]) || + !list_empty(¤t->signal->cpu_timers[1]) || + !list_empty(¤t->signal->cpu_timers[2])) { + /* + * Have child wake up on its first tick to check + * for process CPU timers. + */ + p->it_prof_expires = jiffies_to_cputime(1); + } + } + + /* + * inherit ioprio + */ + p->ioprio = current->ioprio; + + SET_LINKS(p); + if (unlikely(p->ptrace & PT_PTRACED)) + __ptrace_link(p, current->parent); + + if (thread_group_leader(p)) { + p->signal->tty = current->signal->tty; + p->signal->pgrp = process_group(current); + p->signal->session = current->signal->session; + attach_pid(p, PIDTYPE_PGID, process_group(p)); + attach_pid(p, PIDTYPE_SID, p->signal->session); + if (p->pid) + __get_cpu_var(process_counts)++; + } + attach_pid(p, PIDTYPE_TGID, p->tgid); + attach_pid(p, PIDTYPE_PID, p->pid); + + nr_threads++; + total_forks++; + spin_unlock(¤t->sighand->siglock); + write_unlock_irq(&tasklist_lock); + proc_fork_connector(p); + return p; + +bad_fork_cleanup_namespace: + exit_namespace(p); +bad_fork_cleanup_keys: + exit_keys(p); +bad_fork_cleanup_mm: + if (p->mm) + mmput(p->mm); +bad_fork_cleanup_signal: + exit_signal(p); +bad_fork_cleanup_sighand: + exit_sighand(p); +bad_fork_cleanup_fs: + exit_fs(p); /* blocking */ +bad_fork_cleanup_files: + exit_files(p); /* blocking */ +bad_fork_cleanup_semundo: + exit_sem(p); +bad_fork_cleanup_audit: + audit_free(p); +bad_fork_cleanup_security: + security_task_free(p); +bad_fork_cleanup_policy: +#ifdef CONFIG_NUMA + mpol_free(p->mempolicy); +bad_fork_cleanup_cpuset: +#endif + cpuset_exit(p); +bad_fork_cleanup: + if (p->binfmt) + module_put(p->binfmt->module); +bad_fork_cleanup_put_domain: + module_put(task_thread_info(p)->exec_domain->module); +bad_fork_cleanup_count: + put_group_info(p->group_info); + atomic_dec(&p->user->processes); + free_uid(p->user); +bad_fork_free: + free_task(p); +fork_out: + return ERR_PTR(retval); +} + +struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + return regs; +} + +task_t * __devinit fork_idle(int cpu) +{ + task_t *task; + struct pt_regs regs; + + task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); + if (!task) + return ERR_PTR(-ENOMEM); + init_idle(task, cpu); + unhash_process(task); + return task; +} + +static inline int fork_traceflag (unsigned clone_flags) +{ + if (clone_flags & CLONE_UNTRACED) + return 0; + else if (clone_flags & CLONE_VFORK) { + if (current->ptrace & PT_TRACE_VFORK) + return PTRACE_EVENT_VFORK; + } else if ((clone_flags & CSIGNAL) != SIGCHLD) { + if (current->ptrace & PT_TRACE_CLONE) + return PTRACE_EVENT_CLONE; + } else if (current->ptrace & PT_TRACE_FORK) + return PTRACE_EVENT_FORK; + + return 0; +} + +/* + * Ok, this is the main fork-routine. 
+ * + * It copies the process, and if successful kick-starts + * it and waits for it to finish using the VM if required. + */ +long do_fork(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + struct task_struct *p; + int trace = 0; + long pid = alloc_pidmap(); + + if (pid < 0) + return -EAGAIN; + if (unlikely(current->ptrace)) { + trace = fork_traceflag (clone_flags); + if (trace) + clone_flags |= CLONE_PTRACE; + } + + p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); + /* + * Do this prior waking up the new thread - the thread pointer + * might get invalid after that point, if the thread exits quickly. + */ + if (!IS_ERR(p)) { + struct completion vfork; + + if (clone_flags & CLONE_VFORK) { + p->vfork_done = &vfork; + init_completion(&vfork); + } + + if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) { + /* + * We'll start up with an immediate SIGSTOP. + */ + sigaddset(&p->pending.signal, SIGSTOP); + set_tsk_thread_flag(p, TIF_SIGPENDING); + } + + if (!(clone_flags & CLONE_STOPPED)) + wake_up_new_task(p, clone_flags); + else + p->state = TASK_STOPPED; + + if (unlikely (trace)) { + current->ptrace_message = pid; + ptrace_notify ((trace << 8) | SIGTRAP); + } + + if (clone_flags & CLONE_VFORK) { + wait_for_completion(&vfork); + if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) + ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); + } + } else { + free_pidmap(pid); + pid = PTR_ERR(p); + } + return pid; +} + +#ifndef ARCH_MIN_MMSTRUCT_ALIGN +#define ARCH_MIN_MMSTRUCT_ALIGN 0 +#endif + +void __init proc_caches_init(void) +{ + sighand_cachep = kmem_cache_create("sighand_cache", + sizeof(struct sighand_struct), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + signal_cachep = kmem_cache_create("signal_cache", + sizeof(struct signal_struct), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + files_cachep = kmem_cache_create("files_cache", + sizeof(struct files_struct), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + fs_cachep = kmem_cache_create("fs_cache", + sizeof(struct fs_struct), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), 0, + SLAB_PANIC, NULL, NULL); + mm_cachep = kmem_cache_create("mm_struct", + sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); +} + + +/* + * Check constraints on flags passed to the unshare system call and + * force unsharing of additional process context as appropriate. + */ +static inline void check_unshare_flags(unsigned long *flags_ptr) +{ + /* + * If unsharing a thread from a thread group, must also + * unshare vm. + */ + if (*flags_ptr & CLONE_THREAD) + *flags_ptr |= CLONE_VM; + + /* + * If unsharing vm, must also unshare signal handlers. + */ + if (*flags_ptr & CLONE_VM) + *flags_ptr |= CLONE_SIGHAND; + + /* + * If unsharing signal handlers and the task was created + * using CLONE_THREAD, then must unshare the thread + */ + if ((*flags_ptr & CLONE_SIGHAND) && + (atomic_read(¤t->signal->count) > 1)) + *flags_ptr |= CLONE_THREAD; + + /* + * If unsharing namespace, must also unshare filesystem information. 
+ */ + if (*flags_ptr & CLONE_NEWNS) + *flags_ptr |= CLONE_FS; +} + +/* + * Unsharing of tasks created with CLONE_THREAD is not supported yet + */ +static int unshare_thread(unsigned long unshare_flags) +{ + if (unshare_flags & CLONE_THREAD) + return -EINVAL; + + return 0; +} + +/* + * Unshare the filesystem structure if it is being shared + */ +static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) +{ + struct fs_struct *fs = current->fs; + + if ((unshare_flags & CLONE_FS) && + (fs && atomic_read(&fs->count) > 1)) { + *new_fsp = __copy_fs_struct(current->fs); + if (!*new_fsp) + return -ENOMEM; + } + + return 0; +} + +/* + * Unshare the namespace structure if it is being shared + */ +static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) +{ + struct namespace *ns = current->namespace; + + if ((unshare_flags & CLONE_NEWNS) && + (ns && atomic_read(&ns->count) > 1)) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs); + if (!*new_nsp) + return -ENOMEM; + } + + return 0; +} + +/* + * Unsharing of sighand for tasks created with CLONE_SIGHAND is not + * supported yet + */ +static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) +{ + struct sighand_struct *sigh = current->sighand; + + if ((unshare_flags & CLONE_SIGHAND) && + (sigh && atomic_read(&sigh->count) > 1)) + return -EINVAL; + else + return 0; +} + +/* + * Unshare vm if it is being shared + */ +static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) +{ + struct mm_struct *mm = current->mm; + + if ((unshare_flags & CLONE_VM) && + (mm && atomic_read(&mm->mm_users) > 1)) { + return -EINVAL; + } + + return 0; +} + +/* + * Unshare file descriptor table if it is being shared + */ +static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) +{ + struct files_struct *fd = current->files; + int error = 0; + + if ((unshare_flags & CLONE_FILES) && + (fd && atomic_read(&fd->count) > 1)) { + *new_fdp = dup_fd(fd, &error); + if (!*new_fdp) + return error; + } + + return 0; +} + +/* + * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not + * supported yet + */ +static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp) +{ + if (unshare_flags & CLONE_SYSVSEM) + return -EINVAL; + + return 0; +} + +/* + * unshare allows a process to 'unshare' part of the process + * context which was originally shared using clone. copy_* + * functions used by do_fork() cannot be used here directly + * because they modify an inactive task_struct that is being + * constructed. Here we are modifying the current, active, + * task_struct. 
+ */ +asmlinkage long sys_unshare(unsigned long unshare_flags) +{ + int err = 0; + struct fs_struct *fs, *new_fs = NULL; + struct namespace *ns, *new_ns = NULL; + struct sighand_struct *sigh, *new_sigh = NULL; + struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; + struct files_struct *fd, *new_fd = NULL; + struct sem_undo_list *new_ulist = NULL; + + check_unshare_flags(&unshare_flags); + + if ((err = unshare_thread(unshare_flags))) + goto bad_unshare_out; + if ((err = unshare_fs(unshare_flags, &new_fs))) + goto bad_unshare_cleanup_thread; + if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) + goto bad_unshare_cleanup_fs; + if ((err = unshare_sighand(unshare_flags, &new_sigh))) + goto bad_unshare_cleanup_ns; + if ((err = unshare_vm(unshare_flags, &new_mm))) + goto bad_unshare_cleanup_sigh; + if ((err = unshare_fd(unshare_flags, &new_fd))) + goto bad_unshare_cleanup_vm; + if ((err = unshare_semundo(unshare_flags, &new_ulist))) + goto bad_unshare_cleanup_fd; + + if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) { + + task_lock(current); + + if (new_fs) { + fs = current->fs; + current->fs = new_fs; + new_fs = fs; + } + + if (new_ns) { + ns = current->namespace; + current->namespace = new_ns; + new_ns = ns; + } + + if (new_sigh) { + sigh = current->sighand; + rcu_assign_pointer(current->sighand, new_sigh); + new_sigh = sigh; + } + + if (new_mm) { + mm = current->mm; + active_mm = current->active_mm; + current->mm = new_mm; + current->active_mm = new_mm; + activate_mm(active_mm, new_mm); + new_mm = mm; + } + + if (new_fd) { + fd = current->files; + current->files = new_fd; + new_fd = fd; + } + + task_unlock(current); + } + +bad_unshare_cleanup_fd: + if (new_fd) + put_files_struct(new_fd); + +bad_unshare_cleanup_vm: + if (new_mm) + mmput(new_mm); + +bad_unshare_cleanup_sigh: + if (new_sigh) + if (atomic_dec_and_test(&new_sigh->count)) + kmem_cache_free(sighand_cachep, new_sigh); + +bad_unshare_cleanup_ns: + if (new_ns) + put_namespace(new_ns); + +bad_unshare_cleanup_fs: + if (new_fs) + put_fs_struct(new_fs); + +bad_unshare_cleanup_thread: +bad_unshare_out: + return err; +} diff -r 7f67c15e2c91 -r fbc0e953732e tools/security/python/xensec_tools/acm_getdecision --- a/tools/security/python/xensec_tools/acm_getdecision Thu Jun 15 10:02:53 2006 -0600 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ -#!/usr/bin/env python -# -*- mode: python; -*- -import sys -import traceback -import getopt - -# add fallback path for non-native python path installs if needed -sys.path.insert(-1, '/usr/lib/python') -sys.path.insert(-1, '/usr/lib64/python') - -from xen.util.security import ACMError, err, get_decision, active_policy - -def usage(): - print "Usage: acm_getdecision -i domainid --label labelname" - print " Test program illustrating the retrieval of" - print " access control decisions from Xen. At this time," - print " only sharing (STE) policy decisions are supported." 
- print " Arguments are two paramters in any combination:" - print "\t -i domain_id or --domid domain_id" - print "\t -l labelname or --label labelname" - print " Return value:" - print "\t PERMITTED if access is permitted" - print "\t DENIED if access is denied" - print "\t ACMError -- e.g., unknown label or domain id" - err("Usage") - -try: - - if len(sys.argv) != 5: - usage() - - decision_args = [] - - for idx in range(1, len(sys.argv), 2): - if sys.argv[idx] in ['-i', '--domid']: - decision_args.append(['domid', sys.argv[idx+1]]) - elif sys.argv[idx] in ['-l', '--label']: - decision_args.append(['access_control', - ['policy', active_policy], - ['label', sys.argv[idx+1]] - ]) - else: - print "unknown argument %s" % sys.argv[idx] - usage() - - if len(decision_args) != 2: - print "too many arguments" - usage() - - print get_decision(decision_args[0], decision_args[1]) - -except ACMError: - pass -except: - traceback.print_exc(limit=1) _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog