[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-changelog] Update vnets to support UDP encapsulation, multicast forwarding
# HG changeset patch # User kaf24@xxxxxxxxxxxxxxxxxxxx # Node ID 71b0f00f6344227d9f8077e2fe509556c431f2ae # Parent a0e7daa2df33b0b60a5de113e5b4f8eb731866eb Update vnets to support UDP encapsulation, multicast forwarding and optionally running in user-space. Signed-off-by: Mike Wray <mike.wray@xxxxxx> diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/00INSTALL --- a/tools/vnet/00INSTALL Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/00INSTALL Thu Feb 9 15:12:11 2006 @@ -1,3 +1,5 @@ +This directory contains the implementation of vnets: +virtual private networks for virtual machines. make - compile in local dirs. The module is in vnet-module/vnet_module.ko. @@ -8,6 +10,12 @@ make install - compile and install into system. + +By default the makefiles expect this code to have been installed +in tools/vnet in a xen source tree. If compiling outside the xen +source tree, set XEN_ROOT to the location of the xen source. +You can do this in the environment or in a Make.local file +in the current directory (see Make.env for details). The xen0 kernel must have been compiled before building the vnet module. The vnet module installs to diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/00README --- a/tools/vnet/00README Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/00README Thu Feb 9 15:12:11 2006 @@ -1,10 +1,15 @@ This directory contains the implementation of vnets: virtual private networks for virtual machines. -See doc/ for more information and examples/ for example -configurations. -The kernel module is in vnet-module/ and the vnet forwarding -daemon is in vnetd/. The vnetd daemon makes vnets work across -subnets when multicast routing is not available. +See 00INSTALL for build instructions, doc/ for more information +and examples/ for example configurations. + +The vnet implementation can be run using a kernel module +or a user-space daemon. The kernel module is in vnet-module/ and the +user-space daemon (varpd) is in vnetd/. The user-space daemon +needs the tun/tap kernel module. 
Vnets use multicast to find +virtual interfaces and support broadcast. Either implementation can +tunnel multicast packets to other implementations if wide-area +multicast routing is not available. Mike Wray <mike.wray@xxxxxx> \ No newline at end of file diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/Make.env --- a/tools/vnet/Make.env Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/Make.env Thu Feb 9 15:12:11 2006 @@ -1,6 +1,16 @@ # -*- mode: Makefile; -*- -export XEN_ROOT = $(shell cd $(VNET_ROOT)/../.. && pwd) +# Include any local overrides. +-include $(VNET_ROOT)/Make.local + +# If building vnets outside the xen source tree, set XEN_ROOT to the +# absolute path of the root of the xen source tree. Edit this file +# or set XEN_ROOT in Make.local, the make command line or +# the environment. For example put this in Make.local: +# export XEN_ROOT = $(shell cd ~/xen-unstable.hg && pwd) + +export XEN_ROOT ?= $(shell cd $(VNET_ROOT)/../.. && pwd) + export LINUX_SERIES ?= 2.6 DISTDIR ?= $(XEN_ROOT)/dist @@ -10,11 +20,9 @@ export VNETD_DIR = $(VNET_ROOT)/vnetd export LIBXUTIL_DIR = $(VNET_ROOT)/libxutil + export GC_DIR = $(VNET_ROOT)/build/gc export GC_INCLUDE = $(GC_DIR)/include export GC_LIB_DIR = $(GC_DIR)/lib export GC_LIB_A = $(GC_LIB_DIR)/libgc.a export GC_LIB_SO = $(GC_LIB_DIR)/libgc.so - -#$(warning XEN_ROOT = $(XEN_ROOT)) -#$(warning DESTDIR = $(DESTDIR)) diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/Makefile --- a/tools/vnet/Makefile Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/Makefile Thu Feb 9 15:12:11 2006 @@ -7,9 +7,11 @@ .PHONY: all compile install dist clean pristine .PHONY: gc-all gc-install gc-clean +.PHONY: help SUBDIRS:= SUBDIRS+= examples +SUBDIRS+= scripts SUBDIRS+= gc SUBDIRS+= libxutil SUBDIRS+= vnetd @@ -60,3 +62,21 @@ pristine: clean -@$(RM) gc.tar.gz + +help: + @echo 'Cleaning targets:' + @echo ' clean - clean subdirs and remove the build dir' + @echo ' pristine - clean, then remove the gc tarball' + @echo '' + @echo 'Installation targets:' + @echo ' 
install - build and install relative to /' + @echo ' dist - build and install relative to DESTDIR (default XEN_ROOT/dist/install)' + @echo '' + @echo 'Compilation targets:' + @echo ' all - same as compile' + @echo ' compile - build everything' + @echo '' + @echo 'To build everything locally use "make" or "make all"'. + @echo 'To build and install into XEN_ROOT/dist/install use "make dist".' + @echo 'To build and install into the system use "make install".' + @echo 'See ./00README and ./00INSTALL for more information.' diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/doc/vnet-module.txt --- a/tools/vnet/doc/vnet-module.txt Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/doc/vnet-module.txt Thu Feb 9 15:12:11 2006 @@ -1,11 +1,14 @@ -Vnet Module Command Interface +Vnet Low-level Command Interface Mike Wray <mike.wray@xxxxxx> -2005/08/25 +2006/10/12 -When insmod the vnet-module creates /proc/vnet/policy which -can be used to control the module by writing commands into it. -The return code from the command should be returned by close. -Xend uses these commands to implement its vnet interface. +The vnet kernel module and user-space daemon vnetd support a low-level +command interface to control vnets. The kernel module creates /proc/vnet/policy, +which is used by writing commands into it. Vnetd listens on the unix-domain +socket /tmp/vnetd. + +The vn utility in ../scripts provides a higher-level interface to +the vnet commands (using the kernel module or vnetd). The commands are: @@ -37,33 +40,33 @@ Add the vif with MAC address <macaddr> to the vnet with id <vnetid>. This makes the vnet module respond to VARP requests for <macaddr> -on vnet <vnetid>. +on vnet <vnetid>. The vnet implementation learns MAC addresses +so doing this should not be necessary. (vif.del (vnet <vnetid>) (vmac <macaddr>)) Remove the vif with MAC address <macaddr> from the vnet with id <vnetid>. The vnet module will stop responding to VARP for the vif. 
-(vif.print) +(peer.add (addr <addr>)) -Print the known vnets, vifs and varp cache on the console. +Add a peer at IP address <addr> to forward multicasts to, +and accept forwarded multicasts from. -Examples: +(peer.del (addr <addr>)) -To create vnet 10 with no security: +Delete a peer. -echo '(vnet.add (id 10))' > /proc/vnet/policy +(vif.list) - get list of vifs. +(vnet.list) - get list of vnets. +(varp.list) - get vnet/varp info. +(peer.list) - get list of peers. -This creates a device vnif0010. +The kernel module produces output on the console, and vnetd +returns output on the unix socket. The kernel module also provides +the following files which can be read to get information: -To create vnet 11 with message authentication: - -echo '(vnet.add (id 11) (security auth))' > /proc/vnet/policy - -To add the vif with vmac "aa:00:00:bc:34:ae" to vnet 10: - -echo '(vif.add (vnet 10) (vmac aa:00:00:bc:34:ae))' > /proc/vnet/policy - -To remove the vif from the vnet: - -echo '(vif.del (vnet 10) (vmac aa:00:00:bc:34:ae))' > /proc/vnet/policy +/proc/vnet/vifs - get list of vifs. +/proc/vnet/vnets - get list of vnets. +/proc/vnet/varp - get vnet/varp info. +/proc/vnet/peers - get list of peers. diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/doc/vnet-xend.txt --- a/tools/vnet/doc/vnet-xend.txt Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/doc/vnet-xend.txt Thu Feb 9 15:12:11 2006 @@ -3,7 +3,7 @@ Mike Wray <mike.wray@xxxxxx> -2005/08/25 +2005/12/13 0) Introduction --------------- @@ -15,7 +15,7 @@ Virtual interfaces on the same vnet can be on the same machine or on different machines, they can still talk. The hosting machines -can even be on different subnets if you run vnetd to forward, +can even be on different subnets if you configure vnet forwarding, or have multicast routing enabled. @@ -34,7 +34,7 @@ Restart xend. -Alternatively insert the vnet module using vnet-insert, +Alternatively insert the vnet module using 'vn insmod', preferably before xend starts. 
2) Creating vnets @@ -47,14 +47,14 @@ For example, if vnet97.sxp contains: -(vnet (id 97) (bridge vnet97) (vnetif vnetif97) (security none)) +(vnet (id 97) (bridge vnet97) (vnetif vnif97) (security none)) do xm vnet-create vnet97.sxp This will define a vnet with id 97 and no security. The bridge for the -vnet is called vnet97 and the virtual interface for it is vnetif97. +vnet is called vnet97 and the virtual interface for it is vnif97. To add an interface on a vm to this vnet simply set its bridge to vnet97 in its configuration. @@ -66,6 +66,22 @@ (dev (vif (mac aa:00:00:01:02:03) (bridge vnet97))) +By default vnets use udp encapsulation, but if you use etherip encapsulation +you will also have to reduce the MTU of the corresponding +device in the domain (because of the tunneling). Reducing the MTU may improve +performance for udp encapsulation, but is not necessary. + +For example, for eth0 (in the domain, not dom0) use + +ifconfig eth0 mtu 1400 + +or, better, put + +MTU=1400 + +in /etc/sysconfig/network-scripts/ifcfg-eth0. You may also have to change or remove +cached config files for eth0 under /etc/sysconfig/networking. + Once configured, vnets are persistent in the xend database. To remove a vnet use @@ -75,9 +91,13 @@ xm vnet-list -To get information on a vnet id use +To get information on one or more vnet ids use -xm vnet-list <vnet id> +xm vnet-list <vnet id>... + +You can also manage vnets using the vn utility which talks +directly to the vnet implementation. The source is in ../scripts/vn +and is installed in /usr/sbin/vn. 3) Troubleshooting ------------------ @@ -87,20 +107,18 @@ Its bridge and interface should appear in 'ifconfig'. It should also show in 'brctl show', with its attached interfaces. -You can 'see into' a vnet from dom0 if you put an IP address on the bridge -and configure its MAC address as a vif. 
-For example, if you have vnet97 with a vm with ip addr 10.0.0.12 on it, -and <mac> is the MAC address of vnet97 (use ifconfig), then +You can 'see into' a vnet from dom0 if you put an IP address on the bridge. +For example, if you have vnet97 and a vm with ip addr 10.0.0.12 connected to it, +then -echo '(vif.add (vnet 97) (vmac <mac>))' >/proc/vnet/policy ifconfig vnet97 10.0.0.20 up should let you ping 10.0.0.12 via the vnet97 bridge. -This works even if the vm with vif 10.0.0.12 is on another -machine (it only works locally if you don't use vif.add). 4) Examples ----------- + +These assume a vnet with a bridge 'vnet97' has been created. Here's the full config for a vm on vnet 97, using ip addr 10.0.0.12: @@ -143,7 +161,7 @@ the vms should be able to talk over the vnet. Check with ping. If they are both on the same machine the connection will simply be the vnet97 bridge, if they are on separate machines their -packets will be tunneled in etherip. They should be able to +packets will be tunneled in udp (or etherip). They should be able to see each other, but not the real network. 
diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/examples/Makefile --- a/tools/vnet/examples/Makefile Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/examples/Makefile Thu Feb 9 15:12:11 2006 @@ -1,15 +1,19 @@ # -*- mode: Makefile; -*- #============================================================================ -XEN_SCRIPT_DIR:=/etc/xen/scripts +INSTALL = install +INSTALL_PROG = $(INSTALL) -m0755 +INSTALL_DIR = $(INSTALL) -d -m0755 + +XEN_SCRIPT_DIR = $(DESTDIR)/etc/xen/scripts .PHONY: all install clean all: install: - install -m 0755 -d $(DESTDIR)$(XEN_SCRIPT_DIR) - install -m 0554 network-vnet $(DESTDIR)$(XEN_SCRIPT_DIR) - install -m 0554 vnet-insert $(DESTDIR)$(XEN_SCRIPT_DIR) + $(INSTALL_DIR) $(XEN_SCRIPT_DIR) + $(INSTALL_PROG) network-vnet $(XEN_SCRIPT_DIR) + $(INSTALL_PROG) vnet-insert $(XEN_SCRIPT_DIR) clean: \ No newline at end of file diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/examples/vnet97.sxp --- a/tools/vnet/examples/vnet97.sxp Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/examples/vnet97.sxp Thu Feb 9 15:12:11 2006 @@ -1,2 +1,2 @@ # Vnet configuration for a vnet with id 97 and no security. -(vnet (id 97) (bridge vnet97) (vnetif vnetif97) (security none)) +(vnet (id 97) (bridge vnet97) (vnetif vnif97) (security none)) diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/examples/vnet98.sxp --- a/tools/vnet/examples/vnet98.sxp Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/examples/vnet98.sxp Thu Feb 9 15:12:11 2006 @@ -1,2 +1,2 @@ # Vnet configuration for a vnet with id 98 and message authentication. -(vnet (id 98) (bridge vnet98) (vnetif vnetif98) (security auth)) +(vnet (id 98) (bridge vnet98) (vnetif vnif98) (security auth)) diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/examples/vnet99.sxp --- a/tools/vnet/examples/vnet99.sxp Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/examples/vnet99.sxp Thu Feb 9 15:12:11 2006 @@ -1,2 +1,2 @@ # Vnet configuration for a vnet with id 99 and message confidentiality. 
-(vnet (id 99) (bridge vnet99) (vnetif vnetif99) (security conf)) +(vnet (id 99) (bridge vnet99) (vnetif vnif99) (security conf)) diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/libxutil/Makefile --- a/tools/vnet/libxutil/Makefile Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/libxutil/Makefile Thu Feb 9 15:12:11 2006 @@ -14,7 +14,7 @@ LIB_SRCS += allocate.c LIB_SRCS += enum.c LIB_SRCS += file_stream.c -LIB_SRCS += gzip_stream.c +#LIB_SRCS += gzip_stream.c LIB_SRCS += hash_table.c LIB_SRCS += iostream.c LIB_SRCS += lexis.c @@ -45,8 +45,10 @@ all: build -build: check-for-zlib +build: #check-for-zlib $(MAKE) $(LIB) + +gzip_stream.o: check-for-zlib libxutil.so: libxutil.so.$(MAJOR) ln -sf $^ $@ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/libxutil/hash_table.c --- a/tools/vnet/libxutil/hash_table.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/libxutil/hash_table.c Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2001 - 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2001 - 2005 Mike Wray <mike.wray@xxxxxx> * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by @@ -26,8 +26,6 @@ # include <stddef.h> #endif -//#include <limits.h> - #include "allocate.h" #include "hash_table.h" @@ -40,86 +38,129 @@ * buckets in the table changes. */ -/*==========================================================================*/ -/** Number of bits in half a word. */ -//#if __WORDSIZE == 64 -//#define HALF_WORD_BITS 32 -//#else -#define HALF_WORD_BITS 16 -//#endif - -/** Mask for lo half of a word. On 32-bit this is - * (1<<16) - 1 = 65535 = 0xffff - * It's 4294967295 = 0xffffffff on 64-bit. - */ -#define LO_HALF_MASK ((1 << HALF_WORD_BITS) - 1) - -/** Get the lo half of a word. */ -#define LO_HALF(x) ((x) & LO_HALF_MASK) - -/** Get the hi half of a word. */ -#define HI_HALF(x) ((x) >> HALF_WORD_BITS) - -/** Do a full hash on both inputs, using DES-style non-linear scrambling. 
- * Both inputs are replaced with the results of the hash. - * - * @param pleft input/output word - * @param pright input/output word - */ -void pseudo_des(unsigned long *pleft, unsigned long *pright){ - // Bit-rich mixing constant. - static const unsigned long a_mixer[] = { - 0xbaa96887L, 0x1e17d32cL, 0x03bcdc3cL, 0x0f33d1b2L, }; - - // Bit-rich mixing constant. - static const unsigned long b_mixer[] = { - 0x4b0f3b58L, 0xe874f0c3L, 0x6955c5a6L, 0x55a7ca46L, }; - - // Number of iterations - must be 2 or 4. - static const int ncycle = 4; - //static const int ncycle = 2; - - unsigned long left = *pleft, right = *pright; - unsigned long v, v_hi, v_lo; - int i; - - for(i=0; i<ncycle; i++){ - // Flip some bits in right to get v. - v = right; - v ^= a_mixer[i]; - // Get lo and hi halves of v. - v_lo = LO_HALF(v); - v_hi = HI_HALF(v); - // Non-linear mix of the halves of v. - v = ((v_lo * v_lo) + ~(v_hi * v_hi)); - // Swap the halves of v. - v = (HI_HALF(v) | (LO_HALF(v) << HALF_WORD_BITS)); - // Flip some bits. - v ^= b_mixer[i]; - // More non-linear mixing. - v += (v_lo * v_hi); - v ^= left; - left = right; - right = v; - } - *pleft = left; - *pright = right; -} - -/** Hash a string. - * - * @param s input to hash - * @return hashcode - */ -Hashcode hash_string(char *s){ - Hashcode h = 0; - if(s){ - for( ; *s; s++){ - h = hash_2ul(h, *s); - } - } - return h; -} +/*============================================================================*/ +/* +-------------------------------------------------------------------- +lookup2.c, by Bob Jenkins, December 1996, Public Domain. +You can use this free for any purpose. It has no warranty. +-------------------------------------------------------------------- +*/ + +#define hashsize(n) ((ub4)1<<(n)) +#define hashmask(n) (hashsize(n)-1) + +/* +-------------------------------------------------------------------- +mix -- mix 3 32-bit values reversibly. 
+For every delta with one or two bit set, and the deltas of all three + high bits or all three low bits, whether the original value of a,b,c + is almost all zero or is uniformly distributed, +* If mix() is run forward or backward, at least 32 bits in a,b,c + have at least 1/4 probability of changing. +* If mix() is run forward, every bit of c will change between 1/3 and + 2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.) +mix() was built out of 36 single-cycle latency instructions in a + structure that could supported 2x parallelism, like so: + a -= b; + a -= c; x = (c>>13); + b -= c; a ^= x; + b -= a; x = (a<<8); + c -= a; b ^= x; + c -= b; x = (b>>13); + ... + Unfortunately, superscalar Pentiums and Sparcs can't take advantage + of that parallelism. They've also turned some of those single-cycle + latency instructions into multi-cycle latency instructions. Still, + this is the fastest good hash I could find. There were about 2^^68 + to choose from. I only looked at a billion or so. +-------------------------------------------------------------------- +*/ +#define mix(a,b,c) \ +{ \ + a -= b; a -= c; a ^= (c>>13); \ + b -= c; b -= a; b ^= (a<<8); \ + c -= a; c -= b; c ^= (b>>13); \ + a -= b; a -= c; a ^= (c>>12); \ + b -= c; b -= a; b ^= (a<<16); \ + c -= a; c -= b; c ^= (b>>5); \ + a -= b; a -= c; a ^= (c>>3); \ + b -= c; b -= a; b ^= (a<<10); \ + c -= a; c -= b; c ^= (b>>15); \ +} + +/* +-------------------------------------------------------------------- +hash() -- hash a variable-length key into a 32-bit value + k : the key (the unaligned variable-length array of bytes) + len : the length of the key, counting by bytes + level : can be any 4-byte value +Returns a 32-bit value. Every bit of the key affects every bit of +the return value. Every 1-bit and 2-bit delta achieves avalanche. +About 36+6len instructions. + +The best hash table sizes are powers of 2. There is no need to do +mod a prime (mod is sooo slow!). 
If you need less than 32 bits, +use a bitmask. For example, if you need only 10 bits, do + h = (h & hashmask(10)); +In which case, the hash table should have hashsize(10) elements. + +If you are hashing n strings (ub1 **)k, do it like this: + for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h); + +By Bob Jenkins, 1996. bob_jenkins@xxxxxxxxxxxxxxxxx You may use this +code any way you wish, private, educational, or commercial. It's free. + +See http://burtleburtle.net/bob/hash/evahash.html +Use for hash table lookup, or anything where one collision in 2^32 is +acceptable. Do NOT use for cryptographic purposes. +-------------------------------------------------------------------- +*/ + +ub4 hash(const ub1 *k, ub4 length, ub4 initval) +//register ub1 *k; /* the key */ +//register ub4 length; /* the length of the key */ +//register ub4 initval; /* the previous hash, or an arbitrary value */ +{ + /*register*/ ub4 a,b,c,len; + + /* Set up the internal state */ + len = length; + a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */ + c = initval; /* the previous hash value */ + + /*---------------------------------------- handle most of the key */ + while (len >= 12) + { + a += (k[0] +((ub4)k[1]<<8) +((ub4)k[2]<<16) +((ub4)k[3]<<24)); + b += (k[4] +((ub4)k[5]<<8) +((ub4)k[6]<<16) +((ub4)k[7]<<24)); + c += (k[8] +((ub4)k[9]<<8) +((ub4)k[10]<<16)+((ub4)k[11]<<24)); + mix(a,b,c); + k += 12; len -= 12; + } + + /*------------------------------------- handle the last 11 bytes */ + c += length; + switch(len) /* all the case statements fall through */ + { + case 11: c+=((ub4)k[10]<<24); + case 10: c+=((ub4)k[9]<<16); + case 9 : c+=((ub4)k[8]<<8); + /* the first byte of c is reserved for the length */ + case 8 : b+=((ub4)k[7]<<24); + case 7 : b+=((ub4)k[6]<<16); + case 6 : b+=((ub4)k[5]<<8); + case 5 : b+=k[4]; + case 4 : a+=((ub4)k[3]<<24); + case 3 : a+=((ub4)k[2]<<16); + case 2 : a+=((ub4)k[1]<<8); + case 1 : a+=k[0]; + /* case 0: nothing left to add */ + } + 
mix(a,b,c); + /*-------------------------------------------- report the result */ + return c; +} +/*============================================================================*/ /** Get the bucket for a hashcode in a hash table. * @@ -132,28 +173,22 @@ } /** Initialize a hash table. - * Can be safely called more than once. * * @param table to initialize */ -void HashTable_init(HashTable *table){ +static void HashTable_init(HashTable *table){ int i; - if(!table->init_done){ - table->init_done = 1; - table->next_id = 0; - for(i=0; i<table->buckets_n; i++){ - HTBucket *bucket = get_bucket(table, i); - bucket->head = 0; - bucket->count = 0; - } - table->entry_count = 0; - } + for(i = 0; i < table->buckets_n; i++){ + HTBucket *bucket = get_bucket(table, i); + bucket->head = NULL; + bucket->count = 0; + } + table->entry_count = 0; } /** Allocate a new hashtable. * If the number of buckets is not positive the default is used. - * The number of buckets should usually be prime. * * @param buckets_n number of buckets * @return new hashtable or null @@ -167,7 +202,7 @@ z->buckets = (HTBucket*)allocate(buckets_n * sizeof(HTBucket)); if(!z->buckets){ deallocate(z); - z = 0; + z = NULL; goto exit; } z->buckets_n = buckets_n; @@ -233,7 +268,7 @@ goto exit; } table->buckets_n = buckets_n; - for(i=0; i<old_buckets_n; i++){ + for(i=0; i < old_buckets_n; i++){ HTBucket *bucket = old_buckets + i; HTEntry *entry, *next; for(entry = bucket->head; entry; entry = next){ @@ -305,7 +340,7 @@ * @param entry to free */ inline void HashTable_free_entry(HashTable *table, HTEntry *entry){ - if(!entry)return; + if(!entry) return; if(table && table->entry_free_fn){ table->entry_free_fn(table, entry); } else { @@ -325,7 +360,7 @@ inline HTEntry * HashTable_find_entry(HashTable *table, Hashcode hashcode, TableTestFn *test_fn, TableArg arg){ HTBucket *bucket; - HTEntry *entry = 0; + HTEntry *entry = NULL; HTEntry *next; bucket = get_bucket(table, hashcode); @@ -346,7 +381,7 @@ * @return 1 if equal, 
0 otherwise */ inline int HashTable_key_equal(HashTable *table, void *key1, void *key2){ - return (table->key_equal_fn ? table->key_equal_fn(key1, key2) : key1==key2); + return (table->key_equal_fn ? table->key_equal_fn(key1, key2) : key1 == key2); } /** Compute the hashcode of a hashtable key. @@ -358,7 +393,9 @@ * @return hashcode */ inline Hashcode HashTable_key_hash(HashTable *table, void *key){ - return (table->key_hash_fn ? table->key_hash_fn(key) : hash_ul((unsigned long)key)); + return (table->key_hash_fn + ? table->key_hash_fn(key) + : hash_hvoid(0, &key, sizeof(key))); } /** Test if an entry has a given key. @@ -378,16 +415,10 @@ * @param key to look for * @return entry if found, null otherwise */ -#if 0 -inline HTEntry * HashTable_get_entry(HashTable *table, void *key){ - TableArg arg = { ptr: key }; - return HashTable_find_entry(table, HashTable_key_hash(table, key), has_key, arg); -} -#else inline HTEntry * HashTable_get_entry(HashTable *table, void *key){ Hashcode hashcode; HTBucket *bucket; - HTEntry *entry = 0; + HTEntry *entry = NULL; HTEntry *next; hashcode = HashTable_key_hash(table, key); @@ -400,7 +431,6 @@ } return entry; } -#endif /** Get the value of an entry with a given key. 
* @@ -420,7 +450,7 @@ void show_buckets(HashTable *table, IOStream *io){ int i,j ; IOStream_print(io, "entry_count=%d buckets_n=%d\n", table->entry_count, table->buckets_n); - for(i=0; i<table->buckets_n; i++){ + for(i=0; i < table->buckets_n; i++){ if(0 || table->buckets[i].count>0){ IOStream_print(io, "bucket %3d %3d %10p ", i, table->buckets[i].count, @@ -442,10 +472,9 @@ */ static int print_entry(TableArg arg, HashTable *table, HTEntry *entry){ IOStream *io = (IOStream*)arg.ptr; - IOStream_print(io, " b=%4lx h=%08lx i=%08lx |-> e=%8p k=%8p v=%8p\n", + IOStream_print(io, " b=%4lx h=%08lx |-> e=%8p k=%8p v=%8p\n", entry->hashcode % table->buckets_n, entry->hashcode, - entry->index, entry, entry->key, entry->value); return 0; } @@ -460,21 +489,6 @@ IOStream_print(io, "}\n"); } /*==========================================================================*/ - -/** Get the next entry id to use for a table. - * - * @param table hash table - * @return non-zero entry id - */ -static inline unsigned long get_next_id(HashTable *table){ - unsigned long id; - - if(table->next_id == 0){ - table->next_id = 1; - } - id = table->next_id++; - return id; -} /** Add an entry to the bucket for the * given hashcode. @@ -488,7 +502,6 @@ inline HTEntry * HashTable_add_entry(HashTable *table, Hashcode hashcode, void *key, void *value){ HTEntry *entry = HTEntry_new(hashcode, key, value); if(entry){ - entry->index = get_next_id(table); push_on_bucket(table, hashcode, entry); table->entry_count++; } @@ -537,7 +550,6 @@ return HashTable_add_entry(table, HashTable_key_hash(table, key), key, value); } - /** Remove entries satisfying a test from the bucket for the * given hashcode. 
* @@ -550,7 +562,7 @@ inline int HashTable_remove_entry(HashTable *table, Hashcode hashcode, TableTestFn *test_fn, TableArg arg){ HTBucket *bucket; - HTEntry *entry, *prev = 0, *next; + HTEntry *entry, *prev = NULL, *next; int removed_count = 0; bucket = get_bucket(table, hashcode); @@ -566,7 +578,7 @@ table->entry_count--; removed_count++; HashTable_free_entry(table, entry); - entry = 0; + entry = NULL; } prev = entry; } @@ -580,10 +592,9 @@ * @return number of entries removed */ inline int HashTable_remove(HashTable *table, void *key){ -#if 1 Hashcode hashcode; HTBucket *bucket; - HTEntry *entry, *prev = 0, *next; + HTEntry *entry, *prev = NULL, *next; int removed_count = 0; hashcode = HashTable_key_hash(table, key); @@ -600,15 +611,11 @@ table->entry_count--; removed_count++; HashTable_free_entry(table, entry); - entry = 0; + entry = NULL; } prev = entry; } return removed_count; -#else - return HashTable_remove_entry(table, HashTable_key_hash(table, key), - has_key, (TableArg){ ptr: key}); -#endif } /** Remove (and free) all the entries in a bucket. 
@@ -622,7 +629,7 @@ next = entry->next; HashTable_free_entry(table, entry); } - bucket->head = 0; + bucket->head = NULL; table->entry_count -= bucket->count; bucket->count = 0; } @@ -634,7 +641,7 @@ void HashTable_clear(HashTable *table){ int i, n = table->buckets_n; - for(i=0; i<n; i++){ + for(i = 0; i < n; i++){ bucket_clear(table, table->buckets + i); } } diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/libxutil/hash_table.h --- a/tools/vnet/libxutil/hash_table.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/libxutil/hash_table.h Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2001 - 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2001 - 2005 Mike Wray <mike.wray@xxxxxx> * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by @@ -20,6 +20,7 @@ #define _XUTIL_HASH_TABLE_H_ #include "iostream.h" +#include "sys_string.h" typedef unsigned long Hashcode; @@ -33,8 +34,6 @@ typedef struct HTEntry { /** Hashcode of the entry's key. */ Hashcode hashcode; - /** Identifier for this entry in the table. */ - int index; /** The key for this entry. */ void *key; /** The value in this entry. */ @@ -53,8 +52,8 @@ /** Default number of buckets in a hash table. * You want enough buckets so the lists in the buckets will typically be short. - * It's a good idea if this is prime, since that will help to spread hashcodes - * around the table. + * If the hash function is good it doesn't matter whether the number of + * buckets is prime or not. */ //#define HT_BUCKETS_N 1 //#define HT_BUCKETS_N 3 @@ -91,14 +90,10 @@ * These all default to 0, when default behaviour treating keys as integers is used. */ struct HashTable { - /** Flag indicating whether the table has been initialised. */ - int init_done; - /** Next value for the id field in inserted rules. */ - unsigned long next_id; + /** Array of buckets, each with its own list. 
*/ + HTBucket *buckets; /** Number of buckets in the bucket array. */ int buckets_n; - /** Array of buckets, each with its own list. */ - HTBucket *buckets; /** Number of entries in the table. */ int entry_count; /** Function to free keys and values in entries. */ @@ -126,85 +121,35 @@ TableTestFn *test_fn, TableArg arg); extern int HashTable_remove_entry(HashTable *table, Hashcode hashcode, TableTestFn *test_fn, TableArg arg); -//extern int HashTable_map(HashTable *table, TableMapFn *map_fn, TableArg arg); extern void HashTable_print(HashTable *table, IOStream *out); extern int HashTable_set_buckets_n(HashTable *table, int buckets_n); extern int HashTable_adjust(HashTable *table, int buckets_min); -extern void pseudo_des(unsigned long *pleft, unsigned long *pright); -extern Hashcode hash_string(char *s); extern int HashTable_order_bucket(HashTable *table, Hashcode hashcode, TableOrderFn *order); -/** Control whether to use hashing based on DES or simple - * hashing. DES hashing is `more random' but much more expensive. - */ -#define HASH_PSEUDO_DES 0 - -/** Hash a long using a quick and dirty linear congruential random number generator. - * See `Numerical Recipes in C', Chapter 7, "An Even Quicker Generator". - * - * @param a value to hash - * @return hashed input - */ -static inline unsigned long lcrng_hash(unsigned long a){ - return (1664525L * a + 1013904223L); +typedef unsigned long ub4; +typedef unsigned char ub1; + +extern ub4 hash(const ub1 *k, ub4 length, ub4 initval); + +/** Hash some bytes starting with a given hashcode. + * + * @param h initial hashcode - use 0, a previous hash, or an arbitrary value + * @param b bytes to hash + * @param b_n number of bytes to hash + * @return hashcode + */ +static inline Hashcode hash_hvoid(Hashcode h, const void *b, unsigned b_n){ + return hash(b, b_n, h); } -/** Hash an unsigned long. - * - * @param a input to hash +/** Hash a string (null-terminated). 
+ * + * @param s input to hash * @return hashcode */ -static inline Hashcode hash_ul(unsigned long a){ -#if HASH_PSEUDO_DES - unsigned long left = a; - unsigned long right = 0L; - pseudo_des(&left, &right); - return right; -#else - a = lcrng_hash(a); - a = lcrng_hash(a); - return a; -#endif -} - -/** Hash two unsigned longs together. - * - * @param a input to hash - * @param b input to hash - * @return hashcode - */ -static inline Hashcode hash_2ul(unsigned long a, unsigned long b){ -#if HASH_PSEUDO_DES - unsigned long left = a; - unsigned long right = b; - pseudo_des(&left, &right); - return right; -#else - a = lcrng_hash(a); - a ^= b; - a = lcrng_hash(a); - return a; -#endif -} - -/** Hash a hashcode and an unsigned long together. - * - * @param a input hashcode - * @param b input to hash - * @return hashcode - */ -static inline Hashcode hash_hul(Hashcode a, unsigned long b){ -#if HASH_PSEUDO_DES - unsigned long left = a; - unsigned long right = b; - pseudo_des(&left, &right); - return right; -#else - a ^= b; - a = lcrng_hash(a); - return a; -#endif +static inline Hashcode hash_string(char *s){ + return (s ? hash_hvoid(0, s, strlen(s)) : 0); } /** Macro to declare variables for HashTable_for_each() to use. diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/libxutil/mem_stream.c --- a/tools/vnet/libxutil/mem_stream.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/libxutil/mem_stream.c Thu Feb 9 15:12:11 2006 @@ -183,6 +183,8 @@ static int mem_expand(MemData *data, size_t extra){ int err = -ENOMEM; int delta = (extra < delta_min ? 
delta_min : extra); + int buf_n; + char *buf; if(data->buf_max > 0){ int delta_max = data->buf_max - data->buf_n; if(delta > delta_max){ @@ -190,8 +192,8 @@ if(delta > delta_max) goto exit; } } - int buf_n = data->buf_n + delta; - char *buf = allocate(buf_n); + buf_n = data->buf_n + delta; + buf = allocate(buf_n); if(!buf) goto exit; mem_get(data, buf, mem_len(data)); data->hi = mem_len(data); @@ -218,9 +220,10 @@ * @return number of bytes written on success, negative error code otherwise */ static int mem_write(IOStream *io, const void *msg, size_t n){ + int room; MemData *data = get_mem_data(io); if(data->err) return -data->err; - int room = mem_room(data); + room = mem_room(data); if(n > room){ int err = mem_expand(data, n - room); if(err) return err; @@ -238,9 +241,10 @@ * @return number of bytes read on success, negative error code otherwise */ static int mem_read(IOStream *io, void *buf, size_t n){ + int k; MemData *data = get_mem_data(io); if(data->err) return -data->err; - int k = mem_len(data); + k = mem_len(data); if(n > k){ n = k; } @@ -292,8 +296,9 @@ IOStream *mem_stream_new_size(size_t buf_n, size_t buf_max){ int err = -ENOMEM; MemData *data = ALLOCATE(MemData); + IOStream *io = NULL; if(!data) goto exit; - IOStream *io = ALLOCATE(IOStream); + io = ALLOCATE(IOStream); if(!io) goto exit; if(buf_n <= delta_min){ buf_n = delta_min; diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/libxutil/sxpr.h --- a/tools/vnet/libxutil/sxpr.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/libxutil/sxpr.h Thu Feb 9 15:12:11 2006 @@ -149,7 +149,7 @@ typedef struct ObjString { int len; - char data[]; + char data[0]; } ObjString; /** An atom. 
*/ @@ -318,7 +318,6 @@ ObjCopyFn *copy; } SxprType; - extern int def_sxpr_type(SxprType *tydef); extern SxprType *get_sxpr_type(int ty); diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/libxutil/sxpr_parser.c --- a/tools/vnet/libxutil/sxpr_parser.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/libxutil/sxpr_parser.c Thu Feb 9 15:12:11 2006 @@ -310,6 +310,7 @@ void Parser_pop(Parser *p){ ParserState *s = p->state; if(!s) return; + dprintf("Parser_pop> %s\n", s->name); p->state = s->parent; if (p->start_state == s) { p->start_state = NULL; @@ -336,6 +337,7 @@ } int Parser_push(Parser *p, ParserStateFn *fn, char *name){ + dprintf("Parser_push> %s\n", name); return ParserState_new(fn, name, p->state, &p->state); } @@ -522,7 +524,7 @@ } Sxpr Parser_get_val(Parser *p){ - Sxpr v = ONONE; + Sxpr v = ONONE, w = ONONE; if(CONSP(p->val)){ } else if (p->start_state && CONSP(p->start_state->val)){ p->val = p->start_state->val; @@ -531,7 +533,7 @@ } else { goto exit; } - Sxpr w = p->val; + w = p->val; v = CAR(w); p->val = CDR(w); hfree(w); @@ -940,11 +942,13 @@ int Parser_input(Parser *p, char *buf, int buf_n){ int err = 0; int i = 0; - dprintf("> |%s|\n", buf); + dprintf("> buf_n=%d\n", buf_n); if(buf_n <= 0){ + buf_n = 0; err = Parser_input_eof(p); goto exit; } + dprintf("> buf=|%*s|\n", buf_n, buf); for(i = 0; i < buf_n; i++){ err = Parser_input_char(p, buf[i]); if(err) goto exit; diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/libxutil/sys_net.c --- a/tools/vnet/libxutil/sys_net.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/libxutil/sys_net.c Thu Feb 9 15:12:11 2006 @@ -182,7 +182,7 @@ #else struct hostent *host = gethostbyname(name); if(!host){ - return -EINVAL; + return -ENOENT; } *address = ((struct in_addr *)(host->h_addr))->s_addr; return 0; diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/Makefile-2.6 --- a/tools/vnet/vnet-module/Makefile-2.6 Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/Makefile-2.6 Thu Feb 9 15:12:11 2006 @@ -30,11 +30,15 @@ #export 
KBUILD_VERBOSE=1 .PHONY: all -all: module +all: module module_version .PHONY: module module modules: $(MAKE) -C $(KERNEL_SRC) M=`pwd` modules + +.PHONY: module_version +module_version: + $(warning Module version $(shell strings $(KERNEL_MODULE) | grep vermagic)) .PHONY: install install-module modules_install install install-module modules_install: module @@ -47,6 +51,7 @@ -@$(RM) *.a *.o *.ko *~ .*.d .*.cmd *.mod.? -@$(RM) -r .tmp_versions +.PHONY: TAGS TAGS: etags *.c *.h diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/Makefile.ver --- a/tools/vnet/vnet-module/Makefile.ver Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/Makefile.ver Thu Feb 9 15:12:11 2006 @@ -21,8 +21,8 @@ LINUX_SERIES ?=2.6 KERNEL_MINOR ?=-xen0 -LINUX_VERSION ?= $(shell ( /bin/ls -ld $(XEN_ROOT)/linux-$(LINUX_SERIES).*-xen0 ) 2>/dev/null | \ - sed -e 's!^.*linux-\(.\+\)-xen0!\1!' ) +LINUX_VERSION ?= $(shell (/bin/ls -ld $(XEN_ROOT)/pristine-linux-$(LINUX_SERIES).* 2>/dev/null) | \ + sed -e 's!^.*linux-\(.\+\)!\1!' ) ifeq ($(LINUX_VERSION),) $(error Kernel source for linux $(LINUX_SERIES) not found) @@ -32,7 +32,13 @@ KERNEL_SRC ?= $(XEN_ROOT)/linux-$(KERNEL_VERSION) -KERNEL_MODULE_DIR = /lib/modules/$(KERNEL_VERSION)/kernel +# Get the full kernel release version from its makefile, as the source path +# may not have the extraversion, e.g. linux-2.6.12-xen0 may contain release 2.6.12.6-xen0. 
+KERNEL_RELEASE = $(shell make -s -C $(KERNEL_SRC) kernelrelease || \ + make -f $(shell pwd)/Makefile.kver -s -C $(KERNEL_SRC) kernelrelease ) +KERNEL_MODULE_DIR = /lib/modules/$(KERNEL_RELEASE)/kernel + +$(warning KERNEL_SRC $(KERNEL_SRC)) #$(warning KERNEL_VERSION $(KERNEL_VERSION)) -#$(warning KERNEL_SRC $(KERNEL_SRC)) +$(warning KERNEL_RELEASE $(KERNEL_RELEASE)) diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/Makefile.vnet --- a/tools/vnet/vnet-module/Makefile.vnet Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/Makefile.vnet Thu Feb 9 15:12:11 2006 @@ -32,6 +32,8 @@ VNET_SRC += sa.c VNET_SRC += skb_context.c VNET_SRC += skb_util.c +VNET_SRC += sxpr_util.c +VNET_SRC += timer_util.c VNET_SRC += tunnel.c VNET_SRC += varp.c VNET_SRC += varp_socket.c @@ -39,12 +41,15 @@ VNET_SRC += vnet.c VNET_SRC += vnet_dev.c VNET_SRC += vnet_ioctl.c +VNET_SRC += vnet_eval.c +VNET_SRC += vnet_forward.c VNET_LIB_SRC += allocate.c VNET_LIB_SRC += enum.c VNET_LIB_SRC += hash_table.c VNET_LIB_SRC += iostream.c VNET_LIB_SRC += kernel_stream.c +VNET_LIB_SRC += mem_stream.c VNET_LIB_SRC += sxpr.c VNET_LIB_SRC += sxpr_parser.c VNET_LIB_SRC += sys_net.c diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/esp.c --- a/tools/vnet/vnet-module/esp.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/esp.c Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -51,6 +51,7 @@ #include <tunnel.h> #include <vnet.h> #include <skb_util.h> +#include <skb_context.h> static const int DEBUG_ICV = 0; @@ -58,6 +59,18 @@ #define DEBUG 1 #undef DEBUG #include "debug.h" + +#ifndef CONFIG_CRYPTO_HMAC +#warning No esp transform - CONFIG_CRYPTO_HMAC not defined + +int __init esp_module_init(void){ + return 0; +} + +void __exit 
esp_module_exit(void){ +} + +#else /* Outgoing packet: [ eth | ip | data ] * After etherip: [ eth2 | ip2 | ethip | eth | ip | data ] @@ -221,7 +234,7 @@ if(DEBUG_ICV){ dprintf("> skb digest_n=%d icv_n=%d\n", digest_n, icv_n); - skb_print_bits(skb, 0, digest_n); + skb_print_bits("esp", skb, 0, digest_n); } memset(icv, 0, icv_n); esp->digest.icv(esp, skb, 0, digest_n, icv); @@ -248,7 +261,7 @@ if(DEBUG_ICV){ dprintf("> skb len=%d digest_n=%d icv_n=%d\n", skb->len, digest_n, icv_n); - skb_print_bits(skb, 0, skb->len); + skb_print_bits("esp", skb, 0, skb->len); } if(skb_copy_bits(skb, digest_n, icv_skb, icv_n)){ wprintf("> Error getting icv from skb\n"); @@ -309,7 +322,7 @@ dprintf("> len=%d plaintext=%d ciphertext=%d extra=%d\n", skb->len, plaintext_n, ciphertext_n, extra_n); dprintf("> iv=%d icv=%d\n", iv_n, icv_n); - skb_print_bits(skb, 0, skb->len); + skb_print_bits("iv", skb, 0, skb->len); // Add headroom for esp header and iv, tailroom for the ciphertext // and icv. @@ -393,9 +406,12 @@ * Does ESP receive processing (check icv, decrypt), strips * ESP header and re-receives. * + * If return 1 the packet has been freed. + * If return <= 0 the caller must free. + * * @param sa SA * @param skb packet - * @return 0 on success, negative error code otherwise + * @return >= 0 on success, negative protocol otherwise */ static int esp_sa_recv(SAState *sa, struct sk_buff *skb){ int err = -EINVAL; @@ -458,10 +474,19 @@ sa, esp_context_free_fn); if(err) goto exit; // Increase sa refcount now the skb context refers to it. + // Refcount is decreased by esp_context_free_fn. SAState_incref(sa); - err = netif_rx(skb); - exit: - if(mine) err = 1; + // Deliver skb to be received by network code. + // Not safe to refer to the skb after this. + // todo: return -skb->nh.iph->protocol instead? 
+ netif_rx(skb); + exit: + if(mine){ + if(err < 0){ + kfree_skb(skb); + } + err = 1; + } dprintf("< skb=%p err=%d\n", skb, err); return err; } @@ -717,9 +742,15 @@ * Lookup spi, if state found hand to the state. * If no state, check spi, if ok, create state and pass to it. * If spi not ok, drop. + * + * Return value convention for protocols: + * >= 0 Protocol took the packet + * < 0 A -ve protocol id the packet should be re-received as. + * + * So always return >=0 if we took the packet, even if we dropped it. * * @param skb packet - * @return 0 on sucess, negative error code otherwise + * @return 0 on sucess, negative protocol number otherwise */ static int esp_protocol_recv(struct sk_buff *skb){ int err = 0; @@ -730,7 +761,10 @@ u32 addr; dprintf(">\n"); - dprintf("> recv skb=\n"); skb_print_bits(skb, 0, skb->len); +#ifdef DEBUG + dprintf("> recv skb=\n"); + skb_print_bits(skb, 0, skb->len); +#endif ip_n = (skb->nh.iph->ihl << 2); if(skb->data == skb->mac.raw){ // skb->data points at ethernet header. @@ -751,9 +785,14 @@ err = vnet_sa_create(esph->spi, IPPROTO_ESP, addr, &sa); if(err) goto exit; } + //todo: Return a -ve protocol instead? See esp_sa_recv. 
err = SAState_recv(sa, skb); exit: if(sa) SAState_decref(sa); + if(err <= 0){ + kfree_skb(skb); + err = 0; + } dprintf("< err=%d\n", err); return err; } @@ -861,3 +900,4 @@ } } +#endif // CONFIG_CRYPTO_HMAC diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/esp.h --- a/tools/vnet/vnet-module/esp.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/esp.h Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -19,9 +19,18 @@ #ifndef __VNET_ESP_H__ #define __VNET_ESP_H__ +#ifdef __KERNEL__ #include <linux/config.h> #include <linux/types.h> #include <linux/crypto.h> + +#else + +#include "sys_kernel.h" + +struct crypto_tfm; + +#endif /** Header used by IPSEC ESP (Encapsulated Security Payload). */ typedef struct ESPHdr { diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/etherip.c --- a/tools/vnet/vnet-module/etherip.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/etherip.c Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -16,6 +16,8 @@ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA * */ +#ifdef __KERNEL__ + #include <linux/config.h> #include <linux/module.h> #include <linux/types.h> @@ -29,13 +31,30 @@ #include <linux/netdevice.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv4.h> #include <linux/icmp.h> +#include <linux/udp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/checksum.h> + +#else + +#include <netinet/in.h> +#include 
<arpa/inet.h> + +#include "sys_kernel.h" +#include "spinlock.h" +#include "skbuff.h" +#include <linux/ip.h> +#include <linux/udp.h> + +#define IP_DF 0x4000 /* Flag: "Don't Fragment" */ + +#endif #include <etherip.h> #include <tunnel.h> @@ -44,15 +63,22 @@ #include <if_varp.h> #include <varp.h> #include <skb_util.h> +#include <skb_context.h> #define MODULE_NAME "VNET" -//#define DEBUG 1 +#define DEBUG 1 #undef DEBUG #include "debug.h" /** @file Etherip implementation. * The etherip protocol is used to transport Ethernet frames in IP packets. */ + +/** Flag controlling whether to use etherip-in-udp encapsulation. + * If false we send etherip protocol in IP packets. + * If true we send etherip protocol in UDP packets with a vnet header. + */ +int etherip_in_udp = 1; /** Get the vnet label from an etherip header. * @@ -64,7 +90,7 @@ *vnet = *(VnetId*)hdr->vnet; #else *vnet = (VnetId){}; - vnet->u.vnet16[7] = (unsigned short)hdr->reserved; + vnet->u.vnet16[VNET_SIZE16 - 1] = (unsigned short)hdr->reserved; #endif } @@ -81,7 +107,7 @@ *(VnetId*)hdr->vnet = *vnet; #else hdr->version = ETHERIP_VERSION; - hdr->reserved = (vnet->u.vnet16[7] & 0x0fff); + hdr->reserved = (vnet->u.vnet16[VNET_SIZE16 - 1] & 0x0fff); #endif } @@ -112,55 +138,69 @@ */ static int etherip_tunnel_send(Tunnel *tunnel, struct sk_buff *skb){ int err = 0; + const int ip_n = sizeof(struct iphdr); const int etherip_n = sizeof(struct etheriphdr); - const int ip_n = sizeof(struct iphdr); - const int eth_n = ETH_HLEN; - int head_n = 0; + const int udp_n = sizeof(struct udphdr); + const int vnet_n = sizeof(struct VnetMsgHdr); + int head_n = etherip_n + ip_n /* + ETH_HLEN */; VnetId *vnet = &tunnel->key.vnet; struct etheriphdr *etheriph; - struct ethhdr *ethh; u32 saddr = 0; - //dprintf("> skb=%p vnet=%d\n", skb, vnet); - head_n = etherip_n + ip_n + eth_n; + if(etherip_in_udp){ + head_n += vnet_n + udp_n; + } err = skb_make_room(&skb, skb, head_n, 0); if(err) goto exit; - //err = 
vnet_get_device_address(skb->dev, &saddr); - //if(err) goto exit; - - // The original ethernet header. - ethh = eth_hdr(skb); - //print_skb_data(__FUNCTION__, 0, skb, skb->mac.raw, skb->len); // Null the pointer as we are pushing a new IP header. skb->mac.raw = NULL; // Setup the etherip header. - //dprintf("> push etherip header...\n"); - etheriph = (struct etheriphdr *)skb_push(skb, etherip_n); + etheriph = (void*)skb_push(skb, etherip_n); etheriphdr_set_vnet(etheriph, vnet); + if(etherip_in_udp){ + // Vnet header. + struct VnetMsgHdr *vhdr = (void*)skb_push(skb, vnet_n); + vhdr->id = htons(VUDP_ID); + vhdr->opcode = 0; + + // Setup the UDP header. + skb->h.raw = skb_push(skb, udp_n); + skb->h.uh->source = varp_port; // Source port. + skb->h.uh->dest = varp_port; // Destination port. + skb->h.uh->len = htons(skb->len); // Total packet length (bytes). + skb->h.uh->check = 0; + } + // Setup the IP header. - //dprintf("> push IP header...\n"); skb->nh.raw = skb_push(skb, ip_n); skb->nh.iph->version = 4; // Standard version. skb->nh.iph->ihl = ip_n / 4; // IP header length (32-bit words). skb->nh.iph->tos = 0; // No special type-of-service. skb->nh.iph->tot_len = htons(skb->len); // Total packet length (bytes). skb->nh.iph->id = 0; // No flow id (since no frags). - skb->nh.iph->frag_off = htons(IP_DF); // Don't fragment - can't handle frags. + if(etherip_in_udp){ + skb->nh.iph->protocol = IPPROTO_UDP; // IP protocol number. + skb->nh.iph->frag_off = 0; + } else { + skb->nh.iph->protocol = IPPROTO_ETHERIP;// IP protocol number. + skb->nh.iph->frag_off = htons(IP_DF); // Don't fragment - can't handle frags. + } skb->nh.iph->ttl = 64; // Linux default time-to-live. - skb->nh.iph->protocol = IPPROTO_ETHERIP; // IP protocol number. skb->nh.iph->saddr = saddr; // Source address. - skb->nh.iph->daddr = tunnel->key.addr.u.ip4.s_addr; // Destination address. - skb->nh.iph->check = 0; + skb->nh.iph->daddr = tunnel->key.addr.u.ip4.s_addr; // Destination address. 
+ skb->nh.iph->check = 0; // Zero the checksum. // Ethernet header will be filled-in by device. err = Tunnel_send(tunnel->base, skb); skb = NULL; exit: - if(err && skb) dev_kfree_skb(skb); - //dprintf("< err=%d\n", err); + if(err && skb){ + wprintf("< err=%d\n", err); + kfree_skb(skb); + } return err; } @@ -175,73 +215,59 @@ TunnelType *etherip_tunnel_type = &_etherip_tunnel_type; -/* Defeat compiler warnings about unused functions. */ -static void print_str(char *s, int n) __attribute__((unused)); - -static void print_str(char *s, int n) { - int i; - - for(i=0; i<n; s++, i++){ - if(i && i % 40 == 0) printk("\n"); - if(('a'<= *s && *s <= 'z') || - ('A'<= *s && *s <= 'Z') || - ('0'<= *s && *s <= '9')){ - printk("%c", *s); - } else { - printk("<%x>", (unsigned)(0xff & *s)); - } - } - printk("\n"); +int etherip_tunnel_create(VnetId *vnet, VarpAddr *addr, Tunnel *base, Tunnel **tunnel){ + return Tunnel_create(etherip_tunnel_type, vnet, addr, base, tunnel); } /** Do etherip receive processing. - * Strips etherip header to extract the ethernet frame, sets + * Strips the etherip header to extract the ethernet frame, sets * the vnet from the header and re-receives the frame. * + * Return code 1 means we now own the packet - the caller must not free it. + * Return code < 0 means an error - caller still owns the packet. 
+ * * @param skb packet - * @return 0 on success, error code otherwise - */ -static int etherip_protocol_recv(struct sk_buff *skb){ + * @return 1 on success, error code otherwise + */ +int etherip_protocol_recv(struct sk_buff *skb){ int err = 0; - int mine = 0; - const int eth_n = ETH_HLEN; - int ip_n; const int etherip_n = sizeof(struct etheriphdr); struct etheriphdr *etheriph; - struct ethhdr *ethhdr; Vnet *vinfo = NULL; VnetId vnet = {}; u32 saddr, daddr; char vnetbuf[VNET_ID_BUF]; - + struct ethhdr *eth; + + dprintf(">\n"); saddr = skb->nh.iph->saddr; daddr = skb->nh.iph->daddr; - ethhdr = eth_hdr(skb); if(MULTICAST(daddr) && (daddr != varp_mcast_addr)){ // Ignore multicast packets not addressed to us. - dprintf("> Ignoring mcast skb: src=%u.%u.%u.%u dst=%u.%u.%u.%u" + wprintf("> Ignoring mcast skb: src=%u.%u.%u.%u dst=%u.%u.%u.%u" " varp_mcast_addr=%u.%u.%u.%u\n", NIPQUAD(saddr), NIPQUAD(daddr), NIPQUAD(varp_mcast_addr)); goto exit; } - ip_n = (skb->nh.iph->ihl << 2); if(skb->data == skb->mac.raw){ // skb->data points at ethernet header. + //FIXME: Does this ever happen? //dprintf("> len=%d\n", skb->len); - if (!pskb_may_pull(skb, eth_n + ip_n)){ + int ip_n = (skb->nh.iph->ihl << 2); + int pull_n = ETH_HLEN + ip_n; + if (!pskb_may_pull(skb, pull_n)){ wprintf("> Malformed skb (eth+ip) src=%u.%u.%u.%u\n", NIPQUAD(saddr)); err = -EINVAL; goto exit; } - skb_pull(skb, eth_n + ip_n); + skb_pull(skb, pull_n); } // Assume skb->data points at etherip header. etheriph = (void*)skb->data; if(etheriph->version != ETHERIP_VERSION){ wprintf("> Bad etherip version=%d src=%u.%u.%u.%u\n", - etheriph->version, - NIPQUAD(saddr)); + etheriph->version, NIPQUAD(saddr)); err = -EINVAL; goto exit; } @@ -252,105 +278,81 @@ goto exit; } etheriphdr_get_vnet(etheriph, &vnet); - dprintf("> Rcvd skb vnet=%s src=%u.%u.%u.%u\n", - VnetId_ntoa(&vnet, vnetbuf), - NIPQUAD(saddr)); // If vnet is secure, context must include IPSEC ESP. 
err = vnet_check_context(&vnet, SKB_CONTEXT(skb), &vinfo); - Vnet_decref(vinfo); if(err){ wprintf("> Failed security check vnet=%s src=%u.%u.%u.%u\n", - VnetId_ntoa(&vnet, vnetbuf), - NIPQUAD(saddr)); + VnetId_ntoa(&vnet, vnetbuf), NIPQUAD(saddr)); goto exit; } - mine = 1; // Point at the headers in the contained ethernet frame. skb->mac.raw = skb_pull(skb, etherip_n); - - // Know source ip, vnet, vmac, so could update varp cache. - // But if traffic comes to us over a vnetd tunnel this points the coa - // at the vnetd rather than the endpoint. So don't do it. - //varp_update(vnet, eth_hdr(skb)->h_source, skb->nh.iph->saddr); - + eth = eth_hdr(skb); + + // Simulate the logic from eth_type_trans() + // to set skb->pkt_type and skb->protocol. + if(mac_is_multicast(eth->h_dest)){ + if(mac_is_broadcast(eth->h_dest)){ + skb->pkt_type = PACKET_BROADCAST; + } else { + skb->pkt_type = PACKET_MULTICAST; + } + } else { + skb->pkt_type = PACKET_HOST; + } + if(ntohs(eth->h_proto) >= 1536){ + skb->protocol = eth->h_proto; + } else { + skb->protocol = htons(ETH_P_802_2); + } + // Assuming a standard Ethernet frame. // Should check for protocol? Support ETH_P_8021Q too. skb->nh.raw = skb_pull(skb, ETH_HLEN); - dprintf("> Unpacked vnet=%s srcmac=" MACFMT " dstmac=" MACFMT "\n", +#ifdef __KERNEL__ + // Fix IP options, checksum, skb dst, netfilter state. + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + if (skb->ip_summed == CHECKSUM_HW){ + skb->ip_summed = CHECKSUM_NONE; + } + dst_release(skb->dst); + skb->dst = NULL; + nf_reset(skb); +#ifdef CONFIG_BRIDGE_NETFILTER + // Stop the eth header being clobbered by nf_bridge_maybe_copy_header(). + // Were using this modified to use h_proto instead of skb->protocol. 
+ if(skb->nf_bridge){ + nf_bridge_save_header(skb); + } +#endif +#endif // __KERNEL__ + + dprintf("> Unpacked srcaddr=" IPFMT " vnet=%s srcmac=" MACFMT " dstmac=" MACFMT "\n", + NIPQUAD(skb->nh.iph->saddr), VnetId_ntoa(&vnet, vnetbuf), - MAC6TUPLE(eth_hdr(skb)->h_source), - MAC6TUPLE(eth_hdr(skb)->h_dest)); - -#ifdef CONFIG_NETFILTER -#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) - // This stops our new pkt header being clobbered by a subsequent - // call to nf_bridge_maybe_copy_header. - // Code from nf_bridge_save_header() modidifed to use h_proto - // instead of skb->protocol. - if(skb->nf_bridge){ - // Hmm. Standard ethernet header is ETH_HLEN (14), - // VLAN header (802.1q) is VLAN_ETH_HLEN (18). - // Where does 16 come from? - int header_size = 16; - if(eth_hdr(skb)->h_proto == __constant_htons(ETH_P_8021Q)) { - header_size = 18; - } - memcpy(skb->nf_bridge->data, skb->data - header_size, header_size); - } -#endif -#endif - - if(1){ - struct ethhdr *eth = eth_hdr(skb); - // Devices use eth_type_trans() to set skb->pkt_type and skb->protocol. - // Set them from contained ethhdr, or leave as received? - // 'Ware use of hard_header_len in eth_type_trans(). 
- - //skb->protocol = htons(ETH_P_IP); - - if(ntohs(eth->h_proto) >= 1536){ - skb->protocol = eth->h_proto; - } else { - skb->protocol = htons(ETH_P_802_2); - } - - if(mac_is_multicast(eth->h_dest)){ - if(mac_is_broadcast(eth->h_dest)){ - skb->pkt_type = PACKET_BROADCAST; - } else { - skb->pkt_type = PACKET_MULTICAST; - } - } else { - skb->pkt_type = PACKET_HOST; - } - - memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); - if (skb->ip_summed == CHECKSUM_HW){ - skb->ip_summed = CHECKSUM_NONE; - //skb->csum = csum_sub(skb->csum, - // csum_partial(skb->mac.raw, skb->nh.raw - skb->mac.raw, 0)); - } - dst_release(skb->dst); - skb->dst = NULL; - -#ifdef CONFIG_NETFILTER - nf_conntrack_put(skb->nfct); - skb->nfct = NULL; -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug = 0; -#endif -#endif - } - - //print_skb_data(__FUNCTION__, 0, skb, skb->mac.raw, skb->len + ETH_HLEN); - - err = vnet_skb_recv(skb, &vnet, (Vmac*)eth_hdr(skb)->h_dest); + MAC6TUPLE(eth->h_source), + MAC6TUPLE(eth->h_dest)); + //print_skb(__FUNCTION__, 0, skb); + + { + // Know source ip, vnet, vmac, so update the varp cache. + // For this to work forwarded vnet packets must have the + // original source address. + VarpAddr addr = { .family = AF_INET }; + addr.u.ip4.s_addr = saddr; + varp_update(&vnet, eth->h_source, &addr); + } + + err = vnet_skb_recv(skb, vinfo); exit: - if(mine) err = 1; + if(vinfo) Vnet_decref(vinfo); dprintf("< skb=%p err=%d\n", skb, err); return err; } + + +#ifdef __KERNEL__ /** Handle an ICMP error related to etherip. 
* @@ -433,3 +435,5 @@ printk(KERN_INFO "%s: can't remove etherip protocol\n", __FUNCTION__); } } + +#endif // __KERNEL__ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/etherip.h --- a/tools/vnet/vnet-module/etherip.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/etherip.h Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -21,7 +21,18 @@ #include "if_etherip.h" +#ifdef __KERNEL__ extern int etherip_module_init(void); extern void etherip_module_exit(void); +#endif +extern int etherip_protocol_recv(struct sk_buff *skb); +extern int etherip_in_udp; + +struct VnetId; +struct VarpAddr; +struct Tunnel; + +extern int etherip_tunnel_create(struct VnetId *vnet, struct VarpAddr *addr, + struct Tunnel *base, struct Tunnel **tunnel); #endif diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/if_etherip.h --- a/tools/vnet/vnet-module/if_etherip.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/if_etherip.h Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -19,9 +19,25 @@ #ifndef _VNET_IF_ETHERIP_H_ #define _VNET_IF_ETHERIP_H_ +#ifdef __KERNEL__ +#include <asm/byteorder.h> +#else +#define __KERNEL__ +/* This include may cause a compile warning, which can be ignored. + * Can't use <endian.h> because it doesn't define + *__LITTLE_ENDIAN_BITFIELD or __BIG_ENDIAN_BITFIELD. 
+ */ +#include <asm/byteorder.h> +#undef __KERNEL__ +#endif + +#include <if_varp.h> + #define CONFIG_ETHERIP_EXT #ifdef CONFIG_ETHERIP_EXT + +/* Extended header with room for a longer vnet id. */ #define ETHERIP_VERSION 4 @@ -33,12 +49,14 @@ __u16 version:4, reserved:12; #else -#error "Please fix <asm/byteorder.h>" +#error "Adjust your <asm/byteorder.h> defines" #endif - __u8 vnet[16]; + __u8 vnet[VNETID_SIZE8]; } __attribute__ ((packed)); #else + +/* Original header as in Etherip RFC. */ #define ETHERIP_VERSION 3 @@ -51,7 +69,7 @@ __u16 version:4, reserved:12; #else -#error "Please fix <asm/byteorder.h>" +#error "Adjust your <asm/byteorder.h> defines" #endif }; diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/if_varp.h --- a/tools/vnet/vnet-module/if_varp.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/if_varp.h Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -22,27 +22,54 @@ /* Need struct in_addr, struct in6_addr. */ #ifdef __KERNEL__ + #include <linux/in.h> #include <linux/in6.h> + #else + +#include <sys/socket.h> #include <netinet/in.h> + #endif + +#include <linux/if_ether.h> typedef struct Vmac { unsigned char mac[ETH_ALEN]; } Vmac; enum { + /* Varp protocol messages. + * Format is defined by struct VarpHdr. + */ VARP_ID = 1, + + /* Vnet ethernet in udp messages. + * Format is uint16_t id (VUDP_ID), then + * struct etheriphdr. + */ + VUDP_ID = 2, + + /* Forwarded messages. + */ + VFWD_ID = 3, + + /* Varp request. */ VARP_OP_REQUEST = 1, + /* Varp announce. 
*/ VARP_OP_ANNOUNCE = 2, }; +#define VNETID_SIZE8 16 +#define VNETID_SIZE16 (VNETID_SIZE8 >> 1) +#define VNETID_SIZE32 (VNETID_SIZE8 >> 2) + typedef struct VnetId { union { - uint8_t vnet8[16]; - uint16_t vnet16[8]; - uint32_t vnet32[4]; + uint8_t vnet8[VNETID_SIZE8]; + uint16_t vnet16[VNETID_SIZE16]; + uint32_t vnet32[VNETID_SIZE32]; } u; } __attribute__((packed)) VnetId; @@ -53,6 +80,7 @@ struct in_addr ip4; struct in6_addr ip6; } u; + //uint16_t port; } __attribute__((packed)) VarpAddr; typedef struct VnetMsgHdr { diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/random.c --- a/tools/vnet/vnet-module/random.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/random.c Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -38,20 +38,28 @@ static unsigned long seed = 0; static unsigned long count = 0; -static unsigned long stir(unsigned long *a, unsigned long b){ - pseudo_des(a, &b); - return b; -} +/** Contribute some random bytes. + * + * @param src bytes to contribute + * @param src_n number of bytes + */ +void add_random_bytes(const void *src, int src_n){ + ++count; + seed = hash_hvoid(seed, &count, sizeof(count)); + seed = hash_hvoid(seed, src, src_n); +} /** Get one random byte. * * @return random byte */ int get_random_byte(void){ - return stir(&seed, ++count); + int tmp = jiffies; + add_random_bytes(&tmp, sizeof(tmp)); + return seed; } -#if 0 +#ifndef __KERNEL__ /* Get some random bytes. * * @param dst destination for the bytes @@ -66,33 +74,11 @@ } #endif -/** Contribute a random byte. - * - * @param b byte to contribute - */ -void add_random_byte(int b){ - stir(&seed, ++count); - stir(&seed, b); -} - -/** Contribute some random bytes. 
- * - * @param src bytes to contribute - * @param src_n number of bytes - */ -void add_random_bytes(const void *src, int src_n){ - int i; - char *p = (char *)src; - for(i = 0; i < src_n; i++){ - add_random_byte(*p++); - } -} - int __init random_module_init(void){ int dummy; int tmp = jiffies; seed = (unsigned long)&dummy; - add_random_byte(tmp); + add_random_bytes(&tmp, sizeof(tmp)); return 0; } diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/random.h --- a/tools/vnet/vnet-module/random.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/random.h Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -19,9 +19,7 @@ #ifndef __VNET_RANDOM_H__ #define __VNET_RANDOM_H__ -extern int get_random_byte(void); extern void get_random_bytes(void *dst, int dst_n); -extern void add_random_byte(int b); extern void add_random_bytes(const void *src, int src_n); extern int random_module_init(void); diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/sa.c --- a/tools/vnet/vnet-module/sa.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/sa.c Thu Feb 9 15:12:11 2006 @@ -19,17 +19,11 @@ #include <linux/config.h> #include <linux/kernel.h> -#include <net/ip.h> -#include <net/protocol.h> -#include <net/route.h> -#include <linux/skbuff.h> - -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/netdevice.h> - +#include <tunnel.h> +#include <vnet.h> #include <sa.h> #include <sa_algorithm.h> + #include "hash_table.h" #include "allocate.h" @@ -120,58 +114,46 @@ static unsigned long sa_spi_counter = 0; -/** Generate a random spi. - * Uses a hashed counter. 
- * - * @return spi - */ -static u32 random_spi(void){ - unsigned long left, right = 0; - u32 spi; - do{ - left = sa_spi_counter++; - pseudo_des(&left, &right); - spi = right; - } while(!spi); - return spi; -} - /** Mangle some input to generate output. * This is used to derive spis and keying material from secrets, * so it probably ought to be cryptographically strong. * Probably ought to use a good hash (sha1) or cipher (aes). * - * @param input input values - * @param n number of values + * @param input input bytes + * @param n number of bytes * @return mangled value */ -static u32 mangle(u32 input[], int n){ - unsigned long left = 0, right = 0; - int i; - for(i=0; i<n; i++){ - left ^= input[i]; - pseudo_des(&left, &right); - } - return (u32)right; -} - -/** Generate a spi for a given protocol and address, using a secret key. - * The offset is used when it is necessary to generate more than one spi - * for the same protocol and address. - * - * @param key key - * @param offset offset - * @param protocol protocol - * @param addr IP address +static u32 mangle(void *input, int n){ + return hash_hvoid(0, input, n); +} + +/** Generate a random spi. + * Uses a hashed counter. + * * @return spi */ +static u32 random_spi(void){ + u32 spi; + do{ + spi = sa_spi_counter++; + spi = mangle(&spi, sizeof(spi)); + } while(!spi); + return spi; +} + + /** Generate a spi for a given protocol and address, using a secret key. + * The offset is used when it is necessary to generate more than one spi + * for the same protocol and address. 
+ * + * @param key key + * @param offset offset + * @param protocol protocol + * @param addr IP address + * @return spi + */ static u32 generate_spi(u32 key, u32 offset, u32 protocol, u32 addr){ u32 input[] = { key, offset, protocol, addr }; - u32 spi; - dprintf(">\n"); - spi = mangle(input, 4); - dprintf("< spi=%x\n", spi); - return spi; + return mangle(input, sizeof(input)); } /** Generate keying material for a given spi, based on a @@ -184,7 +166,7 @@ */ static u32 generate_key(u32 key, u32 offset, u32 spi){ u32 input[] = { key, offset, spi }; - return mangle(input, 3); + return mangle(input, sizeof(input)); } /** Allocate a spi. @@ -238,7 +220,7 @@ * @return hashcode */ static inline Hashcode sa_table_hash_id(u32 id){ - return hash_ul(id); + return hash_hvoid(0, &id, sizeof(id)); } /** Hash SA spi/protocol/addr. @@ -249,10 +231,8 @@ * @return hashcode */ static inline Hashcode sa_table_hash_spi(u32 spi, u32 protocol, u32 addr){ - Hashcode h = 0; - h = hash_2ul(spi, protocol); - h = hash_hul(h, addr); - return h; + u32 a[] = { spi, protocol, addr }; + return hash_hvoid(0, a, sizeof(a)); } /** Test if an SA entry has a given value. @@ -299,7 +279,7 @@ * @param table containing table * @param entry to free */ -void sa_table_free_fn(HashTable *table, HTEntry *entry){ +static void sa_table_free_fn(HashTable *table, HTEntry *entry){ if(!entry) return; if(entry->value){ SAState *state = entry->value; @@ -668,3 +648,110 @@ exit: return err; } +/** Determine ESP security mode for a new SA. + * + * @param spi incoming spi + * @param protocol incoming protocol + * @param addr source address + * @return security level or negative error code + * + * @todo Need to check spi, and do some lookup for security params. + */ +int vnet_sa_security(u32 spi, int protocol, u32 addr){ + extern int vnet_security_default; + int security = vnet_security_default; + dprintf("< security=%x\n", security); + return security; +} + +/** Create a new SA for incoming traffic. 
+ * + * @param spi incoming spi + * @param protocol incoming protocol + * @param addr source address + * @param sa return parameter for SA + * @return 0 on success, error code otherwise + */ +int vnet_sa_create(u32 spi, int protocol, u32 addr, SAState **sa){ + int err = 0; + int security = vnet_sa_security(spi, protocol, addr); + if(security < 0){ + err = security; + goto exit; + } + err = sa_create(security, spi, protocol, addr, sa); + exit: + return err; +} +/** Open function for SA tunnels. + * + * @param tunnel to open + * @return 0 on success, error code otherwise + */ +static int sa_tunnel_open(Tunnel *tunnel){ + int err = 0; + //dprintf(">\n"); + //dprintf("< err=%d\n", err); + return err; +} + +/** Close function for SA tunnels. + * + * @param tunnel to close (OK if null) + */ +static void sa_tunnel_close(Tunnel *tunnel){ + SAState *sa; + if(!tunnel) return; + sa = tunnel->data; + if(!sa) return; + SAState_decref(sa); + tunnel->data = NULL; +} + +/** Packet send function for SA tunnels. + * + * @param tunnel to send on + * @param skb packet to send + * @return 0 on success, negative error code on error + */ +static int sa_tunnel_send(Tunnel *tunnel, struct sk_buff *skb){ + int err = -EINVAL; + SAState *sa; + if(!tunnel){ + wprintf("> Null tunnel!\n"); + goto exit; + } + sa = tunnel->data; + if(!sa){ + wprintf("> Null SA!\n"); + goto exit; + } + err = SAState_send(sa, skb, tunnel->base); + exit: + return err; +} + +/** Functions used by SA tunnels. */ +static TunnelType _sa_tunnel_type = { + .name = "SA", + .open = sa_tunnel_open, + .close = sa_tunnel_close, + .send = sa_tunnel_send +}; + +/** Functions used by SA tunnels. */ +TunnelType *sa_tunnel_type = &_sa_tunnel_type; + +int sa_tunnel_create(Vnet *info, VarpAddr *addr, Tunnel *base, Tunnel **tunnel){ + int err = 0; + SAState *sa = NULL; + //FIXME: Assuming IPv4 for now. 
+ u32 ipaddr = addr->u.ip4.s_addr; + err = Tunnel_create(sa_tunnel_type, &info->vnet, addr, base, tunnel); + if(err) goto exit; + err = sa_create(info->security, 0, IPPROTO_ESP, ipaddr, &sa); + if(err) goto exit; + (*tunnel)->data = sa; + exit: + return err; +} diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/sa.h --- a/tools/vnet/vnet-module/sa.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/sa.h Thu Feb 9 15:12:11 2006 @@ -19,14 +19,27 @@ #ifndef __VNET_SA_H__ #define __VNET_SA_H__ +#ifdef __KERNEL__ #include <linux/types.h> #include <linux/crypto.h> -#include <tunnel.h> +#else + +#include "sys_kernel.h" + +#endif + +struct Vnet; +struct VarpAddr; +struct Tunnel; #ifndef CRYPTO_MAX_KEY_BYTES #define CRYPTO_MAX_KEY_BYTES 64 #define CRYPTO_MAX_KEY_BITS (CRYPTO_MAX_KEY_BYTES * 8) +#endif + +#ifndef CRYPTO_MAX_ALG_NAME +#define CRYPTO_MAX_ALG_NAME 64 #endif typedef struct SALimits { @@ -104,7 +117,7 @@ int (*init)(SAState *state, void *args); void (*fini)(SAState *state); int (*recv)(SAState *state, struct sk_buff *skb); - int (*send)(SAState *state, struct sk_buff *skb, Tunnel *tunnel); + int (*send)(SAState *state, struct sk_buff *skb, struct Tunnel *tunnel); u32 (*size)(SAState *state, int size); } SAType; @@ -170,7 +183,7 @@ extern int SAState_init(SAIdent *id, SAState **statep); extern int SAState_create(SAInfo *info, SAState **statep); -static inline int SAState_send(SAState *sa, struct sk_buff *skb, Tunnel *tunnel){ +static inline int SAState_send(SAState *sa, struct sk_buff *skb, struct Tunnel *tunnel){ return sa->type->send(sa, skb, tunnel); } @@ -196,4 +209,7 @@ SA_STATE_VALID = 2, }; +extern int sa_tunnel_create(struct Vnet *info, struct VarpAddr *addr, + struct Tunnel *base, struct Tunnel **tunnel); + #endif /* !__VNET_SA_H__ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/skb_context.h --- a/tools/vnet/vnet-module/skb_context.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/skb_context.h Thu Feb 9 15:12:11 2006 @@ 
-20,10 +20,24 @@ #ifndef __VNET_SKB_CONTEXT_H__ #define __VNET_SKB_CONTEXT_H__ +#ifdef __KERNEL__ #include <linux/config.h> #include <linux/kernel.h> #include <asm/atomic.h> #include <linux/types.h> + +//todo: fixme +#define SKB_CONTEXT(_skb) ((SkbContext *)(&(_skb)->cb[0])) + +#else + +#include "sys_kernel.h" +#include "spinlock.h" + +//todo: fixme +#define SKB_CONTEXT(_skb) ((SkbContext *)NULL) + +#endif /** Structure used to record inbound processing path for skbs. * For example, the ETHERIP protocol handler can use this to @@ -70,7 +84,4 @@ extern int skb_push_context(struct sk_buff *skb, u32 vnet, u32 addr, int protocol, void *data, void (*free_fn)(SkbContext *)); -//todo: fixme -#define SKB_CONTEXT(_skb) ((SkbContext *)(&(_skb)->cb[0])) - #endif /* !__VNET_SKB_CONTEXT_H__ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/skb_util.c --- a/tools/vnet/vnet-module/skb_util.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/skb_util.c Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -16,6 +16,7 @@ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA * */ +#ifdef __KERNEL__ #include <linux/config.h> #include <linux/module.h> #include <linux/kernel.h> @@ -39,6 +40,44 @@ #include <net/route.h> #include <linux/skbuff.h> +#else + +#include <stdlib.h> +#include <stdbool.h> +#include <stdint.h> +#include <unistd.h> +#include <stdio.h> +#include <errno.h> + +#include <netinet/in.h> +#include <arpa/inet.h> + +#include <sys/types.h> +#include <sys/socket.h> + +#include <linux/if_ether.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/udp.h> + +#include "sys_kernel.h" +#include "skbuff.h" + +#if defined(__LITTLE_ENDIAN) +#define HIPQUAD(addr) \ + ((unsigned char 
*)&addr)[3], \ + ((unsigned char *)&addr)[2], \ + ((unsigned char *)&addr)[1], \ + ((unsigned char *)&addr)[0] +#elif defined(__BIG_ENDIAN) +#define HIPQUAD NIPQUAD +#else +#error "Please fix asm/byteorder.h" +#endif /* __LITTLE_ENDIAN */ + +#endif + #include <varp.h> #include <skb_util.h> @@ -47,16 +86,7 @@ #undef DEBUG #include "debug.h" -static const int DEBUG_SCATTERLIST = 0; -static const int DEBUG_SKB = 0; - //============================================================================ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -#define SET_SCATTER_ADDR(sg, addr) do{} while(0) -#else -#define SET_SCATTER_ADDR(sg, addr) (sg).address = (addr) -#endif - /** Make enough room in an skb for extra header and trailer. * * @param pskb return parameter for expanded skb @@ -85,7 +115,7 @@ err = -ENOMEM; goto exit; } - dev_kfree_skb(skb); + kfree_skb(skb); *pskb = new_skb; } else { // No room. Expand. There may be more efficient ways to do @@ -95,7 +125,7 @@ err = -ENOMEM; goto exit; } - dev_kfree_skb(skb); + kfree_skb(skb); *pskb = new_skb; } dprintf("> skb=%p headroom=%d head_n=%d tailroom=%d tail_n=%d\n", @@ -129,6 +159,7 @@ src += copy; } +#ifdef __KERNEL__ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; @@ -177,6 +208,10 @@ start = end; } } +#else + i=0; +#endif + if (len == 0) return 0; @@ -184,45 +219,11 @@ return -EFAULT; } -/** Add some space to the end of a (possibly fragmented) skb. - * - * Only works with Xen output skbs. Output skbs have 1 frag, and we - * add another frag for the extra space. 
- * - * @param skb skb - * @param n number of bytes to add - * @return 0 on success, error code otherwise - * - * @todo fixme - */ -int pskb_put(struct sk_buff *skb, int n){ - int err = 0; - if(1 || skb_is_nonlinear(skb)){ - struct skb_shared_info *info = skb_shinfo(skb); - char *ptr = NULL; - - if(info->nr_frags >= MAX_SKB_FRAGS){ - err = -ENOMEM; - goto exit; - } - ptr = kmalloc(n, GFP_ATOMIC); - if(!ptr){ - err = -ENOMEM; - goto exit; - } - info->nr_frags++; - info->frags[info->nr_frags - 1].page = virt_to_page(ptr); - info->frags[info->nr_frags - 1].page_offset = ((unsigned long)ptr & ~PAGE_MASK); - info->frags[info->nr_frags - 1].size = n; - - skb->data_len += n; - skb->len += n; - } else { - __skb_put(skb, n); - } - exit: - if(err) dprintf("< err=%d\n", err); - return err; +int skboffset(struct sk_buff *skb, unsigned char *ptr){ + if(!ptr || ptr < skb->head || ptr > skb->tail){ + return -1; + } + return (ptr - skb->head); } /** Print some bits of an skb. @@ -231,11 +232,23 @@ * @param offset byte offset to start printing at * @param n number of bytes to print */ -void skb_print_bits(struct sk_buff *skb, int offset, int n){ +void skb_print_bits(const char *msg, struct sk_buff *skb, int offset, int n){ int chunk = 16; int i, k; u8 buff[chunk]; - if(!DEBUG_SKB) return; + if(!skb) return; + printk("%s> tot=%d len=%d data=%d mac=%d nh=%d h=%d\n", + msg, + skb->tail - skb->head, + skb->len, + skboffset(skb, skb->data), + skboffset(skb, skb->mac.raw), + skboffset(skb, skb->nh.raw), + skboffset(skb, skb->h.raw)); + printk("%s> head=%p data=%p mac=%p nh=%p h=%p tail=%p\n", + msg, skb->head, skb->data, + skb->mac.raw, skb->nh.raw, skb->h.raw, + skb->tail); while(n){ k = (n > chunk ? 
chunk : n); skb_copy_bits(skb, offset, buff, k); @@ -275,8 +288,15 @@ return skb->tail; } -// #define BUG_TRAP(x) -// if(!(x)){ printk("KERNEL: assertion (" #x ") failed at " __FILE__ "(%d)\n", __LINE__); } +#ifdef __KERNEL__ + +static const int DEBUG_SCATTERLIST = 0; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +#define SET_SCATTER_ADDR(sg, addr) do{} while(0) +#else +#define SET_SCATTER_ADDR(sg, addr) (sg).address = (addr) +#endif /** Convert a (possibly fragmented) skb into a scatter list. * @@ -360,27 +380,9 @@ return err; } -struct arpheader -{ - unsigned short ar_hrd; /* format of hardware address */ - unsigned short ar_pro; /* format of protocol address */ - unsigned char ar_hln; /* length of hardware address */ - unsigned char ar_pln; /* length of protocol address */ - unsigned short ar_op; /* ARP opcode (command) */ - -#if 1 - /* - * Ethernet looks like this : This bit is variable sized however... - */ - unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */ - unsigned char ar_sip[4]; /* sender IP address */ - unsigned char ar_tha[ETH_ALEN]; /* target hardware address */ - unsigned char ar_tip[4]; /* target IP address */ #endif -}; - -void print_skb_data(char *msg, int count, struct sk_buff *skb, u8 *data, int len) +void print_skb_data(const char *msg, int count, struct sk_buff *skb, u8 *data, int len) { static int skb_count = 1000000; u8 *ptr, *end; @@ -460,7 +462,7 @@ msg, count, nh.iph->protocol, HIPQUAD(src_addr), HIPQUAD(dst_addr)); printk("%s.%d> IP tot_len=%u len=%d\n", - msg, count, nh.iph->tot_len & 0xffff, len - ETH_HLEN); + msg, count, ntohs(nh.iph->tot_len), len - ETH_HLEN); } ptr += (nh.iph->ihl * 4); if(ptr > end){ printk ("***IP: len"); goto exit; } @@ -506,10 +508,49 @@ return; exit: printk("%s.%d> %s: skb problem\n", msg, count, __FUNCTION__); - printk("%s.%d> %s: data=%p end=%p(%d) ptr=%p(%d) eth=%d arp=%d ip=%d\n", + printk("%s.%d> %s: data=%p end=%p(%d) ptr=%p(%d) eth=%d ip=%d\n", msg, count, __FUNCTION__, data, end, end - 
data, ptr, ptr - data, - sizeof(struct ethhdr), sizeof(struct arphdr), sizeof(struct iphdr)); + sizeof(struct ethhdr), + sizeof(struct iphdr)); return; } +void print_skb(const char *msg, int count, struct sk_buff *skb){ + print_skb_data(msg, count, skb, skb->mac.raw, skb->tail - skb->mac.raw); +} + +void print_ethhdr(const char *msg, struct sk_buff *skb){ + struct ethhdr *eth; + + if(!skb || skboffset(skb, skb->mac.raw) < 0) return; + eth = eth_hdr(skb); + printk("%s> ETH proto=%d src=" MACFMT " dst=" MACFMT "\n", + msg, + ntohs(eth->h_proto), + MAC6TUPLE(eth->h_source), + MAC6TUPLE(eth->h_dest)); +} + +void print_iphdr(const char *msg, struct sk_buff *skb){ + u32 src_addr, dst_addr; + + if(!skb || skboffset(skb, skb->nh.raw) < 0) return; + src_addr = ntohl(skb->nh.iph->saddr); + dst_addr = ntohl(skb->nh.iph->daddr); + printk("%s> IP proto=%d src=" IPFMT " dst=" IPFMT " tot_len=%u\n", + msg, + skb->nh.iph->protocol, + HIPQUAD(src_addr), + HIPQUAD(dst_addr), + ntohs(skb->nh.iph->tot_len)); +} + +void print_udphdr(const char *msg, struct sk_buff *skb){ + if(!skb || skboffset(skb, skb->h.raw) < 0) return; + printk("%s> UDP src=%u dst=%u len=%u\n", + msg, + ntohs(skb->h.uh->source), + ntohs(skb->h.uh->dest), + ntohs(skb->h.uh->len)); +} diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/skb_util.h --- a/tools/vnet/vnet-module/skb_util.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/skb_util.h Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -19,39 +19,61 @@ #ifndef _VNET_SKB_UTIL_H_ #define _VNET_SKB_UTIL_H_ +#ifdef __KERNEL__ #include <net/route.h> #include <linux/skbuff.h> -struct scatterlist; +#else + +#include "skbuff.h" + +#endif + +struct sk_buff; extern int skb_make_room(struct sk_buff **pskb, 
struct sk_buff *skb, int head_n, int tail_n); extern int skb_put_bits(const struct sk_buff *skb, int offset, void *src, int len); -extern int pskb_put(struct sk_buff *skb, int n); - -extern void skb_print_bits(struct sk_buff *skb, int offset, int n); +extern void skb_print_bits(const char *msg, struct sk_buff *skb, int offset, int n); extern void buf_print(char *buf, int n); extern void *skb_trim_tail(struct sk_buff *skb, int n); -extern int skb_scatterlist(struct sk_buff *skb, struct scatterlist *sg, - int *sg_n, int offset, int len); +extern void print_skb_data(const char *msg, int count, struct sk_buff *skb, u8 *data, int len); +extern void print_skb(const char *msg, int count, struct sk_buff *skb); -extern void print_skb_data(char *msg, int count, struct sk_buff *skb, u8 *data, int len); - +extern void print_ethhdr(const char *msg, struct sk_buff *skb); +extern void print_iphdr(const char *msg, struct sk_buff *skb); +extern void print_udphdr(const char *msg, struct sk_buff *skb); /* The mac.ethernet field went away in 2.6 in favour of eth_hdr(). */ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +#ifdef __KERNEL__ +# if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) +# define NEED_ETH_HDR +# endif #else +# define NEED_ETH_HDR +#endif + +#ifdef NEED_ETH_HDR + static inline struct ethhdr *eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb->mac.raw; } + #endif + +#ifdef __KERNEL__ + +struct scatterlist; + +extern int skb_scatterlist(struct sk_buff *skb, struct scatterlist *sg, + int *sg_n, int offset, int len); #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) @@ -91,4 +113,27 @@ #endif +#endif /* __KERNEL__ */ + +/** Arp header struct with all the fields so we can access them. 
*/ +struct arpheader +{ + unsigned short ar_hrd; /* format of hardware address */ + unsigned short ar_pro; /* format of protocol address */ + unsigned char ar_hln; /* length of hardware address */ + unsigned char ar_pln; /* length of protocol address */ + unsigned short ar_op; /* ARP opcode (command) */ + +#if 1 + /* + * Ethernet looks like this : This bit is variable sized however... + */ + unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */ + unsigned char ar_sip[4]; /* sender IP address */ + unsigned char ar_tha[ETH_ALEN]; /* target hardware address */ + unsigned char ar_tip[4]; /* target IP address */ #endif + +}; + +#endif /* ! _VNET_SKB_UTIL_H_ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/tunnel.c --- a/tools/vnet/vnet-module/tunnel.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/tunnel.c Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -16,19 +16,21 @@ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA * */ +#ifdef __KERNEL__ + #include <linux/config.h> #include <linux/module.h> #include <linux/init.h> - -#include <linux/net.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/netdevice.h> - -#include <net/ip.h> -#include <net/protocol.h> -#include <net/route.h> #include <linux/skbuff.h> +#include <linux/spinlock.h> + +#else + +#include "sys_kernel.h" +#include "spinlock.h" +#include "skbuff.h" + +#endif #include <tunnel.h> #include <vnet.h> @@ -40,9 +42,18 @@ #undef DEBUG #include "debug.h" +/** Table of tunnels, indexed by vnet and addr. 
*/ +HashTable *tunnel_table = NULL; +rwlock_t tunnel_table_lock = RW_LOCK_UNLOCKED; + +#define tunnel_read_lock(flags) read_lock_irqsave(&tunnel_table_lock, (flags)) +#define tunnel_read_unlock(flags) read_unlock_irqrestore(&tunnel_table_lock, (flags)) +#define tunnel_write_lock(flags) write_lock_irqsave(&tunnel_table_lock, (flags)) +#define tunnel_write_unlock(flags) write_unlock_irqrestore(&tunnel_table_lock, (flags)) + void Tunnel_print(Tunnel *tunnel){ if(tunnel){ - printk("Tunnel<%p base=%p ref=%02d type=%s>\n", + iprintf("Tunnel<%p base=%p ref=%02d type=%s>\n", tunnel, tunnel->base, atomic_read(&tunnel->refcount), @@ -51,12 +62,13 @@ Tunnel_print(tunnel->base); } } else { - printk("Tunnel<%p base=%p ref=%02d type=%s>\n", + iprintf("Tunnel<%p base=%p ref=%02d type=%s>\n", NULL, NULL, 0, "ip"); } } -int Tunnel_create(TunnelType *type, VnetId *vnet, VarpAddr *addr, Tunnel *base, Tunnel **val){ +int Tunnel_create(TunnelType *type, VnetId *vnet, VarpAddr *addr, + Tunnel *base, Tunnel **val){ int err = 0; Tunnel *tunnel = NULL; if(!type || !type->open || !type->send || !type->close){ @@ -87,22 +99,6 @@ return err; } -int Tunnel_open(TunnelType *type, VnetId *vnet, VarpAddr *addr, Tunnel *base, Tunnel **tunnel){ - int err = 0; - - dprintf(">\n"); - err = Tunnel_create(type, vnet, addr, base, tunnel); - if(err) goto exit; - err = Tunnel_add(*tunnel); - exit: - if(err){ - Tunnel_decref(*tunnel); - *tunnel = NULL; - } - dprintf("< err=%d\n", err); - return err; -} - void TunnelStats_update(TunnelStats *stats, int len, int err){ dprintf(">len=%d err=%d\n", len, err); if(err){ @@ -115,29 +111,18 @@ dprintf("<\n"); } -/** Table of tunnels, indexed by vnet and addr. 
*/ -HashTable *tunnel_table = NULL; - static inline Hashcode tunnel_table_key_hash_fn(void *k){ - TunnelKey *key = k; - Hashcode h = 0; - h = VnetId_hash(h, &key->vnet); - h = VarpAddr_hash(h, &key->addr); - return h; + return hash_hvoid(0, k, sizeof(TunnelKey)); } static int tunnel_table_key_equal_fn(void *k1, void *k2){ - TunnelKey *key1 = k1; - TunnelKey *key2 = k2; - return VnetId_eq(&key1->vnet, &key2->vnet) && - VarpAddr_eq(&key1->addr, &key2->addr); + return memcmp(k1, k2, sizeof(TunnelKey)) == 0; } static void tunnel_table_entry_free_fn(HashTable *table, HTEntry *entry){ Tunnel *tunnel; if(!entry) return; tunnel = entry->value; - //dprintf(">\n"); Tunnel_print(tunnel); Tunnel_decref(tunnel); HTEntry_free(entry); } @@ -159,35 +144,86 @@ } /** Lookup tunnel state by vnet and destination. + * The caller must drop the tunnel reference when done. * * @param vnet vnet * @param addr destination address - * @return tunnel state or NULL - */ -Tunnel * Tunnel_lookup(VnetId *vnet, VarpAddr *addr){ + * @return 0 on success + */ +int Tunnel_lookup(VnetId *vnet, VarpAddr *addr, Tunnel **tunnel){ + unsigned long flags; + TunnelKey key = { .vnet = *vnet, .addr = *addr }; + dprintf(">\n"); + tunnel_read_lock(flags); + *tunnel = HashTable_get(tunnel_table, &key); + tunnel_read_unlock(flags); + Tunnel_incref(*tunnel); + dprintf("< tunnel=%p\n", *tunnel); + return (*tunnel ? 0 : -ENOENT); +} + +/** Get a tunnel to a given vnet and destination, creating + * a tunnel if necessary. + * The caller must drop the tunnel reference when done. 
+ * + * @param vnet vnet + * @param addr destination address + * @param ctor tunnel constructor + * @param ptunnel return parameter for the tunnel + * @return 0 on success + */ +int Tunnel_open(VnetId *vnet, VarpAddr *addr, + int (*ctor)(VnetId *vnet, VarpAddr *addr, Tunnel **ptunnel), + Tunnel **ptunnel){ + int err = 0; Tunnel *tunnel = NULL; - TunnelKey key = {.vnet = *vnet, .addr = *addr }; + unsigned long flags; + TunnelKey key = { .vnet = *vnet, .addr = *addr }; + + tunnel_write_lock(flags); + tunnel = HashTable_get(tunnel_table, &key); + if(!tunnel){ + err = ctor(vnet, addr, &tunnel); + if(err) goto exit; + if(!HashTable_add(tunnel_table, tunnel, tunnel)){ + err = -ENOMEM; + goto exit; + } + } + exit: + tunnel_write_unlock(flags); + if(err){ + Tunnel_decref(tunnel); + *ptunnel = NULL; + } else { + Tunnel_incref(tunnel); + *ptunnel = tunnel; + } + return err; +} + +int Tunnel_add(Tunnel *tunnel){ + int err = 0; + unsigned long flags; dprintf(">\n"); - tunnel = HashTable_get(tunnel_table, &key); - Tunnel_incref(tunnel); - dprintf("< tunnel=%p\n", tunnel); - return tunnel; -} - -int Tunnel_add(Tunnel *tunnel){ - int err = 0; - dprintf(">\n"); + tunnel_write_lock(flags); if(HashTable_add(tunnel_table, tunnel, tunnel)){ Tunnel_incref(tunnel); } else { err = -ENOMEM; } + tunnel_write_unlock(flags); dprintf("< err=%d\n", err); return err; } int Tunnel_del(Tunnel *tunnel){ - return HashTable_remove(tunnel_table, tunnel); + int err; + unsigned long flags; + tunnel_write_lock(flags); + err = HashTable_remove(tunnel_table, tunnel); + tunnel_write_unlock(flags); + return err; } /** Do tunnel send processing on a packet.
@@ -217,8 +253,11 @@ } void __exit tunnel_module_exit(void){ + unsigned long flags; + tunnel_write_lock(flags); if(tunnel_table){ HashTable_free(tunnel_table); tunnel_table = NULL; } -} + tunnel_write_unlock(flags); +} diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/tunnel.h --- a/tools/vnet/vnet-module/tunnel.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/tunnel.h Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -19,9 +19,18 @@ #ifndef __VNET_TUNNEL_H__ #define __VNET_TUNNEL_H__ +#ifdef __KERNEL__ #include <linux/types.h> -#include <linux/slab.h> #include <asm/atomic.h> + +#else + +//#include <linux/types.h> +#include "sys_kernel.h" +#include "spinlock.h" + +#endif + #include <if_varp.h> struct sk_buff; @@ -42,8 +51,8 @@ } TunnelStats; typedef struct TunnelKey { - VnetId vnet; - VarpAddr addr; + struct VnetId vnet; + struct VarpAddr addr; } TunnelKey; typedef struct Tunnel { @@ -61,17 +70,13 @@ struct Tunnel *base; } Tunnel; -extern void Tunnel_print(Tunnel *tunnel); - /** Decrement the reference count, freeing if zero. 
* * @param tunnel tunnel (may be null) */ -static inline void Tunnel_decref(Tunnel *tunnel){ +static inline void Tunnel_decref(struct Tunnel *tunnel){ if(!tunnel) return; if(atomic_dec_and_test(&tunnel->refcount)){ - printk("%s> Closing tunnel:\n", __FUNCTION__); - Tunnel_print(tunnel); tunnel->type->close(tunnel); Tunnel_decref(tunnel->base); kfree(tunnel); @@ -88,15 +93,19 @@ } extern int Tunnel_init(void); -extern Tunnel * Tunnel_lookup(struct VnetId *vnet, struct VarpAddr *addr); -extern int Tunnel_add(Tunnel *tunnel); -extern int Tunnel_del(Tunnel *tunnel); -extern int Tunnel_send(Tunnel *tunnel, struct sk_buff *skb); +extern int Tunnel_lookup(struct VnetId *vnet, struct VarpAddr *addr, struct Tunnel **tunnel); +extern int Tunnel_open(struct VnetId *vnet, struct VarpAddr *addr, + int (*ctor)(struct VnetId *vnet, + struct VarpAddr *addr, + struct Tunnel **ptunnel), + struct Tunnel **ptunnel); +extern int Tunnel_add(struct Tunnel *tunnel); +extern int Tunnel_del(struct Tunnel *tunnel); +extern void Tunnel_print(struct Tunnel *tunnel); +extern int Tunnel_send(struct Tunnel *tunnel, struct sk_buff *skb); -extern int Tunnel_create(TunnelType *type, struct VnetId *vnet, struct VarpAddr *addr, - Tunnel *base, Tunnel **tunnelp); -extern int Tunnel_open(TunnelType *type, struct VnetId *vnet, struct VarpAddr *addr, - Tunnel *base, Tunnel **tunnelp); +extern int Tunnel_create(struct TunnelType *type, struct VnetId *vnet, struct VarpAddr *addr, + struct Tunnel *base, struct Tunnel **tunnelp); extern int tunnel_module_init(void); extern void tunnel_module_exit(void); diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/varp.c --- a/tools/vnet/vnet-module/varp.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/varp.c Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of 
the GNU General Public License as published by the @@ -17,6 +17,7 @@ * */ +#ifdef __KERNEL__ #include <linux/config.h> #include <linux/kernel.h> #include <linux/module.h> @@ -28,6 +29,7 @@ #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> +#include <linux/inetdevice.h> #include <linux/udp.h> #include <net/ip.h> @@ -37,18 +39,35 @@ #include <linux/spinlock.h> #include <asm/semaphore.h> +#else + +#include "sys_kernel.h" +#include <netinet/in.h> +#include <arpa/inet.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include "spinlock.h" +#include "skbuff.h" + +#endif + #include <tunnel.h> #include <vnet.h> #include <vif.h> #include <if_varp.h> #include <varp.h> +#include <varp_util.h> #include <vnet.h> +#include <etherip.h> +#include <vnet_forward.h> #include "allocate.h" +#include "iostream.h" #include "hash_table.h" #include "sys_net.h" #include "sys_string.h" #include "skb_util.h" +#include "timer_util.h" #define MODULE_NAME "VARP" #define DEBUG 1 @@ -104,7 +123,7 @@ enum { VARP_STATE_INCOMPLETE = 1, VARP_STATE_REACHABLE = 2, - VARP_STATE_FAILED = 3 + VARP_STATE_FAILED = 3, }; /** Varp entry flags. */ @@ -137,6 +156,8 @@ atomic_t refcount; /** Lock. */ rwlock_t lock; + unsigned long lflags; + /** How many probes have been made. */ atomic_t probes; /** Probe timer. */ @@ -146,6 +167,7 @@ struct sk_buff_head queue; /** Maximum size of the queue. */ int queue_max; + atomic_t deleted; } VarpEntry; /** The varp cache. Varp entries indexed by VarpKey. */ @@ -156,11 +178,13 @@ /** Sweep timer. */ struct timer_list timer; - /** Lock. Need to use a semaphore instead of a spinlock because - * some operations under the varp table lock can schedule - and - * you mustn't hold a spinlock when scheduling. - */ - struct semaphore lock; + rwlock_t lock; + struct semaphore mutex; + + int entry_ttl; + int probe_max; + int probe_interval; + int queue_max; } VarpTable; @@ -176,19 +200,30 @@ /** UDP port (network order). 
*/ u16 varp_port = 0; -char *varp_device = "xenbr0"; - -#define VarpTable_read_lock(z, flags) do{ (flags) = 0; down(&(z)->lock); } while(0) -#define VarpTable_read_unlock(z, flags) do{ (flags) = 0; up(&(z)->lock); } while(0) -#define VarpTable_write_lock(z, flags) do{ (flags) = 0; down(&(z)->lock); } while(0) -#define VarpTable_write_unlock(z, flags) do{ (flags) = 0; up(&(z)->lock); } while(0) - -#define VarpEntry_lock(ventry, flags) write_lock_irqsave(&(ventry)->lock, (flags)) -#define VarpEntry_unlock(ventry, flags) write_unlock_irqrestore(&(ventry)->lock, (flags)) - -void VarpTable_sweep(VarpTable *z, int all); -void VarpTable_flush(VarpTable *z); -void VarpTable_print(VarpTable *z); +char *varp_device = "xen-br0"; + +#define VarpTable_read_lock(vtable, flags) \ + do{ read_lock_irqsave(&(vtable)->lock, (flags)); } while(0) + +#define VarpTable_read_unlock(vtable, flags) \ + do{ read_unlock_irqrestore(&(vtable)->lock, (flags)); } while(0) + +#define VarpTable_write_lock(vtable, flags) \ + do{ write_lock_irqsave(&(vtable)->lock, (flags)); } while(0) + +#define VarpTable_write_unlock(vtable, flags) \ + do{ write_unlock_irqrestore(&(vtable)->lock, (flags)); } while(0) + +#define VarpEntry_lock(ventry, flags) \ + do{ write_lock_irqsave(&(ventry)->lock, (flags)); (ventry)->lflags = (flags); } while(0) + +#define VarpEntry_unlock(ventry, flags) \ + do{ (flags) = (ventry)->lflags; write_unlock_irqrestore(&(ventry)->lock, (flags)); } while(0) + +void VarpTable_sweep(VarpTable *vtable); +void VarpTable_flush(VarpTable *vtable); +void VarpTable_print(VarpTable *vtable, IOStream *io); +int VarpEntry_output(VarpEntry *ventry, struct sk_buff *skb); #include "./varp_util.c" @@ -196,7 +231,7 @@ */ void varp_dprint(void){ #ifdef DEBUG - VarpTable_print(varp_table); + VarpTable_print(varp_table, iostdout); #endif } @@ -206,6 +241,7 @@ VarpTable_flush(varp_table); } +#ifdef __KERNEL__ static int device_ucast_addr(const char *device, uint32_t *addr) { int err; @@ -234,23 +270,6 @@ 
return err; } -/** Print varp info and the varp cache. - */ -void varp_print(void){ - uint32_t addr = 0; - varp_ucast_addr(&addr); - - printk(KERN_INFO "=== VARP ===============================================================\n"); - printk(KERN_INFO "varp_device %s\n", varp_device); - printk(KERN_INFO "varp_mcast_addr " IPFMT "\n", NIPQUAD(varp_mcast_addr)); - printk(KERN_INFO "varp_ucast_addr " IPFMT "\n", NIPQUAD(addr)); - printk(KERN_INFO "varp_port %d\n", ntohs(varp_port)); - vnet_print(); - vif_print(); - VarpTable_print(varp_table); - printk(KERN_INFO "========================================================================\n"); -} - /** Lookup a network device by name. * * @param name device name @@ -286,6 +305,35 @@ exit: return err; } + +#else + +int varp_ucast_addr(uint32_t *addr) +{ + return 0; +} + +#endif + +/** Print varp info and the varp cache. + */ +void varp_print(IOStream *io){ + uint32_t addr = 0; + varp_ucast_addr(&addr); + + IOStream_print(io, "(varp \n"); + IOStream_print(io, " (device %s)\n", varp_device); + IOStream_print(io, " (mcast_addr " IPFMT ")\n", NIPQUAD(varp_mcast_addr)); + IOStream_print(io, " (ucast_addr " IPFMT ")\n", NIPQUAD(addr)); + IOStream_print(io, " (port %d)\n", ntohs(varp_port)); + IOStream_print(io, " (encapsulation %s)\n", + (etherip_in_udp ? "etherip_in_udp" : "etherip")); + IOStream_print(io, " (entry_ttl %lu)\n", varp_table->entry_ttl); + IOStream_print(io, ")\n"); + VarpTable_print(varp_table, io); +} + +#ifdef __KERNEL__ #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) @@ -312,13 +360,20 @@ return err; } -#endif +#endif // LINUX_VERSION_CODE #ifndef LL_RESERVED_SPACE #define HH_DATA_MOD 16 #define LL_RESERVED_SPACE(dev) \ ((dev->hard_header_len & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) -#endif + +#endif // LL_RESERVED_SPACE + +#else // __KERNEL__ + +#define ip_eth_mc_map(daddr, dmac) do{ }while(0) + +#endif // __KERNEL__ /** Send a varp protocol message. 
* @@ -337,12 +392,11 @@ int udp_n = sizeof(struct udphdr); int varp_n = sizeof(VarpHdr); struct sk_buff *skbout = NULL; - struct in_device *in_dev = NULL; VarpHdr *varph = NULL; - u8 macbuf[6] = {}; - u8 *smac, *dmac = macbuf; - u32 saddr, daddr; - u16 sport, dport; + u8 smacbuf[6] = {}, dmacbuf[6] = {}; + u8 *smac = smacbuf, *dmac = dmacbuf; + u32 saddr = 0, daddr = 0; + u16 sport = 0, dport = 0; #if defined(DEBUG) char vnetbuf[VNET_ID_BUF]; #endif @@ -365,28 +419,38 @@ sport = varp_port; } - if(!dev){ - struct rtable *rt = NULL; - err = addr_route(daddr, &rt); - if(err) goto exit; - dev = rt->u.dst.dev; - } - - in_dev = in_dev_get(dev); - if(!in_dev){ - err = -ENODEV; - goto exit; - } - link_n = LL_RESERVED_SPACE(dev); - saddr = in_dev->ifa_list->ifa_address; - smac = dev->dev_addr; - if(daddr == INADDR_BROADCAST){ - daddr = in_dev->ifa_list->ifa_broadcast; - dmac = dev->broadcast; - } - in_dev_put(in_dev); - - dprintf("> dev=%s\n", dev->name); +#ifdef __KERNEL__ + { + struct in_device *in_dev = NULL; + if(!dev){ + struct rtable *rt = NULL; + err = addr_route(daddr, &rt); + if(err) goto exit; + dev = rt->u.dst.dev; + } + + in_dev = in_dev_get(dev); + if(!in_dev){ + err = -ENODEV; + goto exit; + } + link_n = LL_RESERVED_SPACE(dev); + saddr = in_dev->ifa_list->ifa_address; + smac = dev->dev_addr; + if(daddr == INADDR_BROADCAST){ + daddr = in_dev->ifa_list->ifa_broadcast; + dmac = dev->broadcast; + } + in_dev_put(in_dev); + } +#else + { + extern uint32_t vnetd_addr(void); + saddr = vnetd_addr(); + } +#endif // __KERNEL__ + + dprintf("> dev=%s\n", (dev ? dev->name : "<none>")); dprintf("> smac=" MACFMT " dmac=" MACFMT "\n", MAC6TUPLE(smac), MAC6TUPLE(dmac)); dprintf("> saddr=" IPFMT " daddr=" IPFMT "\n", NIPQUAD(saddr), NIPQUAD(daddr)); dprintf("> sport=%u dport=%u\n", ntohs(sport), ntohs(dport)); @@ -400,12 +464,16 @@ skb_reserve(skbout, link_n); skbout->protocol = htons(ETH_P_IP); +#ifdef __KERNEL__ // Device header. Pushes device header on front of skb. 
if (dev->hard_header){ err = dev->hard_header(skbout, dev, ETH_P_IP, dmac, smac, skbout->len); if(err < 0) goto exit; skbout->mac.raw = skbout->data; } +#else + smac = smac; // Defeat unused variable warning. +#endif // __KERNEL__ // IP header. skbout->nh.raw = skb_put(skbout, ip_n); @@ -446,104 +514,102 @@ return err; } + /** Send a varp request for the vnet and destination mac of a packet. + * Assumes the ventry is locked. * * @param skb packet * @param vnet vnet (in network order) * @return 0 on success, error code otherwise */ -int varp_solicit(struct sk_buff *skb, VnetId *vnet){ - int err = 0; - err = varp_send(VARP_OP_REQUEST, NULL, NULL, - vnet, (Vmac*)eth_hdr(skb)->h_dest); - return err; +int varp_solicit(VnetId *vnet, Vmac *vmac){ + return varp_send(VARP_OP_REQUEST, NULL, NULL, vnet, vmac); } /* Test some flags. * - * @param z varp entry + * @param ventry varp entry * @param flags to test * @return nonzero if flags set */ -int VarpEntry_get_flags(VarpEntry *z, int flags){ - return z->flags & flags; +int VarpEntry_get_flags(VarpEntry *ventry, int flags){ + return ventry->flags & flags; } /** Set some flags. * - * @param z varp entry + * @param ventry varp entry * @param flags to set * @param set set flags on if nonzero, off if zero * @return new flags value */ -int VarpEntry_set_flags(VarpEntry *z, int flags, int set){ +int VarpEntry_set_flags(VarpEntry *ventry, int flags, int set){ if(set){ - z->flags |= flags; + ventry->flags |= flags; } else { - z->flags &= ~flags; - } - return z->flags; + ventry->flags &= ~flags; + } + return ventry->flags; } /** Print a varp entry. 
* * @param ventry varp entry */ -void VarpEntry_print(VarpEntry *ventry){ +void VarpEntry_print(VarpEntry *ventry, IOStream *io){ + IOStream_print(io, "(ventry \n"); if(ventry){ + unsigned long now = jiffies; char *state, *flags; char vnetbuf[VNET_ID_BUF]; char addrbuf[VARP_ADDR_BUF]; switch(ventry->state){ - case VARP_STATE_INCOMPLETE: state = "INC"; break; - case VARP_STATE_REACHABLE: state = "RCH"; break; - case VARP_STATE_FAILED: state = "FLD"; break; - default: state = "UNK"; break; - } - flags = (VarpEntry_get_flags(ventry, VARP_FLAG_PROBING) ? "P" : " "); - - printk(KERN_INFO "VENTRY(%p ref=%1d %s %s vnet=%s vmac=" MACFMT - " addr=%s q=%3d t=%lu)\n", - ventry, - atomic_read(&ventry->refcount), - state, flags, - VnetId_ntoa(&ventry->key.vnet, vnetbuf), - MAC6TUPLE(ventry->key.vmac.mac), - VarpAddr_ntoa(&ventry->addr, addrbuf), - skb_queue_len(&ventry->queue), - ventry->timestamp); - } else { - printk("VENTRY: Null!\n"); - } + case VARP_STATE_INCOMPLETE: state = "incomplete"; break; + case VARP_STATE_REACHABLE: state = "reachable"; break; + case VARP_STATE_FAILED: state = "failed"; break; + default: state = "unknown"; break; + } + flags = (VarpEntry_get_flags(ventry, VARP_FLAG_PROBING) ? "P" : "-"); + + IOStream_print(io, " (ref %d)\n", atomic_read(&ventry->refcount)); + IOStream_print(io, " (state %s)\n", state); + IOStream_print(io, " (flags %s)\n", flags); + IOStream_print(io, " (addr %s)\n", VarpAddr_ntoa(&ventry->addr, addrbuf)); + IOStream_print(io, " (queue %d)\n", skb_queue_len(&ventry->queue)); + IOStream_print(io, " (age %lu)\n", now - ventry->timestamp); + IOStream_print(io, " (vmac " MACFMT ")\n", MAC6TUPLE(ventry->key.vmac.mac)); + IOStream_print(io, " (vnet %s)\n", VnetId_ntoa(&ventry->key.vnet, vnetbuf)); + } + IOStream_print(io, ")\n"); } /** Free a varp entry. 
* - * @param z varp entry - */ -void VarpEntry_free(VarpEntry *z){ - if(!z) return; - deallocate(z); + * @param ventry varp entry + */ +static void VarpEntry_free(VarpEntry *ventry){ + if(!ventry) return; + deallocate(ventry); } /** Increment reference count. * - * @param z varp entry (may be null) - */ -void VarpEntry_incref(VarpEntry *z){ - if(!z) return; - atomic_inc(&z->refcount); + * @param ventry varp entry (may be null) + */ +void VarpEntry_incref(VarpEntry *ventry){ + if(!ventry) return; + atomic_inc(&ventry->refcount); } /** Decrement reference count, freeing if zero. * - * @param z varp entry (may be null) - */ -void VarpEntry_decref(VarpEntry *z){ - if(!z) return; - if(atomic_dec_and_test(&z->refcount)){ - VarpEntry_free(z); + * @param ventry varp entry (may be null) + */ +void VarpEntry_decref(VarpEntry *ventry){ + if(!ventry) return; + if(atomic_dec_and_test(&ventry->refcount)){ + VarpEntry_free(ventry); } } @@ -567,9 +633,7 @@ * @param ventry varp entry */ void VarpEntry_schedule(VarpEntry *ventry){ - unsigned long now = jiffies; - ventry->timer.expires = now + VARP_PROBE_INTERVAL; - add_timer(&ventry->timer); + timer_set(&ventry->timer, VARP_PROBE_INTERVAL); } /** Function called when a varp entry timer goes off. @@ -582,36 +646,49 @@ unsigned long flags; VarpEntry *ventry = (VarpEntry *)arg; struct sk_buff *skb = NULL; - int locked = 0, probing = 0; - - dprintf(">\n"); //VarpEntry_print(ventry); + int probing = 0; + + dprintf(">\n"); VarpEntry_lock(ventry, flags); - locked = 1; - if(ventry->state == VARP_STATE_REACHABLE){ - // Do nothing. - } else { - // Probe if haven't run out of tries, otherwise fail. 
- if(atomic_read(&ventry->probes) < VARP_PROBE_MAX){ - probing = 1; - VarpEntry_schedule(ventry); - skb = skb_peek(&ventry->queue); - if(skb){ - dprintf("> skbs in queue - solicit\n"); - atomic_inc(&ventry->probes); - VarpEntry_unlock(ventry, flags); - locked = 0; - varp_solicit(skb, &ventry->key.vnet); + if(!atomic_read(&ventry->deleted)){ + switch(ventry->state){ + case VARP_STATE_REACHABLE: + case VARP_STATE_FAILED: + break; + case VARP_STATE_INCOMPLETE: + // Probe if haven't run out of tries, otherwise fail. + if(atomic_read(&ventry->probes) < VARP_PROBE_MAX){ + unsigned long qflags; + VnetId vnet; + Vmac vmac; + + probing = 1; + spin_lock_irqsave(&ventry->queue.lock, qflags); + skb = skb_peek(&ventry->queue); + if(skb){ + vmac = *(Vmac*)eth_hdr(skb)->h_dest; + } + spin_unlock_irqrestore(&ventry->queue.lock, qflags); + if(skb){ + dprintf("> skbs in queue - solicit\n"); + vnet = ventry->key.vnet; + atomic_inc(&ventry->probes); + VarpEntry_unlock(ventry, flags); + varp_solicit(&vnet, &vmac); + VarpEntry_lock(ventry, flags); + } else { + dprintf("> empty queue.\n"); + } + VarpEntry_schedule(ventry); } else { - dprintf("> empty queue.\n"); + VarpEntry_error(ventry); + ventry->state = VARP_STATE_FAILED; } - } else { - dprintf("> Out of probes: FAILED\n"); - VarpEntry_error(ventry); - ventry->state = VARP_STATE_FAILED; + break; } } VarpEntry_set_flags(ventry, VARP_FLAG_PROBING, probing); - if(locked) VarpEntry_unlock(ventry, flags); + VarpEntry_unlock(ventry, flags); if(!probing) VarpEntry_decref(ventry); dprintf("<\n"); } @@ -631,25 +708,25 @@ * @return ventry or null */ VarpEntry * VarpEntry_new(VnetId *vnet, Vmac *vmac){ - VarpEntry *z = ALLOCATE(VarpEntry); - if(z){ + VarpEntry *ventry = ALLOCATE(VarpEntry); + if(ventry){ unsigned long now = jiffies; - atomic_set(&z->refcount, 1); - z->lock = RW_LOCK_UNLOCKED; - z->state = VARP_STATE_INCOMPLETE; - z->queue_max = VARP_QUEUE_MAX; - skb_queue_head_init(&z->queue); - init_timer(&z->timer); - z->timer.data = (unsigned 
long)z; - z->timer.function = varp_timer_fn; - z->timestamp = now; - z->error = varp_error_fn; - - z->key.vnet = *vnet; - z->key.vmac = *vmac; - } - return z; + atomic_set(&ventry->refcount, 1); + atomic_set(&ventry->probes, 0); + atomic_set(&ventry->deleted, 0); + ventry->lock = RW_LOCK_UNLOCKED; + ventry->state = VARP_STATE_INCOMPLETE; + ventry->queue_max = VARP_QUEUE_MAX; + skb_queue_head_init(&ventry->queue); + timer_init(&ventry->timer, varp_timer_fn, ventry); + ventry->timestamp = now; + ventry->error = varp_error_fn; + + ventry->key.vnet = *vnet; + ventry->key.vmac = *vmac; + } + return ventry; } /** Hash function for keys in the varp cache. @@ -658,12 +735,8 @@ * @param k key (VarpKey) * @return hashcode */ -Hashcode varp_key_hash_fn(void *k){ - VarpKey *key = k; - Hashcode h = 0; - h = VnetId_hash(h, &key->vnet); - h = Vmac_hash(h, &key->vmac); - return h; +static Hashcode varp_key_hash_fn(void *k){ + return hash_hvoid(0, k, sizeof(VarpKey)); } /** Test equality for keys in the varp cache. @@ -673,11 +746,8 @@ * @param k2 key to compare (VarpKey) * @return 1 if equal, 0 otherwise */ -int varp_key_equal_fn(void *k1, void *k2){ - VarpKey *key1 = k1; - VarpKey *key2 = k2; - return (VnetId_eq(&key1->vnet, &key2->vnet) && - Vmac_eq(&key1->vmac, &key2->vmac)); +static int varp_key_equal_fn(void *k1, void *k2){ + return memcmp(k1, k2, sizeof(VarpKey)) == 0; } /** Free an entry in the varp cache. @@ -696,27 +766,43 @@ /** Free the whole varp cache. * Dangerous. 
* - * @param z varp cache - */ -void VarpTable_free(VarpTable *z){ - unsigned long flags; - if(!z) return; - VarpTable_write_lock(z, flags); - del_timer(&z->timer); - z->timer.data = 0; - if(z->table) HashTable_free(z->table); - VarpTable_write_unlock(z, flags); - deallocate(z); + * @param vtable varp cache + */ +void VarpTable_free(VarpTable *vtable){ + unsigned long vtflags; + if(!vtable) return; + VarpTable_write_lock(vtable, vtflags); + timer_cancel(&vtable->timer); + vtable->timer.data = 0; + if(vtable->table){ + HashTable *table = vtable->table; + HashTable_for_decl(entry); + + vtable->table = NULL; + HashTable_for_each(entry, table){ + VarpEntry *ventry = entry->value; + unsigned long flags; + VarpEntry_lock(ventry, flags); + atomic_set(&ventry->deleted, 1); + if(VarpEntry_get_flags(ventry, VARP_FLAG_PROBING)){ + timer_cancel(&ventry->timer); + ventry->timer.data = 0; + VarpEntry_decref(ventry); + } + VarpEntry_unlock(ventry, flags); + } + HashTable_free(table); + } + VarpTable_write_unlock(vtable, vtflags); + deallocate(vtable); } /** Schedule the varp table timer. * - * @param z varp table - */ -void VarpTable_schedule(VarpTable *z){ - unsigned long now = jiffies; - z->timer.expires = now + VARP_ENTRY_TTL; - add_timer(&z->timer); + * @param vtable varp table + */ +void VarpTable_schedule(VarpTable *vtable){ + timer_set(&vtable->timer, vtable->entry_ttl); } /** Function called when the varp table timer goes off. @@ -725,30 +811,30 @@ * @param arg varp table */ static void varp_table_timer_fn(unsigned long arg){ - VarpTable *z = (VarpTable *)arg; - if(z){ - VarpTable_sweep(z, 0); - VarpTable_schedule(z); + VarpTable *vtable = (VarpTable *)arg; + if(vtable){ + VarpTable_sweep(vtable); + VarpTable_schedule(vtable); } } /** Print a varp table. 
* - * @param z table - */ -void VarpTable_print(VarpTable *z){ + * @param vtable table + */ +void VarpTable_print(VarpTable *vtable, IOStream *io){ HashTable_for_decl(entry); VarpEntry *ventry; - unsigned long flags, vflags; - - VarpTable_read_lock(z, flags); - HashTable_for_each(entry, varp_table->table){ + unsigned long vtflags, flags; + + VarpTable_read_lock(vtable, vtflags); + HashTable_for_each(entry, vtable->table){ ventry = entry->value; - VarpEntry_lock(ventry, vflags); - VarpEntry_print(ventry); - VarpEntry_unlock(ventry, vflags); - } - VarpTable_read_unlock(z, flags); + VarpEntry_lock(ventry, flags); + VarpEntry_print(ventry, io); + VarpEntry_unlock(ventry, flags); + } + VarpTable_read_unlock(vtable, vtflags); } /** Create a varp table. @@ -757,83 +843,140 @@ */ VarpTable * VarpTable_new(void){ int err = -ENOMEM; - VarpTable *z = NULL; - - z = ALLOCATE(VarpTable); - if(!z) goto exit; - z->table = HashTable_new(VARP_TABLE_BUCKETS); - if(!z->table) goto exit; - z->table->key_equal_fn = varp_key_equal_fn; - z->table->key_hash_fn = varp_key_hash_fn; - z->table->entry_free_fn = varp_entry_free_fn; - init_MUTEX(&z->lock); - init_timer(&z->timer); - z->timer.data = (unsigned long)z; - z->timer.function = varp_table_timer_fn; - VarpTable_schedule(z); + VarpTable *vtable = NULL; + + vtable = ALLOCATE(VarpTable); + if(!vtable) goto exit; + vtable->table = HashTable_new(VARP_TABLE_BUCKETS); + if(!vtable->table) goto exit; + vtable->table->key_equal_fn = varp_key_equal_fn; + vtable->table->key_hash_fn = varp_key_hash_fn; + vtable->table->entry_free_fn = varp_entry_free_fn; + + vtable->entry_ttl = VARP_ENTRY_TTL; + vtable->probe_max = VARP_PROBE_MAX; + vtable->probe_interval = VARP_PROBE_INTERVAL; + vtable->queue_max = VARP_QUEUE_MAX; + + init_MUTEX(&vtable->mutex); + vtable->lock = RW_LOCK_UNLOCKED; + timer_init(&vtable->timer, varp_table_timer_fn, vtable); err = 0; exit: if(err){ - VarpTable_free(z); - z = NULL; - } - return z; + VarpTable_free(vtable); + vtable = 
NULL; + } + return vtable; } /** Add a new entry to the varp table. * - * @param z table + * @param vtable table * @param vnet vnet id * @param vmac virtual MAC address (copied) * @return new entry or null */ -VarpEntry * VarpTable_add(VarpTable *z, VnetId *vnet, Vmac *vmac){ - int err = -ENOMEM; - VarpEntry *ventry; - HTEntry *entry; - unsigned long flags; - +VarpEntry * VarpTable_add(VarpTable *vtable, VnetId *vnet, Vmac *vmac){ + int err = 0; + VarpKey key = { .vnet = *vnet, .vmac = *vmac}; + VarpEntry *ventry = NULL; + HTEntry *entry = NULL; + unsigned long vtflags; + + VarpTable_write_lock(vtable, vtflags); + ventry = HashTable_get(vtable->table, &key); + if(ventry){ + VarpEntry_incref(ventry); + goto exit; + } + err = -ENOMEM; ventry = VarpEntry_new(vnet, vmac); if(!ventry) goto exit; - VarpTable_write_lock(z, flags); - entry = HashTable_add(z->table, ventry, ventry); - VarpTable_write_unlock(z, flags); - if(!entry) goto exit; + entry = HashTable_add(vtable->table, ventry, ventry); + if(!entry){ + VarpEntry_decref(ventry); + ventry = NULL; + goto exit; + } + err = 0; VarpEntry_incref(ventry); - err = 0; exit: - if(err){ - VarpEntry_free(ventry); - ventry = NULL; - } + VarpTable_write_unlock(vtable, vtflags); return ventry; } /** Remove an entry from the varp table. * - * @param z table + * @param vtable table * @param ventry entry to remove * @return removed count */ -int VarpTable_remove(VarpTable *z, VarpEntry *ventry){ - return HashTable_remove(z->table, ventry); +int VarpTable_remove(VarpTable *vtable, VarpEntry *ventry){ + //TODO: Could send a varp announce with null addr for the entry + // vnet and vmac to notify others, so they will resolve the addr + // instead of sending traffic to us. + atomic_set(&ventry->deleted, 1); + skb_queue_purge(&ventry->queue); + return HashTable_remove(vtable->table, ventry); +} + +/** Remove all entries using a vnet. + * Caller must hold the table lock. 
+ * + * @param vtable table + * @param vnet vnet + * @return removed count + */ +int VarpTable_remove_vnet(VarpTable *vtable, VnetId *vnet){ + int count = 0; + HashTable_for_decl(entry); + + HashTable_for_each(entry, vtable->table){ + VarpEntry *ventry = entry->value; + if(VnetId_eq(&ventry->key.vnet, vnet)){ + count += VarpTable_remove(vtable, ventry); + } + } + return count; +} + +/** Remove all entries using a vnet from the varp table. + * + * @param vnet vnet + * @return removed count + */ +int varp_remove_vnet(VnetId *vnet){ + int count = 0; + unsigned long vtflags; + + VarpTable_write_lock(varp_table, vtflags); + count = VarpTable_remove_vnet(varp_table, vnet); + VarpTable_write_unlock(varp_table, vtflags); + return count; } /** Lookup an entry in the varp table. * - * @param z table + * @param vtable table * @param vnet vnet id - * @param vmac virtual MAC addres + * @param vmac virtual MAC address + * @param create create a new entry if needed if true * @return entry found or null */ -VarpEntry * VarpTable_lookup(VarpTable *z, VnetId *vnet, Vmac *vmac){ - unsigned long flags; +VarpEntry * VarpTable_lookup(VarpTable *vtable, VnetId *vnet, Vmac *vmac, int create){ VarpKey key = { .vnet = *vnet, .vmac = *vmac }; - VarpEntry *ventry; - VarpTable_read_lock(z, flags); - ventry = HashTable_get(z->table, &key); + VarpEntry *ventry = NULL; + unsigned long vtflags; + + VarpTable_read_lock(vtable, vtflags); + ventry = HashTable_get(vtable->table, &key); if(ventry) VarpEntry_incref(ventry); - VarpTable_read_unlock(z, flags); + VarpTable_read_unlock(vtable, vtflags); + + if(!ventry && create){ + ventry = VarpTable_add(vtable, vnet, vmac); + } return ventry; } @@ -849,11 +992,13 @@ int err = 0; unsigned long flags = 0; VarpAddr addr; + VnetId vnet; dprintf("> skb=%p\n", skb); + vnet = ventry->key.vnet; addr = ventry->addr; VarpEntry_unlock(ventry, flags); - err = vnet_tunnel_send(&ventry->key.vnet, &addr, skb); + err = vnet_tunnel_send(&vnet, &addr, skb); 
VarpEntry_lock(ventry, flags); dprintf("< err=%d\n", err); return err; @@ -872,6 +1017,8 @@ int VarpEntry_resolve(VarpEntry *ventry, struct sk_buff *skb){ int err = 0; unsigned long flags = 0; + VnetId vnet; + Vmac vmac; dprintf("> skb=%p\n", skb); ventry->state = VARP_STATE_INCOMPLETE; @@ -881,44 +1028,27 @@ VarpEntry_incref(ventry); VarpEntry_schedule(ventry); } + vnet = ventry->key.vnet; + vmac = *(Vmac*)eth_hdr(skb)->h_dest; VarpEntry_unlock(ventry, flags); - varp_solicit(skb, &ventry->key.vnet); + varp_solicit(&vnet, &vmac); VarpEntry_lock(ventry, flags); if(ventry->state == VARP_STATE_INCOMPLETE){ - if(skb_queue_len(&ventry->queue) >= ventry->queue_max){ + while(skb_queue_len(&ventry->queue) >= ventry->queue_max){ struct sk_buff *oldskb; - oldskb = ventry->queue.next; - __skb_unlink(oldskb, &ventry->queue); + oldskb = skb_dequeue(&ventry->queue); + //oldskb = ventry->queue.next; + //__skb_unlink(oldskb, &ventry->queue); + if(!oldskb) break; dprintf("> dropping skb=%p\n", oldskb); kfree_skb(oldskb); } - __skb_queue_tail(&ventry->queue, skb); + skb_queue_tail(&ventry->queue, skb); } else { err = VarpEntry_send(ventry, skb); } dprintf("< err=%d\n", err); - return err; -} - -/** Handle output for a ventry. Resolves the ventry - * if necessary. - * - * @param ventry varp entry - * @param skb skb to send - * @return 0 on success, error code otherwise - */ -int VarpEntry_output(VarpEntry *ventry, struct sk_buff *skb){ - int err = 0; - - switch(ventry->state){ - case VARP_STATE_REACHABLE: - err = VarpEntry_send(ventry, skb); - break; - default: - err = VarpEntry_resolve(ventry, skb); - break; - } return err; } @@ -931,11 +1061,58 @@ struct sk_buff *skb; for( ; ; ){ if(ventry->state != VARP_STATE_REACHABLE) break; - skb = __skb_dequeue(&ventry->queue); + skb = skb_dequeue(&ventry->queue); if(!skb) break; - VarpEntry_output(ventry, skb); + VarpEntry_send(ventry, skb); } skb_queue_purge(&ventry->queue); +} + +/** Multicast an skb on a vnet. 
+ * + * @param vnet vnet id + * @param skb skb to send + * @return 0 on success, error code otherwise + */ +static int varp_multicast(VnetId *vnet, struct sk_buff *skb){ + VarpAddr addr = { .family = AF_INET }; + addr.u.ip4.s_addr = varp_mcast_addr; + return vnet_tunnel_send(vnet, &addr, skb); +} + +/** Handle output for a ventry. Resolves the ventry + * if necessary. + * + * @param ventry varp entry + * @param skb skb to send + * @return 0 on success, error code otherwise + */ +int VarpEntry_output(VarpEntry *ventry, struct sk_buff *skb){ + int err = 0; + unsigned long flags; + + VarpEntry_lock(ventry, flags); + switch(ventry->state){ + case VARP_STATE_REACHABLE: + if(skb_queue_len(&ventry->queue) > 0){ + VarpEntry_process_queue(ventry); + } + err = VarpEntry_send(ventry, skb); + break; + default: + if(0){ + err = VarpEntry_resolve(ventry, skb); + } else { + // Multicast the skb if the entry is not reachable. + VnetId vnet = ventry->key.vnet; + VarpEntry_unlock(ventry, flags); + err = varp_multicast(&vnet, skb); + VarpEntry_lock(ventry, flags); + } + break; + } + VarpEntry_unlock(ventry, flags); + return err; } /** Update a ventry. Sets the address and state to those given @@ -946,132 +1123,155 @@ * @param state state * @return 0 on success, error code otherwise */ -int VarpEntry_update(VarpEntry *ventry, VarpAddr *addr, int state){ +int VarpEntry_update(VarpEntry *ventry, VarpAddr *addr, int state, int vflags){ int err = 0; unsigned long now = jiffies; unsigned long flags; - dprintf("> addr=" IPFMT " state=%d\n", NIPQUAD(addr), state); VarpEntry_lock(ventry, flags); + //if(atomic_read(&ventry->deleted)) goto exit; if(VarpEntry_get_flags(ventry, VARP_FLAG_PERMANENT)) goto exit; ventry->addr = *addr; ventry->timestamp = now; ventry->state = state; - VarpEntry_process_queue(ventry); + // Can't process the queue while atomic as it calls schedule(), + // and that's bad. 
+ //if(0 && (vflags & VARP_UPDATE_QUEUE) && !in_atomic()){ + // VarpEntry_process_queue(ventry); + //} exit: VarpEntry_unlock(ventry, flags); dprintf("< err=%d\n", err); return err; } -int VarpTable_update(VarpTable *z, VnetId *vnet, Vmac *vmac, VarpAddr *addr, - int state, int force){ +/** Update the entry for a vnet. + * + * @param vtable varp table + * @param vnet vnet id + * @param vmac mac address + * @param addr care-of-address + * @param state state + * @param flags update flags + * @return 0 on success, error code otherwise + */ +int VarpTable_update(VarpTable *vtable, VnetId *vnet, Vmac *vmac, VarpAddr *addr, + int state, int flags){ int err = 0; VarpEntry *ventry; #ifdef DEBUG char vnetbuf[VNET_ID_BUF]; char addrbuf[VARP_ADDR_BUF]; -#endif - dprintf("> vnet=%s mac=" MACFMT " addr=%s state=%d force=%d\n", + dprintf("> vnet=%s mac=" MACFMT " addr=%s state=%d flags=%x\n", VnetId_ntoa(vnet, vnetbuf), MAC6TUPLE(vmac->mac), VarpAddr_ntoa(addr, addrbuf), state, - force); - ventry = VarpTable_lookup(z, vnet, vmac); - if(force && !ventry){ - dprintf("> No entry, adding\n"); - ventry = VarpTable_add(z, vnet, vmac); - } - if(ventry){ - dprintf("> Updating\n"); - err = VarpEntry_update(ventry, addr, state); - VarpEntry_decref(ventry); + flags); +#endif + ventry = VarpTable_lookup(vtable, vnet, vmac, (flags & VARP_UPDATE_CREATE)); + if(!ventry){ + err = -ENOENT; + goto exit; + } + err = VarpEntry_update(ventry, addr, state, flags); + VarpEntry_decref(ventry); + exit: + dprintf("< err=%d\n", err); + return err; +} + +/** Update the entry for a vnet: make it reachable and create an entry + * if needed. 
+ * + * @param vnet vnet id + * @param vmac mac address + * @param addr care-of-address + * @return 0 on success, error code otherwise + */ +int varp_update(VnetId *vnet, unsigned char *vmac, VarpAddr *addr){ + int err = 0; + if(!varp_table){ + err = -ENOSYS; } else { - dprintf("> No entry found\n"); - err = -ENOENT; - } - dprintf("< err=%d\n", err); - return err; -} - -/** Update the ventry corresponding to the given varp header. - * - * @param z table - * @param varph varp header - * @param state state - * @return 0 on success, -ENOENT if no entry found - */ -int VarpTable_update_entry(VarpTable *z, VarpHdr *varph, int state){ - return VarpTable_update(z, &varph->vnet, &varph->vmac, &varph->addr, state, 0); -} - -int varp_update(VnetId *vnet, unsigned char *vmac, VarpAddr *addr){ - if(!varp_table){ - return -ENOSYS; - } - return VarpTable_update(varp_table, vnet, (Vmac*)vmac, addr, - VARP_STATE_REACHABLE, 1); -} - -/** Put old varp entries into the incomplete state. - * Permanent entries are not changed. - * If 'all' is non-zero, all non-permanent entries - * are put into the incomplete state, regardless of age. - * - * @param z table - * @param all reset all entries if non-zero - */ -void VarpTable_sweep(VarpTable *z, int all){ + err = VarpTable_update(varp_table, vnet, (Vmac*)vmac, addr, + VARP_STATE_REACHABLE, VARP_UPDATE_CREATE); + } + return err; +} + +static inline int VarpEntry_sweepable(VarpEntry *ventry){ + return !VarpEntry_get_flags(ventry, (VARP_FLAG_PERMANENT | VARP_FLAG_PROBING)); +} + +static inline int VarpTable_old(VarpTable *vtable, VarpEntry *ventry, unsigned long now){ + return now - ventry->timestamp > vtable->entry_ttl; +} + +/** Sweep old varp entries. + * Doesn't affect entries that are probing or permanent. 
+ * + * @param vtable table + */ +void VarpTable_sweep(VarpTable *vtable){ HashTable_for_decl(entry); VarpEntry *ventry; unsigned long now = jiffies; - unsigned long old = now - VARP_ENTRY_TTL; - unsigned long flags, vflags; - - VarpTable_read_lock(z, flags); - HashTable_for_each(entry, varp_table->table){ + unsigned long vtflags, flags; + int sweep, swept = 0; + + if(!vtable) return; + VarpTable_write_lock(vtable, vtflags); + HashTable_for_each(entry, vtable->table){ ventry = entry->value; - VarpEntry_lock(ventry, vflags); - if(!VarpEntry_get_flags(ventry, VARP_FLAG_PERMANENT) && - (all || (ventry->timestamp < old))){ - VarpEntry_process_queue(ventry); + VarpEntry_lock(ventry, flags); + sweep = VarpEntry_sweepable(ventry) && VarpTable_old(vtable, ventry, now); + if(sweep){ + swept++; + iprintf("> Sweeping:\n"); + VarpEntry_print(ventry, iostdout); + //VarpEntry_process_queue(ventry); ventry->state = VARP_STATE_INCOMPLETE; } - VarpEntry_unlock(ventry, vflags); - } - VarpTable_read_unlock(z, flags); + VarpEntry_unlock(ventry, flags); + if(sweep){ + VarpTable_remove(vtable, ventry); + } + } + VarpTable_write_unlock(vtable, vtflags); + if(swept){ + iprintf(">\n"); + varp_print(iostdout); + } } /** Flush the varp table. - * Remove old unreachable varp entries with empty queues. - * Permanent entries are not removed. 
- * - * @param z table - */ -void VarpTable_flush(VarpTable *z){ + * + * @param vtable table + */ +void VarpTable_flush(VarpTable *vtable){ HashTable_for_decl(entry); VarpEntry *ventry; - unsigned long now = jiffies; - unsigned long old = now - VARP_ENTRY_TTL; - unsigned long flags, vflags; + unsigned long vtflags, flags; int flush; - VarpTable_write_lock(z, flags); - HashTable_for_each(entry, varp_table->table){ + VarpTable_write_lock(vtable, vtflags); + HashTable_for_each(entry, vtable->table){ ventry = entry->value; - VarpEntry_lock(ventry, vflags); + VarpEntry_lock(ventry, flags); flush = (!VarpEntry_get_flags(ventry, VARP_FLAG_PERMANENT) && - (ventry->timestamp < old) && - (ventry->state != VARP_STATE_REACHABLE) && - (skb_queue_len(&ventry->queue) == 0)); - VarpEntry_unlock(ventry, vflags); + !VarpEntry_get_flags(ventry, VARP_FLAG_PROBING)); if(flush){ - VarpTable_remove(z, ventry); - } - } - VarpTable_write_unlock(z, flags); + iprintf("> Flushing:\n"); + VarpEntry_print(ventry, iostdout); + } + VarpEntry_unlock(ventry, flags); + if(flush){ + VarpTable_remove(vtable, ventry); + } + } + VarpTable_write_unlock(vtable, vtflags); } /** Handle a varp request. Look for a vif with the requested @@ -1129,7 +1329,10 @@ int err = 0; dprintf(">\n"); - err = VarpTable_update_entry(varp_table, varph, VARP_STATE_REACHABLE); + err = VarpTable_update(varp_table, + &varph->vnet, &varph->vmac, &varph->addr, + VARP_STATE_REACHABLE, + (VARP_UPDATE_CREATE | VARP_UPDATE_QUEUE)); dprintf("< err=%d\n", err); return err; } @@ -1140,33 +1343,51 @@ * @return 0 if OK, error code otherwise */ int varp_handle_message(struct sk_buff *skb){ - // Assume h. nh set, skb->data point after udp hdr (at varphdr). - int err = -EINVAL, mine = 0; - VarpHdr *varph = (void*)(skb->h.uh + 1); - - dprintf(">\n"); + // Assume nh, h set, skb->data points at udp hdr (h). 
+ int err = -EINVAL; + VarpHdr *varph; // = (void*)(skb->h.uh + 1); + + dprintf("> skb=%p saddr=" IPFMT " daddr=" IPFMT "\n", + skb, + NIPQUAD(skb->nh.iph->saddr), + NIPQUAD(skb->nh.iph->daddr)); if(!varp_table){ err = -ENOSYS; + return err; + } + if(MULTICAST(skb->nh.iph->daddr)){ + if(skb->nh.iph->daddr != varp_mcast_addr){ + // Ignore multicast packets not addressed to us. + err = 0; + dprintf("> Ignoring daddr=" IPFMT " mcaddr=" IPFMT "\n", + NIPQUAD(skb->nh.iph->daddr), NIPQUAD(varp_mcast_addr)); + goto exit; + } + } + varph = (void*)skb_pull(skb, sizeof(struct udphdr)); + if(skb->len < sizeof(struct VnetMsgHdr)){ + wprintf("> Varp msg too short: %d < %d\n", skb->len, sizeof(struct VnetMsgHdr)); goto exit; } - if(MULTICAST(skb->nh.iph->daddr) && - (skb->nh.iph->daddr != varp_mcast_addr)){ - // Ignore multicast packets not addressed to us. - err = 0; - dprintf("> Ignoring daddr=" IPFMT " mcaddr=" IPFMT "\n", - NIPQUAD(skb->nh.iph->daddr), NIPQUAD(varp_mcast_addr)); + switch(ntohs(varph->hdr.id)){ + case VARP_ID: // Varp message. Handled below. + if(skb->len < sizeof(*varph)){ + wprintf("> Varp msg too short: %d < %d\n", skb->len, sizeof(*varph)); + goto exit; + } + break; + case VUDP_ID: // Etherip-in-udp packet. + skb_pull(skb, sizeof(struct VnetMsgHdr)); + err = etherip_protocol_recv(skb); goto exit; - } - if(skb->len < sizeof(*varph)){ - wprintf("> Varp msg too short: %d < %d\n", skb->len, sizeof(*varph)); + case VFWD_ID: // Forwarded. + skb_pull(skb, sizeof(struct VnetMsgHdr)); + err = vnet_forward_recv(skb); goto exit; - } - mine = 1; - if(varph->hdr.id != htons(VARP_ID)){ + default: // It's not varp at all - ignore it. 
- wprintf("> Invalid varp id: %d, expected %d \n", - ntohs(varph->hdr.id), - VARP_ID); + wprintf("> Invalid varp id: %d\n", ntohs(varph->hdr.id)); + print_skb("INVALID", 0, skb); goto exit; } #ifdef DEBUG @@ -1196,7 +1417,6 @@ break; } exit: - if(mine) err = 1; dprintf("< err=%d\n", err); return err; } @@ -1212,8 +1432,11 @@ unsigned char *mac = NULL; Vmac *vmac = NULL; VarpEntry *ventry = NULL; - - dprintf(">\n"); +#if defined(DEBUG) + char vnetbuf[VNET_ID_BUF]; +#endif + + dprintf("> vnet=%s\n", VnetId_ntoa(vnet, vnetbuf)); if(!varp_table){ err = -ENOSYS; goto exit; @@ -1226,20 +1449,11 @@ mac = eth_hdr(skb)->h_dest; vmac = (Vmac*)mac; if(mac_is_multicast(mac)){ - VarpAddr addr = {}; - addr.family = AF_INET; - addr.u.ip4.s_addr = varp_mcast_addr; - err = vnet_tunnel_send(vnet, &addr, skb); + err = varp_multicast(vnet, skb); } else { - ventry = VarpTable_lookup(varp_table, vnet, vmac); - if(!ventry){ - ventry = VarpTable_add(varp_table, vnet, vmac); - } + ventry = VarpTable_lookup(varp_table, vnet, vmac, 1); if(ventry){ - unsigned long flags; - VarpEntry_lock(ventry, flags); err = VarpEntry_output(ventry, skb); - VarpEntry_unlock(ventry, flags); VarpEntry_decref(ventry); } else { err = -ENOMEM; @@ -1292,6 +1506,7 @@ err = -ENOMEM; goto exit; } + VarpTable_schedule(varp_table); varp_init_mcast_addr(varp_mcaddr); varp_port = htons(VARP_PORT); @@ -1307,9 +1522,9 @@ dprintf(">\n"); varp_close(); if(varp_table){ - VarpTable *z = varp_table; + VarpTable *vtable = varp_table; varp_table = NULL; - VarpTable_free(z); + VarpTable_free(vtable); } dprintf("<\n"); } diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/varp.h --- a/tools/vnet/vnet-module/varp.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/varp.h Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the 
GNU General Public License as published by the @@ -19,10 +19,18 @@ #ifndef _VNET_VARP_H #define _VNET_VARP_H + +#ifdef __KERNEL__ + +#else + +#include "sys_kernel.h" + +#endif + #include "hash_table.h" #include "if_varp.h" #include "varp_util.h" - #define CONFIG_VARP_GRATUITOUS 1 @@ -30,12 +38,19 @@ struct sk_buff; struct Vif; +enum { + VARP_UPDATE_CREATE = 1, + VARP_UPDATE_QUEUE = 2, +}; + extern int vnet_get_device(const char *name, struct net_device **dev); extern int vnet_get_device_address(struct net_device *dev, u32 *addr); +extern int varp_remove_vnet(struct VnetId *vnet); extern int varp_handle_message(struct sk_buff *skb); extern int varp_output(struct sk_buff *skb, struct VnetId *vnet); -extern int varp_update(struct VnetId *vnet, unsigned char *vmac, struct VarpAddr *addr); +extern int varp_update(struct VnetId *vnet, unsigned char *vmac, + struct VarpAddr *addr); extern int varp_init(void); extern void varp_exit(void); @@ -44,12 +59,13 @@ extern void varp_close(void); extern int varp_set_mcast_addr(u32 addr); -extern void varp_print(void); +extern void varp_print(struct IOStream *io); extern void varp_flush(void); extern int varp_announce_vif(struct net_device *dev, struct Vif *vif); extern u32 varp_mcast_addr; +extern u16 varp_port; /* MAC broadcast addr is ff-ff-ff-ff-ff-ff (all 1's). * MAC multicast addr has low bit 1, i.e. 01-00-00-00-00-00. 
diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/varp_socket.c --- a/tools/vnet/vnet-module/varp_socket.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/varp_socket.c Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -23,6 +23,7 @@ #include <asm/uaccess.h> #include <linux/net.h> #include <linux/in.h> +#include <linux/ip.h> #include <linux/sched.h> #include <linux/file.h> #include <linux/version.h> @@ -31,6 +32,7 @@ #include <if_varp.h> #include <varp.h> +#include <vnet_forward.h> /* Get macros needed to define system calls as functions in the kernel. */ #define __KERNEL_SYSCALLS__ @@ -42,29 +44,32 @@ #undef DEBUG #include "debug.h" -// Compensate for struct sock fields having 'sk_' added -// to them in 2.6. -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - -#define SK_RECEIVE_QUEUE sk_receive_queue -#define SK_SLEEP sk_sleep - -#else - -#define SK_RECEIVE_QUEUE receive_queue -#define SK_SLEEP sleep - -#endif - /** @file * Support for the VARP udp sockets. */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) + +/* Compensate for struct sock fields having 'sk_' added to them in 2.6. */ +#define sk_receive_queue receive_queue +#define sk_sleep sleep + +/* Here because inline in 'socket.c' (2.4, in net.h for 2.6). */ +#define sockfd_put(sock) fput((sock)->file) + +#endif static inline mm_segment_t change_fs(mm_segment_t fs){ mm_segment_t oldfs = get_fs(); set_fs(fs); return oldfs; } + +/** Define the fcntl() syscall. */ +static inline _syscall3(int, fcntl, + unsigned int, fd, + unsigned int, cmd, + unsigned long, arg) /* Replicate the user-space socket API. * The parts we need anyway. 
@@ -183,6 +188,7 @@ VSOCK_CONNECT = 4, VSOCK_BROADCAST = 8, VSOCK_MULTICAST = 16, + VSOCK_NONBLOCK = 32, }; /** Convert socket flags to a string. @@ -191,32 +197,41 @@ * @return static string */ char * socket_flags(int flags){ - static char s[6]; + static char s[7]; int i = 0; s[i++] = (flags & VSOCK_CONNECT ? 'c' : '-'); s[i++] = (flags & VSOCK_BIND ? 'b' : '-'); s[i++] = (flags & VSOCK_REUSE ? 'r' : '-'); s[i++] = (flags & VSOCK_BROADCAST ? 'B' : '-'); s[i++] = (flags & VSOCK_MULTICAST ? 'M' : '-'); + s[i++] = (flags & VSOCK_NONBLOCK ? 'N' : '-'); s[i++] = '\0'; return s; } +/** Control flag for whether varp should be running. + * If this is set 0 then the varp thread will notice and + * (eventually) exit. + */ +atomic_t varp_run = ATOMIC_INIT(0); + +enum { + VARP_STATE_EXITED = 2, + VARP_STATE_RUNNING = 1, + VARP_STATE_NONE = 0, + VARP_STATE_ERROR = -1, +}; + +/** State indicating whether the varp thread is running. */ +atomic_t varp_state = ATOMIC_INIT(VARP_STATE_NONE); + +int varp_thread_err = 0; + /** The varp multicast socket. */ int varp_mcast_sock = -1; /** The varp unicast socket. */ int varp_ucast_sock = -1; - -/** Control flag for whether varp should be running. - * If this is set 0 then the varp thread will notice and - * (eventually) exit. This is indicated by setting varp_running - * to 0. - */ -atomic_t varp_run = ATOMIC_INIT(0); - -/** State flag indicating whether the varp thread is running. */ -atomic_t varp_running = ATOMIC_INIT(0); /** Set socket option to reuse address. 
* @@ -274,7 +289,6 @@ goto exit; } exit: - err = 0; //todo: remove hack return err; } @@ -305,12 +319,9 @@ struct sockaddr_in addr_in; struct sockaddr *addr = (struct sockaddr *)&addr_in; int addr_n = sizeof(addr_in); - int reuse, bcast; int sockproto = 0; //dprintf(">\n"); - reuse = (flags & VSOCK_REUSE); - bcast = (flags & VSOCK_BROADCAST); addr_in.sin_family = AF_INET; addr_in.sin_addr.s_addr = saddr; addr_in.sin_port = port; @@ -324,12 +335,12 @@ } sock = socket(AF_INET, socktype, sockproto); if(sock < 0) goto exit; - if(reuse){ - err = setsock_reuse(sock, reuse); + if(flags & VSOCK_REUSE){ + err = setsock_reuse(sock, 1); if(err < 0) goto exit; } - if(bcast){ - err = setsock_broadcast(sock, bcast); + if(flags & VSOCK_BROADCAST){ + err = setsock_broadcast(sock, 1); if(err < 0) goto exit; } if(flags & VSOCK_MULTICAST){ @@ -344,6 +355,10 @@ err = bind(sock, addr, addr_n); if(err < 0) goto exit; } + if(flags & VSOCK_NONBLOCK){ + err = fcntl(sock, F_SETFL, O_NONBLOCK); + if(err < 0) goto exit; + } exit: *val = (err ? -1 : sock); if(err) eprintf("> err=%d errno=%d\n", err, errno); @@ -360,7 +375,6 @@ int varp_mcast_open(uint32_t mcaddr, uint16_t port, int *val){ int err = 0; int flags = VSOCK_REUSE; - int multicast = MULTICAST(mcaddr); int sock = 0; dprintf(">\n"); @@ -369,7 +383,7 @@ err = create_socket(SOCK_DGRAM, mcaddr, port, flags, &sock); if(err < 0) goto exit; - if(multicast){ + if(MULTICAST(mcaddr)){ err = setsock_multicast_ttl(sock, 1); if(err < 0) goto exit; } @@ -398,47 +412,82 @@ return err; } -/* Here because inline in 'socket.c'. */ -#ifndef sockfd_put -#define sockfd_put(sock) fput((sock)->file) -#endif - -/** Get the next skb from a socket's receive queue. +/** + * Return code > 0 means the handler owns the packet. + * Return code <= 0 means we still own it, with < 0 meaning + * an error. 
+ */ +static int handle_varp_skb(struct sk_buff *skb){ + static int count = 0; + int err = 0; + count++; + switch(skb->pkt_type){ + case PACKET_BROADCAST: + case PACKET_MULTICAST: + vnet_forward_send(skb); + /* Fall through. */ + case PACKET_HOST: + err = varp_handle_message(skb); + break; + case PACKET_OTHERHOST: + dprintf("> PACKET_OTHERHOST\n"); + break; + case PACKET_OUTGOING: + dprintf("> PACKET_OUTGOING\n"); + break; + case PACKET_FASTROUTE: + dprintf("> PACKET_FASTROUTE\n"); + break; + case PACKET_LOOPBACK: + // Outbound mcast/bcast are echoed with this type. Drop. + dprintf("> LOOP src=" IPFMT " dst=" IPFMT " dev=%s\n", + NIPQUAD(skb->nh.iph->saddr), + NIPQUAD(skb->nh.iph->daddr), + (skb->dev ? skb->dev->name : "??")); + default: + // Drop. + break; + } + if(err <= 0){ + kfree_skb(skb); + } + return (err < 0 ? err : 0); +} + +/** Handle some skbs on a varp socket (if any). * * @param fd socket file descriptor - * @return skb or NULL - */ -static struct sk_buff *get_sock_skb(int fd){ - int err = 0; - struct sk_buff *skb = NULL; + * @param n maximum number of skbs to handle + * @return number of skbs handled + */ +static int handle_varp_sock(int fd, int n){ + int ret = 0; + int err = 0; + struct sk_buff *skb; struct socket *sock = NULL; sock = sockfd_lookup(fd, &err); if (!sock){ - dprintf("> no sock for fd=%d\n", fd); + wprintf("> no sock for fd=%d\n", fd); goto exit; } - skb = skb_dequeue(&sock->sk->SK_RECEIVE_QUEUE); - //skb = skb_recv_datagram(sock->sk, 0, 1, &recv_err); + for( ; ret < n; ret++){ + if(!sock->sk) break; + skb = skb_dequeue(&sock->sk->sk_receive_queue); + if(!skb) break; + // Call the skb destructor so it isn't charged to the socket anymore. + // An skb from a socket receive queue is charged to the socket + // by skb_set_owner_r() until its destructor is called. + // If the destructor is not called the socket will run out of + // receive queue space and be unable to accept incoming skbs. 
+ // The destructor used is sock_rfree(), see 'include/net/sock.h'. + // Other destructors: sock_wfree, sk_stream_rfree. + skb_orphan(skb); + handle_varp_skb(skb); + } sockfd_put(sock); exit: - return skb; -} - -/** Handle the next skb on a socket (if any). - * - * @param fd socket file descriptor - * @return 1 if there was an skb, 0 otherwise - */ -static int handle_sock_skb(int fd){ - int ret = 0; - struct sk_buff *skb = get_sock_skb(fd); - if(skb){ - ret = 1; - dprintf("> skb fd=%d skb=%p\n", fd, skb); - varp_handle_message(skb); - kfree_skb(skb); - } + dprintf("< ret=%d\n", ret); return ret; } @@ -449,16 +498,16 @@ * @return 0 on success, error code otherwise */ int sock_add_wait_queue(int fd, wait_queue_t *waitq){ - int err = 0; + int err = -EINVAL; struct socket *sock = NULL; - dprintf("> fd=%d\n", fd); + if(fd < 0) goto exit; sock = sockfd_lookup(fd, &err); if (!sock) goto exit; - add_wait_queue(sock->sk->SK_SLEEP, waitq); + add_wait_queue(sock->sk->sk_sleep, waitq); sockfd_put(sock); - exit: - dprintf("< err=%d\n", err); + err = 0; + exit: return err; } @@ -469,132 +518,185 @@ * @return 0 on success, error code otherwise */ int sock_remove_wait_queue(int fd, wait_queue_t *waitq){ - int err = 0; + int err = -EINVAL; struct socket *sock = NULL; + if(fd < 0) goto exit; sock = sockfd_lookup(fd, &err); if (!sock) goto exit; - remove_wait_queue(sock->sk->SK_SLEEP, waitq); + remove_wait_queue(sock->sk->sk_sleep, waitq); sockfd_put(sock); - exit: - return err; -} - -/** Loop handling the varp sockets. - * We use kernel API for this (waitqueue, schedule_timeout) instead - * of select because the select syscall was returning EFAULT. Oh well. 
- * - * @param arg arguments - * @return exit code - */ -int varp_main(void *arg){ - int err = 0; - long timeout = 3 * HZ; - int count = 0; - int n = 0; - DECLARE_WAITQUEUE(mcast_wait, current); - DECLARE_WAITQUEUE(ucast_wait, current); - - dprintf("> start\n"); - atomic_set(&varp_running, 1); - err = sock_add_wait_queue(varp_mcast_sock, &mcast_wait); - err = sock_add_wait_queue(varp_ucast_sock, &ucast_wait); - for(n = 1; atomic_read(&varp_run) == 1; n++){ - count = 0; - count += handle_sock_skb(varp_mcast_sock); - count += handle_sock_skb(varp_ucast_sock); - if(!count){ - // No skbs were handled, so go back to sleep. - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(timeout); - current->state = TASK_RUNNING; - } - } - sock_remove_wait_queue(varp_mcast_sock, &mcast_wait); - sock_remove_wait_queue(varp_ucast_sock, &ucast_wait); - atomic_set(&varp_running, 0); - //MOD_DEC_USE_COUNT; - dprintf("< stop err=%d\n", err); - return err; -} - -/** Start the varp thread. - * - * @return 0 on success, error code otherwise - */ -int varp_start(void){ - int err = 0; - void *args = NULL; - int flags = 0; - long pid = 0; - - dprintf(">\n"); - //flags |= CLONE_VM; - flags |= CLONE_FS; - flags |= CLONE_FILES; - flags |= CLONE_SIGHAND; - atomic_set(&varp_run, 1); - atomic_set(&varp_running, 0); - pid = kernel_thread(varp_main, args, flags); - dprintf("< pid=%ld\n", pid); - return err; -} - -/** Close the varp sockets and stop the thread handling them. - */ -void varp_close(void){ + err = 0; + exit: + return err; +} + +#if 0 +// Default data ready function on a socket. 
+static void sock_def_readable(struct sock *sk, int len) +{ + read_lock(&sk->sk_callback_lock); + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + sk_wake_async(sk,1,POLL_IN); + read_unlock(&sk->sk_callback_lock); +} +#endif + +static void sock_data_ready(struct sock *sk, int len){ + struct sk_buff *skb; + //read_lock(&sk->sk_callback_lock); + skb = skb_dequeue(&sk->sk_receive_queue); + if(skb){ + skb_orphan(skb); + } + //read_unlock(&sk->sk_callback_lock); + if(skb){ + handle_varp_skb(skb); + } +} + +/** Set the data ready callback on a socket. + */ +int sock_set_callback(int fd){ + int err = -EINVAL; + struct socket *sock = NULL; + + if(fd < 0) goto exit; + sock = sockfd_lookup(fd, &err); + if (!sock) goto exit; + sock->sk->sk_data_ready = sock_data_ready; + sockfd_put(sock); + err = 0; + exit: + return err; +} + +/** Open the sockets. */ +int varp_sockets_open(u32 mcaddr, u16 port){ + int err = 0; mm_segment_t oldfs; - long timeout = 1 * HZ; - int tries = 10; - dprintf(">\n"); - // Tell the varp thread to stop and wait a while for it. - atomic_set(&varp_run, 0); - while(atomic_read(&varp_running) && tries-- > 0){ - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(timeout); - current->state = TASK_RUNNING; - } - // Close the sockets. - oldfs = change_fs(KERNEL_DS); - if(varp_mcast_sock > 0){ - shutdown(varp_mcast_sock, 2); - varp_mcast_sock = -1; - } - if(varp_ucast_sock > 0){ - shutdown(varp_ucast_sock, 2); - varp_ucast_sock = -1; - } - set_fs(oldfs); - //MOD_DEC_USE_COUNT; - dprintf("<\n"); -} - -/** Open the varp sockets and start the thread handling them. 
- * - * @param mcaddr multicast address - * @param port port - * @return 0 on success, error code otherwise - */ -int varp_open(u32 mcaddr, u16 port){ - int err = 0; - mm_segment_t oldfs; - - //MOD_INC_USE_COUNT; - dprintf("> mcaddr=%u.%u.%u.%u port=%u\n", - NIPQUAD(mcaddr), ntohs(port)); + + dprintf("> mcaddr=%u.%u.%u.%u port=%u\n", NIPQUAD(mcaddr), ntohs(port)); oldfs = change_fs(KERNEL_DS); err = varp_mcast_open(mcaddr, port, &varp_mcast_sock); if(err < 0 ) goto exit; err = varp_ucast_open(INADDR_ANY, port, &varp_ucast_sock); if(err < 0 ) goto exit; + sock_set_callback(varp_ucast_sock); + sock_set_callback(varp_mcast_sock); + exit: set_fs(oldfs); - err = varp_start(); - exit: + dprintf("< err=%d\n", err); + return err; +} + +/** Close the sockets. */ +void varp_sockets_close(void){ + mm_segment_t oldfs; + oldfs = change_fs(KERNEL_DS); + if(varp_mcast_sock >= 0){ + shutdown(varp_mcast_sock, 2); + varp_mcast_sock = -1; + } + if(varp_ucast_sock >= 0){ + shutdown(varp_ucast_sock, 2); + varp_ucast_sock = -1; + } set_fs(oldfs); +} + +/** Loop handling the varp sockets. + * We use kernel API for this (waitqueue, schedule_timeout) instead + * of select because the select syscall was returning EFAULT. Oh well. 
+ * + * @param arg arguments + * @return exit code + */ +int varp_main(void *arg){ + int err = 0; + long timeout = 1 * HZ; + int count = 0; + DECLARE_WAITQUEUE(mcast_wait, current); + DECLARE_WAITQUEUE(ucast_wait, current); + + dprintf("> start\n"); + snprintf(current->comm, sizeof(current->comm), "varp_main"); + + err = sock_add_wait_queue(varp_mcast_sock, &mcast_wait); + if(err) goto exit_mcast_sock; + err = sock_add_wait_queue(varp_ucast_sock, &ucast_wait); + if(err) goto exit_ucast_sock; + atomic_set(&varp_state, VARP_STATE_RUNNING); + for( ; atomic_read(&varp_run); ){ + count = 0; + count += handle_varp_sock(varp_mcast_sock, 1); + count += handle_varp_sock(varp_ucast_sock, 16); + if(!count){ + if(!atomic_read(&varp_run)) break; + // No skbs were handled, go to sleep. + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(timeout); + __set_current_state(TASK_RUNNING); + } + } + exit_ucast_sock: + sock_remove_wait_queue(varp_ucast_sock, &ucast_wait); + exit_mcast_sock: + sock_remove_wait_queue(varp_mcast_sock, &mcast_wait); + varp_sockets_close(); if(err){ - varp_close(); - } - dprintf("< err=%d\n", err); - return err; -} - + eprintf("%s< err=%d\n", __FUNCTION__, err); + } + varp_thread_err = err; + atomic_set(&varp_state, VARP_STATE_EXITED); + //MOD_DEC_USE_COUNT; + return err; +} + +/** Close the varp sockets and stop the thread handling them. + */ +void varp_close(void){ + int tries = 10; + dprintf(">\n"); + // Tell the varp thread to stop and wait a while for it. + atomic_set(&varp_run, 0); + while(atomic_read(&varp_state) == VARP_STATE_RUNNING && tries-- > 0){ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ / 2); + __set_current_state(TASK_RUNNING); + } + //MOD_DEC_USE_COUNT; + dprintf("<\n"); +} + +/** Open the varp sockets and start the thread handling them. 
+ * + * @param mcaddr multicast address + * @param port port + * @return 0 on success, error code otherwise + */ +int varp_open(u32 mcaddr, u16 port){ + int err = 0; + + //MOD_INC_USE_COUNT; + dprintf(">\n"); + err = varp_sockets_open(mcaddr, port); + if(err) goto exit; + atomic_set(&varp_run, 1); + atomic_set(&varp_state, VARP_STATE_NONE); + kernel_thread(varp_main, NULL, (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)); +#if 0 + while(atomic_read(&varp_state) == VARP_STATE_NONE){ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1 * HZ); + __set_current_state(TASK_RUNNING); + } + err = varp_thread_err; +#endif + exit: + if(err){ + wprintf("> err=%d\n", err); + } + return err; +} diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/varp_util.c --- a/tools/vnet/vnet-module/varp_util.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/varp_util.c Thu Feb 9 15:12:11 2006 @@ -47,7 +47,7 @@ char buf[5]; int buf_n = sizeof(buf) - 1; int i, n; - const int elts_n = 8; + const int elts_n = VNETID_SIZE16; q = s; p = strchr(q, ':'); diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/varp_util.h --- a/tools/vnet/vnet-module/varp_util.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/varp_util.h Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -84,59 +84,12 @@ */ static inline struct VnetId toVnetId(uint32_t vnetid){ struct VnetId vnet = {}; - vnet.u.vnet32[3] = htonl(vnetid); + vnet.u.vnet32[VNETID_SIZE32 - 1] = htonl(vnetid); return vnet; } -static inline uint32_t VnetId_hash(uint32_t h, VnetId *vnet) -{ - h = hash_hul(h, vnet->u.vnet32[0]); - h = hash_hul(h, vnet->u.vnet32[1]); - h = hash_hul(h, vnet->u.vnet32[2]); - h = hash_hul(h, vnet->u.vnet32[3]); - return h; -} - -static inline int 
VnetId_eq(VnetId *vnet1, VnetId *vnet2) -{ - return memcmp(vnet1, vnet2, sizeof(VnetId)) == 0; -} - -static inline uint32_t VarpAddr_hash(uint32_t h, VarpAddr *addr) -{ - h = hash_hul(h, addr->family); - if(addr->family == AF_INET6){ - h = hash_hul(h, addr->u.ip6.s6_addr32[0]); - h = hash_hul(h, addr->u.ip6.s6_addr32[1]); - h = hash_hul(h, addr->u.ip6.s6_addr32[2]); - h = hash_hul(h, addr->u.ip6.s6_addr32[3]); - } else { - h = hash_hul(h, addr->u.ip4.s_addr); - } - return h; -} - -static inline int VarpAddr_eq(VarpAddr *addr1, VarpAddr*addr2) -{ - return memcmp(addr1, addr2, sizeof(VarpAddr)) == 0; -} - -static inline uint32_t Vmac_hash(uint32_t h, Vmac *vmac) -{ - h = hash_hul(h, - (vmac->mac[0] << 24) | - (vmac->mac[1] << 16) | - (vmac->mac[2] << 8) | - (vmac->mac[3] )); - h = hash_hul(h, - (vmac->mac[4] << 8) | - (vmac->mac[5] )); - return h; -} - -static inline int Vmac_eq(Vmac *vmac1, Vmac *vmac2) -{ - return memcmp(vmac1, vmac2, sizeof(Vmac)) == 0; +static inline int VnetId_eq(VnetId *id1, VnetId *id2){ + return memcmp(id1, id2, sizeof(VnetId)) == 0; } #endif /* _VNET_VARP_UTIL_H */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/vif.c --- a/tools/vnet/vnet-module/vif.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/vif.c Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -17,61 +17,78 @@ * */ +#ifdef __KERNEL__ + #include <linux/config.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/version.h> - -#include <linux/net.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/udp.h> - -#include <net/ip.h> -#include <net/protocol.h> -#include <net/route.h> -#include <linux/skbuff.h> 
#include <linux/spinlock.h> -#include <etherip.h> -#include <if_varp.h> -#include <vnet_dev.h> +#else + +#include "sys_kernel.h" +#include "spinlock.h" +#include "skbuff.h" + +#endif + #include <vif.h> #include <varp.h> +#include <varp_util.h> #include "allocate.h" +#include "iostream.h" #include "hash_table.h" -#include "sys_net.h" -#include "sys_string.h" +#include "timer_util.h" #define MODULE_NAME "VNET" #define DEBUG 1 #undef DEBUG #include "debug.h" +/** Vif table ttl - interval between sweeps of old vifs. */ +#define VIF_TABLE_TTL (60*HZ) + +/** Vif entry ttl - a vif entry older than this is removed. */ +#define VIF_ENTRY_TTL (60*HZ) + /** Table of vifs indexed by VifKey. */ HashTable *vif_table = NULL; rwlock_t vif_table_lock = RW_LOCK_UNLOCKED; +struct timer_list vif_table_timer = {}; +int vif_table_sweeps = 0; #define vif_read_lock(flags) read_lock_irqsave(&vif_table_lock, (flags)) #define vif_read_unlock(flags) read_unlock_irqrestore(&vif_table_lock, (flags)) #define vif_write_lock(flags) write_lock_irqsave(&vif_table_lock, (flags)) #define vif_write_unlock(flags) write_unlock_irqrestore(&vif_table_lock, (flags)) -void vif_print(void){ +void vif_entry_print(Vif *vif, IOStream *io){ + char vnetbuf[VNET_ID_BUF]; + unsigned long now = jiffies; + + IOStream_print(io, "(vif\n"); + IOStream_print(io, " (vnet %s)\n", VnetId_ntoa(&vif->vnet, vnetbuf)); + IOStream_print(io, " (vmac " MACFMT ")\n", MAC6TUPLE(vif->vmac.mac)); + IOStream_print(io, " (age %u)\n", now - vif->timestamp); + IOStream_print(io, ")\n"); +} + +void vif_print(IOStream *io){ HashTable_for_decl(entry); Vif *vif; unsigned long flags; - char vnetbuf[VNET_ID_BUF]; vif_read_lock(flags); + IOStream_print(io, "(viftable\n"); + IOStream_print(io, " (table_ttl %u)\n", VIF_TABLE_TTL); + IOStream_print(io, " (entry_ttl %u)\n", VIF_ENTRY_TTL); + IOStream_print(io, " (sweeps %d)\n", vif_table_sweeps); + IOStream_print(io, ")\n"); + HashTable_for_each(entry, vif_table){ vif = entry->value; - 
printk(KERN_INFO "VIF(vnet=%s vmac=" MACFMT ")\n", - VnetId_ntoa(&vif->vnet, vnetbuf), MAC6TUPLE(vif->vmac.mac)); + vif_entry_print(vif, io); } vif_read_unlock(flags); } @@ -94,12 +111,8 @@ * @param k key (VifKey) * @return hashcode */ -Hashcode vif_key_hash_fn(void *k){ - VifKey *key = k; - Hashcode h = 0; - h = VnetId_hash(h, &key->vnet); - h = Vmac_hash(h, &key->vmac); - return h; +static Hashcode vif_key_hash_fn(void *k){ + return hash_hvoid(0, k, sizeof(VifKey)); } /** Test equality for keys in the vif table. @@ -109,11 +122,8 @@ * @param k2 key to compare (VifKey) * @return 1 if equal, 0 otherwise */ -int vif_key_equal_fn(void *k1, void *k2){ - VifKey *key1 = k1; - VifKey *key2 = k2; - return (VnetId_eq(&key1->vnet , &key2->vnet) && - Vmac_eq(&key1->vmac, &key2->vmac)); +static int vif_key_equal_fn(void *k1, void *k2){ + return memcmp(k1, k2, sizeof(VifKey)) == 0; } /** Free an entry in the vif table. @@ -132,18 +142,17 @@ } /** Lookup a vif. + * Caller must hold vif lock. * * @param vnet vnet id * @param mac MAC address * @return 0 on success, -ENOENT otherwise */ -int vif_lookup(VnetId *vnet, Vmac *vmac, Vif **vif){ +static int _vif_lookup(VnetId *vnet, Vmac *vmac, Vif **vif){ int err = 0; VifKey key = { .vnet = *vnet, .vmac = *vmac }; HTEntry *entry = NULL; - unsigned long flags; - vif_read_lock(flags); entry = HashTable_get_entry(vif_table, &key); if(entry){ *vif = entry->value; @@ -152,23 +161,39 @@ *vif = NULL; err = -ENOENT; } + return err; +} + +/** Lookup a vif. + * + * @param vnet vnet id + * @param mac MAC address + * @return 0 on success, -ENOENT otherwise + */ +int vif_lookup(VnetId *vnet, Vmac *vmac, Vif **vif){ + unsigned long flags; + int err; + + vif_read_lock(flags); + err = _vif_lookup(vnet, vmac, vif); vif_read_unlock(flags); return err; } /** Create a new vif. + * Entry must not exist. + * Caller must hold vif lock. 
* * @param vnet vnet id * @param mac MAC address * @return 0 on success, negative error code otherwise */ -int vif_add(VnetId *vnet, Vmac *vmac, Vif **val){ +static int _vif_add(VnetId *vnet, Vmac *vmac, Vif **val){ int err = 0; Vif *vif = NULL; HTEntry *entry; - unsigned long flags; - - dprintf("> vnet=%d\n", vnet); + unsigned long now = jiffies; + vif = ALLOCATE(Vif); if(!vif){ err = -ENOMEM; @@ -177,9 +202,8 @@ atomic_set(&vif->refcount, 1); vif->vnet = *vnet; vif->vmac = *vmac; - vif_write_lock(flags); + vif->timestamp = now; entry = HashTable_add(vif_table, vif, vif); - vif_write_unlock(flags); if(!entry){ err = -ENOMEM; deallocate(vif); @@ -189,15 +213,13 @@ vif_incref(vif); exit: *val = (err ? NULL : vif); - dprintf("< err=%d\n", err); - return err; -} - -/** Delete an entry. - * - * @param vnet vnet id - * @param mac MAC address - * @param coaddr return parameter for care-of address + return err; +} + +/** Delete a vif entry. + * + * @param vnet vnet id + * @param mac MAC address * @return number of entries deleted, or negative error code */ int vif_remove(VnetId *vnet, Vmac *vmac){ @@ -211,35 +233,133 @@ return err; } +/** Delete all vifs on a vnet. + * + * @param vnet vnet id + * @return number of entries deleted + */ +int vif_remove_vnet(VnetId *vnet){ + int count = 0; + unsigned long flags; + HashTable_for_decl(entry); + + + vif_write_lock(flags); + HashTable_for_each(entry, vif_table){ + Vif *vif = entry->value; + if(VnetId_eq(&vif->vnet, vnet)){ + count += HashTable_remove(vif_table, vif); + } + } + vif_write_unlock(flags); + return count; +} + +/** Purge the vif table. + */ void vif_purge(void){ + unsigned long flags; + vif_write_lock(flags); HashTable_clear(vif_table); -} - -int vif_create(VnetId *vnet, Vmac *vmac, Vif **vif){ - int err = 0; - - dprintf(">\n"); - if(vif_lookup(vnet, vmac, vif) == 0){ - vif_decref(*vif); - err = -EEXIST; + vif_write_unlock(flags); +} + +/** Sweep old vif entries from the vif table. 
+ */ +void vif_sweep(void){ + HashTable_for_decl(entry); + Vif *vif; + int vif_count = 0; + unsigned long now = jiffies; + unsigned long old = VIF_ENTRY_TTL; + unsigned long flags; + + vif_write_lock(flags); + vif_table_sweeps++; + HashTable_for_each(entry, vif_table){ + vif = entry->value; + vif_count++; + if(!(vif->flags & VIF_FLAG_PERSISTENT) + && (now - vif->timestamp > old)){ + iprintf("> Sweeping:\n"); + vif_entry_print(vif, iostdout); + HashTable_remove(vif_table, entry->key); + } + } + vif_write_unlock(flags); +} + +/** Create a new vif if it does not exist. + * Caller must hold vif lock. + * + * @param vnet vnet id + * @param mac MAC address + * @return 0 on success, negative error code otherwise + */ +int _vif_create(VnetId *vnet, Vmac *vmac, Vif **vif){ + int err = 0; + + if(_vif_lookup(vnet, vmac, vif) == 0){ goto exit; } - err = vif_add(vnet, vmac, vif); + err = _vif_add(vnet, vmac, vif); exit: - if(err){ - *vif = NULL; - } - dprintf("< err=%d\n", err); - return err; -} - + return err; +} + +/** Create a new vif if it does not exist. + * + * @param vnet vnet id + * @param mac MAC address + * @return 0 on success, negative error code otherwise + */ +int vif_create(VnetId *vnet, Vmac *vmac, int vflags, Vif **vif){ + int err = 0; + unsigned long flags; + + vif_write_lock(flags); + err = _vif_create(vnet, vmac, vif); + if(!err && *vif){ + (*vif)->flags = vflags; + } + vif_write_unlock(flags); + return err; +} + +/** Update the timestamp for a vif. 
+ * + * @param vnet vnet id + * @param mac MAC address + * @return 0 on success, negative error code otherwise + */ +int vif_update(VnetId *vnet, Vmac *vmac){ + Vif *vif = NULL; + int err = 0; + unsigned long now = jiffies; + unsigned long flags; + + vif_write_lock(flags); + err = _vif_create(vnet, vmac, &vif); + if(err) goto exit; + vif->timestamp = now; + vif_decref(vif); + exit: + vif_write_unlock(flags); + return err; +} + +static void vif_table_timer_fn(unsigned long arg){ + if(!vif_table) return; + vif_sweep(); + timer_set(&vif_table_timer, VIF_TABLE_TTL); +} + /** Initialize the vif table. * * @return 0 on success, error code otherwise */ int vif_init(void){ int err = 0; - dprintf(">\n"); vif_table = HashTable_new(0); if(!vif_table){ err = -ENOMEM; @@ -249,12 +369,18 @@ vif_table->key_hash_fn = vif_key_hash_fn; vif_table->key_equal_fn = vif_key_equal_fn; + timer_init(&vif_table_timer, vif_table_timer_fn, 0); + timer_set(&vif_table_timer, VIF_TABLE_TTL); + exit: - if(err < 0) wprintf("< err=%d\n", err); - dprintf("< err=%d\n", err); + if(err < 0){ + eprintf("> vif_init err=%d\n", err); + } return err; } void vif_exit(void){ + timer_cancel(&vif_table_timer); HashTable_free(vif_table); -} + vif_table = NULL; +} diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/vif.h --- a/tools/vnet/vnet-module/vif.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/vif.h Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -19,37 +19,44 @@ #ifndef _VNET_VIF_H_ #define _VNET_VIF_H_ +#ifdef __KERNEL__ +#include <asm/atomic.h> +#else +#include "spinlock.h" +#endif + #include <if_varp.h> -struct net_device; +struct IOStream; /** Key for entries in the vif table. 
*/ typedef struct VifKey { - VnetId vnet; - Vmac vmac; + struct VnetId vnet; + struct Vmac vmac; } VifKey; typedef struct Vif { - VnetId vnet; - Vmac vmac; - struct net_device *dev; + struct VnetId vnet; + struct Vmac vmac; atomic_t refcount; + unsigned long timestamp; + int flags; } Vif; -struct HashTable; -extern struct HashTable *vif_table; +enum { + VIF_FLAG_PERSISTENT = 1, +}; -extern void vif_print(void); +extern void vif_print(struct IOStream *io); -extern void vif_decref(Vif *vif); -extern void vif_incref(Vif *vif); +extern void vif_decref(struct Vif *vif); +extern void vif_incref(struct Vif *vif); -extern int vif_create(struct VnetId *vnet, Vmac *vmac, Vif **vif); - -extern int vif_create(VnetId *vnet, Vmac *vmac, Vif **vif); -extern int vif_add(struct VnetId *vnet, Vmac *vmac, Vif **vif); -extern int vif_lookup(struct VnetId *vnet, Vmac *vmac, Vif **vif); -extern int vif_remove(struct VnetId *vnet, Vmac *vmac); +extern int vif_create(struct VnetId *vnet, struct Vmac *vmac, int flags, struct Vif **vif); +extern int vif_lookup(struct VnetId *vnet, struct Vmac *vmac, struct Vif **vif); +extern int vif_update(struct VnetId *vnet, struct Vmac *vmac); +extern int vif_remove(struct VnetId *vnet, struct Vmac *vmac); extern void vif_purge(void); +extern int vif_remove_vnet(struct VnetId *vnet); extern int vif_init(void); extern void vif_exit(void); diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/vnet.c --- a/tools/vnet/vnet-module/vnet.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/vnet.c Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -17,6 +17,7 @@ * */ +#ifdef __KERNEL__ #include <linux/config.h> #include <linux/module.h> #include <linux/types.h> @@ -25,6 +26,7 @@ #include <linux/errno.h> 
#include <linux/string.h> +#include <linux/spinlock.h> #include <linux/net.h> #include <linux/in.h> @@ -37,6 +39,22 @@ #include <net/route.h> #include <linux/skbuff.h> #include <net/checksum.h> + + +#else + +#include <netinet/in.h> +#include <arpa/inet.h> + +#include "sys_kernel.h" +#include "spinlock.h" +#include "skbuff.h" + +#include <linux/ip.h> // For struct iphdr. + +extern int netif_rx(struct sk_buff *skb); + +#endif #include <tunnel.h> #include <sa.h> @@ -45,16 +63,22 @@ #include <esp.h> #include <etherip.h> #include <random.h> -#include <tunnel.h> + +#include <skb_context.h> #include <skb_util.h> #include <vnet_dev.h> #include <vnet.h> +#include <vnet_forward.h> #include <vif.h> #include <vnet_ioctl.h> +#include <sa.h> +#ifdef __KERNEL__ #include <sa_algorithm.h> +#endif #include "allocate.h" +#include "iostream.h" #include "hash_table.h" #include "sys_net.h" #include "sys_string.h" @@ -68,19 +92,18 @@ */ int vnet_security_default = SA_AUTH ; //| SA_CONF; -/** Key for entries in the vnet address table. */ -typedef struct VnetAddrKey { - /** Vnet id. */ - VnetId vnet; - /** MAC address. */ - unsigned char mac[ETH_ALEN]; -} VnetAddrKey; - /** The physical vnet. */ Vnet *vnet_physical = NULL; /** Table of vnets indexed by id. */ -static HashTable *vnet_table = NULL; +HashTable *vnet_table = NULL; + +rwlock_t vnet_lock = RW_LOCK_UNLOCKED; + +#define vnet_table_read_lock(flags) read_lock_irqsave(&vnet_lock, flags) +#define vnet_table_read_unlock(flags) read_unlock_irqrestore(&vnet_lock, flags) +#define vnet_table_write_lock(flags) write_lock_irqsave(&vnet_lock, flags) +#define vnet_table_write_unlock(flags) write_unlock_irqrestore(&vnet_lock, flags) /** Decrement reference count, freeing if zero. 
* @@ -89,7 +112,6 @@ void Vnet_decref(Vnet *info){ if(!info) return; if(atomic_dec_and_test(&info->refcount)){ - vnet_dev_remove(info); deallocate(info); } } @@ -103,26 +125,40 @@ atomic_inc(&info->refcount); } -void Vnet_print(Vnet *info) +void Vnet_print(Vnet *info, IOStream *io) { char vnetbuf[VNET_ID_BUF]; - - printk(KERN_INFO "VNET(vnet=%s device=%s security=%c%c)\n", - VnetId_ntoa(&info->vnet, vnetbuf), - info->device, - ((info->security & SA_AUTH) ? 'a' : '-'), - ((info->security & SA_CONF) ? 'c' : '-')); -} - -void vnet_print(void) + char *security; + + if(info->security & SA_CONF){ + security = "conf"; + } else if(info->security & SA_AUTH){ + security = "auth"; + } else { + security = "none"; + } + + IOStream_print(io, "(vnet"); + IOStream_print(io, " (id %s)", VnetId_ntoa(&info->vnet, vnetbuf)); + IOStream_print(io, " (vnetif %s)", info->device); + IOStream_print(io, " (security %s)", security); + IOStream_print(io, " (header %d)", info->header_n); + IOStream_print(io, ")"); +} + +void vnet_print(IOStream *io) { HashTable_for_decl(entry); Vnet *info; + unsigned long flags; + vnet_table_read_lock(flags); HashTable_for_each(entry, vnet_table){ info = entry->value; - Vnet_print(info); - } + Vnet_print(info, io); + IOStream_print(io, "\n"); + } + vnet_table_read_unlock(flags); } /** Allocate a vnet, setting reference count to 1. @@ -141,6 +177,21 @@ return err; } +/** Create the virtual interface for a vnet. + * + * @param info vnet + * @return 0 on success, error code otherwise + */ +int Vnet_create(Vnet *info){ + int err = 0; + + err = vnet_dev_add(info); + if(err) goto exit; + err = Vnet_add(info); + exit: + return err; +} + /** Add a vnet to the table under its vnet id. * * @param info vnet to add @@ -149,41 +200,90 @@ int Vnet_add(Vnet *info){ int err = 0; HTEntry *entry = NULL; - // Vnet_del(info->vnet); //todo: Delete existing vnet info? + unsigned long flags; + + if(Vnet_lookup(&info->vnet, NULL) == 0){ + //todo: Delete existing vnet info? 
+ err = -EEXIST; + goto exit; + } Vnet_incref(info); + vnet_table_write_lock(flags); entry = HashTable_add(vnet_table, &info->vnet, info); + vnet_table_write_unlock(flags); if(!entry){ err = -ENOMEM; + vnet_dev_remove(info); Vnet_decref(info); } + exit: return err; } /** Remove a vnet from the table. + * Also removes all vifs and varp entries for the vnet. * * @param vnet id of vnet to remove * @return number of vnets removed */ int Vnet_del(VnetId *vnet){ - return HashTable_remove(vnet_table, vnet); + int count; + unsigned long flags; + Vnet *info; + + vnet_table_write_lock(flags); + info = HashTable_get(vnet_table, vnet); + count = HashTable_remove(vnet_table, vnet); + vnet_table_write_unlock(flags); + + varp_remove_vnet(vnet); + vif_remove_vnet(vnet); + + if(info){ + // Can't do this in the hashtable entry free function because it runs + // while we hold the vnet table lock, and the vnet tidy up calls + // vnet_dev_remove(), which calls unregister_netdev(), which schedules. + vnet_dev_remove(info); + Vnet_decref(info); + } + return count; } /** Lookup a vnet by id. * References the vnet on success - the caller must decref. * * @param vnet vnet id - * @param info return parameter for vnet + * @param pinfo return parameter for vnet (or NULL) * @return 0 on sucess, -ENOENT if no vnet found */ -int Vnet_lookup(VnetId *vnet, Vnet **info){ - int err = 0; - *info = HashTable_get(vnet_table, vnet); - if(*info){ - Vnet_incref(*info); +int Vnet_lookup(VnetId *vnet, Vnet **pinfo){ + int err = 0; + unsigned long flags; + Vnet *info; + + vnet_table_read_lock(flags); + info = HashTable_get(vnet_table, vnet); + if(info){ + if(pinfo){ + Vnet_incref(info); + } } else { err = -ENOENT; } - return err; + vnet_table_read_unlock(flags); + + if(pinfo){ + *pinfo = (err ? 
NULL : info); + } + return err; +} + +static int vnet_key_equal_fn(void *k1, void *k2){ + return memcmp(k1, k2, sizeof(VnetId)) == 0; +} + +static Hashcode vnet_key_hash_fn(void *k){ + return hash_hvoid(0, k, sizeof(VnetId)); } /** Free an entry in the vnet table. @@ -192,14 +292,47 @@ * @param entry to free */ static void vnet_entry_free_fn(HashTable *table, HTEntry *entry){ - Vnet *info; if(!entry) return; - info = entry->value; - if(info){ + HTEntry_free(entry); +} + +void vnet_table_free(void){ + HashTable *vnt; + HashTable_for_decl(entry); + + vnt = vnet_table; + if(!vnt) return; + vnet_table = NULL; + HashTable_for_each(entry, vnt){ + Vnet *info = entry->value; vnet_dev_remove(info); Vnet_decref(info); } - HTEntry_free(entry); + HashTable_free(vnt); +} + +int vnet_table_init(void){ + int err = 0; + vnet_table = HashTable_new(0); + if(!vnet_table){ + err = -ENOMEM; + goto exit; + } + vnet_table->key_equal_fn = vnet_key_equal_fn; + vnet_table->key_hash_fn = vnet_key_hash_fn; + vnet_table->entry_free_fn = vnet_entry_free_fn; + + err = Vnet_alloc(&vnet_physical); + if(err) goto exit; + vnet_physical->vnet = toVnetId(VNET_PHYS); + vnet_physical->security = 0; + err = Vnet_add(vnet_physical); + + exit: + if(err){ + vnet_table_free(); + } + return err; } /** Setup some vnet entries (for testing). @@ -223,22 +356,12 @@ sprintf(vnet->device, "vnif%04x", vnetid); vnet->security = (vnetid > 10 ? security : 0); err = Vnet_create(vnet); + Vnet_decref(vnet); if(err) break; } return err; } -int vnet_key_equal_fn(void *k1, void *k2){ - VnetId *key1 = k1; - VnetId *key2 = k2; - return VnetId_eq(key1, key2); -} - -Hashcode vnet_key_hash_fn(void *k){ - VnetId *key = k; - return VnetId_hash(0, key); -} - /** Initialize the vnet table and the physical vnet. 
* * @return 0 on success, error code otherwise @@ -246,43 +369,38 @@ int vnet_init(void){ int err = 0; - vnet_table = HashTable_new(0); - if(!vnet_table){ - err = -ENOMEM; - goto exit; - } - vnet_table->key_equal_fn = vnet_key_equal_fn; - vnet_table->key_hash_fn = vnet_key_hash_fn; - vnet_table->entry_free_fn = vnet_entry_free_fn; - - err = Vnet_alloc(&vnet_physical); - if(err) goto exit; - vnet_physical->vnet = toVnetId(VNET_PHYS); - vnet_physical->security = 0; - err = Vnet_add(vnet_physical); + err = vnet_forward_init(); + if(err) goto exit; + err = vnet_table_init(); if(err) goto exit; err = vnet_setup(); if(err) goto exit; + err = vif_init(); + if(err) goto exit; err = varp_init(); - if(err) goto exit; - err = vif_init(); exit: return err; } void vnet_exit(void){ + varp_exit(); vif_exit(); - varp_exit(); - HashTable_free(vnet_table); - vnet_table = NULL; -} - -inline int skb_xmit(struct sk_buff *skb){ + vnet_table_free(); + vnet_forward_exit(); +} + +#ifdef __KERNEL__ +inline int _skb_xmit(struct sk_buff *skb, uint32_t saddr){ int err = 0; struct rtable *rt = NULL; - dprintf(">\n"); + dprintf("> src=%u.%u.%u.%u dst=%u.%u.%u.%u\n", + NIPQUAD(skb->nh.iph->saddr), + NIPQUAD(skb->nh.iph->daddr)); skb->protocol = htons(ETH_P_IP); + if(saddr){ + skb->nh.iph->saddr = 0; + } err = skb_route(skb, &rt); if(err){ wprintf("> skb_route=%d\n", err); @@ -295,6 +413,7 @@ goto exit; } + dst_release(skb->dst); skb->dst = &rt->u.dst; if(!skb->dev){ skb->dev = rt->u.dst.dev; @@ -302,18 +421,48 @@ ip_select_ident(skb->nh.iph, &rt->u.dst, NULL); - if(skb->nh.iph->saddr == 0){ - skb->nh.iph->saddr = rt->rt_src; - } - - skb->nh.iph->check = 0; - skb->nh.iph->check = ip_compute_csum(skb->nh.raw, (skb->nh.iph->ihl << 2)); - - err = neigh_compat_output(skb); - + if(saddr){ + skb->nh.iph->saddr = saddr; + } else { + if(!skb->nh.iph->saddr){ + skb->nh.iph->saddr = rt->rt_src; + } + } + + ip_send_check(skb->nh.iph); + + if(1){ + // Output to skb destination. 
Will use ip_output(), which fragments. + // Slightly slower than neigh_compat_output() (marginal - 1%). + err = dst_output(skb); + } else { + // Sends direct to device via dev_queue_xmit(). No fragmentation? + err = neigh_compat_output(skb); + } + +#if 0 + if(needs_frags){ + err = ip_fragment(skb, ip_finish_output); + } else { + err = ip_finish_output(skb); + } +#endif exit: dprintf("< err=%d\n", err); return err; +} + +#else + +extern int _skb_xmit(struct sk_buff *skb, uint32_t saddr); + +#endif + +int skb_xmit(struct sk_buff *skb){ + if(MULTICAST(skb->nh.iph->daddr)){ + vnet_forward_send(skb); + } + return _skb_xmit(skb, 0); } /** Called when a vif sends a packet to the network. @@ -322,88 +471,54 @@ * @param skb packet * @return 0 on success, error code otherwise * - * @todo fixme */ int vnet_skb_send(struct sk_buff *skb, VnetId *vnet){ - int err = 0; VnetId vnet_phys = toVnetId(VNET_PHYS); - - dprintf(">\n"); + int err = 0; + + //dprintf(">\n"); skb->dev = NULL; if(!vnet || VnetId_eq(vnet, &vnet_phys)){ // No vnet or physical vnet, send direct to the network. skb_xmit(skb); } else { + // Update the vif table with the source MAC. + vif_update(vnet, (Vmac*)eth_hdr(skb)->h_source); err = varp_output(skb, vnet); } - dprintf("< err=%d\n", err); + //dprintf("< err=%d\n", err); return err; } /** Receive an skb for a vnet. * We make the skb come out of the vif for the vnet, and * let ethernet bridging forward it to related interfaces. - * If the dest is broadcast, goes to all vifs on the vnet. - * If the dest is unicast, goes to the addressed vif on the vnet. * * The packet must have skb->mac.raw set and skb->data must point * after the device (ethernet) header. * + * Return code 1 means we now own the packet - the caller must not free it. + * Return code < 0 means an error - caller still owns the packet. 
+ * * @param skb packet * @param vnet packet vnet - * @param vmac packet vmac - * @return 0 on success, error code otherwise - */ -int vnet_skb_recv(struct sk_buff *skb, VnetId *vnet, Vmac *vmac){ - int err = 0; - Vnet *info = NULL; - - err = Vnet_lookup(vnet, &info); - if(err) goto exit; - skb->dev = info->dev; + */ +int vnet_skb_recv(struct sk_buff *skb, Vnet *vnet){ + int err = 1; + + if(!vnet->dev){ + // No device for the vnet. + err = -ENOTCONN; + goto exit; + } + skb->dev = vnet->dev; + vnet->stats.rx_packets++; + vnet->stats.rx_bytes += skb->len; netif_rx(skb); exit: - if(info) Vnet_decref(info); - if(err){ - kfree_skb(skb); - } - return err; -} - -/** Determine ESP security mode for a new SA. - * - * @param spi incoming spi - * @param protocol incoming protocol - * @param addr source address - * @return security level or negative error code - * - * @todo Need to check spi, and do some lookup for security params. - */ -int vnet_sa_security(u32 spi, int protocol, u32 addr){ - int security = vnet_security_default; - dprintf("< security=%x\n", security); - return security; -} - -/** Create a new SA for incoming traffic. - * - * @param spi incoming spi - * @param protocol incoming protocol - * @param addr source address - * @param sa return parameter for SA - * @return 0 on success, error code otherwise - */ -int vnet_sa_create(u32 spi, int protocol, u32 addr, SAState **sa){ - int err = 0; - int security = vnet_sa_security(spi, protocol, addr); - if(security < 0){ - err = security; - goto exit; - } - err = sa_create(security, spi, protocol, addr, sa); - exit: - return err; -} + return err; +} + /** Check that a context has the correct properties w.r.t. a vnet. * The context must be secure if the vnet requires security. @@ -443,104 +558,33 @@ return err; } -/** Open function for SA tunnels. 
- * - * @param tunnel to open - * @return 0 on success, error code otherwise - */ -static int sa_tunnel_open(Tunnel *tunnel){ - int err = 0; - //dprintf(">\n"); - //dprintf("< err=%d\n", err); - return err; -} - -/** Close function for SA tunnels. - * - * @param tunnel to close (OK if null) - */ -static void sa_tunnel_close(Tunnel *tunnel){ - SAState *sa; - if(!tunnel) return; - sa = tunnel->data; - if(!sa) return; - SAState_decref(sa); - tunnel->data = NULL; -} - -/** Packet send function for SA tunnels. - * - * @param tunnel to send on - * @param skb packet to send - * @return 0 on success, negative error code on error - */ -static int sa_tunnel_send(Tunnel *tunnel, struct sk_buff *skb){ - int err = -EINVAL; - SAState *sa; - if(!tunnel){ - wprintf("> Null tunnel!\n"); - goto exit; - } - sa = tunnel->data; - if(!sa){ - wprintf("> Null SA!\n"); - goto exit; - } - err = SAState_send(sa, skb, tunnel->base); - exit: - return err; -} - -/** Functions used by SA tunnels. */ -static TunnelType _sa_tunnel_type = { - .name = "SA", - .open = sa_tunnel_open, - .close = sa_tunnel_close, - .send = sa_tunnel_send -}; - -/** Functions used by SA tunnels. */ -TunnelType *sa_tunnel_type = &_sa_tunnel_type; - -/** Open a tunnel for a vnet to a given address. + +/** Create a tunnel for a vnet to a given address. * * @param vnet vnet id * @param addr destination address * @param tunnel return parameter * @return 0 on success, error code otherwise */ -int vnet_tunnel_open(VnetId *vnet, VarpAddr *addr, Tunnel **tunnel){ - extern TunnelType *etherip_tunnel_type; +static int vnet_tunnel_create(VnetId *vnet, VarpAddr *addr, Tunnel **tunnel){ int err = 0; Vnet *info = NULL; - Tunnel *base_tunnel = NULL; + Tunnel *base = NULL; Tunnel *sa_tunnel = NULL; - Tunnel *etherip_tunnel = NULL; + Tunnel *eth_tunnel = NULL; err = Vnet_lookup(vnet, &info); if(err) goto exit; if(info->security){ - SAState *sa = NULL; - //FIXME: Assuming IPv4 for now. 
- u32 ipaddr = addr->u.ip4.s_addr; - err = Tunnel_create(sa_tunnel_type, vnet, addr, base_tunnel, &sa_tunnel); + err = sa_tunnel_create(info, addr, base, &sa_tunnel); if(err) goto exit; - err = sa_create(info->security, 0, IPPROTO_ESP, ipaddr, &sa); - if(err) goto exit; - sa_tunnel->data = sa; - base_tunnel = sa_tunnel; - } - err = Tunnel_create(etherip_tunnel_type, vnet, addr, base_tunnel, &etherip_tunnel); - if(err) goto exit; - err = Tunnel_add(etherip_tunnel); + base = sa_tunnel; + } + err = etherip_tunnel_create(vnet, addr, base, &eth_tunnel); exit: Tunnel_decref(sa_tunnel); Vnet_decref(info); - if(err){ - *tunnel = NULL; - } else { - *tunnel = etherip_tunnel; - } + *tunnel = (err ? NULL : eth_tunnel); return err; } @@ -554,9 +598,9 @@ */ int vnet_tunnel_lookup(VnetId *vnet, VarpAddr *addr, Tunnel **tunnel){ int err = 0; - *tunnel = Tunnel_lookup(vnet, addr); - if(!*tunnel){ - err = vnet_tunnel_open(vnet, addr, tunnel); + err = Tunnel_lookup(vnet, addr, tunnel); + if(err){ + err = Tunnel_open(vnet, addr, vnet_tunnel_create, tunnel); } return err; } @@ -571,13 +615,27 @@ int vnet_tunnel_send(VnetId *vnet, VarpAddr *addr, struct sk_buff *skb){ int err = 0; Tunnel *tunnel = NULL; + err = vnet_tunnel_lookup(vnet, addr, &tunnel); - if(err) goto exit; + if(err) { + char vnetbuf[VNET_ID_BUF]; + char addrbuf[VARP_ADDR_BUF]; + wprintf("No tunnel: skb=%p vnet=%s addr=%s\n", + skb, + VnetId_ntoa(vnet, vnetbuf), + VarpAddr_ntoa(addr, addrbuf)); + goto exit; + } err = Tunnel_send(tunnel, skb); Tunnel_decref(tunnel); exit: return err; } + +#ifdef __KERNEL__ + +/** Module parameter for vnet encapsulation. 
*/ +static char *vnet_encaps = NULL; static void __exit vnet_module_exit(void){ ProcFS_exit(); @@ -597,6 +655,9 @@ static int __init vnet_module_init(void){ int err = 0; + if(vnet_encaps && !strcmp(vnet_encaps, "udp")){ + etherip_in_udp = 1; + } dprintf(">\n"); err = random_module_init(); if(err) wprintf("> random_module_init err=%d\n", err); @@ -629,3 +690,8 @@ module_init(vnet_module_init); module_exit(vnet_module_exit); MODULE_LICENSE("GPL"); + +MODULE_PARM(vnet_encaps, "s"); +MODULE_PARM_DESC(vnet_encaps, "Vnet encapsulation: etherip or udp."); + +#endif diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/vnet.h --- a/tools/vnet/vnet-module/vnet.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/vnet.h Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -19,61 +19,83 @@ #ifndef __VNET_VNET_H__ #define __VNET_VNET_H__ +#ifdef __KERNEL__ + #include <asm/atomic.h> #include <linux/skbuff.h> +#include <linux/if.h> +#include <linux/netdevice.h> -#include <tunnel.h> -#include <skb_context.h> +#else +#include <linux/netdevice.h> // struct net_device_stats + +struct net_device { + char name[IFNAMSIZ]; + char tap[255]; + int tapfd; +}; + +#endif + +#include <if_varp.h> + +struct sk_buff; + +struct IOStream; struct Vmac; struct Vif; -struct net_device; +struct SkbContext; +struct VarpAddr; +struct Tunnel; +struct SAState; /** Vnet property record. */ typedef struct Vnet { + /** Vnet id. */ + struct VnetId vnet; /** Reference count. */ atomic_t refcount; - /** Vnet id. */ - struct VnetId vnet; /** Security flag. If true the vnet requires ESP. */ int security; char device[IFNAMSIZ]; struct net_device *dev; - struct net_device *bridge; /** Max size of the header. */ int header_n; + int mtu; /** Statistics. 
*/ struct net_device_stats stats; int recursion; } Vnet; -extern void vnet_print(void); -extern void Vnet_print(Vnet *info); +extern void vnet_print(struct IOStream *io); +extern void Vnet_print(struct Vnet *info, struct IOStream *io); extern int Vnet_lookup(struct VnetId *vnet, struct Vnet **info); +extern int Vnet_create(struct Vnet *info); extern int Vnet_add(struct Vnet *info); extern int Vnet_del(struct VnetId *vnet); extern void Vnet_incref(struct Vnet *info); extern void Vnet_decref(struct Vnet *info); extern int Vnet_alloc(struct Vnet **info); -extern Vnet *vnet_physical; +extern struct Vnet *vnet_physical; extern int skb_xmit(struct sk_buff *skb); +extern int skb_xmit_fwd(struct sk_buff *skb); extern int vnet_skb_send(struct sk_buff *skb, struct VnetId *vnet); -extern int vnet_skb_recv(struct sk_buff *skb, struct VnetId *vnet, struct Vmac *vmac); +extern int vnet_skb_recv(struct sk_buff *skb, struct Vnet *vnet); -extern int vnet_check_context(struct VnetId *vnet, SkbContext *context, Vnet **vinfo); +extern int vnet_check_context(struct VnetId *vnet, struct SkbContext *context, struct Vnet **vinfo); -extern int vnet_tunnel_open(struct VnetId *vnet, struct VarpAddr *addr, Tunnel **tunnel); -extern int vnet_tunnel_lookup(struct VnetId *vnet, struct VarpAddr *addr, Tunnel **tunnel); +extern int vnet_tunnel_open(struct VnetId *vnet, struct VarpAddr *addr, struct Tunnel **tunnel); +extern int vnet_tunnel_lookup(struct VnetId *vnet, struct VarpAddr *addr, struct Tunnel **tunnel); extern int vnet_tunnel_send(struct VnetId *vnet, struct VarpAddr *addr, struct sk_buff *skb); extern int vnet_init(void); extern int vnet_sa_security(u32 spi, int protocol, u32 addr); -struct SAState; extern int vnet_sa_create(u32 spi, int protocol, u32 addr, struct SAState **sa); enum { @@ -81,4 +103,6 @@ VNET_VIF = 2, }; +extern struct HashTable *vnet_table; + #endif /* !__VNET_VNET_H__ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/vnet_dev.c --- 
a/tools/vnet/vnet-module/vnet_dev.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/vnet_dev.c Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -42,6 +42,7 @@ #include <varp.h> #include <vif.h> #include <vnet_dev.h> +#include <random.h> #define MODULE_NAME "VNET" #define DEBUG 1 @@ -49,51 +50,31 @@ #include "debug.h" #ifndef CONFIG_BRIDGE -#error Must configure ethernet bridging in Network Options +#warning Should configure ethernet bridging in kernel Network Options #endif static void vnet_dev_destructor(struct net_device *dev){ - dprintf(">\n"); - dev->open = NULL; - dev->stop = NULL; - dev->uninit = NULL; - dev->destructor = NULL; - dev->hard_start_xmit = NULL; - dev->get_stats = NULL; - dev->do_ioctl = NULL; - dev->change_mtu = NULL; - - dev->tx_timeout = NULL; - dev->set_multicast_list = NULL; - dev->flags = 0; - - dev->priv = NULL; -} - -static void vnet_dev_uninit(struct net_device *dev){ - //Vnet *vnet = dev->priv; - dprintf(">\n"); - //dev_put(dev); - dprintf("<\n"); + Vnet *vnet = dev->priv; + if(vnet){ + if(vnet->dev == dev){ + vnet->dev = NULL; + } + dev->priv = NULL; + Vnet_decref(vnet); + } + free_netdev(dev); } static struct net_device_stats *vnet_dev_get_stats(struct net_device *dev){ - Vnet *vnet = dev->priv; - //dprintf(">\n"); - return &vnet->stats; -} - -static int vnet_dev_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd){ - int err = 0; - - dprintf(">\n"); - return err; + static struct net_device_stats stats = {}; + Vnet *vnet = dev->priv; + return (vnet ? &vnet->stats : &stats); } static int vnet_dev_change_mtu(struct net_device *dev, int mtu){ int err = 0; Vnet *vnet = dev->priv; - if (mtu < 68 || mtu > 1500 - vnet->header_n){ + if (mtu < 68 || mtu > (vnet ? 
vnet->mtu : 1500)){ err = -EINVAL; goto exit; } @@ -102,64 +83,29 @@ return err; } -static int vnet_dev_set_name(struct net_device *dev){ - int err = 0; - Vnet *vnet = (void*)dev->priv; - - dprintf(">\n"); - if(__dev_get_by_name(vnet->device)){ - err = -ENOMEM; - wprintf("> vnet device name in use: %s\n", vnet->device); - } - strcpy(dev->name, vnet->device); - dprintf("< err=%d\n", err); - return err; -} - -/** Create the virtual interface for a vnet. - * - * @param info vnet - * @return 0 on success, error code otherwise - */ -int Vnet_create(Vnet *info){ - int err = 0; - - err = vnet_dev_add(info); - if(err) goto exit; - err = Vnet_add(info); - exit: - return err; -} - /** Remove the net device for a vnet. - * Clears the dev field of the vnet. * Safe to call if the vnet or its dev are null. * * @param vnet vnet */ void vnet_dev_remove(Vnet *vnet){ - if(!vnet) return; - if(vnet->dev){ - //dev_put(vnet->dev); - dprintf("> unregister_netdev(%s)\n", vnet->dev->name); + if(vnet && vnet->dev){ + iprintf("> Removing vnet device %s\n", vnet->dev->name); unregister_netdev(vnet->dev); - vnet->dev = NULL; } } static int vnet_dev_open(struct net_device *dev){ int err = 0; - dprintf(">\n"); + netif_start_queue(dev); - dprintf("<\n"); return err; } static int vnet_dev_stop(struct net_device *dev){ int err = 0; - dprintf(">\n"); + netif_stop_queue(dev); - dprintf("<\n"); return err; } @@ -168,25 +114,28 @@ Vnet *vnet = dev->priv; int len = 0; - dprintf("> skb=%p\n", skb); + if(!skb){ + wprintf("> skb NULL!\n"); + return -EINVAL; + } + if(!vnet){ + return -ENOTCONN; + } if(vnet->recursion++) { + extern void print_skb(const char *msg, int count, struct sk_buff *skb); + char vnetbuf[VNET_ID_BUF]; + vnet->stats.collisions++; vnet->stats.tx_errors++; - wprintf("> recursion!\n"); - dev_kfree_skb(skb); + wprintf("> recursion! 
vnet=%s\n", VnetId_ntoa(&vnet->vnet, vnetbuf)); + print_skb("RECURSION", 0, skb); + varp_print(iostdout); + kfree_skb(skb); goto exit; } - if(!skb){ - err = -EINVAL; - wprintf("> skb NULL!\n"); - goto exit; - } - dprintf("> skb->data=%p skb->mac.raw=%p\n", skb->data, skb->mac.raw); - if(skb->mac.raw < skb->data || skb->mac.raw > skb->nh.raw){ - wprintf("> skb mac duff!\n"); + if(!skb->mac.raw){ skb->mac.raw = skb->data; - } - //dev->trans_start = jiffies; + } len = skb->len; // Must not use skb pointer after vnet_skb_send(). err = vnet_skb_send(skb, &vnet->vnet); @@ -198,18 +147,22 @@ } exit: vnet->recursion--; - dprintf("<\n"); return 0; } +void vnet_dev_set_multicast_list(struct net_device *dev){ +} + +#if 0 +static int vnet_dev_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd){ + int err = 0; + + return err; +} + void vnet_dev_tx_timeout(struct net_device *dev){ - dprintf(">\n"); //dev->trans_start = jiffies; //netif_wake_queue(dev); -} - -void vnet_dev_set_multicast_list(struct net_device *dev){ - dprintf(">\n"); } static int (*eth_hard_header)(struct sk_buff *skb, @@ -227,18 +180,7 @@ exit: return err; } - -void vnet_default_mac(unsigned char *mac) -{ - static unsigned val = 1; - mac[0] = 0xAA; - mac[1] = 0xFF; - mac[2] = (unsigned char)((val >> 24) & 0xff); - mac[3] = (unsigned char)((val >> 16) & 0xff); - mac[4] = (unsigned char)((val >> 8) & 0xff); - mac[5] = (unsigned char)((val ) & 0xff); - val++; -} +#endif int vnet_device_mac(const char *device, unsigned char *mac){ int err; @@ -253,97 +195,98 @@ } void vnet_dev_mac(unsigned char *mac){ - const char *devices[] = { "eth0", "eth1", "eth2", NULL }; - const char **pdev; - int err = -ENODEV; - - for(pdev = devices; err && *pdev; pdev++){ - err = vnet_device_mac(*pdev, mac); - } - if(err){ - vnet_default_mac(mac); - } -} - -static int vnet_dev_init(struct net_device *dev){ - int err = 0; - Vnet *vnet = (void*)dev->priv; - - dprintf(">\n"); + mac[0] = 0xAA; + mac[1] = 0xFF; + get_random_bytes(mac + 
2, 4); +} + +/** Initial setup of the device for a vnet. + */ +static void vnet_dev_init(struct net_device *dev){ ether_setup(dev); +#if 0 if(!eth_hard_header){ eth_hard_header = dev->hard_header; } dev->hard_header = vnet_dev_hard_header; + //dev->do_ioctl = vnet_dev_do_ioctl; + //dev->tx_timeout = vnet_dev_tx_timeout; + //dev->watchdog_timeo = TX_TIMEOUT; + +#endif dev->open = vnet_dev_open; dev->stop = vnet_dev_stop; - dev->uninit = vnet_dev_uninit; dev->destructor = vnet_dev_destructor; dev->hard_start_xmit = vnet_dev_hard_start_xmit; dev->get_stats = vnet_dev_get_stats; - dev->do_ioctl = vnet_dev_do_ioctl; dev->change_mtu = vnet_dev_change_mtu; - - dev->tx_timeout = vnet_dev_tx_timeout; - dev->watchdog_timeo = TX_TIMEOUT; dev->set_multicast_list = vnet_dev_set_multicast_list; - - dev->hard_header_len += vnet->header_n; - dev->mtu -= vnet->header_n; - - vnet_dev_mac(dev->dev_addr); dev->flags |= IFF_DEBUG; dev->flags |= IFF_PROMISC; dev->flags |= IFF_ALLMULTI; - dprintf("<\n"); - return err; + vnet_dev_mac(dev->dev_addr); +} + +/** Complete the setup of the device for a vnet. + * Associate the device and the vnet and set mtu etc. + */ +static int vnet_dev_setup(Vnet *vnet, struct net_device *dev){ + int err; + + Vnet_incref(vnet); + dev->priv = vnet; + vnet->dev = dev; + dev->hard_header_len += vnet->header_n; + if(!etherip_in_udp){ + dev->mtu -= vnet->header_n; + } + vnet->mtu = dev->mtu; + iprintf("> Adding vnet device %s\n", dev->name); + err = register_netdev(dev); + if(err){ + eprintf("> register_netdev(%s) = %d\n", dev->name, err); + vnet_dev_destructor(dev); + } + return err; +} + +static inline int roundup(int n, int k){ + return k * ((n + k - 1) / k); } /** Add the interface (net device) for a vnet. * Sets the dev field of the vnet on success. - * Does nothing if the vif already has an interface. - * - * @param vif vif + * Does nothing if the vnet already has an interface. 
+ * + * @param vnet vnet * @return 0 on success, error code otherwise */ int vnet_dev_add(Vnet *vnet){ int err = 0; struct net_device *dev = NULL; - dprintf("> vnet=%p\n", vnet); if(vnet->dev) goto exit; - vnet->header_n = sizeof(struct iphdr) + sizeof(struct etheriphdr); - dev = kmalloc(sizeof(struct net_device), GFP_ATOMIC); + vnet->header_n = ETH_HLEN + sizeof(struct iphdr) + sizeof(struct etheriphdr); + if(etherip_in_udp){ + vnet->header_n += sizeof(struct VnetMsgHdr); + vnet->header_n += sizeof(struct udphdr); + } + vnet->header_n = roundup(vnet->header_n, 4); + dev = alloc_netdev(0, vnet->device, vnet_dev_init); if(!dev){ err = -ENOMEM; goto exit; } - *dev = (struct net_device){}; - dev->priv = vnet; - vnet->dev = dev; - - err = vnet_dev_set_name(dev); - if(err) goto exit; - vnet_dev_init(dev); - err = register_netdev(dev); - if(err){ - wprintf("> register_netdev(%s) = %d\n", dev->name, err); - } + err = vnet_dev_setup(vnet, dev); if(err) goto exit; rtnl_lock(); dev_open(dev); rtnl_unlock(); - //dev_hold(dev); - exit: - if(err){ - if(dev) kfree(dev); - vnet->dev = NULL; - } - dprintf("< err=%d\n", err); - return err; -} + exit: + return err; +} diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/vnet_dev.h --- a/tools/vnet/vnet-module/vnet_dev.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/vnet_dev.h Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -23,6 +23,5 @@ extern int vnet_dev_add(struct Vnet *vnet); extern void vnet_dev_remove(struct Vnet *vnet); -extern int Vnet_create(struct Vnet *info); #endif diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/vnet_ioctl.c --- a/tools/vnet/vnet-module/vnet_ioctl.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/vnet_ioctl.c Thu Feb 9 
15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the @@ -40,95 +40,25 @@ #include "vnet.h" #include "varp.h" #include "vnet_dev.h" - -#include "sxpr_parser.h" +#include "vnet_eval.h" +#include "vnet_forward.h" + #include "iostream.h" #include "kernel_stream.h" +#include "mem_stream.h" #include "sys_string.h" #include "sys_net.h" +#include "sxpr_parser.h" #define MODULE_NAME "VNET" #define DEBUG 1 #undef DEBUG #include "debug.h" -// Functions to manage vnets. -/* - -Have to rely on ethernet bridging being configured - but we can't rely -on the kernel interface being available to us (it's not exported @!$"%!). - -Create a vnet N: -- create the vnet device vnifN: using commands to /proc, kernel api -- create the vnet bridge vnetN: using brctl in user-space -- for best results something should keep track of the mapping vnet id <-> bridge name - -Add vif device vifD.N to vnet N. -- domain is configured with vifD.N on bridge vnetN -- vif script adds vif to bridge using brctl -- vif script detects that the bridge is a vnet bridge and - uses /proc commands to configure the mac on the vnet - -Wouldn't be hard to add support for specifying vnet keys(s) in -the control interface. - -*/ - - // id vnet id - // security security level - // ciphersuite: digest, cipher, keys?? -/* Security policy. - vnet - src: mac - dst: mac - coa: ip - Map vnet x coa -> security (none, auth, conf) - - Policy, e.g. 
- - same subnet x vnet - - diff subnet x vnet - - some subnet x vnet - - some host addr x vnet - - (security (net local) (vnet *) (mode none)) - (security (net (not local)) - - (security (addr, vnet) (local-subnet addr) none) - (security (addr, vnet) (not (local-subnet addr)) conf) - (security (addr, vnet) (host 15.144.27.80) - (security (addr, vnet) (subnet addr 15.144.24.0/24) auth) - (security (addr, vnet) t auth) - - (security (addr local) (mode none)) - (security (addr local/16) (mode none)) - (security (addr 15.144.0.0/16) (mode auth)) - (security (addr 15.0.0.0/8) (mode conf)) - (security (addr *) (mode drop)) - - ?Varp security - Use esp too - none, auth, conf, - Varp sends broadcasts (requests) and unicasts (replies). - Uses UDP. Could send over ESP if needed. - For bcast don't know where it goes, so security has to be by vnet. - For ucast know where it goes, so could do by vnet and addr. - - Similar issue for vnets: know where unicast goes but don't know where - bcast goes. - - Simplify: 2 levels - local ucast - nonlocal ucast, mcast - - (security (local none) (nonlocal conf)) - (security (local auth) (nonlocal conf)) - - VARP security matches vnet security. - - */ - /** @file * * Kernel interface to files in /proc. + * todo: Add a sysfs interface using kobject. 
*/ #define PROC_ROOT "/proc/" @@ -137,6 +67,10 @@ enum { VNET_POLICY = 1, + VNET_VNETS, + VNET_VIFS, + VNET_VARP, + VNET_PEERS, }; typedef struct proc_dir_entry ProcEntry; @@ -144,14 +78,12 @@ typedef struct file File; static int proc_open_fn(struct inode *inode, File *file); -static ssize_t proc_read_fn(File *file, char *buffer, size_t count, loff_t *offset); -static ssize_t proc_write_fn(File *file, const char *buffer, size_t count, loff_t *offset) ; +//static ssize_t proc_read_fn(File *file, char *buffer, size_t count, loff_t *offset); +//static ssize_t proc_write_fn(File *file, const char *buffer, size_t count, loff_t *offset) ; //static int proc_flush_fn(File *file); static loff_t proc_lseek_fn(File * file, loff_t offset, int orig); static int proc_ioctl_fn(struct inode *inode, File *file, unsigned opcode, unsigned long arg); -static int proc_release_fn(struct inode *inode, File *file); - -static int eval(Sxpr exp); +//static int proc_release_fn(struct inode *inode, File *file); static int ProcEntry_has_name(ProcEntry *entry, const char *name, int namelen){ dprintf("> name=%.*s entry=%.*s\n", namelen, name, entry->namelen, entry->name); @@ -164,17 +96,6 @@ // Does interface stop r/w on first error? // Is release called after an error? // - -static struct file_operations proc_file_ops = { - //owner: THIS_MODULE, - open: proc_open_fn, - read: proc_read_fn, - write: proc_write_fn, - //flush: proc_flush_fn, - llseek: proc_lseek_fn, - ioctl: proc_ioctl_fn, - release: proc_release_fn, -}; static int proc_get_parser(File *file, Parser **val){ int err = 0; @@ -200,6 +121,7 @@ // Get entry from //ProcEntry *entry = (ProcEntry *)inode->u.generic_ip; //file->private_data = NULL; + //file->f_dentry->d_ino is inode. // Check for user privilege - deny otherwise. // -EACCESS int err = 0; @@ -221,33 +143,13 @@ return count; } +#if 0 static ssize_t proc_write_fn(File *file, const char *buffer, size_t count, loff_t *offset) { - // User write. 
- // Copy data into kernel space from buffer. - // Increment offset by count, return count (or code). - int err = 0; - char *data = NULL; - Parser *parser = NULL; - - //dprintf("> count=%d\n", count); - err = proc_get_parser(file, &parser); - if(err) goto exit; - data = allocate(count); - if(!data){ - err = -ENOMEM; - goto exit; - } - err = copy_from_user(data, buffer, count); - if(err) goto exit; - *offset += count; - err = Parser_input(parser, data, count); - exit: - deallocate(data); - err = (err < 0 ? err : count); - //dprintf("< err = %d\n", err); - return err; -} + return -EINVAL; +} +#endif + #if 0 static int proc_flush_fn(File *file){ @@ -299,7 +201,33 @@ return 0; } -static int proc_release_fn(Inode *inode, File *file){ +static ssize_t proc_policy_write_fn(File *file, const char *buffer, + size_t count, loff_t *offset) { + // User write. + // Copy data into kernel space from buffer. + // Increment offset by count, return count (or code). + int err = 0; + char *data = NULL; + Parser *parser = NULL; + + err = proc_get_parser(file, &parser); + if(err) goto exit; + data = allocate(count); + if(!data){ + err = -ENOMEM; + goto exit; + } + err = copy_from_user(data, buffer, count); + if(err) goto exit; + *offset += count; + err = Parser_input(parser, data, count); + exit: + deallocate(data); + err = (err < 0 ? err : count); + return err; +} + +static int proc_policy_release_fn(Inode *inode, File *file){ // User close. // Cleanup file->private_data, return errcode. int err = 0; @@ -313,7 +241,7 @@ if(err) goto exit; obj = parser->val; for(l = obj; CONSP(l); l = CDR(l)){ - err = eval(CAR(l)); + err = vnet_eval(CAR(l), iostdout, NULL); if(err) break; } exit: @@ -322,6 +250,130 @@ dprintf("< err=%d\n", err); return err; } + +static int proc_io_open(Inode *inode, File *file, IOStream **val){ + int err = 0; + IOStream *io = mem_stream_new(); + if(!io){ + err = -ENOMEM; + goto exit; + } + file->private_data = io; + exit: + *val = (err ? 
NULL: io); + return err; +} + +static ssize_t proc_io_read_fn(File *file, char *buffer, + size_t count, loff_t *offset){ + // User read. + // Copy data to user buffer, increment offset by count, return count. + int err = 0; + char kbuf[1024] = {}; + int kbuf_n = sizeof(kbuf); + int k, n = 0; + char *ubuf = buffer; + IOStream *io = file->private_data; + + dprintf(">\n"); + if(!io) goto exit; + while(n < count){ + k = count - n; + if(k > kbuf_n){ + k = kbuf_n; + } + k = IOStream_read(io, kbuf, k); + if(k <= 0) break; + if(copy_to_user(ubuf, kbuf, k)){ + err = -EFAULT; + goto exit; + } + n += k; + ubuf += k; + } + *offset += n; + exit: + return (err ? err : n); +} + +static int proc_io_release_fn(Inode *inode, File *file){ + // User close. + int err = 0; + IOStream *io = file->private_data; + if(io) IOStream_close(io); + dprintf("< err=%d\n", err); + return err; +} + +static int proc_vnets_open_fn(Inode *inode, File *file){ + int err = 0; + IOStream *io; + if(proc_io_open(inode, file, &io)) goto exit; + vnet_print(io); + exit: + return err; +} + +static int proc_vifs_open_fn(Inode *inode, File *file){ + int err = 0; + IOStream *io; + if(proc_io_open(inode, file, &io)) goto exit; + vif_print(io); + exit: + return err; +} + +static int proc_peers_open_fn(Inode *inode, File *file){ + int err = 0; + IOStream *io; + if(proc_io_open(inode, file, &io)) goto exit; + vnet_peer_print(io); + exit: + return err; +} + +static int proc_varp_open_fn(Inode *inode, File *file){ + int err = 0; + IOStream *io; + if(proc_io_open(inode, file, &io)) goto exit; + varp_print(io); + exit: + return err; +} + +static struct file_operations proc_policy_ops = { + open: proc_open_fn, + read: proc_read_fn, + write: proc_policy_write_fn, + //flush: proc_flush_fn, + llseek: proc_lseek_fn, + ioctl: proc_ioctl_fn, + release: proc_policy_release_fn, +}; + +static struct file_operations proc_vnets_ops = { + open: proc_vnets_open_fn, + read: proc_io_read_fn, + release: proc_io_release_fn, +}; + +static 
struct file_operations proc_vifs_ops = { + open: proc_vifs_open_fn, + read: proc_io_read_fn, + release: proc_io_release_fn, +}; + +static struct file_operations proc_peers_ops = { + open: proc_peers_open_fn, + read: proc_io_read_fn, + release: proc_io_release_fn, +}; + +static struct file_operations proc_varp_ops = { + open: proc_varp_open_fn, + read: proc_io_read_fn, + release: proc_io_release_fn, +}; static ProcEntry *proc_fs_root = &proc_root; @@ -343,7 +395,6 @@ *rest = path; return err; } - /** Parse a path relative to `dir'. If dir is null or the proc root * the path is relative to "/proc/", and the leading "/proc/" may be @@ -379,13 +430,14 @@ return result; } -static ProcEntry *ProcFS_register(const char *name, ProcEntry *dir, int val){ +static ProcEntry *ProcFS_register(const char *name, ProcEntry *dir, + int val, struct file_operations *ops){ mode_t mode = 0; ProcEntry *entry; entry = create_proc_entry(name, mode, dir); if(entry){ - entry->proc_fops = &proc_file_ops; + entry->proc_fops = ops; entry->data = (void*)val; // Whatever data we need. 
} return entry; @@ -430,366 +482,22 @@ dprintf("<\n"); } -static int stringof(Sxpr exp, char **s){ - int err = 0; - if(ATOMP(exp)){ - *s = atom_name(exp); - } else if(STRINGP(exp)){ - *s = string_string(exp); - } else { - err = -EINVAL; - *s = NULL; - } - return err; -} - -static int child_string(Sxpr exp, Sxpr key, char **s){ - int err = 0; - Sxpr val = sxpr_child_value(exp, key, ONONE); - err = stringof(val, s); - return err; -} - -#if 0 -static int intof(Sxpr exp, int *v){ - int err = 0; - char *s; - unsigned long l; - if(INTP(exp)){ - *v = OBJ_INT(exp); - } else { - err = stringof(exp, &s); - if(err) goto exit; - err = convert_atoul(s, &l); - *v = (int)l; - } - exit: - return err; -} - -static int child_int(Sxpr exp, Sxpr key, int *v){ - int err = 0; - Sxpr val = sxpr_child_value(exp, key, ONONE); - err = intof(val, v); - return err; -} -#endif - -static int vnetof(Sxpr exp, VnetId *v){ - int err = 0; - char *s; - err = stringof(exp, &s); - if(err) goto exit; - err = VnetId_aton(s, v); - exit: - return err; -} - -static int child_vnet(Sxpr exp, Sxpr key, VnetId *v){ - int err = 0; - Sxpr val = sxpr_child_value(exp, key, ONONE); - err = vnetof(val, v); - return err; -} - -static int macof(Sxpr exp, unsigned char *v){ - int err = 0; - char *s; - err = stringof(exp, &s); - if(err) goto exit; - err = mac_aton(s, v); - exit: - return err; -} - -static int child_mac(Sxpr exp, Sxpr key, unsigned char *v){ - int err = 0; - Sxpr val = sxpr_child_value(exp, key, ONONE); - err = macof(val, v); - return err; -} - -static int addrof(Sxpr exp, uint32_t *v){ - int err = 0; - char *s; - unsigned long w; - err = stringof(exp, &s); - if(err) goto exit; - err = get_inet_addr(s, &w); - if(err) goto exit; - *v = (uint32_t)w; - exit: - return err; -} - -static int child_addr(Sxpr exp, Sxpr key, uint32_t *v){ - int err = 0; - Sxpr val = sxpr_child_value(exp, key, ONONE); - err = addrof(val, v); - return err; -} - -/** Create a vnet. 
- * It is an error if a vnet with the same id exists. - * - * @param vnet vnet id - * @param device vnet device name - * @param security security level - * @return 0 on success, error code otherwise - */ -static int ctrl_vnet_add(VnetId *vnet, char *device, int security){ - int err = 0; - Vnet *vnetinfo = NULL; - - if(strlen(device) >= IFNAMSIZ){ - err = -EINVAL; - goto exit; - } - if(Vnet_lookup(vnet, &vnetinfo) == 0){ - err = -EEXIST; - goto exit; - } - err = Vnet_alloc(&vnetinfo); - if(err) goto exit; - vnetinfo->vnet = *vnet; - vnetinfo->security = security; - strcpy(vnetinfo->device, device); - err = Vnet_create(vnetinfo); - exit: - if(vnetinfo) Vnet_decref(vnetinfo); - return err; -} - -/** Delete a vnet. - * - * @param vnet vnet id - * @return 0 on success, error code otherwise - */ -static int ctrl_vnet_del(VnetId *vnet){ - int err = -ENOSYS; - // Can't delete if there are any vifs on the vnet. - - // Need to flush vif entries for the deleted vnet. - // Need to flush varp entries for the deleted vnet. - // Note that (un)register_netdev() hold rtnl_lock() around - // (un)register_netdevice(). - - //Vnet_del(vnet); - return err; -} - -/** Create an entry for a vif with the given vnet and vmac. - * - * @param vnet vnet id - * @param vmac mac address - * @return 0 on success, error code otherwise - */ -static int ctrl_vif_add(VnetId *vnet, Vmac *vmac){ - int err = 0; - Vnet *vnetinfo = NULL; - Vif *vif = NULL; - - dprintf(">\n"); - err = Vnet_lookup(vnet, &vnetinfo); - if(err) goto exit; - err = vif_create(vnet, vmac, &vif); - exit: - if(vnetinfo) Vnet_decref(vnetinfo); - if(vif) vif_decref(vif); - dprintf("< err=%d\n", err); - return err; -} - -/** Delete a vif. 
- * - * @param vnet vnet id - * @param vmac mac address - * @return 0 on success, error code otherwise - */ -static int ctrl_vif_del(VnetId *vnet, Vmac *vmac){ - int err = 0; - Vnet *vnetinfo = NULL; - Vif *vif = NULL; - - dprintf(">\n"); - err = Vnet_lookup(vnet, &vnetinfo); - if(err) goto exit; - err = vif_lookup(vnet, vmac, &vif); - if(err) goto exit; - vif_remove(vnet, vmac); - exit: - if(vnetinfo) Vnet_decref(vnetinfo); - if(vif) vif_decref(vif); - dprintf("< err=%d\n", err); - return err; -} - -/** (varp.print) - */ -static int eval_varp_print(Sxpr exp){ - int err = 0; - varp_print(); - return err; -} - -/** (varp.mcaddr (addr <addr>)) - */ -static int eval_varp_mcaddr(Sxpr exp){ - int err =0; - Sxpr oaddr = intern("addr"); - uint32_t addr; - - err = child_addr(exp, oaddr, &addr); - if(err < 0) goto exit; - varp_set_mcast_addr(addr); - exit: - return err; -} - -/** (varp.flush) - */ -static int eval_varp_flush(Sxpr exp){ - int err = 0; - varp_flush(); - return err; -} - -/** (vnet.add (id <id>) - * [(vnetif <name>)] - * [(security { none | auth | conf } )] - * ) - */ -static int eval_vnet_add(Sxpr exp){ - int err = 0; - Sxpr oid = intern("id"); - Sxpr osecurity = intern("security"); - Sxpr ovnetif = intern("vnetif"); - Sxpr csecurity; - VnetId vnet = {}; - char *device = NULL; - char dev[IFNAMSIZ] = {}; - char *security = NULL; - int sec; - - err = child_vnet(exp, oid, &vnet); - if(err) goto exit; - child_string(exp, ovnetif, &device); - if(!device){ - snprintf(dev, IFNAMSIZ-1, "vnif%04x", ntohs(vnet.u.vnet16[7])); - device = dev; - } - csecurity = sxpr_child_value(exp, osecurity, intern("none")); - err = stringof(csecurity, &security); - if(err) goto exit; - if(strcmp(security, "none")==0){ - sec = 0; - } else if(strcmp(security, "auth")==0){ - sec = SA_AUTH; - } else if(strcmp(security, "conf")==0){ - sec = SA_CONF; - } else { - err = -EINVAL; - goto exit; - } - err = ctrl_vnet_add(&vnet, device, sec); - exit: - dprintf("< err=%d\n", err); - return err; -} 
- -/** Delete a vnet. - * - * (vnet.del (id <id>)) - * - * @param vnet vnet id - * @return 0 on success, error code otherwise - */ -static int eval_vnet_del(Sxpr exp){ - int err = 0; - Sxpr oid = intern("id"); - VnetId vnet = {}; - - err = child_vnet(exp, oid, &vnet); - if(err) goto exit; - err = ctrl_vnet_del(&vnet); - exit: - return err; -} - -/** (vif.add (vnet <vnet>) (vmac <macaddr>)) - */ -static int eval_vif_add(Sxpr exp){ - int err = 0; - Sxpr ovnet = intern("vnet"); - Sxpr ovmac = intern("vmac"); - VnetId vnet = {}; - Vmac vmac = {}; - - err = child_vnet(exp, ovnet, &vnet); - if(err) goto exit; - err = child_mac(exp, ovmac, vmac.mac); - if(err) goto exit; - err = ctrl_vif_add(&vnet, &vmac); - exit: - return err; -} - -/** (vif.del (vnet <vnet>) (vmac <macaddr>)) - */ -static int eval_vif_del(Sxpr exp){ - int err = 0; - Sxpr ovnet = intern("vnet"); - Sxpr ovmac = intern("vmac"); - VnetId vnet = {}; - Vmac vmac = {}; - - err = child_vnet(exp, ovnet, &vnet); - if(err) goto exit; - err = child_mac(exp, ovmac, vmac.mac); - if(err) goto exit; - err = ctrl_vif_del(&vnet, &vmac); - exit: - return err; -} - -typedef struct SxprEval { - Sxpr elt; - int (*fn)(Sxpr); -} SxprEval; - -static int eval(Sxpr exp){ - int err = 0; - SxprEval defs[] = { - { intern("varp.print"), eval_varp_print }, - { intern("varp.mcaddr"), eval_varp_mcaddr }, - { intern("varp.flush"), eval_varp_flush }, - { intern("vif.add"), eval_vif_add }, - { intern("vif.del"), eval_vif_del }, - { intern("vnet.add"), eval_vnet_add }, - { intern("vnet.del"), eval_vnet_del }, - { ONONE, NULL } }; - SxprEval *def; - - iprintf("> "); objprint(iostdout, exp, 0); IOStream_print(iostdout, "\n"); - err = -ENOSYS; - for(def = defs; !NONEP(def->elt); def++){ - if(sxpr_elementp(exp, def->elt)){ - err = def->fn(exp); - break; - } - } - iprintf("< err=%d\n", err); - return err; -} - void __init ProcFS_init(void){ ProcEntry *root_entry; ProcEntry *policy_entry; + ProcEntry *vnets_entry; + ProcEntry *vifs_entry; + 
ProcEntry *peers_entry; + ProcEntry *varp_entry; dprintf(">\n"); root_entry = ProcFS_mkdir(MODULE_ROOT, NULL); if(!root_entry) goto exit; - policy_entry = ProcFS_register("policy", root_entry, VNET_POLICY); + policy_entry = ProcFS_register("policy", root_entry, VNET_POLICY, &proc_policy_ops); + vnets_entry = ProcFS_register("vnets", root_entry, VNET_VNETS, &proc_vnets_ops); + vifs_entry = ProcFS_register("vifs", root_entry, VNET_VIFS, &proc_vifs_ops); + peers_entry = ProcFS_register("peers", root_entry, VNET_PEERS, &proc_peers_ops); + varp_entry = ProcFS_register("varp", root_entry, VNET_VARP, &proc_varp_ops); exit: dprintf("<\n"); } diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/Makefile --- a/tools/vnet/vnetd/Makefile Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnetd/Makefile Thu Feb 9 15:12:11 2006 @@ -29,6 +29,7 @@ INCLUDES += -I$(LIBXUTIL_DIR) INCLUDES += -I$(VNET_MODULE_DIR) +INCLUDES += -I$(shell pwd) #---------------------------------------------------------------------------- # GC. @@ -37,8 +38,12 @@ #LIBS += -L$(GC_LIB_DIR) CPPFLAGS += -D USE_GC +# Sometimes linux/atomic.h is not #ifdef __KERNEL__. 
+CPPFLAGS += -D __ARCH_I386_ATOMIC__ + #---------------------------------------------------------------------------- CFLAGS += -g +CFLAGS += -O2 CFLAGS += -Wall CFLAGS += $(INCLUDES) $(LIBS) @@ -49,6 +54,7 @@ PROG_DEP = .*.d vpath %.c $(LIBXUTIL_DIR) +vpath %.c $(VNET_MODULE_DIR) IPATHS:=$(INCLUDES:-I=) vpath %.h $(IPATHS) @@ -56,12 +62,25 @@ #---------------------------------------------------------------------------- VNETD_SRC:= VNETD_SRC+= connection.c -VNETD_SRC+= marshal.c VNETD_SRC+= select.c VNETD_SRC+= timer.c -VNETD_SRC+= vcache.c +VNETD_SRC+= spinlock.c +VNETD_SRC+= skbuff.c VNETD_SRC+= vnetd.c +VNETD_SRC+= skb_util.c +VNETD_SRC+= sxpr_util.c +VNETD_SRC+= timer_util.c +VNETD_SRC+= etherip.c +VNETD_SRC+= vnet.c +VNETD_SRC+= vnet_eval.c +VNETD_SRC+= vnet_forward.c +VNETD_SRC+= vif.c +VNETD_SRC+= tunnel.c +VNETD_SRC+= sa.c +VNETD_SRC+= varp.c + +#---------------------------------------------------------------------------- LIB_SRC:= LIB_SRC+= allocate.c LIB_SRC+= enum.c @@ -72,6 +91,7 @@ LIB_SRC+= socket_stream.c LIB_SRC+= string_stream.c LIB_SRC+= sxpr.c +LIB_SRC+= sxpr_parser.c LIB_SRC+= sys_net.c LIB_SRC+= sys_string.c LIB_SRC+= util.c diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/connection.c --- a/tools/vnet/vnetd/connection.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnetd/connection.c Thu Feb 9 15:12:11 2006 @@ -27,9 +27,9 @@ #include "file_stream.h" #include "socket_stream.h" -#define DEBUG +#define MODULE_NAME "conn" +#define DEBUG 1 #undef DEBUG -#define MODULE_NAME "conn" #include "debug.h" /** Initialize a file stream from a file desciptor. 
@@ -40,7 +40,7 @@ * @param io return parameter for the stream * @return 0 on success, error code otherwise */ -static int stream_init(int fd, const char *mode, int buffered, IOStream **io){ +int stream_init(int fd, const char *mode, int buffered, IOStream **io){ int err = 0; *io = file_stream_fdopen(fd, mode); if(!*io){ @@ -65,7 +65,7 @@ return err; } -ConnList * ConnList_add(Conn *conn, ConnList *l){ +ConnList * ConnList_add(ConnList *l, Conn *conn){ ConnList *v; v = ALLOCATE(ConnList); v->conn = conn; @@ -73,7 +73,58 @@ return v; } -Conn *Conn_new(int (*fn)(Conn *), void *data){ +ConnList * ConnList_del(ConnList *l, Conn *conn){ + ConnList *prev, *curr, *next; + for(prev = NULL, curr = l; curr; prev = curr, curr = next){ + next = curr->next; + if(curr->conn == conn){ + if(prev){ + prev->next = curr->next; + } else { + l = curr->next; + } + } + } + return l; +} + +void ConnList_close(ConnList *l){ + for( ; l; l = l->next){ + Conn_close(l->conn); + } +} + +void ConnList_select(ConnList *l, SelectSet *set){ + for( ; l; l = l->next){ + Conn_select(l->conn, set); + } +} + +/** Handle connections according to a select set. 
+ * + * @param set indicates ready connections + */ +ConnList * ConnList_handle(ConnList *l, SelectSet *set){ + ConnList *prev, *curr, *next; + Conn *conn; + int err; + + for(prev = NULL, curr = l; curr; prev = curr, curr = next){ + next = curr->next; + conn = curr->conn; + err = Conn_handle(conn, set); + if(err){ + if(prev){ + prev->next = curr->next; + } else { + l = curr->next; + } + } + } + return l; +} + +Conn *Conn_new(int (*fn)(Conn *conn, int mode), void *data){ Conn *conn; conn = ALLOCATE(Conn); conn->fn = fn; @@ -81,22 +132,40 @@ return conn; } -int Conn_handle(Conn *conn){ +int Conn_handler(Conn *conn, int mode){ int err = 0; dprintf(">\n"); if(conn->fn){ - err = conn->fn(conn); + err = conn->fn(conn, mode); } else { dprintf("> no handler\n"); err = -ENOSYS; } if(err < 0){ + dprintf("> err=%d, closing %d\n", err, conn->sock); Conn_close(conn); } dprintf("< err=%d\n", err); return err; } - + +int Conn_handle(Conn *conn, SelectSet *set){ + int err = 0; + int mode = SelectSet_in(set, conn->sock); + + dprintf("> sock=%d mode=%d\n", conn->sock, mode); + if(mode){ + err = Conn_handler(conn, mode); + + } + return err; +} + +void Conn_select(Conn *conn, SelectSet *set){ + dprintf("> sock=%d\n", conn->sock); + SelectSet_add(set, conn->sock, conn->mode); +} + /** Initialize a connection. 
* * @param conn connection @@ -104,10 +173,11 @@ * @param ipaddr ip address * @return 0 on success, error code otherwise */ -int Conn_init(Conn *conn, int sock, int type, struct sockaddr_in addr){ +int Conn_init(Conn *conn, int sock, int type, int mode, struct sockaddr_in addr){ int err = 0; conn->addr = addr; conn->type = type; + conn->mode = mode; conn->sock = sock; if(type == SOCK_STREAM){ err = stream_init(sock, "r", 0, &conn->in); @@ -149,9 +219,12 @@ addr_in.sin_port = port; err = connect(sock, addr, addr_n); if(err) goto exit; - err = Conn_init(conn, sock, socktype, addr_in); - exit: - if(err) eprintf("< err=%d\n", err); + err = Conn_init(conn, sock, socktype, 0, addr_in); + exit: + if(err){ + perror("Conn_connect"); + eprintf("< err=%d\n", err); + } return err; } @@ -165,3 +238,175 @@ if(conn->out) IOStream_close(conn->out); shutdown(conn->sock, 2); } + +/** Set socket option to reuse address. + */ +int setsock_reuse(int sock, int val){ + int err = 0; + err = setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)); + if(err < 0){ + err = -errno; + perror("setsockopt SO_REUSEADDR"); + } + return err; +} + +/** Set socket broadcast option. + */ +int setsock_broadcast(int sock, int val){ + int err = 0; + err = setsockopt(sock, SOL_SOCKET, SO_BROADCAST, &val, sizeof(val)); + if(err < 0){ + err = -errno; + perror("setsockopt SO_BROADCAST"); + } + return err; +} + +/** Join a socket to a multicast group. + */ +int setsock_multicast(int sock, uint32_t iaddr, uint32_t maddr){ + int err = 0; + struct ip_mreqn mreq = {}; + int mloop = 0; + // See 'man 7 ip' for these options. + mreq.imr_multiaddr.s_addr = maddr; // IP multicast address. + mreq.imr_address.s_addr = iaddr; // Interface IP address. + mreq.imr_ifindex = 0; // Interface index (0 means any). 
+ err = setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &mloop, sizeof(mloop)); + if(err < 0){ + err = -errno; + perror("setsockopt IP_MULTICAST_LOOP"); + goto exit; + } + err = setsockopt(sock, SOL_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); + if(err < 0){ + err = -errno; + perror("setsockopt IP_ADD_MEMBERSHIP"); + goto exit; + } + exit: + return err; +} + +/** Set a socket's multicast ttl (default is 1). + */ +int setsock_multicast_ttl(int sock, uint8_t ttl){ + int err = 0; + err = setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); + if(err < 0){ + err = -errno; + perror("setsockopt IP_MULTICAST_TTL"); + } + return err; +} + +int setsock_pktinfo(int sock, int val){ + int err = 0; + err = setsockopt(sock, SOL_IP, IP_PKTINFO, &val, sizeof(val)); + if(err < 0){ + err = -errno; + perror("setsockopt IP_PKTINFO"); + } + return err; +} + +char * socket_flags(int flags){ + static char s[6]; + int i = 0; + s[i++] = (flags & VSOCK_CONNECT ? 'c' : '-'); + s[i++] = (flags & VSOCK_BIND ? 'b' : '-'); + s[i++] = (flags & VSOCK_REUSE ? 'r' : '-'); + s[i++] = (flags & VSOCK_BROADCAST ? 'B' : '-'); + s[i++] = (flags & VSOCK_MULTICAST ? 'M' : '-'); + s[i++] = '\0'; + return s; +} + +/** Create a socket. + * The flags can include VSOCK_REUSE, VSOCK_BROADCAST, VSOCK_CONNECT. 
+ * + * @param socktype socket type + * @param saddr address + * @param port port + * @param flags flags + * @param val return value for the socket connection + * @return 0 on success, error code otherwise + */ +int create_socket(int socktype, uint32_t saddr, uint32_t port, int flags, int *val){ + int err = 0; + int sock = 0; + struct sockaddr_in addr_in; + struct sockaddr *addr = (struct sockaddr *)&addr_in; + socklen_t addr_n = sizeof(addr_in); + int reuse, bcast; + + //dprintf(">\n"); + reuse = (flags & VSOCK_REUSE); + bcast = (flags & VSOCK_BROADCAST); + addr_in.sin_family = AF_INET; + addr_in.sin_addr.s_addr = saddr; + addr_in.sin_port = port; + dprintf("> flags=%s addr=%s port=%d\n", socket_flags(flags), + inet_ntoa(addr_in.sin_addr), ntohs(addr_in.sin_port)); + + sock = socket(AF_INET, socktype, 0); + if(sock < 0){ + err = -errno; + goto exit; + } + if(reuse){ + err = setsock_reuse(sock, reuse); + if(err < 0) goto exit; + } + if(bcast){ + err = setsock_broadcast(sock, bcast); + if(err < 0) goto exit; + } + if(flags & VSOCK_CONNECT){ + err = connect(sock, addr, addr_n); + if(err < 0){ + err = -errno; + perror("connect"); + goto exit; + } + } + if(flags & VSOCK_BIND){ + err = bind(sock, addr, addr_n); + if(err < 0){ + err = -errno; + perror("bind"); + goto exit; + } + } + { + struct sockaddr_in self = {}; + socklen_t self_n = sizeof(self); + getsockname(sock, (struct sockaddr *)&self, &self_n); + dprintf("> sockname sock=%d addr=%s port=%d reuse=%d bcast=%d\n", + sock, inet_ntoa(self.sin_addr), ntohs(self.sin_port), + reuse, bcast); + } + exit: + *val = (err ? 
-1 : sock); + //dprintf("< err=%d\n", err); + return err; +} + +int Conn_socket(int socktype, uint32_t saddr, uint32_t port, int flags, Conn **val){ + int err; + int sock; + struct sockaddr_in addr_in; + Conn *conn; + + err = create_socket(socktype, saddr, port, flags, &sock); + if(err) goto exit; + conn = Conn_new(NULL, NULL); + addr_in.sin_family = AF_INET; + addr_in.sin_addr.s_addr = saddr; + addr_in.sin_port = port; + Conn_init(conn, sock, socktype, 0, addr_in); + exit: + *val = (err ? NULL : conn); + return err; +} diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/connection.h --- a/tools/vnet/vnetd/connection.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnetd/connection.h Thu Feb 9 15:12:11 2006 @@ -20,6 +20,7 @@ #include <netinet/in.h> #include "iostream.h" +#include "select.h" /** A connection. * The underlying transport is a socket. @@ -29,9 +30,10 @@ struct sockaddr_in addr; int sock; int type; + int mode; // select mode IOStream *in; IOStream *out; - int (*fn)(struct Conn *); + int (*fn)(struct Conn *conn, int mode); void *data; } Conn; @@ -40,12 +42,35 @@ struct ConnList *next; } ConnList; -extern ConnList * ConnList_add(Conn *conn, ConnList *l); +extern ConnList * ConnList_add(ConnList *l, Conn *conn); +extern ConnList * ConnList_del(ConnList *l, Conn *conn); +extern void ConnList_close(ConnList *l); +extern void ConnList_select(ConnList *l, SelectSet *set); +extern ConnList * ConnList_handle(ConnList *l, SelectSet *set); -extern Conn * Conn_new(int (*fn)(struct Conn *), void *data); -extern int Conn_init(Conn *conn, int sock, int type, struct sockaddr_in addr); +extern Conn * Conn_new(int (*fn)(struct Conn *conn, int mode), void *data); +extern int Conn_init(Conn *conn, int sock, int type, int mode, struct sockaddr_in addr); extern int Conn_connect(Conn *conn, int type, struct in_addr ipaddr, uint16_t port); -extern int Conn_handle(Conn *conn); +extern void Conn_select(Conn *conn, SelectSet *set); +extern int Conn_handle(Conn *conn, SelectSet *set); 
extern void Conn_close(Conn *conn); +extern int Conn_socket(int socktype, uint32_t saddr, uint32_t port, int flags, Conn **val); + +/** Socket flags. */ +enum { + VSOCK_REUSE = 1, + VSOCK_BIND = 2, + VSOCK_CONNECT = 4, + VSOCK_BROADCAST = 8, + VSOCK_MULTICAST = 16, + }; + +extern int create_socket(int socktype, uint32_t saddr, uint32_t port, int flags, int *sock); +extern int setsock_reuse(int sock, int val); +extern int setsock_broadcast(int sock, int val); +extern int setsock_multicast(int sock, uint32_t iaddr, uint32_t maddr); +extern int setsock_multicast_ttl(int sock, uint8_t ttl); +extern int setsock_pktinfo(int sock, int val); +extern char * socket_flags(int flags); #endif /* ! _VNET_CONNECTION_H_ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/select.c --- a/tools/vnet/vnetd/select.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnetd/select.c Thu Feb 9 15:12:11 2006 @@ -21,6 +21,11 @@ #include "select.h" +#define MODULE_NAME "select" +#define DEBUG +#undef DEBUG +#include "debug.h" + /** Zero all the file descriptor sets. * * @param set select set @@ -34,6 +39,26 @@ FD_ZERO(&set->er); } +/** Add a file descriptor to the set. + * + * @param set select set + * @param fd file descriptor + * @param mode mask of sets to add to + * @return 0 on success, -1 otherwise + */ +void SelectSet_add(SelectSet *set, int fd, int mode){ + if(fd < 0) return; + if(mode & SELECT_READ){ + SelectSet_add_read(set, fd); + } + if(mode & SELECT_WRITE){ + SelectSet_add_write(set, fd); + } + if(mode & SELECT_ERROR){ + SelectSet_add_error(set, fd); + } +} + /** Add a file descriptor to the write set. 
* * @param set select set @@ -41,6 +66,8 @@ * @return 0 on success, -1 otherwise */ void SelectSet_add_read(SelectSet *set, int fd){ + dprintf("> fd=%d\n", fd); + if(fd < 0) return; FD_SET(fd, &set->rd); if(fd > set->n) set->n = fd; } @@ -52,7 +79,22 @@ * @return 0 on success, -1 otherwise */ void SelectSet_add_write(SelectSet *set, int fd){ + dprintf("> fd=%d\n", fd); + if(fd < 0) return; FD_SET(fd, &set->wr); + if(fd > set->n) set->n = fd; +} + +/** Add a file descriptor to the error set. + * + * @param set select set + * @param fd file descriptor + * @return 0 on success, -1 otherwise + */ +void SelectSet_add_error(SelectSet *set, int fd){ + dprintf("> fd=%d\n", fd); + if(fd < 0) return; + FD_SET(fd, &set->er); if(fd > set->n) set->n = fd; } diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/select.h --- a/tools/vnet/vnetd/select.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnetd/select.h Thu Feb 9 15:12:11 2006 @@ -24,9 +24,37 @@ fd_set rd, wr, er; } SelectSet; +enum { + SELECT_READ = 1, + SELECT_WRITE = 2, + SELECT_ERROR = 4, +}; + extern void SelectSet_zero(SelectSet *set); +extern void SelectSet_add(SelectSet *set, int fd, int mode); extern void SelectSet_add_read(SelectSet *set, int fd); extern void SelectSet_add_write(SelectSet *set, int fd); +extern void SelectSet_add_error(SelectSet *set, int fd); extern int SelectSet_select(SelectSet *set, struct timeval *timeout); +static inline int SelectSet_in(SelectSet *set, int fd){ + return ((fd >= 0) + ? ((FD_ISSET(fd, &set->rd) ? SELECT_READ : 0) | + (FD_ISSET(fd, &set->wr) ? SELECT_WRITE : 0) | + (FD_ISSET(fd, &set->er) ? SELECT_ERROR : 0)) + : 0); +} + +static inline int SelectSet_in_read(SelectSet *set, int fd){ + return (fd >= 0) && FD_ISSET(fd, &set->rd); +} + +static inline int SelectSet_in_write(SelectSet *set, int fd){ + return (fd >= 0) && FD_ISSET(fd, &set->wr); +} + +static inline int SelectSet_in_err(SelectSet *set, int fd){ + return (fd >= 0) && FD_ISSET(fd, &set->er); +} + #endif /* ! 
_VFC_SELECT_H_ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/timer.c --- a/tools/vnet/vnetd/timer.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnetd/timer.c Thu Feb 9 15:12:11 2006 @@ -49,7 +49,7 @@ * @param expiry time (in seconds) * @return 0 on success, error code otherwise */ -static int timer_set(double expiry){ +static int itimer_set(double expiry){ struct itimerval val = {}; struct itimerval old = {}; double now, delay; @@ -69,7 +69,7 @@ return err; } -static void Timer_free(Timer *z){ +void Timer_free(Timer *z){ #ifndef USE_GC if(!z) return; deallocate(z); @@ -93,26 +93,16 @@ for(curr = timers; curr; curr = next){ next = curr->next; if(curr->expiry > now) break; - if(curr->fn) curr->fn(curr); - Timer_free(curr); + if(curr->fn) curr->fn(curr->data); } timers = curr; - timer_set((curr ? curr->expiry : 0)); + itimer_set((curr ? curr->expiry : 0)); return 0; } -Timer * Timer_set(double delay, TimerFn *fn, void *data){ - // Get 'now'. - double now = time_now(); - Timer *timer = NULL, *prev, *curr, *next; - timer = ALLOCATE(Timer); - if(!timer) goto exit; - // Add delay to now to get expiry time. - timer->expiry = now + delay; - timer->fn = fn; - timer->data = data; - +void Timer_add(Timer *timer){ // Insert timer in list ordered by (increasing) expiry time. + Timer *prev, *curr, *next; prev = NULL; for(curr = timers; curr; prev = curr, curr = next){ next = curr->next; @@ -126,7 +116,21 @@ timer->next = curr; // Set interval timer to go off for earliest expiry time. - timer_set(timer->expiry); + itimer_set(timer->expiry); +} + +Timer * Timer_set(double delay, TimerFn *fn, unsigned long data){ + // Get 'now'. + double now = time_now(); + Timer *timer = NULL; + timer = ALLOCATE(Timer); + if(!timer) goto exit; + // Add delay to now to get expiry time. 
+ timer->expiry = now + delay; + timer->fn = fn; + timer->data = data; + + Timer_add(timer); exit: return timer; } @@ -145,7 +149,6 @@ timers = curr->next; } curr->next = NULL; - Timer_free(curr); break; } } diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/timer.h --- a/tools/vnet/vnetd/timer.h Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnetd/timer.h Thu Feb 9 15:12:11 2006 @@ -21,19 +21,20 @@ struct Timer; -typedef void TimerFn(struct Timer *); +typedef void TimerFn(unsigned long); typedef struct Timer { + struct Timer *next; TimerFn *fn; - void *data; + unsigned long data; double expiry; - struct Timer *next; } Timer; extern void timer_alarm(void); extern double time_now(void); extern int process_timers(void); -extern Timer * Timer_set(double delay, TimerFn *fn, void *data); +extern Timer * Timer_set(double delay, TimerFn *fn, unsigned long data); +extern void Timer_add(Timer *timer); extern int Timer_cancel(Timer *timer); #endif /* ! _VNET_TIMER_H_ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/vnetd.c --- a/tools/vnet/vnetd/vnetd.c Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnetd/vnetd.c Thu Feb 9 15:12:11 2006 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx>. + * Copyright (C) 2005, 2006 Mike Wray <mike.wray@xxxxxx>. * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as @@ -14,96 +14,19 @@ * along with this library; if not, write to the Free Software Foundation, * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -/** @file - * - * Vnetd tcp messages: - * - * - varp request: request care-of-addr for a vif. - * If know answer, reply. If not broadcast locally. - * - * - varp announce: reply to a varp request. - * If a (local) request is pending, remember and broadcast locally. - * - * - vnet subscribe: indicate there are local vifs in a vnet (use varp announce?). - * - * - vnet forward: tunneled broadcast packet to rebroadcast. 
- * Broadcast locally (if there are vifs in the vnet). - * - * - * Vnetd udp messages (varp): - * - * - local varp request: - * If know and vif is non-local, reply. - * If know and vif is local, do nothing (but announce will reset). - * If have entry saying is local and no-one answers - remove (? or rely on entry timeout). - * If don't know and there is no (quick) local reply, forward to peers. - * - * - remote varp request: - * If know, reply. - * If don't know, query locally (and queue request). - * - * - varp announce: remember and adjust vnet subscriptions. - * Forward to peers if a request is pending. - * - * Vnetd broadcast messages (tunneling): - * - * - etherip: forward to peers (on the right vnets) - * - * - esp: forward to peers (on the right vnets) - * - * - * For etherip can tell the vnet from the header (in clear). - * But for esp can't. So should use mcast to define? Or always some clear header? - * - * Make ssl on tcp connections optional. - * - * So far have been assuming esp for security. - * But could use vnetd to forward and use ssl on the connection. - * But has usual probs with efficiency. - * However, should 'just work' if the coa for the vif has been set - * to the vnetd. How? Vnetd configured to act as gateway for - * some peers? Then would rewrite varp announce to itself and forward - * traffic to peer. - * - * Simplify - make each vnetd have one peer? - * If need to link more subnets, add vnetds? - * - * Need requests table for each tcp conn (incoming). - * - entries we want to resolve (and fwd the answer). - * - * Need requests table for the udp socket. - * - entries we want to resolve (and return the answer). - * - * Need table of entries we know. - * - from caching local announce - * - from caching announce reply to forwarded request - * - * Problem with replying to requests from the cache - if the cache - * is out of date we reply with incorrect data. So if a VM migrates - * we will advertise the old location until it times out. 
- * - * So should probably not reply out of the cache at all - but always - * query for the answer. Could query direct to old location if - * entry is valid the first time, and broadcast if no reply in timeout. - * Causes delay if migrated - may as well broadcast. - * - * Need to watch out for query loops. If have 3 vnetds A,B,C and - * A gets a query, forwards to B and C. B forwards to C, which - * forwards to A, and on forever. So if have an entry that has been - * probed, do not forward it when get another query for it. - * - * @author Mike Wray <mike.wray@xxxxxxxxxx> - */ - - #include <stdlib.h> +#include <stdbool.h> +#include <stdint.h> #include <unistd.h> #include <stdio.h> #include <getopt.h> #include <errno.h> +#include <time.h> +#include <fcntl.h> #include <sys/types.h> -#include <time.h> +#include <sys/ioctl.h> #include <sys/socket.h> +#include <sys/un.h> #include <netinet/in.h> #include <arpa/inet.h> #include <string.h> @@ -112,15 +35,21 @@ #include <sys/wait.h> #include <sys/select.h> -#include <linux/ip.h> // For struct iphdr; - +#include <asm/types.h> // For __u32 etc. + +#include <linux/ip.h> // For struct iphdr. +#include <linux/udp.h> // For struct udphdr. + +#include <linux/if.h> #include <linux/if_ether.h> -#include "if_etherip.h" -#include "if_varp.h" +#include <linux/if_tun.h> + +#include "sys_kernel.h" +#include "skbuff.h" +#include "spinlock.h" #include "allocate.h" -#include "vnetd.h" #include "file_stream.h" #include "string_stream.h" #include "socket_stream.h" @@ -128,59 +57,621 @@ #include "enum.h" #include "sxpr.h" - -#include "marshal.h" +#include "sxpr_parser.h" + #include "connection.h" #include "select.h" #include "timer.h" -#include "vcache.h" - -int create_socket(int socktype, uint32_t saddr, uint32_t port, int flags, Conn **val); - -#ifndef TRUE -#define TRUE 1 -#endif - -#ifndef FALSE -#define FALSE 0 -#endif - -/** Socket flags. 
*/ -enum { - VSOCK_REUSE=1, - VSOCK_BIND=2, - VSOCK_CONNECT=4, - VSOCK_BROADCAST=8, - VSOCK_MULTICAST=16, - }; - -#define PROGRAM "vnetd" -#define VERSION "0.1" - -#define MODULE_NAME PROGRAM -#define DEBUG + +#include "if_etherip.h" +#include "if_varp.h" +#include "varp.h" +#include "vnet.h" +#include "vnet_dev.h" +#include "vnet_eval.h" +#include "vnet_forward.h" +#include "tunnel.h" +#include "etherip.h" +#include "sxpr_util.h" + +#define MODULE_NAME "VNETD" +#define DEBUG 1 #undef DEBUG #include "debug.h" -#define OPT_PORT 'p' -#define KEY_PORT "port" -#define DOC_PORT "<port>\n\t" PROGRAM " UDP port (as a number or service name)" - -#define OPT_ADDR 'm' -#define KEY_ADDR "mcaddr" -#define DOC_ADDR "<address>\n\t" PROGRAM " multicast address" - -#define OPT_PEER 'r' -#define KEY_PEER "peer" -#define DOC_PEER "<peer>\n\t Peer " PROGRAM " to connect to (IP address or hostname)" +#define PROGRAM "vnetd" +#define VERSION "1.0" + +typedef struct Vnetd { + unsigned long port; + int ttl; + int verbose; + int etherip; + + int udp_sock; + struct sockaddr_in udp_sock_addr; + int mcast_sock; + struct sockaddr_in mcast_sock_addr; + int etherip_sock; + struct sockaddr_in etherip_sock_addr; + int unix_sock; + char *unix_path; + + int raw_sock; + + struct sockaddr_in ucast_addr; + struct sockaddr_in mcast_addr; + + HashTable *vnet_table; + + ConnList *conns; + +} Vnetd; + +Vnetd _vnetd = {}, *vnetd = &_vnetd; + +uint32_t vnetd_intf_addr(Vnetd *vnetd){ + return vnetd->ucast_addr.sin_addr.s_addr; +} + +uint32_t vnetd_mcast_addr(Vnetd *vnetd){ + return vnetd->mcast_addr.sin_addr.s_addr; +} + +void vnetd_set_mcast_addr(Vnetd *vnetd, uint32_t addr){ + varp_mcast_addr = addr; + vnetd->mcast_addr.sin_addr.s_addr = addr; +} + +uint16_t vnetd_mcast_port(Vnetd *vnetd){ + return vnetd->mcast_addr.sin_port; +} + +uint32_t vnetd_addr(void){ + return vnetd_intf_addr(vnetd); +} + +/** Open tap device. + */ +int tap_open(struct net_device *dev){ + int err; + /* IFF_TAP : Ethernet tap device. 
+ * IFF_NO_PI : Don't add packet info struct when reading. + * IFF_ONE_QUEUE: Drop packets when the dev queue is full. The driver uses + * the queue size from the device, which defaults to 1000 for etherdev. + * If not set the driver stops the device queue when it goes over + * TUN_READQ_SIZE, which is 10. Broken - makes the device stall + * under load. + */ + struct ifreq ifr = { }; + ifr.ifr_flags = (IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE); + + dprintf(">\n"); + dev->tapfd = open("/dev/net/tun", O_RDWR); + if(dev->tapfd < 0){ + err = -errno; + perror("open"); + goto exit; + } + strcpy(ifr.ifr_name, dev->name); + err = ioctl(dev->tapfd, TUNSETIFF, (void *)&ifr); + if(err < 0){ + err = -errno; + perror("ioctl"); + goto exit; + } + strcpy(dev->name, ifr.ifr_name); + dprintf("> dev=%s\n", dev->name); + // Make it non-blocking. + fcntl(dev->tapfd, F_SETFL, O_NONBLOCK); + + exit: + if(err && (dev->tapfd >= 0)){ + close(dev->tapfd); + dev->tapfd = -1; + } + dprintf("< err=%d\n", err); + return err; +} + +/** Close tap device. + */ +int tap_close(struct net_device *dev){ + int err = 0; + + if(dev->tapfd >= 0){ + err = close(dev->tapfd); + dev->tapfd = -1; + } + return err; +} + +/** Open vnif tap device for a vnet. + */ +int vnet_dev_add(struct Vnet *vnet){ + int err = 0; + struct net_device *dev = ALLOCATE(struct net_device); + strcpy(dev->name, vnet->device); + err = tap_open(dev); + if(err){ + wprintf("> Unable to open tap device.\n" + "The tun module must be loaded and\n" + "the vnet kernel module must not be loaded."); + deallocate(dev); + goto exit; + } + vnet->dev = dev; + exit: + return err; +} + +/** Close vnif tap device for a vnet. + */ +void vnet_dev_remove(struct Vnet *vnet){ + if(vnet->dev){ + tap_close(vnet->dev); + deallocate(vnet->dev); + vnet->dev = NULL; + } +} + +/** Receive decapsulated ethernet packet on skb->dev. + * Always succeeds. The skb must not be referred to after + * this is called. 
+ */ +int netif_rx(struct sk_buff *skb){ + int err = 0, n, k; + struct net_device *dev = skb->dev; + if(!dev){ + err = -ENODEV; + goto exit; + } + n = skb->tail - skb->mac.raw; + k = write(dev->tapfd, skb->mac.raw, n); + if(k < 0){ + err = -errno; + perror("write"); + } else if(k < n){ + //todo: What? + } + exit: + kfree_skb(skb); + return err; +} + +static const int SKB_SIZE = 1700; + +struct sk_buff *skb_new(void){ + return alloc_skb(SKB_SIZE, GFP_ATOMIC); +} + +/** Receive a packet and fill-in source and destination addresses. + * Just like recvfrom() but adds the destination address. + * The socket must have the IP_PKTINFO option set so that the + * destination address information is available. + * + * @param sock socket + * @param buf receive buffer + * @param len size of buffer + * @param flags receive flags + * @param from source address + * @param fromlen size of source address + * @param dest destination address + * @param destlen size of destination address + * @return number of bytes read on success, negative otherwise + */ +int recvfromdest(int sock, void *buf, size_t len, int flags, + struct sockaddr *from, socklen_t *fromlen, + struct sockaddr *dest, socklen_t *destlen){ + int ret = 0; + struct iovec iov; + struct msghdr msg; + struct cmsghdr *cmsg; + char cbuf[1024]; + struct in_pktinfo *info; + struct sockaddr_in *dest_in = (struct sockaddr_in *)dest; + + //dest_in->sin_family = AF_INET; + //dest_in->sin_port = 0; + getsockname(sock, dest, destlen); + + iov.iov_base = buf; + iov.iov_len = len; + msg.msg_name = from; + msg.msg_namelen = *fromlen; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cbuf; + msg.msg_controllen = sizeof(cbuf); + + ret = recvmsg(sock, &msg, flags); + if(ret < 0) goto exit; + *fromlen = msg.msg_namelen; + + for(cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)){ + if((cmsg->cmsg_level == SOL_IP) && (cmsg->cmsg_type == IP_PKTINFO)){ + info = (void*)CMSG_DATA(cmsg); + dest_in->sin_addr = 
info->ipi_addr; + break; + } + } + + exit: + return ret; +} + +/** Read an skb from a udp socket and fill in its headers. + */ +int skb_recv_udp(int sock, int flags, + struct sockaddr_in *peer, socklen_t *peer_n, + struct sockaddr_in *dest, socklen_t *dest_n, + struct sk_buff **pskb){ + int err = 0, n; + struct sk_buff *skb = skb_new(); + + skb->mac.raw = skb->data; + skb_reserve(skb, ETH_HLEN); + skb->nh.raw = skb->data; + skb_reserve(skb, sizeof(struct iphdr)); + // Rcvr wants skb->data pointing at the udphdr. + skb->h.raw = skb_put(skb, sizeof(struct udphdr)); + n = recvfromdest(sock, skb->tail, skb_tailroom(skb), flags, + (struct sockaddr *)peer, peer_n, + (struct sockaddr *)dest, dest_n); + if(n < 0){ + err = -errno; + //perror("recvfrom"); + goto exit; + } + dprintf("> peer=%s:%d\n", inet_ntoa(peer->sin_addr), ntohs(peer->sin_port)); + dprintf("> dest=%s:%d\n", inet_ntoa(dest->sin_addr), ntohs(dest->sin_port)); + skb_put(skb, n); + skb->protocol = skb->nh.iph->protocol = IPPROTO_UDP; + skb->nh.iph->saddr = peer->sin_addr.s_addr; + skb->h.uh->source = peer->sin_port; + skb->nh.iph->daddr = dest->sin_addr.s_addr; + skb->h.uh->dest = dest->sin_port; + exit: + if(err < 0){ + kfree_skb(skb); + *pskb = NULL; + } else { + *pskb = skb; + } + return (err < 0 ? err : n); +} + +/** Read an skb fom a raw socket and fill in its headers. + */ +int skb_recv_raw(int sock, int flags, + struct sockaddr_in *peer, socklen_t *peer_n, + struct sockaddr_in *dest, socklen_t *dest_n, + struct sk_buff **pskb){ + int err = 0, n; + struct sk_buff *skb = skb_new(); + + skb->mac.raw = skb->data; + skb_reserve(skb, ETH_HLEN); + skb->nh.raw = skb->data; + skb_reserve(skb, sizeof(struct iphdr)); + // Rcvr wants skb->data pointing after ip hdr, at raw protocol hdr. 
+ n = recvfromdest(sock, skb->tail, skb_tailroom(skb), flags, + (struct sockaddr *)peer, peer_n, + (struct sockaddr *)dest, dest_n); + if(n < 0){ + err = -errno; + //perror("recvfrom"); + goto exit; + } + skb_put(skb, n); + // On a raw socket the port in the address is the protocol. + skb->protocol = skb->nh.iph->protocol = peer->sin_port; + skb->nh.iph->saddr = peer->sin_addr.s_addr; + skb->nh.iph->daddr = dest->sin_addr.s_addr; + exit: + if(err < 0){ + kfree_skb(skb); + *pskb = NULL; + } else { + *pskb = skb; + } + return (err < 0 ? err : n); +} + +/** Read an skb from a file descriptor. + * Used for skbs coming to us from the tap device. + * The skb content is an ethernet frame. + */ +int skb_read(int fd, struct sk_buff **pskb){ + int err = 0, n; + struct sk_buff *skb = skb_new(); + + // Reserve space for the headers we will add. + skb_reserve(skb, 100); + // Rcvr will want ethhdr on the skb. + skb->mac.raw = skb->tail; + n = read(fd, skb->tail, skb_tailroom(skb)); + if(n < 0){ + err = -errno; + //perror("read"); + goto exit; + } + skb_put(skb, n); + exit: + if(err < 0){ + kfree_skb(skb); + *pskb = NULL; + } else { + *pskb = skb; + } + return (err < 0 ? err : n); +} + +/** Read an skb from the tap device for a vnet and send it. + */ +int vnet_read(Vnet *vnet){ + int err; + struct sk_buff *skb = NULL; + + err = skb_read(vnet->dev->tapfd, &skb); + if(err < 0) goto exit; + err = vnet_skb_send(skb, &vnet->vnet); + exit: + if(skb) kfree_skb(skb); + return (err < 0 ? err : 0); +} + +/** Transmit an skb to the network. + */ +int _skb_xmit(struct sk_buff *skb, uint32_t saddr){ + int err = 0; + int sock; + unsigned char *data; + struct sockaddr_in addr = { .sin_family = AF_INET }; + int flags = 0; + + if(saddr){ + dprintf("> Raw IP send\n"); + sock = vnetd->raw_sock; + skb->nh.iph->saddr = saddr; + addr.sin_addr.s_addr = skb->nh.iph->daddr; + // Should be the protocol, but is ignored. See raw(7) man page. + addr.sin_port = 0; + // Data includes the ip header. 
+ data = (void*)(skb->nh.iph); + } else { + switch(skb->nh.iph->protocol){ + case IPPROTO_UDP: + dprintf("> protocol=UDP\n"); + sock = vnetd->udp_sock; + // Data comes after the udp header. + data = (void*)(skb->h.uh + 1); + addr.sin_addr.s_addr = skb->nh.iph->daddr; + addr.sin_port = skb->h.uh->dest; + break; + case IPPROTO_ETHERIP: + dprintf("> protocol=ETHERIP\n"); + if(vnetd->etherip_sock < 0){ + err = -ENOSYS; + goto exit; + } + sock = vnetd->etherip_sock; + // Data comes after the ip header. + data = (void*)(skb->nh.iph + 1); + addr.sin_addr.s_addr = skb->nh.iph->daddr; + // Should be the protocol, but is ignored. See raw(7) man page. + addr.sin_port = 0; + break; + default: + err = -ENOSYS; + wprintf("> protocol=%d, %d\n", skb->nh.iph->protocol, skb->protocol); + goto exit; + } + } + + dprintf("> sending %d bytes to %s:%d protocol=%d\n", + skb->tail - data, + inet_ntoa(addr.sin_addr), + ntohs(addr.sin_port), + skb->nh.iph->protocol); + + err = sendto(sock, data, skb->tail - data, flags, + (struct sockaddr *)&addr, sizeof(addr)); + if(err < 0){ + err = -errno; + perror("sendto"); + } + exit: + if(err >= 0){ + // Caller will assume skb freed if no error. + kfree_skb(skb); + err = 0; + } + dprintf("< err=%d\n", err); + return err; +} + +int varp_open(uint32_t mcaddr, uint16_t port){ + return 0; +} + +void varp_close(void){ +} + +/** Create a raw socket. 
+ * + * @param protocol protocol + * @param flags flags (VSOCK_*) + * @param mcaddr multicast addr used with flag VSOCK_MULTICAST + * @param sock return value for the socket + */ +int vnetd_raw_socket(Vnetd *vnetd, int protocol, int flags, + uint32_t mcaddr, int *sock){ + int err; + int bcast = (flags & VSOCK_BROADCAST); + + err = *sock = socket(AF_INET, SOCK_RAW, protocol); + if(err < 0){ + err = -errno; + perror("socket"); + goto exit; + } + if(bcast){ + err = setsock_broadcast(*sock, bcast); + if(err < 0) goto exit; + } + if(flags & VSOCK_MULTICAST){ + err = setsock_multicast(*sock, INADDR_ANY, mcaddr); + if(err < 0) goto exit; + } + //todo ?? fcntl(*sock, F_SETFL, O_NONBLOCK); + exit: + return err; +} + +int get_dev_address(char *dev, unsigned long *addr){ + int err = 0; + int sock = -1; + struct ifreq ifreq = {}; + struct sockaddr_in *in_addr; + + sock = socket(AF_INET, SOCK_DGRAM, 0); + if(sock < 0){ + err = -errno; + goto exit; + } + strncpy(ifreq.ifr_name, dev, IFNAMSIZ); + err = ioctl(sock, SIOCGIFADDR, &ifreq); + if(err){ + err = -errno; + goto exit; + } + in_addr = (struct sockaddr_in *) &ifreq.ifr_addr; + *addr = in_addr->sin_addr.s_addr; + //iprintf("> dev=%s addr=%s\n", dev, inet_ntoa(in_addr->sin_addr)); + exit: + if(sock >= 0) close(sock); + return err; +} + +int get_intf_address(unsigned long *addr){ + int err = 0; + char *devs[] = { "xen-br0", "eth0", "eth1", "eth2", NULL }; + char **dev; + + for(dev = devs; *dev; dev++){ + err = get_dev_address(*dev, addr); + if(err == 0) goto exit; + } + err = -ENOSYS; + exit: + return err; +} + +/** Get our own address. So we can ignore broadcast traffic + * we sent ourselves. 
+ *
+ * Resolves the local host name; if that yields the loopback address
+ * (INADDR_LOOPBACK) the address of a network interface is used instead.
+ *
+ * @param addr sin_addr receives the address (other fields are not written)
+ * @return 0 on success, error code otherwise
+ */
+int get_self_addr(struct sockaddr_in *addr){
+    int err = 0;
+    char hostname[1024] = {};
+    unsigned long saddr;
+
+    // The buffer is zero-filled and one byte is held back, so the name
+    // stays terminated even if gethostname() truncates it.
+    err = gethostname(hostname, sizeof(hostname) - 1);
+    if(err){
+        err = -errno;
+        perror("gethostname");
+        goto exit;
+    }
+    err = get_host_address(hostname, &saddr);
+    if(err) goto exit;
+    // Stored before the loopback check, so the caller still sees the
+    // resolved address if the interface lookup below fails.
+    addr->sin_addr.s_addr = saddr;
+    if(saddr == htonl(INADDR_LOOPBACK)){
+        err = get_intf_address(&saddr);
+        if(err) goto exit;
+    }
+    addr->sin_addr.s_addr = saddr;
+    err = 0;
+  exit:
+    return err;
+}
+
+/** Evaluator registered for the "varp.mcaddr" configuration form.
+ *
+ * Sets the vnetd multicast address from the "addr" child of the
+ * expression and, when an integer "ttl" child is present, the ttl.
+ *
+ * @param exp expression to evaluate
+ * @param out output stream (not used)
+ * @param data the Vnetd being configured
+ * @return negative error code if the address cannot be read; otherwise
+ *         the value returned by child_addr()
+ */
+static int eval_vnetd_mcaddr(Sxpr exp, IOStream *out, void *data){
+    int err = 0;
+    Vnetd *vnetd = data;
+    Sxpr oaddr = intern("addr");
+    Sxpr ottl = intern("ttl");
+    uint32_t addr;
+    int ttl;
+
+    err = child_addr(exp, oaddr, &addr);
+    if(err < 0) goto exit;
+    vnetd_set_mcast_addr(vnetd, addr);
+    if(child_int(exp, ottl, &ttl) == 0){
+        vnetd->ttl = ttl;
+    }
+  exit:
+    return err;
+}
+
+/** Read expressions from a stream and evaluate them.
+ *
+ * Feeds the input to the parser in chunks and evaluates every complete
+ * expression with the given definitions until the parser reports eof.
+ *
+ * @param vnetd vnetd passed to the evaluators
+ * @param parser parser to use
+ * @param defs evaluator definitions
+ * @param in input stream
+ * @param out output stream passed to the evaluators
+ * @return negative error code on read or parse failure, or the first
+ *         non-zero evaluator result; otherwise the last value returned by
+ *         Parser_input() / vnet_eval_defs()
+ */
+static int vnetd_eval_io(Vnetd *vnetd, Parser *parser, SxprEval *defs,
+                         IOStream *in, IOStream *out){
+    int err = 0;
+    char buf[1024];
+    int k, n = sizeof(buf) - 1;
+
+    for( ; ; ){
+        k = IOStream_read(in, buf, n);
+        if(k < 0){
+            err = k;
+            goto exit;
+        }
+        err = Parser_input(parser, buf, k);
+        if(err < 0) goto exit;
+        while(Parser_ready(parser)){
+            Sxpr exp = Parser_get_val(parser);
+            if(NONEP(exp)) break;
+            err = vnet_eval_defs(defs, exp, out, vnetd);
+            if(err) goto exit;
+        }
+        if(Parser_at_eof(parser)) break;
+    }
+  exit:
+    return err;
+}
+
+/** Load a configuration file.
+ *
+ * Understands the forms "peer.add", "varp.mcaddr" and "vnet.add".
+ * Output produced by the evaluators goes to iostdout.
+ *
+ * @param vnetd vnetd to configure
+ * @param file path of the configuration file
+ * @return 0 on success, negative error code otherwise
+ */
+static int vnetd_configure(Vnetd *vnetd, char *file){
+    int err = 0;
+    Parser *parser = NULL;
+    IOStream *io = NULL;
+    SxprEval defs[] = {
+        { .name = intern("peer.add"),     .fn = eval_peer_add     },
+        { .name = intern("varp.mcaddr"),  .fn = eval_vnetd_mcaddr },
+        { .name = intern("vnet.add"),     .fn = eval_vnet_add     },
+        { .name = ONONE,                  .fn = NULL              } };
+
+    parser = Parser_new();
+    if(!parser){
+        err = -ENOMEM;
+        goto exit;
+    }
+    io = file_stream_fopen(file, "rb");
+    if(!io){
+        err = -errno;
+        goto exit;
+    }
+    // Report evaluation failures instead of dropping them.
+    // Only negative values are errors; a positive value is not a failure.
+    err = vnetd_eval_io(vnetd, parser, defs, io, iostdout);
+    if(err > 0) err = 0;
+  exit:
+    if(io) IOStream_close(io);
+    if(parser) Parser_free(parser);
+    return err;
+}
+
+#define OPT_MCADDR 'a'
+#define KEY_MCADDR "varp_mcaddr"
+#define DOC_MCADDR "<addr>\n\t VARP multicast address"
 #define OPT_FILE 'f' #define KEY_FILE "file" #define DOC_FILE "<file>\n\t Configuration file to load" - -#define OPT_CTRL 'c' -#define KEY_CTRL "control" -#define DOC_CTRL "<port>\n\t " PROGRAM " control port (as a number or service name)" #define OPT_HELP 'h' #define KEY_HELP "help" @@ -204,9 +695,8 @@ FILE *out = (err ? stderr : stdout); fprintf(out, "Usage: %s [options]\n", PROGRAM); - fprintf(out, "-%c, --%s %s\n", OPT_ADDR, KEY_ADDR, DOC_ADDR); - fprintf(out, "-%c, --%s %s\n", OPT_PORT, KEY_PORT, DOC_PORT); - fprintf(out, "-%c, --%s %s\n", OPT_PEER, KEY_PEER, DOC_PEER); + fprintf(out, "-%c, --%s %s\n", OPT_MCADDR, KEY_MCADDR, DOC_MCADDR); + fprintf(out, "-%c, --%s %s\n", OPT_FILE, KEY_FILE, DOC_FILE); fprintf(out, "-%c, --%s %s\n", OPT_VERBOSE, KEY_VERBOSE, DOC_VERBOSE); fprintf(out, "-%c, --%s %s\n", OPT_VERSION, KEY_VERSION, DOC_VERSION); fprintf(out, "-%c, --%s %s\n", OPT_HELP, KEY_HELP, DOC_HELP); @@ -215,9 +705,8 @@ /** Short options. Options followed by ':' take an argument. */ static char *short_opts = (char[]){ - OPT_ADDR, ':', - OPT_PORT, ':', - OPT_PEER, ':', + OPT_MCADDR, ':', + OPT_FILE, ':', OPT_HELP, OPT_VERSION, OPT_VERBOSE, @@ -225,994 +714,38 @@ /** Long options. */ static struct option const long_opts[] = { - { KEY_ADDR, required_argument, NULL, OPT_ADDR }, - { KEY_PORT, required_argument, NULL, OPT_PORT }, - { KEY_PEER, required_argument, NULL, OPT_PEER }, + { KEY_MCADDR, required_argument, NULL, OPT_MCADDR }, + { KEY_FILE, required_argument, NULL, OPT_FILE }, { KEY_HELP, no_argument, NULL, OPT_HELP }, { KEY_VERSION, no_argument, NULL, OPT_VERSION }, { KEY_VERBOSE, no_argument, NULL, OPT_VERBOSE }, { NULL, 0, NULL, 0 } }; -/** Get address of vnetd. So we can ignore broadcast traffic - * we sent ourselves. 
- * - * @param addr - * @return 0 on success, error code otherwise - */ -int get_self_addr(struct sockaddr_in *addr){ - int err = 0; - char hostname[1024] = {}; - unsigned long saddr; - - //dprintf(">\n"); - err = gethostname(hostname, sizeof(hostname) -1); - if(err) goto exit; - err = get_host_address(hostname, &saddr); - if(err == 0){ err = -ENOENT; goto exit; } - err = 0; - addr->sin_addr.s_addr = saddr; - exit: - //dprintf("< err=%d\n", err); - return err; -} - -/** Marshal a message. - * - * @param io destination - * @param msg message - * @return number of bytes written, or negative error code - */ -int VnetMsg_marshal(IOStream *io, VnetMsg *msg){ - int err = 0; - int hdr_n = sizeof(VnetMsgHdr); - - err = marshal_uint16(io, msg->hdr.id); - if(err < 0) goto exit; - err = marshal_uint16(io, msg->hdr.opcode); - if(err < 0) goto exit; - switch(msg->hdr.id){ - case VNET_VARP_ID: - err = marshal_bytes(io, ((char*)msg) + hdr_n, sizeof(VarpHdr) - hdr_n); - break; - case VNET_FWD_ID: - err = marshal_uint16(io, msg->fwd.protocol); - if(err < 0) goto exit; - err = marshal_uint16(io, msg->fwd.len); - if(err < 0) goto exit; - err = marshal_bytes(io, msg->fwd.data, msg->fwd.len); - break; - default: - err = -EINVAL; - break; - } - exit: - return err; -} - -/** Unmarshal a message. 
- * - * @param io source - * @param msg message to unmarshal into - * @return number of bytes read, or negative error code - */ -int VnetMsg_unmarshal(IOStream *io, VnetMsg *msg){ - int err = 0; - int hdr_n = sizeof(VnetMsgHdr); - - dprintf("> id\n"); - err = unmarshal_uint16(io, &msg->hdr.id); - if(err < 0) goto exit; - dprintf("> opcode\n"); - err = unmarshal_uint16(io, &msg->hdr.opcode); - if(err < 0) goto exit; - switch(msg->hdr.id){ - case VNET_VARP_ID: - msg->hdr.opcode = htons(msg->hdr.opcode); - dprintf("> varp hdr_n=%d varphdr=%d\n", hdr_n, sizeof(VarpHdr)); - err = unmarshal_bytes(io, ((char*)msg) + hdr_n, sizeof(VarpHdr) - hdr_n); - break; - case VNET_FWD_ID: - dprintf("> forward\n"); - err = unmarshal_uint16(io, &msg->fwd.protocol); - if(err < 0) goto exit; - dprintf("> forward len\n"); - err = unmarshal_uint16(io, &msg->fwd.len); - if(err < 0) goto exit; - dprintf("> forward bytes\n"); - err = unmarshal_bytes(io, msg->fwd.data, msg->fwd.len); - break; - default: - wprintf("> Invalid id %d\n", msg->hdr.id); - err = -EINVAL; - break; - } - exit: - dprintf("< err=%d \n", err); - return err; -} - -Vnetd _vnetd = {}; -Vnetd *vnetd = &_vnetd; - -/** Counter for timer alarms. - */ -static unsigned timer_alarms = 0; - -/** Set vnetd defaults. - * - * @param vnetd vnetd - */ -void vnetd_set_defaults(Vnetd *vnetd){ - *vnetd = (Vnetd){}; - vnetd->port = htons(VNETD_PORT); - vnetd->peer_port = vnetd->port; //htons(VNETD_PEER_PORT); - vnetd->verbose = FALSE; - vnetd->peers = ONULL; - vnetd->mcast_addr.sin_addr.s_addr = VARP_MCAST_ADDR; - vnetd->mcast_addr.sin_port = vnetd->port; -} - -uint32_t vnetd_mcast_addr(Vnetd *vnetd){ - return vnetd->mcast_addr.sin_addr.s_addr; -} - -uint16_t vnetd_mcast_port(Vnetd *vnetd){ - return vnetd->mcast_addr.sin_port; -} - -/** Add a connection to a peer. 
- * - * @param vnetd vnetd - * @param conn connection - */ -void connections_add(Vnetd *vnetd, Conn *conn){ - vnetd->connections = ConnList_add(conn, vnetd->connections); -} - -/** Delete a connection to a peer. - * - * @param vnetd vnetd - * @param conn connection - */ -void connections_del(Vnetd *vnetd, Conn *conn){ - ConnList *prev, *curr, *next; - for(prev = NULL, curr = vnetd->connections; curr; prev = curr, curr = next){ - next = curr->next; - if(curr->conn == conn){ - if(prev){ - prev->next = curr->next; - } else { - vnetd->connections = curr->next; - } - } - } -} - -/** Close all connections to peers. - * - * @param vnetd vnetd - */ -void connections_close_all(Vnetd *vnetd){ - ConnList *l; - for(l = vnetd->connections; l; l = l->next){ - Conn_close(l->conn); - } - vnetd->connections = NULL; -} - -/** Add peer connections to a select set. - * - * @param vnetd vnetd - * @param set select set - */ -void connections_select(Vnetd *vnetd, SelectSet *set){ - ConnList *l; - for(l = vnetd->connections; l; l = l->next){ - SelectSet_add_read(set, l->conn->sock); - } -} - -/** Handle peer connections according to a select set. - * - * @param vnetd vnetd - * @param set indicates ready connections - */ -void connections_handle(Vnetd *vnetd, SelectSet *set){ - ConnList *prev, *curr, *next; - Conn *conn; - for(prev = NULL, curr = vnetd->connections; curr; prev = curr, curr = next){ - next = curr->next; - conn = curr->conn; - if(FD_ISSET(conn->sock, &set->rd)){ - int conn_err; - conn_err = Conn_handle(conn); - if(conn_err){ - if(prev){ - prev->next = curr->next; - } else { - vnetd->connections = curr->next; - } - } - } - } -} - -/** Forward a message from a peer onto the local subnet. 
- * - * @param vnetd vnetd - * @param vmsg message - * @return 0 on success, error code otherwise - */ -int vnetd_forward_local(Vnetd *vnetd, VnetMsg *vmsg){ - int err = 0; - int sock = 0; - struct sockaddr_in addr_in; - struct sockaddr *addr = (struct sockaddr *)&addr_in; - socklen_t addr_n = sizeof(addr_in); - - dprintf(">\n"); - switch(vmsg->fwd.protocol){ - case IPPROTO_ESP: - dprintf("> ESP\n"); - sock = vnetd->esp_sock; break; - case IPPROTO_ETHERIP: - dprintf("> Etherip\n"); - sock = vnetd->etherip_sock; break; - default: - err = -EINVAL; - goto exit; - } - addr_in.sin_family = AF_INET; - addr_in.sin_addr = vnetd->mcast_addr.sin_addr; - addr_in.sin_port = htons(vmsg->fwd.protocol); - dprintf("> send dst=%s protocol=%d len=%d\n", - inet_ntoa(addr_in.sin_addr), vmsg->fwd.protocol, vmsg->fwd.len); - err = sendto(sock, vmsg->fwd.data, vmsg->fwd.len, 0, addr, addr_n); - exit: - dprintf("< err=%d\n", err); - return err; -} - -/** Forward a message to a peer. - * - * @param conn peer connection - * @param protocol message protocol - * @param data message data - * @param data_n message size - * @return 0 on success, error code otherwise - */ -int vnetd_forward_peer(Conn *conn, int protocol, void *data, int data_n){ - int err = 0; - IOStream _io, *io = &_io; - StringData sdata; - char buf[1600]; - - dprintf("> addr=%s protocol=%d n=%d\n", - inet_ntoa(conn->addr.sin_addr), protocol, data_n); - string_stream_init(io, &sdata, buf, sizeof(buf)); - err = marshal_uint16(io, VNET_FWD_ID); - if(err < 0) goto exit; - err = marshal_uint16(io, 0); - if(err < 0) goto exit; - err = marshal_uint16(io, protocol); - if(err < 0) goto exit; - err = marshal_uint16(io, data_n); - if(err < 0) goto exit; - err = marshal_bytes(io, data, data_n); - if(err < 0) goto exit; - err = IOStream_write(conn->out, buf, IOStream_get_written(io)); - IOStream_flush(conn->out); - exit: - if(err < 0) perror(__FUNCTION__); - dprintf("< err=%d\n", err); - return err; -} - -/** Forward a message to all 
peers. - * - * @param vnetd vnetd - * @param protocol message protocol - * @param data message data - * @param data_n message size - * @return 0 on success, error code otherwise - */ -int vnetd_forward_peers(Vnetd *vnetd, int protocol, void *data, int data_n){ - int err = 0; - ConnList *curr, *next; - - dprintf(">\n"); - for(curr = vnetd->connections; curr; curr = next){ - next = curr->next; - vnetd_forward_peer(curr->conn, protocol, data, data_n); - } - dprintf("< err=%d\n", err); - return err; -} - -/** Handler for a peer connection. - * Reads a VnetMsg from the connection and handles it. - * - * @param conn peer connection - * @return 0 on success, error code otherwise - */ -int conn_handle_fn(Conn *conn){ - int err = 0; - VnetMsg *vmsg = ALLOCATE(VnetMsg); - IPMessage *msg = NULL; - - dprintf("> addr=%s port=%u\n", - inet_ntoa(conn->addr.sin_addr), - ntohs(conn->addr.sin_port)); - err = VnetMsg_unmarshal(conn->in, vmsg); - if(err < 0){ - wprintf("> Unmarshal error %d\n", err); - goto exit; - } - switch(vmsg->hdr.id){ - case VNET_VARP_ID: - dprintf("> Got varp message\n"); - msg = ALLOCATE(IPMessage); - msg->conn = conn; - msg->saddr = conn->addr; - msg->data = vmsg; - err = vcache_handle_message(msg, 0); - err = 0; - break; - case VNET_FWD_ID: - dprintf("> Got forward message\n"); - err = vnetd_forward_local(vnetd, vmsg); - err = 0; - break; - default: - wprintf("> Invalid id=%d\n", vmsg->hdr.id); - err = -EINVAL; - break; - } - exit: - dprintf("< err=%d\n", err); - return err; -} - -/** Accept an incoming tcp connection from a peer vnetd. 
- * - * @param sock tcp socket - * @return 0 on success, error code otherwise - */ -int vnetd_accept(Vnetd *vnetd, Conn *conn){ - Conn *new_conn = NULL; - struct sockaddr_in peer_in; - struct sockaddr *peer = (struct sockaddr *)&peer_in; - socklen_t peer_n = sizeof(peer_in); - int peersock; - int err = 0; - - //dprintf(">\n"); - new_conn = Conn_new(conn_handle_fn, vnetd); - //dprintf("> accept...\n"); - peersock = accept(conn->sock, peer, &peer_n); - //dprintf("> accept=%d\n", peersock); - if(peersock < 0){ - perror("accept"); - err = -errno; - goto exit; - } - iprintf("> Accepted connection from %s:%d\n", - inet_ntoa(peer_in.sin_addr), htons(peer_in.sin_port)); - err = Conn_init(new_conn, peersock, SOCK_STREAM, peer_in); - if(err) goto exit; - connections_add(vnetd, new_conn); - exit: - if(err){ - Conn_close(new_conn); - } - if(err < 0) wprintf("< err=%d\n", err); - return err; -} - -/** Connect to a peer vnetd. - * - * @param vnetd vnetd - * @param addr address - * @param port port - * @return 0 on success, error code otherwise - */ -int vnetd_connect(Vnetd *vnetd, struct in_addr addr, uint16_t port){ - Conn *conn = NULL; - int err = 0; - - //dprintf(">\n"); - conn = Conn_new(conn_handle_fn, vnetd); - err = Conn_connect(conn, SOCK_STREAM, addr, port); - if(err) goto exit; - connections_add(vnetd, conn); - exit: - if(err){ - Conn_close(conn); - } - //dprintf(" < err=%d\n", err); - return err; -} - -/** Handle a message on the udp socket. - * Expecting to see VARP messages only. 
- * - * @param sock udp socket - * @return 0 on success, error code otherwise - */ -int vnetd_handle_udp(Vnetd *vnetd, Conn *conn){ - int err = 0, rcv = 0; - struct sockaddr_in self_in; - struct sockaddr_in peer_in; - struct sockaddr *peer = (struct sockaddr *)&peer_in; - socklen_t peer_n = sizeof(peer_in); - VnetMsg *vmsg = NULL; - void *data; - int data_n; - int flags = 0; - IPMessage *msg = NULL; - - //dprintf(">\n"); - self_in = vnetd->addr; - vmsg = ALLOCATE(VnetMsg); - data = &vmsg->varp.varph; - data_n = sizeof(VarpHdr); - rcv = recvfrom(conn->sock, data, data_n, flags, peer, &peer_n); - if(rcv < 0){ - err = rcv; - goto exit; - } - dprintf("> Received %d bytes from %s:%d\n", - rcv, inet_ntoa(peer_in.sin_addr), htons(peer_in.sin_port)); - if(rcv != data_n){ - err = -EINVAL; - goto exit; - } - if(peer_in.sin_addr.s_addr == self_in.sin_addr.s_addr){ - //dprintf("> Ignoring message from self.\n"); - goto exit; - } - msg = ALLOCATE(IPMessage); - msg->conn = conn; - msg->saddr = peer_in; - msg->data = vmsg; - - err = vcache_handle_message(msg, 1); - exit: - //dprintf("< err=%d\n", err); - return err; -} - -/** Handle a message on a raw socket. - * Only deals with etherip and esp. - * Forwards messages to peers. 
- * - * @param vnetd vnetd - * @param sock socket - * @param protocol protocol - * @return 0 on success, error code otherwise - */ -int vnetd_handle_protocol(Vnetd *vnetd, int sock, int protocol){ - int err = 0, rcv = 0; - struct sockaddr_in self_in; - struct sockaddr_in peer_in; - struct sockaddr *peer = (struct sockaddr *)&peer_in; - socklen_t peer_n = sizeof(peer_in); - uint8_t buf[VNET_FWD_MAX]; - int buf_n = sizeof(buf); - char *data, *end; - int flags = 0; - struct iphdr *iph = NULL; - - //dprintf(">\n"); - self_in = vnetd->addr; - rcv = recvfrom(sock, buf, buf_n, flags, peer, &peer_n); - if(rcv < 0){ - err = rcv; - goto exit; - } - dprintf("> Received %d bytes from %s protocol=%d\n", - rcv, inet_ntoa(peer_in.sin_addr), protocol); - if(rcv < sizeof(struct iphdr)){ - wprintf("> Message too short for IP header\n"); - err = -EINVAL; - goto exit; - } - if(peer_in.sin_addr.s_addr == self_in.sin_addr.s_addr){ - dprintf("> Ignoring message from self.\n"); - goto exit; - } - data = buf; - end = buf + rcv; - iph = (void*)data; - data += (iph->ihl << 2); - vnetd_forward_peers(vnetd, protocol, data, end - data); - exit: - //dprintf("< err=%d\n", err); - return err; -} - -/** Socket select loop. - * Accepts connections on the tcp socket and handles - * messages on the other sockets. 
- * - * @return 0 on success, error code otherwise - */ -int vnetd_select(Vnetd *vnetd){ - int err = 0; - SelectSet set = {}; - while(1){ - SelectSet_zero(&set); - SelectSet_add_read(&set, vnetd->udp_conn->sock); - SelectSet_add_read(&set, vnetd->bcast_conn->sock); - SelectSet_add_read(&set, vnetd->etherip_sock); - SelectSet_add_read(&set, vnetd->esp_sock); - SelectSet_add_read(&set, vnetd->listen_conn->sock); - connections_select(vnetd, &set); - err = SelectSet_select(&set, NULL); - if(err == 0) continue; - if(err < 0){ - if(errno == EINTR){ - if(timer_alarms){ - timer_alarms = 0; - process_timers(); - } - continue; - } - perror("select"); - goto exit; - } - if(FD_ISSET(vnetd->udp_conn->sock, &set.rd)){ - vnetd_handle_udp(vnetd, vnetd->udp_conn); - } - if(FD_ISSET(vnetd->bcast_conn->sock, &set.rd)){ - vnetd_handle_udp(vnetd, vnetd->bcast_conn); - } - if(FD_ISSET(vnetd->etherip_sock, &set.rd)){ - vnetd_handle_protocol(vnetd, vnetd->etherip_sock, IPPROTO_ETHERIP); - } - if(FD_ISSET(vnetd->esp_sock, &set.rd)){ - vnetd_handle_protocol(vnetd, vnetd->esp_sock, IPPROTO_ESP); - } - connections_handle(vnetd, &set); - if(FD_ISSET(vnetd->listen_conn->sock, &set.rd)){ - vnetd_accept(vnetd, vnetd->listen_conn); - } - } - exit: - return err; -} - -/** Set socket option to reuse address. - */ -int setsock_reuse(int sock, int reuse){ - int err = 0; - err = setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)); - if(err < 0){ - err = -errno; - perror("setsockopt SO_REUSEADDR"); - } - return err; -} - -/** Set socket broadcast option. - */ -int setsock_broadcast(int sock, int bcast){ - int err = 0; - err = setsockopt(sock, SOL_SOCKET, SO_BROADCAST, &bcast, sizeof(bcast)); - if(err < 0){ - err = -errno; - perror("setsockopt SO_BROADCAST"); - } - return err; -} - -/** Join a socket to a multicast group. - */ -int setsock_multicast(int sock, uint32_t saddr){ - int err = 0; - struct ip_mreqn mreq = {}; - int mloop = 0; - // See 'man 7 ip' for these options. 
- mreq.imr_multiaddr.s_addr = saddr; // IP multicast address. - mreq.imr_address = vnetd->addr.sin_addr; // Interface IP address. - mreq.imr_ifindex = 0; // Interface index (0 means any). - err = setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &mloop, sizeof(mloop)); - if(err < 0){ - err = -errno; - perror("setsockopt IP_MULTICAST_LOOP"); - goto exit; - } - err = setsockopt(sock, SOL_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); - if(err < 0){ - err = -errno; - perror("setsockopt IP_ADD_MEMBERSHIP"); - goto exit; - } - exit: - return err; -} - -/** Set a socket's multicast ttl (default is 1). - */ -int setsock_multicast_ttl(int sock, uint8_t ttl){ - int err = 0; - err = setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); - if(err < 0){ - err = -errno; - perror("setsockopt IP_MULTICAST_TTL"); - } - return err; -} - - -char * socket_flags(int flags){ - static char s[6]; - int i = 0; - s[i++] = (flags & VSOCK_CONNECT ? 'c' : '-'); - s[i++] = (flags & VSOCK_BIND ? 'b' : '-'); - s[i++] = (flags & VSOCK_REUSE ? 'r' : '-'); - s[i++] = (flags & VSOCK_BROADCAST ? 'B' : '-'); - s[i++] = (flags & VSOCK_MULTICAST ? 'M' : '-'); - s[i++] = '\0'; - return s; -} - -/** Create a socket. - * The flags can include VSOCK_REUSE, VSOCK_BROADCAST, VSOCK_CONNECT. 
- * - * @param socktype socket type - * @param saddr address - * @param port port - * @param flags flags - * @param val return value for the socket connection - * @return 0 on success, error code otherwise - */ -int create_socket(int socktype, uint32_t saddr, uint32_t port, int flags, Conn **val){ - int err = 0; - int sock = 0; - struct sockaddr_in addr_in; - struct sockaddr *addr = (struct sockaddr *)&addr_in; - socklen_t addr_n = sizeof(addr_in); - Conn *conn = NULL; - int reuse, bcast; - - //dprintf(">\n"); - reuse = (flags & VSOCK_REUSE); - bcast = (flags & VSOCK_BROADCAST); - addr_in.sin_family = AF_INET; - addr_in.sin_addr.s_addr = saddr; - addr_in.sin_port = port; - dprintf("> flags=%s addr=%s port=%d\n", socket_flags(flags), - inet_ntoa(addr_in.sin_addr), ntohs(addr_in.sin_port)); - - sock = socket(AF_INET, socktype, 0); - if(sock < 0){ - err = -errno; - goto exit; - } - if(reuse){ - err = setsock_reuse(sock, reuse); - if(err < 0) goto exit; - } - if(bcast){ - err = setsock_broadcast(sock, bcast); - if(err < 0) goto exit; - } - if(flags & VSOCK_MULTICAST){ - err = setsock_multicast(sock, saddr); - if(err < 0) goto exit; - } - if(flags & VSOCK_CONNECT){ - err = connect(sock, addr, addr_n); - if(err < 0){ - err = -errno; - perror("connect"); - goto exit; - } - } - if(flags & VSOCK_BIND){ - err = bind(sock, addr, addr_n); - if(err < 0){ - err = -errno; - perror("bind"); - goto exit; - } - } - conn = Conn_new(NULL, NULL); - Conn_init(conn, sock, socktype, addr_in); - { - struct sockaddr_in self = {}; - socklen_t self_n; - getsockname(conn->sock, (struct sockaddr *)&self, &self_n); - dprintf("> sockname sock=%d addr=%s port=%d\n", - conn->sock, inet_ntoa(self.sin_addr), ntohs(self.sin_port)); - } - exit: - *val = (err ? NULL : conn); - //dprintf("< err=%d\n", err); - return err; -} - -/** Create the tcp listen socket. 
- * - * @param vnetd program arguments - * @param val return value for the socket - * @return 0 on success, error code otherwise - */ -int vnetd_listen_conn(Vnetd *vnetd, Conn **val){ - int err = 0; - int flags = VSOCK_BIND | VSOCK_REUSE; - //dprintf(">\n"); - err = create_socket(SOCK_STREAM, INADDR_ANY, vnetd->peer_port, flags, val); - if(err) goto exit; - err = listen((*val)->sock, 5); - if(err < 0){ - err = -errno; - perror("listen"); - goto exit; - } - exit: - if(err && *val){ - Conn_close(*val); - *val = NULL; - } - //dprintf("< err=%d\n", err); - return err; -} - -/** Create the udp socket. - * - * @param vnetd program arguments - * @param val return value for the socket - * @return 0 on success, error code otherwise - */ -int vnetd_udp_conn(Vnetd *vnetd, Conn **val){ - int err = 0; - uint32_t addr = INADDR_ANY; - uint16_t port = vnetd->port; - int flags = (VSOCK_BIND | VSOCK_REUSE); - err = create_socket(SOCK_DGRAM, addr, port, flags, val); - return err; -} - -/** Create the broadcast socket. 
- * - * @param vnetd program arguments - * @param val return value for the socket - * @return 0 on success, error code otherwise - */ -int vnetd_broadcast_conn(Vnetd *vnetd, Conn **val){ - int err = 0; - uint32_t addr = vnetd_mcast_addr(vnetd); - uint16_t port = vnetd_mcast_port(vnetd); - int flags = VSOCK_REUSE; - int multicast = IN_MULTICAST(ntohl(addr)); - - flags |= VSOCK_MULTICAST; - flags |= VSOCK_BROADCAST; - - err = create_socket(SOCK_DGRAM, addr, port, flags, val); - if(err < 0) goto exit; - if(multicast){ - err = setsock_multicast_ttl((*val)->sock, 1); - if(err < 0) goto exit; - } - if(0){ - struct sockaddr * addr = (struct sockaddr *)&vnetd->addr; - socklen_t addr_n = sizeof(vnetd->addr); - dprintf("> sock=%d bind addr=%s:%d\n", - (*val)->sock, inet_ntoa(vnetd->addr.sin_addr), ntohs(vnetd->addr.sin_port)); - err = bind((*val)->sock, addr, addr_n); - if(err < 0){ - err = -errno; - perror("bind"); - goto exit; - } - } - if(0){ - struct sockaddr_in self = {}; - socklen_t self_n; - getsockname((*val)->sock, (struct sockaddr *)&self, &self_n); - dprintf("> sockname sock=%d addr=%s port=%d\n", - (*val)->sock, inet_ntoa(self.sin_addr), ntohs(self.sin_port)); - } - exit: - return err; -} - -/** Type for signal handling functions. */ -typedef void SignalAction(int code, siginfo_t *info, void *data); - -/** Handle SIGCHLD by getting child exit status. - * This prevents child processes being defunct. - * - * @param code signal code - * @param info signal info - * @param data - */ -static void sigaction_SIGCHLD(int code, siginfo_t *info, void *data){ - int status; - pid_t pid; - pid = wait(&status); - dprintf("> child pid=%d status=%d\n", pid, status); -} - -/** Handle SIGPIPE. - * - * @param code signal code - * @param info signal info - * @param data - */ -static void sigaction_SIGPIPE(int code, siginfo_t *info, void *data){ - dprintf("> SIGPIPE\n"); -} - -/** Handle SIGALRM. 
- * - * @param code signal code - * @param info signal info - * @param data - */ -static void sigaction_SIGALRM(int code, siginfo_t *info, void *data){ - //dprintf("> SIGALRM\n"); - timer_alarms++; -} - -/** Install a handler for a signal. - * - * @param signum signal - * @param action handler - * @return 0 on success, error code otherwise - */ -static int catch_signal(int signum, SignalAction *action){ - int err = 0; - struct sigaction sig = {}; - sig.sa_sigaction = action; - sig.sa_flags = SA_SIGINFO; - err = sigaction(signum, &sig, NULL); - if(err){ - perror("sigaction"); - } - return err; -} - -/** Create a raw socket. - * - * @param protocol protocol - * @param flags flags - * @param sock return value for the socket - */ -int vnetd_raw_socket(int protocol, int flags, uint32_t mcaddr, int *sock){ - int err; - int bcast = (flags & VSOCK_BROADCAST); - //dprintf("> protocol=%d\n", protocol); - err = *sock = socket(AF_INET, SOCK_RAW, protocol); - if(err < 0){ - err = -errno; - perror("socket"); - goto exit; - } - if(bcast){ - err = setsock_broadcast(*sock, bcast); - if(err < 0) goto exit; - } - if(flags & VSOCK_MULTICAST){ - err = setsock_multicast(*sock, mcaddr); - if(err < 0) goto exit; - } - exit: - //dprintf("< err=%d\n", err); - return err; -} - -/** Connect to peer vnetds. - * - * @param vnetd vnetd - * @return 0 on success, error code otherwise - */ -int vnetd_peers(Vnetd *vnetd){ - int err =0; - Sxpr x, l; - struct in_addr addr = {}; - for(l = vnetd->peers; CONSP(l); l = CDR(l)){ - x = CAR(l); - addr.s_addr = OBJ_INT(x); - vnetd_connect(vnetd, addr, vnetd->peer_port); - } - return err; -} - -/** Vnet daemon main program. 
- * - * @param vnetd program arguments - * @return 0 on success, error code otherwise - */ -int vnetd_main(Vnetd *vnetd){ - int err = 0; - - //dprintf(">\n"); - err = get_self_addr(&vnetd->addr); - vnetd->addr.sin_port = vnetd->port; - iprintf("> VNETD\n"); - iprintf("> addr=%s port=%u\n", - inet_ntoa(vnetd->addr.sin_addr), htons(vnetd->port)); - iprintf("> mcaddr=%s port=%u\n", - inet_ntoa(vnetd->mcast_addr.sin_addr), htons(vnetd->port)); - iprintf("> peers port=%u ", htons(vnetd->peer_port)); - objprint(iostdout, vnetd->peers, 0); printf("\n"); - - err = vcache_init(); - err = vnetd_peers(vnetd); - - catch_signal(SIGCHLD,sigaction_SIGCHLD); - catch_signal(SIGPIPE,sigaction_SIGPIPE); - catch_signal(SIGALRM,sigaction_SIGALRM); - err = vnetd_listen_conn(vnetd, &vnetd->listen_conn); - if(err < 0) goto exit; - err = vnetd_udp_conn(vnetd, &vnetd->udp_conn); - if(err < 0) goto exit; - err = vnetd_broadcast_conn(vnetd, &vnetd->bcast_conn); - if(err < 0) goto exit; - { - int flags = (VSOCK_BROADCAST | VSOCK_MULTICAST); - uint32_t mcaddr = vnetd->mcast_addr.sin_addr.s_addr; - - err = vnetd_raw_socket(IPPROTO_ETHERIP, flags, mcaddr, &vnetd->etherip_sock); - if(err < 0) goto exit; - err = vnetd_raw_socket(IPPROTO_ESP, flags, mcaddr, &vnetd->esp_sock); - if(err < 0) goto exit; - } - err = vnetd_select(vnetd); - exit: - Conn_close(vnetd->listen_conn); - Conn_close(vnetd->udp_conn); - Conn_close(vnetd->bcast_conn); - connections_close_all(vnetd); - close(vnetd->etherip_sock); - close(vnetd->esp_sock); - //dprintf("< err=%d\n", err); - return err; -} - -/** Parse command-line arguments and call the vnetd main program. 
- * - * @param arg argument count - * @param argv arguments - * @return 0 on success, 1 otherwise - */ -extern int main(int argc, char *argv[]){ +static int vnetd_getopts(Vnetd *vnetd, int argc, char *argv[]){ int err = 0; int key = 0; int long_index = 0; - vnetd_set_defaults(vnetd); while(1){ key = getopt_long(argc, argv, short_opts, long_opts, &long_index); if(key == -1) break; switch(key){ - case OPT_ADDR:{ + case OPT_MCADDR: { unsigned long addr; - err = get_host_address(optarg, &addr); + err = get_inet_addr(optarg, &addr); if(err) goto exit; - vnetd->mcast_addr.sin_addr.s_addr = addr; + vnetd_set_mcast_addr(vnetd, addr); break; } - case OPT_PORT: - err = convert_service_to_port(optarg, &vnetd->port); + case OPT_FILE: + err = vnetd_configure(vnetd, optarg); if(err) goto exit; break; - case OPT_PEER:{ - unsigned long addr; - err = get_host_address(optarg, &addr); - if(err) goto exit; - //cons_push(&vnetd->peers, mkaddress(addr)); - cons_push(&vnetd->peers, mkint(addr)); - break; } case OPT_HELP: usage(0); break; case OPT_VERBOSE: - vnetd->verbose = TRUE; + vnetd->verbose = true; break; case OPT_VERSION: iprintf("> %s %s\n", PROGRAM, VERSION); @@ -1223,10 +756,445 @@ break; } } + exit: + return err; +} +
+/** Initialise vnetd params.
+ *
+ * Resets the structure to its defaults, applies the command-line
+ * options and then determines our own unicast address.
+ *
+ * @param vnetd vnetd
+ * @param argc argument count
+ * @param argv arguments
+ * @return 0 on success, error code otherwise
+ */
+int vnetd_init(Vnetd *vnetd, int argc, char *argv[]){
+    int err = 0;
+
+    // Use etherip-in-udp encapsulation.
+    etherip_in_udp = true;
+
+    *vnetd = (Vnetd){};
+    vnetd->port = htons(VARP_PORT);
+    vnetd->verbose = false;
+    vnetd->ttl = 1; // Default multicast ttl.
+    vnetd->etherip = true;
+    // -1 marks a socket that has not been opened.
+    vnetd->udp_sock = -1;
+    vnetd->mcast_sock = -1;
+    vnetd->etherip_sock = -1;
+    vnetd_set_mcast_addr(vnetd, htonl(VARP_MCAST_ADDR));
+    vnetd->mcast_addr.sin_port = vnetd->port;
+    vnetd->unix_path = "/tmp/vnetd";
+
+    // A bad option or configuration file is reported to the caller
+    // rather than silently leaving the defaults in place.
+    err = vnetd_getopts(vnetd, argc, argv);
+    if(err) goto exit;
+
+    err = get_self_addr(&vnetd->ucast_addr);
+    vnetd->ucast_addr.sin_port = vnetd->port;
+    dprintf("> mcaddr=%s\n", inet_ntoa(vnetd->mcast_addr.sin_addr));
+    dprintf("> addr  =%s\n", inet_ntoa(vnetd->ucast_addr.sin_addr));
+  exit:
+    return err;
+}
+
+/** Add the tap descriptors of all vnets to a select set (for reading).
+ * Vnets without a device or without an open tap descriptor are skipped.
+ *
+ * @param vnetd vnetd
+ * @param set select set
+ */
+void vnet_select(Vnetd *vnetd, SelectSet *set){
+    HashTable_for_decl(entry);
+
+    HashTable_for_each(entry, vnetd->vnet_table){
+        Vnet *vnet = entry->value;
+        struct net_device *dev = vnet->dev;
+        if(!dev) continue;
+        if(dev->tapfd < 0) continue;
+        SelectSet_add(set, dev->tapfd, SELECT_READ);
+    }
+}
+
+/** Read from the vnets whose tap descriptor is ready.
+ *
+ * @param vnetd vnetd
+ * @param set select set indicating the ready descriptors
+ */
+void vnet_handle(Vnetd *vnetd, SelectSet *set){
+    HashTable_for_decl(entry);
+
+    HashTable_for_each(entry, vnetd->vnet_table){
+        Vnet *vnet = entry->value;
+        struct net_device *dev = vnet->dev;
+        if(!dev) continue;
+        if(dev->tapfd < 0) continue;
+        if(SelectSet_in_read(set, dev->tapfd)){
+            int n;
+            // At most 64 reads per vnet and pass, so one busy device
+            // cannot keep the loop from servicing the others.
+            for(n = 64; n > 0; --n){
+                if(vnet_read(vnet) < 0) break;
+            }
+        }
+    }
+}
+
+/** Handle a message on one of the udp sockets.
+ *
+ * Messages from our own interface address are ignored. A message whose
+ * destination is the multicast address is passed to vnet_forward_send()
+ * before being handled as a varp message.
+ *
+ * @param vnetd vnetd
+ * @param addr address of the socket, used as the initial destination
+ * @param sock socket to receive from (non-blocking receive)
+ * @return negative error code if nothing could be received, otherwise
+ *         the result of varp_handle_message() (0 for ignored messages)
+ */
+int vnetd_handle_udp(Vnetd *vnetd, struct sockaddr_in *addr, int sock){
+    int err = 0, n = 0;
+    struct sockaddr_in peer, dest;
+    socklen_t peer_n = sizeof(peer), dest_n = sizeof(dest);
+    int flags = MSG_DONTWAIT;
+    struct sk_buff *skb = NULL;
+
+    dest = *addr;
+    n = skb_recv_udp(sock, flags, &peer, &peer_n, &dest, &dest_n, &skb);
+    if(n < 0){
+        err = n;
+        goto exit;
+    }
+    dprintf("> Received %d bytes from=%s:%d dest=%s:%d\n",
+            n,
+            inet_ntoa(peer.sin_addr), htons(peer.sin_port),
+            inet_ntoa(dest.sin_addr), htons(dest.sin_port));
+    if(peer.sin_addr.s_addr == vnetd_intf_addr(vnetd)){
+        dprintf("> Ignoring message from self.\n");
+        goto exit;
+    }
+    if(dest.sin_addr.s_addr == vnetd_mcast_addr(vnetd)){
+        vnet_forward_send(skb);
+    }
+    err = varp_handle_message(skb);
+
+  exit:
+    if(skb) kfree_skb(skb);
+    return err;
+}
+
+/** Handle a message on the raw etherip socket.
+ * Messages from our own interface address are ignored.
+ *
+ * @param vnetd vnetd
+ * @param addr address of the socket, used as the initial destination
+ * @param sock socket to receive from
+ * @return negative error code if nothing could be received, otherwise
+ *         the result of etherip_protocol_recv() (0 for ignored messages)
+ */
+int vnetd_handle_etherip(Vnetd *vnetd, struct sockaddr_in *addr, int sock){
+    int err = 0, n = 0;
+    struct sockaddr_in peer, dest;
+    socklen_t peer_n = sizeof(peer), dest_n = sizeof(dest);
+    int flags = 0;
+    struct sk_buff *skb = NULL;
+
+    dest = *addr;
+    n = skb_recv_raw(sock, flags, &peer, &peer_n, &dest, &dest_n, &skb);
+    if(n < 0){
+        err = n;
+        goto exit;
+    }
+    dprintf("> Received %d bytes from=%s:%d dest=%s:%d\n",
+            n,
+            inet_ntoa(peer.sin_addr), htons(peer.sin_port),
+            inet_ntoa(dest.sin_addr), htons(dest.sin_port));
+    if(peer.sin_addr.s_addr == vnetd_intf_addr(vnetd)){
+        dprintf("> Ignoring message from self.\n");
+        goto exit;
+    }
+    err = etherip_protocol_recv(skb);
+  exit:
+    if(skb) kfree_skb(skb);
+    return err;
+}
+
+/** Per-connection state for a client of the unix-domain socket. */
+typedef struct ConnClient {
+    Vnetd *vnetd;
+    Parser *parser;
+} ConnClient;
+
+/** Handler for a client connection.
+ *
+ * Reads what is available, feeds it to the client's parser and evaluates
+ * every complete expression, writing results to the connection.
+ * A negative result is returned once a request has been evaluated or the
+ * input is at eof; the parser is released in that case.
+ *
+ * @param conn connection (conn->data is the ConnClient)
+ * @param mode select mode (not used)
+ * @return 0 to keep the connection, negative error code otherwise
+ */
+int conn_handle_fn(Conn *conn, int mode){
+    int err;
+    ConnClient *client = conn->data;
+    char data[1024] = {};
+    int k;
+    int done = false;
+
+    k = IOStream_read(conn->in, data, sizeof(data));
+    if(k < 0){
+        err = k;
+        goto exit;
+    }
+    if(!client->parser){
+        err = -ENOSYS;
+        goto exit;
+    }
+    if((k == 0) && Parser_at_eof(client->parser)){
+        err = -EINVAL;
+        goto exit;
+    }
+    err = Parser_input(client->parser, data, k);
+    if(err < 0) goto exit;
+    while(Parser_ready(client->parser)){
+        Sxpr sxpr = Parser_get_val(client->parser);
+        err = vnet_eval(sxpr, conn->out, NULL);
+        if(err) goto exit;
+        done = true;
+    }
+    if(done || Parser_at_eof(client->parser)){
+        // Close at EOF.
+        err = -EIO;
+    }
+  exit:
+    if(err < 0){
+        Parser_free(client->parser);
+        client->parser = NULL;
+    }
+    return (err < 0 ? err : 0);
+}
+
+/** Accept a client connection on the unix-domain socket.
+ *
+ * The new connection is made non-blocking, gets its own parser and is
+ * added to the list of connections.
+ *
+ * @param vnetd vnetd
+ * @param sock listening socket
+ * @return 0 on success, error code otherwise
+ */
+int vnetd_handle_unix(Vnetd *vnetd, int sock){
+    int err = 0;
+    ConnClient *client = NULL;
+    Conn *conn = NULL;
+    struct sockaddr_un peer = {};
+    // accept() takes a socklen_t *, which need not have the layout of int.
+    socklen_t peer_n = sizeof(peer);
+    int peersock;
+
+    peersock = accept(sock, (struct sockaddr *)&peer, &peer_n);
+    if(peersock < 0){
+        // Save errno first: perror() is allowed to change it.
+        err = -errno;
+        perror("accept");
+        goto exit;
+    }
+    // We want non-blocking i/o.
+    fcntl(peersock, F_SETFL, O_NONBLOCK);
+    client = ALLOCATE(ConnClient);
+    if(!client){
+        err = -ENOMEM;
+        goto exit;
+    }
+    client->vnetd = vnetd;
+    client->parser = Parser_new();
+    conn = Conn_new(conn_handle_fn, client);
+    err = Conn_init(conn, peersock, SOCK_STREAM, SELECT_READ,
+                    (struct sockaddr_in){});
+    if(err) goto exit;
+    vnetd->conns = ConnList_add(vnetd->conns, conn);
+  exit:
+    if(err){
+        // Only release what was actually acquired.
+        // NOTE(review): client and its parser are not released on this
+        // path; confirm whether Conn_close() owns conn->data.
+        if(conn) Conn_close(conn);
+        if(peersock >= 0) close(peersock);
+    }
+    if(err < 0) wprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Add all descriptors vnetd waits on to a select set (for reading):
+ * the unix, udp and multicast sockets, the etherip socket if open,
+ * the vnet tap devices and the client connections.
+ *
+ * @param vnetd vnetd
+ * @param set select set
+ */
+void vnetd_select(Vnetd *vnetd, SelectSet *set){
+    SelectSet_add(set, vnetd->unix_sock, SELECT_READ);
+    SelectSet_add(set, vnetd->udp_sock, SELECT_READ);
+    SelectSet_add(set, vnetd->mcast_sock, SELECT_READ);
+    if(vnetd->etherip_sock >= 0){
+        SelectSet_add(set, vnetd->etherip_sock, SELECT_READ);
+    }
+    vnet_select(vnetd, set);
+    ConnList_select(vnetd->conns, set);
+}
+
+/** Service every descriptor that a select set reports as readable.
+ *
+ * @param vnetd vnetd
+ * @param set select set indicating the ready descriptors
+ */
+void vnetd_handle(Vnetd *vnetd, SelectSet *set){
+    if(SelectSet_in_read(set, vnetd->unix_sock)){
+        vnetd_handle_unix(vnetd, vnetd->unix_sock);
+    }
+    if(SelectSet_in_read(set, vnetd->udp_sock)){
+        int n;
+
+        // Drain up to 256 datagrams per pass; the receive is
+        // non-blocking, so the loop ends as soon as the queue is empty.
+        for(n = 256; n > 0; --n){
+            if(vnetd_handle_udp(vnetd, &vnetd->udp_sock_addr, vnetd->udp_sock) < 0){
+                break;
+            }
+        }
+    }
+    if(SelectSet_in_read(set, vnetd->mcast_sock)){
+        vnetd_handle_udp(vnetd, &vnetd->mcast_sock_addr, vnetd->mcast_sock);
+    }
+    if((vnetd->etherip_sock >= 0) &&
+       SelectSet_in_read(set, vnetd->etherip_sock)){
+        vnetd_handle_etherip(vnetd, &vnetd->etherip_sock_addr, vnetd->etherip_sock);
+    }
+    vnet_handle(vnetd, set);
+    vnetd->conns = ConnList_handle(vnetd->conns, set);
+}
+
+/** Counter for timer alarms.
+ */ +static unsigned timer_alarms = 0; + +int vnetd_main(Vnetd *vnetd){ + int err = 0; + SelectSet _set = {}, *set = &_set; + struct timeval _timeout = {}, *timeout = &_timeout; + + vnetd->vnet_table = vnet_table; + + for( ; ; ){ + timeout->tv_sec = 0; + timeout->tv_usec = 500000; + SelectSet_zero(set); + vnetd_select(vnetd, set); + err = SelectSet_select(set, timeout); + if(err == 0) continue; + if(err < 0){ + switch(errno){ + case EINTR: + if(timer_alarms){ + timer_alarms = 0; + process_timers(); + } + continue; + case EBADF: + continue; + default: + perror("select"); + goto exit; + } + } + vnetd_handle(vnetd, set); + } + exit: + return err; +} + +int getsockaddr(int sock, struct sockaddr_in *addr){ + socklen_t addr_n = sizeof(struct sockaddr_in); + return getsockname(sock, (struct sockaddr*)addr, &addr_n); +} + +int vnetd_etherip_sock(Vnetd *vnetd){ + int err = 0; + + if(!vnetd->etherip) goto exit; + err = vnetd_raw_socket(vnetd, IPPROTO_ETHERIP, + (VSOCK_BROADCAST | VSOCK_MULTICAST), + vnetd_mcast_addr(vnetd), + &vnetd->etherip_sock); + if(err < 0) goto exit; + err = setsock_pktinfo(vnetd->etherip_sock, true); + if(err < 0) goto exit; + getsockaddr(vnetd->etherip_sock, &vnetd->etherip_sock_addr); + exit: + return err; +} + +int vnetd_udp_sock(Vnetd *vnetd){ + int err; + uint32_t mcaddr = vnetd_mcast_addr(vnetd); + + err = create_socket(SOCK_DGRAM, INADDR_ANY, vnetd->port, + (VSOCK_BIND | VSOCK_REUSE), + &vnetd->udp_sock); + if(err < 0) goto exit; + err = setsock_pktinfo(vnetd->udp_sock, true); + if(err < 0) goto exit; + getsockaddr(vnetd->udp_sock, &vnetd->udp_sock_addr); + vnetd->mcast_sock_addr.sin_addr.s_addr = vnetd_intf_addr(vnetd); + + err = create_socket(SOCK_DGRAM, mcaddr, vnetd_mcast_port(vnetd), + (VSOCK_REUSE | VSOCK_BROADCAST | VSOCK_MULTICAST), + &vnetd->mcast_sock); + if(err < 0) goto exit; + err = setsock_pktinfo(vnetd->udp_sock, true); + if(err < 0) goto exit; + err = setsock_multicast(vnetd->mcast_sock, INADDR_ANY, mcaddr); + if(err < 0) goto 
exit; + err = setsock_multicast_ttl(vnetd->mcast_sock, vnetd->ttl); + if(err < 0) goto exit; + getsockaddr(vnetd->mcast_sock, &vnetd->mcast_sock_addr); + vnetd->mcast_sock_addr.sin_addr.s_addr = mcaddr; + + exit: + if(err < 0){ + close(vnetd->udp_sock); + close(vnetd->mcast_sock); + vnetd->udp_sock = -1; + vnetd->mcast_sock = -1; + } + return err; +} + +int vnetd_raw_sock(Vnetd *vnetd){ + int err; + + err = vnetd_raw_socket(vnetd, IPPROTO_RAW, + (VSOCK_BROADCAST), + vnetd_mcast_addr(vnetd), + &vnetd->raw_sock); + if(err){ + close(vnetd->raw_sock); + vnetd->raw_sock = -1; + } + return err; +} + +int vnetd_unix_sock(Vnetd *vnetd){ + int err = 0; + struct sockaddr_un addr = { .sun_family = AF_UNIX }; + socklen_t addr_n; + + vnetd->unix_sock = socket(addr.sun_family, SOCK_STREAM, 0); + if(vnetd->unix_sock < 0){ + err = -errno; + perror("unix socket"); + goto exit; + } + unlink(vnetd->unix_path); + strcpy(addr.sun_path, vnetd->unix_path); + addr_n = sizeof(addr) - sizeof(addr.sun_path) + strlen(vnetd->unix_path) + 1; + err = bind(vnetd->unix_sock, (struct sockaddr *)&addr, addr_n); + if(err < 0){ + err = -errno; + perror("unix bind"); + goto exit; + } + err = listen(vnetd->unix_sock, 5); + if(err < 0){ + err = -errno; + perror("unix listen"); + } + exit: + return err; +} + +/** Handle SIGPIPE. + * + * @param code signal code + * @param info signal info + * @param data + */ +static void sigaction_SIGPIPE(int code, siginfo_t *info, void *data){ + dprintf("> SIGPIPE\n"); +} + +/** Handle SIGALRM. + * + * @param code signal code + * @param info signal info + * @param data + */ +static void sigaction_SIGALRM(int code, siginfo_t *info, void *data){ + timer_alarms++; +} + +/** Type for signal handling functions. */ +typedef void SignalAction(int code, siginfo_t *info, void *data); + +/** Install a handler for a signal. 
+ * + * @param signum signal + * @param action handler + * @return 0 on success, error code otherwise + */ +static int catch_signal(int signum, SignalAction *action){ + int err = 0; + struct sigaction sig = {}; + dprintf(">\n"); + sig.sa_sigaction = action; + sig.sa_flags = SA_SIGINFO; + err = sigaction(signum, &sig, NULL); + if(err){ + err = -errno; + perror("sigaction"); + } + return err; +} + +int main(int argc, char *argv[]){ + int err = 0; + + err = tunnel_module_init(); + if(err < 0) goto exit; + err = vnet_init(); + if(err < 0) goto exit; + err = vnetd_init(vnetd, argc, argv); + if(err < 0) goto exit; + err = catch_signal(SIGPIPE, sigaction_SIGPIPE); + if(err < 0) goto exit; + err = catch_signal(SIGALRM, sigaction_SIGALRM); + if(err < 0) goto exit; + err = vnetd_etherip_sock(vnetd); + if(err < 0) goto exit; + err = vnetd_udp_sock(vnetd); + if(err < 0) goto exit; + err = vnetd_raw_sock(vnetd); + if(err < 0) goto exit; + err = vnetd_unix_sock(vnetd); + if(err < 0) goto exit; err = vnetd_main(vnetd); - exit: - if(err && key > 0){ - eprintf("> Error in arg %c\n", key); - } +exit: return (err ? 
1 : 0); } diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/doc/Makefile --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/doc/Makefile Thu Feb 9 15:12:11 2006 @@ -0,0 +1,50 @@ +#!/usr/bin/make -f +# -*- mode: Makefile; -*- + +VERSION = 1.0 +HEADER = Vnet + +INSTALL = install +INSTALL_DIR = $(INSTALL) -d -m0755 + +PS2PDF := ps2pdf +DVIPS := dvips +LATEX := latex +LATEX2HTML := latex2html +DOXYGEN := doxygen +POD2MAN := pod2man + +MAN_DIR := /usr/share/man + +DOC_MAN5SRC := $(wildcard man/*.pod.5) +DOC_MAN1SRC := $(wildcard man/*.pod.1) +DOC_MAN1 := $(patsubst man/%.pod.1,man1/%.1,$(DOC_MAN1SRC)) +DOC_MAN5 := $(patsubst man/%.pod.5,man5/%.5,$(DOC_MAN5SRC)) + +.PHONY: all man clean install + +all: man + +man: + @if which $(POD2MAN) 1>/dev/null 2>/dev/null; then \ + $(MAKE) $(DOC_MAN1) $(DOC_MAN5); fi + +man1/%.1: man/%.pod.1 Makefile + $(INSTALL_DIR) $(@D) + $(POD2MAN) --release=$(VERSION) --name=`echo $@ | sed 's/^man1.//'| \ + sed 's/.1//'` -s 1 -c $(HEADER) $< $@ + +man5/%.5: man/%.pod.5 Makefile + $(INSTALL_DIR) $(@D) + $(POD2MAN) --release=$(VERSION) --name=`echo $@ | sed 's/^man5.//'| \ + sed 's/.5//'` -s 5 -c $(HEADER) $< $@ + +clean: + @$(RM) -rf man5 + @$(RM) -rf man1 + + install: all + $(INSTALL_DIR) $(DESTDIR)$(MAN_DIR) + $(CP) -dR man1 $(DESTDIR)$(MAN_DIR) + $(CP) -dR man5 $(DESTDIR)$(MAN_DIR) + diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/doc/man/vn.pod.1 --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/doc/man/vn.pod.1 Thu Feb 9 15:12:11 2006 @@ -0,0 +1,176 @@ +=head1 NAME + +vn - Vnet (virtual networking) management utility. + +=head1 SYNOPSIS + +vn <command> [args] + +=head1 DESCRIPTION + +The B<vn> utility manages vnets, virtual networks for virtual machines. +Before using vnets, the vnet kernel module must be installed or +the user-space daemon vnetd must be running. Using the kernel module is recommended, +see the B<insmod> command below. + +A vnet is a virtual network that behaves like a private LAN, transporting +Ethernet frames. 
Each vnet is identified by a 128-bit vnet id and +has a network device that interfaces to it. Ethernet packets written +to the device are encapsulated and sent to the network. +Received vnet packets are decapsulated and delivered from the device +corresponding to their vnet id. The default encapsulation uses UDP on port 1798. + +Usually each vnet device is enslaved to a corresponding bridge, and virtual +machine interfaces are attached to vnets by enslaving them to the bridge. +Each vnet behaves like a private LAN: traffic on one vnet is not visible +on other vnets, and interfaces on a vnet cannot see traffic on the +physical network. + +Vnets can be connected together into larger networks +by direct bridging or packet forwarding, or by using multihomed vms +with interfaces on several vnets, or vnets and the physical network. +As vnet interfaces are discovered dynamically, vnet connectivity is maintained +if a vm using a vnet is migrated from one physical machine to another. + +In the commands vnet ids can be given in two forms. Long form, as 8 4-digit hex fields +separated by colons, for example 0000:0000:0000:0000:0000:0000:0000:0004, and +short form as a hex field, for example 0004 or 4. The short form is the same as the +long form with the first 7 fields zero. Vnet id 0000:0000:0000:0000:0000:0000:0000:0001 +is reserved for the physical network and has no vnet device. + +Vnets use multicast to discover the location of virtual interfaces, by default +using multicast group 224.10.0.1. If all the machines hosting vnets are on +the same subnet, or reachable by multicast, vnets will span all the machines +automatically. If some machines are not reachable by multicast you can configure +vnets to perform multicast forwarding using UDP. + +The vnet devices are fully-functional network devices, so you can add IP addresses +to them and test connectivity without any vms running. 
+For example, using vnif0004 on machines A and B: + + A> ifconfig vnif0004 10.0.0.11 + B> ifconfig vnif0004 10.0.0.12 + B> ping 10.0.0.11 + +If the vnet device is enslaved to a bridge you will have to add the IP address +to the bridge instead. Use C<brctl show> or C<vn vnets> to see if a vnet +device is on a bridge. + +=over 4 + +=item B<insmod> I<[varp_mcaddr=ADDR]> + +Insert the vnet kernel module, optionally supplying the multicast +address to use, default 224.10.0.1. + +=item B<varp> + +Print varp information and varp cache. + +=item B<vnets> [options] + +Print the list of vnets (virtual networks). If a vnet device is on a bridge, +also shows the bridge and its bridged interfaces. + +=over 4 + +=item B<-a | --all> + +Also print the vifs on each vnet and varp information. + +=item B<-l | --long> + +Also print the ifconfig for the vnet devices. + +=back + +=item B<vnet-create> I<[options]> I<vnetid> + +Create a vnet with the given id. The options are: + +=over 4 + +=item B<-s | --security> I<level> + +Security level, which can be one of I<none> for no security, +I<auth> for message authentication, and I<conf> for message +authentication and confidentiality. The default is no security. +Security is provided using IPSEC, but uses hard-wired keys. + +=item B<-b | --bridge> I<bridgename> + +Create a bridge for the vnet called I<bridgename> and enslave +the vnet device to it. + +=item B<-v | --vnetif> I<vnetifname> + +Use I<vnetifname> as the name for the vnet device. If this option +is not specified the default is to name the device vnifN where N +is the last field of the vnet id as 4 hex characters. +For example vnif0004. Network device names can be at +most 14 characters. + +=back + +=item B<vnet-delete> I<[options]> I<vnetid> + +Delete the vnet with the given id. The vnet device goes away too. + +=over 4 + +=item B<-b | --bridge> + +If this option is specified, delete the bridge associated with the vnet. 
+ +=back + +=item B<vifs> + +Print the list of vifs (virtual interfaces). + +=item B<vif-add> I<[-i|--interface]> I<vnet> I<vmac> + +Add a vif to a vnet. Here I<vnet> is the vnet id and I<vmac> +is the vif's MAC address. Alternatively, I<vmac> can be the name of +a network device if the I<-i> or I<--interface> flag is given. + +It is not usually necessary to use B<vif-add> as vnets automatically +add vifs for the MAC addresses they see. + +=item B<vif-delete> I<[-i|--interface]> I<vnet> I<vmac> + +Delete a vif from a vnet. Here I<vnet> is the vnet id and I<vmac> +is the vif's MAC address. Alternatively, I<vmac> can be the name of +a network device if the I<-i> or I<--interface> flag is given. + +It is not usually necessary to use B<vif-delete> as vnets periodically +delete unused vifs. + +=item B<peers> + +Print the list of peer vnet machines to forward multicasts to, and accept +forwarded multicasts from. + +=item B<peer-add> I<addr> + +Add the peer with the given IP address or hostname. + +=item B<peer-delete> I<addr> + +Delete the peer with the given IP address or hostname. + +=back + +=head1 AUTHOR + +The author of vn and vnets is Mike Wray of HP Labs. Please send problems, bugs, +enhancement requests etc. to mike.wray@xxxxxxx + +=head1 COPYRIGHT AND LICENSE + +Copyright (C) 2006 Mike Wray <mike.wray@xxxxxx>. + +This library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or +(at your option) any later version. 
\ No newline at end of file diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/scripts/Makefile --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/scripts/Makefile Thu Feb 9 15:12:11 2006 @@ -0,0 +1,18 @@ +# -*- mode: Makefile; -*- +#============================================================================ + +INSTALL = install +INSTALL_PROG = $(INSTALL) -m0755 +INSTALL_DIR = $(INSTALL) -d -m0755 + +SBIN_DIR = $(DESTDIR)/usr/sbin + +.PHONY: all install clean + +all: + +install: + $(INSTALL_DIR) $(SBIN_DIR) + $(INSTALL_PROG) vn $(SBIN_DIR) + +clean: \ No newline at end of file diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/scripts/vn --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/scripts/vn Thu Feb 9 15:12:11 2006 @@ -0,0 +1,904 @@ +#!/usr/bin/env python2.4 +# -*- mode: python; -*- +#============================================================================ +# Copyright (C) 2005, 2006 Mike Wray <mike.wray@xxxxxx> +# +# This library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ + +# Vnet (network virtualization) control utility. 
+ +import os +import os.path +import re +import socket +import sys +from getopt import getopt, GetoptError + +sys.path.append('/usr/lib/python') +sys.path.append('/usr/lib64/python') + +from xen.xend import sxp +from xen.xend.PrettyPrint import prettyprint + +# Path of unix-domain socket to vnetd. +VNETD_PATH = "/tmp/vnetd" + +def vnetd_running(): + return os.path.exists(VNETD_PATH) + +def vnetd_open(): + sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + sock.connect(VNETD_PATH) + fi = sock.makefile('r', 0) + fo = sock.makefile('w', 0) + return (fi, fo) + +os.defpath += ':/sbin:/usr/sbin:/usr/local/sbin' +CMD_IFCONFIG = 'ifconfig' +CMD_BRCTL = 'brctl' + +opts = None + +class Opts: + + def __init__(self, **kwds): + for (k, v) in kwds.items(): + setattr(self, k, v) + +opts = Opts(verbose=False, dryrun=False) + +def set_opts(val): + global opts + opts = val + return opts + +def cmd(prog, *args): + """Execute command 'prog' with 'args', optionally printing the command. + """ + global opts + command = " ".join([ prog ] + map(str, args)) + if opts.verbose: + print command + if not opts.dryrun: + os.system(command) + +def vif_bridge_add(bridge, vif): + """Add a network interface to a bridge. + """ + cmd(CMD_BRCTL, 'addif', bridge, vif) + +def vif_bridge_rem(bridge, vif): + """Remove a network interface from a bridge. + """ + cmd(CMD_BRCTL, 'delif', bridge, vif) + +def bridge_create(bridge, **kwd): + """Create a bridge. + Defaults hello time to 0, forward delay to 0 and stp off. + """ + cmd(CMD_BRCTL, 'addbr', bridge) + if kwd.get('hello', None) is None: + kwd['hello'] = 0 + if kwd.get('fd', None) is None: + kwd['fd'] = 0 + if kwd.get('stp', None) is None: + kwd['stp'] = 'off' + bridge_set(bridge, **kwd) + cmd(CMD_IFCONFIG, bridge, "up") + +def bridge_set(bridge, hello=None, fd=None, stp=None): + """Set bridge parameters. 
+ """ + if hello is not None: + cmd(CMD_BRCTL, 'sethello', bridge, hello) + if fd is not None: + cmd(CMD_BRCTL, 'setfd', bridge, fd) + if stp is not None: + cmd(CMD_BRCTL, 'stp', bridge, stp) + +def bridge_del(bridge): + """Delete a bridge. + """ + cmd(CMD_IFCONFIG, bridge, 'down') + cmd(CMD_BRCTL, 'delbr', bridge) + +class Bridge: + # Network interfaces are at /sys/class/net/*. + # A bridge interface has ./bridge dir, ./brif is dir of bridged interfaces + # (symlinks to the brport dirs). + # If an interface is bridged ./brport is bridged port info, + # brport/bridge is a symlink to the bridge. + + INTERFACE_DIR = "/sys/class/net" + + def isBridge(klass, dev): + """Test if a network interface is a bridge. + """ + devdir = os.path.join(klass.INTERFACE_DIR, dev) + brdir = os.path.join(devdir, "bridge") + try: + os.stat(brdir) + return True + except: + return False + + isBridge = classmethod(isBridge) + + def getInterfaces(klass): + """Get a list of the network interfaces. + """ + try: + v = os.listdir(klass.INTERFACE_DIR) + v.sort() + return v + except: + return [] + + getInterfaces = classmethod(getInterfaces) + + def getInterfaceAddr(klass, intf): + intfdir = os.path.join(klass.INTERFACE_DIR, intf) + addrfile = os.path.join(intfdir, "address") + try: + f = file(addrfile, "rb") + except Exception, ex: + #print ex + return None + try: + return f.readline().strip() + finally: + f.close() + + getInterfaceAddr = classmethod(getInterfaceAddr) + + def getBridges(klass): + """Get a list of the bridges. + """ + return [ dev for dev in klass.getInterfaces() if klass.isBridge(dev) ] + + getBridges = classmethod(getBridges) + + def getBridgeInterfaces(klass, dev): + """Get a list of the interfaces attached to a bridge. 
+ """ + devdir = os.path.join(klass.INTERFACE_DIR, dev) + intfdir = os.path.join(devdir, "brif") + try: + v = os.listdir(intfdir) + v.sort() + return v + except: + return [] + + getBridgeInterfaces = classmethod(getBridgeInterfaces) + + def getBridge(klass, dev): + """Get the bridge an interface is attached to (if any). + """ + devdir = os.path.join(klass.INTERFACE_DIR, dev) + brfile = os.path.join(devdir, "brport/bridge") + try: + brpath = os.readlink(brfile) + return os.path.basename(brpath) + except: + return None + + getBridge = classmethod(getBridge) + +def vnet_cmd(expr): + """Send a command expression to the vnet implementation. + """ + if vnetd_running(): + (fi, fo) = vnetd_open() + else: + fi = None + fo = file("/proc/vnet/policy", "wb") + try: + sxp.show(expr, fo) + fo.flush() + finally: + if fi: fi.close() + if fo: fo.close() + +def varp_flush(): + """Flush the varp cache. + """ + expr = ['varp.flush'] + return vnet_cmd(expr) + +def vif_add(vnetid, vmac): + """Tell the vnet implementation to add a vif to a vnet. + """ + expr = ['vif.add', ['vnet', vnetid], ['vmac', vmac]] + return vnet_cmd(expr) + +def vif_del(vnetid, vmac): + """Tell the vnet implementation to delete a vif from a vnet. + """ + expr = ['vif.del', ['vnet', vnetid], ['vmac', vmac]] + return vnet_cmd(expr) + +def vnet_add(vnetid, vnetif=None, security=None): + """Tell the vnet implementation to add a vnet. + """ + expr = ['vnet.add', ['id', vnetid]] + if vnetif: + expr.append(['vnetif', vnetif]) + if security: + expr.append(['security', security]) + return vnet_cmd(expr) + +def peer_add(addr, port=None): + expr = ['peer.add', ['addr', addr]] + if port: + expr.append(['port', port]) + return vnet_cmd(expr) + +def peer_del(addr, port=None): + expr = ['peer.del', ['addr', addr]] + return vnet_cmd(expr) + +def vnet_del(vnetid): + """Tell the vnet implementation to delete a vnet. 
+ """ + expr = ['vnet.del', ['id', vnetid]] + return vnet_cmd(expr) + +def vnet_create(vnetid, vnetif=None, bridge=None, security=None): + """Tell the vnet implementation to add a vnet. + If 'bridge' is non-null, create the bridge and add the vnet interface + to it. + """ + vnet_add(vnetid, vnetif=vnetif, security=security) + val = vnet_lookup(vnetid) + if not vnetif: + vnetif = sxp.child_value(val, "vnetif") + vmac = get_mac(vnetif) + emac = get_mac("eth0") or get_mac("eth1") or get_mac("eth2") + if emac and vmac != emac: + set_mac(vnetif, emac) + cmd(CMD_IFCONFIG, vnetif, 'up') + if bridge: + bridge_create(bridge) + vif_bridge_add(bridge, vnetif) + return val + +def vnet_delete(vnet, delbridge=False): + """Tell the vnet implementation to delete a vnet. + If the vnet interface is attached to a bridge, + remove it from the bridge, and if delbridge is true + delete the bridge. + """ + v = vnet_lookup(vnet) + if not v: + raise GetoptError("vnet not found: %s" % vnet) + vnetid = sxp.child_value(v, "id") + vnetif = sxp.child_value(v, "vnetif") + bridge = Bridge.getBridge(vnetif) + if bridge: + vif_bridge_rem(bridge, vnetif) + if delbridge: + bridge_del(bridge) + return vnet_del(vnetid) + +def get_mac(intf): + """Get the mac address of an interface. + """ + try: + return Bridge.getInterfaceAddr(intf) + except: + pass + + hwre = re.compile(".*\s+HWaddr\s+(?P<mac>\S*)\s+.*") + fin = os.popen("%s %s" % (CMD_IFCONFIG, intf), 'r') + try: + for x in fin: + m = hwre.match(x) + if not m: + continue + info = m.groupdict() + return info['mac'] + return None + finally: + fin.close() + +def set_mac(intf, mac): + cmd(CMD_IFCONFIG, intf, 'down') + cmd(CMD_IFCONFIG, intf, 'hw', 'ether', mac) + cmd(CMD_IFCONFIG, intf, 'up') + +def get_addr(host): + return socket.gethostbyname(host) + +def get_port(srv): + return srv + +def vnetidof(v): + """Normalise a vnet id. Adds leading 0 fields to make up 8 if + there aren't enough. Pads all fields to 4 hex digits. 
+ """ + try: + l = v.split(":") + l = [ int(x or 0, 16) for x in l ] + l = [ 0 ] * (8 - len(l)) + l + return ":".join([ "%04x" % x for x in l ]) + except: + return None + +def vnet_lookup(vnet, vnets=None): + """Find the vnet with the given vnet id or vnet interface. + + @param vnet id or interface + @param vnets list of vnet info to use (get from implementation if None) + @return vnet info or None if not found + """ + vnetid = vnetidof(vnet) + if vnets is None: + vnets = vnet_list() + for v in vnets: + vid = sxp.child_value(v, "id") + if vid == vnet or vid == vnetid: + return v + if sxp.child_value(v, "vnetif") == vnet: + return v + return None + +def get_vnetid(vnet): + """Get the normalised vnet id of the given vnet id or vnet interface. + Raises an error if the vnet cannot be found. + """ + v = vnet_lookup(vnet) + if not v: + raise GetoptError("vnet not found: %s" % vnet) + vnetid = sxp.child_value(v, "id") + return vnetid + +def vif_list(): + """Get the list of vif info from the vnet implementation. + """ + if vnetd_running(): + (fi, fo) = vnetd_open() + sxp.show(['vif.list'], fo) + fo.flush() + else: + fi = file("/proc/vnet/vifs") + fo = None + try: + return sxp.parse(fi) or [] + finally: + if fi: fi.close() + if fo: fo.close() + +def vnets_filter(vnetlist, vnets): + """Filter a list of vnet info by a list of vnet ids or interfaces. + """ + if vnets is None: + val = vnetlist + else: + val = [] + for x in vnets: + v = vnet_lookup(x, vnets=vnetlist) + if not v: + continue + val.append(v) + return val + +def vnet_list(vnets=None): + """Get the list of vnet info from the vnet implementation, + sorted by vnet id. 
+ + @param vnets list of vnet ids or interfaces to filter the results by + """ + if vnetd_running(): + (fi, fo) = vnetd_open() + sxp.show(['vnet.list'], fo) + fo.flush() + else: + fi = file("/proc/vnet/vnets") + fo = None + try: + val = vnets_filter(sxp.parse(fi) or [], vnets) + val.sort(lambda x, y: + cmp(sxp.child_value(x, "id"), + sxp.child_value(y, "id"))) + return val + finally: + if fi: fi.close() + if fo: fo.close() + +def vnif_list(vnets=None): + """Get the list of vnet interface names from the vnet implementation. + + @param vnets list of vnet ids or interfaces to filter the results by + """ + vnifs = [] + for v in vnet_list(vnets=vnets): + vnetif = sxp.child_value(v, "vnetif") + if vnetif: + vnifs.append(vnetif) + return vnifs + +def varp_list(): + """Get the list of varp info from the vnet implementation. + """ + if vnetd_running(): + (fi, fo) = vnetd_open() + sxp.show(['varp.list'], fo) + fo.flush() + else: + fi = file("/proc/vnet/varp") + fo = None + try: + return sxp.parse(fi) or [] + finally: + if fi: fi.close() + if fo: fo.close() + +def peer_list(): + if vnetd_running(): + (fi, fo) = vnetd_open() + sxp.show(['peer.list'], fo) + fo.flush() + else: + fi = file("/proc/vnet/peers") + fo = None + try: + return sxp.parse(fi) or [] + finally: + if fi: fi.close() + if fo: fo.close() + +class Opt: + """Declares command-line options for a command. + """ + + def getopt(klass, argv, opts, args): + """Get options and args from argv. + The value opts in the return value has an attribute for + eacho option or arg. The value args in the return value + is the remaining arguments. 
+ + @param argv arguments + @param opts option specifiers (list of Opt objects) + @param args arg specififiers (list of Arg objects) + @return (opts, args) + """ + shortopts = "".join([ x.optShort() for x in opts ]) + longopts = [ x.optLong() for x in opts ] + (ovals, oargs) = getopt(argv[1:], shortopts, longopts) + odir = Opts() + for x in opts: + x.setDefault(odir) + for (k, v) in ovals: + for x in opts: + x.setOpt(k, v, odir) + argc = len(oargs) + if len(oargs) < len(args): + raise GetoptError("insufficient arguments for %s" % argv[0]) + for (x, v) in zip(args, oargs): + x.setArg(v, odir) + return (odir, oargs[len(args): ]) + + getopt = classmethod(getopt) + + def gethelp(klass, opts, args): + l = [] + for x in opts: + l.append(x.help()) + for x in args: + l.append(x.help()) + return " ".join(l) + + gethelp = classmethod(gethelp) + + """A command=-line option. + + @param name option name (this attribute is set to value in opts) + @param short short option flag (single-character string) + @param long long option name (defaults to option name, pass "" to suppress) + @param arg argument name (option has no arg if not specified) + """ + def __init__(self, name, short=None, long=None, arg=False): + self.name = name + self.short = short + if long is None: + long = name + elif not long: + long = None + self.long = long + self.arg = arg + + def help(self): + s = self.keyShort() + l = self.keyLong() + if s and l: + return "[%s | %s]" % (s, l) + else: + return s or l + + def keyShort(self): + if self.short: + return "-%s" % self.short + else: + return None + + def keyLong(self): + if self.long: + return "--%s" % self.long + else: + return None + + def optLong(self): + if not self.long: + return None + if self.arg: + return "%s=" % self.long + else: + return self.long + + def optShort(self): + if not self.short: + return None + if self.arg: + return "%s:" % self.short + else: + return self.short + + def setDefault(self, vals): + if self.arg: + setattr(vals, self.name, 
None) + else: + setattr(vals, self.name, False) + + def setOpt(self, k, v, vals): + if k in [ self.keyShort(), self.keyLong() ]: + if self.arg: + setattr(vals, self.name, v) + else: + if v not in [ None, '' ]: + raise GetoptError("option %s does not take an argument" % k) + setattr(vals, self.name, True) + +class Arg: + + """A command-line parameter. Args get their values from arguments + left over after option processing and are assigned in order. + The value is accessible as the attribute called 'name' in opts. + + @param name argument name + """ + def __init__(self, name): + self.name = name + + def setArg(self, v, vals): + setattr(vals, self.name, v) + + def help(self): + return "<%s>" % self.name + +class VnMain: + + """Methods beginning with this prefix are commands. + They must all have arguments like this: + + op_foo(self, argv, args, opts) + + argv: original command-line arguments + args: arguments left after option processing + opts: option and arg values (accessible as attributes) + + Method options are specified by setting attribute + .opts on the method to a list of Option objects. + For args set .args to a list of Arg objects. + Use .use for short usage string, .help for long help. + + Each option or arg defines an attribute in opts. For example + an option with name 'foo' is accessible as 'opts.foo'. 
+ """ + opPrefix = "op_" + + def __init__(self, argv): + if argv: + self.name = argv[0] + else: + self.name = "vn" + self.argv = argv + self.argc = len(argv) + + def error(self, v): + print >>sys.stderr, "%s: %s" % (self.name, v) + sys.exit(1) + + def getFunction(self, opname): + key = self.opPrefix + opname.replace("-", "_") + fn = getattr(self, key, None) + if not fn: + raise ValueError("unknown command: %s" % opname) + return fn + + def main(self): + if self.argc < 2: + args = ["help"] + else: + args = self.argv[1:] + try: + fn = self.getFunction(args[0]) + except ValueError, ex: + self.error(ex) + try: + fnopts = self.getOpts(fn) + fnargs = self.getArgs(fn) + (opts, parms) = Opt.getopt(args, fnopts, fnargs) + return fn(args, parms, opts) + except GetoptError, ex: + self.error(ex) + except ValueError, ex: + self.error(ex) + except Exception, ex: + import traceback; traceback.print_exc() + self.error(ex) + + def getOpts(self, meth): + return getattr(meth, "opts", []) + + def getArgs(self, meth): + return getattr(meth, "args", []) + + def getUse(self, meth): + return getattr(meth, "use", "") + + def getHelp(self, meth): + return getattr(meth, "help", "") or self.getUse(meth) + + def fnHelp(self, meth): + return Opt.gethelp(self.getOpts(meth), self.getArgs(meth)) + + def printHelp(self, fn, opt_long): + meth = getattr(self, fn) + opname = fn[len(self.opPrefix):].replace("_", "-") + if opt_long: + help = self.getHelp(meth) + print "\n %s" % opname + if help: + print "%s" % help + else: + use = self.getUse(meth) + print " %s %s" % (opname, self.fnHelp(meth)) + if use: + print "\t\t%s" % use + + def show_vnif(self, dev): + cmd(CMD_IFCONFIG, dev) + bridge = Bridge.getBridge(dev) + if bridge: + print " Bridge:", bridge + interfaces = Bridge.getBridgeInterfaces(bridge) + if dev in interfaces: + interfaces.remove(dev) + if interfaces: + print " Interfaces:", ", ".join(interfaces) + print + + def op_help(self, argv, args, opts): + if opts.long: + print '%s <command> 
<options>' % self.name + print self.long_help + else: + print '%s:' % self.name + l = dir(self) + l.sort() + for fn in l: + if fn.startswith(self.opPrefix): + self.printHelp(fn, opts.long) + print + + op_help.opts = [ Opt('long', short='l') ] + + def op_vnets(self, argv, args, opts): + vnets = vnet_list(vnets=args or None) + for v in vnets: + prettyprint(v, width=50) + print + if not opts.long: + continue + vnif = sxp.child_value(v, "vnetif") + if not vnif: + continue + self.show_vnif(vnif) + if opts.all: + vnetids = {} + for v in vnets: + vnetids[sxp.child_value(v, "id")] = v + for v in vif_list(): + vnet = sxp.child_value(v, "vnet") + if vnet not in vnetids: + continue + prettyprint(v) + print + for v in varp_list(): + prettyprint(v) + print + + op_vnets.opts = [ Opt('all', short='a'), Opt('long', short='l') ] + + def op_vnifs(self, argv, args, opts): + vnifs = vnif_list(vnets=args or None) + for vnif in vnifs: + self.show_vnif(vnif) + + def op_vifs(self, argv, args, opts): + for v in vif_list(): + prettyprint(v) + print + + def op_varp(self, argv, args, opts): + for v in varp_list(): + prettyprint(v) + print + + def op_varp_flush(self, argv, args, opts): + varp_flush() + + def op_vnet_create(self, argv, args, opts): + return vnet_create(opts.vnet, + vnetif=opts.vnetif, + bridge=opts.bridge, + security=opts.security) + + op_vnet_create.args = [ Arg('vnet') ] + op_vnet_create.opts = [ Opt('security', short='s', arg="SECURITY"), + Opt('bridge', short='b', arg="BRIDGE"), + Opt('vnetif', short='v', arg="VNETIF") ] + + def op_vnet_delete(self, argv, args, opts): + vnetid = get_vnetid(opts.vnet) + return vnet_delete(vnetid, delbridge=opts.bridge) + + op_vnet_delete.args = [ Arg('vnet') ] + op_vnet_delete.opts = [ Opt('bridge', short='b') ] + + def op_vif_add(self, argv, args, opts): + vnetid = get_vnetid(opts.vnet) + if opts.interface: + vmac = get_mac(opts.vmac) + if not vmac: + raise ValueError("interface not found: %s" % opts.vmac) + else: + vmac = opts.vmac + 
return vif_add(vnetid, vmac) + + op_vif_add.args = [ Arg('vnet'), Arg('vmac') ] + op_vif_add.opts = [ Opt('interface', short='i') ] + + def op_vif_delete(self, argv, args, opts): + vnetid = get_vnetid(opts.vnet) + if opts.interface: + vmac = get_mac(opts.vmac) + else: + vmac = opts.vmac + return vif_del(vnetid, vmac) + + op_vif_delete.args = [ Arg('vnet'), Arg('vmac') ] + op_vif_delete.opts = [ Opt('interface', short='i') ] + + def op_peer_add(self, argv, args, opts): + addr = get_addr(opts.addr) + if(opts.port): + port = get_port(opts.port) + else: + port = None + return peer_add(addr, port) + + op_peer_add.args = [ Arg('addr') ] + op_peer_add.opts = [ Opt('port', short='p') ] + + def op_peer_delete(self, argv, args, opts): + addr = get_addr(opts.addr) + return peer_del(addr) + + op_peer_delete.args = [ Arg('addr') ] + + def op_peers(self, argv, args, opts): + for v in peer_list(): + prettyprint(v) + print + + def op_bridges(self, argv, args, opts): + if opts.long: + for bridge in Bridge.getBridges(): + cmd(CMD_IFCONFIG, bridge) + interfaces = Bridge.getBridgeInterfaces(bridge) + if interfaces: + print " Interfaces:", ", ".join(interfaces) + print + else: + for bridge in Bridge.getBridges(): + print bridge, + interfaces = Bridge.getBridgeInterfaces(bridge) + if interfaces: + print ":", ", ".join(interfaces) + else: + print + + op_bridges.opts = [ Opt('long', short='l') ] + + def op_insmod(self, argv, args, opts): + """Insert the vnet kernel module.""" + cmd("/etc/xen/scripts/vnet-insert", *args) + + long_help = """Control utility for vnets (virtual networking). +Report bugs to Mike Wray <mike.wray@xxxxxx>. +""" + + op_help.use = "Print help." + op_help.help = "Print help, long help if the option -l or --long is given." + + op_vnets.use = """Print vnets.""" + op_vnets.help = """Print vnet information, where options are: + -a, -all Print vnets, vifs and varp info. + -l, --long Print ifconfigs for vnet interfaces.""" + + op_vifs.use = "Print vifs." 
+ + op_vnifs.use = "Print ifconfigs for vnet network interfaces." + + op_varp.use = "Print varp info and entries in the varp cache." + + op_varp_flush.use = "Flush the varp cache." + + op_vnet_create.use = "Create a vnet." + + op_vnet_delete.use = "Delete a vnet." + op_vnet_delete.help = """Delete a vnet. + -b, --bridge Delete the bridge the vnet interface is attached to. + """ + + op_vif_add.use = "Add a vif to a vnet." + op_vif_add.help = """Add a vif to a vnet. Not usually needed as vifs +are added automatically. + -i, --interface The vmac is the name of an interface to get the mac from.""" + + op_vif_delete.use = "Delete a vif from a vnet." + op_vif_delete.help = """Delete a vif from a vnet. Not usually needed as vifs +are removed periodically. + -i, --interface The vmac is the name of an interface to get the mac from.""" + + op_peer_add.use = "Add a peer." + op_peer_add.help = """Add a peer: <addr> <port> +Vnets use multicast to discover interfaces, but networks are often configured +not to forward multicast. Vnets forward multicasts to peers using UDP. +Only add peers if multicasts are not working, check with + +ping -b 224.10.0.1 + +Only add peers at one machine in a subnet, otherwise you may cause forwarding +loops. +""" + + op_peer_delete.use = "Delete a peer." + op_peer_delete.help= "Delete a peer: <addr>" + + op_peers.use = "List peers." + op_peers.help = "List peers." + + op_bridges.use = "Print bridges." + + op_insmod.use = "Insert the vnet kernel module, optionally with parameters." 
+ +if __name__ == "__main__": + vn = VnMain(sys.argv) + vn.main() + diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/sxpr_util.c --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/sxpr_util.c Thu Feb 9 15:12:11 2006 @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free software Foundation, Inc., + * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA + * + */ +#include "sys_net.h" +#include "if_varp.h" +#include "varp_util.h" +#include "sxpr_util.h" + +int stringof(Sxpr exp, char **s){ + int err = 0; + if(ATOMP(exp)){ + *s = atom_name(exp); + } else if(STRINGP(exp)){ + *s = string_string(exp); + } else { + err = -EINVAL; + *s = NULL; + } + return err; +} + +int child_string(Sxpr exp, Sxpr key, char **s){ + int err = 0; + Sxpr val = sxpr_child_value(exp, key, ONONE); + err = stringof(val, s); + return err; +} + +int intof(Sxpr exp, int *v){ + int err = 0; + char *s; + unsigned long l; + if(INTP(exp)){ + *v = OBJ_INT(exp); + } else { + err = stringof(exp, &s); + if(err) goto exit; + err = convert_atoul(s, &l); + *v = (int)l; + } + exit: + return err; +} + +int child_int(Sxpr exp, Sxpr key, int *v){ + int err = 0; + Sxpr val = sxpr_child_value(exp, key, ONONE); + err = intof(val, v); + return err; +} + +int vnetof(Sxpr exp, VnetId *v){ + int err = 0; + char *s; + err = stringof(exp, &s); + 
if(err) goto exit; + err = VnetId_aton(s, v); + exit: + return err; +} + +int child_vnet(Sxpr exp, Sxpr key, VnetId *v){ + int err = 0; + Sxpr val = sxpr_child_value(exp, key, ONONE); + err = vnetof(val, v); + return err; +} + +int macof(Sxpr exp, unsigned char *v){ + int err = 0; + char *s; + err = stringof(exp, &s); + if(err) goto exit; + err = mac_aton(s, v); + exit: + return err; +} + +int child_mac(Sxpr exp, Sxpr key, unsigned char *v){ + int err = 0; + Sxpr val = sxpr_child_value(exp, key, ONONE); + err = macof(val, v); + return err; +} + +int addrof(Sxpr exp, uint32_t *v){ + int err = 0; + char *s; + unsigned long w; + err = stringof(exp, &s); + if(err) goto exit; + err = get_inet_addr(s, &w); + if(err) goto exit; + *v = (uint32_t)w; + exit: + return err; +} + +int child_addr(Sxpr exp, Sxpr key, uint32_t *v){ + int err = 0; + Sxpr val = sxpr_child_value(exp, key, ONONE); + err = addrof(val, v); + return err; +} diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/sxpr_util.h --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/sxpr_util.h Thu Feb 9 15:12:11 2006 @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free software Foundation, Inc., + * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA + * + */ +#ifndef _SXPR_UTIL_H_ +#define _SXPR_UTIL__H_ + +#include "sxpr.h" +struct VnetId; + +int stringof(Sxpr exp, char **s); +int child_string(Sxpr exp, Sxpr key, char **s); +int intof(Sxpr exp, int *v); +int child_int(Sxpr exp, Sxpr key, int *v); +int vnetof(Sxpr exp, struct VnetId *v); +int child_vnet(Sxpr exp, Sxpr key, struct VnetId *v); +int macof(Sxpr exp, unsigned char *v); +int child_mac(Sxpr exp, Sxpr key, unsigned char *v); +int addrof(Sxpr exp, uint32_t *v); +int child_addr(Sxpr exp, Sxpr key, uint32_t *v); + +#endif /* ! _SXPR_UTIL_H_ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/timer_util.c --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/timer_util.c Thu Feb 9 15:12:11 2006 @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free software Foundation, Inc., + * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifdef __KERNEL__ +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/version.h> + +#include <linux/spinlock.h> +#include <asm/semaphore.h> + +#else + +#include "sys_kernel.h" +#include "spinlock.h" + +#endif + +#include "timer_util.h" + +#define MODULE_NAME "TIMER" +#define DEBUG 1 +#undef DEBUG +#include "debug.h" + +#ifdef __KERNEL__ +/* Initialise a kernel timer that will call fn with data (cast to unsigned long) when it fires. */ +void timer_init(struct timer_list *timer, void (*fn)(unsigned long), void *data){ + init_timer(timer); + timer->data = (unsigned long)data; + timer->function = fn; +} +/* Arm the timer to fire ttl jiffies from now, re-arming it if it is already pending. */ +void timer_set(struct timer_list *timer, unsigned long ttl){ + unsigned long now = jiffies; + /* mod_timer() also copes with a pending timer; add_timer() must not be used on one. */ + mod_timer(timer, now + ttl); +} + +#else +/* User-space variant: reset the Timer and record the callback and its data. */ +void timer_init(struct Timer *timer, void (*fn)(unsigned long), void *data){ + *timer = (struct Timer){}; + timer->data = (unsigned long)data; + timer->fn = fn; +} +/* User-space variant: ttl is in ticks of HZ per second; any earlier setting is cancelled first. */ +void timer_set(struct Timer *timer, unsigned long ttl){ + double now = time_now(); + timer->expiry = now + (double)ttl/(double)HZ; + Timer_cancel(timer); + Timer_add(timer); +} + +#endif diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/timer_util.h --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/timer_util.h Thu Feb 9 15:12:11 2006 @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free software Foundation, Inc., + * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifndef _VNET_TIMER_UTIL_H_ +#define _VNET_TIMER_UTIL_H_ + +#ifdef __KERNEL__ + +struct timer_list; +#define timer_cancel del_timer + +#else /* __KERNEL__ */ + +#include "timer.h" +#define timer_list Timer +#define HZ 1000 +#define jiffies (unsigned long)(time_now() * HZ) +#define timer_cancel Timer_cancel + +#endif /* __KERNEL__ */ + +void timer_init(struct timer_list *timer, void (*fn)(unsigned long), void *data); +void timer_set(struct timer_list *timer, unsigned long ttl); + +#endif /*! _VNET_TIMER_UTIL_H_ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/vnet_eval.c --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/vnet_eval.c Thu Feb 9 15:12:11 2006 @@ -0,0 +1,378 @@ +/* + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free software Foundation, Inc., + * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifdef __KERNEL__ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/version.h> +#include <linux/errno.h> + +#else + +#include "sys_kernel.h" +#include "spinlock.h" + +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> + +#endif + +#include "vnet.h" +#include "varp.h" +#include "vif.h" +#include "vnet_forward.h" +#include "sa.h" + +#include "iostream.h" + +#ifdef __KERNEL__ +#include "kernel_stream.h" +#else +#include "file_stream.h" +#endif + +#include "sxpr_util.h" +#include "vnet_eval.h" + +#define MODULE_NAME "VNET" +#define DEBUG 1 +#undef DEBUG +#include "debug.h" + +/** Create a vnet. + * It is an error if a vnet with the same id exists. + * + * @param vnet vnet id + * @param device vnet device name + * @param security security level + * @return 0 on success, error code otherwise + */ +static int ctrl_vnet_add(VnetId *vnet, char *device, int security){ + int err = 0; + Vnet *vnetinfo = NULL; + + if(strlen(device) >= IFNAMSIZ){ + err = -EINVAL; + goto exit; + } + if(Vnet_lookup(vnet, NULL) == 0){ + err = -EEXIST; + goto exit; + } + err = Vnet_alloc(&vnetinfo); + if(err) goto exit; + vnetinfo->vnet = *vnet; + vnetinfo->security = security; + strcpy(vnetinfo->device, device); + err = Vnet_create(vnetinfo); + exit: + if(vnetinfo) Vnet_decref(vnetinfo); + return err; +} + +/** Create an entry for a vif with the given vnet and vmac. 
+ * + * @param vnet vnet id + * @param vmac mac address + * @return 0 on success, error code otherwise + */ +static int ctrl_vif_add(VnetId *vnet, Vmac *vmac){ + int err = 0; + Vif *vif = NULL; + + err = Vnet_lookup(vnet, NULL); + if(err) goto exit; + err = vif_create(vnet, vmac, 0, &vif); + exit: + if(vif) vif_decref(vif); + return err; +} + +/** Delete a vif. + * + * @param vnet vnet id + * @param vmac mac address + * @return 0 on success, error code otherwise + */ +static int ctrl_vif_del(VnetId *vnet, Vmac *vmac){ + int err = 0; + Vif *vif = NULL; + + err = Vnet_lookup(vnet, NULL); + if(err) goto exit; + err = vif_lookup(vnet, vmac, &vif); + if(err) goto exit; + vif_remove(vnet, vmac); + exit: + if(vif) vif_decref(vif); + return err; +} + +/** (varp.print) + */ +static int eval_varp_print(Sxpr exp, IOStream *out, void *data){ + int err = 0; + vnet_print(out); + vif_print(out); + varp_print(out); + return err; +} + +static int eval_varp_list(Sxpr exp, IOStream *out, void *data){ + int err = 0; + varp_print(out); + return err; +} + +/** (varp.mcaddr (addr <addr>)) + */ +static int eval_varp_mcaddr(Sxpr exp, IOStream *out, void *data){ + int err =0; + Sxpr oaddr = intern("addr"); + uint32_t addr; + + err = child_addr(exp, oaddr, &addr); + if(err < 0) goto exit; + varp_set_mcast_addr(addr); + exit: + return err; +} + +/** (varp.flush) + */ +static int eval_varp_flush(Sxpr exp, IOStream *out, void *data){ + int err = 0; + varp_flush(); + return err; +} + +/** (vnet.add (id <id>) + * [(vnetif <name>)] + * [(security { none | auth | conf } )] + * ) + */ +int eval_vnet_add(Sxpr exp, IOStream *out, void *data){ + int err = 0; + Sxpr oid = intern("id"); + Sxpr osecurity = intern("security"); + Sxpr ovnetif = intern("vnetif"); + Sxpr csecurity; + VnetId vnet = {}; + char *device = NULL; + char dev[IFNAMSIZ] = {}; + char *security = NULL; + int sec; + + err = child_vnet(exp, oid, &vnet); + if(err) goto exit; + child_string(exp, ovnetif, &device); + if(!device){ + 
snprintf(dev, IFNAMSIZ-1, "vnif%04x", ntohs(vnet.u.vnet16[7])); + device = dev; + } + csecurity = sxpr_child_value(exp, osecurity, intern("none")); + err = stringof(csecurity, &security); + if(err) goto exit; + if(strcmp(security, "none")==0){ + sec = 0; + } else if(strcmp(security, "auth")==0){ + sec = SA_AUTH; + } else if(strcmp(security, "conf")==0){ + sec = SA_CONF; + } else { + err = -EINVAL; + goto exit; + } + err = ctrl_vnet_add(&vnet, device, sec); + exit: + return err; +} + +/** Delete a vnet. + * + * (vnet.del (id <id>)) + * + * @param exp request; its (id <id>) child names the vnet to delete + * @return 0 on success, error code otherwise + */ +static int eval_vnet_del(Sxpr exp, IOStream *out, void *data){ + int err = 0; + Sxpr oid = intern("id"); + VnetId vnet = {}; + + err = child_vnet(exp, oid, &vnet); + if(err) goto exit; + err = Vnet_del(&vnet); + exit: + return err; +} +/** Print the vnets to out; always returns 0. */ +static int eval_vnet_list(Sxpr exp, IOStream *out, void *data){ + int err = 0; + vnet_print(out); + return err; +} + +/** (vif.add (vnet <vnet>) (vmac <macaddr>)) + */ +static int eval_vif_add(Sxpr exp, IOStream *out, void *data){ + int err = 0; + Sxpr ovnet = intern("vnet"); + Sxpr ovmac = intern("vmac"); + VnetId vnet = {}; + Vmac vmac = {}; + + err = child_vnet(exp, ovnet, &vnet); + if(err) goto exit; + err = child_mac(exp, ovmac, vmac.mac); + if(err) goto exit; + err = ctrl_vif_add(&vnet, &vmac); + exit: + return err; +} + +/** (vif.del (vnet <vnet>) (vmac <macaddr>)) + */ +static int eval_vif_del(Sxpr exp, IOStream *out, void *data){ + int err = 0; + Sxpr ovnet = intern("vnet"); + Sxpr ovmac = intern("vmac"); + VnetId vnet = {}; + Vmac vmac = {}; + + err = child_vnet(exp, ovnet, &vnet); + if(err) goto exit; + err = child_mac(exp, ovmac, vmac.mac); + if(err) goto exit; + err = ctrl_vif_del(&vnet, &vmac); + exit: + return err; +} +/** Print the vifs to out; always returns 0. */ +static int eval_vif_list(Sxpr exp, IOStream *out, void *data){ + int err = 0; + vif_print(out); + return err; +} + +/** Eval a peer add request. 
+ * + * (peer.add (addr <addr>) [(port <port>)]) + * If child_int() fails for the port child, varp_port is used instead. + * @param exp request + * @param out output stream (not used by this function) + * @param data data (not used by this function) + * @return 0 on success, error code otherwise + */ +int eval_peer_add(Sxpr exp, IOStream *out, void *data){ + int err = 0; + Sxpr oaddr = intern("addr"); + Sxpr oport = intern("port"); + VarpAddr addr = { .family = AF_INET }; + int port; + + err = child_addr(exp, oaddr, &addr.u.ip4.s_addr); + if(err < 0) goto exit; + err = child_int(exp, oport, &port); + if(err < 0){ + err = 0; + port = varp_port; + } + if(err) goto exit; + err = vnet_peer_add(&addr, port); /* NOTE(review): a parsed port is presumably host order while varp_port is stored as-is - confirm the byte order vnet_peer_add() expects. */ + exit: + return err; +} + +/** Eval a peer delete request. + * + * (peer.del (addr <addr>)) + * + * The address is looked up as an IPv4 peer (family AF_INET). + * @param exp request + * @param out output stream + * @param data data + * @return 0 on success, error code otherwise + */ +static int eval_peer_del(Sxpr exp, IOStream *out, void *data){ + int err = 0; + Sxpr oaddr = intern("addr"); + VarpAddr addr = { .family = AF_INET }; + + err = child_addr(exp, oaddr, &addr.u.ip4.s_addr); + if(err < 0) goto exit; + err = vnet_peer_del(&addr); + exit: + return err; +} + +/** Eval a peer list request. 
+ * + * (peer.list) + * + * @param exp request + * @param out output stream + * @param data data + * @return 0 on success, error code otherwise + */ +static int eval_peer_list(Sxpr exp, IOStream *out, void *data){ + int err = 0; + vnet_peer_print(out); + return err; +} + +int vnet_eval_defs(SxprEval *defs, Sxpr exp, IOStream *io, void *data){ + int err = 0; + SxprEval *def; + + iprintf("> "); objprint(iostdout, exp, 0); IOStream_print(iostdout, "\n"); + err = -ENOSYS; + for(def = defs; !NONEP(def->name); def++){ + if(sxpr_elementp(exp, def->name)){ + err = def->fn(exp, io, data); + break; + } + } + iprintf("< err=%d\n", err); + return err; +} + +int vnet_eval(Sxpr exp, IOStream *io, void *data){ + SxprEval defs[] = { + { .name = intern("peer.add"), .fn = eval_peer_add }, + { .name = intern("peer.del"), .fn = eval_peer_del }, + { .name = intern("peer.list"), .fn = eval_peer_list }, + { .name = intern("varp.flush"), .fn = eval_varp_flush }, + { .name = intern("varp.list"), .fn = eval_varp_list }, + { .name = intern("varp.mcaddr"), .fn = eval_varp_mcaddr }, + { .name = intern("varp.print"), .fn = eval_varp_print }, + { .name = intern("vif.add"), .fn = eval_vif_add }, + { .name = intern("vif.del"), .fn = eval_vif_del }, + { .name = intern("vif.list"), .fn = eval_vif_list }, + { .name = intern("vnet.add"), .fn = eval_vnet_add }, + { .name = intern("vnet.del"), .fn = eval_vnet_del }, + { .name = intern("vnet.list"), .fn = eval_vnet_list }, + { .name = ONONE, .fn = NULL } }; + return vnet_eval_defs(defs, exp, io, data); +} diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/vnet_eval.h --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/vnet_eval.h Thu Feb 9 15:12:11 2006 @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of 
the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free software Foundation, Inc., + * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA + * + */ +#ifndef _VNET_EVAL_H_ +#define _VNET_EVAL_H_ + +#include "sxpr.h" +struct IOStream; + +typedef struct SxprEval { + Sxpr name; + int (*fn)(Sxpr, struct IOStream *, void *data); +} SxprEval; + +extern int eval_peer_add(Sxpr exp, struct IOStream *out, void *data); +extern int eval_vnet_add(Sxpr exp, struct IOStream *out, void *data); +extern int vnet_eval_defs(SxprEval *defs, Sxpr exp, struct IOStream *out, void *data); +extern int vnet_eval(Sxpr exp, struct IOStream *out, void *data); + +#endif /* ! _VNET_EVAL_H_ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/vnet_forward.c --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/vnet_forward.c Thu Feb 9 15:12:11 2006 @@ -0,0 +1,383 @@ +/* + * Copyright (C) 2005, 2006 Mike Wray <mike.wray@xxxxxx> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free software Foundation, Inc., + * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA + * + */ +#ifdef __KERNEL__ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> + +#include <linux/version.h> +#include <linux/spinlock.h> + +#include <linux/skbuff.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netfilter_bridge.h> +#include <linux/netfilter_ipv4.h> +#include <linux/udp.h> + +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/checksum.h> + +#else + +#include <netinet/in.h> +#include <arpa/inet.h> + +#include "sys_kernel.h" +#include "spinlock.h" +#include "skbuff.h" +#include <linux/ip.h> +#include <linux/udp.h> + +#endif + +#include <varp.h> +#include <if_varp.h> +#include <varp.h> +#include <skb_util.h> +#include <skb_context.h> + +#include "allocate.h" +#include "iostream.h" +#include "hash_table.h" +#include "vnet_forward.h" + +#define MODULE_NAME "VNET" +#define DEBUG 1 +#undef DEBUG +#include "debug.h" + +extern int _skb_xmit(struct sk_buff *skb, uint32_t saddr); + +typedef struct VnetPeer { + struct VarpAddr addr; + uint16_t port; + atomic_t refcount; + int tx_packets; + int rx_packets; +} VnetPeer; + +static HashTable *vnet_peer_table = NULL; +static rwlock_t vnet_peer_table_lock = RW_LOCK_UNLOCKED; + +#define vnet_peer_read_lock(flags) read_lock_irqsave(&vnet_peer_table_lock, (flags)) +#define vnet_peer_read_unlock(flags) read_unlock_irqrestore(&vnet_peer_table_lock, (flags)) +#define vnet_peer_write_lock(flags) write_lock_irqsave(&vnet_peer_table_lock, (flags)) +#define vnet_peer_write_unlock(flags) write_unlock_irqrestore(&vnet_peer_table_lock, (flags)) + +static void VnetPeer_decref(VnetPeer *peer){ + if(!peer) return; + 
if(atomic_dec_and_test(&peer->refcount)){ + kfree(peer); + } +} + +static void VnetPeer_incref(VnetPeer *peer){ + if(!peer) return; + atomic_inc(&peer->refcount); +} +/* Print a peer as an s-expression: its address, port and tx/rx packet counters. */ +static void VnetPeer_print(VnetPeer *peer, IOStream *io){ + char addrbuf[VARP_ADDR_BUF]; + + IOStream_print(io, "(vnet_peer\n"); + IOStream_print(io, " (addr %s)\n", VarpAddr_ntoa(&peer->addr, addrbuf)); + IOStream_print(io, " (port %d)\n", htons(peer->port)); + IOStream_print(io, " (tx_packets %d)\n", peer->tx_packets); + IOStream_print(io, " (rx_packets %d)\n", peer->rx_packets); + IOStream_print(io, ")\n"); +} + +static int VnetPeer_forward(VnetPeer *peer, struct sk_buff *fwdskb){ + int err = 0; + const int ip_n = sizeof(struct iphdr); + const int udp_n = sizeof(struct udphdr); + const int vnet_n = sizeof(struct VnetMsgHdr); + int head_n = 16 + ip_n + udp_n + vnet_n; + int push_n = 0; + struct sk_buff *skb = NULL; + struct VnetMsgHdr *vhdr; + uint32_t saddr = 0; + uint16_t sport = varp_port; + uint32_t daddr = peer->addr.u.ip4.s_addr; + uint16_t dport = varp_port; + + if(!fwdskb) goto exit; + if(daddr == fwdskb->nh.iph->saddr){ + // Don't forward if the skb src addr is the peer addr. + dprintf("> Forward loop on " IPFMT "\n", NIPQUAD(daddr)); + goto exit; + } + // On entry fwdskb->data should be at fwdskb->nh.raw (adjust if not). + // Also fwdskb->h.raw and fwdskb->nh.raw are set. + if(fwdskb->data > fwdskb->nh.raw){ + push_n = fwdskb->data - fwdskb->nh.raw; + head_n += push_n; + } + // If has headroom, copies header (which incs ref on dst), + // otherwise only clones header, which does not inc ref on dst. 
+ skb = skb_realloc_headroom(fwdskb, head_n); + //skb = skb_copy_expand(fwdskb, head_n, 0, GFP_ATOMIC); + if(!skb){ + err = -ENOMEM; + goto exit; + } + + if(push_n){ + skb_push(skb, push_n); + } + +#ifdef DEBUG + printk("\nOriginal packet:\n"); + print_iphdr(__FUNCTION__, skb); + skb_print_bits(__FUNCTION__, skb, 0, skb->len); +#endif + + skb->mac.raw = NULL; + vhdr = (void*)skb_push(skb, vnet_n); + vhdr->id = htons(VFWD_ID); + vhdr->opcode = 0; + + // Setup the UDP header. + skb->h.raw = skb_push(skb, udp_n); + skb->h.uh->source = sport; // Source port. + skb->h.uh->dest = dport; // Destination port. + skb->h.uh->len = htons(skb->len); // Total packet length (bytes). + skb->h.uh->check = 0; + + // Setup the IP header. + skb->nh.raw = skb_push(skb, ip_n); + skb->nh.iph->version = 4; // Standard version. + skb->nh.iph->ihl = ip_n / 4; // IP header length (32-bit words). + skb->nh.iph->tos = 0; // No special type-of-service. + skb->nh.iph->tot_len = htons(skb->len); // Total packet length (bytes). + skb->nh.iph->id = 0; // No flow id. + skb->nh.iph->protocol = IPPROTO_UDP; // IP protocol number. + skb->nh.iph->frag_off = 0; + skb->nh.iph->ttl = 64; // Linux default time-to-live. + skb->nh.iph->saddr = saddr; // Source address. + skb->nh.iph->daddr = daddr; // Destination address. + skb->nh.iph->check = 0; + +#ifdef DEBUG + printk("\nWrapped packet:\n"); + print_iphdr(__FUNCTION__, skb); + print_udphdr(__FUNCTION__, skb); + skb_print_bits(__FUNCTION__, skb, 0, 0 * skb->len); +#endif + + err = _skb_xmit(skb, saddr); + peer->tx_packets++; + + exit: + if(err < 0) kfree_skb(skb); + return err; +} + +int vnet_peer_get(VarpAddr *addr, VnetPeer **peer){ + unsigned long flags; + + vnet_peer_read_lock(flags); + *peer = HashTable_get(vnet_peer_table, addr); + VnetPeer_incref(*peer); + vnet_peer_read_unlock(flags); + return (*peer ? 
0 : -ENOENT); +} +/* Add a peer keyed by addr; returns 0 if it is added or already present, -ENOMEM on failure. */ +int vnet_peer_add(VarpAddr *addr, uint16_t port){ + int err = 0; + unsigned long flags; + VnetPeer *peer; + + vnet_peer_write_lock(flags); + peer = HashTable_get(vnet_peer_table, addr); + if(peer){ + /* Already present: the table holds its reference, so take no extra one. */ + goto exit; + } + peer = ALLOCATE(VnetPeer); + if(!peer){ + err = -ENOMEM; + goto exit; + } + peer->addr = *addr; + peer->port = port; + VnetPeer_incref(peer); + if(!HashTable_add(vnet_peer_table, &peer->addr, peer)){ + VnetPeer_decref(peer); + err = -ENOMEM; + } + exit: + vnet_peer_write_unlock(flags); + return err; +} +/* Remove the peer with the given address; returns the result of HashTable_remove(). */ +int vnet_peer_del(VarpAddr *addr){ + int ret = 0; + unsigned long flags; + + vnet_peer_write_lock(flags); + ret = HashTable_remove(vnet_peer_table, addr); + vnet_peer_write_unlock(flags); + return ret; +} +/* Print every peer in the table to io; does nothing when there is no table. */ +void vnet_peer_print(IOStream *io){ + HashTable_for_decl(entry); + unsigned long flags; + + if(!vnet_peer_table) return; + vnet_peer_read_lock(flags); + HashTable_for_each(entry, vnet_peer_table){ + VnetPeer *peer = entry->value; + VnetPeer_print(peer, io); + } + vnet_peer_read_unlock(flags); +} +/* Forward skb to every peer in the table; per-peer results are ignored and 0 is returned. */ +int vnet_forward_send(struct sk_buff *skb){ + int err = 0; + unsigned long flags; + HashTable_for_decl(entry); + int count = 0; + + if(!vnet_peer_table){ + goto exit; + } + vnet_peer_read_lock(flags); + HashTable_for_each(entry, vnet_peer_table){ + VnetPeer *peer = entry->value; + VnetPeer_forward(peer, skb); + count++; + } + vnet_peer_read_unlock(flags); + exit: + return err; +} + +int vnet_forward_recv(struct sk_buff *skb){ + int err = 0; + VarpAddr addr = { .family = AF_INET }; + VnetPeer *peer = NULL; + unsigned char eth[ETH_HLEN] = {}; + struct sk_buff *recvskb; + + if(!vnet_peer_table){ + dprintf("> no table\n"); + return -ENOSYS; + } + // On entry mac.raw, h.raw, nh.raw are set. + // skb->data points after the fwd vnet header, at the complete + // forwarded packet (which has IP hdr, no eth hdr). + + // Save the eth hdr and source addr (peer). 
+ memcpy(eth, skb->mac.raw, ETH_HLEN); + addr.u.ip4.s_addr = skb->nh.iph->saddr; + err = vnet_peer_get(&addr, &peer); + if(err){ + wprintf("> no peer for " IPFMT "\n", NIPQUAD(skb->nh.iph->saddr)); + goto exit; + } + peer->rx_packets++; + skb->mac.raw = NULL; + skb->nh.raw = skb->data; + skb->h.raw = (void*)(skb->nh.iph + 1); + if(!skb->nh.iph->saddr){ + skb->nh.iph->saddr = addr.u.ip4.s_addr; + } +#ifdef __KERNEL__ + // Fix IP options, checksum, skb dst, netfilter state. + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->dev = NULL; + dst_release(skb->dst); + skb->dst = NULL; + nf_reset(skb); +#endif // __KERNEL__ + + skb->mac.raw = skb->nh.raw - ETH_HLEN; + memcpy(skb->mac.raw, eth, ETH_HLEN); + + // Map destination mcast addresses to our mcast address. + if(MULTICAST(skb->nh.iph->daddr)){ + skb->nh.iph->daddr = varp_mcast_addr; + //xmit does this: ip_eth_mc_map(varp_mcast_addr, eth_hdr(skb)->h_dest); + } + + // Handle (a copy of) it ourselves, because + // if it is looped-back by xmit it will be ignored. + //recvskb = skb_clone(skb, GFP_ATOMIC); + recvskb = pskb_copy(skb, GFP_ATOMIC); + if(recvskb){ + // Data points at the unwrapped iphdr, but varp_handle_message() + // expects it to point at the udphdr, so pull. + skb_pull(recvskb, sizeof(struct iphdr)); + if(varp_handle_message(recvskb) <= 0){ + kfree_skb(recvskb); + } + } + err = _skb_xmit(skb, skb->nh.iph->saddr); + if(err >= 0) err = 1; + exit: + return err; +} + +/** Hash function for keys in the peer table. + */ +static Hashcode peer_key_hash_fn(void *k){ + return hash_hvoid(0, k, sizeof(struct VarpAddr)); +} + +/** Equality function for keys in the peer table. 
+ */ +static int peer_key_equal_fn(void *k1, void *k2){ + return memcmp(k1, k2, sizeof(struct VarpAddr)) == 0; +} + +static void peer_entry_free_fn(HashTable *table, HTEntry *entry){ + if(!entry) return; + VnetPeer_decref((VnetPeer*)entry->value); + HTEntry_free(entry); +} + +int vnet_forward_init(void){ + int err = 0; + if(vnet_peer_table) goto exit; + vnet_peer_table = HashTable_new(0); + if(!vnet_peer_table){ + err = -ENOMEM; + goto exit; + } + vnet_peer_table->key_equal_fn = peer_key_equal_fn; + vnet_peer_table->key_hash_fn = peer_key_hash_fn; + vnet_peer_table->entry_free_fn = peer_entry_free_fn; + exit: + return err; +} + +void vnet_forward_exit(void){ + HashTable_free(vnet_peer_table); + vnet_peer_table = NULL; +} diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnet-module/vnet_forward.h --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnet-module/vnet_forward.h Thu Feb 9 15:12:11 2006 @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2005, 2006 Mike Wray <mike.wray@xxxxxx> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free software Foundation, Inc., + * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA + * + */ +#ifndef _VNET_FORWARD_H_ +#define _VNET_FORWARD_H_ + +#include <if_varp.h> + +struct sk_buff; +struct IOStream; + +extern int vnet_peer_add(struct VarpAddr *addr, uint16_t port); +extern int vnet_peer_del(struct VarpAddr *addr); +extern void vnet_peer_print(struct IOStream *io); + +extern int vnet_forward_send(struct sk_buff *skb); +extern int vnet_forward_recv(struct sk_buff *skb); +extern int vnet_forward_init(void); +extern void vnet_forward_exit(void); + +#endif /* _VNET_FORWARD_H_ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/list.h --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnetd/list.h Thu Feb 9 15:12:11 2006 @@ -0,0 +1,284 @@ +#ifndef _VNETD_LIST_H_ +#define _VNETD_LIST_H_ + +/* Taken from Linux kernel code, but de-kernelized for userspace. */ +#include <stddef.h> + +/* + * These are non-NULL pointers that will result in page faults + * under normal circumstances, used to verify that nobody uses + * non-initialized list entries. + */ +#define LIST_POISON1 ((void *) 0x00100100) +#define LIST_POISON2 ((void *) 0x00200200) + +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. 
+ */ + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +/** + * list_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is + * in an undefined state. 
+ */ +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; +} + +/** + * list_del_rcu - deletes entry from list without re-initialization + * @entry: the element to delete from the list. + * + * Note: list_empty on entry does not return true after this, + * the entry is in an undefined state. It is useful for RCU based + * lockfree traversal. + * + * In particular, it means that we can not poison the forward + * pointers that may still be used for walking the list. + */ +static inline void list_del_rcu(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->prev = LIST_POISON2; +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. 
+ */ +static inline int list_empty(struct list_head *head) +{ + return head->next == head; +} + +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * list_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static inline void list_splice(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + +/** + * list_for_each_prev - iterate over a list backwards + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev; pos != (head); pos = pos->prev) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop counter. 
+ * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +/** + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * list_for_each_entry_reverse - iterate backwards over list of given type. + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + + +/** + * list_for_each_entry_safe - iterate over list of given type safe against + * removal of list entry + * @pos: the type * to use as a loop counter. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + + + +#endif /* _VNETD_LIST_H_ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/selector.c --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnetd/selector.c Thu Feb 9 15:12:11 2006 @@ -0,0 +1,133 @@ +/* + * Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx>. 
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version. This library is
+ * distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include "connection.h"
+#include "selector.h"
+
+#define MODULE_NAME "select"
+#define DEBUG 1
+#undef DEBUG
+#include "debug.h"
+
+/** Initialise a selector's list link so that it is on no list.
+ *
+ * @param sel selector
+ */
+void Selector_init(Selector *sel){
+    INIT_LIST_HEAD(&sel->list);
+}
+
+/** Close a selector: run its 'close' hook (if any), then unlink it
+ * from the list it is on (if any).
+ *
+ * @param sel selector (may be null)
+ */
+void Selector_close(Selector *sel){
+    struct list_head *link;
+
+    if(sel == NULL) return;
+    dprintf(">\n");
+    if(sel->close != NULL){
+        sel->close(sel);
+    }
+    link = &sel->list;
+    // Nothing to unlink if the link was never initialised, has been
+    // poisoned by list_del(), or points only at itself.
+    if(link->next == NULL
+       || link->next == LIST_POISON1
+       || list_empty(link)){
+        return;
+    }
+    list_del_init(link);
+}
+
+/** Ask a selector to add itself to a select set.
+ * A selector without a 'select' function, or whose 'select'
+ * function returns non-zero, is closed.
+ *
+ * @param sel selector
+ * @param set select set
+ * @return 0 on success, -EINVAL if there is no 'select' function,
+ *         otherwise the error returned by it
+ */
+int Selector_select(Selector *sel, SelectSet *set){
+    int status;
+
+    dprintf(">\n");
+    status = (sel->select ? sel->select(sel, set) : -EINVAL);
+    if(status != 0){
+        Selector_close(sel);
+    }
+    return status;
+}
+
+/** Notify a selector of the result of a select.
+ * A selector without a 'selected' function, or whose 'selected'
+ * function returns non-zero, is closed.
+ *
+ * @param sel selector
+ * @param set select set
+ * @return 0 on success, -EINVAL if there is no 'selected' function,
+ *         otherwise the error returned by it
+ */
+int Selector_selected(Selector *sel, SelectSet *set){
+    int status;
+
+    dprintf(">\n");
+    status = (sel->selected ? sel->selected(sel, set) : -EINVAL);
+    if(status != 0){
+        Selector_close(sel);
+    }
+    return status;
+}
+
+/** 'select' hook for connection selectors: adds the connection's
+ * socket to the select set using the connection's mode.
+ *
+ * @param sel selector whose data is a Conn
+ * @param set select set
+ * @return 0 on success, -EINVAL if the selector has no connection
+ */
+int conn_select_fn(Selector *sel, SelectSet *set){
+    Conn *conn = sel->data;
+
+    dprintf(">\n");
+    if(!conn){
+        return -EINVAL;
+    }
+    SelectSet_add(set, conn->sock, conn->mode);
+    return 0;
+}
+
+/** 'selected' hook for connection selectors: passes the select set
+ * to Conn_handle().
+ *
+ * @param sel selector whose data is a Conn
+ * @param set select set
+ * @return result of Conn_handle(), or -EINVAL if the selector
+ *         has no connection
+ */
+int conn_selected_fn(Selector *sel, SelectSet *set){
+    Conn *conn = sel->data;
+
+    dprintf(">\n");
+    return (conn ? Conn_handle(conn, set) : -EINVAL);
+}
+
+/** 'close' hook for connection selectors: closes the connection,
+ * if there is one.
+ *
+ * @param sel selector whose data is a Conn
+ */
+void conn_close_fn(Selector *sel){
+    Conn *conn = sel->data;
+
+    wprintf("> sel=%p\n", sel);
+    if(!conn){
+        return;
+    }
+    Conn_close(conn);
+}
+
+/** Set up a selector to drive a connection.
+ *
+ * @param sel selector to set up
+ * @param conn connection to attach
+ * @param mode select mode
+ * @param data user data stored in the connection
+ * @param fn handler stored in the connection
+ */
+void Selector_conn_init(Selector *sel, Conn *conn,
+                        int mode, void *data,
+                        int (*fn)(struct Conn *conn, int mode)){
+    // Wire the selector to the connection hooks.
+    sel->select = conn_select_fn;
+    sel->selected = conn_selected_fn;
+    sel->close = conn_close_fn;
+    sel->data = conn;
+    // NOTE(review): 'mode' is not used; the connection is always
+    // set to SELECT_READ. Confirm against callers whether intended.
+    conn->mode = SELECT_READ;
+    conn->data = data;
+    conn->fn = fn;
+}
+
diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/selector.h
--- /dev/null Thu Feb 9 15:09:00 2006
+++ b/tools/vnet/vnetd/selector.h Thu Feb 9 15:12:11 2006
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx>.
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version. This library is
+ * distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _VNETD_SELECTOR_H_ +#define _VNETD_SELECTOR_H_ + +#include "list.h" +#include "select.h" + +struct Conn; + +typedef struct Selector { + + /** List the selector is linked into (if any). */ + struct list_head list; + + /** Function called by Selector_select() to add a selector to a select set. + * The selector is closed if this returns an error (non-zero). + */ + int (*select)(struct Selector *sel, struct SelectSet *set); + + /** Function called by Selector_selected() to notify a selector of select set. + * The selector is closed if this returns an error (non-zero). + */ + int (*selected)(struct Selector *sel, struct SelectSet *set); + + /** Function called by Selector_close() to close a selector. + */ + void (*close)(struct Selector *sel); + + /** User data. */ + void *data; + +} Selector; + +void Selector_init(struct Selector *sel); +void Selector_close(struct Selector *sel); +int Selector_select(struct Selector *sel, struct SelectSet *set); +int Selector_selected(struct Selector *sel, struct SelectSet *set); + +int conn_select_fn(struct Selector *sel, struct SelectSet *set); +int conn_selected_fn(struct Selector *sel, struct SelectSet *set); +void conn_close_fn(struct Selector *sel); +void Selector_conn_init(struct Selector *sel, struct Conn *conn, + int mode, void *data, + int (*fn)(struct Conn *conn, int mode)); + +#endif /* _VNETD_SELECTOR_H_ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/skbuff.c --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnetd/skbuff.c Thu Feb 9 15:12:11 2006 @@ -0,0 +1,530 @@ +/* + * Routines having to do with the 'struct sk_buff' memory handlers. 
+ * + * Authors: Alan Cox <iiitac@xxxxxxxxxxxxxx> + * Florian La Roche <rzsfl@xxxxxxxxxxxx> + * + * Fixes: + * Alan Cox : Fixed the worst of the load + * balancer bugs. + * Dave Platt : Interrupt stacking fix. + * Richard Kooijman : Timestamp fixes. + * Alan Cox : Changed buffer format. + * Alan Cox : destructor hook for AF_UNIX etc. + * Linus Torvalds : Better skb_clone. + * Alan Cox : Added skb_copy. + * Alan Cox : Added all the changed routines Linus + * only put in the headers + * Ray VanTassle : Fixed --skb->lock in free + * Alan Cox : skb_copy copy arp field + * Andi Kleen : slabified it. + * Robert Olsson : Removed skb_head_pool + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include "allocate.h" +#include "debug.h" +#include "skbuff.h" + +#define SKB_DATA_ALIGN(size) ((((size) + 7) >> 3) << 3) + +/** + * skb_over_panic - private function + * @skb: buffer + * @sz: size + * @here: address + * + * Out of line support code for skb_put(). Not user callable. + */ +void skb_over_panic(struct sk_buff *skb, int sz, void *here) +{ + eprintf("skput:over: %p:%d put:%d\n", here, skb->len, sz); + BUG(); +} + +/** + * skb_under_panic - private function + * @skb: buffer + * @sz: size + * @here: address + * + * Out of line support code for skb_push(). Not user callable. + */ + +void skb_under_panic(struct sk_buff *skb, int sz, void *here) +{ + eprintf("skput:under: %p:%d put:%d\n", here, skb->len, sz); + BUG(); +} + +/** + * alloc_skb - allocate a network buffer + * @size: size to allocate + * @gfp_mask: allocation mask + * + * Allocate a new &sk_buff. The returned buffer has no headroom and a + * tail room of size bytes. The object has a reference count of one. 
+ * The return is the buffer. On a failure the return is %NULL. + */ +struct sk_buff *alloc_skb(unsigned int size, int gfp_mask) +{ + struct sk_buff *skb; + u8 *data; + + /* Get the HEAD */ + skb = ALLOCATE(struct sk_buff); + if (!skb) + goto out; + + /* Get the DATA. Size must match skb_add_mtu(). */ + size = SKB_DATA_ALIGN(size); + data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); + if (!data) + goto nodata; + + memset(skb, 0, offsetof(struct sk_buff, truesize)); + skb->truesize = size + sizeof(struct sk_buff); + atomic_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb->tail = data; + skb->end = data + size; + skb->list = NULL; + + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->frag_list = NULL; +out: + return skb; +nodata: + kfree(skb); + skb = NULL; + goto out; +} + + +void skb_release_data(struct sk_buff *skb) +{ + kfree(skb->head); +} + +/* + * Free an skbuff by memory without cleaning the state. + */ +void kfree_skbmem(struct sk_buff *skb) +{ + skb_release_data(skb); + kfree(skb); +} + +/** + * __kfree_skb - private function + * @skb: buffer + * + * Free an sk_buff. Release anything attached to the buffer. + * Clean the state. This is an internal helper function. Users should + * always call kfree_skb + */ + +void __kfree_skb(struct sk_buff *skb) +{ + if (skb->list) { + wprintf("Warning: kfree_skb passed an skb still " + "on a list.\n"); + //BUG(); + } + + if(skb->destructor) { + skb->destructor(skb); + } + kfree_skbmem(skb); +} + + +/** + * skb_clone - duplicate an sk_buff + * @skb: buffer to clone + * @gfp_mask: allocation priority + * + * Duplicate an &sk_buff. The new one is not owned by a socket. Both + * copies share the same packet data but not structure. The new + * buffer has a reference count of 1. If the allocation fails the + * function returns %NULL otherwise the new buffer is returned. 
+ * + * If this function is called from an interrupt gfp_mask() must be + * %GFP_ATOMIC. + */ + +struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) +{ + return pskb_copy(skb, gfp_mask); +} + +static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +{ + /* + * Shift between the two data areas in bytes + */ + unsigned long offset = new->data - old->data; + + new->list = NULL; + new->protocol = old->protocol; + new->h.raw = old->h.raw + offset; + new->nh.raw = old->nh.raw + offset; + new->mac.raw = old->mac.raw + offset; + new->pkt_type = old->pkt_type; + new->destructor = NULL; + atomic_set(&new->users, 1); +} + + +/** + * pskb_expand_head - reallocate header of &sk_buff + * @skb: buffer to reallocate + * @nhead: room to add at head + * @ntail: room to add at tail + * @gfp_mask: allocation priority + * + * Expands (or creates identical copy, if &nhead and &ntail are zero) + * header of skb. &sk_buff itself is not changed. &sk_buff MUST have + * reference count of 1. Returns zero in the case of success or error, + * if expansion failed. In the last case, &sk_buff is not changed. + * + * All the pointers pointing into skb header may change and must be + * reloaded after call to this function. + */ + +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask) +{ + u8 *data; + int size = nhead + (skb->end - skb->head) + ntail; + long off; + + if (skb_shared(skb)) + BUG(); + + size = SKB_DATA_ALIGN(size); + + data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); + if (!data) + goto nodata; + + /* Copy only real data... and, alas, header. This should be + * optimized for the cases when header is void. 
*/ + memcpy(data + nhead, skb->head, skb->tail - skb->head); + memcpy(data + size, skb->end, sizeof(struct skb_shared_info)); + + skb_release_data(skb); + + off = (data + nhead) - skb->head; + + skb->head = data; + skb->end = data + size; + skb->data += off; + skb->tail += off; + skb->mac.raw += off; + skb->h.raw += off; + skb->nh.raw += off; + return 0; + +nodata: + return -ENOMEM; +} + +struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask) +{ + /* + * Allocate the copy buffer + */ + struct sk_buff *n = alloc_skb(skb->end - skb->head, gfp_mask); + + if (!n) + goto out; + + /* Set the data pointer */ + skb_reserve(n, skb->data - skb->head); + /* Set the tail pointer and length */ + skb_put(n, skb_headlen(skb)); + /* Copy the bytes */ + memcpy(n->data, skb->data, n->len); + + n->data_len = skb->data_len; + n->len = skb->len; + + copy_skb_header(n, skb); +out: + return n; +} + +/* Make private copy of skb with writable head and some headroom */ + +struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) +{ + struct sk_buff *skb2; + int delta = headroom - skb_headroom(skb); + + if (delta <= 0) + skb2 = pskb_copy(skb, GFP_ATOMIC); + else { + skb2 = skb_copy_expand(skb, headroom, 0, GFP_ATOMIC); + } + return skb2; +} + + +/** + * skb_copy_expand - copy and expand sk_buff + * @skb: buffer to copy + * @newheadroom: new free bytes at head + * @newtailroom: new free bytes at tail + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and its data and while doing so + * allocate additional space. + * + * This is used when the caller wishes to modify the data and needs a + * private copy of the data to alter as well as more space for new fields. + * Returns %NULL on failure or the pointer to the buffer + * on success. The returned buffer has a reference count of 1. + * + * You must pass %GFP_ATOMIC as the allocation priority if this function + * is called from an interrupt. + * + * BUG ALERT: ip_summed is not copied. 
Why does this work? Is it used + * only by netfilter in the cases when checksum is recalculated? --ANK + */ +struct sk_buff *skb_copy_expand(const struct sk_buff *skb, + int newheadroom, int newtailroom, int gfp_mask) +{ + /* + * Allocate the copy buffer + */ + struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, + gfp_mask); + int head_copy_len, head_copy_off; + + if (!n) + return NULL; + + skb_reserve(n, newheadroom); + + /* Set the tail pointer and length */ + skb_put(n, skb->len); + + head_copy_len = skb_headroom(skb); + head_copy_off = 0; + if (newheadroom <= head_copy_len) + head_copy_len = newheadroom; + else + head_copy_off = newheadroom - head_copy_len; + + /* Copy the linear header and data. */ + if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, + skb->len + head_copy_len)) + BUG(); + + copy_skb_header(n, skb); + + return n; +} + + +/* Copy some data bits from skb to kernel buffer. */ + +int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) +{ + int copy; + int start = skb_headlen(skb); + + if (offset > (int)skb->len - len) + goto fault; + + /* Copy header. */ + if ((copy = start - offset) > 0) { + if (copy > len) + copy = len; + memcpy(to, skb->data + offset, copy); + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + + if (!len) + return 0; + +fault: + return -EFAULT; +} + + +/** + * skb_dequeue - remove from the head of the queue + * @list: list to dequeue from + * + * Remove the head of the list. The list lock is taken so the function + * may be used safely with other locking list functions. The head item is + * returned or %NULL if the list is empty. 
+ */ + +struct sk_buff *skb_dequeue(struct sk_buff_head *list) +{ + unsigned long flags; + struct sk_buff *result; + + spin_lock_irqsave(&list->lock, flags); + result = __skb_dequeue(list); + spin_unlock_irqrestore(&list->lock, flags); + return result; +} + +/** + * skb_dequeue_tail - remove from the tail of the queue + * @list: list to dequeue from + * + * Remove the tail of the list. The list lock is taken so the function + * may be used safely with other locking list functions. The tail item is + * returned or %NULL if the list is empty. + */ +struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) +{ + unsigned long flags; + struct sk_buff *result; + + spin_lock_irqsave(&list->lock, flags); + result = __skb_dequeue_tail(list); + spin_unlock_irqrestore(&list->lock, flags); + return result; +} + +/** + * skb_queue_purge - empty a list + * @list: list to empty + * + * Delete all buffers on an &sk_buff list. Each buffer is removed from + * the list and one reference dropped. This function takes the list + * lock and is atomic with respect to other list locking functions. + */ +void skb_queue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb; + while ((skb = skb_dequeue(list)) != NULL) + kfree_skb(skb); +} + +/** + * skb_queue_head - queue a buffer at the list head + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the start of the list. This function takes the + * list lock and can be used safely with other locking &sk_buff functions + * safely. + * + * A buffer cannot be placed on two lists at the same time. + */ +void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_head(list, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} + +/** + * skb_queue_tail - queue a buffer at the list tail + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the tail of the list. 
This function takes the
+ *	list lock and can be used safely with other locking &sk_buff
+ *	functions.
+ *
+ *	A buffer cannot be placed on two lists at the same time.
+ */
+void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&list->lock, flags);
+	__skb_queue_tail(list, newsk);
+	spin_unlock_irqrestore(&list->lock, flags);
+}
+/**
+ *	skb_unlink	-	remove a buffer from a list
+ *	@skb: buffer to remove
+ *
+ *	Remove a packet from the list it is on, if any. The list lock is taken
+ *	and this function is atomic with respect to other list locked calls.
+ *
+ *	Works even without knowing the list it is sitting on, which can be
+ *	handy at times. It also means that THE LIST MUST EXIST when you
+ *	unlink. Thus a list must have its contents unlinked before it is
+ *	destroyed.
+ */
+void skb_unlink(struct sk_buff *skb)
+{
+	struct sk_buff_head *list = skb->list;
+
+	if (list) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&list->lock, flags);
+		/* Re-check under the lock: only unlink if the buffer is
+		 * still on the list that was sampled above.
+		 */
+		if (skb->list == list)
+			__skb_unlink(skb, skb->list);
+		spin_unlock_irqrestore(&list->lock, flags);
+	}
+}
+
+
+/**
+ *	skb_append	-	append a buffer
+ *	@old: buffer to insert after
+ *	@newsk: buffer to insert
+ *
+ *	Place a packet after a given packet in a list. The list locks are taken
+ *	and this function is atomic with respect to other list locked calls.
+ *	A buffer cannot be placed on two lists at the same time.
+ */
+
+void skb_append(struct sk_buff *old, struct sk_buff *newsk)
+{
+	unsigned long flags;
+
+	/* The lock used is that of the list @old is currently on. */
+	spin_lock_irqsave(&old->list->lock, flags);
+	__skb_append(old, newsk);
+	spin_unlock_irqrestore(&old->list->lock, flags);
+}
+
+
+/**
+ *	skb_insert	-	insert a buffer
+ *	@old: buffer to insert before
+ *	@newsk: buffer to insert
+ *
+ *	Place a packet before a given packet in a list. The list locks are taken
+ *	and this function is atomic with respect to other list locked calls.
+ *	A buffer cannot be placed on two lists at the same time.
+ */ + +void skb_insert(struct sk_buff *old, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&old->list->lock, flags); + __skb_insert(newsk, old->prev, old, old->list); + spin_unlock_irqrestore(&old->list->lock, flags); +} + diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/skbuff.h --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnetd/skbuff.h Thu Feb 9 15:12:11 2006 @@ -0,0 +1,538 @@ +/* + * Definitions for the 'struct sk_buff' memory handlers. + * + * Authors: + * Alan Cox, <gw4pts@xxxxxxxxxxxxxxx> + * Florian La Roche, <rzsfl@xxxxxxxxxxxx> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _VNET_SKBUFF_H +#define _VNET_SKBUFF_H + +#include "sys_kernel.h" +#include "spinlock.h" + +struct sk_buff; + +struct sk_buff_head { + /* These two members must be first. */ + struct sk_buff *next; + struct sk_buff *prev; + + __u32 qlen; + spinlock_t lock; +}; + + + +#define MAX_SKB_FRAGS 8 // (65536/PAGE_SIZE + 2) + +typedef struct skb_frag_struct skb_frag_t; + +struct skb_frag_struct { + //struct page *page; + void *page; + __u16 page_offset; + __u16 size; +}; + +/* This data is invariant across clones and lives at + * the end of the header data, ie. at skb->end. + */ +struct skb_shared_info { + atomic_t dataref; + unsigned int nr_frags; + unsigned short tso_size; + unsigned short tso_segs; + struct sk_buff *frag_list; + skb_frag_t frags[MAX_SKB_FRAGS]; +}; + +struct sk_buff { + /* These two members must be first. 
*/ + struct sk_buff *next; + struct sk_buff *prev; + + struct sk_buff_head *list; + struct net_device *dev; + + union { + struct tcphdr *th; + struct udphdr *uh; + struct icmphdr *icmph; + struct igmphdr *igmph; + struct iphdr *ipiph; + struct ipv6hdr *ipv6h; + unsigned char *raw; + } h; + + union { + struct iphdr *iph; + struct ipv6hdr *ipv6h; + struct arphdr *arph; + unsigned char *raw; + } nh; + + union { + unsigned char *raw; + } mac; + + unsigned int len, + data_len; + unsigned char pkt_type; + unsigned short protocol; + + void (*destructor)(struct sk_buff *skb); + + /* These elements must be at the end, see alloc_skb() for details. */ + unsigned int truesize; + atomic_t users; + unsigned char *head, + *data, + *tail, + *end; +}; + +extern void skb_over_panic(struct sk_buff *skb, int len, + void *here); +extern void skb_under_panic(struct sk_buff *skb, int len, + void *here); + +#define skb_shinfo(SKB) ((struct skb_shared_info *)((SKB)->end)) + +extern void __kfree_skb(struct sk_buff *skb); +extern struct sk_buff *alloc_skb(unsigned int size, int priority); +extern struct sk_buff *skb_clone(struct sk_buff *skb, int priority); +extern struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask); +extern struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, + unsigned int headroom); +extern struct sk_buff *skb_copy_expand(const struct sk_buff *skb, + int newheadroom, int newtailroom, + int priority); + +extern int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len); + +static inline void kfree_skb(struct sk_buff *skb) +{ + if (atomic_read(&skb->users) == 1 || atomic_dec_and_test(&skb->users)) + __kfree_skb(skb); +} + +static inline void dev_kfree_skb(struct sk_buff *skb) +{ + kfree_skb(skb); +} + +static inline int skb_cloned(const struct sk_buff *skb) +{ + return 0; +} + +/** + * skb_shared - is the buffer shared + * @skb: buffer to check + * + * Returns true if more than one person has a reference to this + * buffer. 
+ */ +static inline int skb_shared(const struct sk_buff *skb) +{ + return atomic_read(&skb->users) != 1; +} + +/** + * skb_peek + * @list_: list to peek at + * + * Peek an &sk_buff. Unlike most other operations you _MUST_ + * be careful with this one. A peek leaves the buffer on the + * list and someone else may run off with it. You must hold + * the appropriate locks or have a private queue to do this. + * + * Returns %NULL for an empty list or a pointer to the head element. + * The reference count is not incremented and the reference is therefore + * volatile. Use with caution. + */ +static inline struct sk_buff *skb_peek(struct sk_buff_head *list_) +{ + struct sk_buff *list = ((struct sk_buff *)list_)->next; + if (list == (struct sk_buff *)list_) + list = NULL; + return list; +} + +/** + * skb_peek_tail + * @list_: list to peek at + * + * Peek an &sk_buff. Unlike most other operations you _MUST_ + * be careful with this one. A peek leaves the buffer on the + * list and someone else may run off with it. You must hold + * the appropriate locks or have a private queue to do this. + * + * Returns %NULL for an empty list or a pointer to the tail element. + * The reference count is not incremented and the reference is therefore + * volatile. Use with caution. + */ +static inline struct sk_buff *skb_peek_tail(struct sk_buff_head *list_) +{ + struct sk_buff *list = ((struct sk_buff *)list_)->prev; + if (list == (struct sk_buff *)list_) + list = NULL; + return list; +} + +/** + * skb_queue_len - get queue length + * @list_: list to measure + * + * Return the length of an &sk_buff queue. + */ +static inline __u32 skb_queue_len(const struct sk_buff_head *list_) +{ + return list_->qlen; +} + +static inline void skb_queue_head_init(struct sk_buff_head *list) +{ + spin_lock_init(&list->lock); + list->prev = list->next = (struct sk_buff *)list; + list->qlen = 0; +} + +/* + * Insert an sk_buff at the start of a list. 
+ * + * The "__skb_xxxx()" functions are the non-atomic ones that + * can only be called with interrupts disabled. + */ + +/** + * __skb_queue_head - queue a buffer at the list head + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the start of a list. This function takes no locks + * and you must therefore hold required locks before calling it. + * + * A buffer cannot be placed on two lists at the same time. + */ +extern void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); +static inline void __skb_queue_head(struct sk_buff_head *list, + struct sk_buff *newsk) +{ + struct sk_buff *prev, *next; + + newsk->list = list; + list->qlen++; + prev = (struct sk_buff *)list; + next = prev->next; + newsk->next = next; + newsk->prev = prev; + next->prev = prev->next = newsk; +} + +/** + * __skb_queue_tail - queue a buffer at the list tail + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the end of a list. This function takes no locks + * and you must therefore hold required locks before calling it. + * + * A buffer cannot be placed on two lists at the same time. + */ +extern void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk); +static inline void __skb_queue_tail(struct sk_buff_head *list, + struct sk_buff *newsk) +{ + struct sk_buff *prev, *next; + + newsk->list = list; + list->qlen++; + next = (struct sk_buff *)list; + prev = next->prev; + newsk->next = next; + newsk->prev = prev; + next->prev = prev->next = newsk; +} + + +/** + * __skb_dequeue - remove from the head of the queue + * @list: list to dequeue from + * + * Remove the head of the list. This function does not take any locks + * so must be used with appropriate locks held only. The head item is + * returned or %NULL if the list is empty. 
+ */ +extern struct sk_buff *skb_dequeue(struct sk_buff_head *list); +static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) +{ + struct sk_buff *next, *prev, *result; + + prev = (struct sk_buff *) list; + next = prev->next; + result = NULL; + if (next != prev) { + result = next; + next = next->next; + list->qlen--; + next->prev = prev; + prev->next = next; + result->next = result->prev = NULL; + result->list = NULL; + } + return result; +} + + +/* + * Insert a packet on a list. + */ +extern void skb_insert(struct sk_buff *old, struct sk_buff *newsk); +static inline void __skb_insert(struct sk_buff *newsk, + struct sk_buff *prev, struct sk_buff *next, + struct sk_buff_head *list) +{ + newsk->next = next; + newsk->prev = prev; + next->prev = prev->next = newsk; + newsk->list = list; + list->qlen++; +} + +/* + * Place a packet after a given packet in a list. + */ +extern void skb_append(struct sk_buff *old, struct sk_buff *newsk); +static inline void __skb_append(struct sk_buff *old, struct sk_buff *newsk) +{ + __skb_insert(newsk, old, old->next, old->list); +} + +/* + * remove sk_buff from list. _Must_ be called atomically, and with + * the list known.. + */ +extern void skb_unlink(struct sk_buff *skb); +static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) +{ + struct sk_buff *next, *prev; + + list->qlen--; + next = skb->next; + prev = skb->prev; + skb->next = skb->prev = NULL; + skb->list = NULL; + next->prev = prev; + prev->next = next; +} + + +/* XXX: more streamlined implementation */ + +/** + * __skb_dequeue_tail - remove from the tail of the queue + * @list: list to dequeue from + * + * Remove the tail of the list. This function does not take any locks + * so must be used with appropriate locks held only. The tail item is + * returned or %NULL if the list is empty. 
+ */ +extern struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); +static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list) +{ + struct sk_buff *skb = skb_peek_tail(list); + if (skb) + __skb_unlink(skb, list); + return skb; +} + + +/* + * Add data to an sk_buff + */ +static inline unsigned char *__skb_put(struct sk_buff *skb, unsigned int len) +{ + unsigned char *tmp = skb->tail; + skb->tail += len; + skb->len += len; + return tmp; +} + +/** + * skb_put - add data to a buffer + * @skb: buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the buffer. If this would + * exceed the total buffer size the kernel will panic. A pointer to the + * first byte of the extra data is returned. + */ +static inline unsigned char *skb_put(struct sk_buff *skb, unsigned int len) +{ + unsigned char *tmp = skb->tail; + skb->tail += len; + skb->len += len; + if (unlikely(skb->tail>skb->end)) + skb_over_panic(skb, len, current_text_addr()); + return tmp; +} + +static inline unsigned char *__skb_push(struct sk_buff *skb, unsigned int len) +{ + skb->data -= len; + skb->len += len; + return skb->data; +} + +/** + * skb_push - add data to the start of a buffer + * @skb: buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the buffer at the buffer + * start. If this would exceed the total buffer headroom the kernel will + * panic. A pointer to the first byte of the extra data is returned. 
+ */ +static inline unsigned char *skb_push(struct sk_buff *skb, unsigned int len) +{ + skb->data -= len; + skb->len += len; + if (unlikely(skb->data<skb->head)){ + skb_under_panic(skb, len, current_text_addr()); + } + return skb->data; +} + +static inline unsigned char *__skb_pull(struct sk_buff *skb, unsigned int len) +{ + skb->len -= len; + //BUG_ON(skb->len < skb->data_len); + return skb->data += len; +} + +/** + * skb_pull - remove data from the start of a buffer + * @skb: buffer to use + * @len: amount of data to remove + * + * This function removes data from the start of a buffer, returning + * the memory to the headroom. A pointer to the next data in the buffer + * is returned. Once the data has been pulled future pushes will overwrite + * the old data. + */ +static inline unsigned char *skb_pull(struct sk_buff *skb, unsigned int len) +{ + return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len); +} + +static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len) +{ + return (len <= skb->len); +} + +static inline unsigned int skb_headlen(const struct sk_buff *skb) +{ + return skb->len - skb->data_len; +} + +/** + * skb_headroom - bytes at buffer head + * @skb: buffer to check + * + * Return the number of bytes of free space at the head of an &sk_buff. + */ +static inline int skb_headroom(const struct sk_buff *skb) +{ + return skb->data - skb->head; +} + +/** + * skb_tailroom - bytes at buffer end + * @skb: buffer to check + * + * Return the number of bytes of free space at the tail of an sk_buff + */ +static inline int skb_tailroom(const struct sk_buff *skb) +{ + return skb->end - skb->tail; +} + +/** + * skb_reserve - adjust headroom + * @skb: buffer to alter + * @len: bytes to move + * + * Increase the headroom of an empty &sk_buff by reducing the tail + * room. This is only allowed for an empty buffer. 
+ */ +static inline void skb_reserve(struct sk_buff *skb, unsigned int len) +{ + skb->data += len; + skb->tail += len; +} + +/** + * __skb_queue_purge - empty a list + * @list: list to empty + * + * Delete all buffers on an &sk_buff list. Each buffer is removed from + * the list and one reference dropped. This function does not take the + * list lock and the caller must hold the relevant locks to use it. + */ +extern void skb_queue_purge(struct sk_buff_head *list); +static inline void __skb_queue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb; + while ((skb = __skb_dequeue(list)) != NULL) + kfree_skb(skb); +} + +/** + * __dev_alloc_skb - allocate an skbuff for sending + * @length: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has 16 extra bytes of headroom built in (reserved with skb_reserve()). Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned if there is no free memory. + */ +static inline struct sk_buff *__dev_alloc_skb(unsigned int length, + int gfp_mask) +{ + struct sk_buff *skb = alloc_skb(length + 16, gfp_mask); + if (likely(skb)) + skb_reserve(skb, 16); + return skb; +} + +/** + * dev_alloc_skb - allocate an skbuff for sending + * @length: length to allocate + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has unspecified headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned if there is no free memory. Although this function + * allocates memory it can be called from an interrupt. 
+ */ +static inline struct sk_buff *dev_alloc_skb(unsigned int length) +{ + return __dev_alloc_skb(length, GFP_ATOMIC); +} + +#define MULTICAST(x) (((x) & htonl(0xf0000000)) == htonl(0xe0000000)) + +#endif /* _VNET_SKBUFF_H */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/spinlock.c --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnetd/spinlock.c Thu Feb 9 15:12:11 2006 @@ -0,0 +1,65 @@ +#include "spinlock.h" + +int atomic_read(const atomic_t *v){ + return v->val; +} + +int atomic_dec_and_test(atomic_t *v){ + if(v->val > 0){ + v->val--; + return v->val == 0; + } + return 0; +} + +void atomic_inc(atomic_t *v){ + v->val++; +} + +void atomic_set(atomic_t *v, int x){ + v->val = x; +} + +void spin_lock_init(spinlock_t *lock){ + *lock = (spinlock_t){}; +} + +unsigned long _spin_lock_irqsave(spinlock_t *lock){ + lock->val++; + return 0; +} + +void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags){ + lock->val--; +} + +unsigned long _read_lock_irqsave(rwlock_t *lock){ + lock->val++; + return 0; +} + +void read_unlock_irqrestore(rwlock_t *lock, unsigned long flags){ + lock->val--; +} + +unsigned long _write_lock_irqsave(rwlock_t *lock){ + lock->val++; + return 0; +} + +void write_unlock_irqrestore(rwlock_t *lock, unsigned long flags){ + lock->val--; +} + +void init_MUTEX(struct semaphore *sem){ + *sem = (struct semaphore){ .count = 1 }; +} + +void down(struct semaphore *sem){ + sem->count--; +} + +void up(struct semaphore *sem){ + sem->count++; +} + diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/spinlock.h --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnetd/spinlock.h Thu Feb 9 15:12:11 2006 @@ -0,0 +1,47 @@ +#ifndef _VNET_SPINLOCK_H_ +#define _VNET_SPINLOCK_H_ + +typedef struct atomic_t { + unsigned val; +} atomic_t; + +int atomic_read(const atomic_t *v); +int atomic_dec_and_test(atomic_t *v); +void atomic_inc(atomic_t *v); +void atomic_set(atomic_t *v, int x); + +typedef struct spinlock_t { + unsigned val; +} spinlock_t; + 
+#define SPIN_LOCK_UNLOCKED ((struct spinlock_t){}) + +void spin_lock_init(spinlock_t *lock); + +unsigned long _spin_lock_irqsave(spinlock_t *lock); /* backs spin_lock_irqsave(); its return value is stored in flags */ +#define spin_lock_irqsave(lock, flags) ((flags) = _spin_lock_irqsave(lock)) /* parenthesized so it expands safely in any statement or expression */ +void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags); + +typedef struct rwlock_t{ + unsigned val; +} rwlock_t; + +#define RW_LOCK_UNLOCKED ((struct rwlock_t){}) + +unsigned long _read_lock_irqsave(rwlock_t *lock); /* backs read_lock_irqsave(); its return value is stored in flags */ +#define read_lock_irqsave(lock, flags) ((flags) = _read_lock_irqsave(lock)) +void read_unlock_irqrestore(rwlock_t *lock, unsigned long flags); + +unsigned long _write_lock_irqsave(rwlock_t *lock); /* backs write_lock_irqsave(); its return value is stored in flags */ +#define write_lock_irqsave(lock, flags) ((flags) = _write_lock_irqsave(lock)) +void write_unlock_irqrestore(rwlock_t *lock, unsigned long flags); + +struct semaphore { + int count; +}; + +void init_MUTEX(struct semaphore *sem); +void down(struct semaphore *sem); +void up(struct semaphore *sem); + +#endif /* ! _VNET_SPINLOCK_H_ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/sys_kernel.h --- /dev/null Thu Feb 9 15:09:00 2006 +++ b/tools/vnet/vnetd/sys_kernel.h Thu Feb 9 15:12:11 2006 @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _VNET_SYS_KERNEL_H_ +#define _VNET_SYS_KERNEL_H_ + +/** @file Compatibility replacements for some kernel defs. + */ + +#include <assert.h> +#include <asm/types.h> +//#include <sys/types.h> +#include <unistd.h> + +#define printk printf + +#define likely(x) (x) /* no branch hint in user space; parenthesized so the argument keeps its grouping */ +#define unlikely(x) (x) +#define current_text_addr() NULL + +#define BUG_ON(x) assert(!(x)) /* abort when x is TRUE, as kernel BUG_ON() does; compiled out under NDEBUG like any assert() */ +#define BUG() BUG_ON(1) +#define kmalloc(n, m) allocate_type(n, m) +#define kfree(p) deallocate(p) +#define in_atomic() 0 + +#define __init +#define __exit + +#define module_init(x) +#define module_exit(x) +#define MODULE_LICENSE(x) +#define MODULE_PARM(v, t) +#define MODULE_PARM_DESC(v, s) + +enum { + GFP_USER, + GFP_ATOMIC, + GFP_KERNEL, +}; + +typedef signed char s8; +typedef unsigned char u8; + +typedef signed short s16; +typedef unsigned short u16; + +typedef signed int s32; +typedef unsigned int u32; + +typedef signed long long s64; +typedef unsigned long long u64; + +#include "allocate.h" + +#endif /* ! _VNET_SYS_KERNEL_H_ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/marshal.c --- a/tools/vnet/vnetd/marshal.c Thu Feb 9 15:09:00 2006 +++ /dev/null Thu Feb 9 15:12:11 2006 @@ -1,223 +0,0 @@ -/* - * Copyright (C) 2001 - 2004 Mike Wray <mike.wray@xxxxxx>. - * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of the - * License, or (at your option) any later version. This library is - * distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * See the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <errno.h> -#include "sys_net.h" -#include "allocate.h" -#include "marshal.h" - -#define MODULE_NAME "marshal" -#define DEBUG -#undef DEBUG -#include "debug.h" - -#define ARRAY_SIZE(ary) (sizeof(ary)/sizeof((ary)[0])) - -/* Messages are coded as msgid followed by message fields. - * Initial message on any channel is hello - so can check version - * compatibility. - * - * char* -> uint16_t:n <n bytes> - * ints/uints go as suitable number of bytes (e.g. uint16_t is 2 bytes). - * optional fields go as '1' <val> or '0' (the 0/1 is 1 byte). - * lists go as ('1' <elt>)* '0' - */ - -int marshal_flush(IOStream *io){ - int err = 0; - err = IOStream_flush(io); - return err; -} - -int marshal_bytes(IOStream *io, void *s, uint32_t s_n){ - int err = 0; - int n; - n = IOStream_write(io, s, s_n); - if(n < 0){ - err = n; - } else if (n < s_n){ - dprintf("> Wanted %d, got %d\n", s_n, n); - err = -EIO; - } - return err; -} - -int unmarshal_bytes(IOStream *io, void *s, uint32_t s_n){ - int err = 0; - int n; - //dprintf("> s_n=%d\n", s_n); - n = IOStream_read(io, s, s_n); - //dprintf("> n=%d\n", n); - if(n < 0){ - err = n; - } else if(n < s_n){ - dprintf("> Wanted %d, got %d\n", s_n, n); - err = -EIO; - } - //dprintf("< err=%d\n", err); - return err; -} - -int marshal_uint8(IOStream *io, uint8_t x){ - return marshal_bytes(io, &x, sizeof(x)); -} - -int unmarshal_uint8(IOStream *io, uint8_t *x){ - return unmarshal_bytes(io, x, sizeof(*x)); -} - -int marshal_uint16(IOStream *io, uint16_t x){ - x = htons(x); - return marshal_bytes(io, &x, sizeof(x)); -} - -int unmarshal_uint16(IOStream *io, uint16_t *x){ - int err = 0; - err = unmarshal_bytes(io, x, sizeof(*x)); - *x = ntohs(*x); - return err; -} - -int marshal_int32(IOStream *io, int32_t x){ - int err = 
0; - //dprintf("> x=%d\n", x); - x = htonl(x); - err = marshal_bytes(io, &x, sizeof(x)); - //dprintf("< err=%d\n", err); - return err; -} - -int unmarshal_int32(IOStream *io, int32_t *x){ - int err = 0; - //dprintf(">\n"); - err = unmarshal_bytes(io, x, sizeof(*x)); - *x = ntohl(*x); - //dprintf("< err=%d x=%d\n", err, *x); - return err; -} - -int marshal_uint32(IOStream *io, uint32_t x){ - int err = 0; - //dprintf("> x=%u\n", x); - x = htonl(x); - err = marshal_bytes(io, &x, sizeof(x)); - //dprintf("< err=%d\n", err); - return err; -} - -int unmarshal_uint32(IOStream *io, uint32_t *x){ - int err = 0; - //dprintf(">\n"); - err = unmarshal_bytes(io, x, sizeof(*x)); - *x = ntohl(*x); - //dprintf("< err=%d x=%u\n", err, *x); - return err; -} - -int marshal_uint64(IOStream *io, uint64_t x){ - int err; - err = marshal_uint32(io, (uint32_t) ((x >> 32) & 0xffffffff)); - if(err) goto exit; - err = marshal_uint32(io, (uint32_t) ( x & 0xffffffff)); - exit: - return err; -} - -int unmarshal_uint64(IOStream *io, uint64_t *x){ - int err = 0; - uint32_t hi, lo; - err = unmarshal_uint32(io, &hi); - if(err) goto exit; - err = unmarshal_uint32(io, &lo); - *x = (((uint64_t) hi) << 32) | lo; - exit: - return err; -} - -int marshal_net16(IOStream *io, net16_t x){ - return marshal_bytes(io, &x, sizeof(x)); -} - -int unmarshal_net16(IOStream *io, net16_t *x){ - int err = 0; - err = unmarshal_bytes(io, x, sizeof(*x)); - return err; -} - -int marshal_net32(IOStream *io, net32_t x){ - return marshal_bytes(io, &x, sizeof(x)); -} - -int unmarshal_net32(IOStream *io, net32_t *x){ - int err = 0; - err = unmarshal_bytes(io, x, sizeof(*x)); - return err; -} - -int marshal_string(IOStream *io, char *s, uint32_t s_n){ - int err; - //dprintf("> s=%s\n", s); - err = marshal_uint32(io, s_n); - if(err) goto exit; - err = marshal_bytes(io, s, s_n); - exit: - //dprintf("< err=%d\n", err); - return err; -} - -int unmarshal_string(IOStream *io, char *s, uint32_t s_n){ - int err = 0, val_n = 0; - 
//dprintf(">\n"); - err = unmarshal_uint32(io, &val_n); - if(err) goto exit; - if(val_n >= s_n){ - err = -EINVAL; - goto exit; - } - err = unmarshal_bytes(io, s, val_n); - if(err) goto exit; - s[val_n] = '\0'; - exit: - //dprintf("< err=%d s=%s\n", err, s); - return err; -} - -int unmarshal_new_string(IOStream *io, char **s, uint32_t *s_n){ - int err = 0, val_n = 0; - char *val = NULL; - //dprintf(">\n"); - err = unmarshal_uint32(io, &val_n); - if(err) goto exit; - val = allocate(val_n + 1); - if(!val){ - err = -ENOMEM; - goto exit; - } - err = unmarshal_bytes(io, val, val_n); - if(err) goto exit; - val[val_n] = '\0'; - exit: - if(err){ - if(val) deallocate(val); - val = NULL; - val_n = 0; - } - *s = val; - if(s_n) *s_n = val_n; - //dprintf("< err=%d s=%s\n", err, *s); - return err; -} diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/marshal.h --- a/tools/vnet/vnetd/marshal.h Thu Feb 9 15:09:00 2006 +++ /dev/null Thu Feb 9 15:12:11 2006 @@ -1,58 +0,0 @@ -/* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx>. - * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of the - * License, or (at your option) any later version. This library is - * distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * See the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -#ifndef _XEN_LIB_MARSHAL_H_ -#define _XEN_LIB_MARSHAL_H_ - -#include "iostream.h" - -/** A 16-bit uint in network order, e.g. a port number. */ -typedef uint16_t net16_t; - -/** A 32-bit uint in network order, e.g. 
an IP address. */ -typedef uint32_t net32_t; - -extern int marshal_flush(IOStream *io); - -extern int marshal_bytes(IOStream *io, void *s, uint32_t s_n); -extern int unmarshal_bytes(IOStream *io, void *s, uint32_t s_n); - -extern int marshal_uint8(IOStream *io, uint8_t x); -extern int unmarshal_uint8(IOStream *io, uint8_t *x); - -extern int marshal_uint16(IOStream *io, uint16_t x); -extern int unmarshal_uint16(IOStream *io, uint16_t *x); - -extern int marshal_uint32(IOStream *io, uint32_t x); -extern int unmarshal_uint32(IOStream *io, uint32_t *x); - -extern int marshal_int32(IOStream *io, int32_t x); -extern int unmarshal_int32(IOStream *io, int32_t *x); - -extern int marshal_uint64(IOStream *io, uint64_t x); -extern int unmarshal_uint64(IOStream *io, uint64_t *x); - -extern int marshal_net16(IOStream *io, net16_t x); -extern int unmarshal_net16(IOStream *io, net16_t *x); - -extern int marshal_net32(IOStream *io, net32_t x); -extern int unmarshal_net32(IOStream *io, net32_t *x); - -extern int marshal_string(IOStream *io, char *s, uint32_t s_n); -extern int unmarshal_string(IOStream *io, char *s, uint32_t s_n); -extern int unmarshal_new_string(IOStream *io, char **s, uint32_t *s_n); - -#endif /* ! _XEN_LIB_MARSHAL_H_ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/vcache.c --- a/tools/vnet/vnetd/vcache.c Thu Feb 9 15:09:00 2006 +++ /dev/null Thu Feb 9 15:12:11 2006 @@ -1,652 +0,0 @@ -/* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx>. - * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of the - * License, or (at your option) any later version. This library is - * distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * See the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <stdlib.h> -#include <unistd.h> -#include <stdio.h> -#include <getopt.h> -#include <errno.h> -#include <sys/types.h> -#include <time.h> -#include <sys/socket.h> -#include <netinet/in.h> -#include <arpa/inet.h> -#include <string.h> - -#include "allocate.h" -#include "hash_table.h" -#include "sys_net.h" -#include "sys_string.h" -#include "connection.h" -#include "marshal.h" -#include "timer.h" - -#undef offsetof -#include "vnetd.h" -#include "vcache.h" - -#define MODULE_NAME "VARP" -#define DEBUG 1 -#undef DEBUG -#include "debug.h" - -#include "varp_util.c" - -static VarpCache *vcache = NULL; - -void IPMessageQueue_init(IPMessageQueue *queue, int maxlen){ - queue->msg = NULL; - queue->len = 0; - queue->maxlen = maxlen; -} - -void IPMessageQueue_clear(IPMessageQueue *queue){ - queue->msg = NULL; - queue->len = 0; -} - -void IPMessageQueue_truncate(IPMessageQueue *queue, int n){ - IPMessage **p = &queue->msg; - int i; - for(i = 1; *p; p = &(*p)->next, i++){ - if(i == n){ - *p = NULL; - break; - } - } -} - -void IPMessageQueue_add(IPMessageQueue *queue, IPMessage *msg){ - msg->next = queue->msg; - queue->msg = msg; - queue->len++; - if(queue->len >= queue->maxlen){ - IPMessageQueue_truncate(queue, queue->maxlen); - } -} - -IPMessage * IPMessageQueue_pop(IPMessageQueue *queue){ - IPMessage *msg = NULL; - if(queue->len > 0){ - queue->len--; - msg = queue->msg; - queue->msg = msg->next; - msg->next = NULL; - } - return msg; -} - -void VarpCache_sweep(VarpCache *z, int all); - -/** Send a varp protocol message. 
- * - * @param opcode varp opcode (host order) - * @param vnet vnet id (in network order) - * @param vmac vmac (in network order) - * @return 0 on success, error code otherwise - */ -int varp_send(Conn *conn, uint16_t opcode, VnetId *vnet, Vmac *vmac, VarpAddr *addr){ - int err = 0; - int varp_n = sizeof(VarpHdr); - VarpHdr varph = {}; -#ifdef DEBUG - char vnetbuf[VNET_ID_BUF]; - char addrbuf[VARP_ADDR_BUF]; -#endif - - varph.hdr.id = htons(VARP_ID); - varph.hdr.opcode = htons(opcode); - varph.vnet = *vnet; - varph.vmac = *vmac; - varph.addr = *addr; - - if(0){ - struct sockaddr_in self; - socklen_t self_n; - getsockname(conn->sock, (struct sockaddr *)&self, &self_n); - dprintf("> sockname addr=%s port=%d\n", - inet_ntoa(self.sin_addr), ntohs(self.sin_port)); - } - dprintf("> addr=%s opcode=%d\n", - inet_ntoa(conn->addr.sin_addr), opcode); - dprintf("> vnet=%s vmac=" MACFMT " addr=%s\n", - VnetId_ntoa(vnet, vnetbuf), - MAC6TUPLE(vmac->mac), - VarpAddr_ntoa(addr, addrbuf)); - err = marshal_bytes(conn->out, &varph, varp_n); - marshal_flush(conn->out); - dprintf("< err=%d\n", err); - return err; -} - -/* Test some flags. - * - * @param z varp entry - * @param flags to test - * @return nonzero if flags set - */ -int VCEntry_get_flags(VCEntry *z, int flags){ - return z->flags & flags; -} - -/** Set some flags. - * - * @param z varp entry - * @param flags to set - * @param set set flags on if nonzero, off if zero - * @return new flags value - */ -int VCEntry_set_flags(VCEntry *z, int flags, int set){ - if(set){ - z->flags |= flags; - } else { - z->flags &= ~flags; - } - return z->flags; -} - -/** Print a varp entry. 
- * - * @param ventry varp entry - */ -void VCEntry_print(VCEntry *ventry){ - if(ventry){ - char *state, *flags; - char vnetbuf[VNET_ID_BUF]; - char addrbuf[VARP_ADDR_BUF]; - - switch(ventry->state){ - case VCACHE_STATE_INCOMPLETE: state = "INC"; break; - case VCACHE_STATE_REACHABLE: state = "RCH"; break; - case VCACHE_STATE_FAILED: state = "FLD"; break; - default: state = "UNK"; break; - } - flags = (VCEntry_get_flags(ventry, VCACHE_FLAG_PROBING) ? "P" : " "); - - printf("VENTRY(%p %s %s vnet=%s vmac=" MACFMT " addr=%s time=%g)\n", - ventry, - state, flags, - VnetId_ntoa(&ventry->key.vnet, vnetbuf), - MAC6TUPLE(ventry->key.vmac.mac), - VarpAddr_ntoa(&ventry->addr, addrbuf), - ventry->timestamp); - } else { - printf("VENTRY: Null!\n"); - } -} - -int VCEntry_schedule(VCEntry *ventry); -void VCEntry_solicit(VCEntry *ventry); - -/** Function called when a varp entry timer goes off. - * If the entry is still incomplete, carries on probing. - * Otherwise stops probing. - * - * @param arg ventry - */ -static void ventry_timer_fn(Timer *timer){ - VCEntry *ventry = timer->data; - int probing = 0, scheduled = 0; - - //dprintf(">\n"); VCEntry_print(ventry); - if(ventry->state == VCACHE_STATE_REACHABLE){ - // Do nothing. - } else { - // Probe if haven't run out of tries, otherwise fail. - if(ventry->probes < VCACHE_PROBE_MAX){ - //probing = 1; - ventry->probes++; - scheduled = VCEntry_schedule(ventry); - //VCEntry_solicit(ventry); - probing = scheduled; - } else { - ventry->state = VCACHE_STATE_FAILED; - IPMessageQueue_clear(&ventry->queue); - } - } - if(!probing){ - VCEntry_set_flags(ventry, - (VCACHE_FLAG_PROBING - | VCACHE_FLAG_REMOTE_PROBE - | VCACHE_FLAG_LOCAL_PROBE), - 0); - } - VCEntry_set_flags(ventry, VCACHE_FLAG_PROBING, probing); - //dprintf("<\n"); -} - -/** Schedule the varp entry timer. 
- * - * @param ventry varp entry - */ -int VCEntry_schedule(VCEntry *ventry){ - int scheduled = 0; - if(ventry->probes == 1){ - scheduled = 1; - Timer_set(VCACHE_LOCAL_DELAY, ventry_timer_fn, ventry); - } else { - VCEntry_solicit(ventry); - } - return scheduled; -} - -/** Create a varp entry. Initializes the internal state. - * - * @param vnet vnet id - * @param vmac virtual MAC address (copied) - * @return ventry or null - */ -VCEntry * VCEntry_new(VnetId *vnet, Vmac *vmac){ - VCEntry *z = ALLOCATE(VCEntry); - z->state = VCACHE_STATE_INCOMPLETE; - z->timestamp = time_now(); - z->key.vnet = *vnet; - z->key.vmac = *vmac; - return z; -} - -/** Hash function for keys in the varp cache. - * Hashes the vnet id and mac. - * - * @param k key (VCKey) - * @return hashcode - */ -Hashcode vcache_key_hash_fn(void *k){ - VCKey *key = k; - Hashcode h = 0; - h = VnetId_hash(h, &key->vnet); - h = Vmac_hash(h, &key->vmac); - return h; -} - -/** Test equality for keys in the varp cache. - * Compares vnet and mac. - * - * @param k1 key to compare (VCKey) - * @param k2 key to compare (VCKey) - * @return 1 if equal, 0 otherwise - */ -int vcache_key_equal_fn(void *k1, void *k2){ - VCKey *key1 = k1; - VCKey *key2 = k2; - return (VnetId_eq(&key1->vnet , &key2->vnet) && - Vmac_eq(&key1->vmac, &key2->vmac)); -} - -void VarpCache_schedule(VarpCache *z); - -/** Function called when the varp table timer goes off. - * Sweeps old varp cache entries and reschedules itself. - * - * @param arg varp table - */ -static void vcache_timer_fn(Timer *timer){ - VarpCache *z = timer->data; - //dprintf("> z=%p\n", z); - if(z){ - VarpCache_sweep(z, 0); - VarpCache_schedule(z); - } - //dprintf("<\n"); -} - -/** Schedule the varp table timer. - * - * @param z varp table - */ -void VarpCache_schedule(VarpCache *z){ - Timer_set(VCACHE_ENTRY_TTL, vcache_timer_fn, z); -} - -/** Print a varp table. 
- * - * @param z table - */ -void VarpCache_print(VarpCache *z){ - HashTable_for_decl(entry); - VCEntry *ventry; - - dprintf(">\n"); - HashTable_for_each(entry, vcache->table){ - ventry = entry->value; - VCEntry_print(ventry); - } - dprintf("<\n"); -} - -/** Print the varp cache. - */ -void vcache_print(void){ - VarpCache_print(vcache); -} - -/** Create a varp table. - * - * @return new table or null - */ -VarpCache * VarpCache_new(void){ - VarpCache *z = NULL; - - z = ALLOCATE(VarpCache); - z->table = HashTable_new(VCACHE_BUCKETS); - z->table->key_equal_fn = vcache_key_equal_fn; - z->table->key_hash_fn = vcache_key_hash_fn; - VarpCache_schedule(z); - return z; -} - -/** Add a new entry to the varp table. - * - * @param z table - * @param vnet vnet id - * @param vmac virtual MAC address (copied) - * @return new entry or null - */ -VCEntry * VarpCache_add(VarpCache *z, VnetId *vnet, Vmac *vmac){ - VCEntry *ventry; - HTEntry *entry; - - ventry = VCEntry_new(vnet, vmac); - //dprintf("> "); VCEntry_print(ventry); - entry = HashTable_add(z->table, ventry, ventry); - return ventry; -} - -/** Remove an entry from the varp table. - * - * @param z table - * @param ventry entry to remove - * @return removed count - */ -int VarpCache_remove(VarpCache *z, VCEntry *ventry){ - return HashTable_remove(z->table, ventry); -} - -/** Lookup an entry in the varp table. 
- * - * @param z table - * @param vnet vnet id - * @param vmac virtual MAC addres - * @return entry found or null - */ -VCEntry * VarpCache_lookup(VarpCache *z, VnetId *vnet, Vmac *vmac){ - VCKey key = { .vnet = *vnet, .vmac = *vmac }; - VCEntry *ventry; - ventry = HashTable_get(z->table, &key); - return ventry; -} - -void VCEntry_solicit(VCEntry *ventry){ - dprintf(">\n"); - if(VCEntry_get_flags(ventry, VCACHE_FLAG_LOCAL_PROBE)){ - dprintf("> local probe\n"); - varp_send(vnetd->bcast_conn, VARP_OP_REQUEST, - &ventry->key.vnet, &ventry->key.vmac, &ventry->addr); - } - if(VCEntry_get_flags(ventry, VCACHE_FLAG_REMOTE_PROBE)){ - ConnList *l; - dprintf("> remote probe\n"); - for(l = vnetd->connections; l; l = l->next){ - varp_send(l->conn, VARP_OP_REQUEST, - &ventry->key.vnet, &ventry->key.vmac, &ventry->addr); - } - - } - dprintf("<\n"); -} - -int VCEntry_resolve(VCEntry *ventry, IPMessage *msg, int flags){ - int err = 0; - - dprintf("> "); //VCEntry_print(ventry); - ventry->state = VCACHE_STATE_INCOMPLETE; - VCEntry_set_flags(ventry, flags, 1); - IPMessageQueue_add(&ventry->queue, msg); - if(!VCEntry_get_flags(ventry, VCACHE_FLAG_PROBING)){ - VCEntry_set_flags(ventry, VCACHE_FLAG_PROBING, 1); - ventry->probes = 1; - VCEntry_schedule(ventry); - //VCEntry_solicit(ventry); - } - dprintf("< err=%d\n", err); - return err; -} - -/** Update a ventry. Sets the address and state to those given - * and sets the timestamp to 'now'. - * - * @param ventry varp entry - * @param addr care-of address - * @param state state - * @return 0 on success, error code otherwise - */ -int VCEntry_update(VCEntry *ventry, IPMessage *msg, VarpHdr *varph, int state){ - int err = 0; - double now = time_now(); - - if(VCEntry_get_flags(ventry, VCACHE_FLAG_PERMANENT)) goto exit; - ventry->addr = varph->addr; - ventry->timestamp = now; - ventry->state = state; - if(ventry->state == VCACHE_STATE_REACHABLE){ - // Process the output queue. 
- IPMessage *msg; - while((msg = IPMessageQueue_pop(&ventry->queue))){ - dprintf("> announce\n"); - varp_send(msg->conn, VARP_OP_ANNOUNCE, - &ventry->key.vnet, &ventry->key.vmac, &ventry->addr); - } - } - exit: - return err; -} - -/** Update the ventry corresponding to the given varp header. - * - * @param z table - * @param varph varp header - * @param state state - * @return 0 on success, -ENOENT if no entry found - */ -int VarpCache_update(VarpCache *z, IPMessage *msg, VarpHdr *varph, int state){ - int err = 0; - VCEntry *ventry; - - dprintf(">\n"); - ventry = VarpCache_lookup(z, &varph->vnet, &varph->vmac); - if(ventry){ - err = VCEntry_update(ventry, msg, varph, state); - } else { - err = -ENOENT; - } - dprintf("< err=%d\n", err); - return err; -} - - -/** Put old varp entries into the incomplete state. - * Permanent entries are not changed. - * If 'all' is non-zero, all non-permanent entries - * are put into the incomplete state, regardless of age. - * - * @param z table - * @param all reset all entries if non-zero - */ -void VarpCache_sweep(VarpCache *z, int all){ - HashTable_for_decl(entry); - VCEntry *ventry; - double now = time_now(); - double old = now - VCACHE_ENTRY_TTL; - - dprintf(">\n"); - HashTable_for_each(entry, vcache->table){ - ventry = entry->value; - if(!VCEntry_get_flags(ventry, VCACHE_FLAG_PERMANENT) && - (all || (ventry->timestamp < old))){ - ventry->state = VCACHE_STATE_INCOMPLETE; - } - } - dprintf("<\n"); -} - -/** Forward a varp message. - * If local forwards it to remote vnetds. - * If not local forwards it to local net. 
- * - * @param varph varp message to forward - * @param local whether it's local or not - */ -void vcache_forward_varp(VarpHdr *varph, int local){ - uint16_t opcode = ntohs(varph->hdr.opcode); - if(local){ - ConnList *l; - for(l = vnetd->connections; l; l = l->next){ - varp_send(l->conn, opcode, &varph->vnet, &varph->vmac, &varph->addr); - } - } else { - varp_send(vnetd->bcast_conn, opcode, &varph->vnet, &varph->vmac, &varph->addr); - } -} - -/** Handle a varp request. - * - * @param msg incoming message - * @param varph varp message - * @return 0 if ok, -ENOENT if no matching vif, or error code - */ -#if 1 -int vcache_handle_request(IPMessage *msg, VarpHdr *varph, int local){ - dprintf("> local=%d\n", local); - vcache_forward_varp(varph, local); - dprintf("<\n"); - return 0; -} - -#else -int vcache_handle_request(IPMessage *msg, VarpHdr *varph, int local){ - int err = -ENOENT; - VnetId *vnet; - Vmac *vmac; - VCEntry *ventry = NULL; - int reply = 0; - - dprintf(">\n"); - vnet = &varph->vnet; - vmac = &varph->vmac; - ventry = VarpCache_lookup(vcache, vnet, vmac); - if(!ventry){ - ventry = VarpCache_add(vcache, vnet, vmac); - } - if(local){ - // Request coming from the local subnet (on our udp port). - if(ventry->state == VCACHE_STATE_REACHABLE){ - if(local){ - // Have an entry, and it's non-local - reply (locally). - // Potential out-of-date cache problem. - // Should query remotely instead of replying. - varp_send(conn, VARP_OP_ANNOUNCE, ventry); - } - } else { - // Incomplete entry. Resolve. - VCEntry_resolve(ventry, msg, VCACHE_FLAG_REMOTE_PROBE); - } - } else { - // Non-local request (on one of our tcp connetions). - if(ventry->state == VCACHE_STATE_REACHABLE){ - if(local){ - // Have an entry and it's local - reply (remotely). - // Potential out-of-date cache problem. - // Should query locally instead of replying. - varp_send(msg->conn, VARP_OP_ANNOUNCE, ventry); - } else { - // Have a non-local entry - do nothing and assume someone else - // will reply. 
- } - } else { - // Incomplete entry. Resolve. - VCEntry_resolve(ventry, msg, VCACHE_FLAG_LOCAL_PROBE); - } - } - exit: - dprintf("< err=%d\n", err); - return err; -} -#endif - -/** Handle a varp announce message. - * Update the matching ventry if we have one. - * - * @param msg incoming message - * @param varp message - * @return 0 if OK, -ENOENT if no matching entry - */ -int vcache_handle_announce(IPMessage *msg, VarpHdr *varph, int local){ - int err = 0; - - vcache_forward_varp(varph, local); - err = VarpCache_update(vcache, msg, varph, VCACHE_STATE_REACHABLE); - return err; -} - -/** Handle an incoming varp message. - * - * @param msg incoming message - * @return 0 if OK, error code otherwise - */ -int vcache_handle_message(IPMessage *msg, int local){ - int err = -EINVAL; - VnetMsg *vmsg = msg->data; - VarpHdr *varph = &vmsg->varp.varph; - - dprintf(">\n"); -#ifdef DEBUG - { - char vnetbuf[VNET_ID_BUF]; - dprintf("> src=%s:%d\n", inet_ntoa(msg->saddr.sin_addr), ntohs(msg->saddr.sin_port)); - dprintf("> dst=%s:%d\n", inet_ntoa(msg->daddr.sin_addr), ntohs(msg->daddr.sin_port)); - dprintf("> opcode=%d vnet=%s vmac=" MACFMT "\n", - ntohs(varph->opcode), - VnetId_ntoa(&varph->vnet, vnetbuf), - MAC6TUPLE(varph->vmac.mac)); - } -#endif - switch(ntohs(varph->hdr.opcode)){ - case VARP_OP_REQUEST: - err = vcache_handle_request(msg, varph, local); - break; - case VARP_OP_ANNOUNCE: - err = vcache_handle_announce(msg, varph, local); - break; - default: - break; - } - dprintf("< err=%d\n", err); - return err; -} - -/** Initialize the varp cache. - * - * @return 0 on success, error code otherwise - */ -int vcache_init(void){ - int err = 0; - - if(!vcache){ - vcache = VarpCache_new(); - } - return err; -} diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/vcache.h --- a/tools/vnet/vnetd/vcache.h Thu Feb 9 15:09:00 2006 +++ /dev/null Thu Feb 9 15:12:11 2006 @@ -1,141 +0,0 @@ -/* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx>. 
- * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of the - * License, or (at your option) any later version. This library is - * distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * See the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -#ifndef _VNET_VCACHE_H_ -#define _VNET_VCACHE_H_ - -#include "hash_table.h" - -/** Time-to-live of varp cache entries (in seconds).*/ -#define VCACHE_ENTRY_TTL 30.0 - -/** Maximum number of varp probes to make. */ -#define VCACHE_PROBE_MAX 5 - -/** Interval between varp probes (in seconds). */ -#define VCACHE_PROBE_INTERVAL 3.0 - -/** Delay before forwarding a local probe (in seconds). */ -#define VCACHE_LOCAL_DELAY 2.0 - -/** Number of buckets in the varp cache (must be prime). */ -#define VCACHE_BUCKETS 3001 - -enum { - VCACHE_STATE_INCOMPLETE = 1, - VCACHE_STATE_REACHABLE = 2, - VCACHE_STATE_FAILED = 3 -}; - -enum { - VCACHE_FLAG_PROBING = 1, - VCACHE_FLAG_PERMANENT = 2, - VCACHE_FLAG_LOCAL_PROBE = 4, - VCACHE_FLAG_REMOTE_PROBE = 8, -}; - - -#include <asm/byteorder.h> -/* - * Display an IP address in readable format. 
- */ - -#define NIPQUAD(addr) \ - ((unsigned char *)&addr)[0], \ - ((unsigned char *)&addr)[1], \ - ((unsigned char *)&addr)[2], \ - ((unsigned char *)&addr)[3] - -#if defined(__LITTLE_ENDIAN) -#define HIPQUAD(addr) \ - ((unsigned char *)&addr)[3], \ - ((unsigned char *)&addr)[2], \ - ((unsigned char *)&addr)[1], \ - ((unsigned char *)&addr)[0] -#elif defined(__BIG_ENDIAN) -#define HIPQUAD NIPQUAD -#else -#error "Please fix asm/byteorder.h" -#endif /* __LITTLE_ENDIAN */ - -#define IPFMT "%u.%u.%u.%u" -#define MACFMT "%02x:%02x:%02x:%02x:%02x:%02x" - -#define MAC6TUPLE(_mac) (_mac)[0], (_mac)[1], (_mac)[2], (_mac)[3], (_mac)[4], (_mac)[5] - -typedef struct IPMessage { - Conn *conn; - struct sockaddr_in saddr; - struct sockaddr_in daddr; - void *data; - struct IPMessage *next; -} IPMessage; - -typedef struct IPMessageQueue { - IPMessage *msg; - int len; - int maxlen; -} IPMessageQueue; - -/** Key for varp cache entries. */ -typedef struct VCKey { - /** Vnet id (network order). */ - VnetId vnet; - /** Virtual MAC address. */ - Vmac vmac; -} VCKey; - -typedef struct VCEntry { - /** Key for the entry. */ - VCKey key; - - /** Care-of address for the key. */ - VarpAddr addr; - - /** Alias coa if we are a gateway. */ - //uint32_t gateway; - /** Encapsulation to use (if a gateway). */ - //uint32_t encaps; - - /** Where this entry came from. */ - VarpAddr source; - - /** Last-updated timestamp. */ - double timestamp; - - /** State. */ - short state; - - /** Flags. */ - short flags; - - /** Number of probes sent. */ - int probes; - - /** List of messages to reply to when completes. */ - IPMessageQueue queue; - -} VCEntry; - -/** The varp cache. Varp cache entries indexed by VCKey. */ -typedef struct VarpCache { - HashTable *table; -} VarpCache; - -int vcache_init(void); -int vcache_handle_message(IPMessage *msg, int local); - -#endif /* ! 
_VNET_VCACHE_H_ */ diff -r a0e7daa2df33 -r 71b0f00f6344 tools/vnet/vnetd/vnetd.h --- a/tools/vnet/vnetd/vnetd.h Thu Feb 9 15:09:00 2006 +++ /dev/null Thu Feb 9 15:12:11 2006 @@ -1,81 +0,0 @@ -/* - * Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx>. - * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of the - * License, or (at your option) any later version. This library is - * distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * See the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -#ifndef _VNET_VNETD_H_ -#define _VNET_VNETD_H_ - -#include <asm/types.h> -#include <linux/if_ether.h> -#include "if_varp.h" -#include "varp_util.h" - -#include "connection.h" -#include "sxpr.h" - -/** Vnetd udp port in host order. */ -#define VNETD_PORT VARP_PORT - -/** Vnetd peer port in host order. 
*/ -#define VNETD_PEER_PORT (VARP_PORT + 1) - -typedef struct VnetMsgVarp { - VarpHdr varph; -} VnetMsgVarp; - -#define VNET_FWD_MAX (1500 + 200) - -typedef struct VnetMsgFwd { - VnetMsgHdr; - uint16_t protocol; - uint16_t len; - uint8_t data[VNET_FWD_MAX]; -} __attribute__((packed)) VnetMsgFwd; - -typedef union VnetMsg { - VnetMsgHdr hdr; - VnetMsgVarp varp; - VnetMsgFwd fwd; -} VnetMsg; - -enum { - VNET_VARP_ID = VARP_ID, - VNET_FWD_ID = 200, -}; - -typedef struct Vnetd { - unsigned long port; - unsigned long peer_port; - int verbose; - - int esp_sock; - int etherip_sock; - - struct sockaddr_in addr; - struct sockaddr_in mcast_addr; - - Sxpr peers; - - Conn *listen_conn; - Conn *udp_conn; - Conn *bcast_conn; - - ConnList *connections; - -} Vnetd; - -extern Vnetd *vnetd; - -#endif /* ! _VNET_VNETD_H_ */ _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |