[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH][RFC] net/bridge: Add basic VEPA support to Xen Dom0



This patch adds basic Virtual Ethernet Port Aggregator (VEPA)
capabilities to the Xen Dom0 kernel Ethernet bridging code.

A Virtual Ethernet Port Aggregator (VEPA) is a capability within
a physical end station that collaborates with an adjacent, external
bridge to provide distributed bridging support between multiple
virtual end stations and external networks. The VEPA collaborates
by forwarding all station-originated frames to the adjacent bridge
for frame processing and frame relay (including so-called 'hairpin'
forwarding) and by steering and replicating frames received from
the VEPA uplink to the appropriate destinations. A VEPA may be
implemented in software or in conjunction with embedded hardware.

In particular, the patch extends the Xen Dom0 Ethernet bridge to act as
(1) a VEPA - for this we have added VEPA forwarding functionality and
    added a configuration option for a VEPA uplink port, or as
(2) a bridge supporting 'hairpin' forwarding - for this we have added a
    bridge port 'hairpin' mode which allows sending frames back out
    through the port the frame was received on.

Configuration of VEPA capabilities through Linux userspace bridge
utilities is provided by an additional patch 'bridge-utils: add
basic VEPA support'.

Integration of the VEPA capabilities with the Xen tools is provided by a
separate patch 'tools: Add basic VEPA support', which extends Xen's
userspace network scripts.

You can find additional information on VEPA here:
http://tech.groups.yahoo.com/group/evb/
http://www.ieee802.org/1/files/public/docs2009/new-hudson-vepa_seminar-20090514d.pdf

Signed-off-by: Paul Congdon <paul.congdon@xxxxxx>
Signed-off-by: Anna Fischer <anna.fischer@xxxxxx>

---

A similar patch has also been submitted to the Linux/KVM developer community
for inclusion of the VEPA capabilities into the mainline Linux kernel.

---

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -343,6 +343,15 @@
 
        fdb = fdb_find(head, addr);
        if (likely(fdb)) {
+               /*
+                * If we are a VEPA and the source port is the uplink,
+                * this could be a reflected packet, so don't learn any
+                * addresses that already are in the fdb but on other ports
+                */
+               if ((br->flags & BR_VEPA_MODE) && br->uplink == source &&
+                   fdb->dst != br->uplink)
+                       return;
+
                /* attempt to update an entry for a local interface */
                if (unlikely(fdb->is_local)) {
                        if (net_ratelimit()) 
@@ -364,3 +373,16 @@
                spin_unlock(&br->hash_lock);
        }
 }
+
+/*
+ * Return the port stored in the fdb entry for @addr, or NULL when
+ * fdb_find() has no entry for that address.
+ */
+struct net_bridge_port *br_vepa_find_src(struct net_bridge *br,
+                  const unsigned char *addr)
+{
+       struct net_bridge_fdb_entry *entry;
+
+       entry = fdb_find(&br->hash[br_mac_hash(addr)], addr);
+       return entry ? entry->dst : NULL;
+}
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -24,7 +24,8 @@
 static inline int should_deliver(const struct net_bridge_port *p, 
                                  const struct sk_buff *skb)
 {
-       return (skb->dev != p->dev && p->state == BR_STATE_FORWARDING);
+       return (p->state == BR_STATE_FORWARDING &&
+               (skb->dev != p->dev || (p->flags & BR_HAIRPIN_MODE)));
 }
 
 static inline unsigned packet_length(const struct sk_buff *skb)
@@ -92,6 +93,17 @@
 }
 
 /* called with rcu_read_lock */
+void br_vepa_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
+{
+       /* Without a destination port the frame is freed, not forwarded */
+       if (to == NULL) {
+               kfree_skb(skb);
+               return;
+       }
+       __br_forward(to, skb);
+}
+
+/* called with rcu_read_lock */
 void br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
 {
        if (should_deliver(to, skb)) {
@@ -109,6 +121,7 @@
 {
        struct net_bridge_port *p;
        struct net_bridge_port *prev;
+       struct net_bridge_port *sp = NULL;
 
        if (clone) {
                struct sk_buff *skb2;
@@ -121,6 +134,13 @@
                skb = skb2;
        }
 
+       /*
+        * If we are a VEPA, then we do not want to send the frame
+        * to the port it came from originally.
+        */
+       if (br->flags & BR_VEPA_MODE)
+               sp = br_vepa_find_src(br, eth_hdr(skb)->h_source);
+
        prev = NULL;
 
        list_for_each_entry_rcu(p, &br->port_list, list) {
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -168,6 +168,8 @@
        list_del_rcu(&p->list);
 
        rcu_assign_pointer(dev->br_port, NULL);
+       if (br->uplink == p)
+               br->uplink = NULL;
 
        kobject_uevent(&p->kobj, KOBJ_REMOVE);
        kobject_del(&p->kobj);
@@ -225,6 +227,7 @@
        br->topology_change_detected = 0;
        br->ageing_time = 300 * HZ;
        INIT_LIST_HEAD(&br->age_list);
+       br->uplink = NULL;
 
        br_stp_timer_init(br);
 
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -52,6 +52,15 @@
        br = p->br;
        br_fdb_update(br, p, eth_hdr(skb)->h_source);
 
+       /*
+        * If we are a VEPA, and the receiving port is not the uplink we
+        * simply want to send this frame to the uplink (after learning)
+        */
+       if ((br->flags & BR_VEPA_MODE) && p != br->uplink) {
+               br_vepa_deliver(br->uplink, skb);
+               goto out;
+       }
+
        if (p->state == BR_STATE_LEARNING)
                goto drop;
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -84,6 +84,9 @@
        struct kobject                  kobj;
        struct work_struct              carrier_check;
        struct rcu_head                 rcu;
+
+       unsigned long                   flags;
+#define BR_HAIRPIN_MODE                0x00000001
 };
 
 struct net_bridge
@@ -96,6 +99,8 @@
        struct hlist_head               hash[BR_HASH_SIZE];
        struct list_head                age_list;
        unsigned long                   feature_mask;
+       unsigned long                   flags;
+#define BR_VEPA_MODE           0x00000010
 
        /* STP */
        bridge_id                       designated_root;
@@ -120,6 +125,9 @@
        struct timer_list               topology_change_timer;
        struct timer_list               gc_timer;
        struct kobject                  ifobj;
+
+       /* VEPA */
+       struct net_bridge_port          *uplink;
 };
 
 extern struct notifier_block br_device_notifier;
@@ -157,6 +165,9 @@
 extern void br_fdb_update(struct net_bridge *br,
                          struct net_bridge_port *source,
                          const unsigned char *addr);
+extern struct net_bridge_port *br_vepa_find_src(struct net_bridge *br,
+                                               const unsigned char *addr);
+
 
 /* br_forward.c */
 extern void br_deliver(const struct net_bridge_port *to,
@@ -171,6 +182,8 @@
 extern void br_flood_forward(struct net_bridge *br,
                      struct sk_buff *skb,
                      int clone);
+extern void br_vepa_deliver(const struct net_bridge_port *to,
+               struct sk_buff *skb);
 
 /* br_if.c */
 extern int br_add_bridge(const char *name);
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -290,6 +290,69 @@
                         show_group_addr, store_group_addr);
 
 
+static ssize_t show_vepa_mode(struct class_device *d, char *buf)
+{
+       struct net_bridge *bridge = to_bridge(d);
+       /* Prints 1 when BR_VEPA_MODE is set in the bridge flags, else 0 */
+       return sprintf(buf, "%d\n", !!(bridge->flags & BR_VEPA_MODE));
+}
+
+/* Nonzero decimal input sets BR_VEPA_MODE on the bridge; zero clears it */
+static ssize_t store_vepa_mode(struct class_device *d, const char *buf,
+                              size_t len)
+{
+       struct net_bridge *br = to_bridge(d);
+       int enable = 0;
+
+       if (!capable(CAP_NET_ADMIN))
+               return -EPERM;
+
+       if (sscanf(buf, "%d", &enable) != 1)
+               return -EINVAL;
+
+       /* Flip the flag while holding the RTNL lock */
+       rtnl_lock();
+       br->flags = enable ? (br->flags | BR_VEPA_MODE)
+                          : (br->flags & ~BR_VEPA_MODE);
+       rtnl_unlock();
+
+       return len;
+}
+static CLASS_DEVICE_ATTR(vepa_mode, S_IRUGO | S_IWUSR, show_vepa_mode,
+                  store_vepa_mode);
+
+static ssize_t show_uplink_port(struct class_device *d, char *buf)
+{
+       struct net_bridge *br = to_bridge(d);
+       /* An unset uplink, or one without a device, prints an empty line */
+       if (!br->uplink || !br->uplink->dev)
+               return sprintf(buf, "\n");
+       return sprintf(buf, "%s\n", br->uplink->dev->name);
+}
+
+/* Make the bridge port named in @buf the VEPA uplink of this bridge */
+static ssize_t store_uplink_port(struct class_device *d, const char *buf,
+                                size_t len)
+{
+       struct net_bridge *br = to_bridge(d);
+       struct net_bridge_port *port = NULL;
+       struct net_device *dev;
+
+       if (!capable(CAP_NET_ADMIN))
+               return -EPERM;
+       /* Look up under RTNL; __dev_get_by_name() takes no reference */
+       rtnl_lock();
+       dev = __dev_get_by_name(buf);
+       if (dev && dev->br_port && dev->br_port->br == br)
+               port = dev->br_port;
+       br->uplink = port;      /* cleared when the name is not our port */
+       rtnl_unlock();
+
+       return port ? (ssize_t)len : -EINVAL;
+}
+static CLASS_DEVICE_ATTR(uplink_port, S_IRUGO | S_IWUSR, show_uplink_port,
+                  store_uplink_port);
+
 static struct attribute *bridge_attrs[] = {
        &class_device_attr_forward_delay.attr,
        &class_device_attr_hello_time.attr,
@@ -308,6 +371,8 @@
        &class_device_attr_topology_change_timer.attr,
        &class_device_attr_gc_timer.attr,
        &class_device_attr_group_addr.attr,
+       &class_device_attr_vepa_mode.attr,
+       &class_device_attr_uplink_port.attr,
        NULL
 };
 
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -137,6 +137,22 @@
 }
 static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL);
 
+static ssize_t show_hairpin_mode(struct net_bridge_port *p, char *buf)
+{
+       /* Prints 1 when BR_HAIRPIN_MODE is set in the port flags, else 0 */
+       return sprintf(buf, "%d\n", !!(p->flags & BR_HAIRPIN_MODE));
+}
+static ssize_t store_hairpin_mode(struct net_bridge_port *p, unsigned long v)
+{
+       /* Any nonzero value enables hairpin mode; zero disables it */
+       p->flags = v ? (p->flags | BR_HAIRPIN_MODE)
+                    : (p->flags & ~BR_HAIRPIN_MODE);
+
+       return 0;
+}
+static BRPORT_ATTR(hairpin_mode, S_IRUGO | S_IWUSR,
+                  show_hairpin_mode, store_hairpin_mode);
+
 static struct brport_attribute *brport_attrs[] = {
        &brport_attr_path_cost,
        &brport_attr_priority,
@@ -152,6 +168,7 @@
        &brport_attr_message_age_timer,
        &brport_attr_forward_delay_timer,
        &brport_attr_hold_timer,
+       &brport_attr_hairpin_mode,
        NULL
 };

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.