[Xen-devel] [PATCH 14/17] Add support for receiver-map mode.
In this mode of operation, the receiving domain maps the sending
domain's buffers, rather than grant-copying them into local memory.
This is marginally faster, but requires the receiving domain to be
somewhat trusted, because:

a) It can see anything else which happens to be on the same page as
   the transmit buffer, and
b) It can just hold onto the pages indefinitely, causing a memory
   leak in the transmitting domain.

It's therefore only really suitable for talking to a trusted peer,
and we use it in that way.

Signed-off-by: Steven Smith <steven.smith@xxxxxxxxxx>
---
 drivers/net/xen-netchannel2/Makefile           |    3 +-
 drivers/net/xen-netchannel2/chan.c             |   14 +
 drivers/net/xen-netchannel2/netchannel2_core.h |   17 +-
 drivers/net/xen-netchannel2/receiver_map.c     |  787 ++++++++++++++++++++++++
 drivers/net/xen-netchannel2/recv_packet.c      |   23 +
 drivers/net/xen-netchannel2/rscb.c             |   46 +-
 drivers/net/xen-netchannel2/util.c             |   14 +
 drivers/net/xen-netchannel2/xmit_packet.c      |   12 +-
 include/xen/interface/io/netchannel2.h         |   20 +
 9 files changed, 920 insertions(+), 16 deletions(-)
 create mode 100644 drivers/net/xen-netchannel2/receiver_map.c

diff --git a/drivers/net/xen-netchannel2/Makefile b/drivers/net/xen-netchannel2/Makefile
index 565ba89..d6fb796 100644
--- a/drivers/net/xen-netchannel2/Makefile
+++ b/drivers/net/xen-netchannel2/Makefile
@@ -1,7 +1,8 @@
 obj-$(CONFIG_XEN_NETCHANNEL2) += netchannel2.o
 
 netchannel2-objs := chan.o netchan2.o rscb.o util.o \
-	xmit_packet.o offload.o recv_packet.o poll.o
+	xmit_packet.o offload.o recv_packet.o poll.o \
+	receiver_map.o
 
 ifeq ($(CONFIG_XEN_NETDEV2_BACKEND),y)
 netchannel2-objs += netback2.o
diff --git a/drivers/net/xen-netchannel2/chan.c b/drivers/net/xen-netchannel2/chan.c
index d5eb26e..e96a8ee 100644
--- a/drivers/net/xen-netchannel2/chan.c
+++ b/drivers/net/xen-netchannel2/chan.c
@@ -404,6 +404,13 @@ struct netchannel2 *nc2_new(struct xenbus_device *xd)
 		return NULL;
 	}
 
+	if (local_trusted) {
+		if (init_receive_map_mode() < 0) {
+			nc2_release(nc);
+			return NULL;
+		}
+	}
+
 	netdev->netdev_ops = &nc2_net_device_ops;
 
 	/* We need to hold the ring lock in order to send messages
@@ -504,6 +511,8 @@ int nc2_attach_rings(struct netchannel2 *nc,
 
 	spin_unlock_bh(&nc->rings.lock);
 
+	resume_receive_map_mode();
+
 	netif_carrier_on(nc->net_device);
 
 	/* Kick it to get it going. */
@@ -635,6 +644,11 @@ int nc2_get_evtchn_port(struct netchannel2 *nc)
 	return nc->rings.evtchn;
 }
 
+void nc2_suspend(struct netchannel2 *nc)
+{
+	suspend_receive_map_mode();
+}
+
 /* @ncrp has been recently nc2_kick()ed.  Do all of the necessary stuff.
*/ static int process_ring(struct napi_struct *napi, diff --git a/drivers/net/xen-netchannel2/netchannel2_core.h b/drivers/net/xen-netchannel2/netchannel2_core.h index b5aa584..2572017 100644 --- a/drivers/net/xen-netchannel2/netchannel2_core.h +++ b/drivers/net/xen-netchannel2/netchannel2_core.h @@ -38,6 +38,7 @@ enum transmit_policy { transmit_policy_unknown = 0, transmit_policy_first = 0xf001, transmit_policy_grant = transmit_policy_first, + transmit_policy_map, transmit_policy_small, transmit_policy_last = transmit_policy_small }; @@ -321,6 +322,11 @@ struct sk_buff *handle_receiver_copy_packet(struct netchannel2 *nc, struct netchannel2_msg_hdr *hdr, unsigned nr_frags, unsigned frags_off); +struct sk_buff *handle_receiver_map_packet(struct netchannel2 *nc, + struct netchannel2_msg_packet *msg, + struct netchannel2_msg_hdr *hdr, + unsigned nr_frags, + unsigned frags_off); enum prepare_xmit_result { PREP_XMIT_OKAY = 0, @@ -333,9 +339,11 @@ enum prepare_xmit_result prepare_xmit_allocate_small( struct sk_buff *skb); enum prepare_xmit_result prepare_xmit_allocate_grant( struct netchannel2_ring_pair *ncrp, - struct sk_buff *skb); + struct sk_buff *skb, + int use_subpage_grants); void xmit_grant(struct netchannel2_ring_pair *ncrp, struct sk_buff *skb, + int use_subpage_grants, volatile void *msg); void queue_finish_packet_message(struct netchannel2_ring_pair *ncrp, @@ -354,6 +362,8 @@ void fetch_fragment(struct netchannel2_ring_pair *ncrp, struct netchannel2_fragment *frag, unsigned off); +void pull_through(struct sk_buff *skb, unsigned count); + void nc2_kick(struct netchannel2_ring_pair *ncrp); int nc2_map_grants(struct grant_mapping *gm, @@ -367,6 +377,11 @@ void queue_packet_to_interface(struct sk_buff *skb, void nc2_rscb_on_gntcopy_fail(void *ctxt, struct gnttab_copy *gop); +int init_receive_map_mode(void); +void deinit_receive_map_mode(void); +void suspend_receive_map_mode(void); +void resume_receive_map_mode(void); + int nc2_start_xmit(struct sk_buff *skb, struct net_device *dev); int nc2_really_start_xmit(struct netchannel2_ring_pair *ncrp, struct sk_buff *skb); diff --git a/drivers/net/xen-netchannel2/receiver_map.c b/drivers/net/xen-netchannel2/receiver_map.c new file mode 100644 index 0000000..0d7ff84 --- /dev/null +++ b/drivers/net/xen-netchannel2/receiver_map.c @@ -0,0 +1,787 @@ +/* Support for mapping packets into the local domain, rather than + copying them or using pre-posted buffers. We only implement + receive-side support here; for transmit-side, we use the rscb.c + implementation. */ +#include <linux/kernel.h> +#include <linux/delay.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <asm/xen/page.h> +#include <xen/live_maps.h> +#include <xen/grant_table.h> +#include <xen/balloon.h> +#include <xen/evtchn.h> +#include "netchannel2_core.h" + +#define MAX_MAPPED_FRAGS 1024 +#define MAX_MAPPED_PACKETS MAX_PENDING_FINISH_PACKETS +#define SKB_MIN_PAYLOAD_SIZE 128 + +static DEFINE_SPINLOCK(global_map_lock); +static struct receive_mapper *receive_mapper; + +/* How long do we leave the packets in the Linux stack before trying + to copy them, in jiffies? */ +#define PACKET_TIMEOUT (HZ/2) + +/* A slot into which we could map a fragment. 
*/ +struct rx_map_fragment { + struct list_head list; + struct rx_map_packet *packet; + grant_handle_t handle; /* 0 if the fragment isn't currently + * mapped */ + struct netchannel2_fragment nc_frag; +}; + +struct rx_map_packet { + struct list_head list; + struct list_head frags; + /* We take a reference for every mapped fragment associated + with the packet. When the refcnt goes to zero, the packet + is finished, and can be moved to the + finished_packets_list. */ + atomic_t refcnt; + unsigned id; + unsigned long expires; /* We expect Linux to have finished + with the packet by this time (in + jiffies), or we try to copy it. */ + struct netchannel2 *nc; + uint8_t flags; +}; + +struct receive_mapper { + struct page_foreign_tracker *tracker; + + struct page **pages; + + /* Nests inside the netchannel2 lock. The + finished_packets_lock nests inside this. */ + spinlock_t rm_lock; + + /* Packet fragments which we've mapped, or slots into which we + could map packets. The free list and count are protected + by @rm_lock. */ + struct rx_map_fragment frags[MAX_MAPPED_FRAGS]; + struct list_head free_frags; + + struct rx_map_packet packets[MAX_MAPPED_PACKETS]; + struct list_head free_packets; + struct list_head active_packets; + unsigned nr_free_packets; + + /* Packets which Linux has finished with but which we haven't + returned to the other endpoint yet. */ + spinlock_t finished_packets_lock; /* BH-safe leaf lock, + * acquired from the page + * free callback. Nests + * inside the rm_lock. */ + struct list_head finished_packets; + + struct tasklet_struct gc_tasklet; + + struct timer_list expire_timer; + + /* Set if we're trying to run the mapper down prior to + suspending the domain. */ + uint8_t suspending; +}; + +static void suspend_receive_mapper(struct receive_mapper *rm); + +static unsigned fragment_idx(const struct rx_map_fragment *frag) +{ + return frag - receive_mapper->frags; +} + +static int alloc_rx_frags_for_packet(unsigned nr_frags, + struct rx_map_packet *packet) +{ + struct rx_map_fragment *rmf; + unsigned x; + + INIT_LIST_HEAD(&packet->frags); + for (x = 0; x < nr_frags; x++) { + if (list_empty(&receive_mapper->free_frags)) + goto err; + rmf = list_entry(receive_mapper->free_frags.next, + struct rx_map_fragment, + list); + rmf->packet = packet; + rmf->handle = -1; + list_move(&rmf->list, &packet->frags); + } + return 0; + +err: + list_splice_init(&packet->frags, &receive_mapper->free_frags); + return -EBUSY; +} + +static struct rx_map_packet *alloc_rx_packet(struct netchannel2 *nc, + unsigned nr_frags) +{ + struct rx_map_packet *rmp; + + spin_lock(&receive_mapper->rm_lock); + if (list_empty(&receive_mapper->free_packets) || + receive_mapper->suspending) { + spin_unlock(&receive_mapper->rm_lock); + return NULL; + } + rmp = list_entry(receive_mapper->free_packets.next, + struct rx_map_packet, list); + + if (alloc_rx_frags_for_packet(nr_frags, rmp) < 0) { + spin_unlock(&receive_mapper->rm_lock); + return NULL; + } + list_del(&rmp->list); + atomic_set(&rmp->refcnt, nr_frags); + rmp->nc = nc; + receive_mapper->nr_free_packets--; + + spin_unlock(&receive_mapper->rm_lock); + + return rmp; +} + +struct grant_unmapper { + unsigned nr_gops; + struct gnttab_unmap_grant_ref gop_queue[32]; +}; + +static void do_unmaps(struct grant_unmapper *unmapper) +{ + int ret; + unsigned x; + + if (unmapper->nr_gops != 0) { + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + unmapper->gop_queue, + unmapper->nr_gops); + BUG_ON(ret); + for (x = 0; x < unmapper->nr_gops; x++) { + set_phys_to_machine( 
+ __pa(unmapper->gop_queue[x].host_addr) >> + PAGE_SHIFT, + INVALID_P2M_ENTRY); + } + } + unmapper->nr_gops = 0; +} + +static void grant_unmap(struct grant_unmapper *unmapper, + void *va, + int handle) +{ + struct gnttab_unmap_grant_ref *gop; + if (unmapper->nr_gops == ARRAY_SIZE(unmapper->gop_queue)) + do_unmaps(unmapper); + gop = &unmapper->gop_queue[unmapper->nr_gops]; + gnttab_set_unmap_op(gop, (unsigned long)va, GNTMAP_host_map, handle); + unmapper->nr_gops++; +} + +/* A tasklet which is invoked shortly after a packet is released so + that we can send the FINISH_PACKET message. */ +static void gc_tasklet(unsigned long _rm) +{ + struct list_head packets; + struct rx_map_packet *packet; + struct rx_map_fragment *rx_frag; + struct list_head released_fragments; + unsigned nr_released_packets; + unsigned idx; + struct grant_unmapper unmapper; + struct page *page; + struct netchannel2 *locked_nc; + + INIT_LIST_HEAD(&packets); + + spin_lock(&receive_mapper->finished_packets_lock); + list_splice_init(&receive_mapper->finished_packets, &packets); + spin_unlock(&receive_mapper->finished_packets_lock); + + /* Unmap the fragments. */ + unmapper.nr_gops = 0; + BUG_ON(packets.next == NULL); + list_for_each_entry(packet, &packets, list) { + BUG_ON(packet->list.next == NULL); + BUG_ON(atomic_read(&packet->refcnt) != 0); + BUG_ON(packet->frags.next == NULL); + list_for_each_entry(rx_frag, &packet->frags, list) { + BUG_ON(rx_frag->list.next == NULL); + if (rx_frag->handle == -1) + continue; + idx = fragment_idx(rx_frag); + page = receive_mapper->pages[idx]; + stop_tracking_page(page); + grant_unmap(&unmapper, page_address(page), + rx_frag->handle); + } + } + do_unmaps(&unmapper); + + /* Tell the other end that the packets are finished, and + accumulate the fragments into a local free list. */ + INIT_LIST_HEAD(&released_fragments); + nr_released_packets = 0; + + locked_nc = NULL; + list_for_each_entry(packet, &packets, list) { + if (locked_nc != packet->nc) { + if (locked_nc) { + spin_unlock(&locked_nc->rings.lock); + nc2_kick(&locked_nc->rings); + } + spin_lock(&packet->nc->rings.lock); + locked_nc = packet->nc; + } + BUG_ON(packet->frags.next == NULL); + list_for_each_entry(rx_frag, &packet->frags, list) { + BUG_ON(rx_frag->list.next == NULL); + idx = fragment_idx(rx_frag); + gnttab_reset_grant_page(receive_mapper->pages[idx]); + } + nr_released_packets++; + list_splice_init(&packet->frags, &released_fragments); + queue_finish_packet_message(&locked_nc->rings, packet->id, + packet->flags); + } + + if (locked_nc) { + spin_unlock(&locked_nc->rings.lock); + nc2_kick(&locked_nc->rings); + locked_nc = NULL; + + spin_lock(&receive_mapper->rm_lock); + list_splice(&packets, &receive_mapper->free_packets); + list_splice(&released_fragments, &receive_mapper->free_frags); + receive_mapper->nr_free_packets += nr_released_packets; + + /* Reprogram the expire timer. */ + if (!list_empty(&receive_mapper->active_packets)) { + mod_timer(&receive_mapper->expire_timer, + list_entry(receive_mapper->active_packets.next, + struct rx_map_packet, + list)->expires); + } + spin_unlock(&receive_mapper->rm_lock); + } +} + +/* Decrement the refcnt on @rmp and, if necessary, move it to the + finished packets list and schedule the GC tasklet. */ +static void put_rx_map_packet(struct rx_map_packet *rmp) +{ + if (atomic_dec_and_test(&rmp->refcnt)) { + /* Remove it from the active list. */ + spin_lock_bh(&receive_mapper->rm_lock); + list_del(&rmp->list); + spin_unlock_bh(&receive_mapper->rm_lock); + + /* Add it to the finished list. 
*/ + spin_lock_bh(&receive_mapper->finished_packets_lock); + list_add_tail(&rmp->list, &receive_mapper->finished_packets); + spin_unlock_bh(&receive_mapper->finished_packets_lock); + + tasklet_schedule(&receive_mapper->gc_tasklet); + } +} + + +/* The page @page, which was previously part of a receiver-mapped SKB, + * has been released. If it was the last page involved in its SKB, + * the packet is finished and we can tell the other end that it's + * finished. + */ +static void netchan2_page_release(struct page *page, unsigned order) +{ + struct rx_map_fragment *frag; + struct rx_map_packet *rmp; + + BUG_ON(order != 0); + + frag = (struct rx_map_fragment *)page->mapping; + rmp = frag->packet; + + put_rx_map_packet(rmp); +} + +/* Unmap the packet, removing all other references to it. The caller + * should take an additional reference to the packet before calling + * this, to stop it disappearing underneath us. The only way of + * checking whether this succeeded is to look at the packet's + * reference count after it returns. + */ +static void unmap_this_packet(struct rx_map_packet *rmp) +{ + struct rx_map_fragment *rx_frag; + unsigned idx; + int r; + int cnt; + + /* Unmap every fragment in the packet. We don't fail the whole + function just because gnttab_copy_grant_page() failed, + because success or failure will be inferable from the + reference count on the packet (this makes it easier to + handle the case where some pages have already been copied, + for instance). */ + cnt = 0; + list_for_each_entry(rx_frag, &rmp->frags, list) { + idx = fragment_idx(rx_frag); + if (rx_frag->handle != -1) { + r = gnttab_copy_grant_page(rx_frag->handle, + &receive_mapper->pages[idx]); + if (r == 0) { + /* We copied the page, so it's not really + mapped any more. */ + rx_frag->handle = -1; + atomic_dec(&rmp->refcnt); + } + } + cnt++; + } + + /* Caller should hold a reference. */ + BUG_ON(atomic_read(&rmp->refcnt) == 0); +} + +static void unmap_all_packets(void) +{ + struct rx_map_packet *rmp; + struct rx_map_packet *next; + struct list_head finished_packets; + int need_tasklet; + + INIT_LIST_HEAD(&finished_packets); + + spin_lock_bh(&receive_mapper->rm_lock); + + list_for_each_entry_safe(rmp, next, &receive_mapper->active_packets, + list) { + atomic_inc(&rmp->refcnt); + unmap_this_packet(rmp); + if (atomic_dec_and_test(&rmp->refcnt)) + list_move(&rmp->list, finished_packets.prev); + } + spin_unlock_bh(&receive_mapper->rm_lock); + + need_tasklet = !list_empty(&finished_packets); + + spin_lock_bh(&receive_mapper->finished_packets_lock); + list_splice(&finished_packets, receive_mapper->finished_packets.prev); + spin_unlock_bh(&receive_mapper->finished_packets_lock); + + if (need_tasklet) + tasklet_schedule(&receive_mapper->gc_tasklet); +} + +static void free_receive_mapper(struct receive_mapper *rm) +{ + unsigned x; + + /* Get rid of any packets which are currently mapped. */ + suspend_receive_mapper(rm); + + /* Stop the expiry timer. We know it won't get requeued + * because there are no packets outstanding and rm->suspending + * is set (because of suspend_receive_mapper()). */ + del_timer_sync(&rm->expire_timer); + + /* Wait for any last instances of the tasklet to finish. 
*/ + tasklet_kill(&rm->gc_tasklet); + + if (rm->pages != NULL) { + for (x = 0; x < MAX_MAPPED_FRAGS; x++) { + if (PageForeign(rm->pages[x])) + ClearPageForeign(rm->pages[x]); + rm->pages[x]->mapping = NULL; + } + free_empty_pages_and_pagevec(rm->pages, MAX_MAPPED_FRAGS); + } + if (rm->tracker != NULL) + free_page_foreign_tracker(rm->tracker); + kfree(rm); +} + +/* Timer invoked shortly after a packet expires, so that we can copy + the data and get it back from Linux. This is necessary if a packet + gets stuck in a socket RX queue somewhere, or you risk a + deadlock. */ +static void expire_timer(unsigned long data) +{ + struct rx_map_packet *rmp, *next; + struct list_head finished_packets; + int need_tasklet; + + INIT_LIST_HEAD(&finished_packets); + + spin_lock(&receive_mapper->rm_lock); + list_for_each_entry_safe(rmp, next, &receive_mapper->active_packets, + list) { + if (time_after(rmp->expires, jiffies)) { + mod_timer(&receive_mapper->expire_timer, rmp->expires); + break; + } + atomic_inc(&rmp->refcnt); + unmap_this_packet(rmp); + if (atomic_dec_and_test(&rmp->refcnt)) { + list_move(&rmp->list, finished_packets.prev); + } else { + /* Couldn't unmap the packet, either because + it's in use by real hardware or we've run + out of memory. Send the packet to the end + of the queue and update the expiry time so + that we try again later. */ + /* Note that this can make the active packet + list slightly out of order. Oh well; it + won't be by more than a few jiffies, and it + doesn't really matter that much. */ + rmp->expires = jiffies + PACKET_TIMEOUT; + list_move(&rmp->list, + receive_mapper->active_packets.prev); + } + } + spin_unlock(&receive_mapper->rm_lock); + + need_tasklet = !list_empty(&finished_packets); + + spin_lock(&receive_mapper->finished_packets_lock); + list_splice(&finished_packets, receive_mapper->finished_packets.prev); + spin_unlock(&receive_mapper->finished_packets_lock); + + if (need_tasklet) + tasklet_schedule(&receive_mapper->gc_tasklet); +} + +static struct receive_mapper *new_receive_mapper(void) +{ + struct receive_mapper *rm; + unsigned x; + + rm = kzalloc(sizeof(*rm), GFP_KERNEL); + if (!rm) + goto err; + INIT_LIST_HEAD(&rm->free_frags); + INIT_LIST_HEAD(&rm->free_packets); + INIT_LIST_HEAD(&rm->active_packets); + INIT_LIST_HEAD(&rm->finished_packets); + spin_lock_init(&rm->rm_lock); + spin_lock_init(&rm->finished_packets_lock); + for (x = 0; x < MAX_MAPPED_FRAGS; x++) + list_add_tail(&rm->frags[x].list, &rm->free_frags); + for (x = 0; x < MAX_MAPPED_PACKETS; x++) + list_add_tail(&rm->packets[x].list, &rm->free_packets); + rm->nr_free_packets = MAX_MAPPED_PACKETS; + + setup_timer(&rm->expire_timer, expire_timer, 0); + tasklet_init(&rm->gc_tasklet, gc_tasklet, 0); + + rm->tracker = alloc_page_foreign_tracker(MAX_MAPPED_FRAGS); + if (!rm->tracker) + goto err; + rm->pages = alloc_empty_pages_and_pagevec(MAX_MAPPED_FRAGS); + if (!rm->pages) + goto err; + for (x = 0; x < MAX_MAPPED_FRAGS; x++) { + SetPageForeign(rm->pages[x], netchan2_page_release); + rm->pages[x]->mapping = (void *)&rm->frags[x]; + } + + return rm; + +err: + if (rm != NULL) + free_receive_mapper(rm); + return NULL; +} + +static void attach_frag_to_skb(struct sk_buff *skb, + struct rx_map_fragment *frag) +{ + unsigned idx; + struct skb_shared_info *shinfo; + skb_frag_t *sk_frag; + + shinfo = skb_shinfo(skb); + sk_frag = &shinfo->frags[shinfo->nr_frags]; + idx = fragment_idx(frag); + sk_frag->page = receive_mapper->pages[idx]; + sk_frag->page_offset = frag->nc_frag.off; + sk_frag->size = 
frag->nc_frag.size; + shinfo->nr_frags++; +} + +struct rx_plan { + int is_failed; + unsigned nr_mops; + struct gnttab_map_grant_ref mops[8]; + struct rx_map_fragment *frags[8]; +}; + +static void flush_grant_operations(struct rx_plan *rp) +{ + unsigned x; + int ret; + struct gnttab_map_grant_ref *mop; + + if (rp->nr_mops == 0) + return; + if (!rp->is_failed) { + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + rp->mops, + rp->nr_mops); + BUG_ON(ret); + for (x = 0; x < rp->nr_mops; x++) { + mop = &rp->mops[x]; + if (mop->status != 0) { + rp->is_failed = 1; + } else { + rp->frags[x]->handle = mop->handle; + set_phys_to_machine( + __pa(mop->host_addr) >> PAGE_SHIFT, + FOREIGN_FRAME(mop->dev_bus_addr >> + PAGE_SHIFT)); + } + } + } + rp->nr_mops = 0; +} + +static void map_fragment(struct rx_plan *rp, + struct rx_map_fragment *rx_frag, + struct netchannel2 *nc) +{ + unsigned idx = fragment_idx(rx_frag); + struct gnttab_map_grant_ref *mop; + + if (rp->nr_mops == ARRAY_SIZE(rp->mops)) + flush_grant_operations(rp); + mop = &rp->mops[rp->nr_mops]; + gnttab_set_map_op(mop, + (unsigned long)page_address(receive_mapper->pages[idx]), + GNTMAP_host_map | GNTMAP_readonly, + rx_frag->nc_frag.receiver_map.gref, + nc->rings.otherend_id); + rp->frags[rp->nr_mops] = rx_frag; + rp->nr_mops++; +} + +/* Unmap a packet which has been half-mapped. */ +static void unmap_partial_packet(struct rx_map_packet *rmp) +{ + unsigned idx; + struct rx_map_fragment *rx_frag; + struct grant_unmapper unmapper; + + unmapper.nr_gops = 0; + list_for_each_entry(rx_frag, &rmp->frags, list) { + if (rx_frag->handle == -1) + continue; + idx = fragment_idx(rx_frag); + grant_unmap(&unmapper, + page_address(receive_mapper->pages[idx]), + rx_frag->handle); + } + do_unmaps(&unmapper); +} + +struct sk_buff *handle_receiver_map_packet(struct netchannel2 *nc, + struct netchannel2_msg_packet *msg, + struct netchannel2_msg_hdr *hdr, + unsigned nr_frags, + unsigned frags_off) +{ + struct sk_buff *skb; + struct rx_map_fragment *rx_frag; + unsigned x; + unsigned len; + struct rx_map_packet *rmp; + unsigned idx; + struct rx_plan plan; + unsigned prefix_size; + + memset(&plan, 0, sizeof(plan)); + + rmp = alloc_rx_packet(nc, nr_frags); + if (rmp == NULL) + return NULL; + + if (msg->prefix_size < SKB_MIN_PAYLOAD_SIZE) + prefix_size = SKB_MIN_PAYLOAD_SIZE; + else + prefix_size = msg->prefix_size; + /* As in posted_buffers.c, we don't limit the total size of + the packet, because we don't need to allocate more memory + for very large packets. The prefix is safe because it's + only a 16 bit number. A 64k allocation won't always + succeed, but it's unlikely to trigger the OOM killer or + otherwise interfere with the normal operation of the local + domain. 
*/ + skb = dev_alloc_skb(prefix_size + NET_IP_ALIGN); + if (skb == NULL) { + spin_lock(&receive_mapper->rm_lock); + list_splice(&rmp->frags, &receive_mapper->free_frags); + list_add(&rmp->list, &receive_mapper->free_packets); + receive_mapper->nr_free_packets++; + spin_unlock(&receive_mapper->rm_lock); + return NULL; + } + skb_reserve(skb, NET_IP_ALIGN); + + rmp->id = msg->id; + rmp->flags = msg->flags; + + rx_frag = list_entry(rmp->frags.next, struct rx_map_fragment, list); + for (x = 0; x < nr_frags; x++) { + fetch_fragment(&nc->rings, x, &rx_frag->nc_frag, frags_off); + if (rx_frag->nc_frag.size > PAGE_SIZE || + rx_frag->nc_frag.off >= PAGE_SIZE || + rx_frag->nc_frag.size + rx_frag->nc_frag.off > PAGE_SIZE) { + plan.is_failed = 1; + break; + } + map_fragment(&plan, rx_frag, nc); + rx_frag = list_entry(rx_frag->list.next, + struct rx_map_fragment, + list); + } + + flush_grant_operations(&plan); + if (plan.is_failed) + goto fail_and_unmap; + + /* Grab the prefix off of the ring. */ + nc2_copy_from_ring_off(&nc->rings.cons_ring, + skb_put(skb, msg->prefix_size), + msg->prefix_size, + frags_off + + nr_frags * sizeof(struct netchannel2_fragment)); + + /* All fragments mapped, so we know that this is going to + work. Transfer the receive slots into the SKB. */ + len = 0; + list_for_each_entry(rx_frag, &rmp->frags, list) { + attach_frag_to_skb(skb, rx_frag); + idx = fragment_idx(rx_frag); + start_tracking_page(receive_mapper->tracker, + receive_mapper->pages[idx], + nc->rings.otherend_id, + rx_frag->nc_frag.receiver_map.gref, + idx, + nc); + len += rx_frag->nc_frag.size; + } + + skb->len += len; + skb->data_len += len; + skb->truesize += len; + + spin_lock(&receive_mapper->rm_lock); + list_add_tail(&rmp->list, &receive_mapper->active_packets); + rmp->expires = jiffies + PACKET_TIMEOUT; + if (rmp == list_entry(receive_mapper->active_packets.next, + struct rx_map_packet, + list)) + mod_timer(&receive_mapper->expire_timer, rmp->expires); + spin_unlock(&receive_mapper->rm_lock); + + if (skb_headlen(skb) < SKB_MIN_PAYLOAD_SIZE) + pull_through(skb, + SKB_MIN_PAYLOAD_SIZE - skb_headlen(skb)); + + return skb; + +fail_and_unmap: + pr_debug("Failed to map received packet!\n"); + unmap_partial_packet(rmp); + + spin_lock(&receive_mapper->rm_lock); + list_splice(&rmp->frags, &receive_mapper->free_frags); + list_add_tail(&rmp->list, &receive_mapper->free_packets); + receive_mapper->nr_free_packets++; + spin_unlock(&receive_mapper->rm_lock); + + kfree_skb(skb); + return NULL; +} + +static void suspend_receive_mapper(struct receive_mapper *rm) +{ + spin_lock_bh(&rm->rm_lock); + /* Stop any more packets coming in. */ + rm->suspending = 1; + + /* Wait for Linux to give back all of the SKBs which we've + given it. 
*/ + while (rm->nr_free_packets != MAX_MAPPED_PACKETS) { + spin_unlock_bh(&rm->rm_lock); + unmap_all_packets(); + msleep(100); + spin_lock_bh(&rm->rm_lock); + } + spin_unlock_bh(&rm->rm_lock); +} + +static void resume_receive_mapper(void) +{ + spin_lock_bh(&receive_mapper->rm_lock); + receive_mapper->suspending = 0; + spin_unlock_bh(&receive_mapper->rm_lock); +} + + +int init_receive_map_mode(void) +{ + struct receive_mapper *new_rm; + spin_lock(&global_map_lock); + while (receive_mapper == NULL) { + spin_unlock(&global_map_lock); + new_rm = new_receive_mapper(); + if (new_rm == NULL) + return -ENOMEM; + spin_lock(&global_map_lock); + if (receive_mapper == NULL) { + receive_mapper = new_rm; + } else { + spin_unlock(&global_map_lock); + free_receive_mapper(new_rm); + spin_lock(&global_map_lock); + } + } + spin_unlock(&global_map_lock); + return 0; +} + +void deinit_receive_map_mode(void) +{ + if (!receive_mapper) + return; + BUG_ON(spin_is_locked(&global_map_lock)); + free_receive_mapper(receive_mapper); + receive_mapper = NULL; +} + +void suspend_receive_map_mode(void) +{ + if (!receive_mapper) + return; + suspend_receive_mapper(receive_mapper); +} + +void resume_receive_map_mode(void) +{ + if (!receive_mapper) + return; + resume_receive_mapper(); +} + +struct netchannel2 *nc2_get_interface_for_page(struct page *p) +{ + BUG_ON(!page_is_tracked(p)); + if (!receive_mapper || + tracker_for_page(p) != receive_mapper->tracker) + return NULL; + return get_page_tracker_ctxt(p); +} diff --git a/drivers/net/xen-netchannel2/recv_packet.c b/drivers/net/xen-netchannel2/recv_packet.c index 80c5d5d..8c38788 100644 --- a/drivers/net/xen-netchannel2/recv_packet.c +++ b/drivers/net/xen-netchannel2/recv_packet.c @@ -112,6 +112,28 @@ void nc2_handle_packet_msg(struct netchannel2 *nc, nr_frags, frags_off); queue_finish_packet_message(ncrp, msg.id, msg.flags); break; + case NC2_PACKET_TYPE_receiver_map: + if (!nc->local_trusted) { + /* The remote doesn't trust us, so they + shouldn't be sending us receiver-map + packets. Just treat it as an RSCB + packet. */ + skb = NULL; + } else { + skb = handle_receiver_map_packet(nc, &msg, hdr, + nr_frags, + frags_off); + /* Finish message will be sent when we unmap + * the packet. */ + } + if (skb == NULL) { + /* We can't currently map this skb. Use a + receiver copy instead. 
*/ + skb = handle_receiver_copy_packet(nc, ncrp, &msg, hdr, + nr_frags, frags_off); + queue_finish_packet_message(ncrp, msg.id, msg.flags); + } + break; default: pr_debug("Unknown packet type %d\n", msg.type); nc->stats.rx_errors++; @@ -285,4 +307,5 @@ int __init nc2_init(void) void __exit nc2_exit(void) { + deinit_receive_map_mode(); } diff --git a/drivers/net/xen-netchannel2/rscb.c b/drivers/net/xen-netchannel2/rscb.c index de6e8c6..c929c73 100644 --- a/drivers/net/xen-netchannel2/rscb.c +++ b/drivers/net/xen-netchannel2/rscb.c @@ -210,6 +210,7 @@ struct sk_buff *handle_receiver_copy_packet(struct netchannel2 *nc, struct grant_packet_plan { volatile struct netchannel2_fragment *out_fragment; grant_ref_t gref_pool; + int use_subpage_grants; unsigned prefix_avail; }; @@ -224,14 +225,15 @@ static inline int nfrags_skb(struct sk_buff *skb, int prefix_size) start_grant = ((unsigned long)skb->data + prefix_size) & ~(PAGE_SIZE-1); end_grant = ((unsigned long)skb->data + - skb_headlen(skb) + PAGE_SIZE - 1) & + skb_headlen(skb) + PAGE_SIZE - 1) & ~(PAGE_SIZE-1); return ((end_grant - start_grant) >> PAGE_SHIFT) + skb_shinfo(skb)->nr_frags; } enum prepare_xmit_result prepare_xmit_allocate_grant(struct netchannel2_ring_pair *ncrp, - struct sk_buff *skb) + struct sk_buff *skb, + int use_subpage_grants) { struct skb_cb_overlay *skb_co = get_skb_overlay(skb); unsigned nr_fragments; @@ -242,13 +244,23 @@ enum prepare_xmit_result prepare_xmit_allocate_grant(struct netchannel2_ring_pai if (allocate_txp_slot(ncrp, skb) < 0) return PREP_XMIT_BUSY; - /* We're going to have to get the remote to issue a grant copy - hypercall anyway, so there's no real benefit to shoving the - headers inline. */ - /* (very small packets won't go through here, so there's no - chance that we could completely eliminate the grant - copy.) */ - inline_prefix_size = sizeof(struct ethhdr); + if (use_subpage_grants) { + /* We're going to have to get the remote to issue a + grant copy hypercall anyway, so there's no real + benefit to shoving the headers inline. */ + /* (very small packets won't go through here, so + there's no chance that we could completely + eliminate the grant copy.) */ + inline_prefix_size = sizeof(struct ethhdr); + } else { + /* If we're going off-box (and we probably are, if the + remote is trusted), putting the header in the ring + potentially saves a TLB miss in the bridge, which + is worth doing. */ + inline_prefix_size = PACKET_PREFIX_SIZE; + if (skb_headlen(skb) < inline_prefix_size) + inline_prefix_size = skb_headlen(skb); + } if (skb_co->nr_fragments == 0) { nr_fragments = nfrags_skb(skb, inline_prefix_size); @@ -278,10 +290,14 @@ enum prepare_xmit_result prepare_xmit_allocate_grant(struct netchannel2_ring_pai have to recompute it next time around. 
*/ return PREP_XMIT_BUSY; } + skb_co->gref_pool = gref_pool; skb_co->inline_prefix_size = inline_prefix_size; - skb_co->type = NC2_PACKET_TYPE_receiver_copy; + if (use_subpage_grants) + skb_co->type = NC2_PACKET_TYPE_receiver_copy; + else + skb_co->type = NC2_PACKET_TYPE_receiver_map; return PREP_XMIT_OKAY; } @@ -319,15 +335,19 @@ static void prepare_subpage_grant(struct netchannel2_ring_pair *ncrp, GTF_readonly, trans_domid, trans_gref); - } else { + } else if (plan->use_subpage_grants) { gnttab_grant_foreign_access_ref_subpage(gref, ncrp->otherend_id, virt_to_mfn(page_address(page)), GTF_readonly, off_in_page, size); + } else { + gnttab_grant_foreign_access_ref(gref, + ncrp->otherend_id, + virt_to_mfn(page_address(page)), + GTF_readonly); } - frag->off = off_in_page; frag->size = size; plan->out_fragment++; @@ -357,6 +377,7 @@ static int grant_data_area(struct netchannel2_ring_pair *ncrp, void xmit_grant(struct netchannel2_ring_pair *ncrp, struct sk_buff *skb, + int use_subpage_grants, volatile void *msg_buf) { volatile struct netchannel2_msg_packet *msg = msg_buf; @@ -367,6 +388,7 @@ void xmit_grant(struct netchannel2_ring_pair *ncrp, skb_frag_t *frag; memset(&plan, 0, sizeof(plan)); + plan.use_subpage_grants = use_subpage_grants; plan.prefix_avail = skb_co->inline_prefix_size; plan.out_fragment = msg->frags; plan.gref_pool = skb_co->gref_pool; diff --git a/drivers/net/xen-netchannel2/util.c b/drivers/net/xen-netchannel2/util.c index 57e6aed..0d242a4 100644 --- a/drivers/net/xen-netchannel2/util.c +++ b/drivers/net/xen-netchannel2/util.c @@ -91,6 +91,20 @@ void release_tx_packet(struct netchannel2_ring_pair *ncrp, } gnttab_release_grant_reference(&ncrp->gref_pool, gref); } + } else if (skb_co->type == NC2_PACKET_TYPE_receiver_map) { + while (1) { + r = gnttab_claim_grant_reference(&skb_co->gref_pool); + if (r == -ENOSPC) + break; + gref = (grant_ref_t)r; + r = gnttab_end_foreign_access_ref(gref, 0); + if (r == 0) { + printk(KERN_WARNING "Failed to end remote access to packet memory.\n"); + } else { + gnttab_release_grant_reference(&ncrp->gref_pool, + gref); + } + } } else if (skb_co->gref_pool != 0) { gnttab_subfree_grant_references(skb_co->gref_pool, &ncrp->gref_pool); diff --git a/drivers/net/xen-netchannel2/xmit_packet.c b/drivers/net/xen-netchannel2/xmit_packet.c index 4c9e0b5..eb4090b 100644 --- a/drivers/net/xen-netchannel2/xmit_packet.c +++ b/drivers/net/xen-netchannel2/xmit_packet.c @@ -13,6 +13,8 @@ static enum transmit_policy transmit_policy(struct netchannel2 *nc, { if (skb->len <= PACKET_PREFIX_SIZE && !skb_is_nonlinear(skb)) return transmit_policy_small; + else if (nc->remote_trusted) + return transmit_policy_map; else return transmit_policy_grant; } @@ -72,7 +74,10 @@ enum prepare_xmit_result prepare_xmit_allocate_resources(struct netchannel2 *nc, r = prepare_xmit_allocate_small(&nc->rings, skb); break; case transmit_policy_grant: - r = prepare_xmit_allocate_grant(&nc->rings, skb); + r = prepare_xmit_allocate_grant(&nc->rings, skb, 1); + break; + case transmit_policy_map: + r = prepare_xmit_allocate_grant(&nc->rings, skb, 0); break; default: BUG(); @@ -168,7 +173,10 @@ int nc2_really_start_xmit(struct netchannel2_ring_pair *ncrp, /* Nothing to do */ break; case transmit_policy_grant: - xmit_grant(ncrp, skb, msg); + xmit_grant(ncrp, skb, 1, msg); + break; + case transmit_policy_map: + xmit_grant(ncrp, skb, 0, msg); break; default: BUG(); diff --git a/include/xen/interface/io/netchannel2.h b/include/xen/interface/io/netchannel2.h index 1cca607..f264995 100644 --- 
a/include/xen/interface/io/netchannel2.h
+++ b/include/xen/interface/io/netchannel2.h
@@ -46,6 +46,9 @@ struct netchannel2_fragment {
 		struct {
 			grant_ref_t gref;
 		} receiver_copy;
+		struct {
+			grant_ref_t gref;
+		} receiver_map;
 	};
 };
 struct netchannel2_msg_packet {
@@ -98,6 +101,22 @@ struct netchannel2_msg_packet {
  * Due to backend bugs, it is in not safe to use this
  * packet type except on bypass rings.
  *
+ * receiver_map -- The transmitting domain has granted the receiving
+ *                 domain access to the original RX buffers using
+ *                 full (mappable) grant references.  This can be
+ *                 treated the same way as receiver_copy, but the
+ *                 receiving domain also has the option of mapping
+ *                 the fragments, rather than copying them.  If it
+ *                 decides to do so, it should ensure that the fragments
+ *                 will be unmapped in a reasonably timely fashion,
+ *                 and don't e.g. become stuck in a receive buffer
+ *                 somewhere.  In general, anything longer than about
+ *                 a second is likely to cause problems.  Once all
+ *                 grant references have been unmapped, the receiving
+ *                 domain should send a FINISH message.
+ *
+ * This packet type may not be used on bypass rings.
+ *
  * small -- The packet does not have any fragment descriptors
  *    (i.e. the entire thing is inline in the ring).  The receiving
  *    domain should simply the copy the packet out of the ring
@@ -110,6 +129,7 @@ struct netchannel2_msg_packet {
  * that it is correct to treat receiver_map and small packets as
  * receiver_copy ones. */
 #define NC2_PACKET_TYPE_receiver_copy 1
+#define NC2_PACKET_TYPE_receiver_map 3
 #define NC2_PACKET_TYPE_small 4
 
 #define NC2_PACKET_SEGMENTATION_TYPE_none 0
-- 
1.6.3.1

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
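
For reviewers who want the shape of the policy without reading the whole
diff, here is a rough, self-contained C sketch of the decision made on
each side of the ring: the sender only emits receiver_map packets when
the remote is trusted, and the receiver treats receiver_map as
receiver_copy whenever it cannot (or does not want to) map.  It is
illustrative only, not part of the patch; the toy_* names and the
simplified conditions (the real transmit check also requires the skb to
be linear before using the inline encoding) are my own.

/*
 * Illustrative sketch only -- not part of the patch.  Models, in plain
 * userspace C, the policy the series implements: transmit_policy() in
 * xmit_packet.c on the sender, and the fallback in
 * nc2_handle_packet_msg() in recv_packet.c on the receiver.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

enum toy_packet_type { TOY_SMALL, TOY_RECEIVER_COPY, TOY_RECEIVER_MAP };

/* Sender side: pick a packet type for an skb of @len bytes. */
static enum toy_packet_type choose_tx_type(size_t len, size_t prefix_size,
					   bool remote_trusted)
{
	if (len <= prefix_size)
		return TOY_SMALL;	  /* whole packet fits in the ring */
	if (remote_trusted)
		return TOY_RECEIVER_MAP;  /* peer may map our pages */
	return TOY_RECEIVER_COPY;	  /* peer must grant-copy */
}

/* Receiver side: map if allowed and possible, otherwise copy.  When we
 * copy, the FINISH message goes out immediately; when we map, it is
 * deferred until the pages come back from Linux. */
static bool handle_rx(enum toy_packet_type type, bool local_trusted,
		      bool map_slot_available)
{
	if (type == TOY_RECEIVER_MAP && local_trusted && map_slot_available)
		return true;		  /* mapped: FINISH sent later */
	return false;			  /* copied: FINISH sent now */
}

int main(void)
{
	enum toy_packet_type t = choose_tx_type(9000, 128, true);

	printf("tx type %d, rx mapped %d\n", t,
	       handle_rx(t, true, false));  /* no free slot -> copy */
	return 0;
}

This always-available copy fallback is what lets the interface comment
above promise that treating receiver_map packets as receiver_copy ones
is never incorrect.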
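
Similarly, a minimal sketch of the packet lifetime that receiver_map.c
tracks with rx_map_packet's refcnt: one reference per mapped fragment,
dropped from the page-release callback, with the FINISH message
deferred until the last fragment is returned.  The real code uses
atomic_dec_and_test() plus a GC tasklet, and the expiry timer copies
the data back if Linux sits on the pages for too long; the toy_* names
below are hypothetical stand-ins.

/*
 * Illustrative sketch only -- not part of the patch.  Models the
 * refcount-per-fragment lifecycle from receiver_map.c in plain C.
 */
#include <stdio.h>

struct toy_rx_packet {
	unsigned int id;
	int refcnt;		/* one per still-mapped fragment */
};

static void toy_send_finish(const struct toy_rx_packet *pkt)
{
	/* In the driver this queues a FINISH_PACKET message and kicks
	 * the ring; here we just log it. */
	printf("FINISH packet %u\n", pkt->id);
}

/* Called whenever Linux releases one of the packet's mapped pages
 * (netchan2_page_release() in the real code). */
static void toy_put_fragment(struct toy_rx_packet *pkt)
{
	if (--pkt->refcnt == 0)
		toy_send_finish(pkt);	/* last fragment: unmap + FINISH */
}

int main(void)
{
	struct toy_rx_packet pkt = { .id = 42, .refcnt = 3 };

	toy_put_fragment(&pkt);		/* still mapped */
	toy_put_fragment(&pkt);		/* still mapped */
	toy_put_fragment(&pkt);		/* refcnt hits zero -> FINISH */
	return 0;
}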