From 3bced1452e1e619e7f4701cf67ba88c2627aa376 Mon Sep 17 00:00:00 2001
From: Joao Martins
Date: Mon, 20 Feb 2017 13:33:34 +0000
Subject: [PATCH WIP 1/2] drivers/net: add xen-netback PMD

Introduce Xen network backend support, namely xen-netback. This mostly
means adding a boilerplate driver with an initially reduced set of
features (i.e. without feature-sg and without multi-queue). It handles
grant operations and notifications correctly, and implements almost all
of the state machine. Additionally it supports an early version of
staging grants (hereafter feature-persistent=1), which lets DPDK keep a
set of premapped grants and hence avoid the grant copy (slow) paths.
The driver is implemented using the Xen-provided libraries for event
channel, gnttab and xenstore operations.

Signed-off-by: Joao Martins
---
 drivers/net/Makefile | 1 +
 drivers/net/xen-netback/Makefile | 68 ++
 .../xen-netback/rte_pmd_xen-netback_version.map | 3 +
 drivers/net/xen-netback/xnb.h | 159 ++++
 drivers/net/xen-netback/xnb_ethdev.c | 701 +++++++++++++++
 drivers/net/xen-netback/xnb_ethdev.h | 34 +
 drivers/net/xen-netback/xnb_ring.c | 240 +++++
 drivers/net/xen-netback/xnb_rxtx.c | 683 +++++++++++++++
 drivers/net/xen-netback/xnb_xenbus.c | 975 +++++++++++++++++++++
 mk/rte.app.mk | 1 +
 10 files changed, 2865 insertions(+)
 create mode 100644 drivers/net/xen-netback/Makefile
 create mode 100644 drivers/net/xen-netback/rte_pmd_xen-netback_version.map
 create mode 100644 drivers/net/xen-netback/xnb.h
 create mode 100644 drivers/net/xen-netback/xnb_ethdev.c
 create mode 100644 drivers/net/xen-netback/xnb_ethdev.h
 create mode 100644 drivers/net/xen-netback/xnb_ring.c
 create mode 100644 drivers/net/xen-netback/xnb_rxtx.c
 create mode 100644 drivers/net/xen-netback/xnb_xenbus.c

diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index bc93230..a4bf7cb 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -55,6 +55,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_THUNDERX_NICVF_PMD) += thunderx
 DIRS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio
 DIRS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += vmxnet3
 DIRS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += xenvirt
+DIRS-$(CONFIG_RTE_LIBRTE_PMD_XEN_NETBACK) += xen-netback
 
 ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
 DIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += vhost
diff --git a/drivers/net/xen-netback/Makefile b/drivers/net/xen-netback/Makefile
new file mode 100644
index 0000000..c6299b0
--- /dev/null
+++ b/drivers/net/xen-netback/Makefile
@@ -0,0 +1,68 @@
+# BSD LICENSE
+#
+# Copyright(c) 2016, Oracle and/or its affiliates. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +include $(RTE_SDK)/mk/rte.vars.mk + +# +# library name +# +LIB = librte_pmd_xen-netback.a +LIBABIVER := 1 +EXPORT_MAP := rte_pmd_xen-netback_version.map + +LDLIBS += -lpthread +LDLIBS += -lxenstore -lxenctrl +# OL6 and OL7 has it on /usr/lib64 +LDLIBS += -L/usr/lib64 + +CFLAGS += -O0 -D_GNU_SOURCE -g +CFLAGS += $(WERROR_FLAGS) +CFLAGS += -D__XEN_TOOLS__ +#CFLAGS += -DDEBUG +#CFLAGS += -DDEBUG_PACKET + +# +# all source are stored in SRCS-y +# +SRCS-$(CONFIG_RTE_LIBRTE_PMD_XEN_NETBACK) += xnb_ethdev.c +SRCS-$(CONFIG_RTE_LIBRTE_PMD_XEN_NETBACK) += xnb_xenbus.c +SRCS-$(CONFIG_RTE_LIBRTE_PMD_XEN_NETBACK) += xnb_rxtx.c +SRCS-$(CONFIG_RTE_LIBRTE_PMD_XEN_NETBACK) += xnb_ring.c + +# this lib depends upon: +DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_XEN_NETBACK) += lib/librte_eal +DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_XEN_NETBACK) += lib/librte_hash +DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_XEN_NETBACK) += lib/librte_mbuf +DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_XEN_NETBACK) += lib/librte_ether +DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_XEN_NETBACK) += lib/librte_kvargs +DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_XEN_NETBACK) += lib/librte_mempool +DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_XEN_NETBACK) += lib/librte_net + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/drivers/net/xen-netback/rte_pmd_xen-netback_version.map b/drivers/net/xen-netback/rte_pmd_xen-netback_version.map new file mode 100644 index 0000000..dc4d417 --- /dev/null +++ b/drivers/net/xen-netback/rte_pmd_xen-netback_version.map @@ -0,0 +1,3 @@ +DPDK_16.04 { + local: *; +}; diff --git a/drivers/net/xen-netback/xnb.h b/drivers/net/xen-netback/xnb.h new file mode 100644 index 0000000..39c92d2 --- /dev/null +++ b/drivers/net/xen-netback/xnb.h @@ -0,0 +1,159 @@ +/* + * BSD LICENSE + * + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include + +#include +#include +#include +#include + +#if __XEN_LATEST_INTERFACE_VERSION__ >= 0x00040700 +#define XC_WANT_COMPAT_GNTTAB_API +#define XC_WANT_COMPAT_EVTCHN_API +#endif + +#include + +#define RTE_XEN_MAX_PKT_BURST 256 + +struct pending_req { + union { + netif_tx_request_t txreq; + netif_rx_request_t rxreq; + } u; + bool more; + struct rte_mbuf *mbuf; +}; + +struct xenvif_ring { + xc_interface *xch; + xc_gnttab *gnttabh; + xc_evtchn *evtchnh; + + struct netif_tx_sring *tx_addr; + netif_tx_back_ring_t tx_ring; + struct netif_rx_sring *rx_addr; + netif_rx_back_ring_t rx_ring; + + grant_ref_t ring_ref; + evtchn_port_t evtchn; + evtchn_port_or_error_t port; + + struct gnttab_copy *gop; + struct pending_req *pending; + struct rte_hash *grants; + uint16_t grants_cnt; + + domid_t dom; + char *name; +}; + + +/* The Xenbus related domain state entries */ +struct xenvif_state { + XenbusState state; + domid_t domid; + unsigned handle; + char *path; +}; + +/* The Xen virtual interface queues */ +struct xenvif_queue { + unsigned int id; + char *path; + + struct xenvif_ring tx; + struct xenvif_ring rx; + + struct xenvif *vif; +}; + +/* The Frontend features capabilities */ +struct xenvif_features { + uint8_t rx_poll; + uint8_t sg; + uint8_t tcp4; + uint8_t tcp4_prefix; + uint8_t tcp6; + uint8_t tcp6_prefix; + uint8_t ip4_csum; + uint8_t ip6_csum; + uint8_t mcast_ctrl; + uint8_t pgnt; + uint8_t zc; +}; + +/* The Domain related backend and frontend state */ +struct xenvif { + struct xenvif_state fe; + struct xenvif_state be; + + void *priv; + char *ifname; + struct xenvif_queue *queues; + unsigned num_queues; + + struct xenvif_features features; +}; + +struct xenbus_ops { + /* xenstore ids /backend/ (NULL terminated) */ + const char **ids; + + /* device state changes */ + int (*init)(struct xenvif *); + int (*connect)(struct xenvif *); + int (*disconnect)(struct xenvif *); + int (*close)(struct xenvif *); +}; + +int rte_xen_ring_map(struct xenvif_ring *ring); +int rte_xen_ring_unmap(struct xenvif_ring *ring); + +void *rte_xen_ring_get_page(struct xenvif_ring *ring, grant_ref_t ref, + bool writable); + +int rte_xenbus_backend_register(struct xenbus_ops *, + unsigned max_cores); + +int rte_xenbus_backend_start(void); +void rte_xenbus_backend_stop(void); + +uint16_t rte_xen_enqueue_burst(struct xenvif *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint16_t count); + +uint16_t rte_xen_dequeue_burst(struct xenvif *dev, uint16_t queue_id, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, uint16_t count); diff --git a/drivers/net/xen-netback/xnb_ethdev.c b/drivers/net/xen-netback/xnb_ethdev.c new file mode 100644 index 0000000..67cd1b3 --- /dev/null +++ b/drivers/net/xen-netback/xnb_ethdev.c @@ -0,0 +1,701 @@ +/* + * BSD LICENSE + * + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xnb.h" + +#define ETH_XNB_IFACE_ARG "iface" +#define ETH_XNB_QUEUES_ARG "queues" + +static const char *drivername = "XEN NETBACK PMD"; + +static const char *valid_arguments[] = { + ETH_XNB_IFACE_ARG, + ETH_XNB_QUEUES_ARG, + NULL +}; + +static struct ether_addr base_eth_addr = { + .addr_bytes = { + 0x58 /* X */, + 0x45 /* E */, + 0x4E /* N */, + 0x42 /* B */, + 0x45 /* E */, + 0x00 + } +}; + +struct xnb_queue { + rte_atomic32_t allow_queuing; + rte_atomic32_t while_queuing; + struct xenvif *device; + struct pmd_internal *internal; + struct rte_mempool *mb_pool; + uint8_t port; + uint16_t queue_id; + uint64_t rx_pkts; + uint64_t tx_pkts; + uint64_t missed_pkts; + uint64_t rx_bytes; + uint64_t tx_bytes; +}; + +struct pmd_internal { + char *dev_name; + char *iface_name; + uint16_t max_queues; + + volatile uint16_t once; +}; + +struct internal_list { + TAILQ_ENTRY(internal_list) next; + struct rte_eth_dev *eth_dev; +}; + +TAILQ_HEAD(internal_list_head, internal_list); +static struct internal_list_head internal_list = + TAILQ_HEAD_INITIALIZER(internal_list); + +static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER; + +static rte_atomic16_t nb_started_ports; +static pthread_t session_th; + +static struct rte_eth_link pmd_link = { + .link_speed = 10000, + .link_duplex = ETH_LINK_FULL_DUPLEX, + .link_status = ETH_LINK_DOWN +}; + +static int eth_dev_configure(struct rte_eth_dev *dev __rte_unused) +{ + return 0; +} + +static inline struct internal_list *find_internal_resource(char *ifname) +{ + int found = 0; + struct internal_list *list; + struct pmd_internal *internal; + + if (ifname == NULL) + return NULL; + + pthread_mutex_lock(&internal_list_lock); + + TAILQ_FOREACH(list, &internal_list, next) { + internal = list->eth_dev->data->dev_private; + if (!strcmp(internal->iface_name, ifname)) { + found = 1; + break; + } + } + + 
pthread_mutex_unlock(&internal_list_lock); + + if (!found) + return NULL; + + return list; +} + +static void eth_dev_infos_get(struct rte_eth_dev *dev __rte_unused, + struct rte_eth_dev_info *dev_info __rte_unused) +{ + struct pmd_internal *internal; + + internal = dev->data->dev_private; + if (internal == NULL) { + RTE_LOG(ERR, PMD, "Invalid device specified\n"); + return; + } + + dev_info->driver_name = drivername; + dev_info->max_mac_addrs = 1; + dev_info->max_rx_pktlen = (uint32_t)-1; + dev_info->max_rx_queues = internal->max_queues; + dev_info->max_tx_queues = internal->max_queues; + dev_info->min_rx_bufsize = 0; +} + +static void eth_stats_get(struct rte_eth_dev *dev __rte_unused, + struct rte_eth_stats *stats __rte_unused) +{ + unsigned i; + unsigned long rx_total = 0, tx_total = 0, tx_missed_total = 0; + unsigned long rx_total_bytes = 0, tx_total_bytes = 0; + struct xnb_queue *xnbq; + + for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS && + i < dev->data->nb_rx_queues; i++) { + if (dev->data->rx_queues[i] == NULL) + continue; + xnbq = dev->data->rx_queues[i]; + stats->q_ipackets[i] = xnbq->rx_pkts; + rx_total += stats->q_ipackets[i]; + + stats->q_ibytes[i] = xnbq->rx_bytes; + rx_total_bytes += stats->q_ibytes[i]; + } + + for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS && + i < dev->data->nb_tx_queues; i++) { + if (dev->data->tx_queues[i] == NULL) + continue; + xnbq = dev->data->tx_queues[i]; + stats->q_opackets[i] = xnbq->tx_pkts; + tx_missed_total += xnbq->missed_pkts; + tx_total += stats->q_opackets[i]; + + stats->q_obytes[i] = xnbq->tx_bytes; + tx_total_bytes += stats->q_obytes[i]; + } + + stats->ipackets = rx_total; + stats->opackets = tx_total; + stats->imissed = tx_missed_total; + stats->ibytes = rx_total_bytes; + stats->obytes = tx_total_bytes; +} + +static void eth_stats_reset(struct rte_eth_dev *dev __rte_unused) +{ + struct xnb_queue *xnbq; + unsigned i; + + for (i = 0; i < dev->data->nb_rx_queues; i++) { + if (dev->data->rx_queues[i] == NULL) + continue; + xnbq = dev->data->rx_queues[i]; + xnbq->rx_pkts = 0; + xnbq->rx_bytes = 0; + } + for (i = 0; i < dev->data->nb_tx_queues; i++) { + if (dev->data->tx_queues[i] == NULL) + continue; + xnbq = dev->data->tx_queues[i]; + xnbq->tx_pkts = 0; + xnbq->tx_bytes = 0; + xnbq->missed_pkts = 0; + } +} + +static int xnb_init(struct xenvif *vif __rte_unused) +{ + return 0; +} + +static int xnb_close(struct xenvif *vif __rte_unused) +{ + return 0; +} + +static int xnb_connect(struct xenvif *dev __rte_unused) +{ + struct rte_eth_dev *eth_dev; + struct internal_list *list; + struct pmd_internal *internal; + struct xnb_queue *xnbq; + unsigned i; + + if (dev == NULL) { + RTE_LOG(INFO, PMD, "Invalid argument\n"); + return -1; + } + + list = find_internal_resource(dev->ifname); + if (list == NULL) { + RTE_LOG(INFO, PMD, "Invalid device name\n"); + return -1; + } + + eth_dev = list->eth_dev; + internal = eth_dev->data->dev_private; + + for (i = 0; i < eth_dev->data->nb_rx_queues; i++) { + xnbq = eth_dev->data->rx_queues[i]; + if (xnbq == NULL) + continue; + + if (rte_xen_ring_map(&dev->queues[i].rx)) { + RTE_LOG(INFO, PMD, "Cannot map RX%d\n", i); + return -1; + } + xnbq->device = dev; + xnbq->internal = internal; + xnbq->port = eth_dev->data->port_id; + } + for (i = 0; i < eth_dev->data->nb_tx_queues; i++) { + xnbq = eth_dev->data->tx_queues[i]; + if (xnbq == NULL) + continue; + + if (rte_xen_ring_map(&dev->queues[i].tx)) { + RTE_LOG(INFO, PMD, "Cannot map TX%d\n", i); + return -1; + } + + xnbq->device = dev; + xnbq->internal = internal; + xnbq->port 
= eth_dev->data->port_id; + } + + dev->priv = eth_dev; + eth_dev->data->dev_link.link_status = ETH_LINK_UP; + + for (i = 0; i < eth_dev->data->nb_rx_queues; i++) { + xnbq = eth_dev->data->rx_queues[i]; + if (xnbq == NULL) + continue; + + rte_atomic32_set(&xnbq->allow_queuing, 1); + } + for (i = 0; i < eth_dev->data->nb_tx_queues; i++) { + xnbq = eth_dev->data->tx_queues[i]; + if (xnbq == NULL) + continue; + + rte_atomic32_set(&xnbq->allow_queuing, 1); + } + + RTE_LOG(INFO, PMD, "New connection established\n"); + return 0; +} + +static int xnb_disconnect(struct xenvif *vif __rte_unused) +{ + return 0; +} + +static void * xnb_session(void *param __rte_unused) +{ + static const char *xnb_ids[] = { + "vif", NULL + }; + + static struct xenbus_ops xnb_ops = { + .ids = xnb_ids, + .init = xnb_init, + .connect = xnb_connect, + .disconnect = xnb_disconnect, + .close = xnb_close + }; + + if (rte_xenbus_backend_register(&xnb_ops, rte_lcore_count() ) < 0) + RTE_LOG(ERR, PMD, "Can't register callbacks\n"); + + /* start event handling */ + rte_xenbus_backend_start(); + + return NULL; +} + +static int xnb_session_start(void) +{ + int ret; + + ret = pthread_create(&session_th, NULL, xnb_session, NULL); + if (ret) + RTE_LOG(ERR, PMD, "Can't create a thread\n"); + + return ret; +} + +static void xnb_session_stop(void) +{ + int ret; + + ret = pthread_cancel(session_th); + if (ret) + RTE_LOG(ERR, PMD, "Can't cancel the thread\n"); + + ret = pthread_join(session_th, NULL); + if (ret) + RTE_LOG(ERR, PMD, "Can't join the thread\n"); +} + +static int eth_dev_start(struct rte_eth_dev *dev __rte_unused) +{ + int ret = 0; + + /* We need only one message handling thread */ + if (rte_atomic16_add_return(&nb_started_ports, 1) == 1) + ret = xnb_session_start(); + + return ret; +} + +static void eth_dev_stop(struct rte_eth_dev *dev __rte_unused) +{ + if (rte_atomic16_sub_return(&nb_started_ports, 1) == 0) + xnb_session_stop(); +} + +static int eth_link_update(struct rte_eth_dev *dev __rte_unused, + int wait_to_complete __rte_unused) +{ + return 0; +} + +static uint16_t eth_dev_rx_pkt_burst(void *queue, + struct rte_mbuf **pkts, + uint16_t nb_pkts) +{ + struct xnb_queue *r = queue; + uint16_t i, nb_rx = 0; + + if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0)) + return 0; + + rte_atomic32_set(&r->while_queuing, 1); + + if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0)) + goto out; + + /* Dequeue packets from guest TX queue */ + nb_rx = rte_xen_dequeue_burst(r->device, r->queue_id, r->mb_pool, + pkts, nb_pkts); + + r->rx_pkts += nb_rx; + + for (i = 0; likely(i < nb_rx); i++) { + pkts[i]->port = r->port; + r->rx_bytes += pkts[i]->pkt_len; + } + +out: + rte_atomic32_set(&r->while_queuing, 0); + + return nb_rx; +} + +static int eth_rx_queue_setup(struct rte_eth_dev *dev, + uint16_t queue_id, + uint16_t nb_desc __rte_unused, + unsigned int socket_id, + const struct rte_eth_rxconf *rx_conf __rte_unused, + struct rte_mempool *mp) +{ + struct xnb_queue *xnbq; + + xnbq = rte_zmalloc_socket(NULL, sizeof(struct xnb_queue), + RTE_CACHE_LINE_SIZE, socket_id); + if (xnbq == NULL) { + RTE_LOG(ERR, PMD, "Failed to allocate memory for rx queue\n"); + return -ENOMEM; + } + + xnbq->mb_pool = mp; + xnbq->queue_id = queue_id; + dev->data->rx_queues[queue_id] = xnbq; + return 0; +} + +static void eth_rx_queue_release(void *queue) +{ + rte_free(queue); +} + +static uint16_t eth_dev_tx_pkt_burst(void *queue, + struct rte_mbuf **pkts, + uint16_t nb_pkts) +{ + struct xnb_queue *r = queue; + uint16_t i, nb_tx = 0; + + if 
(unlikely(rte_atomic32_read(&r->allow_queuing) == 0)) + return 0; + + rte_atomic32_set(&r->while_queuing, 1); + + if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0)) + goto out; + + /* Enqueue packets to guest RX queue */ + nb_tx = rte_xen_enqueue_burst(r->device, r->queue_id, pkts, nb_pkts); + + r->tx_pkts += nb_tx; + r->missed_pkts += nb_pkts - nb_tx; + + for (i = 0; likely(i < nb_tx); i++) + r->tx_bytes += pkts[i]->pkt_len; + + for (i = 0; likely(i < nb_tx); i++) + rte_pktmbuf_free(pkts[i]); +out: + rte_atomic32_set(&r->while_queuing, 0); + + return nb_tx; +} + +static int eth_tx_queue_setup(struct rte_eth_dev *dev, + uint16_t queue_id, + uint16_t nb_desc __rte_unused, + unsigned int socket_id, + const struct rte_eth_txconf *tx_conf __rte_unused) +{ + struct xnb_queue *xnbq; + + xnbq = rte_zmalloc_socket(NULL, sizeof(struct xnb_queue), + RTE_CACHE_LINE_SIZE, socket_id); + if (xnbq == NULL) { + RTE_LOG(ERR, PMD, "Failed to allocate memory for tx queue\n"); + return -ENOMEM; + } + + xnbq->queue_id = queue_id; + dev->data->tx_queues[queue_id] = xnbq; + return 0; +} + +static void eth_tx_queue_release(void *queue __rte_unused) +{ + rte_free(queue); +} + +static struct eth_dev_ops xnb_dev_ops __rte_unused = { + /* device */ + .dev_configure = eth_dev_configure, + .dev_infos_get = eth_dev_infos_get, + .dev_start = eth_dev_start, + .dev_stop = eth_dev_stop, + .link_update = eth_link_update, + + /* queue setup */ + .rx_queue_setup = eth_rx_queue_setup, + .rx_queue_release = eth_rx_queue_release, + .tx_queue_setup = eth_tx_queue_setup, + .tx_queue_release = eth_tx_queue_release, + + /* statistics */ + .stats_get = eth_stats_get, + .stats_reset = eth_stats_reset, +}; + +static int xnb_eth_dev_create(const char *name, char *iface_name, int16_t queues, + const unsigned numa_node) +{ + struct rte_eth_dev_data *data = NULL; + struct pmd_internal *internal = NULL; + struct rte_eth_dev *eth_dev = NULL; + struct ether_addr *eth_addr = NULL; + struct internal_list *list = NULL; + + RTE_LOG(INFO, PMD, "Creating Xen netback backend on numa socket %u\n", + numa_node); + + /* now do all data allocation - for eth_dev structure, dummy pci driver + * and internal (private) data + */ + data = rte_zmalloc_socket(name, sizeof(*data), 0, numa_node); + if (data == NULL) + goto error; + + internal = rte_zmalloc_socket(name, sizeof(*internal), 0, numa_node); + if (internal == NULL) + goto error; + + list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node); + if (list == NULL) + goto error; + + /* reserve an ethdev entry */ + eth_dev = rte_eth_dev_allocate(name, RTE_ETH_DEV_VIRTUAL); + if (eth_dev == NULL) + goto error; + + eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node); + if (eth_addr == NULL) + goto error; + *eth_addr = base_eth_addr; + eth_addr->addr_bytes[5] = eth_dev->data->port_id; + + TAILQ_INIT(ð_dev->link_intr_cbs); + + /* now put it all together + * - store queue data in internal, + * - store numa_node info in ethdev data + * - point eth_dev_data to internals + * - and point eth_dev structure to new eth_dev_data structure + */ + internal->dev_name = strdup(name); + if (internal->dev_name == NULL) + goto error; + internal->iface_name = strdup(iface_name); + if (internal->iface_name == NULL) + goto error; + + list->eth_dev = eth_dev; + pthread_mutex_lock(&internal_list_lock); + TAILQ_INSERT_TAIL(&internal_list, list, next); + pthread_mutex_unlock(&internal_list_lock); + + data->dev_private = internal; + data->port_id = eth_dev->data->port_id; + memmove(data->name, 
eth_dev->data->name, sizeof(data->name)); + data->nb_rx_queues = queues; + data->nb_tx_queues = queues; + internal->max_queues = queues; + data->dev_link = pmd_link; + data->mac_addrs = eth_addr; + + /* We'll replace the 'data' originally allocated by eth_dev. So the + * vhost PMD resources won't be shared between multi processes. + */ + eth_dev->data = data; + eth_dev->dev_ops = &xnb_dev_ops; + eth_dev->driver = NULL; + data->dev_flags = + RTE_ETH_DEV_DETACHABLE | RTE_ETH_DEV_INTR_LSC; + data->kdrv = RTE_KDRV_NONE; + data->drv_name = internal->dev_name; + data->numa_node = numa_node; + + /* finally assign rx and tx ops */ + eth_dev->rx_pkt_burst = eth_dev_rx_pkt_burst; + eth_dev->tx_pkt_burst = eth_dev_tx_pkt_burst; + + return data->port_id; + +error: + if (internal) + free(internal->dev_name); + rte_free(eth_addr); + if (eth_dev) + rte_eth_dev_release_port(eth_dev); + rte_free(internal); + rte_free(list); + rte_free(data); + + return -1; +} + +static inline int open_iface(const char *key __rte_unused, const char *value, + void *extra_args) +{ + const char **iface_name = extra_args; + + if (value == NULL) + return -1; + + *iface_name = value; + + return 0; +} + +static inline int open_queues(const char *key __rte_unused, const char *value, + void *extra_args) +{ + uint16_t *q = extra_args; + + if (value == NULL || extra_args == NULL) + return -EINVAL; + + *q = (uint16_t)strtoul(value, NULL, 0); + if (*q == USHRT_MAX && errno == ERANGE) + return -1; + + if (*q > RTE_MAX_QUEUES_PER_PORT) + return -1; + + return 0; +} + +static int rte_pmd_xnb_devinit(const char *name __rte_unused, + const char *params __rte_unused) +{ + struct rte_kvargs *kvlist = NULL; + int ret = 0; + char *iface_name; + uint16_t queues; + + RTE_LOG(INFO, PMD, "Initializing pmd_xnb for %s\n", name); + + kvlist = rte_kvargs_parse(params, valid_arguments); + if (kvlist == NULL) + return -1; + + if (rte_kvargs_count(kvlist, ETH_XNB_IFACE_ARG) == 1) { + ret = rte_kvargs_process(kvlist, ETH_XNB_IFACE_ARG, + &open_iface, &iface_name); + if (ret < 0) + goto out_free; + } else { + ret = -1; + goto out_free; + } + + if (rte_kvargs_count(kvlist, ETH_XNB_QUEUES_ARG) == 1) { + ret = rte_kvargs_process(kvlist, ETH_XNB_QUEUES_ARG, + &open_queues, &queues); + if (ret < 0) + goto out_free; + + } else + queues = 1; + + xnb_eth_dev_create(name, iface_name, queues, rte_socket_id()); + +out_free: + rte_kvargs_free(kvlist); + return ret; +}; + +static int rte_pmd_xnb_devuninit(const char *name __rte_unused) +{ + return 0; +}; + +struct rte_driver pmd_xnb_drv = { + .type = PMD_VDEV, + .init = rte_pmd_xnb_devinit, + .uninit = rte_pmd_xnb_devuninit, +}; + +PMD_REGISTER_DRIVER(pmd_xnb_drv, eth_xnb); +DRIVER_REGISTER_PARAM_STRING(eth_xnb, + "iface= " + "queues="); diff --git a/drivers/net/xen-netback/xnb_ethdev.h b/drivers/net/xen-netback/xnb_ethdev.h new file mode 100644 index 0000000..a88792f --- /dev/null +++ b/drivers/net/xen-netback/xnb_ethdev.h @@ -0,0 +1,34 @@ +/* + * BSD LICENSE + * + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include diff --git a/drivers/net/xen-netback/xnb_ring.c b/drivers/net/xen-netback/xnb_ring.c new file mode 100644 index 0000000..7067589 --- /dev/null +++ b/drivers/net/xen-netback/xnb_ring.c @@ -0,0 +1,240 @@ +/* + * BSD LICENSE + * + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include + +#include +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2 +#include +#else +#include +#endif + +#include "xnb.h" + +#include +#include +#include + +#define RTE_LOGTYPE_XENRING RTE_LOGTYPE_USER1 + +#define RTE_XEN_MAX_PKT_GRANTS (4 * RTE_XEN_MAX_PKT_BURST) + +struct grant { + grant_ref_t ref; + void *page; +}; + +/* Hash functions for the domains table */ +static uint32_t grants_hash_crc(const void *data, + __rte_unused uint32_t data_len, + uint32_t init_val) +{ + const grant_ref_t *gref = data; + +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2 + init_val = rte_hash_crc_4byte(*gref, init_val); +#else /* RTE_MACHINE_CPUFLAG_SSE4_2 */ + init_val = rte_jhash_1word(*gref, init_val); +#endif + + return init_val; +} + +static struct rte_hash* grants_hash_init(struct xenvif_ring *ring) +{ + char s[64] = { 0 }; + struct rte_hash_parameters grants_hash_params = { + .name = NULL, + .entries = RTE_XEN_MAX_PKT_GRANTS, + .key_len = sizeof(uint32_t), + .hash_func = grants_hash_crc, + .hash_func_init_val = 0, + }; + + snprintf(s, sizeof(s), "grants_hash_%u_%s", ring->dom, ring->name); + grants_hash_params.name = s; + grants_hash_params.socket_id = 0; + + return rte_hash_create(&grants_hash_params); +} + +int rte_xen_ring_map(struct xenvif_ring *ring) +{ + int sz; + + /* Open event channel handle */ + ring->evtchnh = xc_evtchn_open(NULL, 0); + fcntl(xc_evtchn_fd(ring->evtchnh), F_SETFD, FD_CLOEXEC); + + /* Allocate event channel + * Failing to do so implies xen-netback is running already for that + * interface */ + ring->port = xc_evtchn_bind_interdomain(ring->evtchnh, + ring->dom, ring->evtchn); + if (ring->port == -1) { + RTE_LOG(ERR, XENRING, "%s: failed to bind evtchn %d\n", + ring->name, ring->evtchn); + xc_evtchn_close(ring->evtchnh); + return -1; + } + + ring->xch = xc_interface_open(0, 0, 0); + if (!ring->xch) { + RTE_LOG(ERR, XENRING, "%s: failed to open xc", ring->name); + return -1; + } + + ring->grants = grants_hash_init(ring); + + sz = sizeof(struct gnttab_copy) * RTE_XEN_MAX_PKT_GRANTS; + ring->gop = malloc(sz); + if (!ring->gop) { + RTE_LOG(ERR, XENRING, "%s: failed to init copy ops", ring->name); + return -1; + } + memset(ring->gop, 0, sz); + + sz = sizeof(struct pending_req) * RTE_XEN_MAX_PKT_GRANTS; + ring->pending = malloc(sz); + if (!ring->pending) { + RTE_LOG(ERR, XENRING, "%s: failed to init pending", ring->name); + return -1; + } + + /* Open grant table handle */ + ring->gnttabh = xc_gnttab_open(NULL, 0); + + if (xc_gnttab_set_max_grants(ring->gnttabh, + RTE_XEN_MAX_PKT_BURST) < 0) { + RTE_LOG(ERR, XENRING, "%s: failed to set max grants", + ring->name); + return -1; + } + + RTE_LOG(INFO, XENRING, "%s: gnttab %p evtchn %p (fd %d)\n", + ring->name, ring->gnttabh, ring->evtchnh, + xc_evtchn_fd(ring->evtchnh)); + + /* Map ring */ + if (!strncmp(ring->name, "TX", 2)) { + ring->tx_addr = xc_gnttab_map_grant_ref(ring->gnttabh, + ring->dom, ring->ring_ref, + PROT_READ | PROT_WRITE); + if (!ring->tx_addr) + return -1; + + BACK_RING_INIT(&ring->tx_ring, ring->tx_addr, XC_PAGE_SIZE); + + RTE_LOG(ERR, XENRING, "%s: ref %u dom %u -> addr %p\n", + ring->name, ring->ring_ref, ring->dom, ring->tx_addr); + } else { + ring->rx_addr = xc_gnttab_map_grant_ref(ring->gnttabh, + ring->dom, ring->ring_ref, + PROT_READ | PROT_WRITE); + if (!ring->rx_addr) + return -1; + + BACK_RING_INIT(&ring->rx_ring, ring->rx_addr, XC_PAGE_SIZE); + + RTE_LOG(ERR, XENRING, "%s: ref %u dom %u -> addr %p\n", + ring->name, ring->ring_ref, ring->dom, ring->rx_addr); + } + + RTE_LOG(ERR, XENRING, 
"%s: evtchn %d -> port %d\n", + ring->name, ring->evtchn, ring->port); + + return 0; +} + +int rte_xen_ring_unmap(struct xenvif_ring *ring) +{ + /* Deallocate event channel */ + xc_evtchn_unbind(ring->evtchnh, ring->port); + ring->port = -1; + + /* Unmap ring */ + if (!strncmp(ring->name, "TX", 2)) + xc_gnttab_munmap(ring->gnttabh, ring->tx_addr, 1); + else + xc_gnttab_munmap(ring->gnttabh, ring->rx_addr, 1); + + /* Unmap initial buffers */ + + /* Close event channel handle + * Close grant table handle */ + xc_evtchn_close(ring->evtchnh); + xc_gnttab_close(ring->gnttabh); + + RTE_LOG(INFO, XENRING, "%s: closed gnttab %p evtchn %p (fd %d)\n", + ring->name, ring->gnttabh, ring->evtchnh, + xc_evtchn_fd(ring->evtchnh)); + + return 0; +} + +void *rte_xen_ring_get_page(struct xenvif_ring *ring, grant_ref_t ref, + bool writable) +{ + struct grant *gnt = NULL; + unsigned flags = writable ? (PROT_READ | PROT_WRITE) : PROT_READ; + + rte_hash_lookup_data(ring->grants, &ref, (void**) &gnt); + + if (gnt) + return gnt->page; + + if (ring->grants_cnt >= RTE_XEN_MAX_PKT_GRANTS) + return NULL; + + gnt = malloc(sizeof(struct grant)); + if (!gnt) { + RTE_LOG(ERR, XENRING, "%s: error allocating grant ref %u\n", + ring->name, ref); + return NULL; + } + + gnt->ref = ref; + gnt->page = xc_gnttab_map_grant_ref(ring->gnttabh, ring->dom, ref, + flags); + if (!gnt->page) + return NULL; + + rte_hash_add_key_data(ring->grants, &ref, gnt); + ring->grants_cnt++; + return gnt->page; +} + diff --git a/drivers/net/xen-netback/xnb_rxtx.c b/drivers/net/xen-netback/xnb_rxtx.c new file mode 100644 index 0000000..3177883 --- /dev/null +++ b/drivers/net/xen-netback/xnb_rxtx.c @@ -0,0 +1,683 @@ +/* + * BSD LICENSE + * + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include "xnb.h" + +#define XEN_PAGE_SHIFT 12 +#define XEN_PFN_DOWN(x) ((x) >> XEN_PAGE_SHIFT) +#define XEN_PAGE_SIZE XC_PAGE_SIZE +#define XEN_PAGE_MASK (~(XEN_PAGE_SIZE-1)) + +#define RTE_LOGTYPE_XENRING RTE_LOGTYPE_USER1 + +#define RTE_XEN_TX_RING_SIZE __CONST_RING_SIZE(netif_tx, XEN_PAGE_SIZE) +#define RTE_XEN_RX_RING_SIZE __CONST_RING_SIZE(netif_rx, XEN_PAGE_SIZE) + +#define BUG_ON(x) do {\ + if (x) \ + rte_panic("XEN: x"); \ +} while (0) + +#ifdef DEBUG_PACKET +#define PRINT_PACKET_BUFF 6072 +#define PRINT_PACKET(str, addr, size, header) do { \ + char *pkt_addr = (char*)(addr); \ + unsigned int index; \ + char packet[PRINT_PACKET_BUFF]; \ + \ + if ((header)) \ + snprintf(packet, PRINT_PACKET_BUFF, "Header size %d: ", (size));\ + else \ + snprintf(packet, PRINT_PACKET_BUFF, "Packet size %d: ", (size));\ + for (index = 0; index < (size); index++) { \ + snprintf(packet + strnlen(packet, PRINT_PACKET_BUFF), \ + PRINT_PACKET_BUFF - strnlen(packet, PRINT_PACKET_BUFF), \ + "%02hhx ", pkt_addr[index]); \ + } \ + snprintf(packet + strnlen(packet, PRINT_PACKET_BUFF), \ + PRINT_PACKET_BUFF - strnlen(packet, PRINT_PACKET_BUFF), "\n"); \ + \ + RTE_LOG(DEBUG, XENRING, "%s %s", str, packet); \ +} while(0) + +#define XEN_LOG_PKT(ring, mbuf) do { \ + PRINT_PACKET(ring->name, \ + (uintptr_t) rte_pktmbuf_mtod_offset(mbuf, void *, 0), \ + mbuf->pkt_len, 0); \ +} while (0) + +#else +#define PRINT_PACKET(str, addr, size, header) do{} while(0) +#define XEN_LOG_PKT(ring, mbuf) do {} while(0) +#endif + +#ifdef DEBUG +#define XEN_LOG_GOP(gmfn_buf, gref_buf, flags, len) do { \ + RTE_LOG(ERR, XENRING, \ + "gop %s: gmfn %x offset %u " \ + "%s: ref %u offset %u size %d\n", \ + flags & GNTCOPY_dest_gref ? "src" : "dst", \ + gmfn_buf->u.gmfn, \ + gmfn_buf->offset, \ + flags & GNTCOPY_dest_gref ? "dst" : "src", \ + gref_buf->u.ref, \ + gref_buf->offset, \ + len); \ +} while (0) + +#define XEN_LOG_RXGOP(gop) do { \ + RTE_LOG(INFO, XENRING, \ + "(%s) rxgop size %d " \ + "src: gmfn %x offset %u dst: ref %u offset %u\n",\ + (gop)->status != GNTST_okay ? "not ok" : "ok", \ + (gop)->len, \ + (gop)->source.u.gmfn, \ + (gop)->source.offset, \ + (gop)->dest.u.ref, \ + (gop)->dest.offset); \ +} while (0) + +#define XEN_LOG_TXGOP(gop) do { \ + RTE_LOG(INFO, XENRING, \ + "(%s) txgop size %d " \ + "src: ref %u offset %u dst: gmfn %x offset %u\n",\ + (gop)->status != GNTST_okay ? "not ok" : "ok", \ + (gop)->len, \ + (gop)->source.u.ref, \ + (gop)->source.offset, \ + (gop)->dest.u.gmfn, \ + (gop)->dest.offset); \ +} while (0) + +#define XEN_LOG_TXREQ(ring, txreq, i) do { \ + RTE_LOG(INFO, XENRING, \ + "%s get req[%u]: id=%d ref=%u offset=%d\n", \ + (ring)->name, i, \ + (txreq)->id, (txreq)->gref, (txreq)->offset); \ +} while (0) + +#define XEN_LOG_PREQ(ring, p, txreq) do { \ + RTE_LOG(INFO, XENRING, "%s set req[%u]: id=%d ref=%u offset=%d\n", \ + (ring)->name, (p) - (ring)->pending, \ + (txreq)->id, (txreq)->gref, (txreq)->offset); \ +} while (0) + +#define XEN_LOG_DEBUG(fmt, ...) RTE_LOG(INFO, XENRING, fmt, ##__VA_ARGS__) +#else +#define XEN_LOG_GOP(gmfn, gref, flags, len) do {} while(0) +#define XEN_LOG_RXGOP(gop) do {} while(0) +#define XEN_LOG_TXGOP(gop) do {} while(0) +#define XEN_LOG_TXREQ(ring, txreq, i) do {} while(0) +#define XEN_LOG_PREQ(ring, p, txreq) do {} while(0) +#define XEN_LOG_DEBUG(fmt, ...) do {} while(0) +#endif + + +/* Sets up grant copy operation. 
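+ * A single gnttab_copy op never crosses a XEN_PAGE_SIZE boundary: the
+ * length is clamped to the end of the page and the remainder is handed
+ * back to the caller, which loops until the whole buffer is covered
+ * (see gop_mbuf_copy() below).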
+ * Determines by "flags" which direction the copy is. + */ +static inline int make_copy_gop(struct gnttab_copy *copy_gop, + uint16_t len, domid_t dom, uint8_t flags, + grant_ref_t gref, uint16_t offset, + unsigned long gmfn, uint16_t gofs) +{ + struct gnttab_copy_ptr *gref_buf, *gmfn_buf; + + if (flags & GNTCOPY_dest_gref) { + gmfn_buf = ©_gop->source; + gmfn_buf->domid = DOMID_SELF; + + gref_buf = ©_gop->dest; + gref_buf->domid = dom; + } else { + gref_buf = ©_gop->source; + gref_buf->domid = dom; + + gmfn_buf = ©_gop->dest; + gmfn_buf->domid = DOMID_SELF; + } + + gmfn_buf->u.gmfn = XEN_PFN_DOWN(gmfn); + gmfn_buf->offset = gofs; + gref_buf->u.ref = gref; + gref_buf->offset = offset; + copy_gop->flags = flags; + + if (gofs + len > XEN_PAGE_SIZE) + copy_gop->len = XEN_PAGE_SIZE - gofs; + else + copy_gop->len = len; + + XEN_LOG_GOP(gmfn_buf, gref_buf, flags, copy_gop->len); + + return len - copy_gop->len; +} + +/* Assumes an mbuf.size <= XEN_PAGE_SIZE */ +static inline bool gop_mbuf_copy(struct gnttab_copy **gop, + struct pending_req **r, + unsigned long pfn, + uint8_t flags, domid_t dom, + grant_ref_t ref, uint16_t offset, uint16_t len) +{ + struct gnttab_copy *copy_gop = *gop; + struct pending_req *req = *r; + uint16_t gofs = pfn & ~XEN_PAGE_MASK; + uint16_t avail; + + /* There is still some remaining data but mbuf + * area crosses XEN_PAGE_SIZE boundary + */ + while (len) { + /* */ + avail = make_copy_gop(copy_gop, len, dom, flags, ref, + offset, pfn, gofs); + len -= copy_gop->len; + offset += copy_gop->len; + + if (avail) { + pfn++; + gofs = 0; + } + + /* Copy as much to a gref as possible. "more" will be set + * if there is more than one grant operation from/to the + * reference, in which case no ring request. Which leads + * to no response - but only to those provided by frontend. + */ + if (*gop != copy_gop) { + req->more = 1; + req->mbuf = NULL; + } + + ++copy_gop; + ++req; + }; + + *gop = copy_gop; + *r = req; + return !len; +} + +static void make_rx_response(struct xenvif_ring *ring, netif_rx_request_t *req, + int16_t size, uint16_t flags) +{ + RING_IDX i = ring->rx_ring.rsp_prod_pvt; + netif_rx_response_t *resp; + + resp = RING_GET_RESPONSE(&ring->rx_ring, i); + resp->offset = 0; + resp->flags = flags; + resp->id = req->id; + resp->status = size; + + ring->rx_ring.rsp_prod_pvt = ++i; +} + +static inline uint16_t get_rx_flags(struct rte_mbuf *m) +{ + uint16_t flags = 0; + + if ((m->ol_flags & PKT_TX_UDP_CKSUM) || + (m->ol_flags & PKT_TX_TCP_CKSUM)) + flags |= NETRXF_csum_blank | NETRXF_data_validated; + else + flags |= NETRXF_data_validated; + + return flags; + +} +/* Sets up grant operations from frontend grant refs *from* an mbuf */ +static int gop_from_mbuf(struct xenvif_ring *ring, + struct xenvif *vif, + struct rte_mbuf *m, + RING_IDX *rc, + struct gnttab_copy **gop) +{ + struct pending_req *last_req; + struct gnttab_copy *copy_gop = *gop; + struct pending_req *p = &ring->pending[copy_gop - ring->gop]; + netif_rx_request_t *rxreq = &p->u.rxreq; + netif_rx_back_ring_t *rx_ring = &ring->rx_ring; + unsigned long addr; + + RING_COPY_REQUEST(rx_ring, *rc, rxreq); + rx_ring->req_cons = ++(*rc); + + BUG_ON(m->pkt_len > XEN_PAGE_SIZE); + + if (vif->features.pgnt) { + void *page; + + page = rte_xen_ring_get_page(ring, rxreq->gref, true); + if (page) { + rte_memcpy(page, rte_pktmbuf_mtod_offset(m, void *, 0), + m->pkt_len); + make_rx_response(ring, rxreq, m->pkt_len, + get_rx_flags(m)); + return 1; + } + } + + /* First is always NULL. 
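+ * (pending_req entries left with mbuf == NULL never generate an RX
+ * response in rte_xen_enqueue_burst(); only the entry carrying the
+ * mbuf does.)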
On receive side, the last + * fragment of the mbuf grant operations is the one + * that contains the sent mbuf. + */ + p->mbuf = NULL; + + addr = rte_mbuf_data_dma_addr_default(m); + gop_mbuf_copy(gop, &p, addr, + GNTCOPY_dest_gref, ring->dom, + rxreq->gref, 0, m->pkt_len); + + /* The last one gets the mbuf set + * and we prepend the pkt_len. The pending_req + * with mbuf != NULL will then make the response + * to the guest + */ + last_req = --p; + last_req->mbuf = m; + + return 0; +} + +uint16_t +rte_xen_enqueue_burst(struct xenvif *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint16_t count) +{ + struct xenvif_queue *queue = &dev->queues[queue_id]; + struct xenvif_ring *ring = &queue->rx; + netif_rx_back_ring_t *rx_ring = &ring->rx_ring; + struct gnttab_copy *gop = ring->gop; + struct pending_req *pending = ring->pending; + RING_IDX rc, rp; + uint32_t recv = 0; + int notify = 0; + uint16_t nr_gops = 0; + int ret = -1; + int i = 0; + + rc = rx_ring->req_cons; + rp = rx_ring->sring->req_prod; + xen_rmb(); /* Ensure we see queued requests up to 'rp'. */ + + /* mbufs are continguous in 1G/2M page size and we don't support GSO + * which means worst case page crosses XEN_PAGE_SIZE boundary + */ + count = RTE_MIN((uint32_t) rp - rc, count); + if (count == 0) + return 0; + + for (recv = 0; recv < count; recv++) { + if (gop_from_mbuf(ring, queue->vif, + pkts[recv], &rc, &gop)) + continue; + + rte_compiler_barrier(); + } + + /* Grant copy the refs to the mbufs */ + nr_gops = gop - ring->gop; + if (nr_gops) { + ret = xc_gnttab_op(ring->xch, GNTTABOP_copy, + ring->gop, sizeof(struct gnttab_copy), + nr_gops); + + if (unlikely(ret)) + RTE_LOG(ERR, XENRING,"%s: grant copy failed (err %d).\n", + queue->rx.name, ret); + } + + /* Produce the responses */ + for (i = 0; i < nr_gops; i++) { + struct rte_mbuf *m = pending[i].mbuf; + netif_rx_request_t *rxreq = &pending[i].u.rxreq; + int16_t st = 0; + + if (unlikely(ring->gop[i].status != GNTST_okay)) + st = NETIF_RSP_ERROR; + + if (m != NULL) { + XEN_LOG_RXGOP(&ring->gop[i]); + XEN_LOG_PKT(ring, m); + make_rx_response(ring, rxreq, + !st ? (int16_t) m->pkt_len : st, + get_rx_flags(m)); + } + } + + if (likely(recv)) { + /* Notify the guest if necessary. 
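+ * RING_PUSH_RESPONSES_AND_CHECK_NOTIFY() only sets 'notify' when the
+ * frontend is actually waiting on rsp_event, so the event channel is
+ * kicked only when the guest needs waking.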
*/ + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&ring->rx_ring, notify); + if (notify) + xc_evtchn_notify(ring->evtchnh, ring->port); + } + + return recv; +} + +static void make_tx_response(struct xenvif_ring *ring, netif_tx_request_t *req, + int8_t status) +{ + RING_IDX i = ring->tx_ring.rsp_prod_pvt; + netif_tx_response_t *resp; + + resp = RING_GET_RESPONSE(&ring->tx_ring, i); + resp->id = req->id; + resp->status = status; + +#ifdef DEBUG + RTE_LOG(INFO, XENRING, "%s resp id=%d (ref=%u offset=%d)\n", + ring->name, resp->id, req->gref, req->offset); +#endif + + ring->tx_ring.rsp_prod_pvt = ++i; +} + +static int8_t count_tx_requests(struct xenvif_ring *ring, + netif_tx_request_t *first, + RING_IDX rc) +{ + netif_tx_back_ring_t *tx_ring; + netif_tx_request_t txreq; + int slots = 0; + + if (!(first->flags & NETTXF_more_data)) + return 0; + + tx_ring = &ring->tx_ring; + slots = 0; + + do { + RING_COPY_REQUEST(tx_ring, rc + slots, &txreq); + first->size -= txreq.size; + slots++; + } while (txreq.flags & NETTXF_more_data); + + XEN_LOG_DEBUG("slots %u\n", slots); + return slots; +} + +/* Sets a tx request that is pending validation */ +static inline void set_tx_request(struct xenvif_ring *ring __rte_unused, + struct pending_req *req, + netif_tx_request_t *txreq) +{ + XEN_LOG_PREQ(ring, req, txreq); + memcpy(&req->u.txreq, txreq, sizeof(*txreq)); + req->more = 0; + req->mbuf = NULL; +} + + +static void parse_ethernet(struct rte_mbuf *m, + uint16_t *l4_proto, void **l4_hdr) +{ + struct ipv4_hdr *ipv4_hdr; + struct ipv6_hdr *ipv6_hdr; + void *l3_hdr = NULL; + struct ether_hdr *eth_hdr; + uint16_t ethertype; + + eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); + + m->l2_len = sizeof(struct ether_hdr); + ethertype = rte_be_to_cpu_16(eth_hdr->ether_type); + + if (ethertype == ETHER_TYPE_VLAN) { + struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1); + + m->l2_len += sizeof(struct vlan_hdr); + ethertype = rte_be_to_cpu_16(vlan_hdr->eth_proto); + } + + l3_hdr = (char *)eth_hdr + m->l2_len; + + switch (ethertype) { + case ETHER_TYPE_IPv4: + ipv4_hdr = (struct ipv4_hdr *)l3_hdr; + *l4_proto = ipv4_hdr->next_proto_id; + m->l3_len = (ipv4_hdr->version_ihl & 0x0f) * 4; + *l4_hdr = (char *)l3_hdr + m->l3_len; + m->ol_flags |= PKT_TX_IPV4; + break; + case ETHER_TYPE_IPv6: + ipv6_hdr = (struct ipv6_hdr *)l3_hdr; + *l4_proto = ipv6_hdr->proto; + m->l3_len = sizeof(struct ipv6_hdr); + *l4_hdr = (char *)l3_hdr + m->l3_len; + m->ol_flags |= PKT_TX_IPV6; + break; + default: + m->l3_len = 0; + *l4_proto = 0; + break; + } +} + +static inline void set_mbuf_chksum(netif_tx_request_t *txreq, + struct rte_mbuf *m) +{ + uint16_t l4_proto = 0; + void *l4_hdr = NULL; + + if (txreq->flags & NETTXF_csum_blank && + txreq->flags & NETTXF_data_validated) { + parse_ethernet(m, &l4_proto, &l4_hdr); + switch (l4_proto) { + case IPPROTO_TCP: + m->ol_flags |= PKT_TX_TCP_CKSUM; + break; + case IPPROTO_UDP: + m->ol_flags |= PKT_TX_UDP_CKSUM; + break; + case IPPROTO_SCTP: + m->ol_flags |= PKT_TX_SCTP_CKSUM; + break; + } + } +} + +/* Sets up grant operations from frontend grant refs *to* an mbuf */ +static int gop_to_mbuf(struct xenvif_ring *ring, + struct xenvif *vif, + struct rte_mbuf *m, + RING_IDX *rc, + struct gnttab_copy **gop) +{ + struct gnttab_copy *copy_gop = *gop; + struct pending_req *p = &ring->pending[copy_gop - ring->gop]; + struct pending_req *first = p; + netif_tx_request_t *txreq = &p->u.txreq; + netif_tx_back_ring_t *tx_ring = &ring->tx_ring; + unsigned long addr; + uint16_t ofs, len; + RING_IDX ri = *rc; + bool 
pgnt = vif->features.pgnt; + char *page = NULL; + + RING_COPY_REQUEST(tx_ring, ri, txreq); + + m->pkt_len = len = txreq->size; + count_tx_requests(ring, txreq, ++ri); + set_mbuf_chksum(txreq, m); + + BUG_ON(txreq->size > XEN_PAGE_SIZE); + + ofs = 0; + addr = rte_mbuf_data_dma_addr_default(m); + set_tx_request(ring, p, txreq); + + while (len) { + if (pgnt) + page = rte_xen_ring_get_page(ring, txreq->gref, false); + + if (page) { + rte_memcpy(rte_pktmbuf_mtod_offset(m, void*, ofs), + page + txreq->offset, txreq->size); + make_tx_response(ring, txreq, NETIF_RSP_OKAY); + } else { + gop_mbuf_copy(gop, &p, addr + ofs, GNTCOPY_source_gref, + ring->dom, txreq->gref, txreq->offset, + txreq->size); + } + + len -= txreq->size; + ofs += txreq->size; + + /* More slots remaining */ + if (len) { + txreq = &p->u.txreq; + RING_COPY_REQUEST(tx_ring, ri++, txreq); + set_tx_request(ring, p, txreq); + } + } + + tx_ring->req_cons = ri; + *rc = ri; + + /* The first one gets the mbuf set + * and we prepend the pkt_len. The pending_req + * with mbuf != NULL will then make the response + * to the guest + */ + first->mbuf = m; + + /* If no grant ops were set up */ + return (copy_gop - *gop) == 0; +} + +uint16_t +rte_xen_dequeue_burst(struct xenvif *dev, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, + uint16_t count) +{ + struct xenvif_queue *queue = &dev->queues[queue_id]; + struct xenvif_ring *ring = &queue->tx; + struct gnttab_copy *gop = ring->gop; + struct pending_req *pending = ring->pending; + netif_tx_back_ring_t *tx_ring = &ring->tx_ring; + RING_IDX rc, rp; + uint16_t sent = 0; + int notify = 0; + uint16_t nr_gops = 0; + int ret = -1; + int i = 0; + + rc = tx_ring->req_cons; + rp = tx_ring->sring->req_prod; + xen_rmb(); /* Ensure we see queued requests up to 'rp'. */ + + count = RTE_MIN((uint32_t) rp - rc, count); + + rte_prefetch0(&queue->vif->features); + rte_prefetch0(&tx_ring->sring->ring[rc & RTE_XEN_TX_RING_SIZE]); + + /* Get requests and setup the pages */ + while ((rc != rp) && sent < count) { + struct rte_mbuf *m; + + /* Allocate an mbuf and populate the structure. 
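+ * If the pool is exhausted we stop dequeuing early; requests consumed
+ * in earlier iterations still get their grant copies issued and their
+ * responses produced in the loops below.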
*/ + m = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(m == NULL)) { + RTE_LOG(ERR, XENRING, + "%s: Failed to allocate memory for mbuf.\n", + queue->tx.name); + break; + } + + /* Copy if data is inline */ + if (likely(gop_to_mbuf(ring, queue->vif, m, &rc, &gop))) { + pkts[sent] = m; + sent++; + continue; + } + } + + /* Grant copy the refs to the mbufs */ + nr_gops = gop - ring->gop; + if (nr_gops) { + ret = xc_gnttab_op(ring->xch, GNTTABOP_copy, + ring->gop, sizeof(struct gnttab_copy), + nr_gops); + + if (unlikely(ret)) + RTE_LOG(ERR, XENRING,"%s: grant copy failed (err %d).\n", + queue->tx.name, ret); + } + + /* Produce the responses */ + for (i = 0; i < nr_gops; i++) { + struct rte_mbuf *m = pending[i].mbuf; + netif_tx_request_t *txreq = &pending[i].u.txreq; + bool more = pending[i].more; + int8_t st = NETIF_RSP_OKAY; + + if (unlikely(ring->gop[i].status != GNTST_okay)) + st = NETIF_RSP_ERROR; + + XEN_LOG_TXGOP(&ring->gop[i]); + + if (!more) { + XEN_LOG_TXREQ(ring, txreq, i); + make_tx_response(ring, txreq, st); + } + + if (m == NULL) + continue; + + XEN_LOG_PKT(ring, m); + pkts[sent] = m; + sent++; + } + + if (likely(sent)) { + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&ring->tx_ring, notify); + if (notify) + xc_evtchn_notify(ring->evtchnh, ring->port); + } + + return sent; +} diff --git a/drivers/net/xen-netback/xnb_xenbus.c b/drivers/net/xen-netback/xnb_xenbus.c new file mode 100644 index 0000000..3d4c8e1 --- /dev/null +++ b/drivers/net/xen-netback/xnb_xenbus.c @@ -0,0 +1,975 @@ +/* + * BSD LICENSE + * + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "xnb.h" + +#include +#include +#include +#include + +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2 +#include +#else +#include +#endif + +#include +#include +#include + +#define RTE_LOGTYPE_XEN RTE_LOGTYPE_USER1 + +#define XENBUS_MAX_ENTRY 64 +#define XENBUS_DOMAINS_HASH_ENTRIES 1024 + +/* Key for the domain hash table */ +struct xenvif_hash_key { + domid_t domid; + unsigned handle; +}; + +/* The Xenbus watch path and token */ +struct xenbus_watch { + char *path; + const char *token; + LIST_ENTRY(xenbus_watch) next; +}; + +/* Registered Xenbus client + * Listening on /local/domain/domid>/backend/{vif,dpvif} */ +struct xenbus_device { + /* Backend domain id and it's base path */ + unsigned int domid; + char *dompath; + unsigned int max_cores; + + /* Xenstore handle */ + struct xs_handle *xsh; + /* Xenstore file descriptor */ + int xsh_fd; + + /* upcalls for registered backend */ + struct xenbus_ops *callbacks; + struct xenbus_watch watch; + + /* domains look up table */ + struct rte_hash *domains; +}; + +static struct xenbus_device *dev = NULL; + +static struct xenvif* xenbus_backend_get(struct xenbus_device *dev, + unsigned domid, unsigned int handle); + +/* + * Xenstore/Xenbus helper functions + */ + +static int xenbus_printf(xs_transaction_t xbt, char *basename, + const char *key, const char *fmt, ...) +{ + char *path = NULL; + char *buf = NULL; + va_list ap; + int ret, len; + + va_start(ap, fmt); + len = vasprintf(&buf, fmt, ap); + va_end(ap); + + if (len <= 0) + return -ENOMEM; + + len = asprintf(&path, "%s/%s", basename, key); + if (len <= 0) + return -ENOMEM; + + ret = xs_write(dev->xsh, xbt, path, buf, strlen(buf)); + if (ret) + RTE_LOG(INFO, XEN, "xs_write: %s = %s\n", path, buf); + + free(path); + free(buf); + + return ret; +} + +static int xenbus_scanf(xs_transaction_t xbt, char *basename, const char *key, + const char *fmt, ...) +{ + char *path = NULL; + char *buf = NULL; + va_list ap; + unsigned int ret = 0; + + if (asprintf(&path, "%s/%s", basename, key) <= 0) + return -ENOMEM; + + buf = xs_read(dev->xsh, xbt, path, &ret); + if (!buf) { + RTE_LOG(ERR, XEN, "xs_read: failed on path %s\n", path); + return -EINVAL; + } + + RTE_LOG(INFO, XEN, "xs_read: %s = %s\n", path, buf); + + va_start(ap, fmt); + ret = vsscanf(buf, fmt, ap); + va_end(ap); + + free(buf); + free(path); + + return ret <= 0 ? -EINVAL : (int) ret; +} + +/* Converts xenbus state to a string */ +static const char *xenbus_strstate(enum xenbus_state state) +{ + static const char *names[] = { + "Unknown", + "Initialising", + "InitWait", + "Initialised", + "Connected", + "Closing", + "Closed", + "Reconfiguring", + "Reconfigured", + }; + + return state < sizeof(names) ? names[state] : "Invalid"; +} + +/* + * Xen Virtual Interface routines (Backend) + * + * ( Initialising ) -> Announce features + * -> Update 'state' to "InitWait" + * + * ( Connected ) -> Read frontend features + * -> Fetch TX/RX ring refs and event channels + * -> Update 'state' to "Connected" + * + * We propagate an event to tell downstream consumers that the queue + * is initialized. This allows a PMD (app) to initialize the TX / RX + * grant references and event channels in each lcore. + * + * Here we handle all xenstore interactions and transitions with the + * frontend. All assume the libxc handling is done downstream. 
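+ *
+ * Illustrative xenstore layout for a single-queue vif (paths shortened;
+ * the actual prefixes come from the watch set up below):
+ *   backend/vif/<domid>/<handle>/: feature-rx-copy, feature-persistent,
+ *     multi-queue-max-queues, ... (written by xenvif_write_features())
+ *   frontend path: request-rx-copy, multi-queue-num-queues,
+ *     tx-ring-ref, rx-ring-ref, event-channel-tx, event-channel-rx
+ *     (read by xenvif_read_features()/xenvif_queue_init(); the ring and
+ *     event-channel keys gain a "queue-<i>/" prefix when more than one
+ *     queue is negotiated)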
+ */ + +static int xenvif_queue_init(struct xenvif_queue *queue) +{ + char *path = NULL; + int ret = -1; + + if (queue->vif->num_queues == 1) + path = queue->vif->fe.path; + else { + if (asprintf(&path, "%s/queue-%u", + queue->vif->fe.path, queue->id) < 0) + return ret; + } + + if ((xenbus_scanf(XBT_NULL, path, "tx-ring-ref", "%u", + &queue->tx.ring_ref) != 1) || + (xenbus_scanf(XBT_NULL, path, "rx-ring-ref", "%u", + &queue->rx.ring_ref) != 1)) + return ret; + + if ((xenbus_scanf(XBT_NULL, path, "event-channel-tx", "%u", + &queue->tx.evtchn) != 1) || + (xenbus_scanf(XBT_NULL, path, "event-channel-rx", "%u", + &queue->rx.evtchn) != 1)) + return ret; + + if (asprintf(&queue->tx.name, "TX%u", queue->id) < 0) + return ret; + + if (asprintf(&queue->rx.name, "RX%u", queue->id) < 0) + return ret; + + queue->tx.dom = queue->rx.dom = queue->vif->fe.domid; + + ret = 0; + return ret; +} + +static struct xenvif* xenvif_alloc(unsigned int domid, unsigned int handle, + unsigned max_queues) +{ + struct xenvif *vif; + + vif = malloc(sizeof(struct xenvif)); + if (!vif) + return NULL; + + vif->be.domid = domid; + vif->be.handle = handle; + vif->num_queues = max_queues; + + return vif; +} + +static int xenvif_write_features(struct xenvif *vif) +{ + xs_transaction_t xbt; + char *path = vif->be.path; + +again: + xbt = xs_transaction_start(dev->xsh); + if (!xbt) + goto fail; + + /* Write features supported */ + if (xenbus_printf(xbt, path, "feature-sg", "%u", 0) <= 0) + goto abort_transaction; + + if (xenbus_printf(xbt, path, "feature-gso-tcpv4", "%u", 0) <= 0) + goto abort_transaction; + + if (xenbus_printf(xbt, path, "feature-gso-tcpv6", "%u", 0) <= 0) + goto abort_transaction; + + if (xenbus_printf(xbt, path, "feature-ipv6-csum-offload", "%u", 1) <= 0) + goto abort_transaction; + + if (xenbus_printf(xbt, path, "feature-rx-copy", "%u", 1) <= 0) + goto abort_transaction; + + if (xenbus_printf(xbt, path, "feature-rx-flip", "%u", 0) <= 0) + goto abort_transaction; + + if (xenbus_printf(xbt, path, "feature-multicast-control", "%u", 0) <= 0) + goto abort_transaction; + + if (!xs_transaction_end(dev->xsh, xbt, false)) { + if (errno == EAGAIN) + goto again; + if (errno) + goto fail; + } + + /* Optional features supported */ + if (xenbus_printf(XBT_NULL, path, "feature-split-event-channels", "%u", + 1) <= 0) + goto abort_transaction; + + if (xenbus_printf(XBT_NULL, path, "multi-queue-max-queues", "%u", + vif->num_queues) <= 0) + goto abort_transaction; + + if (xenbus_printf(XBT_NULL, path, "feature-persistent", "%u", 1) <= 0) + goto abort_transaction; + + return 0; + +abort_transaction: + xs_transaction_end(dev->xsh, xbt, true); + +fail: + RTE_LOG(INFO, XEN, "%s: vif%d.%d: failed to write features\n", + __func__, vif->be.domid, vif->be.handle); + return -1; +} + +static int xenvif_read_features(struct xenvif *vif) +{ + char *path = vif->fe.path; + int ret = 0; + unsigned int num_queues = 1; + + /* Read feature-rx-copy (mandatory) */ + if (xenbus_scanf(XBT_NULL, path, "request-rx-copy", "%u", &ret) <= 0) { + RTE_LOG(ERR, XEN, "%s: error reading request-rx-copy\n", + __func__); + return -1; + } + + /* Read number of supported queues */ + if (xenbus_scanf(XBT_NULL, path, "multi-queue-num-queues", + "%u", &num_queues) <= 0) + num_queues = 1; + + if (num_queues < vif->num_queues) + vif->num_queues = num_queues; + + /* Read supported feature by frontend */ + if (xenbus_scanf(XBT_NULL, path, "feature-rx-notify", "%u", + &vif->features.rx_poll) <= 0) + vif->features.rx_poll = 1; + + if (xenbus_scanf(XBT_NULL, path, 
"feature-sg", "%u", + &vif->features.sg) <= 0) + vif->features.sg = 0; + + if (xenbus_scanf(XBT_NULL, path, "feature-gso-tcpv4", "%u", + &vif->features.tcp4) <= 0) + vif->features.tcp4 = 0; + + if (xenbus_scanf(XBT_NULL, path, "feature-gso-tcpv4-prefix", "%u", + &vif->features.tcp4_prefix) <= 0) + vif->features.tcp4_prefix = 0; + + if (xenbus_scanf(XBT_NULL, path, "feature-gso-tcpv6", "%u", + &vif->features.tcp6) <= 0) + vif->features.tcp6 = 0; + + if (xenbus_scanf(XBT_NULL, path, "feature-gso-tcpv6-prefix", "%u", + &vif->features.tcp6_prefix) <= 0) + vif->features.tcp6_prefix = 0; + + if (xenbus_scanf(XBT_NULL, path, "feature-no-csum-offload", "%u", + &vif->features.ip4_csum) <= 0) + vif->features.ip4_csum = 1; + + if (xenbus_scanf(XBT_NULL, path, "feature-ipv6-csum-offload", "%u", + &vif->features.ip6_csum) <= 0) + vif->features.ip6_csum = 0; + + if (xenbus_scanf(XBT_NULL, path, "feature-multicast-control", "%u", + &vif->features.mcast_ctrl) <= 0) + vif->features.mcast_ctrl = 0; + + if (xenbus_scanf(XBT_NULL, path, "feature-persistent", "%u", + &vif->features.pgnt) <= 0) + vif->features.pgnt = 0; + + /* Pseudo features internal to device only */ + vif->features.zc = 0; + + return 0; +} + +static int xenvif_alloc_queues(struct xenvif *vif) +{ + struct xenvif_queue *queues; + size_t sz = sizeof(struct xenvif_queue) * vif->num_queues; + unsigned int i; + + queues = malloc(sz); + if (!queues) + return -1; + + memset(queues, 0, sz); + vif->queues = queues; + + for (i = 0; i < vif->num_queues; i++) { + vif->queues[i].id = i; + vif->queues[i].vif = vif; + } + + return 0; +} + +static int xenvif_read_queues(struct xenvif *vif) +{ + int ret = -1; + unsigned int i = 0; + + if (xenvif_alloc_queues(vif) < 0) + return -1; + + if (vif->num_queues == 1) + ret = xenvif_queue_init(&vif->queues[0]); + else { + for (i = 0; i < vif->num_queues; i++) { + ret = xenvif_queue_init(&vif->queues[i]); + if (ret < 0) + return -1; + } + } + + return ret; +} + +static int xenvif_connect(struct xenvif *vif) +{ + if (xenvif_read_features(vif) < 0) + return -1; + + if (xenvif_read_queues(vif) < 0) + return -1; + + return 0; +} + +/* + * Backend <-> Frontend state management + * + * It is called when an entry is added or a watch token matches the same + * name. Here we run the main state machine in the backend. 
+ * + * State transitions: + * + * Init(1) + * + * | + * | + * v + * + * InitWait (2) ---> Connected (4) + * + * ^ \ | + * | \ | + * | \ | + * | \ | + * | \ | + * | \ | + * | \ | + * | \ | + * | v v + * + * Closed (6) <---> Closing(5) + * + * Init(1) Created by the toolstack + * InitWait(2) Frontend is initializing + * Connected(4) Frontend is connected + * Closing(5) Frontend is closing + * Closed(6) Frontend is closed + */ + +/* Switches state of the backend */ +static int xenbus_backend_set_state(struct xenvif *domain, + enum xenbus_state state) +{ + xs_transaction_t xbt; + bool abort; + + /* Do not fire unnecessary watches, and/or + * risking recreating the base directory if state + * is closed already */ + if (domain->be.state == state) + return 0; + +again: + xbt = xs_transaction_start(dev->xsh); + if (!xbt) + return 0; + + abort = true; + if (xenbus_printf(xbt, domain->be.path, "state", "%u", state)) + abort = false; + + if (!xs_transaction_end(dev->xsh, xbt, abort)) { + if (errno == -EAGAIN && !abort) + goto again; + /* TODO fail here */ + } else + domain->be.state = state; + + if (!abort) + RTE_LOG(INFO, XEN, "%s: vif%d.%d -> %s\n", __func__, + domain->be.domid, domain->be.handle, + xenbus_strstate(domain->be.state)); + + return abort ? -EINVAL : 0; +} + +/* Backend initial state */ +static void xenbus_backend_state_init(struct xenvif *domain __rte_unused) +{ + if (!xenvif_write_features(domain)) { + if (dev && dev->callbacks) + dev->callbacks->init(domain); + + xenbus_backend_set_state(domain, XenbusStateInitWait); + } +} + +/* Frontend is initializing */ +static void xenbus_backend_state_initwait(struct xenvif *domain __rte_unused) +{ +} + +/* Frontend is connected */ +static void xenbus_backend_state_connect(struct xenvif *domain) +{ + if (!xenvif_connect(domain)) { + if (dev && dev->callbacks) + dev->callbacks->connect(domain); + + xenbus_backend_set_state(domain, XenbusStateConnected); + } +} + +/* Frontend is closing */ +static void xenbus_backend_state_closing(struct xenvif *domain) +{ + if (dev && dev->callbacks) + dev->callbacks->disconnect(domain); + + xenbus_backend_set_state(domain, XenbusStateClosing); +} + +/* Frontend is closed or unknown */ +static void xenbus_backend_state_closed(struct xenvif *domain) +{ + if (dev && dev->callbacks) + dev->callbacks->close(domain); + + xenbus_backend_set_state(domain, XenbusStateClosed); +} + +/* When frontend state changes */ +static void xenbus_frontend_state_changed(struct xenvif *domain, + XenbusState state) +{ + switch (state) { + case XenbusStateInitialising: + xenbus_backend_state_initwait(domain); + break; + case XenbusStateInitialised: + case XenbusStateInitWait: + break; + case XenbusStateConnected: + xenbus_backend_state_connect(domain); + break; + case XenbusStateClosing: + xenbus_backend_state_closing(domain); + break; + case XenbusStateClosed: + case XenbusStateUnknown: + xenbus_backend_state_closed(domain); + break; + case XenbusStateReconfiguring: + case XenbusStateReconfigured: + break; + } +} + +/* Read frontend state */ +static int xenbus_frontend_state_read(struct xenvif *domain) +{ + xs_transaction_t xbt; + int ret = -1; + bool abort; + +again: + xbt = xs_transaction_start(dev->xsh); + + abort = true; + if (xenbus_scanf(xbt, domain->fe.path, "state", "%d", &ret) == 1) { + domain->fe.state = ret; + abort = false; + } + + if (!xs_transaction_end(dev->xsh, xbt, abort)) { + if (errno == EAGAIN && !abort) + goto again; + + return -1; + } + + return 0; +} + +static void xenbus_frontend_changed(struct xenbus_device 
*dev, + unsigned int domid, + unsigned int handle) +{ + struct xenvif *domain = xenbus_backend_get(dev, domid, handle); + + if (!xenbus_frontend_state_read(domain)) { + RTE_LOG(INFO, XEN, "%s: vif%d.%d -> %s\n", __func__, domid, + handle, xenbus_strstate(domain->fe.state)); + + xenbus_frontend_state_changed(domain, domain->fe.state); + } +} + +/* + * Interface management routines + */ + +/* Hash functions for the domains table */ +static uint32_t xenbus_backend_hash_crc(const void *data, + __rte_unused uint32_t data_len, + uint32_t init_val) +{ + const struct xenvif_hash_key *k = data; + +#ifdef RTE_MACHINE_CPUFLAG_SSE4_2 + init_val = rte_hash_crc_4byte(k->domid, init_val); + init_val = rte_hash_crc_4byte(k->handle, init_val); +#else /* RTE_MACHINE_CPUFLAG_SSE4_2 */ + init_val = rte_jhash_1word(k->domid, init_val); + init_val = rte_jhash_1word(k->handle, init_val); +#endif + + return init_val; +} + +static void xenbus_backend_init(struct xenbus_device *dev) +{ + char s[64] = { 0 }; + struct rte_hash_parameters domains_hash_params = { + .name = NULL, + .entries = XENBUS_DOMAINS_HASH_ENTRIES, + .key_len = sizeof(domid_t), + .hash_func = xenbus_backend_hash_crc, + .hash_func_init_val = 0, + }; + + snprintf(s, sizeof(s), "xen_domains_hash_%d", 0); + domains_hash_params.name = s; + domains_hash_params.socket_id = 0; + + dev->domains = rte_hash_create(&domains_hash_params); +} + +static int xenbus_backend_read(struct xenvif *domain) +{ + unsigned int handle = domain->be.handle; + domid_t domid = domain->be.domid; + char *path; + + path = malloc(sizeof(char) * XENBUS_MAX_ENTRY); + snprintf(path, XENBUS_MAX_ENTRY, "%s/%d/%d", + dev->watch.path, domid, handle); + domain->be.path = path; + + /* read backend state */ + if (xenbus_scanf(XBT_NULL, path, "state", "%d", + &domain->be.state) != 1) + return -1; + + /* read backend vifname */ + if (xenbus_scanf(XBT_NULL, path, "vifname", "%ms", + &domain->ifname) != 1) + domain->ifname = strdup(""); + + /* %ms is a GNU extension + * XXX BSD compatibility */ + if (xenbus_scanf(XBT_NULL, path, "frontend", "%ms", + &domain->fe.path) != 1) + return -1; + + if (xenbus_scanf(XBT_NULL, path, "frontend-id", "%d", + &domain->fe.domid) != 1) + return -1; + + return 0; +} + +/* + * Add a new domain to our table whenever the backend// entry + * shows up. Here we setup any watches for state changes. 
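The domains table keys its entries on the (domid, handle) pair through struct xenvif_hash_key, so the key length handed to rte_hash_create() must span the whole key structure for rte_hash_add_key_data()/rte_hash_lookup_data() to hash the same bytes as xenbus_backend_hash_crc(). A minimal sketch of that usage against the DPDK hash API, with the table name and sizes illustrative:

#include <rte_hash.h>

static struct rte_hash *create_domains_table(void)
{
	struct rte_hash_parameters params = {
		.name = "xen_domains_hash_0",              /* illustrative */
		.entries = 1024,
		.key_len = sizeof(struct xenvif_hash_key), /* domid + handle */
		.hash_func = xenbus_backend_hash_crc,
		.hash_func_init_val = 0,
		.socket_id = 0,
	};

	return rte_hash_create(&params);
}

/* Typical lookup, as done by xenbus_backend_get() below. */
static struct xenvif *find_vif(struct rte_hash *h,
			       domid_t domid, unsigned int handle)
{
	struct xenvif_hash_key k = { .domid = domid, .handle = handle };
	struct xenvif *vif = NULL;

	if (rte_hash_lookup_data(h, &k, (void **)&vif) < 0)
		return NULL;

	return vif;
}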
+ */ +static int xenbus_backend_add(struct xenbus_device *dev, + unsigned int domid, unsigned int handle) +{ + struct xenvif_hash_key k = { .domid = domid, .handle = handle }; + struct xenvif *domain; + char *path = NULL, *token = NULL; + int ret; + + domain = xenvif_alloc(domid, handle, dev->max_cores); + if (!domain) { + RTE_LOG(ERR, XEN, "%s: error allocating vif%d.%d\n", + __func__, domid, handle); + return -1; + } + + rte_hash_add_key_data(dev->domains, &k, domain); + + RTE_LOG(INFO, XEN, "%s: set key = (%u,%u) data = %p\n", + __func__, k.domid, k.handle, domain); + + /* read 'frontend' and 'frontend-id' entries */ + if (xenbus_backend_read(domain) < 0) { + RTE_LOG(ERR, XEN, "%s: error reading backend entries\n", + __func__); + return -1; + } + + /* watch frontend state changes */ + if (xenbus_scanf(XBT_NULL, domain->fe.path, "state", "%d", + &domain->fe.state) != 1) + return -1; + + asprintf(&path, "%s/state", domain->fe.path); + asprintf(&token, "%d/%d/fe", domid, handle); + ret = xs_watch(dev->xsh, path, token); + if (!ret) { + RTE_LOG(ERR, XEN, "%s: failed to watch otherend state %s\n", + __func__, path); + return -1; + } + + xenbus_backend_state_init(domain); + return 0; +} + +#if 0 +/* + * Deletes a new domain from the domain table. + */ +static void xenbus_backend_del(struct xenbus_device *dev, + unsigned int domid, unsigned int handle) +{ + struct xenvif_hash_key k = { 0 }; + struct xenvif *dom = NULL; + + k.domid = domid; + k.handle = handle; + + rte_hash_lookup_data(dev->domains, &k, (void**) &dom); + rte_hash_del_key(dev->domains, &k); + + free(dom); +} +#endif + +static struct xenvif* xenbus_backend_get(struct xenbus_device *dev, + unsigned domid, + unsigned int handle) +{ + struct xenvif_hash_key k; + struct xenvif *dom = NULL; + + k.domid = domid; + k.handle = handle; + + rte_hash_lookup_data(dev->domains, &k, (void**) &dom); + + RTE_LOG(INFO, XEN, "%s: get key = (%u,%u) data = %p\n", + __func__, k.domid, k.handle, dom); + + return dom; +} + +/* + * Xenbus global state management + */ + +/* Register the dev on xenstore */ +static int xenbus_register(struct xenbus_device *dev, const char *type) +{ + struct xenbus_watch *w = &dev->watch; + ssize_t sz; + int ret; + + sz = strlen(dev->dompath) + 20; + w->path = malloc(sz); + memset(w->path, 0, sz); + w->token = type; + + snprintf(w->path, sz, "backend/%s", w->token); + ret = xs_watch(dev->xsh, w->path, w->token); + if (!ret) { + RTE_LOG(ERR, XEN, "%s: failed to watch backend path %s \n", + __func__, w->path); + return -1; + } + + RTE_LOG(INFO, XEN, "%s: registering %s domid %d cores %d\n", __func__, + type, dev->domid, dev->max_cores); + return 0; +} + +/* + * Quite inneficient, but we listen on /backend/ and add domains + * and trigger the initial state changed. + */ +static void xenbus_update(struct xenbus_device *dev, char *path) +{ + char entry[XENBUS_MAX_ENTRY]; + unsigned domid, handle; + int ret, len; + + len = strlen(dev->watch.path); + if (strncmp(path, dev->watch.path, len) != 0) + return; + + if (!strlen(path+len)) + return; + + /* Parse path _or_ watch token */ + ret = sscanf(path+len, "/%u/%u/%255s", &domid, &handle, (char *) &entry); + + /* Currently we have a global watch on /backend/vif + * which leads to watch _all_ changes in the directory. + * In the meantime, we should switch to the event channel that + * is triggered when a domain gets created. + * + * NB: For the purposes of the prototype for this approach. 
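 * For reference, xs_check_watch() hands back a (path, token) pair and the
 * event loop below dispatches on the token: the global backend watch is
 * registered with token "vif", so a newly created interface shows up as,
 * e.g. (domid 3, handle 0 are hypothetical):
 *   path  = "backend/vif/3/0"                     token = "vif"
 * whereas per-frontend state watches are registered with the structured
 * token "<domid>/<handle>/fe", so a frontend state change arrives as, e.g.:
 *   path  = "/local/domain/3/device/vif/0/state"  token = "3/0/fe"
 * and is routed to xenbus_frontend_changed().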
+ * + * TODO: * Change to listening on @introduceDomain special watch + * * Change to listening on @releaseDomain special watch + */ + if (ret == 2 && + !xenbus_backend_get(dev, domid, handle)) { + + /* domain is introduced */ + xenbus_backend_add(dev, domid, handle); + + RTE_LOG(INFO, XEN, "%s: new domain (dom = %d handle = %d)\n", + __func__, domid, handle); + } +} + +static void xenbus_event_loop(struct xenbus_device *dev) +{ + char **token; + char *name, *path; + char type[XENBUS_MAX_ENTRY]; + struct pollfd fds = { + .fd = dev->xsh_fd, + .events = POLLIN | POLLOUT, + }; + + for (;poll(&fds, 1, 1000) >= 0;) { + unsigned int domid, handle; + int ret; + + if (!(token = xs_check_watch(dev->xsh))) + continue; + + name = token[XS_WATCH_TOKEN]; + path = token[XS_WATCH_PATH]; + + if (!strcmp(name, "vif")) { + xenbus_update(dev, path); + continue; + } + + ret = sscanf(name, "%u/%u/%255s", &domid, &handle, (char*)&type); + if (ret == 3 && !strcmp(type, "fe")) + xenbus_frontend_changed(dev, domid, handle); + else + RTE_LOG(INFO, XEN, "%s: (unknown) watch %s path %s\n", + __func__, name, path); + } +} + +static int xenbus_get_domain_id(void) +{ + char *buf; + unsigned int len; + + buf = xs_read(dev->xsh, XBT_NULL, "domid", &len); + if (!buf) { + RTE_LOG(ERR, XEN, "%s: failed read domain id\n", __func__); + return -1; + } + + errno = 0; + dev->domid = atoi(buf); + if (errno != 0) + return -1; + + free(buf); + + return 0; +} + +static int xenbus_init(void) +{ + /* initialize xenstore related state */ + dev->xsh = xs_domain_open(); + if (!dev->xsh) { + RTE_LOG(ERR, XEN, "%s: failed to open xenstore\n", __func__); + return -1; + } + + if (xenbus_get_domain_id() < 0) { + RTE_LOG(ERR, XEN, "%s: failed read domain id\n", __func__); + return -1; + } + + dev->dompath = xs_get_domain_path(dev->xsh, dev->domid); + if (!dev->dompath) { + RTE_LOG(ERR, XEN, "%s: failed read domain path\n", __func__); + return -1; + } + + dev->xsh_fd = xs_fileno(dev->xsh); + + /* create domains hash table */ + xenbus_backend_init(dev); + + RTE_LOG(INFO, XEN, "%s: done\n", __func__); + return 0; +} + +static int xenbus_exit(void) +{ + int ret = 0; + + if (!dev->xsh) + return ret; + + xs_daemon_close(dev->xsh); + free(dev->dompath); + + return ret; +} + +/* + * Top level library functions exported to DPDK + */ + +int rte_xenbus_backend_register(struct xenbus_ops *ops, + unsigned max_cores) +{ + if (!dev) { + dev = malloc(sizeof(struct xenbus_device)); + memset(dev, 0, sizeof(struct xenbus_device)); + } + + dev->callbacks = (struct xenbus_ops*) ops; + dev->max_cores = max_cores > 1 ? max_cores : 1; + + return 0; +} + +int rte_xenbus_backend_start(void) +{ + if (!dev || (!dev->xsh && xenbus_init() < 0)) + return -1; + + xenbus_register(dev, *(dev->callbacks->ids)); + xenbus_event_loop(dev); + + return 0; +} + +void rte_xenbus_backend_stop(void) +{ + if (!dev->xsh) + return; + + xenbus_exit(); +} diff --git a/mk/rte.app.mk b/mk/rte.app.mk index eb28e11..bf050af 100644 --- a/mk/rte.app.mk +++ b/mk/rte.app.mk @@ -130,6 +130,7 @@ ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y) _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += -lrte_pmd_vhost endif # $(CONFIG_RTE_LIBRTE_VHOST) _LDLIBS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += -lrte_pmd_vmxnet3_uio +_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_XEN_NETBACK)+= -lrte_pmd_xen-netback ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y) _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AESNI_MB) += -lrte_pmd_aesni_mb -- 2.1.4
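For completeness, a sketch of how a consumer is expected to drive the exported entry points (rte_xenbus_backend_register/start/stop). The callback signatures and the exact layout of struct xenbus_ops live in xnb.h and are assumed here, so treat this strictly as illustrative:

#include <rte_lcore.h>
#include "xnb.h"

/* Hypothetical callbacks; the backend invokes them on state transitions. */
static void my_init(struct xenvif *vif)       { (void)vif; /* features announced  */ }
static void my_connect(struct xenvif *vif)    { (void)vif; /* rings/evtchns ready */ }
static void my_disconnect(struct xenvif *vif) { (void)vif; /* frontend closing    */ }
static void my_close(struct xenvif *vif)      { (void)vif; /* frontend gone       */ }

static const char *my_ids[] = { "vif", NULL };

static struct xenbus_ops my_ops = {
	.ids        = my_ids,
	.init       = my_init,
	.connect    = my_connect,
	.disconnect = my_disconnect,
	.close      = my_close,
};

static int xenbus_thread(void *arg)
{
	(void)arg;

	/* register() must precede start(); start() then blocks in the
	 * xenstore watch/poll loop, so it runs on a dedicated lcore. */
	rte_xenbus_backend_register(&my_ops, rte_lcore_count());
	return rte_xenbus_backend_start();
}

A control lcore could launch this with rte_eal_remote_launch(xenbus_thread, NULL, lcore_id), leaving the remaining lcores free for the RX/TX rings.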