Add an option to enable/disable sending and respecting PAUSE frames as defined in the 802.3x and 802.3z specifications. We will configure the link-level PAUSEs (as opposed to PFC). In simple words, Ethernet Flow Control relies on sending/receiving PAUSE (XOFF) MAC frames that indicate to the sender that the receiver's buffer is almost full. The idea is to avoid receive buffer overflow. When the receiver's buffer is freed, it sends an XON frame to indicate to the sender that it may transmit again. - Added a DPDK-specific command option to toggle the feature. - Sending PAUSEs is enabled by default. Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com>
344 lines
11 KiB
C++
344 lines
11 KiB
C++
/*
|
|
* This file is open source software, licensed to you under the terms
|
|
* of the Apache License, Version 2.0 (the "License"). See the NOTICE file
|
|
* distributed with this work for additional information regarding copyright
|
|
* ownership. You may not use this file except in compliance with the License.
|
|
*
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing,
|
|
* software distributed under the License is distributed on an
|
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
* KIND, either express or implied. See the License for the
|
|
* specific language governing permissions and limitations
|
|
* under the License.
|
|
*/
|
|
/*
|
|
* Copyright (C) 2014 Cloudius Systems, Ltd.
|
|
*/
|
|
|
|
#include "native-stack.hh"
|
|
#include "native-stack-impl.hh"
|
|
#include "net.hh"
|
|
#include "ip.hh"
|
|
#include "tcp-stack.hh"
|
|
#include "udp.hh"
|
|
#include "virtio.hh"
|
|
#include "dpdk.hh"
|
|
#include "xenfront.hh"
|
|
#include "proxy.hh"
|
|
#include "dhcp.hh"
|
|
#include <memory>
|
|
#include <queue>
|
|
#ifdef HAVE_OSV
|
|
#include <osv/firmware.hh>
|
|
#include <gnu/libc-version.h>
|
|
#endif
|
|
#include <sys/types.h>
|
|
#include <sys/stat.h>
|
|
#include <unistd.h>
|
|
|
|
namespace net {
|
|
|
|
// Outcome of the run-time Xen probe performed by is_xen().
enum class xen_info {
    // No Xen environment was detected.
    nonxen = 0,
    // /proc/xen or /dev/xen is present.
    userspace = 1,
    // Running on OSv with "Xen" reported as the firmware vendor.
    osv = 2
};
|
|
|
|
#ifdef HAVE_XEN
|
|
// Probes the environment for Xen.
//
// Returns xen_info::userspace when either /proc/xen or /dev/xen can be
// stat()ed. Otherwise, when built with HAVE_OSV, returns xen_info::osv if the
// libc release string is "OSv" and the firmware vendor is "Xen". In every
// other case the result is xen_info::nonxen.
static xen_info is_xen()
{
    struct stat st;
    // /dev/xen is only probed when the stat() of /proc/xen fails.
    if (stat("/proc/xen", &st) == 0 || stat("/dev/xen", &st) == 0) {
        return xen_info::userspace;
    }

#ifdef HAVE_OSV
    // The firmware vendor is only queried once the libc identifies as OSv.
    const bool on_osv = std::string("OSv") == gnu_get_libc_release();
    if (on_osv && osv::firmware_vendor() == "Xen") {
        return xen_info::osv;
    }
#endif

    return xen_info::nonxen;
}
|
|
#endif
|
|
|
|
// Creates the network device selected by the build flags and `opts`, gives
// every CPU a local queue (a hardware queue or a proxy) and, once all CPUs
// have their queue and the link is ready, creates the native stack on every
// CPU.
void create_native_net_device(boost::program_options::variables_map opts) {
    std::unique_ptr<device> dev;

    // The #ifdef'ed branches below form one if / else-if / else chain whose
    // final arm is the virtio device.
#ifdef HAVE_XEN
    const auto xen_kind = is_xen();
    if (xen_kind != xen_info::nonxen) {
        dev = xen::create_xenfront_net_device(opts, xen_kind == xen_info::userspace);
    } else
#endif
#ifdef HAVE_DPDK
    if (opts.count("dpdk-pmd")) {
        // A feature stays enabled unless its option is present and set to "off".
        auto enabled = [&opts] (const char* name) {
            return !(opts.count(name) && opts[name].as<std::string>() == "off");
        };
        // Hardcoded port index 0.
        // TODO: Inherit it from the opts
        dev = create_dpdk_net_device(0, smp::count, enabled("lro"), enabled("hw-fc"));
    } else
#endif
    dev = create_virtio_net_device(opts);

    auto queues_ready = std::make_shared<semaphore>(0);
    std::shared_ptr<device> sdev(dev.release());
    for (unsigned cpu = 0; cpu < smp::count; ++cpu) {
        smp::submit_to(cpu, [opts, sdev] {
            uint16_t qid = engine().cpu_id();
            if (qid >= sdev->hw_queues_count()) {
                // No hardware queue for this CPU: install a proxy device
                // created for index qid % hw_queues_count().
                auto master = qid % sdev->hw_queues_count();
                sdev->set_local_queue(create_proxy_net_device(master, sdev.get()));
                return;
            }

            auto queue = sdev->init_local_queue(opts, qid);
            // Every CPU without a hardware queue that maps onto this queue
            // gets weight 1; this CPU itself gets the configured weight.
            std::map<unsigned, float> weights;
            for (unsigned proxied = sdev->hw_queues_count() + qid % sdev->hw_queues_count();
                 proxied < smp::count;
                 proxied += sdev->hw_queues_count()) {
                weights[proxied] = 1;
            }
            weights[qid] = opts["hw-queue-weight"].as<float>();
            queue->configure_proxies(weights);
            sdev->set_local_queue(std::move(queue));
        }).then([queues_ready] {
            queues_ready->signal();
        });
    }

    // Wait for one signal per CPU, then for the link, then build the stacks.
    queues_ready->wait(smp::count).then([opts, sdev] {
        sdev->link_ready().then([opts, sdev] {
            for (unsigned cpu = 0; cpu < smp::count; ++cpu) {
                smp::submit_to(cpu, [opts, sdev] {
                    create_native_stack(opts, sdev);
                });
            }
        });
    });
}
|
|
|
|
// native_network_stack
|
|
class native_network_stack : public network_stack {
|
|
public:
|
|
static thread_local promise<std::unique_ptr<network_stack>> ready_promise;
|
|
private:
|
|
interface _netif;
|
|
ipv4 _inet;
|
|
bool _dhcp = false;
|
|
promise<> _config;
|
|
timer<> _timer;
|
|
|
|
future<> run_dhcp(bool is_renew = false, const dhcp::lease & res = dhcp::lease());
|
|
void on_dhcp(bool, const dhcp::lease &, bool);
|
|
void set_ipv4_packet_filter(ip_packet_filter* filter) {
|
|
_inet.set_packet_filter(filter);
|
|
}
|
|
using tcp4 = tcp<ipv4_traits>;
|
|
public:
|
|
explicit native_network_stack(boost::program_options::variables_map opts, std::shared_ptr<device> dev);
|
|
virtual server_socket listen(socket_address sa, listen_options opt) override;
|
|
virtual future<connected_socket> connect(socket_address sa) override;
|
|
virtual udp_channel make_udp_channel(ipv4_addr addr) override;
|
|
virtual future<> initialize() override;
|
|
static future<std::unique_ptr<network_stack>> create(boost::program_options::variables_map opts) {
|
|
if (engine().cpu_id() == 0) {
|
|
create_native_net_device(opts);
|
|
}
|
|
return ready_promise.get_future();
|
|
}
|
|
virtual bool has_per_core_namespace() override { return true; };
|
|
void arp_learn(ethernet_address l2, ipv4_address l3) {
|
|
_inet.learn(l2, l3);
|
|
}
|
|
friend class native_server_socket_impl<tcp4>;
|
|
};
|
|
|
|
thread_local promise<std::unique_ptr<network_stack>> native_network_stack::ready_promise;
|
|
|
|
// Creates a UDP channel for `addr` on this stack's UDP layer.
udp_channel
native_network_stack::make_udp_channel(ipv4_addr addr) {
    auto& udp_layer = _inet.get_udp();
    return udp_layer.make_channel(addr);
}
|
|
|
|
void
|
|
add_native_net_options_description(boost::program_options::options_description &opts) {
|
|
|
|
#ifdef HAVE_XEN
|
|
auto xen = is_xen();
|
|
if (xen != xen_info::nonxen) {
|
|
opts.add(xen::get_xenfront_net_options_description());
|
|
return;
|
|
}
|
|
#endif
|
|
opts.add(get_virtio_net_options_description());
|
|
#ifdef HAVE_DPDK
|
|
opts.add(get_dpdk_net_options_description());
|
|
#endif
|
|
}
|
|
|
|
// Builds the stack on top of `dev`.
//
// Reads "udpv4-queue-size" for the UDP queue size. DHCP is used only when
// "host-ipv4-addr", "gw-ipv4-addr" and "netmask-ipv4-addr" were all left at
// their defaults and "dhcp" is true; otherwise the three addresses are taken
// from the options.
native_network_stack::native_network_stack(boost::program_options::variables_map opts, std::shared_ptr<device> dev)
    : _netif(std::move(dev))
    , _inet(&_netif) {
    _inet.get_udp().set_queue_size(opts["udpv4-queue-size"].as<int>());
    _dhcp = opts["host-ipv4-addr"].defaulted()
        && opts["gw-ipv4-addr"].defaulted()
        && opts["netmask-ipv4-addr"].defaulted() && opts["dhcp"].as<bool>();
    if (!_dhcp) {
        // _dhcp is known to be false here, so the host address always comes
        // straight from the option (no "0" fallback is needed).
        _inet.set_host_address(ipv4_address(opts["host-ipv4-addr"].as<std::string>()));
        _inet.set_gw_address(ipv4_address(opts["gw-ipv4-addr"].as<std::string>()));
        _inet.set_netmask_address(ipv4_address(opts["netmask-ipv4-addr"].as<std::string>()));
    }
}
|
|
|
|
// Starts listening for TCP connections on the port carried by `sa`.
// Only AF_INET addresses are accepted (asserted).
server_socket
native_network_stack::listen(socket_address sa, listen_options opts) {
    assert(sa.as_posix_sockaddr().sa_family == AF_INET);
    // The port is stored in network byte order; convert to host order.
    const auto port = ntohs(sa.as_posix_sockaddr_in().sin_port);
    return tcpv4_listen(_inet.get_tcp(), port, opts);
}
|
|
|
|
// Opens a TCP connection to `sa`.
// Only AF_INET addresses are accepted (asserted).
future<connected_socket>
native_network_stack::connect(socket_address sa) {
    assert(sa.as_posix_sockaddr().sa_family == AF_INET);
    auto& tcp_layer = _inet.get_tcp();
    return tcpv4_connect(tcp_layer, sa);
}
|
|
|
|
using namespace std::chrono_literals;
|
|
|
|
// Runs one DHCP exchange: a renewal of `res` when `is_renew` is set, a fresh
// discovery otherwise. While the exchange is in flight the DHCP client's
// packet filter is installed on every CPU's stack; when it completes the
// filter is removed again and on_dhcp() is invoked with the outcome.
future<> native_network_stack::run_dhcp(bool is_renew, const dhcp::lease& res) {
    lw_shared_ptr<dhcp> client = make_lw_shared<dhcp>(_inet);

    // Hijack the ip-stack.
    for (unsigned cpu = 0; cpu < smp::count; ++cpu) {
        smp::submit_to(cpu, [client] {
            auto& stack = static_cast<native_network_stack&>(engine().net());
            stack.set_ipv4_packet_filter(client->get_ipv4_filter());
        });
    }

    net::dhcp::result_type exchange = is_renew ? client->renew(res) : client->discover();

    // `client` is captured so that it stays alive until the exchange is done.
    return exchange.then([this, client, is_renew](bool success, const dhcp::lease & lease) {
        // Give the ip-stack back on every CPU.
        for (unsigned cpu = 0; cpu < smp::count; ++cpu) {
            smp::submit_to(cpu, [] {
                auto& stack = static_cast<native_network_stack&>(engine().net());
                stack.set_ipv4_packet_filter(nullptr);
            });
        }
        this->on_dhcp(success, lease, is_renew);
    });
}
|
|
|
|
// Applies the outcome of a DHCP exchange to this CPU's stack.
//
// On success the host, gateway and netmask addresses are taken from `res`.
// For an initial (non-renew) exchange `_config` is resolved regardless of
// success. On CPU 0 the result is additionally forwarded to every other CPU
// and, on success, the renewal timer is armed with the lease time.
void native_network_stack::on_dhcp(bool success, const dhcp::lease & res, bool is_renew) {
    if (success) {
        _inet.set_host_address(res.ip);
        _inet.set_gw_address(res.gateway);
        _inet.set_netmask_address(res.netmask);
    }

    // Signal waiters.
    if (!is_renew) {
        _config.set_value();
    }

    if (engine().cpu_id() != 0) {
        return;
    }

    // And the other cpus, which, in the case of initial discovery,
    // will be waiting for us.
    for (unsigned cpu = 1; cpu < smp::count; ++cpu) {
        smp::submit_to(cpu, [success, res, is_renew]() {
            auto& stack = static_cast<native_network_stack&>(engine().net());
            stack.on_dhcp(success, res, is_renew);
        });
    }

    if (!success) {
        return;
    }

    // And set up to renew the lease later on.
    _timer.set_callback([this, res]() {
        _config = promise<>();
        run_dhcp(true, res);
    });
    _timer.arm(std::chrono::duration_cast<clock_type::duration>(res.lease_time));
}
|
|
|
|
// Initializes the base stack and then, when DHCP is enabled, waits for the
// address configuration to complete. Without DHCP the returned future is
// ready as soon as the base initialization is.
future<> native_network_stack::initialize() {
    return network_stack::initialize().then([this]() {
        if (_dhcp) {
            // Only run actual discover on main cpu.
            // All other cpus must simply wait for the main cpu to complete
            // and signal them.
            if (engine().cpu_id() == 0) {
                run_dhcp();
            }
            return _config.get_future();
        }
        return make_ready_future();
    });
}
|
|
|
|
// Submits the (l2, l3) address pair to the native stack of every CPU.
void arp_learn(ethernet_address l2, ipv4_address l3)
{
    for (unsigned cpu = 0; cpu < smp::count; ++cpu) {
        smp::submit_to(cpu, [l2, l3] {
            auto& stack = static_cast<native_network_stack&>(engine().net());
            stack.arp_learn(l2, l3);
        });
    }
}
|
|
|
|
// Builds a native_network_stack on top of `dev` and publishes it through the
// calling thread's ready_promise.
void create_native_stack(boost::program_options::variables_map opts, std::shared_ptr<device> dev) {
    std::unique_ptr<network_stack> stack =
        std::make_unique<native_network_stack>(opts, std::move(dev));
    native_network_stack::ready_promise.set_value(std::move(stack));
}
|
|
|
|
// Builds the command-line option group of the native networking stack and
// appends the device-specific options to it.
boost::program_options::options_description nns_options() {
    namespace po = boost::program_options;

    po::options_description opts("Native networking stack options");
    opts.add_options()
        ("tap-device", po::value<std::string>()->default_value("tap0"),
            "tap device to connect to")
        ("host-ipv4-addr", po::value<std::string>()->default_value("192.168.122.2"),
            "static IPv4 address to use")
        ("gw-ipv4-addr", po::value<std::string>()->default_value("192.168.122.1"),
            "static IPv4 gateway to use")
        ("netmask-ipv4-addr", po::value<std::string>()->default_value("255.255.255.0"),
            "static IPv4 netmask to use")
        ("udpv4-queue-size", po::value<int>()->default_value(ipv4_udp::default_queue_size),
            "Default size of the UDPv4 per-channel packet queue")
        ("dhcp", po::value<bool>()->default_value(true),
            "Use DHCP discovery")
        ("hw-queue-weight", po::value<float>()->default_value(1.0f),
            "Weighing of a hardware network queue relative to a software queue (0=no work, 1=equal share)")
#ifdef HAVE_DPDK
        ("dpdk-pmd", "Use DPDK PMD drivers")
#endif
        ("lro", po::value<std::string>()->default_value("on"),
            "Enable LRO")
        ;

    add_native_net_options_description(opts);
    return opts;
}
|
|
|
|
// Registers this stack under the name "native" together with its options
// and its factory function.
network_stack_registrator nns_registrator{"native", nns_options(), native_network_stack::create};
|
|
|
|
}
|