Files
scylladb/net/native-stack.cc

345 lines
12 KiB
C++

/*
* This file is open source software, licensed to you under the terms
* of the Apache License, Version 2.0 (the "License"). See the NOTICE file
* distributed with this work for additional information regarding copyright
* ownership. You may not use this file except in compliance with the License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (C) 2014 Cloudius Systems, Ltd.
*/
#include "native-stack.hh"
#include "native-stack-impl.hh"
#include "net.hh"
#include "ip.hh"
#include "tcp-stack.hh"
#include "udp.hh"
#include "virtio.hh"
#include "dpdk.hh"
#include "xenfront.hh"
#include "proxy.hh"
#include "dhcp.hh"
#include <memory>
#include <queue>
#ifdef HAVE_OSV
#include <osv/firmware.hh>
#include <gnu/libc-version.h>
#endif
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
namespace net {
enum class xen_info {
nonxen = 0,
userspace = 1,
osv = 2,
};
#ifdef HAVE_XEN
static xen_info is_xen()
{
struct stat buf;
if (!stat("/proc/xen", &buf) || !stat("/dev/xen", &buf)) {
return xen_info::userspace;
}
#ifdef HAVE_OSV
const char *str = gnu_get_libc_release();
if (std::string("OSv") != str) {
return xen_info::nonxen;
}
auto firmware = osv::firmware_vendor();
if (firmware == "Xen") {
return xen_info::osv;
}
#endif
return xen_info::nonxen;
}
#endif
void create_native_net_device(boost::program_options::variables_map opts) {
std::unique_ptr<device> dev;
#ifdef HAVE_XEN
auto xen = is_xen();
if (xen != xen_info::nonxen) {
dev = xen::create_xenfront_net_device(opts, xen == xen_info::userspace);
} else
#endif
#ifdef HAVE_DPDK
if (opts.count("dpdk-pmd")) {
// Hardcoded port index 0.
// TODO: Inherit it from the opts
dev = create_dpdk_net_device(0, smp::count,
!(opts.count("lro") && opts["lro"].as<std::string>() == "off"),
!(opts.count("hw-fc") && opts["hw-fc"].as<std::string>() == "off"));
} else
#endif
dev = create_virtio_net_device(opts);
auto sem = std::make_shared<semaphore>(0);
std::shared_ptr<device> sdev(dev.release());
for (unsigned i = 0; i < smp::count; i++) {
smp::submit_to(i, [opts, sdev] {
uint16_t qid = engine().cpu_id();
if (qid < sdev->hw_queues_count()) {
auto qp = sdev->init_local_queue(opts, qid);
std::map<unsigned, float> cpu_weights;
for (unsigned i = sdev->hw_queues_count() + qid % sdev->hw_queues_count(); i < smp::count; i+= sdev->hw_queues_count()) {
cpu_weights[i] = 1;
}
cpu_weights[qid] = opts["hw-queue-weight"].as<float>();
qp->configure_proxies(cpu_weights);
sdev->set_local_queue(std::move(qp));
} else {
auto master = qid % sdev->hw_queues_count();
sdev->set_local_queue(create_proxy_net_device(master, sdev.get()));
}
}).then([sem] {
sem->signal();
});
}
sem->wait(smp::count).then([opts, sdev] {
sdev->link_ready().then([opts, sdev] {
for (unsigned i = 0; i < smp::count; i++) {
smp::submit_to(i, [opts, sdev] {
create_native_stack(opts, sdev);
});
}
});
});
}
// native_network_stack
class native_network_stack : public network_stack {
public:
static thread_local promise<std::unique_ptr<network_stack>> ready_promise;
private:
interface _netif;
ipv4 _inet;
bool _dhcp = false;
promise<> _config;
timer<> _timer;
future<> run_dhcp(bool is_renew = false, const dhcp::lease & res = dhcp::lease());
void on_dhcp(bool, const dhcp::lease &, bool);
void set_ipv4_packet_filter(ip_packet_filter* filter) {
_inet.set_packet_filter(filter);
}
using tcp4 = tcp<ipv4_traits>;
public:
explicit native_network_stack(boost::program_options::variables_map opts, std::shared_ptr<device> dev);
virtual server_socket listen(socket_address sa, listen_options opt) override;
virtual future<connected_socket> connect(socket_address sa, socket_address local) override;
virtual udp_channel make_udp_channel(ipv4_addr addr) override;
virtual future<> initialize() override;
static future<std::unique_ptr<network_stack>> create(boost::program_options::variables_map opts) {
if (engine().cpu_id() == 0) {
create_native_net_device(opts);
}
return ready_promise.get_future();
}
virtual bool has_per_core_namespace() override { return true; };
void arp_learn(ethernet_address l2, ipv4_address l3) {
_inet.learn(l2, l3);
}
friend class native_server_socket_impl<tcp4>;
};
thread_local promise<std::unique_ptr<network_stack>> native_network_stack::ready_promise;
udp_channel
native_network_stack::make_udp_channel(ipv4_addr addr) {
return _inet.get_udp().make_channel(addr);
}
void
add_native_net_options_description(boost::program_options::options_description &opts) {
#ifdef HAVE_XEN
auto xen = is_xen();
if (xen != xen_info::nonxen) {
opts.add(xen::get_xenfront_net_options_description());
return;
}
#endif
opts.add(get_virtio_net_options_description());
#ifdef HAVE_DPDK
opts.add(get_dpdk_net_options_description());
#endif
}
native_network_stack::native_network_stack(boost::program_options::variables_map opts, std::shared_ptr<device> dev)
: _netif(std::move(dev))
, _inet(&_netif) {
_inet.get_udp().set_queue_size(opts["udpv4-queue-size"].as<int>());
_dhcp = opts["host-ipv4-addr"].defaulted()
&& opts["gw-ipv4-addr"].defaulted()
&& opts["netmask-ipv4-addr"].defaulted() && opts["dhcp"].as<bool>();
if (!_dhcp) {
_inet.set_host_address(ipv4_address(_dhcp ? 0 : opts["host-ipv4-addr"].as<std::string>()));
_inet.set_gw_address(ipv4_address(opts["gw-ipv4-addr"].as<std::string>()));
_inet.set_netmask_address(ipv4_address(opts["netmask-ipv4-addr"].as<std::string>()));
}
}
server_socket
native_network_stack::listen(socket_address sa, listen_options opts) {
assert(sa.as_posix_sockaddr().sa_family == AF_INET);
return tcpv4_listen(_inet.get_tcp(), ntohs(sa.as_posix_sockaddr_in().sin_port), opts);
}
future<connected_socket>
native_network_stack::connect(socket_address sa, socket_address local) {
// FIXME: local is ignored since native stack does not support multiple IPs yet
assert(sa.as_posix_sockaddr().sa_family == AF_INET);
return tcpv4_connect(_inet.get_tcp(), sa);
}
using namespace std::chrono_literals;
future<> native_network_stack::run_dhcp(bool is_renew, const dhcp::lease& res) {
lw_shared_ptr<dhcp> d = make_lw_shared<dhcp>(_inet);
// Hijack the ip-stack.
for (unsigned i = 0; i < smp::count; i++) {
smp::submit_to(i, [d] {
auto & ns = static_cast<native_network_stack&>(engine().net());
ns.set_ipv4_packet_filter(d->get_ipv4_filter());
});
}
net::dhcp::result_type fut = is_renew ? d->renew(res) : d->discover();
return fut.then([this, d, is_renew](bool success, const dhcp::lease & res) {
for (unsigned i = 0; i < smp::count; i++) {
smp::submit_to(i, [] {
auto & ns = static_cast<native_network_stack&>(engine().net());
ns.set_ipv4_packet_filter(nullptr);
});
}
on_dhcp(success, res, is_renew);
});
}
void native_network_stack::on_dhcp(bool success, const dhcp::lease & res, bool is_renew) {
if (success) {
_inet.set_host_address(res.ip);
_inet.set_gw_address(res.gateway);
_inet.set_netmask_address(res.netmask);
}
// Signal waiters.
if (!is_renew) {
_config.set_value();
}
if (engine().cpu_id() == 0) {
// And the other cpus, which, in the case of initial discovery,
// will be waiting for us.
for (unsigned i = 1; i < smp::count; i++) {
smp::submit_to(i, [success, res, is_renew]() {
auto & ns = static_cast<native_network_stack&>(engine().net());
ns.on_dhcp(success, res, is_renew);
});
}
if (success) {
// And set up to renew the lease later on.
_timer.set_callback(
[this, res]() {
_config = promise<>();
run_dhcp(true, res);
});
_timer.arm(
std::chrono::duration_cast<clock_type::duration>(
res.lease_time));
}
}
}
future<> native_network_stack::initialize() {
return network_stack::initialize().then([this]() {
if (!_dhcp) {
return make_ready_future();
}
// Only run actual discover on main cpu.
// All other cpus must simply for main thread to complete and signal them.
if (engine().cpu_id() == 0) {
run_dhcp();
}
return _config.get_future();
});
}
void arp_learn(ethernet_address l2, ipv4_address l3)
{
for (unsigned i = 0; i < smp::count; i++) {
smp::submit_to(i, [l2, l3] {
auto & ns = static_cast<native_network_stack&>(engine().net());
ns.arp_learn(l2, l3);
});
}
}
void create_native_stack(boost::program_options::variables_map opts, std::shared_ptr<device> dev) {
native_network_stack::ready_promise.set_value(std::unique_ptr<network_stack>(std::make_unique<native_network_stack>(opts, std::move(dev))));
}
boost::program_options::options_description nns_options() {
boost::program_options::options_description opts(
"Native networking stack options");
opts.add_options()
("tap-device",
boost::program_options::value<std::string>()->default_value("tap0"),
"tap device to connect to")
("host-ipv4-addr",
boost::program_options::value<std::string>()->default_value("192.168.122.2"),
"static IPv4 address to use")
("gw-ipv4-addr",
boost::program_options::value<std::string>()->default_value("192.168.122.1"),
"static IPv4 gateway to use")
("netmask-ipv4-addr",
boost::program_options::value<std::string>()->default_value("255.255.255.0"),
"static IPv4 netmask to use")
("udpv4-queue-size",
boost::program_options::value<int>()->default_value(ipv4_udp::default_queue_size),
"Default size of the UDPv4 per-channel packet queue")
("dhcp",
boost::program_options::value<bool>()->default_value(true),
"Use DHCP discovery")
("hw-queue-weight",
boost::program_options::value<float>()->default_value(1.0f),
"Weighing of a hardware network queue relative to a software queue (0=no work, 1=equal share)")
#ifdef HAVE_DPDK
("dpdk-pmd", "Use DPDK PMD drivers")
#endif
("lro",
boost::program_options::value<std::string>()->default_value("on"),
"Enable LRO")
;
add_native_net_options_description(opts);
return opts;
}
network_stack_registrator nns_registrator{
"native", nns_options(), native_network_stack::create
};
}