scylladb/net/virtio.cc
Gleb Natapov d77ee625bd virtio: signal availability of a virtio buffer in a vring after sending packet
Currently there is an implicit unbounded queue between the virtio driver
and the networking stack where packets may accumulate if they are received
faster than the networking stack can handle them. The queuing happens because
virtio buffer availability is signaled immediately after the received buffer's
promise is fulfilled, but promise fulfilment does not mean that the buffer has
been processed, only that the task that will process it has been placed on a
task queue.

The patch fixes the problem by making a virtio buffer available only after
the previous buffer's completion task has executed. This bounds the
aforementioned implicit queue between the virtio driver and the networking
stack by the virtio ring size.
2014-11-04 15:19:27 +02:00
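The pattern can be sketched as follows (a hypothetical helper, assuming Seastar's semaphore, promise and future, plus the vring types defined in this file; the real logic lives in rxq::prepare_buffers() and vring::post() below):

future<> post_one_rx_buffer(semaphore& available, vring& ring, vring::buffer b) {
// One semaphore unit per ring slot: wait for a free descriptor.
return available.wait(1).then([&available, &ring, b] {
vring::buffer_chain bc;
bc.push_back(b);
bc.completed.get_future().then([&available] (size_t) {
// This continuation is the buffer's completion task. Returning the
// slot only here, after the task has actually run, bounds the
// implicit driver-to-stack queue by the virtio ring size.
available.signal(1);
});
ring.post(&bc, &bc + 1);
});
}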


/*
* Copyright (C) 2014 Cloudius Systems, Ltd.
*/
#include "virtio.hh"
#include "core/posix.hh"
#include "core/future-util.hh"
#include "core/vla.hh"
#include "virtio-interface.hh"
#include "core/reactor.hh"
#include "core/stream.hh"
#include "core/circular_buffer.hh"
#include "core/align.hh"
#include "util/function_input_iterator.hh"
#include "util/transform_iterator.hh"
#include <atomic>
#include <vector>
#include <queue>
#include <fcntl.h>
#include <linux/vhost.h>
#include <linux/if_tun.h>
#include "ip.hh"
#include "const.hh"
#ifdef HAVE_OSV
#include <osv/virtio-assign.hh>
#endif
using namespace net;
/* The virtio_notifier class determines how to do host-to-guest and guest-to-
* host notifications. We have two different implementations - one for vhost
* (where both notifications occur through eventfds) and one for an assigned
* virtio device from OSv.
*/
class virtio_notifier {
public:
// Notify the host
virtual void notify() = 0;
// Wait for the host to notify us
virtual future<> wait() = 0;
// Do whatever it takes to wake wait(). A notifier does not need to
// implement this function if wait() waits for an external event which is
// generated by an external process (e.g., virtio_notifier_vhost doesn't
// need to implement this).
virtual void wake_wait() {
abort();
}
virtual ~virtio_notifier() {
}
};
class virtio_notifier_vhost : public virtio_notifier {
private:
readable_eventfd _notified;
writeable_eventfd _kick;
public:
virtual void notify() override {
_kick.signal(1);
}
virtual future<> wait() override {
// convert _notified.wait(), a future<size_t>, to a future<>:
return _notified.wait().then([this] (size_t ignore) {
return make_ready_future<>();
});
}
virtio_notifier_vhost(readable_eventfd &&notified,
writeable_eventfd &&kick)
: _notified(std::move(notified))
, _kick(std::move(kick)) {}
};
#ifdef HAVE_OSV
class virtio_notifier_osv : public virtio_notifier {
private:
// TODO: In the future, don't use an eventfd at all. Change Seastar's event
// loop to use wait_until; then even a non-atomic variable is enough if the
// interrupt runs on the same CPU. Alternatively, avoid interrupts entirely
// and use polling.
readable_eventfd _notified;
writeable_eventfd _notified_write;
uint16_t _q_index;
osv::assigned_virtio &_virtio;
public:
virtual void notify() override {
_virtio.kick(_q_index);
}
virtual future<> wait() override {
return _notified.wait().then([this] (size_t ignore) {
return make_ready_future<>();
});
}
virtual void wake_wait() override {
_notified_write.signal(1);
}
virtio_notifier_osv(osv::assigned_virtio &virtio, uint16_t q_index)
: _notified_write(_notified.write_side())
, _q_index(q_index)
, _virtio(virtio)
{
}
};
#endif
using phys = uint64_t;
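// vring implements the virtio ring: a descriptor table, a guest-to-host
// avail ring and a host-to-guest used ring, all living in the physically
// contiguous memory described by config.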
class vring {
public:
struct config {
char* descs;
char* avail;
char* used;
unsigned size;
bool event_index;
bool indirect;
bool mergable_buffers;
};
struct buffer {
phys addr;
uint32_t len;
bool writeable;
};
struct buffer_chain : std::vector<buffer> {
promise<size_t> completed;
};
private:
class desc {
public:
struct flags {
// This marks a buffer as continuing via the next field.
uint16_t has_next : 1;
// This marks a buffer as write-only (otherwise read-only).
uint16_t writeable : 1;
// This means the buffer contains a list of buffer descriptors.
uint16_t indirect : 1;
};
phys get_paddr();
uint32_t get_len() { return _len; }
uint16_t next_idx() { return _next; }
phys _paddr;
uint32_t _len;
flags _flags;
uint16_t _next;
};
// Guest to host
struct avail_layout {
struct flags {
// Mark that we do not need an interrupt when the host consumes a
// descriptor from the ring. Unreliable, so it is merely an optimization.
uint16_t no_interrupts : 1;
};
std::atomic<uint16_t> _flags;
// Where we put the next descriptor
std::atomic<uint16_t> _idx;
// There can be no more entries than the queue size read from the device
uint16_t _ring[];
// used_event is an optimization: request an interrupt from the host only
// when the used index reaches this value.
// This field is located after the variable-length _ring array, so it cannot
// be declared inside the struct; it is accessed through a separately
// computed pointer instead.
//std::atomic<uint16_t> used_event;
};
struct used_elem {
// Index of start of used _desc chain. (uint32_t for padding reasons)
uint32_t _id;
// Total length of the descriptor chain which was used (written to)
uint32_t _len;
};
// Host to guest
struct used_layout {
enum {
// The Host advises the Guest: don't kick me when
// you add a buffer. It's unreliable, so it's simply an
// optimization. The Guest will still kick if it runs out of buffers.
no_notify = 1
};
// Using std::atomic since it is being changed by the host
std::atomic<uint16_t> _flags;
// Using std::atomic in order to have memory barriers for it
std::atomic<uint16_t> _idx;
used_elem _used_elements[];
// avail_event is an optimization: kick the host only when the avail index
// reaches this value.
// This field is located after the variable-length ring array, so it cannot
// be declared inside the struct; it is accessed through a separately
// computed pointer instead.
//std::atomic<uint16_t> avail_event;
};
struct avail {
explicit avail(config conf);
avail_layout* _shared;
uint16_t _head = 0;
uint16_t _avail_added_since_kick = 0;
};
struct used {
explicit used(config conf);
used_layout* _shared;
uint16_t _tail = 0;
};
private:
config _config;
std::unique_ptr<virtio_notifier> _notifier;
std::unique_ptr<promise<size_t>[]> _completions;
desc* _descs;
avail _avail;
used _used;
std::atomic<uint16_t>* _avail_event;
std::atomic<uint16_t>* _used_event;
semaphore _available_descriptors = { 0 };
int _free_head = -1;
int _free_last = -1;
public:
explicit vring(config conf);
void set_notifier(std::unique_ptr<virtio_notifier> notifier) {
_notifier = std::move(notifier);
}
const config& getconfig() {
return _config;
}
void wake_notifier_wait() {
_notifier->wake_wait();
}
// start the queue
void run();
// complete any buffers returned from the host
void complete();
// wait for the used ring to have at least @nr buffers
future<> on_used(size_t nr);
// Total number of descriptors in ring
int size() { return _config.size; }
template <typename Iterator>
void post(Iterator begin, Iterator end);
semaphore& available_descriptors() { return _available_descriptors; }
private:
// Let host know about interrupt delivery
void disable_interrupts() {
if (!_config.event_index) {
_avail._shared->_flags.store(VRING_AVAIL_F_NO_INTERRUPT, std::memory_order_relaxed);
}
}
// Return "true" if there are pending buffers in the queue
bool enable_interrupts() {
auto tail = _used._tail;
if (!_config.event_index) {
_avail._shared->_flags.store(0, std::memory_order_relaxed);
} else {
_used_event->store(tail, std::memory_order_relaxed);
}
// We need to set the host notification flag then check if the queue is
// empty. The order is important. Use memory fence to make sure other
// cores see the same order.
std::atomic_thread_fence(std::memory_order_seq_cst);
// Any pending buffers
auto used_head = _used._shared->_idx.load(std::memory_order_relaxed);
return used_head != tail;
}
bool interrupts_disabled() {
return (_avail._shared->_flags.load(std::memory_order_relaxed) & VRING_AVAIL_F_NO_INTERRUPT) != 0;
}
bool notifications_disabled() {
return (_used._shared->_flags.load(std::memory_order_relaxed) & VRING_USED_F_NO_NOTIFY) != 0;
}
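// Notify the host about newly added avail entries. With event-index
// negotiated, the host publishes the avail index at which it wants the
// next kick (avail_event); we kick only if that index falls inside the
// batch added since the last kick (the standard vring_need_event() test,
// computed modulo 2^16).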
void kick() {
bool need_kick = true;
// Make sure we see the fresh _idx value written before kick.
std::atomic_thread_fence(std::memory_order_seq_cst);
if (_config.event_index) {
uint16_t avail_idx = _avail._shared->_idx.load(std::memory_order_relaxed);
uint16_t avail_event = _avail_event->load(std::memory_order_relaxed);
need_kick = (uint16_t)(avail_idx - avail_event - 1) < _avail._avail_added_since_kick;
} else {
if (notifications_disabled())
return;
}
if (need_kick || (_avail._avail_added_since_kick >= (uint16_t)(~0) / 2)) {
_notifier->notify();
_avail._avail_added_since_kick = 0;
}
}
void do_complete();
size_t mask() { return size() - 1; }
size_t masked(size_t idx) { return idx & mask(); }
size_t available();
unsigned allocate_desc();
void setup();
};
vring::avail::avail(config conf)
: _shared(reinterpret_cast<avail_layout*>(conf.avail)) {
}
vring::used::used(config conf)
: _shared(reinterpret_cast<used_layout*>(conf.used)) {
}
inline
unsigned vring::allocate_desc() {
assert(_free_head != -1);
auto desc = _free_head;
if (desc == _free_last) {
_free_last = _free_head = -1;
} else {
_free_head = _descs[desc]._next;
}
return desc;
}
vring::vring(config conf)
: _config(conf)
, _completions(new promise<size_t>[_config.size])
, _descs(reinterpret_cast<desc*>(conf.descs))
, _avail(conf)
, _used(conf)
, _avail_event(reinterpret_cast<std::atomic<uint16_t>*>(&_used._shared->_used_elements[conf.size]))
, _used_event(reinterpret_cast<std::atomic<uint16_t>*>(&_avail._shared->_ring[conf.size]))
{
setup();
}
void vring::setup() {
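// Link every descriptor into the initial free list and publish the full
// ring size on the available-descriptors semaphore.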
for (unsigned i = 0; i < _config.size; ++i) {
_descs[i]._next = i + 1;
}
_free_head = 0;
_free_last = _config.size - 1;
_available_descriptors.signal(_config.size);
}
void vring::run() {
complete();
}
template <typename Iterator>
void vring::post(Iterator begin, Iterator end) {
// Note: buffer_chain here is any container of buffer, not
// necessarily vector<buffer>.
// buffer_chain should also include a promise<size_t> completed
// member, signifying the action to take when the request
// completes.
using buffer_chain = decltype(*begin);
std::for_each(begin, end, [this] (buffer_chain bc) {
desc pseudo_head = {};
desc* prev = &pseudo_head;
for (auto i = bc.begin(); i != bc.end(); ++i) {
unsigned desc_idx = allocate_desc();
prev->_flags.has_next = true;
prev->_next = desc_idx;
desc &d = _descs[desc_idx];
d._flags = {};
auto b = *i;
d._flags.writeable = b.writeable;
d._paddr = b.addr;
d._len = b.len;
prev = &d;
}
auto desc_head = pseudo_head._next;
_completions[desc_head] = std::move(bc.completed);
_avail._shared->_ring[masked(_avail._head++)] = desc_head;
_avail._avail_added_since_kick++;
});
_avail._shared->_idx.store(_avail._head, std::memory_order_release);
kick();
do_complete();
}
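// Reap the used ring: fulfil each completed chain's promise with the
// number of bytes the host wrote, return its descriptors to the free list,
// and re-enable interrupts, looping again if more completions raced in.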
void vring::do_complete() {
do {
disable_interrupts();
auto used_head = _used._shared->_idx.load(std::memory_order_acquire);
while (used_head != _used._tail) {
auto ue = _used._shared->_used_elements[masked(_used._tail++)];
_completions[ue._id].set_value(ue._len);
auto id = ue._id;
if (_free_last != -1) {
_descs[_free_last]._next = id;
} else {
_free_head = id;
}
// Walk the chain to its last descriptor so the whole chain can be linked
// back into the free list.
while (_descs[id]._flags.has_next) {
id = _descs[id]._next;
}
_free_last = id;
}
} while (enable_interrupts());
}
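// complete() re-arms itself: every host notification runs another
// do_complete() pass. The future returned by then() is not retained;
// the continuation chain keeps the loop alive.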
void vring::complete() {
do_complete();
_notifier->wait().then([this] {
complete();
});
}
class virtio_net_device : public net::device {
protected:
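// Per-packet header defined by the virtio-net specification; net_hdr_mrg
// is the extended form used when VIRTIO_NET_F_MRG_RXBUF is negotiated.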
struct net_hdr {
uint8_t needs_csum : 1;
uint8_t flags_reserved : 7;
enum { gso_none = 0, gso_tcpv4 = 1, gso_udp = 3, gso_tcpv6 = 4, gso_ecn = 0x80 };
uint8_t gso_type;
uint16_t hdr_len;
uint16_t gso_size;
uint16_t csum_start;
uint16_t csum_offset;
};
struct net_hdr_mrg : net_hdr {
uint16_t num_buffers;
};
class txq {
virtio_net_device& _dev;
vring _ring;
public:
txq(virtio_net_device& dev, vring::config config);
void set_notifier(std::unique_ptr<virtio_notifier> notifier) {
_ring.set_notifier(std::move(notifier));
}
const vring::config& getconfig() {
return _ring.getconfig();
}
void wake_notifier_wait() {
_ring.wake_notifier_wait();
}
void run() { _ring.run(); }
future<> post(packet p);
};
class rxq {
virtio_net_device& _dev;
vring _ring;
unsigned _remaining_buffers = 0;
std::vector<fragment> _fragments;
std::vector<std::unique_ptr<char[], free_deleter>> _deleters;
public:
rxq(virtio_net_device& dev, vring::config config);
void set_notifier(std::unique_ptr<virtio_notifier> notifier) {
_ring.set_notifier(std::move(notifier));
}
const vring::config& getconfig() {
return _ring.getconfig();
}
void run() {
keep_doing([this] { return prepare_buffers(); });
_ring.run();
}
void wake_notifier_wait() {
_ring.wake_notifier_wait();
}
private:
future<> prepare_buffers();
};
protected:
size_t _header_len;
boost::program_options::variables_map _opts;
std::unique_ptr<char[], free_deleter> _txq_storage;
std::unique_ptr<char[], free_deleter> _rxq_storage;
net::hw_features _hw_features;
uint64_t _features;
txq _txq;
rxq _rxq;
stream<packet> _rx_stream;
protected:
uint64_t setup_features();
vring::config txq_config(size_t txq_ring_size);
vring::config rxq_config(size_t rxq_ring_size);
void common_config(vring::config& r);
void queue_rx_packet(packet p);
size_t vring_storage_size(size_t ring_size);
public:
explicit virtio_net_device(boost::program_options::variables_map opts, size_t rx_ring_size, size_t tx_ring_size);
virtual subscription<packet> receive(std::function<future<> (packet)> next) override;
virtual future<> send(packet p) override;
virtual net::hw_features hw_features() override;
virtual phys virt_to_phys(void* p) {
return reinterpret_cast<uintptr_t>(p);
}
};
virtio_net_device::txq::txq(virtio_net_device& dev, vring::config config)
: _dev(dev), _ring(config) {
}
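// Prepend the virtio-net header (filling in checksum/TSO offload metadata
// as negotiated), wait until enough descriptors are free, and post the
// packet's fragments as a single descriptor chain. The completion
// continuation destroys the packet and releases the descriptors.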
future<>
virtio_net_device::txq::post(packet p) {
net_hdr_mrg vhdr = {};
// Handle TCP checksum offload
auto oi = p.offload_info();
if (_dev.hw_features().tx_csum_offload) {
auto eth_hdr_len = sizeof(eth_hdr);
auto ip_hdr_len = oi.ip_hdr_len;
auto mtu = _dev.hw_features().mtu;
if (oi.protocol == ip_protocol_num::tcp) {
auto tcp_hdr_len = oi.tcp_hdr_len;
vhdr.needs_csum = 1;
vhdr.csum_start = eth_hdr_len + ip_hdr_len;
// The TCP checksum field's offset within the TCP header is 16 bytes
vhdr.csum_offset = 16;
if (_dev.hw_features().tx_tso && p.len() > mtu + eth_hdr_len) {
// IPv4 TCP TSO
vhdr.gso_type = net_hdr::gso_tcpv4;
// Sum of Ethernet, IP and TCP header size
vhdr.hdr_len = eth_hdr_len + ip_hdr_len + tcp_hdr_len;
// Maximum segment size of packet after the offload
vhdr.gso_size = mtu - ip_hdr_len - tcp_hdr_len;
}
} else if (oi.protocol == ip_protocol_num::udp) {
auto udp_hdr_len = oi.udp_hdr_len;
vhdr.needs_csum = 1;
vhdr.csum_start = eth_hdr_len + ip_hdr_len;
// The UDP checksum field's offset within the UDP header is 6 bytes
vhdr.csum_offset = 6;
if (_dev.hw_features().tx_ufo && p.len() > mtu + eth_hdr_len) {
vhdr.gso_type = net_hdr::gso_udp;
vhdr.hdr_len = eth_hdr_len + ip_hdr_len + udp_hdr_len;
vhdr.gso_size = mtu - ip_hdr_len - udp_hdr_len;
}
}
}
// prepend virtio-net header
packet q = packet(fragment{reinterpret_cast<char*>(&vhdr), _dev._header_len},
std::move(p));
auto nr_frags = q.nr_frags();
return _ring.available_descriptors().wait(nr_frags).then([this, nr_frags, p = std::move(q)] () mutable {
static auto fragment_to_buffer = [this] (fragment f) {
vring::buffer b;
b.addr = _dev.virt_to_phys(f.base);
b.len = f.size;
b.writeable = false;
return b;
};
struct packet_as_buffer_chain {
fragment* start;
fragment* finish;
promise<size_t> completed;
auto begin() {
return make_transform_iterator(start, fragment_to_buffer);
}
auto end() {
return make_transform_iterator(finish, fragment_to_buffer);
}
} vbc[1] { { p.fragments().begin(), p.fragments().end() } };
// schedule packet destruction
vbc[0].completed.get_future().then([this, nr_frags, p = std::move(p)] (size_t) {
_ring.available_descriptors().signal(nr_frags);
});
_ring.post(std::begin(vbc), std::end(vbc));
});
}
virtio_net_device::rxq::rxq(virtio_net_device& dev, vring::config config)
: _dev(dev), _ring(config) {
}
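// Replenish the receive ring: allocate one 4096-byte buffer per free
// descriptor and post them all. Each buffer's completion continuation
// reassembles mergeable receive buffers into a packet and hands it to the
// stack; only then are the descriptors returned, which is what bounds the
// implicit driver-to-stack queue described in the commit message above.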
future<>
virtio_net_device::rxq::prepare_buffers() {
auto& available = _ring.available_descriptors();
return available.wait(1).then([this, &available] {
unsigned count = 1;
auto opportunistic = available.current();
if (available.try_wait(opportunistic)) {
count += opportunistic;
}
auto make_buffer_chain = [this] {
struct single_buffer_and_completion : std::array<vring::buffer, 1> {
promise<size_t> completed;
} bc;
std::unique_ptr<char[], free_deleter> buf(reinterpret_cast<char*>(malloc(4096)));
vring::buffer& b = bc[0];
b.addr = _dev.virt_to_phys(buf.get());
b.len = 4096;
b.writeable = true;
bc.completed.get_future().then([this, buf = std::move(buf)] (size_t len) mutable {
auto frag_buf = buf.get();
auto frag_len = len;
// First buffer
if (_remaining_buffers == 0) {
auto hdr = reinterpret_cast<net_hdr_mrg*>(frag_buf);
assert(hdr->num_buffers >= 1);
// TODO: special-case for num_buffers == 1
_remaining_buffers = hdr->num_buffers;
frag_buf += _dev._header_len;
frag_len -= _dev._header_len;
_fragments.clear();
_deleters.clear();
}
// Append current buffer
_fragments.emplace_back(fragment{frag_buf, frag_len});
_deleters.push_back(std::move(buf));
_remaining_buffers--;
// Last buffer
if (_remaining_buffers == 0) {
deleter del;
if (_deleters.size() == 1) {
del = make_free_deleter(_deleters[0].release());
_deleters.clear();
} else {
del = make_deleter(deleter(), [deleters = std::move(_deleters)] {});
}
packet p(_fragments.begin(), _fragments.end(), std::move(del));
_dev.queue_rx_packet(std::move(p));
_ring.available_descriptors().signal(_fragments.size());
}
});
return bc;
};
auto start = make_function_input_iterator(make_buffer_chain, 0U);
auto finish = make_function_input_iterator(make_buffer_chain, count);
_ring.post(start, finish);
});
}
// Allocate and zero-initialize a buffer which is page-aligned and can be
// used for virt_to_phys (i.e., physically contiguous).
static std::unique_ptr<char[], free_deleter> virtio_buffer(size_t size) {
void* ret;
auto r = posix_memalign(&ret, 4096, size);
assert(r == 0);
bzero(ret, size);
return std::unique_ptr<char[], free_deleter>(reinterpret_cast<char*>(ret));
}
virtio_net_device::virtio_net_device(boost::program_options::variables_map opts,
size_t rx_ring_size, size_t tx_ring_size)
: _opts(opts)
, _txq_storage(virtio_buffer(vring_storage_size(tx_ring_size)))
, _rxq_storage(virtio_buffer(vring_storage_size(rx_ring_size)))
, _features(setup_features())
, _txq(*this, txq_config(tx_ring_size))
, _rxq(*this, rxq_config(rx_ring_size))
, _rx_stream() {
_rx_stream.started();
}
uint64_t virtio_net_device::setup_features() {
int64_t seastar_supported_features = VIRTIO_RING_F_INDIRECT_DESC | VIRTIO_NET_F_MRG_RXBUF;
if (!(_opts.count("event-index") && _opts["event-index"].as<std::string>() == "off")) {
seastar_supported_features |= VIRTIO_RING_F_EVENT_IDX;
}
if (!(_opts.count("csum-offload") && _opts["csum-offload"].as<std::string>() == "off")) {
seastar_supported_features |= VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM;
_hw_features.tx_csum_offload = true;
_hw_features.rx_csum_offload = true;
} else {
_hw_features.tx_csum_offload = false;
_hw_features.rx_csum_offload = false;
}
if (!(_opts.count("tso") && _opts["tso"].as<std::string>() == "off")) {
seastar_supported_features |= VIRTIO_NET_F_HOST_TSO4;
seastar_supported_features |= VIRTIO_NET_F_GUEST_TSO4;
_hw_features.tx_tso = true;
} else {
_hw_features.tx_tso = false;
}
if (!(_opts.count("ufo") && _opts["ufo"].as<std::string>() == "off")) {
seastar_supported_features |= VIRTIO_NET_F_HOST_UFO;
seastar_supported_features |= VIRTIO_NET_F_GUEST_UFO;
_hw_features.tx_ufo = true;
} else {
_hw_features.tx_ufo = false;
}
seastar_supported_features |= VIRTIO_NET_F_MAC;
return seastar_supported_features;
}
size_t virtio_net_device::vring_storage_size(size_t ring_size) {
// overestimate, but not by much.
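// 16 + 2 + 8 is one 16-byte descriptor, one 2-byte avail-ring entry and
// one 8-byte used element per slot; the three extra pages cover the ring
// headers, the event-index fields, and the page alignment of the used area.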
return 3 * 4096 + ring_size * (16 + 2 + 8);
}
void virtio_net_device::common_config(vring::config& r) {
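// Carve the three areas out of one contiguous buffer: the avail ring
// starts right after the 16-byte descriptors; "2 * r.size + 6" is the
// avail ring entries plus the flags, idx and used_event fields; the used
// ring starts on the next page boundary.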
r.avail = r.descs + 16 * r.size;
r.used = align_up(r.avail + 2 * r.size + 6, 4096);
r.event_index = (_features & VIRTIO_RING_F_EVENT_IDX) != 0;
r.indirect = false;
}
vring::config virtio_net_device::txq_config(size_t tx_ring_size) {
vring::config r;
r.size = tx_ring_size;
r.descs = _txq_storage.get();
r.mergable_buffers = false;
common_config(r);
return r;
}
vring::config virtio_net_device::rxq_config(size_t rx_ring_size) {
vring::config r;
r.size = rx_ring_size;
r.descs = _rxq_storage.get();
r.mergable_buffers = true;
common_config(r);
return r;
}
subscription<packet>
virtio_net_device::receive(std::function<future<> (packet)> next) {
_rxq.run();
return _rx_stream.listen(std::move(next));
}
future<>
virtio_net_device::send(packet p) {
return _txq.post(std::move(p));
}
void virtio_net_device::queue_rx_packet(packet p) {
_rx_stream.produce(std::move(p));
}
net::hw_features virtio_net_device::hw_features() {
return _hw_features;
}
boost::program_options::options_description
get_virtio_net_options_description()
{
boost::program_options::options_description opts(
"Virtio net options");
opts.add_options()
("event-index",
boost::program_options::value<std::string>()->default_value("on"),
"Enable event-index feature (on / off)")
("csum-offload",
boost::program_options::value<std::string>()->default_value("on"),
"Enable checksum offload feature (on / off)")
("tso",
boost::program_options::value<std::string>()->default_value("on"),
"Enable TCP segment offload feature (on / off)")
("ufo",
boost::program_options::value<std::string>()->default_value("on"),
"Enable UDP fragmentation offload feature (on / off)")
("virtio-ring-size",
boost::program_options::value<unsigned>()->default_value(256),
"Virtio ring size (must be power-of-two)")
;
return opts;
}
class virtio_net_device_vhost : public virtio_net_device {
private:
// The vhost file descriptor needs to remain open throughout the life of
// this driver: as soon as we close it, vhost stops servicing us.
file_desc _vhost_fd;
public:
virtio_net_device_vhost(sstring tap_device, boost::program_options::variables_map opts);
virtual ethernet_address hw_address() override;
};
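// vhost provides no MAC address of its own, so report a fixed, locally
// administered placeholder.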
ethernet_address virtio_net_device_vhost::hw_address() {
return {{{ 0x12, 0x23, 0x34, 0x56, 0x67, 0x78 }}};
}
static size_t config_ring_size(boost::program_options::variables_map &opts) {
if (opts.count("event-index")) {
return opts["virtio-ring-size"].as<unsigned>();
} else {
return 256;
}
}
virtio_net_device_vhost::virtio_net_device_vhost(sstring tap_device,
boost::program_options::variables_map opts)
: virtio_net_device(opts, config_ring_size(opts), config_ring_size(opts))
, _vhost_fd(file_desc::open("/dev/vhost-net", O_RDWR))
{
int64_t vhost_supported_features;
_vhost_fd.ioctl(VHOST_GET_FEATURES, vhost_supported_features);
vhost_supported_features &= _features;
_vhost_fd.ioctl(VHOST_SET_FEATURES, vhost_supported_features);
if (vhost_supported_features & VIRTIO_NET_F_MRG_RXBUF) {
_header_len = sizeof(net_hdr_mrg);
} else {
_header_len = sizeof(net_hdr);
}
// Open and set up the tap device, which we'll tell vhost to use.
// Note that the tap_fd we open here will be closed at the end of
// this function. It appears that this is fine - i.e., after we pass
// this fd to VHOST_NET_SET_BACKEND, the Linux kernel keeps the reference
// to it and it's fine to close the file descriptor.
file_desc tap_fd(file_desc::open("/dev/net/tun", O_RDWR | O_NONBLOCK));
assert(tap_device.size() + 1 <= IFNAMSIZ);
ifreq ifr = {};
ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR;
strcpy(ifr.ifr_ifrn.ifrn_name, tap_device.c_str());
tap_fd.ioctl(TUNSETIFF, ifr);
unsigned int offload = 0;
if (hw_features().tx_csum_offload && hw_features().rx_csum_offload) {
offload = TUN_F_CSUM;
if (hw_features().tx_tso) {
offload |= TUN_F_TSO4;
}
if (hw_features().tx_ufo) {
offload |= TUN_F_UFO;
}
}
tap_fd.ioctl(TUNSETOFFLOAD, offload);
tap_fd.ioctl(TUNSETVNETHDRSZ, _header_len);
// Additional vhost setup:
_vhost_fd.ioctl(VHOST_SET_OWNER);
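// Declare (almost) the whole 47-bit user address space as one
// identity-mapped region, so that the identity virt_to_phys() above
// matches what vhost sees.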
auto mem_table = make_struct_with_vla(&vhost_memory::regions, 1);
mem_table->nregions = 1;
auto& region = mem_table->regions[0];
region.guest_phys_addr = 0;
region.memory_size = (size_t(1) << 47) - 4096;
region.userspace_addr = 0;
region.flags_padding = 0;
_vhost_fd.ioctl(VHOST_SET_MEM_TABLE, *mem_table);
vhost_vring_state vvs0 = { 0, _rxq.getconfig().size };
_vhost_fd.ioctl(VHOST_SET_VRING_NUM, vvs0);
vhost_vring_state vvs1 = { 1, _txq.getconfig().size };
_vhost_fd.ioctl(VHOST_SET_VRING_NUM, vvs1);
auto tov = [](char* x) { return reinterpret_cast<uintptr_t>(x); };
_vhost_fd.ioctl(VHOST_SET_VRING_ADDR, vhost_vring_addr{
0, 0, tov(_rxq.getconfig().descs), tov(_rxq.getconfig().used),
tov(_rxq.getconfig().avail), 0
});
_vhost_fd.ioctl(VHOST_SET_VRING_ADDR, vhost_vring_addr{
1, 0, tov(_txq.getconfig().descs), tov(_txq.getconfig().used),
tov(_txq.getconfig().avail), 0
});
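// Wire up the notification eventfds for both rings (0 = rx, 1 = tx): the
// "kick" fd is how we tell vhost about new buffers, the "call" fd is how
// vhost tells us about used ones.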
readable_eventfd _txq_notify;
writeable_eventfd _txq_kick;
readable_eventfd _rxq_notify;
writeable_eventfd _rxq_kick;
_vhost_fd.ioctl(VHOST_SET_VRING_KICK, vhost_vring_file{0, _rxq_kick.get_read_fd()});
_vhost_fd.ioctl(VHOST_SET_VRING_CALL, vhost_vring_file{0, _rxq_notify.get_write_fd()});
_vhost_fd.ioctl(VHOST_SET_VRING_KICK, vhost_vring_file{1, _txq_kick.get_read_fd()});
_vhost_fd.ioctl(VHOST_SET_VRING_CALL, vhost_vring_file{1, _txq_notify.get_write_fd()});
_rxq.set_notifier(std::make_unique<virtio_notifier_vhost>(
std::move(_rxq_notify), std::move(_rxq_kick)));
_txq.set_notifier(std::make_unique<virtio_notifier_vhost>(
std::move(_txq_notify), std::move(_txq_kick)));
_vhost_fd.ioctl(VHOST_NET_SET_BACKEND, vhost_vring_file{0, tap_fd.get()});
_vhost_fd.ioctl(VHOST_NET_SET_BACKEND, vhost_vring_file{1, tap_fd.get()});
_txq.run();
}
#ifdef HAVE_OSV
class virtio_net_device_osv : public virtio_net_device {
private:
ethernet_address _mac;
osv::assigned_virtio &_virtio;
public:
virtio_net_device_osv(osv::assigned_virtio &virtio,
boost::program_options::variables_map opts);
virtual ethernet_address hw_address() override {
return _mac;
}
virtual phys virt_to_phys(void* p) override {
return osv::assigned_virtio::virt_to_phys(p);
}
};
virtio_net_device_osv::virtio_net_device_osv(osv::assigned_virtio &virtio,
boost::program_options::variables_map opts)
: virtio_net_device(opts, virtio.queue_size(0), virtio.queue_size(1))
, _virtio(virtio)
{
// Read the host's virtio supported feature bitmask, AND it with the
// features we want to use, and tell the host of the result:
uint32_t subset = _virtio.init_features(_features);
if (subset & VIRTIO_NET_F_MRG_RXBUF) {
_header_len = sizeof(net_hdr_mrg);
} else {
_header_len = sizeof(net_hdr);
}
// TODO: save bits from "subset" in _hw_features?
// bool _mergeable_bufs = subset & VIRTIO_NET_F_MRG_RXBUF;
// bool _status = subset & VIRTIO_NET_F_STATUS;
// bool _tso_ecn = subset & VIRTIO_NET_F_GUEST_ECN;
// bool _host_tso_ecn = subset & VIRTIO_NET_F_HOST_ECN;
// bool _csum = subset & VIRTIO_NET_F_CSUM;
// bool _guest_csum = subset & VIRTIO_NET_F_GUEST_CSUM;
// bool _guest_tso4 = subset & VIRTIO_NET_F_GUEST_TSO4;
// bool _host_tso4 = subset & VIRTIO_NET_F_HOST_TSO4;
// bool _guest_ufo = subset & VIRTIO_NET_F_GUEST_UFO;
// Get the MAC address set by the host
assert(subset & VIRTIO_NET_F_MAC);
struct net_config {
/* The config defining mac address (if VIRTIO_NET_F_MAC) */
uint8_t mac[6];
/* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* */
uint16_t status;
/* Maximum number of each of transmit and receive queues;
* see VIRTIO_NET_F_MQ and VIRTIO_NET_CTRL_MQ.
* Legal values are between 1 and 0x8000
*/
uint16_t max_virtqueue_pairs;
} __attribute__((packed)) host_config;
_virtio.conf_read(&host_config, sizeof(host_config));
_mac = {{{ host_config.mac[0], host_config.mac[1], host_config.mac[2],
host_config.mac[3], host_config.mac[4], host_config.mac[5] }}};
// Setup notifiers
_rxq.set_notifier(std::make_unique<virtio_notifier_osv>(_virtio, 0));
_txq.set_notifier(std::make_unique<virtio_notifier_osv>(_virtio, 1));
// Tell the host where we put the rings (we already allocated them earlier)
_virtio.set_queue_pfn(
0, virt_to_phys(_rxq.getconfig().descs));
_virtio.set_queue_pfn(
1, virt_to_phys(_txq.getconfig().descs));
_txq.run();
// Set up interrupts
// FIXME: in OSv, the first thing we do in the interrupt handler is to call
// _rxq.disable_interrupts(). Here in seastar we only do it much later, in
// the main engine. We probably need to do it as in OSv, at the beginning
// of the handler.
_virtio.enable_interrupt(
0, [&] { _rxq.wake_notifier_wait(); } );
_virtio.enable_interrupt(
1, [&] { _txq.wake_notifier_wait(); } );
_virtio.set_driver_ok();
}
#endif
// FIXME: no real reason why the tap_device needs to be a separate option -
// can be opts["tap-device"] (this is what we do anyway in net/stack.cc)
std::unique_ptr<net::device> create_virtio_net_device(sstring tap_device, boost::program_options::variables_map opts) {
net::device *ptr;
#ifdef HAVE_OSV
if (osv::assigned_virtio::get && osv::assigned_virtio::get()) {
std::cout << "In OSv and assigned host's virtio device\n";
ptr = new virtio_net_device_osv(*osv::assigned_virtio::get(), opts);
} else
#endif
ptr = new virtio_net_device_vhost(tap_device, opts);
// This assumes only one device per cpu. Will need to be fixed when
// this assumption no longer holds.
dev = ptr;
return std::unique_ptr<net::device>(ptr);
}