Files
scylladb/net/virtio.cc
Avi Kivity 27913013b1 virtio: tighten rx packets for debug mode
Allocate exactly the available fragment size in order to catch buffer
overflows.

We get similar behaviour in dpdk, since without huge pages, it must copy
the packet into a newly allocated buffer.
2015-03-01 16:42:07 +02:00

1012 lines
33 KiB
C++

/*
* This file is open source software, licensed to you under the terms
* of the Apache License, Version 2.0 (the "License"). See the NOTICE file
* distributed with this work for additional information regarding copyright
* ownership. You may not use this file except in compliance with the License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (C) 2014 Cloudius Systems, Ltd.
*/
#include "virtio.hh"
#include "core/posix.hh"
#include "core/future-util.hh"
#include "core/vla.hh"
#include "virtio-interface.hh"
#include "core/reactor.hh"
#include "core/stream.hh"
#include "core/circular_buffer.hh"
#include "core/align.hh"
#include "util/function_input_iterator.hh"
#include "util/transform_iterator.hh"
#include <atomic>
#include <vector>
#include <queue>
#include <fcntl.h>
#include <linux/vhost.h>
#include <linux/if_tun.h>
#include "ip.hh"
#include "const.hh"
#include "net/native-stack.hh"
#ifdef HAVE_OSV
#include <osv/virtio-assign.hh>
#endif
using namespace net;

namespace virtio {

// Guest-"physical" address as presented to the host/device.
using phys = uint64_t;

#ifndef HAVE_OSV

// With vhost-net the device operates directly on our address space, and the
// memory table we register (see qp_vhost) is an identity map, so the
// "physical" address is simply the process virtual address.
phys virt_to_phys(void* p) {
    return reinterpret_cast<uintptr_t>(p);
}

#else

// Under OSv, translate through the kernel's assigned-virtio layer.
phys virt_to_phys(void* p) {
    return osv::assigned_virtio::virt_to_phys(p);
}

#endif
class device : public net::device {
private:
boost::program_options::variables_map _opts;
net::hw_features _hw_features;
uint64_t _features;
private:
uint64_t setup_features() {
int64_t seastar_supported_features = VIRTIO_RING_F_INDIRECT_DESC | VIRTIO_NET_F_MRG_RXBUF;
if (!(_opts.count("event-index") && _opts["event-index"].as<std::string>() == "off")) {
seastar_supported_features |= VIRTIO_RING_F_EVENT_IDX;
}
if (!(_opts.count("csum-offload") && _opts["csum-offload"].as<std::string>() == "off")) {
seastar_supported_features |= VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM;
_hw_features.tx_csum_l4_offload = true;
_hw_features.rx_csum_offload = true;
} else {
_hw_features.tx_csum_l4_offload = false;
_hw_features.rx_csum_offload = false;
}
if (!(_opts.count("tso") && _opts["tso"].as<std::string>() == "off")) {
seastar_supported_features |= VIRTIO_NET_F_HOST_TSO4;
seastar_supported_features |= VIRTIO_NET_F_GUEST_TSO4;
_hw_features.tx_tso = true;
} else {
_hw_features.tx_tso = false;
}
if (!(_opts.count("ufo") && _opts["ufo"].as<std::string>() == "off")) {
seastar_supported_features |= VIRTIO_NET_F_HOST_UFO;
seastar_supported_features |= VIRTIO_NET_F_GUEST_UFO;
_hw_features.tx_ufo = true;
} else {
_hw_features.tx_ufo = false;
}
seastar_supported_features |= VIRTIO_NET_F_MAC;
return seastar_supported_features;
}
public:
device(boost::program_options::variables_map opts)
: _opts(opts), _features(setup_features())
{}
ethernet_address hw_address() override {
return { 0x12, 0x23, 0x34, 0x56, 0x67, 0x78 };
}
net::hw_features hw_features() {
return _hw_features;
}
uint64_t features() {
return _features;
}
virtual std::unique_ptr<net::qp> init_local_queue(boost::program_options::variables_map opts, uint16_t qid) override;
};
/* The notifier class determines how to do host-to-guest and guest-to-
 * host notifications. We have two different implementations - one for vhost
 * (where both notifications occur through eventfds) and one for an assigned
 * virtio device from OSv.
 */
class notifier {
public:
    // Notify the host
    virtual void notify() = 0;
    // Do whatever it takes to wake wait(). A notifier does not need to
    // implement this function if wait() waits for an external event which is
    // generated by an external process (e.g., notifier_vhost doesn't
    // need to implement this).
    virtual void wake_wait() {
        abort();
    }
    virtual ~notifier() {
    }
};
// vhost notifier: guest-to-host kicks travel over an eventfd that the
// vhost-net kernel thread polls.
class notifier_vhost : public notifier {
private:
    writeable_eventfd _kick;
public:
    notifier_vhost(writeable_eventfd &&kick)
        : _kick(std::move(kick)) {}
    virtual void notify() override {
        // Any nonzero value written to the eventfd wakes the host side.
        _kick.signal(1);
    }
};
#ifdef HAVE_OSV
// OSv notifier: kicks the host through the hypervisor's notification
// mechanism for the assigned virtio queue.
class notifier_osv : public notifier {
private:
    uint16_t _q_index;
    osv::assigned_virtio &_virtio;
public:
    virtual void notify() override {
        _virtio.kick(_q_index);
    }
    virtual void wake_wait() override {
        // FIXME: `_notified` is not declared anywhere in this class, so this
        // cannot compile under HAVE_OSV. It presumably should signal the
        // primitive the interrupt handler (see qp_osv's enable_interrupt
        // callbacks) is waiting on — the member needs to be restored.
        _notified->signal();
    }
    // Fixed: the constructor's member-initializer list must be introduced
    // with ':' (the original started it with ',', a syntax error).
    notifier_osv(osv::assigned_virtio &virtio, uint16_t q_index)
        : _q_index(q_index)
        , _virtio(virtio)
    {
    }
};
#endif
// Memory layout and negotiated parameters for one virtqueue, as shared with
// the host.
struct ring_config {
    char* descs;            // descriptor table
    char* avail;            // guest-to-host available ring
    char* used;             // host-to-guest used ring
    unsigned size;          // number of descriptors; must be a power of two
                            // (vring masks indices with size - 1)
    bool event_index;       // VIRTIO_RING_F_EVENT_IDX negotiated
    bool indirect;          // VIRTIO_RING_F_INDIRECT_DESC in use
    bool mergable_buffers;  // VIRTIO_NET_F_MRG_RXBUF (rx rings only)
};

// One physically-addressed span to post to a ring.
struct buffer {
    phys addr;
    uint32_t len;
    bool writeable;         // true for rx (device writes), false for tx
};
// The 'buffer_chain' concept, used in vring, is a container of buffers, as in:
//
// using buffer_chain = std::vector<buffer>;
//
// The 'Completion' concept is a functor with the signature:
//
// void (buffer_chain&, size_t len);
//
template <typename BufferChain, typename Completion>
class vring {
private:
    // In-memory layout of one virtio descriptor (spec: struct vring_desc).
    class desc {
    public:
        struct flags {
            // This marks a buffer as continuing via the next field.
            uint16_t has_next : 1;
            // This marks a buffer as write-only (otherwise read-only).
            uint16_t writeable : 1;
            // This means the buffer contains a list of buffer descriptors.
            uint16_t indirect : 1;
        };
        phys get_paddr();
        uint32_t get_len() { return _len; }
        uint16_t next_idx() { return _next; }
        phys _paddr;
        uint32_t _len;
        flags _flags;
        uint16_t _next;
    };
    // Guest to host
    struct avail_layout {
        struct flags {
            // Mark that we do not need an interrupt for consuming a descriptor
            // from the ring. Unreliable so it's simply an optimization
            uint16_t no_interrupts : 1;
        };
        std::atomic<uint16_t> _flags;
        // Where we put the next descriptor
        std::atomic<uint16_t> _idx;
        // There may be no more entries than the queue size read from device
        uint16_t _ring[];
        // used event index is an optimization in order to get an interrupt from the host
        // only when the value reaches this number
        // The location of this field is placed after the variable length ring array,
        // that's why we cannot fully define it within the struct and use a function accessor
        //std::atomic<uint16_t> used_event;
    };
    struct used_elem {
        // Index of start of used _desc chain. (uint32_t for padding reasons)
        uint32_t _id;
        // Total length of the descriptor chain which was used (written to)
        uint32_t _len;
    };
    // Host to guest
    struct used_layout {
        enum {
            // The Host advises the Guest: don't kick me when
            // you add a buffer. It's unreliable, so it's simply an
            // optimization. Guest will still kick if it's out of buffers.
            no_notify = 1
        };
        // Using std::atomic since it being changed by the host
        std::atomic<uint16_t> _flags;
        // Using std::atomic in order to have memory barriers for it
        std::atomic<uint16_t> _idx;
        used_elem _used_elements[];
        // avail event index is an optimization kick the host only when the value reaches this number
        // The location of this field is placed after the variable length ring array,
        // that's why we cannot fully define it within the struct and use a function accessor
        //std::atomic<uint16_t> avail_event;
    };
    // Guest-side bookkeeping for the avail ring.
    struct avail {
        explicit avail(ring_config conf);
        avail_layout* _shared;
        // Next free slot in the avail ring (free-running; masked on use).
        uint16_t _head = 0;
        uint16_t _avail_added_since_kick = 0;
    };
    // Guest-side bookkeeping for the used ring.
    struct used {
        explicit used(ring_config conf);
        used_layout* _shared;
        // Next used element we have not yet consumed (free-running).
        uint16_t _tail = 0;
    };
private:
    ring_config _config;
    Completion _complete;
    std::unique_ptr<notifier> _notifier;
    // Per-head-descriptor storage keeping posted chains alive until completion.
    std::unique_ptr<BufferChain[]> _buffer_chains;
    desc* _descs;
    avail _avail;
    used _used;
    // Event-index fields; they live just past the variable-length arrays of
    // the opposite ring (see the layout comments above).
    std::atomic<uint16_t>* _avail_event;
    std::atomic<uint16_t>* _used_event;
    // Counts free descriptors; producers wait on it before allocate_desc().
    semaphore _available_descriptors = { 0 };
    // Free-descriptor list threaded through desc::_next; -1 means empty.
    int _free_head = -1;
    int _free_last = -1;
    reactor::poller _poller;
public:
    explicit vring(ring_config conf, Completion complete);
    void set_notifier(std::unique_ptr<notifier> notifier) {
        _notifier = std::move(notifier);
    }
    const ring_config& getconfig() {
        return _config;
    }
    void wake_notifier_wait() {
        _notifier->wake_wait();
    }
    // start the queue
    void run();
    // wait for the used ring to have at least @nr buffers
    future<> on_used(size_t nr);
    // Total number of descriptors in ring
    int size() { return _config.size; }
    template <typename Iterator>
    void post(Iterator begin, Iterator end);
    semaphore& available_descriptors() { return _available_descriptors; }
private:
    bool notifications_disabled() {
        return (_used._shared->_flags.load(std::memory_order_relaxed) & VRING_USED_F_NO_NOTIFY) != 0;
    }
    // Notify the host of new avail entries, honoring the negotiated
    // suppression scheme (event index or the NO_NOTIFY flag).
    void kick() {
        bool need_kick = true;
        // Make sure we see the fresh _idx value written before kick.
        std::atomic_thread_fence(std::memory_order_seq_cst);
        if (_config.event_index) {
            uint16_t avail_idx = _avail._shared->_idx.load(std::memory_order_relaxed);
            uint16_t avail_event = _avail_event->load(std::memory_order_relaxed);
            // Kick only if the host's requested event index fell inside the
            // range of entries we added since the last kick (wrapping compare).
            need_kick = (uint16_t)(avail_idx - avail_event - 1) < _avail._avail_added_since_kick;
        } else {
            if (notifications_disabled())
                return;
        }
        // Also kick defensively before the unkicked-entries counter can wrap.
        if (need_kick || (_avail._avail_added_since_kick >= (uint16_t)(~0) / 2)) {
            _notifier->notify();
            _avail._avail_added_since_kick = 0;
        }
    }
    bool do_complete();
    size_t mask() { return size() - 1; }
    size_t masked(size_t idx) { return idx & mask(); }
    size_t available();
    unsigned allocate_desc();
    void setup();
};
// Overlay the shared avail/used structures onto the ring storage handed to us
// via ring_config.
template <typename BufferChain, typename Completion>
vring<BufferChain, Completion>::avail::avail(ring_config conf)
    : _shared(reinterpret_cast<avail_layout*>(conf.avail)) {
}

template <typename BufferChain, typename Completion>
vring<BufferChain, Completion>::used::used(ring_config conf)
    : _shared(reinterpret_cast<used_layout*>(conf.used)) {
}
// Pop one descriptor index off the free list. The caller must have acquired
// a unit of _available_descriptors first, so the list is never empty here.
template <typename BufferChain, typename Completion>
inline
unsigned
vring<BufferChain, Completion>::allocate_desc() {
    assert(_free_head != -1);
    auto head = _free_head;
    if (head != _free_last) {
        // Advance the head to the next link in the chain.
        _free_head = _descs[head]._next;
    } else {
        // That was the only remaining descriptor; the list is now empty.
        _free_head = _free_last = -1;
    }
    return head;
}
template <typename BufferChain, typename Completion>
vring<BufferChain, Completion>::vring(ring_config conf, Completion complete)
    : _config(conf)
    , _complete(complete)
    , _buffer_chains(new BufferChain[_config.size])
    , _descs(reinterpret_cast<desc*>(conf.descs))
    , _avail(conf)
    , _used(conf)
    // The event-index words sit immediately after the variable-length arrays
    // of the opposite ring (see avail_layout/used_layout comments).
    , _avail_event(reinterpret_cast<std::atomic<uint16_t>*>(&_used._shared->_used_elements[conf.size]))
    , _used_event(reinterpret_cast<std::atomic<uint16_t>*>(&_avail._shared->_ring[conf.size]))
    // Poll for completions from the reactor loop.
    , _poller([this] {
        return do_complete();
    })
{
    setup();
}
// Initialize the free-descriptor list to span the entire ring and make every
// descriptor available to producers.
template <typename BufferChain, typename Completion>
void vring<BufferChain, Completion>::setup() {
    auto n = _config.size;
    // Link each descriptor to its successor, forming one long chain.
    for (unsigned idx = 0; idx != n; ++idx) {
        _descs[idx]._next = idx + 1;
    }
    _free_head = 0;
    _free_last = n - 1;
    _available_descriptors.signal(n);
}
// Iterator: points at a buffer_chain
template <typename BufferChain, typename Completion>
template <typename Iterator>
void vring<BufferChain, Completion>::post(Iterator begin, Iterator end) {
    for (auto bci = begin; bci!= end; ++bci) {
        auto&& bc = *bci;
        // Build a descriptor chain for this buffer chain. pseudo_head lets
        // the loop link the first descriptor like any other.
        desc pseudo_head = {};
        desc* prev = &pseudo_head;
        for (auto i = bc.begin(); i != bc.end(); ++i) {
            unsigned desc_idx = allocate_desc();
            prev->_flags.has_next = true;
            prev->_next = desc_idx;
            desc &d = _descs[desc_idx];
            d._flags = {};
            auto&& b = *i;
            d._flags.writeable = b.writeable;
            d._paddr = b.addr;
            d._len = b.len;
            prev = &d;
        }
        auto desc_head = pseudo_head._next;
        // Keep the chain's backing memory alive until the host completes it;
        // do_complete() retrieves it by head-descriptor index.
        _buffer_chains[desc_head] = std::move(bc);
        _avail._shared->_ring[masked(_avail._head++)] = desc_head;
        _avail._avail_added_since_kick++;
    }
    // Publish all new entries in one release-store, then notify the host.
    _avail._shared->_idx.store(_avail._head, std::memory_order_release);
    kick();
}
// Consume entries the host appended to the used ring: run the completion for
// each finished chain and return its descriptors to the free list.
// Returns true if any work was done (poller contract).
template <typename BufferChain, typename Completion>
bool vring<BufferChain, Completion>::do_complete() {
    auto used_head = _used._shared->_idx.load(std::memory_order_acquire);
    // Both indices are free-running uint16_t counters; the number of new
    // completions is the wrapping distance head - tail. (The original
    // computed tail - head, which goes negative — and huge after conversion
    // to uint64_t in bunch() — whenever completions are pending.)
    auto count = static_cast<uint16_t>(used_head - _used._tail);
    _complete.bunch(count);
    while (used_head != _used._tail) {
        auto ue = _used._shared->_used_elements[masked(_used._tail++)];
        _complete(std::move(_buffer_chains[ue._id]), ue._len);
        auto id = ue._id;
        // Splice the returned chain onto the free list: link it after the
        // current tail, or make it the head if the list is empty...
        if (_free_last != -1) {
            _descs[_free_last]._next = id;
        } else {
            _free_head = id;
        }
        // ...then walk to the end of the chain, which becomes the new tail.
        while (true) {
            auto& d = _descs[id];
            if (!d._flags.has_next) {
                break;
            }
            id = d._next;
        }
        _free_last = id;
    }
    return count;
}
// One tx/rx virtqueue pair plus the virtio-net framing around packets.
class qp : public net::qp {
protected:
    // Per-packet virtio-net header (spec: struct virtio_net_hdr).
    struct net_hdr {
        uint8_t needs_csum : 1;
        uint8_t flags_reserved : 7;
        enum { gso_none = 0, gso_tcpv4 = 1, gso_udp = 3, gso_tcpv6 = 4, gso_ecn = 0x80 };
        uint8_t gso_type;
        uint16_t hdr_len;
        uint16_t gso_size;
        uint16_t csum_start;
        uint16_t csum_offset;
    };
    // Extended header used when VIRTIO_NET_F_MRG_RXBUF is negotiated.
    struct net_hdr_mrg : net_hdr {
        uint16_t num_buffers;
    };
    class txq {
        // Convert one packet fragment into a read-only ring buffer.
        static buffer fragment_to_buffer(fragment f) {
            buffer b;
            b.addr = virt_to_phys(f.base);
            b.len = f.size;
            b.writeable = false;
            return b;
        };
        // Adapts a packet so vring::post() can iterate its fragments as
        // buffers (the BufferChain concept).
        struct packet_as_buffer_chain {
            packet p;
            auto begin() {
                return make_transform_iterator(p.fragments().begin(), fragment_to_buffer);
            }
            auto end() {
                return make_transform_iterator(p.fragments().end(), fragment_to_buffer);
            }
        };
        // Tx completion: destroy the packet and release its descriptors.
        struct complete {
            txq& q;
            void operator()(packet_as_buffer_chain&& bc, size_t len) {
                // move the packet here, to be destroyed on scope exit
                auto p = std::move(bc.p);
                q._ring.available_descriptors().signal(p.nr_frags());
            }
            void bunch(uint64_t c) {}
        };
        qp& _dev;
        vring<packet_as_buffer_chain, complete> _ring;
        // Scratch batch reused by post() on every call.
        std::vector<packet_as_buffer_chain> _packets;
    public:
        txq(qp& dev, ring_config config);
        void set_notifier(std::unique_ptr<notifier> notifier) {
            _ring.set_notifier(std::move(notifier));
        }
        const ring_config& getconfig() {
            return _ring.getconfig();
        }
        void wake_notifier_wait() {
            _ring.wake_notifier_wait();
        }
        uint32_t post(circular_buffer<packet>& p);
    };
    class rxq {
        // A posted rx buffer together with the allocation backing it.
        struct buffer_and_virt : buffer {
            std::unique_ptr<char[], free_deleter> buf;
        };
        // Each rx posting is exactly one buffer (mergeable-rx mode).
        using single_buffer = std::array<buffer_and_virt, 1>;
        struct complete {
            rxq& q;
            void operator()(single_buffer&& bc, size_t len) {
                q.complete_buffer(std::move(bc), len);
            }
            void bunch(uint64_t c) {
                q.update_rx_count(c);
            }
        };
        qp& _dev;
        vring<single_buffer, complete> _ring;
        // Buffers still expected for the packet currently being reassembled
        // (from net_hdr_mrg::num_buffers).
        unsigned _remaining_buffers = 0;
        std::vector<fragment> _fragments;
        std::vector<std::unique_ptr<char[], free_deleter>> _buffers;
    public:
        rxq(qp& _if, ring_config config);
        void set_notifier(std::unique_ptr<notifier> notifier) {
            _ring.set_notifier(std::move(notifier));
        }
        const ring_config& getconfig() {
            return _ring.getconfig();
        }
        // Keep the ring topped up with fresh buffers, forever.
        void run() {
            keep_doing([this] { return prepare_buffers(); });
        }
        void wake_notifier_wait() {
            _ring.wake_notifier_wait();
        }
        void update_rx_count(uint64_t c) {
            _dev.update_rx_count(c);
        }
    private:
        future<> prepare_buffers();
        void complete_buffer(single_buffer&& b, size_t len);
        void debug_mode_adjust_fragments();
    };
protected:
    device* _dev;
    // sizeof(net_hdr) or sizeof(net_hdr_mrg), per feature negotiation.
    size_t _header_len;
    std::unique_ptr<char[], free_deleter> _txq_storage;
    std::unique_ptr<char[], free_deleter> _rxq_storage;
    txq _txq;
    rxq _rxq;
protected:
    ring_config txq_config(size_t txq_ring_size);
    ring_config rxq_config(size_t rxq_ring_size);
    void common_config(ring_config& r);
    size_t vring_storage_size(size_t ring_size);
public:
    explicit qp(device* dev, size_t rx_ring_size, size_t tx_ring_size);
    virtual future<> send(packet p) override {
        // Unused: the batched send(circular_buffer&) below is the entry point.
        abort();
    }
    virtual uint32_t send(circular_buffer<packet>& p) override;
    virtual void rx_start() override;
    friend class rxq;
};
// Bind the tx ring to its owning queue pair; completions release descriptors
// back to the ring's semaphore (see txq::complete).
qp::txq::txq(qp& dev, ring_config config)
    : _dev(dev), _ring(config, complete{*this}) {
}
// Drain as many packets from `pb` as the ring has descriptors for, fill in
// the virtio-net offload header for each, and post them in one batch.
// Returns the number of packets posted.
uint32_t
qp::txq::post(circular_buffer<packet>& pb) {
    _packets.clear();
    // +1 accounts for the virtio-net header fragment prepended below.
    while (!pb.empty() && pb.front().nr_frags() + 1 <= _ring.available_descriptors().current()) {
        net_hdr_mrg vhdr = {};
        auto p = std::move(pb.front());
        pb.pop_front();
        // Handle TCP checksum offload
        auto oi = p.offload_info();
        if (_dev._dev->hw_features().tx_csum_l4_offload) {
            auto eth_hdr_len = sizeof(eth_hdr);
            auto ip_hdr_len = oi.ip_hdr_len;
            auto mtu = _dev._dev->hw_features().mtu;
            if (oi.protocol == ip_protocol_num::tcp) {
                auto tcp_hdr_len = oi.tcp_hdr_len;
                if (oi.needs_csum) {
                    vhdr.needs_csum = 1;
                    vhdr.csum_start = eth_hdr_len + ip_hdr_len;
                    // TCP checksum field's offset within the TCP header is 16 bytes
                    vhdr.csum_offset = 16;
                }
                if (oi.tso_seg_size) {
                    // IPv4 TCP TSO
                    vhdr.gso_type = net_hdr::gso_tcpv4;
                    // Sum of Ethernet, IP and TCP header size
                    vhdr.hdr_len = eth_hdr_len + ip_hdr_len + tcp_hdr_len;
                    // Maximum segment size of packet after the offload
                    vhdr.gso_size = oi.tso_seg_size;
                }
            } else if (oi.protocol == ip_protocol_num::udp) {
                auto udp_hdr_len = oi.udp_hdr_len;
                if (oi.needs_csum) {
                    vhdr.needs_csum = 1;
                    vhdr.csum_start = eth_hdr_len + ip_hdr_len;
                    // UDP checksum field's offset within the UDP header is 6 bytes
                    vhdr.csum_offset = 6;
                }
                // UFO for oversized UDP datagrams.
                if (_dev._dev->hw_features().tx_ufo && p.len() > mtu + eth_hdr_len) {
                    vhdr.gso_type = net_hdr::gso_udp;
                    vhdr.hdr_len = eth_hdr_len + ip_hdr_len + udp_hdr_len;
                    vhdr.gso_size = mtu - ip_hdr_len - udp_hdr_len;
                }
            }
        }
        // prepend virtio-net header
        packet q = packet(fragment{reinterpret_cast<char*>(&vhdr), _dev._header_len},
                std::move(p));
        // The available-descriptors check above guarantees the wait is
        // already satisfied.
        auto fut = _ring.available_descriptors().wait(q.nr_frags());
        assert(fut.available()); // how can it not be?
        _packets.emplace_back(packet_as_buffer_chain{ std::move(q) });
    }
    _ring.post(_packets.begin(), _packets.end());
    return _packets.size();
}
// Bind the rx ring to its owning queue pair; completed buffers are fed to
// complete_buffer() for packet reassembly.
qp::rxq::rxq(qp& dev, ring_config config)
    : _dev(dev), _ring(config, complete{*this}) {
}
// Replenish the rx ring: wait until at least one descriptor is free, then
// opportunistically grab all currently-free descriptors and post one
// page-sized writeable buffer per descriptor.
future<>
qp::rxq::prepare_buffers() {
    auto& available = _ring.available_descriptors();
    return available.wait(1).then([this, &available] {
        // Batch: take everything that is free right now in one go.
        unsigned count = 1;
        auto opportunistic = available.current();
        if (available.try_wait(opportunistic)) {
            count += opportunistic;
        }
        auto make_buffer_chain = [this] {
            // One page per posting; the host merges pages into packets
            // (VIRTIO_NET_F_MRG_RXBUF).
            constexpr size_t buffer_size = 4096;
            single_buffer bc;
            std::unique_ptr<char[], free_deleter> buf(reinterpret_cast<char*>(malloc(buffer_size)));
            if (!buf) {
                // Fail loudly instead of posting a null address to the
                // device (consistent with debug_mode_adjust_fragments()).
                throw std::bad_alloc();
            }
            buffer_and_virt& b = bc[0];
            b.addr = virt_to_phys(buf.get());
            b.len = buffer_size;
            b.writeable = true;
            b.buf = std::move(buf);
            return bc;
        };
        auto start = make_function_input_iterator(make_buffer_chain, 0U);
        auto finish = make_function_input_iterator(make_buffer_chain, count);
        _ring.post(start, finish);
    });
}
void
qp::rxq::debug_mode_adjust_fragments() {
#ifdef DEBUG
    // For debug mode, reallocate last fragment to detect buffer overruns:
    // copy it into an allocation of exactly the fragment's size, so any
    // write past the end trips the allocator's checks.
    auto last = _fragments.back();
    auto sz = last.size;
    std::unique_ptr<char[], free_deleter> buf(reinterpret_cast<char*>(malloc(sz)));
    if (!buf) {
        throw std::bad_alloc();
    }
    std::copy_n(last.base, sz, buf.get());
    // Point the fragment (and its owning buffer slot) at the tight copy.
    _fragments.back() = { buf.get(), sz };
    _buffers.back() = std::move(buf);
#endif
}
// Handle one completed rx buffer. With mergeable rx buffers a packet may
// span several ring buffers: the first carries the virtio-net header (whose
// num_buffers says how many follow); when the last arrives, assemble the
// fragments into a packet and hand it to the stack.
void
qp::rxq::complete_buffer(single_buffer&& bc, size_t len) {
    auto&& sb = bc[0];
    auto&& buf = sb.buf;
    auto frag_buf = buf.get();
    auto frag_len = len;
    // First buffer of a packet: strip the virtio-net header and start a
    // fresh fragment list. (Also removed a stray ';' after this block.)
    if (_remaining_buffers == 0) {
        auto hdr = reinterpret_cast<net_hdr_mrg*>(frag_buf);
        assert(hdr->num_buffers >= 1);
        _remaining_buffers = hdr->num_buffers;
        frag_buf += _dev._header_len;
        frag_len -= _dev._header_len;
        _fragments.clear();
        _buffers.clear();
    }
    // Append current buffer
    _fragments.emplace_back(fragment{frag_buf, frag_len});
    _buffers.push_back(std::move(buf));
    _remaining_buffers--;
    // Last buffer: build the packet and release the ring descriptors.
    if (_remaining_buffers == 0) {
        debug_mode_adjust_fragments();
        deleter del;
        if (_buffers.size() == 1) {
            // Single buffer: hand ownership straight to the packet.
            del = make_free_deleter(_buffers[0].release());
            _buffers.clear();
        } else {
            // Multiple buffers: the deleter keeps the whole vector alive.
            del = make_object_deleter(std::move(_buffers));
        }
        packet p(_fragments.begin(), _fragments.end(), std::move(del));
        _dev._dev->l2receive(std::move(p));
        _ring.available_descriptors().signal(_fragments.size());
    }
}
// Allocate and zero-initialize a buffer which is page-aligned and can be
// used for virt_to_phys (i.e., physically contiguous).
// Throws std::bad_alloc on allocation failure.
static std::unique_ptr<char[], free_deleter> virtio_buffer(size_t size) {
    void* ret;
    auto r = posix_memalign(&ret, 4096, size);
    // Check unconditionally: the previous assert(r == 0) compiled out under
    // NDEBUG, leaving `ret` uninitialized (UB) on allocation failure.
    if (r != 0) {
        throw std::bad_alloc();
    }
    bzero(ret, size);
    return std::unique_ptr<char[], free_deleter>(reinterpret_cast<char*>(ret));
}
// Allocate page-aligned, zeroed backing storage for both rings, then
// construct the tx and rx queues over it.
qp::qp(device* dev, size_t rx_ring_size, size_t tx_ring_size)
    : _dev(dev)
    , _txq_storage(virtio_buffer(vring_storage_size(tx_ring_size)))
    , _rxq_storage(virtio_buffer(vring_storage_size(rx_ring_size)))
    , _txq(*this, txq_config(tx_ring_size))
    , _rxq(*this, rxq_config(rx_ring_size)) {
}
size_t qp::vring_storage_size(size_t ring_size) {
    // overestimate, but not by much: 16 bytes per descriptor, 2 per avail
    // ring entry, 8 per used element, plus up to three pages of alignment
    // padding (see common_config for the actual layout).
    return 3 * 4096 + ring_size * (16 + 2 + 8);
}
// Lay out the avail and used rings after the descriptor table (r.descs must
// already point at the start of the ring storage) and fill in the feature
// flags shared by tx and rx.
void qp::common_config(ring_config& r) {
    // 16 bytes per descriptor; avail ring is 2 bytes per entry plus 6 bytes
    // of flags/idx/used_event; the used ring must be page-aligned.
    r.avail = r.descs + 16 * r.size;
    r.used = align_up(r.avail + 2 * r.size + 6, 4096);
    r.event_index = (_dev->features() & VIRTIO_RING_F_EVENT_IDX) != 0;
    r.indirect = false;
}
// Build the tx ring configuration; tx never uses mergeable buffers.
ring_config qp::txq_config(size_t tx_ring_size) {
    ring_config cfg;
    cfg.descs = _txq_storage.get();
    cfg.size = tx_ring_size;
    cfg.mergable_buffers = false;
    common_config(cfg);
    return cfg;
}
// Build the rx ring configuration; rx relies on mergeable buffers
// (VIRTIO_NET_F_MRG_RXBUF) to assemble large packets from page buffers.
ring_config qp::rxq_config(size_t rx_ring_size) {
    ring_config cfg;
    cfg.descs = _rxq_storage.get();
    cfg.size = rx_ring_size;
    cfg.mergable_buffers = true;
    common_config(cfg);
    return cfg;
}
// Start the receive path: the rx queue perpetually refills its ring.
void
qp::rx_start() {
    _rxq.run();
}

// Batched transmit entry point; returns the number of packets posted.
uint32_t
qp::send(circular_buffer<packet>& p) {
    return _txq.post(p);
}
// Queue pair backed by the Linux vhost-net kernel helper.
class qp_vhost : public qp {
private:
    // The vhost file descriptor needs to remain open throughout the life of
    // this driver: as soon as we close it, vhost stops servicing us.
    file_desc _vhost_fd;
public:
    qp_vhost(device* dev, boost::program_options::variables_map opts);
};
// Ring size for both queues, taken from the command line with a fallback
// default of 256 entries.
static size_t config_ring_size(boost::program_options::variables_map &opts) {
    // Check the same option we read: the original tested
    // opts.count("event-index") here, which is a different option entirely.
    if (opts.count("virtio-ring-size")) {
        return opts["virtio-ring-size"].as<unsigned>();
    } else {
        return 256;
    }
}
// Wire the guest-side rings up to vhost-net: negotiate features, configure
// the tap device vhost will forward through, register our address space,
// point vhost at the ring memory, and exchange kick/notify eventfds.
qp_vhost::qp_vhost(device *dev, boost::program_options::variables_map opts)
    : qp(dev, config_ring_size(opts), config_ring_size(opts))
    , _vhost_fd(file_desc::open("/dev/vhost-net", O_RDWR))
{
    auto tap_device = opts["tap-device"].as<std::string>();
    int64_t vhost_supported_features;
    _vhost_fd.ioctl(VHOST_GET_FEATURES, vhost_supported_features);
    // Negotiate: intersect what vhost offers with what we support.
    vhost_supported_features &= _dev->features();
    _vhost_fd.ioctl(VHOST_SET_FEATURES, vhost_supported_features);
    // Header size depends on whether mergeable rx buffers were negotiated.
    if (vhost_supported_features & VIRTIO_NET_F_MRG_RXBUF) {
        _header_len = sizeof(net_hdr_mrg);
    } else {
        _header_len = sizeof(net_hdr);
    }
    // Open and set up the tap device, which we'll tell vhost to use.
    // Note that the tap_fd we open here will be closed at the end of
    // this function. It appears that this is fine - i.e., after we pass
    // this fd to VHOST_NET_SET_BACKEND, the Linux kernel keeps the reference
    // to it and it's fine to close the file descriptor.
    file_desc tap_fd(file_desc::open("/dev/net/tun", O_RDWR | O_NONBLOCK));
    assert(tap_device.size() + 1 <= IFNAMSIZ);
    ifreq ifr = {};
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR;
    strcpy(ifr.ifr_ifrn.ifrn_name, tap_device.c_str());
    tap_fd.ioctl(TUNSETIFF, ifr);
    // Mirror our negotiated offloads onto the tap device.
    unsigned int offload = 0;
    auto hw_features = _dev->hw_features();
    if (hw_features.tx_csum_l4_offload && hw_features.rx_csum_offload) {
        offload = TUN_F_CSUM;
        if (hw_features.tx_tso) {
            offload |= TUN_F_TSO4;
        }
        if (hw_features.tx_ufo) {
            offload |= TUN_F_UFO;
        }
    }
    tap_fd.ioctl(TUNSETOFFLOAD, offload);
    tap_fd.ioctl(TUNSETVNETHDRSZ, _header_len);
    // Additional vhost setup:
    _vhost_fd.ioctl(VHOST_SET_OWNER);
    // Register a single identity-mapped memory region covering (almost) the
    // whole user virtual address space — this matches virt_to_phys(), which
    // returns the raw pointer value.
    auto mem_table = make_struct_with_vla(&vhost_memory::regions, 1);
    mem_table->nregions = 1;
    auto& region = mem_table->regions[0];
    region.guest_phys_addr = 0;
    region.memory_size = (size_t(1) << 47) - 4096;
    region.userspace_addr = 0;
    region.flags_padding = 0;
    _vhost_fd.ioctl(VHOST_SET_MEM_TABLE, *mem_table);
    // Ring sizes and addresses: vhost queue 0 is rx, queue 1 is tx.
    vhost_vring_state vvs0 = { 0, _rxq.getconfig().size };
    _vhost_fd.ioctl(VHOST_SET_VRING_NUM, vvs0);
    vhost_vring_state vvs1 = { 1, _txq.getconfig().size };
    _vhost_fd.ioctl(VHOST_SET_VRING_NUM, vvs1);
    auto tov = [](char* x) { return reinterpret_cast<uintptr_t>(x); };
    _vhost_fd.ioctl(VHOST_SET_VRING_ADDR, vhost_vring_addr{
        0, 0, tov(_rxq.getconfig().descs), tov(_rxq.getconfig().used),
        tov(_rxq.getconfig().avail), 0
    });
    _vhost_fd.ioctl(VHOST_SET_VRING_ADDR, vhost_vring_addr{
        1, 0, tov(_txq.getconfig().descs), tov(_txq.getconfig().used),
        tov(_txq.getconfig().avail), 0
    });
    // Exchange eventfds: we write the "kick" fds to signal new buffers; the
    // "notify" fds are how vhost would interrupt us.
    readable_eventfd _txq_notify;
    writeable_eventfd _txq_kick;
    readable_eventfd _rxq_notify;
    writeable_eventfd _rxq_kick;
    _vhost_fd.ioctl(VHOST_SET_VRING_KICK, vhost_vring_file{0, _rxq_kick.get_read_fd()});
    _vhost_fd.ioctl(VHOST_SET_VRING_CALL, vhost_vring_file{0, _rxq_notify.get_write_fd()});
    _vhost_fd.ioctl(VHOST_SET_VRING_KICK, vhost_vring_file{1, _txq_kick.get_read_fd()});
    _vhost_fd.ioctl(VHOST_SET_VRING_CALL, vhost_vring_file{1, _txq_notify.get_write_fd()});
    _rxq.set_notifier(std::make_unique<notifier_vhost>(std::move(_rxq_kick)));
    _txq.set_notifier(std::make_unique<notifier_vhost>(std::move(_txq_kick)));
    // Finally attach the tap device as vhost's backend for both queues.
    _vhost_fd.ioctl(VHOST_NET_SET_BACKEND, vhost_vring_file{0, tap_fd.get()});
    _vhost_fd.ioctl(VHOST_NET_SET_BACKEND, vhost_vring_file{1, tap_fd.get()});
}
#ifdef HAVE_OSV
// Queue pair for a host-assigned virtio device under OSv: the rings are
// handed directly to the hypervisor instead of going through vhost-net.
class qp_osv : public qp {
private:
    ethernet_address _mac;
    osv::assigned_virtio &_virtio;
public:
    // NOTE(review): the base qp constructor takes a device* first, but this
    // constructor receives no device pointer — see the matching note in the
    // definition below; the signature likely needs a device* plumbed through.
    qp_osv(osv::assigned_virtio &virtio,
            boost::program_options::variables_map opts);
    virtual ethernet_address hw_address() override {
        return _mac;
    }
};
// Set up the assigned virtio device: negotiate features, read the
// host-provided MAC, install notifiers and interrupt handlers, and publish
// the ring addresses to the host.
qp_osv::qp_osv(osv::assigned_virtio &virtio,
        boost::program_options::variables_map opts)
    // NOTE(review): qp's constructor is qp(device*, rx_size, tx_size);
    // passing `opts` as the first argument cannot compile — a device*
    // needs to be threaded through from init_local_queue(). Confirm
    // against an actual HAVE_OSV build.
    : qp(opts, virtio.queue_size(0), virtio.queue_size(1))
    , _virtio(virtio)
{
    // Read the host's virtio supported feature bitmask, AND it with the
    // features we want to use, and tell the host of the result:
    uint32_t subset = _virtio.init_features(_dev->features());
    if (subset & VIRTIO_NET_F_MRG_RXBUF) {
        _header_len = sizeof(net_hdr_mrg);
    } else {
        _header_len = sizeof(net_hdr);
    }
    // TODO: save bits from "subset" in _hw_features?
    // bool _mergeable_bufs = subset & VIRTIO_NET_F_MRG_RXBUF;
    // bool _status = subset & VIRTIO_NET_F_STATUS;
    // bool _tso_ecn = subset & VIRTIO_NET_F_GUEST_ECN;
    // bool _host_tso_ecn = subset & VIRTIO_NET_F_HOST_ECN;
    // bool _csum = subset & VIRTIO_NET_F_CSUM;
    // bool _guest_csum = subset & VIRTIO_NET_F_GUEST_CSUM;
    // bool _guest_tso4 = subset & VIRTIO_NET_F_GUEST_TSO4;
    // bool _host_tso4 = subset & VIRTIO_NET_F_HOST_TSO4;
    // bool _guest_ufo = subset & VIRTIO_NET_F_GUEST_UFO;
    // Get the MAC address set by the host
    assert(subset & VIRTIO_NET_F_MAC);
    struct net_config {
        /* The ring_config defining mac address (if VIRTIO_NET_F_MAC) */
        uint8_t mac[6];
        /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* */
        uint16_t status;
        /* Maximum number of each of transmit and receive queues;
         * see VIRTIO_NET_F_MQ and VIRTIO_NET_CTRL_MQ.
         * Legal values are between 1 and 0x8000
         */
        uint16_t max_virtqueue_pairs;
    } __attribute__((packed)) host_config;
    _virtio.conf_read(&host_config, sizeof(host_config));
    _mac = { host_config.mac[0], host_config.mac[1], host_config.mac[2],
            host_config.mac[3], host_config.mac[4], host_config.mac[5] };
    // Setup notifiers
    _rxq.set_notifier(std::make_unique<notifier_osv>(_virtio, 0));
    _txq.set_notifier(std::make_unique<notifier_osv>(_virtio, 1));
    // Tell the host where we put the rings (we already allocated them earlier)
    _virtio.set_queue_pfn(
            0, virt_to_phys(_rxq.getconfig().descs));
    _virtio.set_queue_pfn(
            1, virt_to_phys(_txq.getconfig().descs));
    // NOTE(review): txq declares no run() member (only rxq does) — this call
    // cannot compile as the classes stand; verify what was intended here.
    _txq.run();
    // Set up interrupts
    // FIXME: in OSv, the first thing we do in the handler is to call
    // _rqx.disable_interrupts(). Here in seastar, we only do it much later
    // in the main engine(). Probably needs to do it like in osv - in the beginning of the handler.
    _virtio.enable_interrupt(
            0, [&] { _rxq.wake_notifier_wait(); } );
    _virtio.enable_interrupt(
            1, [&] { _txq.wake_notifier_wait(); } );
    _virtio.set_driver_ok();
}
#endif
// Create the single queue pair for this device. Only qid 0 is supported,
// and only one instance may ever be created.
std::unique_ptr<net::qp> device::init_local_queue(boost::program_options::variables_map opts, uint16_t qid) {
    static bool called = false;
    assert(!qid);
    assert(!called);
    called = true;
#ifdef HAVE_OSV
    // Prefer a host-assigned virtio device when running under OSv (the
    // address-of-function test checks the weakly-linked symbol exists).
    if (osv::assigned_virtio::get && osv::assigned_virtio::get()) {
        std::cout << "In OSv and assigned host's virtio device\n";
        return std::make_unique<qp_osv>(*osv::assigned_virtio::get(), opts);
    }
#endif
    // Default: vhost-net over a tap device.
    return std::make_unique<qp_vhost>(this, opts);
}
}
// Command-line options understood by the virtio-net backend.
boost::program_options::options_description
get_virtio_net_options_description()
{
    namespace bpo = boost::program_options;
    bpo::options_description opts("Virtio net options");
    // All the feature toggles share the same "on"-defaulted string shape.
    auto on_off = [] { return bpo::value<std::string>()->default_value("on"); };
    opts.add_options()
        ("event-index", on_off(),
                "Enable event-index feature (on / off)")
        ("csum-offload", on_off(),
                "Enable checksum offload feature (on / off)")
        ("tso", on_off(),
                "Enable TCP segment offload feature (on / off)")
        ("ufo", on_off(),
                "Enable UDP fragmentation offload feature (on / off)")
        ("virtio-ring-size",
                bpo::value<unsigned>()->default_value(256),
                "Virtio ring size (must be power-of-two)")
        ;
    return opts;
}
// Factory used by the native stack to instantiate the virtio backend.
std::unique_ptr<net::device> create_virtio_net_device(boost::program_options::variables_map opts) {
    auto dev = std::make_unique<virtio::device>(opts);
    return dev;
}
// Locks the shared object in memory and forces on-load function resolution.
// Needed if the function passed to enable_interrupt() is run at interrupt
// time (OSv recognizes the .note.osv-mlock section at load time).
// TODO: Instead of doing this, _virtio.enable_interrupt() could take a
// pollable to wake instead of a function, then this won't be needed.
asm(".pushsection .note.osv-mlock, \"a\"; .long 0, 0, 0; .popsection");