Files
scylladb/virtio.cc
Avi Kivity a7930ffcaa net: rework packet class
1. Replace the completion promise<> with a custom deleter class; this
   is lighter weight, and we don't really need the destructor to be
   executed by the scheduler.
2. Add lots of constructors for composing packets from existing packets,
   by appending or prepending packets
3. Over-allocate in some cases to accommodate the common practice of
   prepending protocol headers.
2014-08-28 17:29:27 +03:00

493 lines
16 KiB
C++

/*
* Copyright (C) 2014 Cloudius Systems, Ltd.
*/
#include "virtio.hh"
#include "posix.hh"
#include "vla.hh"
#include "virtio-interface.hh"
#include "reactor.hh"
#include <atomic>
#include <vector>
#include <queue>
#include <fcntl.h>
#include <linux/vhost.h>
#include <linux/if.h>
#include <linux/if_tun.h>
using namespace net;
using phys = uint64_t;
// Round @value up to the next multiple of @alignment.
// @alignment must be a power of two (the bit-mask trick depends on it).
template <typename T>
inline
T align_up(T value, T alignment) {
    auto mask = alignment - 1;
    return (value + mask) & ~mask;
}
// Pointer flavour of align_up(); restricted to byte pointers so pointer
// arithmetic and the integer round-up agree.  @align must be a power of two.
template <typename T>
inline
T* align_up(T* v, size_t align) {
    static_assert(sizeof(T) == 1, "align byte pointers only");
    auto addr = reinterpret_cast<uintptr_t>(v);
    addr = (addr + align - 1) & ~(uintptr_t(align) - 1);
    return reinterpret_cast<T*>(addr);
}
// With the 1:1 vhost memory table set up in the device constructor
// (guest_phys_addr == userspace_addr == 0), a process virtual address
// doubles as the "physical" address we hand to the host.
inline
phys virt_to_phys(void* p) {
    auto addr = reinterpret_cast<uintptr_t>(p);
    return addr;
}
// A single virtio ring (virtqueue): descriptor table + available ring
// (guest to host) + used ring (host to guest), driven by an eventfd pair
// shared with the vhost backend.  Buffers are supplied by a producer
// callback; each buffer carries a promise resolved on host completion.
class vring {
public:
    // Addresses and negotiated features describing the ring layout.
    struct config {
        char* descs;            // descriptor table
        char* avail;            // available (guest->host) ring
        char* used;             // used (host->guest) ring
        unsigned size;          // descriptor count; assumed power of two (see mask())
        bool event_index;       // VIRTIO_RING_F_EVENT_IDX negotiated
        bool indirect;          // VIRTIO_RING_F_INDIRECT_DESC negotiated
        bool mergable_buffers;  // VIRTIO_NET_F_MRG_RXBUF negotiated
    };
    // One guest buffer handed to the host; @completed resolves with the
    // byte count the host reports in the used ring.
    struct buffer {
        phys addr;
        uint32_t len;
        promise<size_t> completed;
        bool writeable;
    };
    using buffer_chain = std::vector<buffer>;
    // provide buffers for the queue, wait on @available to gain buffer space
    using producer_type = future<std::vector<buffer_chain>> (semaphore& available);
private:
    // In-memory descriptor format, as laid out by the virtio spec.
    class desc {
    public:
        struct flags {
            // This marks a buffer as continuing via the next field.
            uint16_t has_next : 1;
            // This marks a buffer as write-only (otherwise read-only).
            uint16_t writeable : 1;
            // This means the buffer contains a list of buffer descriptors.
            uint16_t indirect : 1;
        };
        phys get_paddr();
        uint32_t get_len() { return _len; }
        uint16_t next_idx() { return _next; }
        phys _paddr;
        uint32_t _len;
        flags _flags;
        uint16_t _next;  // index of the next descriptor when has_next is set
    };
    // Guest to host
    struct avail_layout {
        struct flags {
            // Mark that we do not need an interrupt for consuming a descriptor
            // from the ring. Unreliable so it's simply an optimization
            uint16_t no_interrupts : 1;
        };
        std::atomic<uint16_t> _flags;
        // Where we put the next descriptor
        std::atomic<uint16_t> _idx;
        // There may be no more entries than the queue size read from device
        uint16_t _ring[];
        // used event index is an optimization in order to get an interrupt from the host
        // only when the value reaches this number
        // The location of this field is places after the variable length ring array,
        // that's why we cannot fully define it within the struct and use a function accessor
        //std::atomic<uint16_t> used_event;
    };
    struct used_elem {
        // Index of start of used _desc chain. (uint32_t for padding reasons)
        uint32_t _id;
        // Total length of the descriptor chain which was used (written to)
        uint32_t _len;
    };
    // Host to guest
    struct used_layout {
        enum {
            // The Host advise the Guest: don't kick me when
            // you add a buffer. It's unreliable, so it's simply an
            // optimization. Guest will still kick if it's out of buffers.
            no_notify = 1
        };
        bool notifications_disabled() {
            return (_flags.load(std::memory_order_relaxed) & VRING_USED_F_NO_NOTIFY) != 0;
        }
        // Using std::atomic since it being changed by the host
        std::atomic<uint16_t> _flags;
        // Using std::atomic in order to have memory barriers for it
        std::atomic<uint16_t> _idx;
        used_elem _used_elements[];
        // avail event index is an optimization kick the host only when the value reaches this number
        // The location of this field is places after the variable length ring array,
        // that's why we cannot fully define it within the struct and use a function accessor
        //std::atomic<uint16_t> avail_event;
    };
    // Guest-side bookkeeping for the available ring.
    struct avail {
        explicit avail(config conf);
        avail_layout* _shared;
        std::atomic<uint16_t>* _host_notify_on_index = nullptr;
        uint16_t _head = 0;  // next free slot in the shared ring
    };
    // Guest-side bookkeeping for the used ring.
    struct used {
        explicit used(config conf);
        used_layout* _shared;
        std::atomic<uint16_t>* _notify_on_index = nullptr;
        uint16_t _tail = 0;  // first not-yet-reaped used element
    };
private:
    config _config;
    readable_eventfd _notified;   // host -> us: completions pending
    writeable_eventfd _kick;      // us -> host: new buffers available
    std::function<producer_type> _producer;
    // One completion promise per descriptor, indexed by descriptor id.
    std::unique_ptr<promise<size_t>[]> _completions;
    desc* _descs;
    avail _avail;
    used _used;
    // Counts free descriptors; producers wait on it for buffer space.
    semaphore _available_descriptors = { 0 };
    // Head of the free list threaded through desc::_next; -1 when empty.
    // NOTE(review): the sentinel round-trips through the 16-bit _next
    // field, so it reads back as 0xffff after the first pass.
    int _free_desc = -1;
public:
    vring(config conf, readable_eventfd notified, writeable_eventfd kick,
          std::function<producer_type> producer);
    // start the queue
    void run();
    // complete any buffers returned from the host
    void complete();
    // wait for the used ring to have at least @nr buffers
    future<> on_used(size_t nr);
    // Total number of descriptors in ring
    int size() { return _config.size; }
    // Let host know about interrupt delivery
    void disable_interrupts();
    void enable_interrupts();
private:
    size_t mask() { return size() - 1; }
    size_t masked(size_t idx) { return idx & mask(); }
    size_t available();
    unsigned allocate_desc();
    void free_desc(unsigned id);
    void setup();
};
// Bind the guest->host ring view to its slice of the shared queue storage.
vring::avail::avail(config cfg) {
    _shared = reinterpret_cast<avail_layout*>(cfg.avail);
}
// Bind the host->guest ring view to its slice of the shared queue storage.
vring::used::used(config cfg) {
    _shared = reinterpret_cast<used_layout*>(cfg.used);
}
// Pop the head of the free list threaded through desc::_next.
//
// The empty-list terminator is -1, but free_desc() stores it into the
// 16-bit desc::_next field, so once the list has cycled it reads back as
// 0xffff rather than -1.  Check both forms; otherwise exhaustion would
// silently index _descs[0xffff] out of bounds.  (0xffff is never a valid
// descriptor id: virtio queue sizes are well below 65535.)
inline
unsigned vring::allocate_desc() {
    assert(_free_desc != -1 && _free_desc != 0xffff);
    auto desc = _free_desc;
    _free_desc = _descs[desc]._next;
    return desc;
}
// Return descriptor @id to the free list and release one unit of
// descriptor space to any waiting producer.
inline
void vring::free_desc(unsigned id) {
    // NOTE: _next is uint16_t, so the initial -1 list terminator is
    // stored as 0xffff.
    _descs[id]._next = _free_desc;
    _free_desc = id;
    _available_descriptors.signal();
}
// Wire the ring over the caller-provided shared memory (@conf) and
// eventfd pair, allocate one completion promise per descriptor, then
// build the descriptor free list via setup().
vring::vring(config conf, readable_eventfd notified, writeable_eventfd kick,
             std::function<producer_type> producer)
    : _config(conf)
    , _notified(std::move(notified))
    , _kick(std::move(kick))
    , _producer(std::move(producer))
    , _completions(new promise<size_t>[_config.size])
    , _descs(reinterpret_cast<desc*>(conf.descs))
    , _avail(conf)
    , _used(conf)
{
    setup();
}
// Thread every descriptor onto the free list.  Each free_desc() call also
// signals _available_descriptors, so this brings the semaphore up to the
// full ring size as a side effect.
void vring::setup() {
    unsigned i = 0;
    while (i < _config.size) {
        free_desc(i++);
    }
}
// Pull one batch of buffer chains from the producer, translate each chain
// into a linked descriptor chain, publish the chain heads on the available
// ring, and kick the host.  Re-arms itself, so the queue keeps running for
// as long as the producer resolves.
void vring::run() {
    _producer(_available_descriptors).then([this] (std::vector<buffer_chain> vbc) {
        for (auto&& bc: vbc) {
            bool has_prev = false;
            unsigned prev_desc_idx = 0;
            // Build the chain back-to-front so each descriptor can link
            // (via _next) to the already-built descriptor that follows it.
            for (auto i = bc.rbegin(); i != bc.rend(); ++i) {
                unsigned desc_idx = allocate_desc();
                desc& d = _descs[desc_idx];
                d._flags = {};
                d._flags.writeable = i->writeable;
                d._flags.has_next = has_prev;
                d._next = prev_desc_idx;
                d._paddr = i->addr;
                d._len = i->len;
                prev_desc_idx = desc_idx;
                // Fixed: has_prev was never set, so has_next stayed false on
                // every descriptor and the host would have seen only the
                // head buffer of a multi-buffer chain.  All descriptors
                // except the tail (built first) must carry has_next.
                has_prev = true;
                _completions[desc_idx] = std::move(i->completed);
            }
            auto desc_head = prev_desc_idx;
            _avail._shared->_ring[masked(_avail._head++)] = desc_head;
        }
        // Release store: the host must observe fully-written descriptors
        // before it sees the new available index.
        _avail._shared->_idx.store(_avail._head, std::memory_order_release);
        _kick.signal(1);
        complete();
        run();
    });
}
// Reap completed entries from the used ring: fulfil each buffer's
// completion promise with the byte count the host wrote, recycle the
// descriptor, then re-arm on the notification eventfd.
void vring::complete() {
    // Acquire load pairs with the host's publication of used elements.
    auto used_head = _used._shared->_idx.load(std::memory_order_acquire);
    while (used_head != _used._tail) {
        auto ue = _used._shared->_used_elements[masked(_used._tail++)];
        //auto& d = _descs[ue._id];
        _completions[ue._id].set_value(ue._len);
        // Only the head descriptor is freed here; see FIXME below.
        free_desc(ue._id);
        // FIXME: free buffers? length? chains?
    }
    // NOTE(review): complete() is invoked both from run() and from this
    // continuation, so more than one wait may be registered on _notified
    // concurrently — verify readable_eventfd::wait() tolerates that.
    _notified.wait().then([this] (size_t ignore) {
        complete();
    });
}
// vhost-net backed network device: a tap fd bridged to a pair of virtio
// rings (ring 0 = rx, ring 1 = tx) serviced by the kernel vhost thread.
class virtio_net_device : public net::device {
    // Pre-creates the four eventfds so their raw fds can be handed to the
    // vhost ioctls while the C++ wrappers are moved into the vrings.
    struct init {
        readable_eventfd _txq_notify;
        writeable_eventfd _txq_kick;
        readable_eventfd _rxq_notify;
        writeable_eventfd _rxq_kick;
        int _txq_notify_fd;
        int _txq_kick_fd;
        int _rxq_notify_fd;
        int _rxq_kick_fd;
        init() {
            _txq_notify_fd = _txq_notify.get_write_fd();
            _txq_kick_fd = _txq_kick.get_read_fd();
            _rxq_notify_fd = _rxq_notify.get_write_fd();
            // Fixed: this previously read _txq_kick's fd (copy-paste bug),
            // which would make vhost poll the tx kick eventfd for rx kicks,
            // so the host would never be woken for new rx buffers.
            _rxq_kick_fd = _rxq_kick.get_read_fd();
        }
    };
    // Transmit queue: a vring whose producer drains _tx_queue.
    class txq {
        vring _ring;
    public:
        txq(vring::config config, readable_eventfd notified, writeable_eventfd kicked);
        void run() { _ring.run(); }
        future<> post(packet p);
    private:
        future<std::vector<vring::buffer_chain>> transmit(semaphore& available);
        std::queue<packet> _tx_queue;
        semaphore _tx_queue_length = { 0 };
    };
    // Receive queue: a vring whose producer replenishes host-writeable
    // buffers; completed buffers become packets queued on the device.
    class rxq {
        virtio_net_device& _dev;
        vring _ring;
        size_t _header_len = 10; // adjust for mrg_buf
    public:
        rxq(virtio_net_device& _if,
            vring::config config, readable_eventfd notified, writeable_eventfd kicked);
        void run() { _ring.run(); }
    private:
        future<std::vector<vring::buffer_chain>> prepare_buffers(semaphore& available);
        void received(char* buffer);
    };
private:
    file_desc _tap_fd;
    file_desc _vhost_fd;
    std::unique_ptr<char[], free_deleter> _txq_storage;
    std::unique_ptr<char[], free_deleter> _rxq_storage;
    txq _txq;
    rxq _rxq;
    semaphore _rx_queue_length = { 0 };
    std::queue<packet> _rx_queue;
private:
    vring::config txq_config();
    vring::config rxq_config();
    void queue_rx_packet(packet p);
public:
    explicit virtio_net_device(sstring tap_device, init x = init());
    virtual future<packet> receive() override;
    virtual future<> send(packet p) override;
};
// Construct the tx ring with transmit() installed as its buffer producer.
virtio_net_device::txq::txq(vring::config config, readable_eventfd notified, writeable_eventfd kicked)
    : _ring(config, std::move(notified), std::move(kicked),
            [this] (semaphore& sem) { return transmit(sem); }) {
}
// vring producer for the tx queue: wait for a queued packet and wrap its
// first fragment in a single-buffer chain for the host to read.
// NOTE(review): @available (the descriptor-space semaphore) is never
// waited on here, so the tx path could in principle outrun the free
// descriptor supply — confirm against allocate_desc()'s assert.
future<std::vector<vring::buffer_chain>>
virtio_net_device::txq::transmit(semaphore& available) {
    return _tx_queue_length.wait().then([this] {
        auto p = std::move(_tx_queue.front());
        _tx_queue.pop();
        std::vector<vring::buffer_chain> vbc;
        vring::buffer_chain bc;
        vring::buffer b;
        // dirty hack: assume there is a header there instead of allocating
        // it ourself
        // (10 presumably == sizeof(virtio_net_hdr) without mergeable
        // buffers; assumes the packet has that much headroom before
        // fragment 0 — TODO confirm at the call sites)
        b.addr = virt_to_phys(p.fragments[0].base - 10);
        b.len = p.fragments[0].size + 10;
        b.writeable = false;
        // schedule packet destruction
        b.completed.get_future().then([p = std::move(p)] (size_t) {});
        bc.push_back(std::move(b));
        vbc.push_back(std::move(bc));
        return make_ready_future<std::vector<vring::buffer_chain>>(std::move(vbc));
    });
}
// Enqueue a packet for transmission; the transmit() producer picks it up.
// Completes immediately — the returned future does not track the wire.
future<>
virtio_net_device::txq::post(packet p) {
    _tx_queue.emplace(std::move(p));
    _tx_queue_length.signal(1);
    return make_ready_future<>(); // FIXME: queue bounds
}
// Construct the rx ring with prepare_buffers() installed as its producer;
// keep a back-reference to the device for delivering received packets.
virtio_net_device::rxq::rxq(virtio_net_device& netif,
                            vring::config config, readable_eventfd notified, writeable_eventfd kicked)
    : _dev(netif), _ring(config, std::move(notified), std::move(kicked),
                         [this] (semaphore& sem) { return prepare_buffers(sem); }) {
}
// vring producer for the rx queue: allocate one 4K host-writeable buffer
// per free descriptor, batching opportunistically, and hand the chains to
// the ring.
future<std::vector<vring::buffer_chain>>
virtio_net_device::rxq::prepare_buffers(semaphore& available) {
    return available.wait(1).then([this, &available] {
        // Grab whatever extra descriptor credits are free right now so a
        // single pass can replenish many buffers.
        unsigned count = 1;
        auto opportunistic = available.current();
        if (available.try_wait(opportunistic)) {
            count += opportunistic;
        }
        std::vector<vring::buffer_chain> ret;
        ret.reserve(count);
        for (unsigned i = 0; i < count; ++i) {
            vring::buffer_chain bc;
            std::unique_ptr<char[]> buf(new char[4096]);
            vring::buffer b;
            b.addr = virt_to_phys(buf.get());
            b.len = 4096;
            b.writeable = true;
            // On host completion: strip the virtio-net header and deliver
            // the payload; the packet's deleter frees the raw buffer.
            // NOTE(review): assumes len >= _header_len — confirm the host
            // never completes a shorter buffer.
            b.completed.get_future().then([this, buf = buf.get()] (size_t len) {
                packet p(fragment{buf + _header_len, len - _header_len},
                         [buf] { delete[] buf; });
                _dev.queue_rx_packet(std::move(p));
            });
            bc.push_back(std::move(b));
            // Ownership was transferred to the completion continuation above.
            buf.release();
            ret.push_back(std::move(bc));
        }
        return make_ready_future<std::vector<vring::buffer_chain>>(std::move(ret));
    });
}
// Open the tap and vhost-net devices, map the process address space 1:1
// into the vhost memory table (so virt_to_phys() can be the identity),
// configure both rings, attach them to the tap backend, and start them.
virtio_net_device::virtio_net_device(sstring tap_device, init x)
    : _tap_fd(file_desc::open("/dev/net/tun", O_RDWR | O_NONBLOCK))
    , _vhost_fd(file_desc::open("/dev/vhost-net", O_RDWR))
    , _txq_storage(allocate_aligned_buffer<char>(3*4096, 4096))
    , _rxq_storage(allocate_aligned_buffer<char>(3*4096, 4096))
    , _txq(txq_config(), std::move(x._txq_notify), std::move(x._txq_kick))
    , _rxq(*this, rxq_config(), std::move(x._rxq_notify), std::move(x._rxq_kick)) {
    assert(tap_device.size() + 1 <= IFNAMSIZ);
    // Attach to the tap interface; IFF_VNET_HDR makes the kernel
    // prepend/consume a virtio-net header on each packet.
    ifreq ifr = {};
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR;
    strcpy(ifr.ifr_ifrn.ifrn_name, tap_device.c_str());
    _tap_fd.ioctl(TUNSETIFF, ifr);
    _vhost_fd.ioctl(VHOST_SET_OWNER);
    // Single region covering [0, 2^47): guest-physical == userspace-virtual.
    auto mem_table = make_struct_with_vla(&vhost_memory::regions, 1);
    mem_table->nregions = 1;
    auto& region = mem_table->regions[0];
    region.guest_phys_addr = 0;
    region.memory_size = (size_t(1) << 47) - 4096;
    region.userspace_addr = 0;
    region.flags_padding = 0;
    _vhost_fd.ioctl(VHOST_SET_MEM_TABLE, *mem_table);
    uint64_t features =
        /* VIRTIO_RING_F_EVENT_IDX
        | */ VIRTIO_RING_F_INDIRECT_DESC
        /* | VIRTIO_NET_F_MRG_RXBUF */;
    _vhost_fd.ioctl(VHOST_SET_FEATURES, features);
    // Ring 0 is rx, ring 1 is tx; both 256 descriptors deep, matching
    // txq_config()/rxq_config().
    vhost_vring_state vvs0 = { 0, 256 };
    _vhost_fd.ioctl(VHOST_SET_VRING_NUM, vvs0);
    vhost_vring_state vvs1 = { 1, 256 };
    _vhost_fd.ioctl(VHOST_SET_VRING_NUM, vvs1);
    auto tov = [](char* x) { return reinterpret_cast<uintptr_t>(x); };
    _vhost_fd.ioctl(VHOST_SET_VRING_ADDR, vhost_vring_addr{
        0, 0, tov(rxq_config().descs), tov(rxq_config().used), tov(rxq_config().avail), 0
    });
    _vhost_fd.ioctl(VHOST_SET_VRING_ADDR, vhost_vring_addr{
        1, 0, tov(txq_config().descs), tov(txq_config().used), tov(txq_config().avail), 0
    });
    // Kick fds wake the host when we publish buffers; call fds are how the
    // host notifies us of completions.
    _vhost_fd.ioctl(VHOST_SET_VRING_KICK, vhost_vring_file{0, x._rxq_kick_fd});
    _vhost_fd.ioctl(VHOST_SET_VRING_CALL, vhost_vring_file{0, x._rxq_notify_fd});
    _vhost_fd.ioctl(VHOST_SET_VRING_KICK, vhost_vring_file{1, x._txq_kick_fd});
    _vhost_fd.ioctl(VHOST_SET_VRING_CALL, vhost_vring_file{1, x._txq_notify_fd});
    _vhost_fd.ioctl(VHOST_NET_SET_BACKEND, vhost_vring_file{0, _tap_fd.get()});
    _vhost_fd.ioctl(VHOST_NET_SET_BACKEND, vhost_vring_file{1, _tap_fd.get()});
    _txq.run();
    _rxq.run();
}
// Lay out the tx ring inside _txq_storage: 16-byte descriptors first,
// then the available ring, then the used ring on the next page boundary.
vring::config virtio_net_device::txq_config() {
    constexpr int size = 256;
    vring::config cfg;
    cfg.descs = _txq_storage.get();
    cfg.avail = cfg.descs + 16 * size;
    cfg.used = align_up(cfg.avail + 2 * size + 6, 4096);
    cfg.size = size;
    cfg.event_index = false;
    cfg.indirect = false;
    cfg.mergable_buffers = false;
    return cfg;
}
// Lay out the rx ring inside _rxq_storage; same geometry as the tx ring,
// but flagged for mergeable buffers.
vring::config virtio_net_device::rxq_config() {
    constexpr int size = 256;
    vring::config cfg;
    cfg.descs = _rxq_storage.get();
    cfg.avail = cfg.descs + 16 * size;
    cfg.used = align_up(cfg.avail + 2 * size + 6, 4096);
    cfg.size = size;
    cfg.event_index = false;
    cfg.indirect = false;
    cfg.mergable_buffers = true;
    return cfg;
}
// Deliver the next received packet, waiting until the rx completion path
// queues one via queue_rx_packet().
future<packet>
virtio_net_device::receive() {
    return _rx_queue_length.wait().then([this] {
        auto pkt = std::move(_rx_queue.front());
        _rx_queue.pop();
        return make_ready_future<packet>(std::move(pkt));
    });
}
// Hand the packet to the tx queue.  The returned future resolves as soon
// as the packet is queued (post() returns a ready future), not when it
// hits the wire.
future<>
virtio_net_device::send(packet p) {
    return _txq.post(std::move(p));
}
// Called from the rx ring's completion continuation: stash the packet and
// wake one receive() waiter.
void virtio_net_device::queue_rx_packet(packet p) {
    _rx_queue.emplace(std::move(p));
    _rx_queue_length.signal(1);
}
// Public factory: hides the concrete virtio_net_device type behind the
// net::device interface.
std::unique_ptr<net::device> create_virtio_net_device(sstring tap_device) {
    std::unique_ptr<net::device> dev = std::make_unique<virtio_net_device>(tap_device);
    return dev;
}