net: Support TCP checksum offload

This yields a ~5% httpd improvement on monster.

A csum-offload option is added; e.g., to disable checksum offload:

./httpd --network-stack native --csum-offload off
This commit is contained in:
Asias He
2014-09-24 15:21:19 +08:00
committed by Avi Kivity
parent 0fb796a6c4
commit 236418d262
7 changed files with 102 additions and 20 deletions

View File

@@ -90,6 +90,11 @@ public:
throw_system_error_on(r == -1);
return r;
}
// Issues ::ioctl() on _fd, passing `value` itself as the request argument
// (by value, not through a pointer).
// A -1 result from ::ioctl() is reported through throw_system_error_on();
// any other result is returned to the caller unchanged.
int ioctl(int request, unsigned int value) {
    const int rc = ::ioctl(_fd, request, value);
    throw_system_error_on(rc == -1);
    return rc;
}
template <class X>
int ioctl(int request, X& data) {
int r = ::ioctl(_fd, request, &data);

View File

@@ -18,7 +18,8 @@ std::ostream& operator<<(std::ostream& os, ipv4_address a) {
}
ipv4::ipv4(interface* netif)
: _global_arp(netif)
: _netif(netif)
, _global_arp(netif)
, _arp(_global_arp)
, _l3(netif, 0x0800)
, _rx_packets(_l3.receive([this] (packet p, ethernet_address ea) {
@@ -38,10 +39,12 @@ ipv4::handle_received_packet(packet p, ethernet_address from) {
if (!iph) {
return make_ready_future<>();
}
checksummer csum;
csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph));
if (csum.get() != 0) {
return make_ready_future<>();
if (!hw_features().rx_csum_offload) {
checksummer csum;
csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph));
if (csum.get() != 0) {
return make_ready_future<>();
}
}
ntoh(*iph);
// FIXME: process options

View File

@@ -70,6 +70,7 @@ struct ipv4_traits {
template <uint8_t ProtoNum>
class ipv4_l4 {
public:
ipv4& _inet;
public:
ipv4_l4(ipv4& inet) : _inet(inet) {}
@@ -100,6 +101,7 @@ public:
static address_type broadcast_address() { return ipv4_address(0xffffffff); }
static proto_type arp_protocol_type() { return 0x0800; }
private:
interface* _netif;
arp _global_arp;
arp_for<ipv4> _arp;
ipv4_address _host_address;
@@ -118,6 +120,7 @@ public:
void send(ipv4_address to, uint8_t proto_num, packet p);
tcp<ipv4_traits>& get_tcp() { return _tcp._tcp; }
void register_l4(proto_type id, ip_protocol* handler);
// Offload features of the interface this ipv4 instance is bound to;
// simply forwards to _netif.
net::hw_features hw_features() {
    return _netif->hw_features();
}
};
template <uint8_t ProtoNum>

View File

@@ -26,7 +26,8 @@ future<> l3_protocol::send(ethernet_address to, packet p) {
interface::interface(std::unique_ptr<device> dev)
: _dev(std::move(dev))
, _rx(_dev->receive([this] (packet p) { return dispatch_packet(std::move(p)); }))
, _hw_address(_dev->hw_address()) {
, _hw_address(_dev->hw_address())
, _hw_features(_dev->hw_features()) {
}
subscription<packet, ethernet_address>

View File

@@ -20,6 +20,13 @@ class interface;
class device;
class l3_protocol;
// Checksum-offload switches reported by a device.
// Neither flag has a default member initializer, so whoever creates an
// hw_features value is responsible for setting both fields.
struct hw_features {
    bool tx_csum_offload;  // Enable tx checksum offload
    bool rx_csum_offload;  // Enable rx checksum offload
};
class l3_protocol {
interface* _netif;
uint16_t _proto_num;
@@ -42,12 +49,14 @@ class interface {
};
std::unordered_map<uint16_t, l3_rx_stream> _proto_map;
ethernet_address _hw_address;
net::hw_features _hw_features;
private:
future<> dispatch_packet(packet p);
future<> send(uint16_t proto_num, ethernet_address to, packet p);
public:
explicit interface(std::unique_ptr<device> dev);
ethernet_address hw_address() { return _hw_address; }
// Returns a copy of the offload features stored in _hw_features.
net::hw_features hw_features() {
    return _hw_features;
}
subscription<packet, ethernet_address> register_l3(uint16_t proto_num,
std::function<future<> (packet p, ethernet_address from)> next);
friend class l3_protocol;
@@ -59,7 +68,9 @@ public:
virtual subscription<packet> receive(std::function<future<> (packet)> next_packet) = 0;
virtual future<> send(packet p) = 0;
virtual ethernet_address hw_address() = 0;
// Reports this device's offload features (tx/rx checksum offload flags).
virtual net::hw_features hw_features() = 0;
};
}
#endif /* NET_HH_ */

View File

@@ -219,6 +219,7 @@ public:
explicit tcp(inet_type& inet) : _inet(inet) {}
void received(packet p, ipaddr from, ipaddr to);
listener listen(uint16_t port, size_t queue_length = 100);
// Offload features as reported by the network layer reached via _inet._inet.
net::hw_features hw_features() {
    return _inet._inet.hw_features();
}
private:
void send(ipaddr from, ipaddr to, packet p);
void respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip);
@@ -253,11 +254,14 @@ void tcp<InetTraits>::received(packet p, ipaddr from, ipaddr to) {
if (unsigned(th->data_offset * 4) < sizeof(*th)) {
return;
}
checksummer csum;
InetTraits::pseudo_header_checksum(csum, from, to, p.len());
csum.sum(p);
if (csum.get() != 0) {
return;
if (!hw_features().rx_csum_offload) {
checksummer csum;
InetTraits::pseudo_header_checksum(csum, from, to, p.len());
csum.sum(p);
if (csum.get() != 0) {
return;
}
}
// FIXME: process options
p.trim_front(th->data_offset * 4);
@@ -332,8 +336,12 @@ void tcp<InetTraits>::respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr f
checksummer csum;
InetTraits::pseudo_header_checksum(csum, local_ip, foreign_ip, sizeof(*th));
csum.sum(p);
th->checksum = csum.get();
if (hw_features().tx_csum_offload) {
th->checksum = ~csum.get();
} else {
csum.sum(p);
th->checksum = csum.get();
}
send(local_ip, foreign_ip, std::move(p));
}
@@ -536,8 +544,14 @@ void tcp<InetTraits>::tcb::output() {
checksummer csum;
InetTraits::pseudo_header_checksum(csum, _local_ip, _foreign_ip, sizeof(*th) + len);
csum.sum(p);
th->checksum = csum.get();
if (_tcp.hw_features().tx_csum_offload) {
// virtio-net's VIRTIO_NET_F_CSUM feature requires th->checksum to be
// initialized to ones' complement sum of the pseudo header.
th->checksum = ~csum.get();
} else {
csum.sum(p);
th->checksum = csum.get();
}
_tcp.send(_local_ip, _foreign_ip, std::move(p));
}

View File

@@ -15,6 +15,7 @@
#include <linux/vhost.h>
#include <linux/if.h>
#include <linux/if_tun.h>
#include "tcp.hh"
using namespace net;
@@ -399,12 +400,14 @@ private:
std::unique_ptr<char[], free_deleter> _rxq_storage;
boost::program_options::variables_map _opts;
uint64_t _features;
net::hw_features _hw_features;
txq _txq;
rxq _rxq;
stream<packet> _rx_stream;
future<> _rx_ready;
private:
uint64_t setup_features();
void setup_tap_device(sstring tap_device);
vring::config txq_config();
vring::config rxq_config();
void common_config(vring::config& r);
@@ -414,6 +417,7 @@ public:
virtual subscription<packet> receive(std::function<future<> (packet)> next) override;
virtual future<> send(packet p) override;
virtual ethernet_address hw_address() override;
virtual net::hw_features hw_features() override;
};
virtio_net_device::txq::txq(virtio_net_device& dev, vring::config config,
@@ -429,6 +433,20 @@ virtio_net_device::txq::transmit(semaphore& available) {
_tx_queue.pop();
// Linux requires that hdr_len be sane even if gso is disabled.
net_hdr_mrg vhdr = {};
// Handle TCP checksum offload
if (_dev.hw_features().tx_csum_offload) {
// FIXME: No magic numbers
auto hdr = p.get_header<tcp_hdr>(14+ 20);
if (hdr) {
vhdr.needs_csum = 1;
// 14 bytes ethernet header and 20 bytes IP header
vhdr.csum_start = 14 + 20;
// TCP checksum field's offset within the TCP header is 16 bytes
vhdr.csum_offset = 16;
}
}
// prepend virtio-net header
packet q = packet(fragment{reinterpret_cast<char*>(&vhdr), _dev._header_len},
std::move(p));
@@ -496,6 +514,21 @@ virtio_net_device::rxq::prepare_buffers(semaphore& available) {
});
}
// Binds _tap_fd to the TAP interface named `tap_device` and sets the TAP's
// offload flags to match hw_features().
//
// The interface is requested with
// IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR. TUN_F_CSUM is passed to
// TUNSETOFFLOAD only when both tx and rx checksum offload are enabled;
// otherwise offload is explicitly set to 0.
//
// `tap_device` must fit in an IFNAMSIZ buffer including the terminating NUL.
void virtio_net_device::setup_tap_device(sstring tap_device) {
    assert(tap_device.size() + 1 <= IFNAMSIZ);
    ifreq ifr = {};
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR;
    // Bounded copy: the assert above is compiled out under NDEBUG, and an
    // unbounded strcpy() of an over-long name would then write past the end
    // of ifrn_name. ifr is zero-initialized, so copying at most
    // IFNAMSIZ - 1 bytes always leaves the name NUL-terminated.
    strncpy(ifr.ifr_ifrn.ifrn_name, tap_device.c_str(), IFNAMSIZ - 1);
    _tap_fd.ioctl(TUNSETIFF, ifr);

    const auto features = hw_features();
    unsigned int offload = 0;
    if (features.tx_csum_offload && features.rx_csum_offload) {
        offload = TUN_F_CSUM;
    }
    _tap_fd.ioctl(TUNSETOFFLOAD, offload);
}
virtio_net_device::virtio_net_device(sstring tap_device, boost::program_options::variables_map opts, init x)
: _tap_fd(file_desc::open("/dev/net/tun", O_RDWR | O_NONBLOCK))
, _vhost_fd(file_desc::open("/dev/vhost-net", O_RDWR))
@@ -507,11 +540,7 @@ virtio_net_device::virtio_net_device(sstring tap_device, boost::program_options:
, _rxq(*this, rxq_config(), std::move(x._rxq_notify), std::move(x._rxq_kick))
, _rx_stream()
, _rx_ready(_rx_stream.started()) {
assert(tap_device.size() + 1 <= IFNAMSIZ);
ifreq ifr = {};
ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR;
strcpy(ifr.ifr_ifrn.ifrn_name, tap_device.c_str());
_tap_fd.ioctl(TUNSETIFF, ifr);
setup_tap_device(tap_device);
_vhost_fd.ioctl(VHOST_SET_OWNER);
auto mem_table = make_struct_with_vla(&vhost_memory::regions, 1);
mem_table->nregions = 1;
@@ -544,9 +573,18 @@ virtio_net_device::virtio_net_device(sstring tap_device, boost::program_options:
uint64_t virtio_net_device::setup_features() {
int64_t seastar_supported_features = VIRTIO_RING_F_INDIRECT_DESC;
if (!(_opts.count("event-index") && _opts["event-index"].as<std::string>() == "off")) {
seastar_supported_features |= VIRTIO_RING_F_EVENT_IDX;
}
if (!(_opts.count("csum-offload") && _opts["csum-offload"].as<std::string>() == "off")) {
seastar_supported_features |= VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM;
_hw_features.tx_csum_offload = true;
_hw_features.rx_csum_offload = true;
} else {
_hw_features.tx_csum_offload = false;
_hw_features.rx_csum_offload = false;
}
int64_t vhost_supported_features;
_vhost_fd.ioctl(VHOST_GET_FEATURES, vhost_supported_features);
@@ -601,6 +639,10 @@ ethernet_address virtio_net_device::hw_address() {
return {{{ 0x12, 0x23, 0x34, 0x56, 0x67, 0x78 }}};
}
// device interface: hands back a copy of the offload flags held in
// _hw_features.
net::hw_features virtio_net_device::hw_features() {
    net::hw_features features = _hw_features;
    return features;
}
boost::program_options::options_description
get_virtio_net_options_description()
{
@@ -610,6 +652,9 @@ get_virtio_net_options_description()
("event-index",
boost::program_options::value<std::string>()->default_value("on"),
"Enable event-index feature (on / off)")
("csum-offload",
boost::program_options::value<std::string>()->default_value("on"),
"Enable checksum offload feature (on / off)")
;
return opts;
}