diff --git a/core/posix.hh b/core/posix.hh index eb27cdaab1..8fe72bba36 100644 --- a/core/posix.hh +++ b/core/posix.hh @@ -90,6 +90,11 @@ public: throw_system_error_on(r == -1); return r; } + int ioctl(int request, unsigned int value) { + int r = ::ioctl(_fd, request, value); + throw_system_error_on(r == -1); + return r; + } template int ioctl(int request, X& data) { int r = ::ioctl(_fd, request, &data); diff --git a/net/ip.cc b/net/ip.cc index 3a9a6da23d..bc79330986 100644 --- a/net/ip.cc +++ b/net/ip.cc @@ -18,7 +18,8 @@ std::ostream& operator<<(std::ostream& os, ipv4_address a) { } ipv4::ipv4(interface* netif) - : _global_arp(netif) + : _netif(netif) + , _global_arp(netif) , _arp(_global_arp) , _l3(netif, 0x0800) , _rx_packets(_l3.receive([this] (packet p, ethernet_address ea) { @@ -38,10 +39,12 @@ ipv4::handle_received_packet(packet p, ethernet_address from) { if (!iph) { return make_ready_future<>(); } - checksummer csum; - csum.sum(reinterpret_cast(iph), sizeof(*iph)); - if (csum.get() != 0) { - return make_ready_future<>(); + if (!hw_features().rx_csum_offload) { + checksummer csum; + csum.sum(reinterpret_cast(iph), sizeof(*iph)); + if (csum.get() != 0) { + return make_ready_future<>(); + } } ntoh(*iph); // FIXME: process options diff --git a/net/ip.hh b/net/ip.hh index a606cd05bc..538d57d4bb 100644 --- a/net/ip.hh +++ b/net/ip.hh @@ -70,6 +70,7 @@ struct ipv4_traits { template class ipv4_l4 { +public: ipv4& _inet; public: ipv4_l4(ipv4& inet) : _inet(inet) {} @@ -100,6 +101,7 @@ public: static address_type broadcast_address() { return ipv4_address(0xffffffff); } static proto_type arp_protocol_type() { return 0x0800; } private: + interface* _netif; arp _global_arp; arp_for _arp; ipv4_address _host_address; @@ -118,6 +120,7 @@ public: void send(ipv4_address to, uint8_t proto_num, packet p); tcp& get_tcp() { return _tcp._tcp; } void register_l4(proto_type id, ip_protocol* handler); + net::hw_features hw_features() { return _netif->hw_features(); } }; template 
diff --git a/net/net.cc b/net/net.cc index bff25a23ca..df9f366790 100644 --- a/net/net.cc +++ b/net/net.cc @@ -26,7 +26,8 @@ future<> l3_protocol::send(ethernet_address to, packet p) { interface::interface(std::unique_ptr dev) : _dev(std::move(dev)) , _rx(_dev->receive([this] (packet p) { return dispatch_packet(std::move(p)); })) - , _hw_address(_dev->hw_address()) { + , _hw_address(_dev->hw_address()) + , _hw_features(_dev->hw_features()) { } subscription diff --git a/net/net.hh b/net/net.hh index 9384c0df7a..7253977db4 100644 --- a/net/net.hh +++ b/net/net.hh @@ -20,6 +20,13 @@ class interface; class device; class l3_protocol; +struct hw_features { + // Enable tx checksum offload + bool tx_csum_offload; + // Enable rx checksum offload + bool rx_csum_offload; +}; + class l3_protocol { interface* _netif; uint16_t _proto_num; @@ -42,12 +49,14 @@ class interface { }; std::unordered_map _proto_map; ethernet_address _hw_address; + net::hw_features _hw_features; private: future<> dispatch_packet(packet p); future<> send(uint16_t proto_num, ethernet_address to, packet p); public: explicit interface(std::unique_ptr dev); ethernet_address hw_address() { return _hw_address; } + net::hw_features hw_features() { return _hw_features; } subscription register_l3(uint16_t proto_num, std::function (packet p, ethernet_address from)> next); friend class l3_protocol; @@ -59,7 +68,9 @@ public: virtual subscription receive(std::function (packet)> next_packet) = 0; virtual future<> send(packet p) = 0; virtual ethernet_address hw_address() = 0; + virtual net::hw_features hw_features() = 0; }; + } #endif /* NET_HH_ */ diff --git a/net/tcp.hh b/net/tcp.hh index ef4d97385c..260d7e7782 100644 --- a/net/tcp.hh +++ b/net/tcp.hh @@ -219,6 +219,7 @@ public: explicit tcp(inet_type& inet) : _inet(inet) {} void received(packet p, ipaddr from, ipaddr to); listener listen(uint16_t port, size_t queue_length = 100); + net::hw_features hw_features() { return _inet._inet.hw_features(); } private: void 
send(ipaddr from, ipaddr to, packet p); void respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip); @@ -253,11 +254,14 @@ void tcp::received(packet p, ipaddr from, ipaddr to) { if (unsigned(th->data_offset * 4) < sizeof(*th)) { return; } - checksummer csum; - InetTraits::pseudo_header_checksum(csum, from, to, p.len()); - csum.sum(p); - if (csum.get() != 0) { - return; + + if (!hw_features().rx_csum_offload) { + checksummer csum; + InetTraits::pseudo_header_checksum(csum, from, to, p.len()); + csum.sum(p); + if (csum.get() != 0) { + return; + } } // FIXME: process options p.trim_front(th->data_offset * 4); @@ -332,8 +336,12 @@ void tcp::respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr f checksummer csum; InetTraits::pseudo_header_checksum(csum, local_ip, foreign_ip, sizeof(*th)); - csum.sum(p); - th->checksum = csum.get(); + if (hw_features().tx_csum_offload) { + th->checksum = ~csum.get(); + } else { + csum.sum(p); + th->checksum = csum.get(); + } send(local_ip, foreign_ip, std::move(p)); } @@ -536,8 +544,14 @@ void tcp::tcb::output() { checksummer csum; InetTraits::pseudo_header_checksum(csum, _local_ip, _foreign_ip, sizeof(*th) + len); - csum.sum(p); - th->checksum = csum.get(); + if (_tcp.hw_features().tx_csum_offload) { + // virtio-net's VIRTIO_NET_F_CSUM feature requires th->checksum to be + // initialized to ones' complement sum of the pseudo header. 
+ th->checksum = ~csum.get(); + } else { + csum.sum(p); + th->checksum = csum.get(); + } _tcp.send(_local_ip, _foreign_ip, std::move(p)); } diff --git a/net/virtio.cc b/net/virtio.cc index 01d3aa9b2b..bdd1d63f96 100644 --- a/net/virtio.cc +++ b/net/virtio.cc @@ -15,6 +15,7 @@ #include #include #include +#include "tcp.hh" using namespace net; @@ -399,12 +400,14 @@ private: std::unique_ptr _rxq_storage; boost::program_options::variables_map _opts; uint64_t _features; + net::hw_features _hw_features; txq _txq; rxq _rxq; stream _rx_stream; future<> _rx_ready; private: uint64_t setup_features(); + void setup_tap_device(sstring tap_device); vring::config txq_config(); vring::config rxq_config(); void common_config(vring::config& r); @@ -414,6 +417,7 @@ public: virtual subscription receive(std::function (packet)> next) override; virtual future<> send(packet p) override; virtual ethernet_address hw_address() override; + virtual net::hw_features hw_features() override; }; virtio_net_device::txq::txq(virtio_net_device& dev, vring::config config, @@ -429,6 +433,20 @@ virtio_net_device::txq::transmit(semaphore& available) { _tx_queue.pop(); // Linux requires that hdr_len be sane even if gso is disabled. 
net_hdr_mrg vhdr = {}; + + // Handle TCP checksum offload + if (_dev.hw_features().tx_csum_offload) { + // FIXME: No magic numbers + auto hdr = p.get_header(14+ 20); + if (hdr) { + vhdr.needs_csum = 1; + // 14 bytes ethernet header and 20 bytes IP header + vhdr.csum_start = 14 + 20; + // TCP checksum field's offset within the TCP header is 16 bytes + vhdr.csum_offset = 16; + } + } + // prepend virtio-net header packet q = packet(fragment{reinterpret_cast(&vhdr), _dev._header_len}, std::move(p)); @@ -496,6 +514,21 @@ virtio_net_device::rxq::prepare_buffers(semaphore& available) { }); } +void virtio_net_device::setup_tap_device(sstring tap_device) { + assert(tap_device.size() + 1 <= IFNAMSIZ); + + ifreq ifr = {}; + ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR; + strcpy(ifr.ifr_ifrn.ifrn_name, tap_device.c_str()); + _tap_fd.ioctl(TUNSETIFF, ifr); + + unsigned int offload = 0; + if (hw_features().tx_csum_offload && hw_features().rx_csum_offload) { + offload = TUN_F_CSUM; + } + _tap_fd.ioctl(TUNSETOFFLOAD, offload); +} + virtio_net_device::virtio_net_device(sstring tap_device, boost::program_options::variables_map opts, init x) : _tap_fd(file_desc::open("/dev/net/tun", O_RDWR | O_NONBLOCK)) , _vhost_fd(file_desc::open("/dev/vhost-net", O_RDWR)) @@ -507,11 +540,7 @@ virtio_net_device::virtio_net_device(sstring tap_device, boost::program_options: , _rxq(*this, rxq_config(), std::move(x._rxq_notify), std::move(x._rxq_kick)) , _rx_stream() , _rx_ready(_rx_stream.started()) { - assert(tap_device.size() + 1 <= IFNAMSIZ); - ifreq ifr = {}; - ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR; - strcpy(ifr.ifr_ifrn.ifrn_name, tap_device.c_str()); - _tap_fd.ioctl(TUNSETIFF, ifr); + setup_tap_device(tap_device); _vhost_fd.ioctl(VHOST_SET_OWNER); auto mem_table = make_struct_with_vla(&vhost_memory::regions, 1); mem_table->nregions = 1; @@ -544,9 +573,18 @@ virtio_net_device::virtio_net_device(sstring tap_device, boost::program_options: 
uint64_t virtio_net_device::setup_features() { int64_t seastar_supported_features = VIRTIO_RING_F_INDIRECT_DESC; + if (!(_opts.count("event-index") && _opts["event-index"].as() == "off")) { seastar_supported_features |= VIRTIO_RING_F_EVENT_IDX; } + if (!(_opts.count("csum-offload") && _opts["csum-offload"].as() == "off")) { + seastar_supported_features |= VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM; + _hw_features.tx_csum_offload = true; + _hw_features.rx_csum_offload = true; + } else { + _hw_features.tx_csum_offload = false; + _hw_features.rx_csum_offload = false; + } int64_t vhost_supported_features; _vhost_fd.ioctl(VHOST_GET_FEATURES, vhost_supported_features); @@ -601,6 +639,10 @@ ethernet_address virtio_net_device::hw_address() { return {{{ 0x12, 0x23, 0x34, 0x56, 0x67, 0x78 }}}; } +net::hw_features virtio_net_device::hw_features() { + return _hw_features; +} + boost::program_options::options_description get_virtio_net_options_description() { @@ -610,6 +652,9 @@ get_virtio_net_options_description() ("event-index", boost::program_options::value()->default_value("on"), "Enable event-index feature (on / off)") + ("csum-offload", + boost::program_options::value()->default_value("on"), + "Enable checksum offload feature (on / off)") ; return opts; }