diff --git a/configure.py b/configure.py index 16687e05e1..c5025220dd 100755 --- a/configure.py +++ b/configure.py @@ -27,12 +27,6 @@ def get_flags(): if line.rstrip('\n').startswith('flags'): return re.sub(r'^flags\s+: ', '', line).split() -def has_avx(): - return 'avx' in get_flags() - -def has_avx2(): - return 'avx2' in get_flags() - def add_tristate(arg_parser, name, dest, help): arg_parser.add_argument('--enable-' + name, dest = dest, action = 'store_true', default = None, help = 'Enable ' + help) @@ -51,6 +45,35 @@ def apply_tristate(var, test, note, missing): return False return False +# +# dpdk_cflags - fetch the DPDK specific CFLAGS +# +# Run a simple makefile that "includes" the DPDK main makefile and prints the +# MACHINE_CFLAGS value +# +def dpdk_cflags (dpdk_target): + with tempfile.NamedTemporaryFile() as sfile: + dpdk_target = os.path.abspath(dpdk_target) + dpdk_target = re.sub(r'\/+$', '', dpdk_target) + dpdk_sdk_path = os.path.dirname(dpdk_target) + dpdk_target_name = os.path.basename(dpdk_target) + dpdk_arch = dpdk_target_name.split('-')[0] + + sfile.file.write(bytes('include ' + dpdk_sdk_path + '/mk/rte.vars.mk' + "\n", 'utf-8')) + sfile.file.write(bytes('all:' + "\n\t", 'utf-8')) + sfile.file.write(bytes('@echo $(MACHINE_CFLAGS)' + "\n", 'utf-8')) + sfile.file.flush() + + dpdk_cflags = subprocess.check_output(['make', '-f', sfile.name, + 'RTE_SDK=' + dpdk_sdk_path, + 'RTE_TARGET=' + dpdk_target_name, + 'RTE_ARCH=' + dpdk_arch]) + dpdk_cflags_str = dpdk_cflags.decode('utf-8') + dpdk_cflags_str = re.sub(r'\n+$', '', dpdk_cflags_str) + dpdk_cflags_final = '' + + return dpdk_cflags_str + def try_compile(compiler, source = '', flags = []): with tempfile.NamedTemporaryFile() as sfile: sfile.file.write(bytes(source, 'utf-8')) @@ -292,10 +315,9 @@ if args.with_osv: if args.dpdk_target: args.user_cflags = (args.user_cflags + - ' -DHAVE_DPDK -I' + - args.dpdk_target + '/include -Wno-error=literal-suffix -Wno-literal-suffix -Wno-invalid-offsetof -m64' + - ' -mavx' if has_avx() else '' + - ' -mavx2' if has_avx2() else '') + ' -DHAVE_DPDK -I' + args.dpdk_target + '/include ' + + dpdk_cflags(args.dpdk_target) + + ' -Wno-error=literal-suffix -Wno-literal-suffix -Wno-invalid-offsetof') libs += (' -L' + args.dpdk_target + '/lib ' + '-Wl,--whole-archive -lrte_pmd_bond -lrte_pmd_vmxnet3_uio -lrte_pmd_virtio_uio -lrte_pmd_i40e -lrte_pmd_ixgbe -lrte_pmd_e1000 -lrte_pmd_ring -Wl,--no-whole-archive -lrte_distributor -lrte_kni -lrte_pipeline -lrte_table -lrte_port -lrte_timer -lrte_hash -lrte_lpm -lrte_power -lrte_acl -lrte_meter -lrte_sched -lrte_kvargs -lrte_mbuf -lrte_ip_frag -lethdev -lrte_eal -lrte_malloc -lrte_mempool -lrte_ring -lrte_cmdline -lrte_cfgfile -lrt -lm -ldl') diff --git a/net/dpdk.cc b/net/dpdk.cc index 17535e8245..f423891a94 100644 --- a/net/dpdk.cc +++ b/net/dpdk.cc @@ -168,6 +168,7 @@ class dpdk_device : public device { net::hw_features _hw_features; uint8_t _queues_ready = 0; unsigned _home_cpu; + bool _use_lro; std::vector _redir_table; #ifdef RTE_VERSION_1_7 struct rte_eth_rxconf _rx_conf_default = {}; @@ -214,10 +215,11 @@ private: void check_port_link_status(); public: - dpdk_device(uint8_t port_idx, uint16_t num_queues) + dpdk_device(uint8_t port_idx, uint16_t num_queues, bool use_lro) : _port_idx(port_idx) , _num_queues(num_queues) - , _home_cpu(engine().cpu_id()) { + , _home_cpu(engine().cpu_id()) + , _use_lro(use_lro) { /* now initialise the port we will use */ int ret = init_port_start(); @@ -235,6 +237,8 @@ public: return _hw_features; } + net::hw_features& hw_features_ref() { return _hw_features; } + const rte_eth_rxconf* def_rx_conf() const { #ifdef RTE_VERSION_1_7 return &_rx_conf_default; @@ -1043,12 +1047,22 @@ private: */ packet from_mbuf(rte_mbuf* m); + /** + * Transform an LRO rte_mbuf cluster into the "packet" object. + * @param m HEAD of the mbufs' cluster to transform + * + * @return a "packet" object representing the newly received LRO packet. + */ + packet from_mbuf_lro(rte_mbuf* m); + private: dpdk_device* _dev; uint8_t _qid; rte_mempool *_pktmbuf_pool_rx; std::vector _rx_free_pkts; std::vector _rx_free_bufs; + std::vector _frags; + std::vector _bufs; size_t _num_rx_free_segs = 0; reactor::poller _rx_gc_poller; std::unique_ptr _rx_xmem; @@ -1133,6 +1147,19 @@ int dpdk_device::init_port_start() port_conf.rxmode.hw_vlan_strip = 1; } + // Enable HW CRC stripping + port_conf.rxmode.hw_strip_crc = 1; + +#ifdef RTE_ETHDEV_HAS_LRO_SUPPORT + // Enable LRO + if (_use_lro && (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO)) { + printf("LRO is on\n"); + port_conf.rxmode.enable_lro = 1; + _hw_features.rx_lro = true; + } else +#endif + printf("LRO is off\n"); + // Check that all CSUM features are either all set all together or not set // all together. If this assumption breaks we need to rework the below logic // by splitting the csum offload feature bit into separate bits for IPv4, @@ -1413,47 +1440,112 @@ void dpdk_qp::rx_start() { } template<> -inline packet dpdk_qp::from_mbuf(rte_mbuf* m) +inline packet dpdk_qp::from_mbuf_lro(rte_mbuf* m) { // - // Try to allocate a buffer for packet's data. If we fail - give the - // application an mbuf itself. If we succeed - copy the data into this - // buffer, create a packet based on this buffer and return the mbuf to its - // pool. + // Try to allocate a buffer for the whole packet's data. + // If we fail - construct the packet from mbufs. + // If we succeed - copy the data into this buffer, create a packet based on + // this buffer and return the mbuf to its pool. // - auto len = rte_pktmbuf_data_len(m); - char* buf = (char*)malloc(len); + auto pkt_len = rte_pktmbuf_pkt_len(m); + char* buf = (char*)malloc(pkt_len); if (!buf) { - fragment f{rte_pktmbuf_mtod(m, char*), len}; - return packet(f, make_deleter(deleter(), [m] { rte_pktmbuf_free(m); })); + _frags.clear(); + + for (rte_mbuf* m1 = m; m1 != nullptr; m1 = m1->next) { + char* data = rte_pktmbuf_mtod(m1, char*); + _frags.emplace_back(fragment{data, rte_pktmbuf_data_len(m1)}); + } + + return packet(_frags.begin(), _frags.end(), + make_deleter(deleter(), [m] { rte_pktmbuf_free(m); })); } else { - rte_memcpy(buf, rte_pktmbuf_mtod(m, char*), len); + // Copy the contents of the packet into the buffer we've just allocated + size_t offset = 0; + for (rte_mbuf* m1 = m; m1 != nullptr; m1 = m1->next) { + char* data = rte_pktmbuf_mtod(m1, char*); + auto len = rte_pktmbuf_data_len(m1); + + rte_memcpy(buf + offset, data, len); + offset += len; + } + rte_pktmbuf_free(m); - fragment f{buf, len}; - return packet(f, make_free_deleter(buf)); + return packet(fragment{buf, pkt_len}, make_free_deleter(buf)); } } +template<> +inline packet dpdk_qp::from_mbuf(rte_mbuf* m) +{ + if (!_dev->hw_features_ref().rx_lro || rte_pktmbuf_is_contiguous(m)) { + // + // Try to allocate a buffer for packet's data. If we fail - give the + // application an mbuf itself. If we succeed - copy the data into this + // buffer, create a packet based on this buffer and return the mbuf to + // its pool. + // + auto len = rte_pktmbuf_data_len(m); + char* buf = (char*)malloc(len); + if (!buf) { + return packet(fragment{rte_pktmbuf_mtod(m, char*), len}, + make_deleter(deleter(), + [m] { rte_pktmbuf_free(m); })); + } else { + rte_memcpy(buf, rte_pktmbuf_mtod(m, char*), len); + rte_pktmbuf_free(m); + + return packet(fragment{buf, len}, make_free_deleter(buf)); + } + } else { + return from_mbuf_lro(m); + } +} + +template<> +inline packet dpdk_qp::from_mbuf_lro(rte_mbuf* m) +{ + _frags.clear(); + _bufs.clear(); + + for (; m != nullptr; m = m->next) { + char* data = rte_pktmbuf_mtod(m, char*); + + _frags.emplace_back(fragment{data, rte_pktmbuf_data_len(m)}); + _bufs.push_back(data); + } + + return packet(_frags.begin(), _frags.end(), + make_deleter(deleter(), + [bufs_vec = std::move(_bufs)] { + for (auto&& b : bufs_vec) { + free(b); + } + })); +} + template<> inline packet dpdk_qp::from_mbuf(rte_mbuf* m) { - char* data = rte_pktmbuf_mtod(m, char*); - - fragment f{data, rte_pktmbuf_data_len(m)}; - packet p(f, make_free_deleter(data)); - _rx_free_pkts.push_back(m); _num_rx_free_segs += rte_mbuf_nb_segs(m); - return p; + if (!_dev->hw_features_ref().rx_lro || rte_pktmbuf_is_contiguous(m)) { + char* data = rte_pktmbuf_mtod(m, char*); + + return packet(fragment{data, rte_pktmbuf_data_len(m)}, + make_free_deleter(data)); + } else { + return from_mbuf_lro(m); + } } template inline bool dpdk_qp::refill_one_cluster(rte_mbuf* head) { - while (head != NULL) { - struct rte_mbuf *m_next = head->next; + for (; head != nullptr; head = head->next) { if (!refill_rx_mbuf(head)) { // // If we failed to allocate a new buffer - push the rest of the @@ -1463,7 +1555,6 @@ inline bool dpdk_qp::refill_one_cluster(rte_mbuf* head) return false; } _rx_free_bufs.push_back(head); - head = m_next; } return true; @@ -1517,12 +1608,6 @@ void dpdk_qp::process_packets( struct rte_mbuf *m = bufs[i]; offload_info oi; - // TODO: Remove this when implement LRO support - if (!rte_pktmbuf_is_contiguous(m)) { - rte_exit(EXIT_FAILURE, - "DPDK-Rx: Have got a fragmented buffer - not supported\n"); - } - packet p = from_mbuf(m); // Set stipped VLAN value if available @@ -1631,7 +1716,8 @@ std::unique_ptr dpdk_device::init_local_queue(boost::program_options::variab std::unique_ptr create_dpdk_net_device( uint8_t port_idx, - uint8_t num_queues) + uint8_t num_queues, + bool use_lro) { static bool called = false; @@ -1647,7 +1733,7 @@ std::unique_ptr create_dpdk_net_device( printf("ports number: %d\n", rte_eth_dev_count()); } - return std::make_unique(port_idx, num_queues); + return std::make_unique(port_idx, num_queues, use_lro); } boost::program_options::options_description diff --git a/net/dpdk.hh b/net/dpdk.hh index 69cf0dda6d..86a13defbf 100644 --- a/net/dpdk.hh +++ b/net/dpdk.hh @@ -30,7 +30,8 @@ std::unique_ptr create_dpdk_net_device( uint8_t port_idx = 0, - uint8_t num_queues = 1); + uint8_t num_queues = 1, + bool use_lro = true); boost::program_options::options_description get_dpdk_net_options_description(); diff --git a/net/native-stack.cc b/net/native-stack.cc index d36e872b7e..88859cd16a 100644 --- a/net/native-stack.cc +++ b/net/native-stack.cc @@ -85,7 +85,8 @@ void create_native_net_device(boost::program_options::variables_map opts) { if (opts.count("dpdk-pmd")) { // Hardcoded port index 0. // TODO: Inherit it from the opts - dev = create_dpdk_net_device(0, smp::count); + dev = create_dpdk_net_device(0, smp::count, + !(opts.count("lro") && opts["lro"].as() == "off")); } else #endif dev = create_virtio_net_device(opts); @@ -325,6 +326,9 @@ boost::program_options::options_description nns_options() { #ifdef HAVE_DPDK ("dpdk-pmd", "Use DPDK PMD drivers") #endif + ("lro", + boost::program_options::value()->default_value("on"), + "Enable LRO") ; add_native_net_options_description(opts); diff --git a/net/net.hh b/net/net.hh index 96cf20ff7b..bccc73033d 100644 --- a/net/net.hh +++ b/net/net.hh @@ -71,6 +71,8 @@ struct hw_features { bool tx_csum_l4_offload = false; // Enable rx checksum offload bool rx_csum_offload = false; + // LRO is enabled + bool rx_lro = false; // Enable tx TCP segment offload bool tx_tso = false; // Enable tx UDP fragmentation offload diff --git a/net/virtio.cc b/net/virtio.cc index 72185fddd9..3243914759 100644 --- a/net/virtio.cc +++ b/net/virtio.cc @@ -87,11 +87,18 @@ private: } if (!(_opts.count("tso") && _opts["tso"].as() == "off")) { seastar_supported_features |= VIRTIO_NET_F_HOST_TSO4; - seastar_supported_features |= VIRTIO_NET_F_GUEST_TSO4; _hw_features.tx_tso = true; } else { _hw_features.tx_tso = false; } + + if (!(_opts.count("lro") && _opts["lro"].as() == "off")) { + seastar_supported_features |= VIRTIO_NET_F_GUEST_TSO4; + _hw_features.rx_lro = true; + } else { + _hw_features.rx_lro = false; + } + if (!(_opts.count("ufo") && _opts["ufo"].as() == "off")) { seastar_supported_features |= VIRTIO_NET_F_HOST_UFO; seastar_supported_features |= VIRTIO_NET_F_GUEST_UFO;