mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-24 10:30:38 +00:00
Merge branch 'dpdk_lro-v4' of github.com:cloudius-systems/seastar-dev
This commit is contained in:
42
configure.py
42
configure.py
@@ -27,12 +27,6 @@ def get_flags():
|
||||
if line.rstrip('\n').startswith('flags'):
|
||||
return re.sub(r'^flags\s+: ', '', line).split()
|
||||
|
||||
def has_avx():
|
||||
return 'avx' in get_flags()
|
||||
|
||||
def has_avx2():
|
||||
return 'avx2' in get_flags()
|
||||
|
||||
def add_tristate(arg_parser, name, dest, help):
|
||||
arg_parser.add_argument('--enable-' + name, dest = dest, action = 'store_true', default = None,
|
||||
help = 'Enable ' + help)
|
||||
@@ -51,6 +45,35 @@ def apply_tristate(var, test, note, missing):
|
||||
return False
|
||||
return False
|
||||
|
||||
#
|
||||
# dpdk_cflags - fetch the DPDK specific CFLAGS
|
||||
#
|
||||
# Run a simple makefile that "includes" the DPDK main makefile and prints the
|
||||
# MACHINE_CFLAGS value
|
||||
#
|
||||
def dpdk_cflags (dpdk_target):
|
||||
with tempfile.NamedTemporaryFile() as sfile:
|
||||
dpdk_target = os.path.abspath(dpdk_target)
|
||||
dpdk_target = re.sub(r'\/+$', '', dpdk_target)
|
||||
dpdk_sdk_path = os.path.dirname(dpdk_target)
|
||||
dpdk_target_name = os.path.basename(dpdk_target)
|
||||
dpdk_arch = dpdk_target_name.split('-')[0]
|
||||
|
||||
sfile.file.write(bytes('include ' + dpdk_sdk_path + '/mk/rte.vars.mk' + "\n", 'utf-8'))
|
||||
sfile.file.write(bytes('all:' + "\n\t", 'utf-8'))
|
||||
sfile.file.write(bytes('@echo $(MACHINE_CFLAGS)' + "\n", 'utf-8'))
|
||||
sfile.file.flush()
|
||||
|
||||
dpdk_cflags = subprocess.check_output(['make', '-f', sfile.name,
|
||||
'RTE_SDK=' + dpdk_sdk_path,
|
||||
'RTE_TARGET=' + dpdk_target_name,
|
||||
'RTE_ARCH=' + dpdk_arch])
|
||||
dpdk_cflags_str = dpdk_cflags.decode('utf-8')
|
||||
dpdk_cflags_str = re.sub(r'\n+$', '', dpdk_cflags_str)
|
||||
dpdk_cflags_final = ''
|
||||
|
||||
return dpdk_cflags_str
|
||||
|
||||
def try_compile(compiler, source = '', flags = []):
|
||||
with tempfile.NamedTemporaryFile() as sfile:
|
||||
sfile.file.write(bytes(source, 'utf-8'))
|
||||
@@ -292,10 +315,9 @@ if args.with_osv:
|
||||
|
||||
if args.dpdk_target:
|
||||
args.user_cflags = (args.user_cflags +
|
||||
' -DHAVE_DPDK -I' +
|
||||
args.dpdk_target + '/include -Wno-error=literal-suffix -Wno-literal-suffix -Wno-invalid-offsetof -m64' +
|
||||
' -mavx' if has_avx() else '' +
|
||||
' -mavx2' if has_avx2() else '')
|
||||
' -DHAVE_DPDK -I' + args.dpdk_target + '/include ' +
|
||||
dpdk_cflags(args.dpdk_target) +
|
||||
' -Wno-error=literal-suffix -Wno-literal-suffix -Wno-invalid-offsetof')
|
||||
libs += (' -L' + args.dpdk_target + '/lib ' +
|
||||
'-Wl,--whole-archive -lrte_pmd_bond -lrte_pmd_vmxnet3_uio -lrte_pmd_virtio_uio -lrte_pmd_i40e -lrte_pmd_ixgbe -lrte_pmd_e1000 -lrte_pmd_ring -Wl,--no-whole-archive -lrte_distributor -lrte_kni -lrte_pipeline -lrte_table -lrte_port -lrte_timer -lrte_hash -lrte_lpm -lrte_power -lrte_acl -lrte_meter -lrte_sched -lrte_kvargs -lrte_mbuf -lrte_ip_frag -lethdev -lrte_eal -lrte_malloc -lrte_mempool -lrte_ring -lrte_cmdline -lrte_cfgfile -lrt -lm -ldl')
|
||||
|
||||
|
||||
148
net/dpdk.cc
148
net/dpdk.cc
@@ -168,6 +168,7 @@ class dpdk_device : public device {
|
||||
net::hw_features _hw_features;
|
||||
uint8_t _queues_ready = 0;
|
||||
unsigned _home_cpu;
|
||||
bool _use_lro;
|
||||
std::vector<uint8_t> _redir_table;
|
||||
#ifdef RTE_VERSION_1_7
|
||||
struct rte_eth_rxconf _rx_conf_default = {};
|
||||
@@ -214,10 +215,11 @@ private:
|
||||
void check_port_link_status();
|
||||
|
||||
public:
|
||||
dpdk_device(uint8_t port_idx, uint16_t num_queues)
|
||||
dpdk_device(uint8_t port_idx, uint16_t num_queues, bool use_lro)
|
||||
: _port_idx(port_idx)
|
||||
, _num_queues(num_queues)
|
||||
, _home_cpu(engine().cpu_id()) {
|
||||
, _home_cpu(engine().cpu_id())
|
||||
, _use_lro(use_lro) {
|
||||
|
||||
/* now initialise the port we will use */
|
||||
int ret = init_port_start();
|
||||
@@ -235,6 +237,8 @@ public:
|
||||
return _hw_features;
|
||||
}
|
||||
|
||||
net::hw_features& hw_features_ref() { return _hw_features; }
|
||||
|
||||
const rte_eth_rxconf* def_rx_conf() const {
|
||||
#ifdef RTE_VERSION_1_7
|
||||
return &_rx_conf_default;
|
||||
@@ -1043,12 +1047,22 @@ private:
|
||||
*/
|
||||
packet from_mbuf(rte_mbuf* m);
|
||||
|
||||
/**
|
||||
* Transform an LRO rte_mbuf cluster into the "packet" object.
|
||||
* @param m HEAD of the mbufs' cluster to transform
|
||||
*
|
||||
* @return a "packet" object representing the newly received LRO packet.
|
||||
*/
|
||||
packet from_mbuf_lro(rte_mbuf* m);
|
||||
|
||||
private:
|
||||
dpdk_device* _dev;
|
||||
uint8_t _qid;
|
||||
rte_mempool *_pktmbuf_pool_rx;
|
||||
std::vector<rte_mbuf*> _rx_free_pkts;
|
||||
std::vector<rte_mbuf*> _rx_free_bufs;
|
||||
std::vector<fragment> _frags;
|
||||
std::vector<char*> _bufs;
|
||||
size_t _num_rx_free_segs = 0;
|
||||
reactor::poller _rx_gc_poller;
|
||||
std::unique_ptr<void, free_deleter> _rx_xmem;
|
||||
@@ -1133,6 +1147,19 @@ int dpdk_device::init_port_start()
|
||||
port_conf.rxmode.hw_vlan_strip = 1;
|
||||
}
|
||||
|
||||
// Enable HW CRC stripping
|
||||
port_conf.rxmode.hw_strip_crc = 1;
|
||||
|
||||
#ifdef RTE_ETHDEV_HAS_LRO_SUPPORT
|
||||
// Enable LRO
|
||||
if (_use_lro && (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO)) {
|
||||
printf("LRO is on\n");
|
||||
port_conf.rxmode.enable_lro = 1;
|
||||
_hw_features.rx_lro = true;
|
||||
} else
|
||||
#endif
|
||||
printf("LRO is off\n");
|
||||
|
||||
// Check that all CSUM features are either all set all together or not set
|
||||
// all together. If this assumption breaks we need to rework the below logic
|
||||
// by splitting the csum offload feature bit into separate bits for IPv4,
|
||||
@@ -1413,47 +1440,112 @@ void dpdk_qp<HugetlbfsMemBackend>::rx_start() {
|
||||
}
|
||||
|
||||
template<>
|
||||
inline packet dpdk_qp<false>::from_mbuf(rte_mbuf* m)
|
||||
inline packet dpdk_qp<false>::from_mbuf_lro(rte_mbuf* m)
|
||||
{
|
||||
//
|
||||
// Try to allocate a buffer for packet's data. If we fail - give the
|
||||
// application an mbuf itself. If we succeed - copy the data into this
|
||||
// buffer, create a packet based on this buffer and return the mbuf to its
|
||||
// pool.
|
||||
// Try to allocate a buffer for the whole packet's data.
|
||||
// If we fail - construct the packet from mbufs.
|
||||
// If we succeed - copy the data into this buffer, create a packet based on
|
||||
// this buffer and return the mbuf to its pool.
|
||||
//
|
||||
auto len = rte_pktmbuf_data_len(m);
|
||||
char* buf = (char*)malloc(len);
|
||||
auto pkt_len = rte_pktmbuf_pkt_len(m);
|
||||
char* buf = (char*)malloc(pkt_len);
|
||||
if (!buf) {
|
||||
fragment f{rte_pktmbuf_mtod(m, char*), len};
|
||||
return packet(f, make_deleter(deleter(), [m] { rte_pktmbuf_free(m); }));
|
||||
_frags.clear();
|
||||
|
||||
for (rte_mbuf* m1 = m; m1 != nullptr; m1 = m1->next) {
|
||||
char* data = rte_pktmbuf_mtod(m1, char*);
|
||||
_frags.emplace_back(fragment{data, rte_pktmbuf_data_len(m1)});
|
||||
}
|
||||
|
||||
return packet(_frags.begin(), _frags.end(),
|
||||
make_deleter(deleter(), [m] { rte_pktmbuf_free(m); }));
|
||||
} else {
|
||||
rte_memcpy(buf, rte_pktmbuf_mtod(m, char*), len);
|
||||
// Copy the contents of the packet into the buffer we've just allocated
|
||||
size_t offset = 0;
|
||||
for (rte_mbuf* m1 = m; m1 != nullptr; m1 = m1->next) {
|
||||
char* data = rte_pktmbuf_mtod(m1, char*);
|
||||
auto len = rte_pktmbuf_data_len(m1);
|
||||
|
||||
rte_memcpy(buf + offset, data, len);
|
||||
offset += len;
|
||||
}
|
||||
|
||||
rte_pktmbuf_free(m);
|
||||
|
||||
fragment f{buf, len};
|
||||
return packet(f, make_free_deleter(buf));
|
||||
return packet(fragment{buf, pkt_len}, make_free_deleter(buf));
|
||||
}
|
||||
}
|
||||
|
||||
template<>
|
||||
inline packet dpdk_qp<false>::from_mbuf(rte_mbuf* m)
|
||||
{
|
||||
if (!_dev->hw_features_ref().rx_lro || rte_pktmbuf_is_contiguous(m)) {
|
||||
//
|
||||
// Try to allocate a buffer for packet's data. If we fail - give the
|
||||
// application an mbuf itself. If we succeed - copy the data into this
|
||||
// buffer, create a packet based on this buffer and return the mbuf to
|
||||
// its pool.
|
||||
//
|
||||
auto len = rte_pktmbuf_data_len(m);
|
||||
char* buf = (char*)malloc(len);
|
||||
if (!buf) {
|
||||
return packet(fragment{rte_pktmbuf_mtod(m, char*), len},
|
||||
make_deleter(deleter(),
|
||||
[m] { rte_pktmbuf_free(m); }));
|
||||
} else {
|
||||
rte_memcpy(buf, rte_pktmbuf_mtod(m, char*), len);
|
||||
rte_pktmbuf_free(m);
|
||||
|
||||
return packet(fragment{buf, len}, make_free_deleter(buf));
|
||||
}
|
||||
} else {
|
||||
return from_mbuf_lro(m);
|
||||
}
|
||||
}
|
||||
|
||||
template<>
|
||||
inline packet dpdk_qp<true>::from_mbuf_lro(rte_mbuf* m)
|
||||
{
|
||||
_frags.clear();
|
||||
_bufs.clear();
|
||||
|
||||
for (; m != nullptr; m = m->next) {
|
||||
char* data = rte_pktmbuf_mtod(m, char*);
|
||||
|
||||
_frags.emplace_back(fragment{data, rte_pktmbuf_data_len(m)});
|
||||
_bufs.push_back(data);
|
||||
}
|
||||
|
||||
return packet(_frags.begin(), _frags.end(),
|
||||
make_deleter(deleter(),
|
||||
[bufs_vec = std::move(_bufs)] {
|
||||
for (auto&& b : bufs_vec) {
|
||||
free(b);
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
||||
template<>
|
||||
inline packet dpdk_qp<true>::from_mbuf(rte_mbuf* m)
|
||||
{
|
||||
char* data = rte_pktmbuf_mtod(m, char*);
|
||||
|
||||
fragment f{data, rte_pktmbuf_data_len(m)};
|
||||
packet p(f, make_free_deleter(data));
|
||||
|
||||
_rx_free_pkts.push_back(m);
|
||||
_num_rx_free_segs += rte_mbuf_nb_segs(m);
|
||||
|
||||
return p;
|
||||
if (!_dev->hw_features_ref().rx_lro || rte_pktmbuf_is_contiguous(m)) {
|
||||
char* data = rte_pktmbuf_mtod(m, char*);
|
||||
|
||||
return packet(fragment{data, rte_pktmbuf_data_len(m)},
|
||||
make_free_deleter(data));
|
||||
} else {
|
||||
return from_mbuf_lro(m);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool HugetlbfsMemBackend>
|
||||
inline bool dpdk_qp<HugetlbfsMemBackend>::refill_one_cluster(rte_mbuf* head)
|
||||
{
|
||||
while (head != NULL) {
|
||||
struct rte_mbuf *m_next = head->next;
|
||||
for (; head != nullptr; head = head->next) {
|
||||
if (!refill_rx_mbuf(head)) {
|
||||
//
|
||||
// If we failed to allocate a new buffer - push the rest of the
|
||||
@@ -1463,7 +1555,6 @@ inline bool dpdk_qp<HugetlbfsMemBackend>::refill_one_cluster(rte_mbuf* head)
|
||||
return false;
|
||||
}
|
||||
_rx_free_bufs.push_back(head);
|
||||
head = m_next;
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -1517,12 +1608,6 @@ void dpdk_qp<HugetlbfsMemBackend>::process_packets(
|
||||
struct rte_mbuf *m = bufs[i];
|
||||
offload_info oi;
|
||||
|
||||
// TODO: Remove this when implement LRO support
|
||||
if (!rte_pktmbuf_is_contiguous(m)) {
|
||||
rte_exit(EXIT_FAILURE,
|
||||
"DPDK-Rx: Have got a fragmented buffer - not supported\n");
|
||||
}
|
||||
|
||||
packet p = from_mbuf(m);
|
||||
|
||||
// Set stipped VLAN value if available
|
||||
@@ -1631,7 +1716,8 @@ std::unique_ptr<qp> dpdk_device::init_local_queue(boost::program_options::variab
|
||||
|
||||
std::unique_ptr<net::device> create_dpdk_net_device(
|
||||
uint8_t port_idx,
|
||||
uint8_t num_queues)
|
||||
uint8_t num_queues,
|
||||
bool use_lro)
|
||||
{
|
||||
static bool called = false;
|
||||
|
||||
@@ -1647,7 +1733,7 @@ std::unique_ptr<net::device> create_dpdk_net_device(
|
||||
printf("ports number: %d\n", rte_eth_dev_count());
|
||||
}
|
||||
|
||||
return std::make_unique<dpdk::dpdk_device>(port_idx, num_queues);
|
||||
return std::make_unique<dpdk::dpdk_device>(port_idx, num_queues, use_lro);
|
||||
}
|
||||
|
||||
boost::program_options::options_description
|
||||
|
||||
@@ -30,7 +30,8 @@
|
||||
|
||||
std::unique_ptr<net::device> create_dpdk_net_device(
|
||||
uint8_t port_idx = 0,
|
||||
uint8_t num_queues = 1);
|
||||
uint8_t num_queues = 1,
|
||||
bool use_lro = true);
|
||||
|
||||
boost::program_options::options_description get_dpdk_net_options_description();
|
||||
|
||||
|
||||
@@ -85,7 +85,8 @@ void create_native_net_device(boost::program_options::variables_map opts) {
|
||||
if (opts.count("dpdk-pmd")) {
|
||||
// Hardcoded port index 0.
|
||||
// TODO: Inherit it from the opts
|
||||
dev = create_dpdk_net_device(0, smp::count);
|
||||
dev = create_dpdk_net_device(0, smp::count,
|
||||
!(opts.count("lro") && opts["lro"].as<std::string>() == "off"));
|
||||
} else
|
||||
#endif
|
||||
dev = create_virtio_net_device(opts);
|
||||
@@ -325,6 +326,9 @@ boost::program_options::options_description nns_options() {
|
||||
#ifdef HAVE_DPDK
|
||||
("dpdk-pmd", "Use DPDK PMD drivers")
|
||||
#endif
|
||||
("lro",
|
||||
boost::program_options::value<std::string>()->default_value("on"),
|
||||
"Enable LRO")
|
||||
;
|
||||
|
||||
add_native_net_options_description(opts);
|
||||
|
||||
@@ -71,6 +71,8 @@ struct hw_features {
|
||||
bool tx_csum_l4_offload = false;
|
||||
// Enable rx checksum offload
|
||||
bool rx_csum_offload = false;
|
||||
// LRO is enabled
|
||||
bool rx_lro = false;
|
||||
// Enable tx TCP segment offload
|
||||
bool tx_tso = false;
|
||||
// Enable tx UDP fragmentation offload
|
||||
|
||||
@@ -87,11 +87,18 @@ private:
|
||||
}
|
||||
if (!(_opts.count("tso") && _opts["tso"].as<std::string>() == "off")) {
|
||||
seastar_supported_features |= VIRTIO_NET_F_HOST_TSO4;
|
||||
seastar_supported_features |= VIRTIO_NET_F_GUEST_TSO4;
|
||||
_hw_features.tx_tso = true;
|
||||
} else {
|
||||
_hw_features.tx_tso = false;
|
||||
}
|
||||
|
||||
if (!(_opts.count("lro") && _opts["lro"].as<std::string>() == "off")) {
|
||||
seastar_supported_features |= VIRTIO_NET_F_GUEST_TSO4;
|
||||
_hw_features.rx_lro = true;
|
||||
} else {
|
||||
_hw_features.rx_lro = false;
|
||||
}
|
||||
|
||||
if (!(_opts.count("ufo") && _opts["ufo"].as<std::string>() == "off")) {
|
||||
seastar_supported_features |= VIRTIO_NET_F_HOST_UFO;
|
||||
seastar_supported_features |= VIRTIO_NET_F_GUEST_UFO;
|
||||
|
||||
Reference in New Issue
Block a user