Files
scylladb/net/tcp.hh
Tomasz Grabiec 83963b23d3 Replace rescue() usages with then_wrapped()
They are pretty much the same. This change removes rescue().
2015-03-04 17:34:59 +01:00

1910 lines
66 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* This file is open source software, licensed to you under the terms
* of the Apache License, Version 2.0 (the "License"). See the NOTICE file
* distributed with this work for additional information regarding copyright
* ownership. You may not use this file except in compliance with the License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (C) 2014 Cloudius Systems, Ltd.
*/
#ifndef TCP_HH_
#define TCP_HH_
#include "core/shared_ptr.hh"
#include "core/queue.hh"
#include "core/semaphore.hh"
#include "core/print.hh"
#include "net.hh"
#include "ip_checksum.hh"
#include "ip.hh"
#include "const.hh"
#include "packet-util.hh"
#include <unordered_map>
#include <map>
#include <functional>
#include <deque>
#include <chrono>
#include <experimental/optional>
#include <random>
#include <stdexcept>
#define CRYPTOPP_ENABLE_NAMESPACE_WEAK 1
#include <cryptopp/md5.h>
using namespace std::chrono_literals;
namespace net {
class tcp_hdr;
/// Root of the TCP exception hierarchy; carries a human-readable message.
class tcp_error : public std::runtime_error {
public:
    tcp_error(const std::string& msg)
        : std::runtime_error(msg) {
    }
};

/// Reported to waiters when the connection is reset.
class tcp_reset_error : public tcp_error {
public:
    tcp_reset_error()
        : tcp_error("connection is reset") {
    }
};

/// Error whose message is "fail to connect".
class tcp_connect_error : public tcp_error {
public:
    tcp_connect_error()
        : tcp_error("fail to connect") {
    }
};

/// Reported when a connection attempt is answered with a reset.
class tcp_refused_error : public tcp_error {
public:
    tcp_refused_error()
        : tcp_error("connection refused") {
    }
};
/// TCP connection states. Every state owns a distinct bit so that several
/// states can be combined with operator| and tested with a single mask.
enum class tcp_state : uint16_t {
    CLOSED       = 0x0001,
    LISTEN       = 0x0002,
    SYN_SENT     = 0x0004,
    SYN_RECEIVED = 0x0008,
    ESTABLISHED  = 0x0010,
    FIN_WAIT_1   = 0x0020,
    FIN_WAIT_2   = 0x0040,
    CLOSE_WAIT   = 0x0080,
    CLOSING      = 0x0100,
    LAST_ACK     = 0x0200,
    TIME_WAIT    = 0x0400
};

/// Builds a mask containing the bits of both operands.
inline tcp_state operator|(tcp_state s1, tcp_state s2) {
    const auto lhs = static_cast<uint16_t>(s1);
    const auto rhs = static_cast<uint16_t>(s2);
    return static_cast<tcp_state>(lhs | rhs);
}
/// Debug trace helper: forwards the format string and arguments to print()
/// when TCP_DEBUG evaluates to non-zero, and compiles to nothing otherwise.
template <typename... Args>
void tcp_debug(const char* fmt, Args&&... args)
{
#if TCP_DEBUG
    print(fmt, std::forward<Args>(args)...);
#endif
}
// TCP option handling: on-the-wire layouts of the supported options plus the
// negotiation results kept for one connection.
struct tcp_option {
// The kind and len field are fixed and defined in TCP protocol
enum class option_kind: uint8_t { mss = 2, win_scale = 3, sack = 4, timestamps = 8, nop = 1, eol = 0 };
enum class option_len: uint8_t { mss = 4, win_scale = 3, sack = 2, timestamps = 10, nop = 1, eol = 1 };
// Maximum segment size option (kind, len, 16-bit value).
struct mss {
option_kind kind = option_kind::mss;
option_len len = option_len::mss;
packed<uint16_t> mss;
template <typename Adjuster>
void adjust_endianness(Adjuster a) { a(mss); }
} __attribute__((packed));
// Window scale option (kind, len, shift count).
struct win_scale {
option_kind kind = option_kind::win_scale;
option_len len = option_len::win_scale;
uint8_t shift;
} __attribute__((packed));
// SACK option header (kind and len only, no payload).
struct sack {
option_kind kind = option_kind::sack;
option_len len = option_len::sack;
} __attribute__((packed));
// Timestamps option (kind, len, two 32-bit values).
struct timestamps {
option_kind kind = option_kind::timestamps;
option_len len = option_len::timestamps;
packed<uint32_t> t1;
packed<uint32_t> t2;
template <typename Adjuster>
void adjust_endianness(Adjuster a) { a(t1, t2); }
} __attribute__((packed));
// Single-byte no-operation option.
struct nop {
option_kind kind = option_kind::nop;
} __attribute__((packed));
// Single-byte end-of-option-list marker.
struct eol {
option_kind kind = option_kind::eol;
} __attribute__((packed));
// Alignment unit, in bytes.
// NOTE(review): presumably the option area is padded to a multiple of this; fill()/get_size() are not visible here.
static const uint8_t align = 4;
// Parses the option bytes in [beg, end); defined elsewhere.
void parse(uint8_t* beg, uint8_t* end);
// Declared here, defined elsewhere.
// NOTE(review): presumably writes option_size bytes of options after th — confirm against the definition.
uint8_t fill(tcp_hdr* th, uint8_t option_size);
// Declared here, defined elsewhere; takes the SYN and ACK flags of the segment being built.
uint8_t get_size(bool syn_on, bool ack_on);
// For option negotiation
bool _mss_received = false;
bool _win_scale_received = false;
bool _timestamps_received = false;
bool _sack_received = false;
// Option data
// Remote MSS; defaults to 536.
uint16_t _remote_mss = 536;
// Local MSS; has no default initializer and is assigned in tcb::init_from_options().
uint16_t _local_mss;
uint8_t _remote_win_scale = 0;
uint8_t _local_win_scale = 0;
};
/// Advances a byte pointer by the numeric value of an option length.
inline uint8_t*& operator+=(uint8_t*& x, tcp_option::option_len len) {
    x += static_cast<uint8_t>(len);
    return x;
}

/// Adds the numeric value of an option length to a byte-sized counter.
inline uint8_t& operator+=(uint8_t& x, tcp_option::option_len len) {
    x += static_cast<uint8_t>(len);
    return x;
}
// TCP sequence number. A distinct type (rather than a bare uint32_t) so that
// the arithmetic and comparison operators defined for tcp_seq are selected
// instead of the built-in integer operators.
struct tcp_seq {
// The 32-bit sequence number value.
uint32_t raw;
};
/// Converts a sequence number from network to host byte order.
inline tcp_seq ntoh(tcp_seq s) {
    s.raw = ntoh(s.raw);
    return s;
}

/// Converts a sequence number from host to network byte order.
inline tcp_seq hton(tcp_seq s) {
    s.raw = hton(s.raw);
    return s;
}
/// Streams the raw 32-bit value of a sequence number.
inline std::ostream& operator<<(std::ostream& os, tcp_seq s) {
    os << s.raw;
    return os;
}
/// Wraps a raw 32-bit value into a tcp_seq.
inline tcp_seq make_seq(uint32_t raw) {
    return tcp_seq{raw};
}

// Offset arithmetic; the addition/subtraction is done on the unsigned raw
// value, so it wraps modulo 2^32.
inline tcp_seq& operator+=(tcp_seq& s, int32_t n) {
    s.raw += n;
    return s;
}
inline tcp_seq& operator-=(tcp_seq& s, int32_t n) {
    s.raw -= n;
    return s;
}
inline tcp_seq operator+(tcp_seq s, int32_t n) {
    s += n;
    return s;
}
inline tcp_seq operator-(tcp_seq s, int32_t n) {
    s -= n;
    return s;
}

/// Signed distance from q to s: the unsigned difference reinterpreted as int32_t.
inline int32_t operator-(tcp_seq s, tcp_seq q) {
    return static_cast<int32_t>(s.raw - q.raw);
}

// Equality compares raw values; ordering is defined through the signed
// distance above, so it stays meaningful across wrap-around.
inline bool operator==(tcp_seq s, tcp_seq q) {
    return s.raw == q.raw;
}
inline bool operator!=(tcp_seq s, tcp_seq q) {
    return s.raw != q.raw;
}
inline bool operator<(tcp_seq s, tcp_seq q) {
    return (s - q) < 0;
}
inline bool operator>(tcp_seq s, tcp_seq q) {
    return q < s;
}
inline bool operator<=(tcp_seq s, tcp_seq q) {
    return !(q < s);
}
inline bool operator>=(tcp_seq s, tcp_seq q) {
    return !(s < q);
}
// Fixed part of the TCP header as laid out on the wire (packed, no padding).
// Multi-byte fields are byte-swapped via adjust_endianness(); the single-byte
// bit-fields are not.
// NOTE(review): the bit-field declaration order (rsvd1 before data_offset,
// f_fin first) presumably relies on the compiler allocating bit-fields from
// the least significant bit — confirm for every supported target.
struct tcp_hdr {
packed<uint16_t> src_port;
packed<uint16_t> dst_port;
packed<tcp_seq> seq;
packed<tcp_seq> ack;
uint8_t rsvd1 : 4;
// Header length in 32-bit words (callers multiply it by 4 to get bytes).
uint8_t data_offset : 4;
uint8_t f_fin : 1;
uint8_t f_syn : 1;
uint8_t f_rst : 1;
uint8_t f_psh : 1;
uint8_t f_ack : 1;
uint8_t f_urg : 1;
uint8_t rsvd2 : 2;
packed<uint16_t> window;
packed<uint16_t> checksum;
packed<uint16_t> urgent;
// Applies the adjuster to every multi-byte field.
template <typename Adjuster>
void adjust_endianness(Adjuster a) { a(src_port, dst_port, seq, ack, window, checksum, urgent); }
} __attribute__((packed));
template <typename InetTraits>
class tcp {
public:
using ipaddr = typename InetTraits::address_type;
using inet_type = typename InetTraits::inet_type;
using connid = l4connid<InetTraits>;
using connid_hash = typename connid::connid_hash;
class connection;
class listener;
private:
class tcb;
class tcb : public enable_lw_shared_from_this<tcb> {
using clock_type = lowres_clock;
static constexpr const tcp_state CLOSED = tcp_state::CLOSED;
static constexpr const tcp_state LISTEN = tcp_state::LISTEN;
static constexpr const tcp_state SYN_SENT = tcp_state::SYN_SENT;
static constexpr const tcp_state SYN_RECEIVED = tcp_state::SYN_RECEIVED;
static constexpr const tcp_state ESTABLISHED = tcp_state::ESTABLISHED;
static constexpr const tcp_state FIN_WAIT_1 = tcp_state::FIN_WAIT_1;
static constexpr const tcp_state FIN_WAIT_2 = tcp_state::FIN_WAIT_2;
static constexpr const tcp_state CLOSE_WAIT = tcp_state::CLOSE_WAIT;
static constexpr const tcp_state CLOSING = tcp_state::CLOSING;
static constexpr const tcp_state LAST_ACK = tcp_state::LAST_ACK;
static constexpr const tcp_state TIME_WAIT = tcp_state::TIME_WAIT;
tcp_state _state = CLOSED;
tcp& _tcp;
connection* _conn = nullptr;
promise<> _connect_done;
ipaddr _local_ip;
ipaddr _foreign_ip;
uint16_t _local_port;
uint16_t _foreign_port;
// One entry of the retransmission queue (_snd.data).
struct unacked_segment {
// The segment payload; its length is what data_segment_acked() and flight_size() count.
packet p;
// Amount returned to _snd.user_queue_space once the segment is fully acked.
uint16_t data_len;
// Retransmission counter; only segments with a count of 0 are used as RTT samples.
unsigned nr_transmits;
// Timestamp passed to update_rto() when the segment is acked.
clock_type::time_point tx_time;
};
// Send-side state of the connection (the SND.* variables of the TCP
// specification plus queues, RTT estimation and congestion control state).
// NOTE(review): several members (window, window_scale, mss, cwnd, ssthresh,
// srtt, rttvar and the tcp_seq fields) have no default initializer; they are
// assigned in do_setup_isn()/init_from_options() or elsewhere.
struct send {
// SND.UNA: oldest sequence number not yet acknowledged.
tcp_seq unacknowledged;
// SND.NXT: next sequence number to be sent.
tcp_seq next;
// SND.WND: send window, already scaled by window_scale.
uint32_t window;
uint8_t window_scale;
uint16_t mss;
tcp_seq urgent;
// SND.WL1 / SND.WL2: segment sequence / acknowledgment number used for the last window update.
tcp_seq wl1;
tcp_seq wl2;
// ISS: initial send sequence number.
tcp_seq initial;
// Segments that were sent and are waiting to be acknowledged.
std::deque<unacked_segment> data;
// Packets queued but not yet sent.
std::deque<packet> unsent;
uint32_t unsent_len = 0;
uint32_t queued_len = 0;
bool closed = false;
promise<> _window_opened;
// Fulfilled once all data has been acked
std::experimental::optional<promise<>> _all_data_acked_promise;
// Limit number of data queued into send queue
semaphore user_queue_space = {212992};
// Round-trip time variation
std::chrono::milliseconds rttvar;
// Smoothed round-trip time
std::chrono::milliseconds srtt;
bool first_rto_sample = true;
// Time the SYN was sent; used as the first RTT sample in do_established().
clock_type::time_point syn_tx_time;
// Congestion window
uint32_t cwnd;
// Slow start threshold
uint32_t ssthresh;
// Duplicated ACKs
uint16_t dupacks = 0;
unsigned syn_retransmit = 0;
unsigned fin_retransmit = 0;
// Bytes that can_send() allowed while one or two duplicate ACKs were outstanding.
uint32_t limited_transfer = 0;
uint32_t partial_ack = 0;
tcp_seq recover;
// While set, can_send() allows exactly one byte.
bool window_probe = false;
} _snd;
// Receive-side state of the connection (the RCV.* variables of the TCP
// specification plus the receive queues).
// NOTE(review): window, window_scale, mss and the tcp_seq fields have no
// default initializer; they are assigned during the handshake.
struct receive {
// RCV.NXT: next sequence number expected from the peer.
tcp_seq next;
// RCV.WND: receive window.
uint32_t window;
uint8_t window_scale;
uint16_t mss;
tcp_seq urgent;
// IRS: initial receive sequence number.
tcp_seq initial;
// Received packets.
// NOTE(review): presumably the in-order data handed out by read() — confirm.
std::deque<packet> data;
// Segments that arrived ahead of RCV.NXT.
packet_merger<tcp_seq> out_of_order;
// Engaged while a waiter exists; completed by signal_data_received() or failed by do_reset().
std::experimental::optional<promise<>> _data_received_promise;
} _rcv;
tcp_option _option;
timer<lowres_clock> _delayed_ack;
// Retransmission timeout
std::chrono::milliseconds _rto{1000};
std::chrono::milliseconds _persist_time_out{1000};
static constexpr std::chrono::milliseconds _rto_min{1000};
static constexpr std::chrono::milliseconds _rto_max{60000};
// Clock granularity
static constexpr std::chrono::milliseconds _rto_clk_granularity{1};
static constexpr uint16_t _max_nr_retransmit{5};
timer<lowres_clock> _retransmit;
timer<lowres_clock> _persist;
uint16_t _nr_full_seg_received = 0;
// Random secret used for generating initial sequence numbers.
struct isn_secret {
    // 512-bit secret key (16 x 32 bits)
    uint32_t key[16];

    // Fills the key from a default_random_engine seeded once from random_device.
    isn_secret() {
        std::random_device seed_source;
        std::default_random_engine gen(seed_source());
        std::uniform_int_distribution<uint32_t> pick{};
        const size_t words = sizeof(key) / sizeof(key[0]);
        for (size_t i = 0; i < words; ++i) {
            key[i] = pick(gen);
        }
    }
};
static isn_secret _isn_secret;
tcp_seq get_isn();
circular_buffer<typename InetTraits::l4packet> _packetq;
bool _poll_active = false;
public:
tcb(tcp& t, connid id);
void input_handle_listen_state(tcp_hdr* th, packet p);
void input_handle_syn_sent_state(tcp_hdr* th, packet p);
void input_handle_other_state(tcp_hdr* th, packet p);
void output_one(bool data_retransmit = false);
future<> wait_for_data();
future<> wait_for_all_data_acked();
future<> send(packet p);
void connect();
packet read();
void close();
// Erases this connection's entry from the owning tcp's connection table.
void remove_from_tcbs() {
    _tcp._tcbs.erase(connid{_local_ip, _foreign_ip, _local_port, _foreign_port});
}
std::experimental::optional<typename InetTraits::l4packet> get_packet();
// Asks the tcp layer to poll this tcb for an outgoing packet, unless a poll
// request is already pending (_poll_active).
void output() {
if (!_poll_active) {
_poll_active = true;
// poll_tcb() resolves the L2 address of the peer first; the continuation
// below only inspects the outcome of that step.
_tcp.poll_tcb(_foreign_ip, this->shared_from_this()).then_wrapped([this] (auto&& f) {
try {
f.get();
} catch(arp_queue_full_error& ex) {
// retry later
_poll_active = false;
this->start_retransmit_timer();
} catch(arp_timeout_error& ex) {
// NOTE(review): _poll_active is not cleared on this path — confirm that
// this is intended for states other than SYN_SENT.
if (this->in_state(SYN_SENT)) {
_connect_done.set_exception(ex);
this->cleanup();
}
// in other states connection should time out
}
});
}
}
// Future tied to _connect_done: it is completed by do_established() and
// failed when the connection attempt is refused or address resolution times out.
future<> connect_done()
{
    return _connect_done.get_future();
}

// Mutable access to the current connection state.
tcp_state& state()
{
    return _state;
}
private:
void respond_with_reset(tcp_hdr* th);
bool merge_out_of_order();
void insert_out_of_order(tcp_seq seq, packet p);
void trim_receive_data_after_window();
bool should_send_ack(uint16_t seg_len);
void clear_delayed_ack();
packet get_transmit_packet();
// Emits one segment with the data_retransmit flag set.
void retransmit_one() {
    output_one(/* data_retransmit = */ true);
}
// Retransmission timer: (re)armed to fire _rto after the given time point
// (or after "now" when none is given).
void start_retransmit_timer() {
    start_retransmit_timer(clock_type::now());
}
void start_retransmit_timer(clock_type::time_point now) {
    _retransmit.rearm(now + _rto);
}
void stop_retransmit_timer() {
    _retransmit.cancel();
}

// Persist timer: same scheme, using _persist_time_out as the delay.
void start_persist_timer() {
    start_persist_timer(clock_type::now());
}
void start_persist_timer(clock_type::time_point now) {
    _persist.rearm(now + _persist_time_out);
}
void stop_persist_timer() {
    _persist.cancel();
}
void persist();
void retransmit();
void fast_retransmit();
void update_rto(clock_type::time_point tx_time);
void update_cwnd(uint32_t acked_bytes);
void cleanup();
// Returns how many bytes of unsent data may be transmitted right now.
//
// The result is limited by the peer's advertised window, the amount of
// unsent data and the congestion window, and is further adjusted while
// duplicate ACKs are outstanding. During a window probe exactly one byte is
// allowed.
//
// Side effect: while one or two duplicate ACKs are outstanding, the returned
// amount is added to _snd.limited_transfer.
uint32_t can_send() {
    if (_snd.window_probe) {
        return 1;
    }
    // Usable window = SND.UNA + SND.WND - SND.NXT. It becomes negative when
    // the peer shrinks (or closes) its window while data is still in flight;
    // in that case nothing may be sent. Casting the negative value straight
    // to uint32_t would instead yield a huge window.
    int32_t usable = _snd.unacknowledged + _snd.window - _snd.next;
    if (usable < 0) {
        usable = 0;
    }
    // Can not send more than advertised window allows
    auto x = std::min(uint32_t(usable), _snd.unsent_len);
    // Can not send more than congestion window allows
    x = std::min(_snd.cwnd, x);
    if (_snd.dupacks == 1 || _snd.dupacks == 2) {
        // RFC5681 Step 3.1
        // Send cwnd + 2 * smss per RFC3042
        auto flight = flight_size();
        auto max = _snd.cwnd + 2 * _snd.mss;
        x = flight <= max ? std::min(x, max - flight) : 0;
        _snd.limited_transfer += x;
    } else if (_snd.dupacks >= 3) {
        // RFC5681 Step 3.5
        // Sent 1 full-sized segment at most
        x = std::min(uint32_t(_snd.mss), x);
    }
    return x;
}
uint32_t flight_size() {
uint32_t size = 0;
std::for_each(_snd.data.begin(), _snd.data.end(), [&] (unacked_segment& seg) { size += seg.p.len(); });
return size;
}
// Local MSS: the interface MTU minus the minimal TCP and IP header sizes.
uint16_t local_mss() {
    const auto payload = _tcp.hw_features().mtu - net::tcp_hdr_len_min - InetTraits::ip_hdr_len_min;
    return payload;
}

// Appends a packet addressed to the peer to this tcb's outgoing queue.
void queue_packet(packet p) {
    _packetq.emplace_back(typename InetTraits::l4packet{_foreign_ip, std::move(p)});
}
// Completes and disengages the pending "data received" promise, if any.
void signal_data_received() {
    auto& waiter = _rcv._data_received_promise;
    if (!waiter) {
        return;
    }
    waiter->set_value();
    waiter = {};
}

// Completes and disengages the pending "all data acked" promise, but only
// when nothing is left unsent or queued.
void signal_all_data_acked() {
    auto& waiter = _snd._all_data_acked_promise;
    if (!waiter) {
        return;
    }
    if (_snd.unsent_len != 0 || _snd.queued_len != 0) {
        return;
    }
    waiter->set_value();
    waiter = {};
}
// Enters SYN_SENT, records when the SYN was sent and schedules output
// so that <SYN> goes to the remote.
void do_syn_sent()
{
    _state = SYN_SENT;
    _snd.syn_tx_time = clock_type::now();
    output();
}

// Enters SYN_RECEIVED, records when the SYN was sent and schedules output
// so that <SYN,ACK> goes to the remote.
void do_syn_received()
{
    _state = SYN_RECEIVED;
    _snd.syn_tx_time = clock_type::now();
    output();
}

// Enters ESTABLISHED, feeds the SYN transmit time to update_rto() and
// completes the connect promise.
void do_established()
{
    _state = ESTABLISHED;
    update_rto(_snd.syn_tx_time);
    _connect_done.set_value();
}
// Handles a connection reset: enters CLOSED, fails everybody waiting on the
// send queue space, runs cleanup() and fails the pending "data received" and
// "all data acked" waiters with tcp_reset_error.
//
// Each promise is disengaged after it has been failed, exactly like
// signal_data_received()/signal_all_data_acked() do after set_value(), so
// that a later signal cannot complete the same promise a second time.
void do_reset() {
    _state = CLOSED;
    // Free packets to be sent which are waiting for _snd.user_queue_space
    _snd.user_queue_space.broken(tcp_reset_error());
    cleanup();
    if (_rcv._data_received_promise) {
        _rcv._data_received_promise->set_exception(tcp_reset_error());
        _rcv._data_received_promise = {};
    }
    if (_snd._all_data_acked_promise) {
        _snd._all_data_acked_promise->set_exception(tcp_reset_error());
        _snd._all_data_acked_promise = {};
    }
}
// Enters TIME_WAIT and cleans up immediately.
// FIXME: Implement TIME_WAIT state timer
void do_time_wait()
{
    _state = TIME_WAIT;
    cleanup();
}

// Enters CLOSED and cleans up.
void do_closed()
{
    _state = CLOSED;
    cleanup();
}
void do_setup_isn() {
_snd.initial = get_isn();
_snd.unacknowledged = _snd.initial;
_snd.next = _snd.initial + 1;
_snd.recover = _snd.initial;
}
void do_local_fin_acked() {
_snd.unacknowledged += 1;
_snd.next += 1;
}
// True in SYN_SENT and SYN_RECEIVED.
bool syn_needs_on() {
    return in_state(SYN_SENT | SYN_RECEIVED);
}

// True in FIN_WAIT_1, CLOSING or LAST_ACK once the send side is closed and
// nothing is left unsent or queued.
bool fin_needs_on() {
    if (!in_state(FIN_WAIT_1 | CLOSING | LAST_ACK)) {
        return false;
    }
    return _snd.closed && _snd.unsent_len == 0 && _snd.queued_len == 0;
}

// True in every state except CLOSED, LISTEN and SYN_SENT.
bool ack_needs_on() {
    return !in_state(CLOSED | LISTEN | SYN_SENT);
}

// True in CLOSING, TIME_WAIT, CLOSE_WAIT, LAST_ACK and CLOSED.
bool foreign_will_not_send() {
    return in_state(CLOSING | TIME_WAIT | CLOSE_WAIT | LAST_ACK | CLOSED);
}

// Tests whether the current state is one of the states in the given mask.
bool in_state(tcp_state state) {
    const uint16_t current = uint16_t(_state);
    const uint16_t mask = uint16_t(state);
    return (current & mask) != 0;
}
// Clears the duplicate-ACK, limited-transfer and partial-ACK counters.
void exit_fast_recovery()
{
    _snd.dupacks = 0;
    _snd.limited_transfer = 0;
    _snd.partial_ack = 0;
}
uint32_t data_segment_acked(tcp_seq seg_ack);
bool segment_acceptable(tcp_seq seg_seq, unsigned seg_len);
void init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end);
friend class connection;
};
inet_type& _inet;
std::unordered_map<connid, lw_shared_ptr<tcb>, connid_hash> _tcbs;
std::unordered_map<uint16_t, listener*> _listening;
std::random_device _rd;
std::default_random_engine _e;
std::uniform_int_distribution<uint16_t> _port_dist{41952, 65535};
circular_buffer<std::pair<lw_shared_ptr<tcb>, ethernet_address>> _poll_tcbs;
// queue for packets that do not belong to any tcb
circular_buffer<ipv4_traits::l4packet> _packetq;
semaphore _queue_space = {212992};
public:
// User-facing handle of one TCP connection.
//
// The handle shares ownership of the tcb and registers itself as the tcb's
// back pointer (_conn). It is movable but not copyable; a moved-from handle
// holds no tcb and its destructor does nothing.
class connection {
    lw_shared_ptr<tcb> _tcb;
public:
    // Takes (shared) ownership of tcbp, which is expected to be non-null.
    explicit connection(lw_shared_ptr<tcb> tcbp) : _tcb(std::move(tcbp)) { _tcb->_conn = this; }
    connection(const connection&) = delete;
    // Steals the tcb from x and re-points the tcb's back pointer to this
    // handle. x may itself be moved-from (no tcb); then there is nothing to
    // re-point and dereferencing the empty pointer must be avoided.
    connection(connection&& x) noexcept : _tcb(std::move(x._tcb)) {
        if (_tcb) {
            _tcb->_conn = this;
        }
    }
    ~connection();
    void operator=(const connection&) = delete;
    // Destroys the current state and move-constructs from x in place; safe
    // because the move constructor is noexcept.
    connection& operator=(connection&& x) {
        if (this != &x) {
            this->~connection();
            new (this) connection(std::move(x));
        }
        return *this;
    }
    // Queues p for transmission on this connection.
    future<> send(packet p) {
        return _tcb->send(std::move(p));
    }
    // Forwards to tcb::wait_for_data().
    future<> wait_for_data() {
        return _tcb->wait_for_data();
    }
    // Forwards to tcb::read().
    packet read() {
        return _tcb->read();
    }
    void close_read();
    void close_write();
};
// Listening endpoint for one local port. While alive (and not moved-from) it
// is registered in tcp::_listening; accepted connections are handed over
// through a bounded queue.
class listener {
    tcp& _tcp;
    uint16_t _port;
    queue<connection> _q;
private:
    // Only tcp (a friend) creates listeners, via tcp::listen().
    listener(tcp& t, uint16_t port, size_t queue_length)
        : _tcp(t)
        , _port(port)
        , _q(queue_length)
    {
        _tcp._listening.emplace(_port, this);
    }
public:
    // Takes over the registration of x; x is left with port 0 so that its
    // destructor does not unregister the port.
    listener(listener&& x)
        : _tcp(x._tcp)
        , _port(x._port)
        , _q(std::move(x._q))
    {
        _tcp._listening[_port] = this;
        x._port = 0;
    }
    // Unregisters the port unless this object was moved from.
    ~listener() {
        if (_port == 0) {
            return;
        }
        _tcp._listening.erase(_port);
    }
    // Resolves with the next queued connection once the queue is not empty.
    future<connection> accept() {
        return _q.not_empty().then([this] {
            return make_ready_future<connection>(_q.pop());
        });
    }
    friend class tcp;
};
public:
explicit tcp(inet_type& inet);
void received(packet p, ipaddr from, ipaddr to);
bool forward(forward_hash& out_hash_data, packet& p, size_t off);
listener listen(uint16_t port, size_t queue_length = 100);
future<connection> connect(socket_address sa);
const net::hw_features& hw_features() const { return _inet._inet.hw_features(); }
future<> poll_tcb(ipaddr to, lw_shared_ptr<tcb> tcb);
private:
void send_packet_without_tcb(ipaddr from, ipaddr to, packet p);
void respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip);
friend class listener;
};
// Constructs the TCP layer on top of inet and registers the packet provider
// that the lower layer calls to obtain the next outgoing TCP packet.
template <typename InetTraits>
tcp<InetTraits>::tcp(inet_type& inet) : _inet(inet), _e(_rd()) {
// tcb_polled counts how many tcbs were polled so far; it lives in the
// (mutable) lambda and persists across invocations.
_inet.register_packet_provider([this, tcb_polled = 0u] () mutable {
std::experimental::optional<typename InetTraits::l4packet> l4p;
auto c = _poll_tcbs.size();
// Packets that belong to no tcb are served when no tcb is waiting to be
// polled, or whenever the poll counter is a multiple of 128.
if (!_packetq.empty() && (!(tcb_polled % 128) || c == 0)) {
l4p = std::move(_packetq.front());
_packetq.pop_front();
// Give the queue space taken in send_packet_without_tcb() back.
_queue_space.signal(l4p.value().p.len());
} else {
// Poll the waiting tcbs in FIFO order until one produces a packet; every
// polled tcb is removed from the poll queue whether or not it had one.
while (c--) {
tcb_polled++;
lw_shared_ptr<tcb> tcb;
ethernet_address dst;
std::tie(tcb, dst) = std::move(_poll_tcbs.front());
_poll_tcbs.pop_front();
l4p = tcb->get_packet();
if (l4p) {
l4p.value().e_dst = dst;
break;
}
}
}
return l4p;
});
}
// Resolves the L2 address for `to` and, once it is known, appends the tcb
// together with that address to the poll queue. A failure of the address
// lookup is propagated through the returned future.
template <typename InetTraits>
future<> tcp<InetTraits>::poll_tcb(ipaddr to, lw_shared_ptr<tcb> tcb) {
    auto resolved = _inet.get_l2_dst_address(to);
    return resolved.then([this, tcb = std::move(tcb)] (ethernet_address dst) {
        _poll_tcbs.emplace_back(std::move(tcb), dst);
    });
}
template <typename InetTraits>
auto tcp<InetTraits>::listen(uint16_t port, size_t queue_length) -> listener {
return listener(*this, port, queue_length);
}
// Actively opens a connection to `sa`.
//
// A random source port is drawn until the resulting connection id both
// hashes to the current cpu and is not in use yet. The returned future
// resolves with the connection once the handshake is done.
// NOTE(review): the port search has no iteration bound.
template <typename InetTraits>
future<typename tcp<InetTraits>::connection> tcp<InetTraits>::connect(socket_address sa) {
    uint16_t src_port;
    connid id;
    auto src_ip = _inet._inet.host_address();
    auto dst_ip = ipv4_address(sa);
    auto dst_port = net::ntoh(sa.u.in.sin_port);
    for (;;) {
        src_port = _port_dist(_e);
        id = connid{src_ip, dst_ip, src_port, dst_port};
        const bool on_this_cpu = _inet._inet.netif()->hash2cpu(id.hash()) == engine().cpu_id();
        if (on_this_cpu && _tcbs.find(id) == _tcbs.end()) {
            break;
        }
    }
    auto tcbp = make_lw_shared<tcb>(*this, id);
    _tcbs.insert({id, tcbp});
    tcbp->connect();
    return tcbp->connect_done().then([tcbp] {
        return make_ready_future<connection>(connection(tcbp));
    });
}
// Adds the TCP source and destination ports (when the header at `off` is
// available) to the data used for the forwarding hash. Always returns true.
template <typename InetTraits>
bool tcp<InetTraits>::forward(forward_hash& out_hash_data, packet& p, size_t off) {
    if (auto th = p.get_header<tcp_hdr>(off)) {
        out_hash_data.push_back(th->src_port);
        out_hash_data.push_back(th->dst_port);
    }
    return true;
}
// Entry point for a TCP segment delivered by the IP layer.
//
// Validates the header (presence, data offset, checksum unless offloaded),
// looks up the connection the segment belongs to and dispatches it:
//  - no tcb and no usable listener: answer with a reset (CLOSED state);
//  - no tcb but a listener with queue space: LISTEN state processing, a SYN
//    creates a new tcb and queues a connection on the listener;
//  - existing tcb: SYN_SENT processing or the generic state processing.
//
// p    - the segment, starting at the TCP header
// from - source address of the segment (the remote end)
// to   - destination address of the segment (the local end)
template <typename InetTraits>
void tcp<InetTraits>::received(packet p, ipaddr from, ipaddr to) {
    auto th = p.get_header<tcp_hdr>(0);
    if (!th) {
        return;
    }
    // th->data_offset is correct even before ntoh()
    const unsigned hdr_len = unsigned(th->data_offset * 4);
    // Drop malformed segments. The header can neither be shorter than the
    // fixed part nor longer than the whole packet: the state handlers call
    // p.get_header(0, data_offset * 4), do pointer arithmetic on the result
    // without a null check and then trim that many bytes off the packet.
    if (hdr_len < sizeof(*th) || hdr_len > p.len()) {
        return;
    }
    if (!hw_features().rx_csum_offload) {
        checksummer csum;
        InetTraits::tcp_pseudo_header_checksum(csum, from, to, p.len());
        csum.sum(p);
        if (csum.get() != 0) {
            return;
        }
    }
    // Work on a host-byte-order copy of the header from here on.
    auto h = ntoh(*th);
    auto id = connid{to, from, h.dst_port, h.src_port};
    auto tcbi = _tcbs.find(id);
    lw_shared_ptr<tcb> tcbp;
    if (tcbi == _tcbs.end()) {
        auto listener = _listening.find(id.local_port);
        if (listener == _listening.end() || listener->second->_q.full()) {
            // 1) In CLOSE state
            // 1.1 all data in the incoming segment is discarded. An incoming
            // segment containing a RST is discarded. An incoming segment not
            // containing a RST causes a RST to be sent in response.
            // FIXME:
            //      if ACK off: <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK>
            //      if ACK on:  <SEQ=SEG.ACK><CTL=RST>
            return respond_with_reset(&h, id.local_ip, id.foreign_ip);
        } else {
            // 2) In LISTEN state
            // 2.1 first check for an RST
            if (h.f_rst) {
                // An incoming RST should be ignored
                return;
            }
            // 2.2 second check for an ACK
            if (h.f_ack) {
                // Any acknowledgment is bad if it arrives on a connection
                // still in the LISTEN state.
                // <SEQ=SEG.ACK><CTL=RST>
                return respond_with_reset(&h, id.local_ip, id.foreign_ip);
            }
            // 2.3 third check for a SYN
            if (h.f_syn) {
                // check the security
                // NOTE: Ignored for now
                tcbp = make_lw_shared<tcb>(*this, id);
                listener->second->_q.push(connection(tcbp));
                _tcbs.insert({id, tcbp});
                return tcbp->input_handle_listen_state(&h, std::move(p));
            }
            // 2.4 fourth other text or control
            // So you are unlikely to get here, but if you do, drop the
            // segment, and return.
            return;
        }
    } else {
        tcbp = tcbi->second;
        if (tcbp->state() == tcp_state::SYN_SENT) {
            // 3) In SYN_SENT State
            return tcbp->input_handle_syn_sent_state(&h, std::move(p));
        } else {
            // 4) In other state, can be one of the following:
            // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
            // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
            return tcbp->input_handle_other_state(&h, std::move(p));
        }
    }
}
// Sends a packet that does not belong to any tcb.
// The packet is silently dropped when it does not fit into the remaining
// queue space; otherwise it is queued once the L2 address of `to` is known.
// NOTE(review): the future returned by the address lookup is discarded, and
// the queue space taken here is only given back when the packet is dequeued.
template <typename InetTraits>
void tcp<InetTraits>::send_packet_without_tcb(ipaddr from, ipaddr to, packet p) {
    if (!_queue_space.try_wait(p.len())) {
        return;
    }
    _inet.get_l2_dst_address(to).then([this, to, p = std::move(p)] (ethernet_address e_dst) mutable {
        _packetq.emplace_back(ipv4_traits::l4packet{to, std::move(p), e_dst, ip_protocol_num::tcp});
    });
}
// Detaches from the tcb and closes both directions. A handle without a tcb
// (moved-from) has nothing to do.
template <typename InetTraits>
tcp<InetTraits>::connection::~connection() {
    if (!_tcb) {
        return;
    }
    _tcb->_conn = nullptr;
    close_read();
    close_write();
}
// Creates the control block for the connection identified by `id` and wires
// up the three timers:
//  - delayed ACK: resets the full-segment counter and triggers output;
//  - retransmit:  calls retransmit();
//  - persist:     calls persist().
template <typename InetTraits>
tcp<InetTraits>::tcb::tcb(tcp& t, connid id)
    : _tcp(t)
    , _local_ip(id.local_ip)
    , _foreign_ip(id.foreign_ip)
    , _local_port(id.local_port)
    , _foreign_port(id.foreign_port)
    , _delayed_ack([this] {
          _nr_full_seg_received = 0;
          output();
      })
    , _retransmit([this] { retransmit(); })
    , _persist([this] { persist(); })
{
}
// Answers the received header `rth` with a reset, using this connection's
// local and foreign addresses.
template <typename InetTraits>
void tcp<InetTraits>::tcb::respond_with_reset(tcp_hdr* rth)
{
    _tcp.respond_with_reset(rth, _local_ip, _foreign_ip);
}
// Builds and sends a RST segment in response to the received header `rth`
// (given in host byte order). Nothing is sent in response to a RST.
//
// rth        - header of the offending segment
// local_ip   - our address (source of the reset)
// foreign_ip - peer address (destination of the reset)
template <typename InetTraits>
void tcp<InetTraits>::respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip) {
if (rth->f_rst) {
return;
}
packet p;
auto th = p.prepend_header<tcp_hdr>();
// Swap the ports: the reset travels in the opposite direction.
th->src_port = rth->dst_port;
th->dst_port = rth->src_port;
// <SEQ=SEG.ACK> when the offending segment carried an ACK.
if (rth->f_ack) {
th->seq = rth->ack;
}
// If this RST packet is in response to a SYN packet. We ACK the ISN.
if (rth->f_syn) {
th->ack = rth->seq + 1;
th->f_ack = true;
}
th->f_rst = true;
th->data_offset = sizeof(*th) / 4;
th->checksum = 0;
// Convert to network byte order before the checksum is computed.
*th = hton(*th);
checksummer csum;
offload_info oi;
InetTraits::tcp_pseudo_header_checksum(csum, local_ip, foreign_ip, sizeof(*th));
if (hw_features().tx_csum_l4_offload) {
// With L4 checksum offload only the pseudo header sum is stored and the
// packet is flagged as still needing its checksum.
th->checksum = ~csum.get();
oi.needs_csum = true;
} else {
csum.sum(p);
th->checksum = csum.get();
oi.needs_csum = false;
}
oi.protocol = ip_protocol_num::tcp;
oi.tcp_hdr_len = sizeof(tcp_hdr);
p.set_offload_info(oi);
send_packet_without_tcb(local_ip, foreign_ip, std::move(p));
}
// Processes an acknowledgment of sent data up to (not including) seg_ack.
//
// Fully acknowledged segments are removed from the retransmission queue; a
// partially acknowledged head segment is trimmed. SND.UNA, the congestion
// window and (for segments that were never retransmitted) the RTO are
// updated along the way.
//
// Returns the total number of bytes newly acknowledged.
template <typename InetTraits>
uint32_t tcp<InetTraits>::tcb::data_segment_acked(tcp_seq seg_ack) {
uint32_t total_acked_bytes = 0;
// Full ACK of segment
while (!_snd.data.empty()
&& (_snd.unacknowledged + _snd.data.front().p.len() <= seg_ack)) {
auto acked_bytes = _snd.data.front().p.len();
_snd.unacknowledged += acked_bytes;
// Ignore retransmitted segments when setting the RTO
if (_snd.data.front().nr_transmits == 0) {
update_rto(_snd.data.front().tx_time);
}
update_cwnd(acked_bytes);
total_acked_bytes += acked_bytes;
// Give the queue space occupied by this segment back to the sender.
_snd.user_queue_space.signal(_snd.data.front().data_len);
_snd.data.pop_front();
}
// Partial ACK of segment
if (_snd.unacknowledged < seg_ack) {
auto acked_bytes = seg_ack - _snd.unacknowledged;
if (!_snd.data.empty()) {
// Drop the acknowledged prefix of the head segment.
auto& unacked_seg = _snd.data.front();
unacked_seg.p.trim_front(acked_bytes);
}
_snd.unacknowledged = seg_ack;
update_cwnd(acked_bytes);
total_acked_bytes += acked_bytes;
}
return total_acked_bytes;
}
// Acceptability test for an incoming segment:
//
//   SEG.LEN  RCV.WND  test
//   0        0        SEG.SEQ = RCV.NXT
//   0        >0       RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
//   >0       0        not acceptable
//   >0       >0       RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
//                     or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND
template <typename InetTraits>
bool tcp<InetTraits>::tcb::segment_acceptable(tcp_seq seg_seq, unsigned seg_len) {
    const bool empty_segment = seg_len == 0;

    if (_rcv.window == 0) {
        // Only an empty segment sitting exactly at RCV.NXT is acceptable.
        return empty_segment && seg_seq == _rcv.next;
    }

    // The first byte falls inside the window.
    const bool first_in_window = (_rcv.next <= seg_seq) && (seg_seq < _rcv.next + _rcv.window);
    if (empty_segment) {
        return first_in_window;
    }

    // The last byte falls inside the window.
    const tcp_seq last = seg_seq + seg_len - 1;
    const bool last_in_window = (_rcv.next <= last) && (last < _rcv.next + _rcv.window);
    return first_in_window || last_in_window;
}
// Initializes the send/receive parameters of the connection from the options
// and the window carried by the peer's SYN (or SYN,ACK) segment.
//
// th                  - received header, host byte order
// opt_start / opt_end - byte range of the TCP options of that segment
template <typename InetTraits>
void tcp<InetTraits>::tcb::init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end) {
    _option.parse(opt_start, opt_end);

    // Window scale factors: the remote one applies to the window the peer
    // advertises to us, the local one to the window we advertise.
    _snd.window_scale = _option._remote_win_scale;
    _rcv.window_scale = _option._local_win_scale;

    // Maximum segment sizes the remote / the local side can receive.
    _snd.mss = _option._remote_mss;
    _option._local_mss = local_mss();
    _rcv.mss = _option._local_mss;

    // Linux's default window size
    _rcv.window = 29200 << _rcv.window_scale;
    _snd.window = th->window << _snd.window_scale;

    // Sequence / acknowledgment number used for the last window update.
    _snd.wl1 = th->seq;
    _snd.wl2 = th->ack;

    // Initial congestion window: 2, 3 or 4 segments depending on the MSS.
    uint32_t initial_segments = 4;
    if (_snd.mss > 2190) {
        initial_segments = 2;
    } else if (_snd.mss > 1095) {
        initial_segments = 3;
    }
    _snd.cwnd = initial_segments * _snd.mss;

    // Initial slow start threshold: the peer's advertised (scaled) window.
    _snd.ssthresh = th->window << _snd.window_scale;
}
// Handles the SYN that created this tcb (LISTEN state processing): records
// the peer's initial sequence number, chooses our own, applies the options
// of the SYN and moves to SYN_RECEIVED, which sends <SYN,ACK>.
//
// th - received header, host byte order
// p  - the segment, still including the TCP header
template <typename InetTraits>
void tcp<InetTraits>::tcb::input_handle_listen_state(tcp_hdr* th, packet p) {
// The options occupy the part of the header beyond the fixed tcp_hdr.
auto opt_len = th->data_offset * 4 - sizeof(tcp_hdr);
// NOTE(review): the result of get_header() is used without a null check.
auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + sizeof(tcp_hdr);
auto opt_end = opt_start + opt_len;
p.trim_front(th->data_offset * 4);
tcp_seq seg_seq = th->seq;
// Set RCV.NXT to SEG.SEQ+1, IRS is set to SEG.SEQ
_rcv.next = seg_seq + 1;
_rcv.initial = seg_seq;
// ISS should be selected and a SYN segment sent of the form:
// <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>
// SND.NXT is set to ISS+1 and SND.UNA to ISS
// NOTE: In previous code, _snd.next is set to ISS + 1 only when SYN is
// ACKed. Now, we set _snd.next to ISS + 1 here, so in output_one(): we
// have
// th->seq = syn_on ? _snd.initial : _snd.next
// to make sure retransmitted SYN has correct SEQ number.
do_setup_isn();
_rcv.urgent = _rcv.next;
tcp_debug("listen: LISTEN -> SYN_RECEIVED\n");
init_from_options(th, opt_start, opt_end);
do_syn_received();
}
// Processes a segment received while the connection is in SYN_SENT.
//
// An unacceptable ACK is answered with a reset; an acceptable RST resets the
// connection; a SYN either completes the handshake (our SYN was ACKed ->
// ESTABLISHED) or starts a simultaneous open (-> SYN_RECEIVED). Anything
// else is dropped.
//
// th - received header, host byte order
// p  - the segment, still including the TCP header
template <typename InetTraits>
void tcp<InetTraits>::tcb::input_handle_syn_sent_state(tcp_hdr* th, packet p) {
// The options occupy the part of the header beyond the fixed tcp_hdr.
auto opt_len = th->data_offset * 4 - sizeof(tcp_hdr);
// NOTE(review): the result of get_header() is used without a null check.
auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + sizeof(tcp_hdr);
auto opt_end = opt_start + opt_len;
p.trim_front(th->data_offset * 4);
tcp_seq seg_seq = th->seq;
auto seg_ack = th->ack;
bool acceptable = false;
// 3.1 first check the ACK bit
if (th->f_ack) {
// If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless the
// RST bit is set, if so drop the segment and return)
if (seg_ack <= _snd.initial || seg_ack > _snd.next) {
return respond_with_reset(th);
}
// If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
acceptable = _snd.unacknowledged <= seg_ack && seg_ack <= _snd.next;
}
// 3.2 second check the RST bit
if (th->f_rst) {
// If the ACK was acceptable then signal the user "error: connection
// reset", drop the segment, enter CLOSED state, delete TCB, and
// return. Otherwise (no ACK) drop the segment and return.
if (acceptable) {
return do_reset();
} else {
return;
}
}
// 3.3 third check the security and precedence
// NOTE: Ignored for now
// 3.4 fourth check the SYN bit
if (th->f_syn) {
// RCV.NXT is set to SEG.SEQ+1, IRS is set to SEG.SEQ. SND.UNA should
// be advanced to equal SEG.ACK (if there is an ACK), and any segments
// on the retransmission queue which are thereby acknowledged should be
// removed.
_rcv.next = seg_seq + 1;
_rcv.initial = seg_seq;
if (th->f_ack) {
// TODO: clean retransmission queue
_snd.unacknowledged = seg_ack;
}
if (_snd.unacknowledged > _snd.initial) {
// If SND.UNA > ISS (our SYN has been ACKed), change the connection
// state to ESTABLISHED, form an ACK segment
// <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
tcp_debug("syn: SYN_SENT -> ESTABLISHED\n");
init_from_options(th, opt_start, opt_end);
do_established();
output();
} else {
// Otherwise enter SYN_RECEIVED, form a SYN,ACK segment
// <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>
// NOTE(review): init_from_options() is not called on this path.
tcp_debug("syn: SYN_SENT -> SYN_RECEIVED\n");
do_syn_received();
}
}
// 3.5 fifth, if neither of the SYN or RST bits is set then drop the
// segment and return.
return;
}
template <typename InetTraits>
void tcp<InetTraits>::tcb::input_handle_other_state(tcp_hdr* th, packet p) {
p.trim_front(th->data_offset * 4);
bool do_output = false;
bool do_output_data = false;
tcp_seq seg_seq = th->seq;
auto seg_ack = th->ack;
auto seg_len = p.len();
// 4.1 first check sequence number
if (!segment_acceptable(seg_seq, seg_len)) {
//<SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
return output();
}
// In the following it is assumed that the segment is the idealized
// segment that begins at RCV.NXT and does not exceed the window.
if (seg_seq < _rcv.next) {
// ignore already acknowledged data
auto dup = std::min(uint32_t(_rcv.next - seg_seq), seg_len);
p.trim_front(dup);
seg_len -= dup;
seg_seq += dup;
}
// FIXME: We should trim data outside the right edge of the receive window as well
if (seg_seq != _rcv.next) {
insert_out_of_order(seg_seq, std::move(p));
// A TCP receiver SHOULD send an immediate duplicate ACK
// when an out-of-order segment arrives.
return output();
}
// 4.2 second check the RST bit
if (th->f_rst) {
if (in_state(SYN_RECEIVED)) {
// If this connection was initiated with a passive OPEN (i.e.,
// came from the LISTEN state), then return this connection to
// LISTEN state and return. The user need not be informed. If
// this connection was initiated with an active OPEN (i.e., came
// from SYN_SENT state) then the connection was refused, signal
// the user "connection refused". In either case, all segments
// on the retransmission queue should be removed. And in the
// active OPEN case, enter the CLOSED state and delete the TCB,
// and return.
_connect_done.set_exception(tcp_refused_error());
return do_reset();
}
if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2 | CLOSE_WAIT)) {
// If the RST bit is set then, any outstanding RECEIVEs and SEND
// should receive "reset" responses. All segment queues should be
// flushed. Users should also receive an unsolicited general
// "connection reset" signal. Enter the CLOSED state, delete the
// TCB, and return.
return do_reset();
}
if (in_state(CLOSING | LAST_ACK | TIME_WAIT)) {
// If the RST bit is set then, enter the CLOSED state, delete the
// TCB, and return.
return do_closed();
}
}
// 4.3 third check security and precedence
// NOTE: Ignored for now
// 4.4 fourth, check the SYN bit
if (th->f_syn) {
// SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
// CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
// If the SYN is in the window it is an error, send a reset, any
// outstanding RECEIVEs and SEND should receive "reset" responses,
// all segment queues should be flushed, the user should also
// receive an unsolicited general "connection reset" signal, enter
// the CLOSED state, delete the TCB, and return.
respond_with_reset(th);
return do_reset();
// If the SYN is not in the window this step would not be reached
// and an ack would have been sent in the first step (sequence
// number check).
}
// 4.5 fifth check the ACK field
if (!th->f_ack) {
// if the ACK bit is off drop the segment and return
return;
} else {
// SYN_RECEIVED STATE
if (in_state(SYN_RECEIVED)) {
// If SND.UNA =< SEG.ACK =< SND.NXT then enter ESTABLISHED state
// and continue processing.
if (_snd.unacknowledged <= seg_ack && seg_ack <= _snd.next) {
tcp_debug("SYN_RECEIVED -> ESTABLISHED\n");
do_established();
} else {
// <SEQ=SEG.ACK><CTL=RST>
return respond_with_reset(th);
}
}
auto update_window = [this, th, seg_seq, seg_ack] {
tcp_debug("window update seg_seq=%d, seg_ack=%d, old window=%d new window=%d\n",
seg_seq, seg_ack, _snd.window, th->window << _snd.window_scale);
_snd.window = th->window << _snd.window_scale;
_snd.wl1 = seg_seq;
_snd.wl2 = seg_ack;
if (_snd.window == 0) {
_persist_time_out = _rto;
start_persist_timer();
} else {
stop_persist_timer();
}
};
// ESTABLISHED STATE or
// CLOSE_WAIT STATE: Do the same processing as for the ESTABLISHED state.
if (in_state(ESTABLISHED | CLOSE_WAIT)){
// If SND.UNA < SEG.ACK =< SND.NXT then, set SND.UNA <- SEG.ACK.
if (_snd.unacknowledged < seg_ack && seg_ack <= _snd.next) {
// Remote ACKed data we sent
auto acked_bytes = data_segment_acked(seg_ack);
// If SND.UNA < SEG.ACK =< SND.NXT, the send window should be updated.
if (_snd.wl1 < seg_seq || (_snd.wl1 == seg_seq && _snd.wl2 <= seg_ack)) {
update_window();
}
// some data is acked, try send more data
do_output_data = true;
auto set_retransmit_timer = [this] {
if (_snd.data.empty()) {
// All outstanding segments are acked, turn off the timer.
stop_retransmit_timer();
// Signal the waiter of this event
signal_all_data_acked();
} else {
// Restart the timer because new data is acked.
start_retransmit_timer();
}
};
if (_snd.dupacks >= 3) {
// We are in fast retransmit / fast recovery phase
uint32_t smss = _snd.mss;
if (seg_ack > _snd.recover) {
tcp_debug("ack: full_ack\n");
// Set cwnd to min (ssthresh, max(FlightSize, SMSS) + SMSS)
_snd.cwnd = std::min(_snd.ssthresh, std::max(flight_size(), smss) + smss);
// Exit the fast recovery procedure
exit_fast_recovery();
set_retransmit_timer();
} else {
tcp_debug("ack: partial_ack\n");
// Retransmit the first unacknowledged segment
fast_retransmit();
// Deflate the congestion window by the amount of new data
// acknowledged by the Cumulative Acknowledgment field
_snd.cwnd -= acked_bytes;
// If the partial ACK acknowledges at least one SMSS of new
// data, then add back SMSS bytes to the congestion window
if (acked_bytes >= smss) {
_snd.cwnd += smss;
}
// Send a new segment if permitted by the new value of
// cwnd. Do not exit the fast recovery procedure For
// the first partial ACK that arrives during fast
// recovery, also reset the retransmit timer.
if (++_snd.partial_ack == 1) {
start_retransmit_timer();
}
}
} else {
// RFC5681: The fast retransmit algorithm uses the arrival
// of 3 duplicate ACKs (as defined in section 2, without
// any intervening ACKs which move SND.UNA) as an
// indication that a segment has been lost.
//
// So, here we reset dupacks to zero because this ACK moves
// SND.UNA.
exit_fast_recovery();
set_retransmit_timer();
}
} else if (!_snd.data.empty() && seg_len == 0 &&
th->f_fin == 0 && th->f_syn == 0 &&
th->ack == _snd.unacknowledged &&
uint32_t(th->window << _snd.window_scale) == _snd.window) {
// Note:
// RFC793 states:
// If the ACK is a duplicate (SEG.ACK < SND.UNA), it can be ignored
// RFC5681 states:
// The TCP sender SHOULD use the "fast retransmit" algorithm to detect
// and repair loss, based on incoming duplicate ACKs.
// Here, We follow RFC5681.
_snd.dupacks++;
uint32_t smss = _snd.mss;
// 3 duplicated ACKs trigger a fast retransmit
if (_snd.dupacks == 1 || _snd.dupacks == 2) {
// RFC5681 Step 3.1
// Send cwnd + 2 * smss per RFC3042
do_output_data = true;
} else if (_snd.dupacks == 3) {
// RFC6582 Step 3.2
if (seg_ack - 1 > _snd.recover) {
_snd.recover = _snd.next - 1;
// RFC5681 Step 3.2
_snd.ssthresh = std::max((flight_size() - _snd.limited_transfer) / 2, 2 * smss);
fast_retransmit();
} else {
// Do not enter fast retransmit and do not reset ssthresh
}
// RFC5681 Step 3.3
_snd.cwnd = _snd.ssthresh + 3 * smss;
} else if (_snd.dupacks > 3) {
// RFC5681 Step 3.4
_snd.cwnd += smss;
// RFC5681 Step 3.5
do_output_data = true;
}
} else if (seg_ack > _snd.next) {
// If the ACK acks something not yet sent (SEG.ACK > SND.NXT)
// then send an ACK, drop the segment, and return
return output();
} else if (_snd.window == 0 && th->window > 0) {
update_window();
do_output_data = true;
}
}
// FIN_WAIT_1 STATE
if (in_state(FIN_WAIT_1)) {
// In addition to the processing for the ESTABLISHED state, if
// our FIN is now acknowledged then enter FIN-WAIT-2 and continue
// processing in that state.
if (seg_ack == _snd.next + 1) {
tcp_debug("ack: FIN_WAIT_1 -> FIN_WAIT_2\n");
_state = FIN_WAIT_2;
do_local_fin_acked();
}
}
// FIN_WAIT_2 STATE
if (in_state(FIN_WAIT_2)) {
// In addition to the processing for the ESTABLISHED state, if
// the retransmission queue is empty, the users CLOSE can be
// acknowledged ("ok") but do not delete the TCB.
// TODO
}
// CLOSING STATE
if (in_state(CLOSING)) {
if (seg_ack == _snd.next + 1) {
tcp_debug("ack: CLOSING -> TIME_WAIT\n");
do_local_fin_acked();
return do_time_wait();
} else {
return;
}
}
// LAST_ACK STATE
if (in_state(LAST_ACK)) {
if (seg_ack == _snd.next + 1) {
tcp_debug("ack: LAST_ACK -> CLOSED\n");
do_local_fin_acked();
return do_closed();
}
}
// TIME_WAIT STATE
if (in_state(TIME_WAIT)) {
// The only thing that can arrive in this state is a
// retransmission of the remote FIN. Acknowledge it, and restart
// the 2 MSL timeout.
// TODO
}
}
// 4.6 sixth, check the URG bit
if (th->f_urg) {
// TODO
}
// 4.7 seventh, process the segment text
if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_1)) {
if (p.len()) {
// Once the TCP takes responsibility for the data it advances
// RCV.NXT over the data accepted, and adjusts RCV.WND as
// appropriate to the current buffer availability. The total of
// RCV.NXT and RCV.WND should not be reduced.
_rcv.data.push_back(std::move(p));
_rcv.next += seg_len;
auto merged = merge_out_of_order();
signal_data_received();
// Send an acknowledgment of the form:
// <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
// This acknowledgment should be piggybacked on a segment being
// transmitted if possible without incurring undue delay.
if (merged) {
// TCP receiver SHOULD send an immediate ACK when the
// incoming segment fills in all or part of a gap in the
// sequence space.
do_output = true;
} else {
do_output = should_send_ack(seg_len);
}
}
} else if (in_state(CLOSE_WAIT | CLOSING | LAST_ACK | TIME_WAIT)) {
// This should not occur, since a FIN has been received from the
// remote side. Ignore the segment text.
return;
}
// 4.8 eighth, check the FIN bit
if (th->f_fin) {
if (in_state(CLOSED | LISTEN | SYN_SENT)) {
// Do not process the FIN if the state is CLOSED, LISTEN or SYN-SENT
// since the SEG.SEQ cannot be validated; drop the segment and return.
return;
}
auto fin_seq = seg_seq + seg_len;
if (fin_seq == _rcv.next) {
_rcv.next = fin_seq + 1;
signal_data_received();
// If this <FIN> packet contains data as well, we can ACK both data
// and <FIN> in a single packet, so cancel the previous ACK.
clear_delayed_ack();
do_output = false;
// Send ACK for the FIN!
output();
if (in_state(SYN_RECEIVED | ESTABLISHED)) {
tcp_debug("fin: SYN_RECEIVED or ESTABLISHED -> CLOSE_WAIT\n");
_state = CLOSE_WAIT;
}
if (in_state(FIN_WAIT_1)) {
// If our FIN has been ACKed (perhaps in this segment), then
// enter TIME-WAIT, start the time-wait timer, turn off the other
// timers; otherwise enter the CLOSING state.
// Note: If our FIN has been ACKed, we should be in FIN_WAIT_2
// not FIN_WAIT_1 if we reach here.
tcp_debug("fin: FIN_WAIT_1 -> CLOSING\n");
_state = CLOSING;
}
if (in_state(FIN_WAIT_2)) {
tcp_debug("fin: FIN_WAIT_2 -> TIME_WAIT\n");
return do_time_wait();
}
}
}
if (do_output || (do_output_data && can_send())) {
// Since we will do output, we can cancel the scheduled delayed ACK.
clear_delayed_ack();
output();
}
}
// Builds the payload of the next outgoing segment from the unsent queue.
//
// Returns an empty packet when nothing is queued. Otherwise returns bytes
// taken from the head of _snd.unsent, limited to the smaller of can_send()
// and the largest payload that may be handed to the NIC, and subtracts the
// returned length from _snd.unsent_len.
template <typename InetTraits>
packet tcp<InetTraits>::tcb::get_transmit_packet() {
    auto& queue = _snd.unsent;
    if (queue.empty()) {
        return packet();
    }

    auto budget = this->can_send();

    // Largest TCP payload we can pass to the NIC in one packet.
    uint32_t nic_limit;
    if (!_tcp.hw_features().tx_tso) {
        nic_limit = std::min(uint16_t(_tcp.hw_features().mtu - net::tcp_hdr_len_min - InetTraits::ip_hdr_len_min), _snd.mss);
    } else {
        // FIXME: inform the tap device of the size of the split packet
        nic_limit = _tcp.hw_features().max_packet_len - net::tcp_hdr_len_min - InetTraits::ip_hdr_len_min;
    }
    budget = std::min(budget, nic_limit);

    // The head packet alone exceeds the budget: hand out its first `budget`
    // bytes and leave the remainder at the front of the queue.
    if (queue.front().len() > budget) {
        auto head_part = queue.front().share(0, budget);
        queue.front().trim_front(budget);
        _snd.unsent_len -= head_part.len();
        return head_part;
    }

    // The head packet fits entirely. Take it, then keep appending whole
    // packets while they fit in what is left of the budget.
    auto out = std::move(queue.front());
    queue.pop_front();
    budget -= out.len();
    while (!queue.empty() && queue.front().len() <= budget) {
        budget -= queue.front().len();
        out.append(std::move(queue.front()));
        queue.pop_front();
    }

    // Fill any remaining budget with the leading part of the next packet.
    if (!queue.empty() && budget) {
        auto& next = queue.front();
        out.append(next.share(0, budget));
        next.trim_front(budget);
    }

    _snd.unsent_len -= out.len();
    return out;
}
// Builds one TCP segment and queues it for transmission.
//
// data_retransmit:
//   true  - the payload is a shared copy of the oldest unacknowledged
//           segment (_snd.data.front(), which must exist) and the segment
//           is sent with SEQ = SND.UNA; SND.NXT is left untouched.
//   false - the payload comes from get_transmit_packet(), SEQ is the ISS
//           when a SYN is needed and SND.NXT otherwise, and SND.NXT is
//           advanced by the payload length.
//
// Nothing is built or queued while the connection is in CLOSED state.
template <typename InetTraits>
void tcp<InetTraits>::tcb::output_one(bool data_retransmit) {
    if (in_state(CLOSED)) {
        return;
    }

    packet p = data_retransmit ? _snd.data.front().p.share() : get_transmit_packet();
    // Payload length; taken before the TCP header is prepended to p.
    uint16_t len = p.len();
    bool syn_on = syn_needs_on();
    bool ack_on = ack_needs_on();

    auto options_size = _option.get_size(syn_on, ack_on);
    auto th = p.prepend_header<tcp_hdr>(options_size);

    th->src_port = _local_port;
    th->dst_port = _foreign_port;

    th->f_syn = syn_on;
    th->f_ack = ack_on;
    if (ack_on) {
        // This segment carries the ACK, so a pending delayed ACK is redundant.
        clear_delayed_ack();
    }
    th->f_urg = false;
    th->f_psh = false;

    tcp_seq seq;
    if (data_retransmit) {
        seq = _snd.unacknowledged;
    } else {
        seq = syn_on ? _snd.initial : _snd.next;
        _snd.next += len;
    }
    th->seq = seq;
    th->ack = _rcv.next;
    th->data_offset = (sizeof(*th) + options_size) / 4;
    th->window = _rcv.window >> _rcv.window_scale;
    th->checksum = 0;

    // FIXME: does the FIN have to fit in the window?
    bool fin_on = fin_needs_on();
    th->f_fin = fin_on;

    // Add tcp options
    _option.fill(th, options_size);
    *th = hton(*th);

    offload_info oi;
    checksummer csum;
    uint16_t pseudo_hdr_seg_len = 0;

    oi.tcp_hdr_len = sizeof(tcp_hdr) + options_size;

    if (_tcp.hw_features().tx_csum_l4_offload) {
        oi.needs_csum = true;

        //
        // tx checksum offloading: both virtio-net's VIRTIO_NET_F_CSUM dpdk's
        // PKT_TX_TCP_CKSUM - requires th->checksum to be initialized to ones'
        // complement sum of the pseudo header.
        //
        // For TSO the csum should be calculated for a pseudo header with
        // segment length set to 0. All the rest is the same as for a TCP Tx
        // CSUM offload case.
        //
        // At this point p holds only the TCP header and the payload (no IP
        // or Ethernet header has been prepended yet), so the decision is
        // made on the payload length alone: the segment needs to be split
        // by the NIC exactly when the payload exceeds one MSS.
        //
        if (_tcp.hw_features().tx_tso && len > _snd.mss) {
            oi.tso_seg_size = _snd.mss;
        } else {
            pseudo_hdr_seg_len = sizeof(*th) + options_size + len;
        }
    } else {
        pseudo_hdr_seg_len = sizeof(*th) + options_size + len;
        oi.needs_csum = false;
    }

    InetTraits::tcp_pseudo_header_checksum(csum, _local_ip, _foreign_ip,
                                           pseudo_hdr_seg_len);

    if (_tcp.hw_features().tx_csum_l4_offload) {
        // Hardware completes the checksum; seed it with the pseudo header sum.
        th->checksum = ~csum.get();
    } else {
        // Software checksum over the whole segment (header + payload).
        csum.sum(p);
        th->checksum = csum.get();
    }

    oi.protocol = ip_protocol_num::tcp;

    p.set_offload_info(oi);

    // First transmission of something that occupies sequence space (data,
    // SYN or FIN): remember the data for retransmission and make sure the
    // retransmit timer is running.
    if (!data_retransmit && (len || syn_on || fin_on)) {
        auto now = clock_type::now();
        if (len) {
            unsigned nr_transmits = 0;
            _snd.data.emplace_back(unacked_segment{p.share(sizeof(tcp_hdr) + options_size, len),
                                   len, nr_transmits, now});
        }
        if (!_retransmit.armed()) {
            start_retransmit_timer(now);
        }
    }

    queue_packet(std::move(p));
}
// Returns a future the reader can wait on for incoming data.
//
// The future is already resolved when received data is queued or when
// foreign_will_not_send() reports true. Otherwise a fresh promise is
// installed in _rcv._data_received_promise and its future is returned.
template <typename InetTraits>
future<> tcp<InetTraits>::tcb::wait_for_data() {
    const bool must_wait = _rcv.data.empty() && !foreign_will_not_send();
    if (must_wait) {
        _rcv._data_received_promise = promise<>();
        return _rcv._data_received_promise->get_future();
    }
    return make_ready_future<>();
}
// Returns a future the sender can wait on until nothing is outstanding.
//
// "Nothing outstanding" means: no unacknowledged segments (_snd.data), no
// unsent bytes and no bytes still waiting for queue space. If that already
// holds the future is ready; otherwise a fresh promise is installed in
// _snd._all_data_acked_promise and its future is returned.
template <typename InetTraits>
future<> tcp<InetTraits>::tcb::wait_for_all_data_acked() {
    const bool nothing_outstanding =
        _snd.data.empty() && _snd.unsent_len == 0 && _snd.queued_len == 0;
    if (!nothing_outstanding) {
        _snd._all_data_acked_promise = promise<>();
        return _snd._all_data_acked_promise->get_future();
    }
    return make_ready_future<>();
}
// Active open.
//
// RFC793: an initial send sequence number (ISS) is selected, a SYN segment
// of the form <SEQ=ISS><CTL=SYN> is sent, SND.UNA is set to ISS, SND.NXT to
// ISS+1, and the connection enters SYN-SENT. Here the ISS is set up by
// do_setup_isn(), the local receive parameters are initialised, and the
// rest is delegated to do_syn_sent().
template <typename InetTraits>
void tcp<InetTraits>::tcb::connect() {
    do_setup_isn();

    // Local receive window scale factor, also advertised via the options.
    _option._local_win_scale = 7;
    _rcv.window_scale = _option._local_win_scale;

    // Maximum segment size the local side can receive.
    _option._local_mss = local_mss();
    _rcv.mss = _option._local_mss;

    // Linux's default window size, scaled.
    _rcv.window = 29200 << _rcv.window_scale;

    do_syn_sent();
}
// Drains the receive queue: concatenates every packet queued in _rcv.data,
// in order, into a single packet, empties the queue and returns the result
// (an empty packet when nothing was queued).
template <typename InetTraits>
packet tcp<InetTraits>::tcb::read() {
    packet merged;
    for (auto it = _rcv.data.begin(); it != _rcv.data.end(); ++it) {
        merged.append(std::move(*it));
    }
    _rcv.data.clear();
    return merged;
}
// Queues a packet of user data for transmission.
//
// Must not be called after the send side was closed (asserted). Returns a
// failed future carrying tcp_reset_error when the connection is CLOSED.
// Otherwise the bytes are accounted in _snd.queued_len until
// _snd.user_queue_space grants room for them; then the packet is moved to
// the unsent queue and output is kicked if can_send() allows it.
template <typename InetTraits>
future<> tcp<InetTraits>::tcb::send(packet p) {
    // We can not send after the connection is closed
    assert(!_snd.closed);

    if (in_state(CLOSED)) {
        return make_exception_future<>(tcp_reset_error());
    }

    // TODO: Handle p.len() > max user_queue_space case
    const auto nbytes = p.len();
    _snd.queued_len += nbytes;
    // The captured shared pointer keeps this tcb alive until the continuation ran.
    return _snd.user_queue_space.wait(nbytes).then([this, self = this->shared_from_this(), p = std::move(p)] () mutable {
        assert(!_snd.closed);
        const auto granted = p.len();
        _snd.unsent_len += granted;
        _snd.queued_len -= granted;
        _snd.unsent.push_back(std::move(p));
        if (can_send() > 0) {
            output();
        }
        return make_ready_future<>();
    });
}
// Closes the send side of the connection.
//
// Does nothing when the connection is CLOSED or the send side is already
// closed. Otherwise, once wait_for_all_data_acked() resolves, marks the
// send side closed, moves CLOSE_WAIT -> LAST_ACK or ESTABLISHED ->
// FIN_WAIT_1, and emits a FIN.
template <typename InetTraits>
void tcp<InetTraits>::tcb::close() {
    if (in_state(CLOSED)) {
        return;
    }
    if (_snd.closed) {
        return;
    }
    // TODO: We should return a future to upper layer
    wait_for_all_data_acked().then([this, self = this->shared_from_this()] () mutable {
        _snd.closed = true;
        tcp_debug("close: unsent_len=%d\n", _snd.unsent_len);
        if (in_state(CLOSE_WAIT)) {
            tcp_debug("close: CLOSE_WAIT -> LAST_ACK\n");
            _state = LAST_ACK;
        } else if (in_state(ESTABLISHED)) {
            tcp_debug("close: ESTABLISHED -> FIN_WAIT_1\n");
            _state = FIN_WAIT_1;
        }
        // Send <FIN> to the remote side.
        // output_one() is called directly so that a packet with FIN is
        // really generated: with only output() and a non-empty _packetq,
        // tcp::tcb::get_packet() would not build the FIN packet.
        output_one();
        output();
    });
}
// Decides whether a just-received data segment must be ACKed right away.
//
// seg_len: payload length of the received segment.
//
// Returns true (and cancels the delayed-ACK timer) when the segment is
// larger than the receive MSS, or when it is the second consecutive
// full-sized segment counted in _nr_full_seg_received. Returns false
// otherwise, arming the delayed-ACK timer if it is not already armed.
template <typename InetTraits>
bool tcp<InetTraits>::tcb::should_send_ack(uint16_t seg_len) {
    // Larger than one MSS: we've received a TSO packet, ack immediately.
    const bool oversized = seg_len > _rcv.mss;
    // Full-sized segment: ack every second one. The counter is only touched
    // for segments of exactly one MSS.
    const bool second_full_sized =
        !oversized && seg_len == _rcv.mss && _nr_full_seg_received++ >= 1;

    if (oversized || second_full_sized) {
        _nr_full_seg_received = 0;
        _delayed_ack.cancel();
        return true;
    }

    // No immediate ACK. Unless a delayed ACK is already pending, schedule
    // one. The maximum delay allowed by RFC1122 is 500ms, most
    // implementations use 200ms.
    if (!_delayed_ack.armed()) {
        _delayed_ack.arm(200ms);
    }
    return false;
}
// Cancels the delayed-ACK timer.
template <typename InetTraits>
void tcp<InetTraits>::tcb::clear_delayed_ack() {
    this->_delayed_ack.cancel();
}
// Moves out-of-order segments that have become contiguous with RCV.NXT
// into the in-order receive queue.
//
// Walks _rcv.out_of_order.map from its beginning. A segment covering
// RCV.NXT is trimmed of its already-received prefix, appended to _rcv.data
// and RCV.NXT is advanced past it. A segment lying entirely before RCV.NXT
// is dropped. The walk stops at the first segment starting after RCV.NXT.
//
// Returns true when at least one segment was merged.
template <typename InetTraits>
bool tcp<InetTraits>::tcb::merge_out_of_order() {
    auto& pending = _rcv.out_of_order.map;
    bool merged = false;

    auto it = pending.begin();
    while (it != pending.end()) {
        auto seg_beg = it->first;
        auto& seg = it->second;
        auto seg_len = seg.len();
        auto seg_end = seg_beg + seg_len;

        if (seg_beg <= _rcv.next && _rcv.next < seg_end) {
            // This segment was received out of order and the data in
            // front of it has arrived now.
            auto overlap = _rcv.next - seg_beg;
            if (overlap) {
                seg.trim_front(overlap);
                seg_len -= overlap;
            }
            _rcv.next += seg_len;
            _rcv.data.push_back(std::move(seg));
            // erase() returns the iterator following the removed element.
            it = pending.erase(it);
            merged = true;
            continue;
        }

        if (_rcv.next >= seg_end) {
            // Everything in this segment has been received already, drop it.
            it = pending.erase(it);
            continue;
        }

        // seg_beg > _rcv.next: there is still a gap. Later entries start
        // even further out, so there is nothing more to merge.
        break;
    }
    return merged;
}
// Stores a segment that arrived ahead of RCV.NXT.
//
// seg: sequence number of the first byte of p.
// p:   the segment payload; ownership is handed to the out-of-order store.
template <typename InetTraits>
void tcp<InetTraits>::tcb::insert_out_of_order(tcp_seq seg, packet p) {
    auto& store = _rcv.out_of_order;
    store.merge(seg, std::move(p));
}
// Not implemented: calling this terminates the process via abort().
template <typename InetTraits>
void tcp<InetTraits>::tcb::trim_receive_data_after_window() {
    abort();
}
// Persist timer handler: probes the peer's window and re-arms the timer
// with a doubled timeout.
template <typename InetTraits>
void tcp<InetTraits>::tcb::persist() {
    tcp_debug("persist timer fired\n");

    // Send a 1 byte packet to probe the peer's window size. The flag is
    // only set for the duration of this one output_one() call.
    _snd.window_probe = true;
    output_one();
    _snd.window_probe = false;

    output();

    // Binary exponential back-off per RFC1122, capped at _rto_max.
    _persist_time_out = std::min(_persist_time_out * 2, _rto_max);
    start_persist_timer();
}
// Retransmits whatever is still unacknowledged: SYN, FIN and/or the oldest
// data segment. Presumably the retransmit timer handler - confirm at the
// timer setup site.
//
// Each retransmission doubles the RTO (capped at _rto_max) and restarts the
// retransmit timer. When the retry budget (_max_nr_retransmit) for SYN, FIN
// or data is used up the connection is torn down via cleanup(); for SYN the
// connect promise is additionally failed with tcp_connect_error.
template <typename InetTraits>
void tcp<InetTraits>::tcb::retransmit() {
    auto resend_with_backoff = [this] {
        output();
        // According to RFC6298, Update RTO <- RTO * 2 to perform binary exponential back-off
        this->_rto = std::min(this->_rto * 2, this->_rto_max);
        start_retransmit_timer();
    };

    // Retransmit SYN
    if (syn_needs_on()) {
        if (_snd.syn_retransmit++ >= _max_nr_retransmit) {
            _connect_done.set_exception(tcp_connect_error());
            cleanup();
            return;
        }
        resend_with_backoff();
    }

    // Retransmit FIN
    if (fin_needs_on()) {
        if (_snd.fin_retransmit++ >= _max_nr_retransmit) {
            cleanup();
            return;
        }
        resend_with_backoff();
    }

    // Retransmit data: nothing to do without unacknowledged segments.
    if (_snd.data.empty()) {
        return;
    }

    // Only the earliest unacknowledged segment is retransmitted.
    auto& oldest = _snd.data.front();
    const uint32_t smss = _snd.mss;

    // According to RFC5681, ssthresh is updated only for the first
    // retransmit of a segment.
    if (oldest.nr_transmits == 0) {
        _snd.ssthresh = std::max(flight_size() / 2, 2 * smss);
    }
    // RFC6582 Step 4
    _snd.recover = _snd.next - 1;
    // Start the slow start process
    _snd.cwnd = smss;
    // End fast recovery
    exit_fast_recovery();

    // Delete the connection when the max number of retransmissions is reached.
    if (oldest.nr_transmits >= _max_nr_retransmit) {
        cleanup();
        return;
    }
    oldest.nr_transmits++;

    retransmit_one();
    resend_with_backoff();
}
// Retransmits the oldest unacknowledged segment immediately, bumping its
// transmit count. Does nothing when no segment is awaiting acknowledgment.
template <typename InetTraits>
void tcp<InetTraits>::tcb::fast_retransmit() {
    if (_snd.data.empty()) {
        return;
    }
    auto& oldest = _snd.data.front();
    oldest.nr_transmits++;
    retransmit_one();
    output();
}
// Updates the retransmission timeout from a new round-trip sample,
// following RFC6298.
//
// tx_time: time at which the measured segment was transmitted; the sample R
//          is the time elapsed since then, in milliseconds.
template <typename InetTraits>
void tcp<InetTraits>::tcb::update_rto(clock_type::time_point tx_time) {
    const auto R = std::chrono::duration_cast<std::chrono::milliseconds>(clock_type::now() - tx_time);

    if (!_snd.first_rto_sample) {
        // Subsequent samples:
        //   RTTVAR <- (1 - beta) * RTTVAR + beta * |SRTT - R'|
        //   SRTT   <- (1 - alpha) * SRTT + alpha * R'
        // where alpha = 1/8 and beta = 1/4
        auto abs_diff = _snd.srtt > R ? (_snd.srtt - R) : (R - _snd.srtt);
        _snd.rttvar = _snd.rttvar * 3 / 4 + abs_diff / 4;
        _snd.srtt = _snd.srtt * 7 / 8 + R / 8;
    } else {
        // First sample:
        //   RTTVAR <- R/2
        //   SRTT   <- R
        _snd.first_rto_sample = false;
        _snd.rttvar = R / 2;
        _snd.srtt = R;
    }

    // RTO <- SRTT + max(G, K * RTTVAR)
    _rto = _snd.srtt + std::max(_rto_clk_granularity, 4 * _snd.rttvar);

    // Keep the result within [_rto_min, _rto_max].
    _rto = std::min(std::max(_rto, _rto_min), _rto_max);
}
// Grows the congestion window after new data was acknowledged.
//
// acked_bytes: number of bytes newly acknowledged.
//
// Below ssthresh (slow start) cwnd grows by min(acked_bytes, SMSS);
// otherwise (congestion avoidance) it grows by SMSS*SMSS/cwnd, but by at
// least one byte.
template <typename InetTraits>
void tcp<InetTraits>::tcb::update_cwnd(uint32_t acked_bytes) {
    const uint32_t smss = _snd.mss;

    if (_snd.cwnd >= _snd.ssthresh) {
        // Congestion avoidance phase; never let the increment round to zero.
        _snd.cwnd += std::max(uint32_t(1), smss * smss / _snd.cwnd);
        return;
    }

    // Slow start phase
    _snd.cwnd += std::min(acked_bytes, smss);
}
// Tears down the connection state: drops all queued send and receive data,
// stops the retransmit and delayed-ACK timers, and finally removes this
// tcb from the tcb table. The order of the steps is kept as is.
template <typename InetTraits>
void tcp<InetTraits>::tcb::cleanup() {
    // Send side: unsent and unacknowledged data.
    _snd.unsent.clear();
    _snd.data.clear();

    // Receive side: out-of-order and in-order data.
    _rcv.out_of_order.map.clear();
    _rcv.data.clear();

    // Timers.
    stop_retransmit_timer();
    clear_delayed_ack();

    remove_from_tcbs();
}
// Generates the initial sequence number for this connection.
//
// Per RFC6528, TCP SHOULD generate its Initial Sequence Numbers with the
// expression:
//   ISN = M + F(localip, localport, remoteip, remoteport, secretkey)
// where M is the 4 microsecond timer. F is one MD5 transform over
// _isn_secret.key, seeded with the connection's addresses, its ports and
// one word of the key; the first word of the result is used.
template <typename InetTraits>
tcp_seq tcp<InetTraits>::tcb::get_isn() {
    uint32_t hash[4];
    hash[0] = _local_ip.ip;
    hash[1] = _foreign_ip.ip;
    // Widen before shifting: a port value promoted to (signed) int must not
    // be shifted into the sign bit.
    hash[2] = (uint32_t(_local_port) << 16) + _foreign_port;
    hash[3] = _isn_secret.key[15];
    CryptoPP::Weak::MD5::Transform(hash, _isn_secret.key);

    auto seq = hash[0];
    // M: the clock in microseconds divided by 4. The sum wraps modulo 2^32.
    auto m = std::chrono::duration_cast<std::chrono::microseconds>(clock_type::now().time_since_epoch());
    seq += m.count() / 4;
    return make_seq(seq);
}
// Hands the next queued outgoing packet to the caller.
//
// Clears _poll_active, builds a segment on demand when the packet queue is
// empty, and returns a disengaged optional when the connection is CLOSED.
// Otherwise the head of _packetq is returned, and output() is called again
// when more is pending.
template <typename InetTraits>
std::experimental::optional<typename InetTraits::l4packet> tcp<InetTraits>::tcb::get_packet() {
    using maybe_packet = std::experimental::optional<typename InetTraits::l4packet>;

    _poll_active = false;
    if (_packetq.empty()) {
        output_one();
    }

    if (in_state(CLOSED)) {
        return maybe_packet();
    }

    assert(!_packetq.empty());

    auto head = std::move(_packetq.front());
    _packetq.pop_front();

    // If there are packets to send in the queue, or the tcb is allowed to
    // send more, add tcp back to the polling set to keep sending. dupacks >= 3
    // indicates that a segment is lost, so stop sending more in that case.
    const bool keep_sending = !_packetq.empty() || (_snd.dupacks < 3 && can_send() > 0);
    if (keep_sending) {
        output();
    }
    return std::move(head);
}
// Closing the read side is currently a no-op.
template <typename InetTraits>
void tcp<InetTraits>::connection::close_read() {
    // Intentionally empty.
}
// Closes the write side by forwarding to the underlying tcb's close().
template <typename InetTraits>
void tcp<InetTraits>::connection::close_write() {
    auto& control_block = *_tcb;
    control_block.close();
}
// Out-of-class definitions of the static data members of
// tcp<InetTraits>::tcb.
//
// NOTE(review): the constexpr members presumably need these definitions
// because they are bound to const references (e.g. as std::min/std::max
// arguments in retransmit() and update_rto()) - confirm before removing.

// Retry budget checked by retransmit() for SYN, FIN and data segments.
template <typename InetTraits>
constexpr uint16_t tcp<InetTraits>::tcb::_max_nr_retransmit;
// Lower bound applied to the retransmission timeout in update_rto().
template <typename InetTraits>
constexpr std::chrono::milliseconds tcp<InetTraits>::tcb::_rto_min;
// Upper bound applied to the retransmission and persist timeouts.
template <typename InetTraits>
constexpr std::chrono::milliseconds tcp<InetTraits>::tcb::_rto_max;
// Clock granularity G used in the RTO formula in update_rto().
template <typename InetTraits>
constexpr std::chrono::milliseconds tcp<InetTraits>::tcb::_rto_clk_granularity;
// Secret key mixed into the initial sequence number by get_isn().
template <typename InetTraits>
typename tcp<InetTraits>::tcb::isn_secret tcp<InetTraits>::tcb::_isn_secret;
}
#endif /* TCP_HH_ */