diff options
author | JJD | 2017-10-03 22:15:13 +0200 |
---|---|---|
committer | JJD | 2017-10-03 22:15:13 +0200 |
commit | ec1bfe68a246e55ded1bd048a248b11010db030d (patch) | |
tree | 20cf76c91cf5cf3b8f4e179aee5aa5632d239184 /tcp_wave.patch | |
download | aur-ec1bfe68a246e55ded1bd048a248b11010db030d.tar.gz |
initial commit. Linux wave is a linux version which contains the TCP Wave congestion control module
Diffstat (limited to 'tcp_wave.patch')
-rw-r--r-- | tcp_wave.patch | 1699 |
1 files changed, 1699 insertions, 0 deletions
diff --git a/tcp_wave.patch b/tcp_wave.patch new file mode 100644 index 000000000000..74b533c5c19c --- /dev/null +++ b/tcp_wave.patch @@ -0,0 +1,1699 @@ +diff --git a/MAINTAINERS b/MAINTAINERS +index 1c3feffb1c1c..34fe18d467cd 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -12724,6 +12724,12 @@ W: http://tcp-lp-mod.sourceforge.net/ + S: Maintained + F: net/ipv4/tcp_lp.c + ++TCP WAVE MODULE ++M: "Natale Patriciello" <natale.patriciello@gmail.com> ++W: http://tlcsat.uniroma2.it/tcpwave4linux/ ++S: Maintained ++F: net/ipv4/tcp_wave.c ++ + TDA10071 MEDIA DRIVER + M: Antti Palosaari <crope@iki.fi> + L: linux-media@vger.kernel.org +diff --git a/include/net/tcp.h b/include/net/tcp.h +index f642a39f9eee..955a5233d94e 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -988,6 +988,16 @@ struct tcp_congestion_ops { + /* get info for inet_diag (optional) */ + size_t (*get_info)(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info); ++ /* get the expiration time for the pacing timer (optional) */ ++ u64 (*get_pacing_time)(struct sock *sk); ++ /* no data to transmit at the pacing timer expiration time (optional) */ ++ void (*no_data_to_transmit)(struct sock *sk); ++ /* the pacing timer is expired (optional) */ ++ void (*pacing_timer_expired)(struct sock *sk); ++ /* get the # segs to send out when the timer expires (optional) */ ++ u32 (*get_segs_per_round)(struct sock *sk); ++ /* the TCP has sent some segments (optional) */ ++ void (*segments_sent)(struct sock *sk, u32 sent); + + char name[TCP_CA_NAME_MAX]; + struct module *owner; +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index bbe201047df6..9e755cff2c3d 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -142,6 +142,7 @@ enum { + INET_DIAG_PAD, + INET_DIAG_MARK, + INET_DIAG_BBRINFO, ++ INET_DIAG_WAVEINFO, + __INET_DIAG_MAX, + }; + +@@ -186,9 +187,21 @@ struct tcp_bbr_info { + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ + }; + ++/* INET_DIAG_WAVEINFO */ ++ ++struct tcp_wave_info { ++ __u32 tx_timer; ++ __u16 burst; ++ __u32 previous_ack_t_disp; ++ __u32 min_rtt; ++ __u32 avg_rtt; ++ __u32 max_rtt; ++}; ++ + union tcp_cc_info { + struct tcpvegas_info vegas; + struct tcp_dctcp_info dctcp; + struct tcp_bbr_info bbr; ++ struct tcp_wave_info wave; + }; + #endif /* _UAPI_INET_DIAG_H_ */ +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 91a2557942fa..de23b3a04b98 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -492,6 +492,18 @@ config TCP_CONG_BIC + increase provides TCP friendliness. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ + ++config TCP_CONG_WAVE ++ tristate "Wave TCP" ++ default m ++ ---help--- ++ TCP Wave (TCPW) replaces the window-based transmission paradigm of the ++ standard TCP with a burst-based transmission, the ACK-clock scheduling ++ with a self-managed timer and the RTT-based congestion control loop with ++ an Ack-based Capacity and Congestion Estimation (ACCE) module. In ++ non-technical words, it sends data down the stack when its internal ++ timer expires, and the timing of the received ACKs contribute to ++ updating this timer regularly. ++ + config TCP_CONG_CUBIC + tristate "CUBIC TCP" + default y +@@ -690,6 +702,9 @@ choice + config DEFAULT_CUBIC + bool "Cubic" if TCP_CONG_CUBIC=y + ++ config DEFAULT_WAVE ++ bool "Wave" if TCP_CONG_WAVE=y ++ + config DEFAULT_HTCP + bool "Htcp" if TCP_CONG_HTCP=y + +@@ -729,6 +744,7 @@ config DEFAULT_TCP_CONG + string + default "bic" if DEFAULT_BIC + default "cubic" if DEFAULT_CUBIC ++ default "wave" if DEFAULT_WAVE + default "htcp" if DEFAULT_HTCP + default "hybla" if DEFAULT_HYBLA + default "vegas" if DEFAULT_VEGAS +diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile +index afcb435adfbe..e82ba69b19a9 100644 +--- a/net/ipv4/Makefile ++++ b/net/ipv4/Makefile +@@ -47,6 +47,7 @@ obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o + obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o + obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o + obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o ++obj-$(CONFIG_TCP_CONG_WAVE) += tcp_wave.o + obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o + obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o + obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o +@@ -61,5 +62,8 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o + obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o + obj-$(CONFIG_NETLABEL) += cipso_ipv4.o + ++CFLAGS_tcp_wave.o := -DDEBUG ++CFLAGS_tcp_output.o := -DDEBUG ++ + obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ + xfrm4_output.o xfrm4_protocol.o +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index a3e91b552edc..6c1f384b3ba2 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -994,9 +994,9 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, + + if (forced_push(tp)) { + tcp_mark_push(tp, skb); +- __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); +- } else if (skb == tcp_send_head(sk)) + tcp_push_one(sk, mss_now); ++ } else if (skb == tcp_send_head(sk)) ++ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); + continue; + + wait_for_sndbuf: +@@ -1340,9 +1340,9 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) + + if (forced_push(tp)) { + tcp_mark_push(tp, skb); +- __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); +- } else if (skb == tcp_send_head(sk)) + tcp_push_one(sk, mss_now); ++ } else if (skb == tcp_send_head(sk)) ++ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); + continue; + + wait_for_sndbuf: +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index b7661a68d498..ef9dbd0af283 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -42,6 +42,22 @@ + #include <linux/gfp.h> + #include <linux/module.h> + ++static const char *header_flags[5] = { "[SYN]", "[SYN|ACK]", ++ "[ACK]", "[FIN|ACK]", "[UNK]" }; ++static inline const char *print_tcp_header_flags(__u8 flags) ++{ ++ if (flags & TCPHDR_SYN && !(flags & TCPHDR_ACK)) ++ return header_flags[0]; ++ else if (flags & TCPHDR_SYN && flags & TCPHDR_ACK) ++ return header_flags[1]; ++ else if (flags & TCPHDR_FIN) ++ return header_flags[3]; ++ else if (flags & TCPHDR_ACK) ++ return header_flags[2]; ++ else ++ return header_flags[4]; ++} ++ + /* People can turn this off for buggy TCP's found in printers etc. */ + int sysctl_tcp_retrans_collapse __read_mostly = 1; + +@@ -742,6 +758,7 @@ static void tcp_tsq_handler(struct sock *sk) + tp->snd_cwnd > tcp_packets_in_flight(tp)) + tcp_xmit_retransmit_queue(sk); + ++ pr_debug("%u [tcp_tsq_handler]\n", tcp_jiffies32); + tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle, + 0, GFP_ATOMIC); + } +@@ -950,22 +967,38 @@ static bool tcp_needs_internal_pacing(const struct sock *sk) + return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; + } + ++static bool tcp_pacing_timer_check(const struct sock *sk) ++{ ++ return hrtimer_active(&tcp_sk(sk)->pacing_timer); ++} ++ + static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) + { ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + u64 len_ns; +- u32 rate; + + if (!tcp_needs_internal_pacing(sk)) + return; +- rate = sk->sk_pacing_rate; +- if (!rate || rate == ~0U) +- return; +- +- /* Should account for header sizes as sch_fq does, +- * but lets make things simple. +- */ +- len_ns = (u64)skb->len * NSEC_PER_SEC; +- do_div(len_ns, rate); ++ ++ if (ca_ops && ca_ops->get_pacing_time) { ++ if (tcp_pacing_timer_check(sk)) ++ return; ++ ++ len_ns = ca_ops->get_pacing_time(sk); ++ } else { ++ u32 rate = sk->sk_pacing_rate; ++ if (!rate || rate == ~0U) ++ return; ++ ++ /* Should account for header sizes as sch_fq does, ++ * but lets make things simple. ++ */ ++ len_ns = (u64)skb->len * NSEC_PER_SEC; ++ do_div(len_ns, rate); ++ } ++ ++ pr_debug("%u [%s] len_ns=%llu\n", tcp_jiffies32, __func__, len_ns); ++ + hrtimer_start(&tcp_sk(sk)->pacing_timer, + ktime_add_ns(ktime_get(), len_ns), + HRTIMER_MODE_ABS_PINNED); +@@ -994,6 +1027,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + struct tcp_md5sig_key *md5; + struct tcphdr *th; + int err; ++ u8 flags; + + BUG_ON(!skb || !tcp_skb_pcount(skb)); + tp = tcp_sk(sk); +@@ -1062,6 +1096,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + th->check = 0; + th->urg_ptr = 0; + ++ flags = tcb->tcp_flags; ++ + /* The urg_mode check is necessary during a below snd_una win probe */ + if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) { + if (before(tp->snd_up, tcb->seq + 0x10000)) { +@@ -1122,6 +1158,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); + ++ pr_debug("%u [tcp_transmit_skb] seq=%u, ack=%u, window=%u, len=%u flags=%s err=%i \n", ++ tcp_jiffies32, ntohl(th->seq), ntohl(th->ack_seq), ++ ntohs(th->window), skb->len, print_tcp_header_flags(flags), err); ++ + if (likely(err <= 0)) + return err; + +@@ -2135,6 +2175,7 @@ static int tcp_mtu_probe(struct sock *sk) + /* We're ready to send. If this fails, the probe will + * be resegmented into mss-sized pieces by tcp_write_xmit(). + */ ++ pr_debug("%u [tcp_mtu_probe] sending a probe\n", tcp_jiffies32); + if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { + /* Decrement cwnd here because we are sending + * effectively two packets. */ +@@ -2154,7 +2195,7 @@ static int tcp_mtu_probe(struct sock *sk) + static bool tcp_pacing_check(const struct sock *sk) + { + return tcp_needs_internal_pacing(sk) && +- hrtimer_active(&tcp_sk(sk)->pacing_timer); ++ tcp_pacing_timer_check(sk); + } + + /* TCP Small Queues : +@@ -2258,6 +2299,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) + static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp) + { ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + unsigned int tso_segs, sent_pkts; +@@ -2265,6 +2307,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int result; + bool is_cwnd_limited = false, is_rwnd_limited = false; + u32 max_segs; ++ u32 pacing_allowed_segs = 0; ++ bool notify = false; + + sent_pkts = 0; + +@@ -2280,11 +2324,33 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + + max_segs = tcp_tso_segs(sk, mss_now); + tcp_mstamp_refresh(tp); ++ ++ if (!tcp_pacing_timer_check(sk)) { ++ pacing_allowed_segs = 1; ++ if (ca_ops) { ++ if (ca_ops->pacing_timer_expired) { ++ ca_ops->pacing_timer_expired(sk); ++ notify = true; ++ } ++ if (ca_ops->get_segs_per_round) ++ pacing_allowed_segs = ca_ops->get_segs_per_round(sk); ++ } ++ } else { ++ pr_debug("%u [%s] timer running\n", tcp_jiffies32, __func__); ++ } ++ + while ((skb = tcp_send_head(sk))) { + unsigned int limit; + +- if (tcp_pacing_check(sk)) ++ pr_debug("%u [%s] allowed=%u sent=%u, inflight=%u, cwnd=%u\n", tcp_jiffies32, __func__, ++ pacing_allowed_segs, sent_pkts, tcp_packets_in_flight(tp), ++ tp->snd_cwnd); ++ ++ if (sent_pkts > pacing_allowed_segs) { ++ pr_debug("%u [%s] BREAK for sent\n", tcp_jiffies32, ++ __func__); + break; ++ } + + tso_segs = tcp_init_tso_segs(skb, mss_now); + BUG_ON(!tso_segs); +@@ -2292,33 +2358,42 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { + /* "skb_mstamp" is used as a start point for the retransmit timer */ + skb->skb_mstamp = tp->tcp_mstamp; ++ pr_debug("%u [%s] 1", tcp_jiffies32, __func__); + goto repair; /* Skip network transmission */ + } + + cwnd_quota = tcp_cwnd_test(tp, skb); + if (!cwnd_quota) { +- if (push_one == 2) ++ if (push_one == 2) { + /* Force out a loss probe pkt. */ ++ pr_debug("%u [%s] 2", tcp_jiffies32, __func__); + cwnd_quota = 1; +- else ++ } else { ++ pr_debug("%u [%s] 3", tcp_jiffies32, __func__); + break; ++ } + } + + if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { + is_rwnd_limited = true; ++ pr_debug("%u [%s] 4", tcp_jiffies32, __func__); + break; + } + + if (tso_segs == 1) { + if (unlikely(!tcp_nagle_test(tp, skb, mss_now, + (tcp_skb_is_last(sk, skb) ? +- nonagle : TCP_NAGLE_PUSH)))) ++ nonagle : TCP_NAGLE_PUSH)))) { ++ pr_debug("%u [%s] 5", tcp_jiffies32, __func__); + break; ++ } + } else { + if (!push_one && + tcp_tso_should_defer(sk, skb, &is_cwnd_limited, +- max_segs)) ++ max_segs)) { ++ pr_debug("%u [%s] 6", tcp_jiffies32, __func__); + break; ++ } + } + + limit = mss_now; +@@ -2330,16 +2405,22 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + nonagle); + + if (skb->len > limit && +- unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) ++ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) { ++ pr_debug("%u [%s] 7", tcp_jiffies32, __func__); + break; ++ } + + if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) + clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); +- if (tcp_small_queue_check(sk, skb, 0)) ++ if (tcp_small_queue_check(sk, skb, 0)) { ++ pr_debug("%u [%s] 8", tcp_jiffies32, __func__); + break; ++ } + +- if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) ++ if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) { ++ pr_debug("%u [%s] 9", tcp_jiffies32, __func__); + break; ++ } + + repair: + /* Advance the send_head. This one is sent out. +@@ -2350,10 +2431,19 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + tcp_minshall_update(tp, mss_now, skb); + sent_pkts += tcp_skb_pcount(skb); + +- if (push_one) ++ if (push_one) { ++ pr_debug("%u [%s] 10", tcp_jiffies32, __func__); + break; ++ } + } + ++ if (!tcp_send_head(sk)) { ++ pr_debug("%u [%s] no skb in queue\n", tcp_jiffies32, __func__); ++ } ++ ++ if (ca_ops && notify && ca_ops->segments_sent) ++ ca_ops->segments_sent(sk, sent_pkts); ++ + if (is_rwnd_limited) + tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED); + else +@@ -2447,6 +2537,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (skb) { + if (tcp_snd_wnd_test(tp, skb, mss)) { + pcount = tp->packets_out; ++ pr_debug("%u [tcp_send_loss_probe]\n", tcp_jiffies32); + tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + if (tp->packets_out > pcount) + goto probe_sent; +@@ -2503,6 +2594,8 @@ void tcp_send_loss_probe(struct sock *sk) + void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, + int nonagle) + { ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + /* If we are closed, the bytes will have to remain here. + * In time closedown will finish, we empty the write queue and + * all will be happy. +@@ -2513,6 +2606,10 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, + if (tcp_write_xmit(sk, cur_mss, nonagle, 0, + sk_gfp_mask(sk, GFP_ATOMIC))) + tcp_check_probe_timer(sk); ++ ++ if (!tcp_send_head(sk) && ca_ops && ca_ops->no_data_to_transmit) { ++ ca_ops->no_data_to_transmit(sk); ++ } + } + + /* Send _single_ skb sitting at the send head. This function requires +@@ -2522,9 +2619,13 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) + { + struct sk_buff *skb = tcp_send_head(sk); + +- BUG_ON(!skb || skb->len < mss_now); ++ /* Don't be forced to send not meaningful data */ ++ if (!skb || skb->len < mss_now) ++ return; + ++ pr_debug("%u [tcp_push_one] Pushing directly\n", tcp_jiffies32); + tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation); ++ pr_debug("%u [tcp_push_one] End of untimed push\n", tcp_jiffies32); + } + + /* This function returns the amount that we can raise the +@@ -2868,9 +2969,11 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) + + skb->skb_mstamp = tp->tcp_mstamp; + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); ++ pr_debug("%u [tcp_retransmit_skb] retransmit\n", tcp_jiffies32); + err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; + } else { ++ pr_debug("%u [tcp_retransmit_skb] retransmit\n", tcp_jiffies32); + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + } + +@@ -3084,6 +3187,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) + TCPHDR_ACK | TCPHDR_RST); + tcp_mstamp_refresh(tcp_sk(sk)); + /* Send it off. */ ++ pr_debug("%u [tcp_send_active_reset]\n", tcp_jiffies32); + if (tcp_transmit_skb(sk, skb, 0, priority)) + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); + } +@@ -3120,6 +3224,7 @@ int tcp_send_synack(struct sock *sk) + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; + tcp_ecn_send_synack(sk, skb); + } ++ pr_debug("%u [tcp_send_synack]\n", tcp_jiffies32); + return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + } + +@@ -3399,6 +3504,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) + if (syn_data->len) + tcp_chrono_start(sk, TCP_CHRONO_BUSY); + ++ pr_debug("%u [tcp_send_syn_data]\n", tcp_jiffies32); + err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); + + syn->skb_mstamp = syn_data->skb_mstamp; +@@ -3420,6 +3526,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) + /* Send a regular SYN with Fast Open cookie request option */ + if (fo->cookie.len > 0) + fo->cookie.len = 0; ++ pr_debug("%u [tcp_send_syn_data] fallback \n", tcp_jiffies32); + err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); + if (err) + tp->syn_fastopen = 0; +@@ -3458,6 +3565,7 @@ int tcp_connect(struct sock *sk) + tcp_ecn_send_syn(sk, buff); + + /* Send off SYN; include data in Fast Open. */ ++ pr_debug("%u [tcp_connect]\n", tcp_jiffies32); + err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : + tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); + if (err == -ECONNREFUSED) +@@ -3572,6 +3680,7 @@ void tcp_send_ack(struct sock *sk) + skb_set_tcp_pure_ack(buff); + + /* Send it off, this clears delayed acks for us. */ ++ pr_debug("%u [tcp_send_ack]\n", tcp_jiffies32); + tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0); + } + EXPORT_SYMBOL_GPL(tcp_send_ack); +@@ -3606,6 +3715,8 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) + */ + tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); + NET_INC_STATS(sock_net(sk), mib); ++ ++ pr_debug("%u [tcp_xmit_probe_skb]\n", tcp_jiffies32); + return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0); + } + +@@ -3651,6 +3762,7 @@ int tcp_write_wakeup(struct sock *sk, int mib) + tcp_set_skb_tso_segs(skb, mss); + + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; ++ pr_debug("%u [tcp_write_wakeup]\n", tcp_jiffies32); + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + if (!err) + tcp_event_new_data_sent(sk, skb); +diff --git a/net/ipv4/tcp_wave.c b/net/ipv4/tcp_wave.c +new file mode 100644 +index 000000000000..9b48cbd84fe1 +--- /dev/null ++++ b/net/ipv4/tcp_wave.c +@@ -0,0 +1,1136 @@ ++/* ++ * TCP Wave ++ * ++ * Copyright 2017 Natale Patriciello <natale.patriciello@gmail.com> ++ * ++ * This program is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see <http://www.gnu.org/licenses/>. ++ * ++ */ ++ ++#define pr_fmt(fmt) "WAVE: " fmt ++ ++#include <linux/module.h> ++#include <net/net_namespace.h> ++#include <net/tcp.h> ++#include <linux/inet_diag.h> ++#include <linux/slab.h> ++#include <linux/proc_fs.h> ++ ++#ifdef DEBUG ++static bool enable_log = true; ++#else ++static bool enable_log = false; ++#endif ++ ++static uint init_burst __read_mostly = 10; ++static uint min_burst __read_mostly = 3; ++static uint init_timer_ms __read_mostly = 200; ++static uint beta_ms __read_mostly = 150; ++static int port __read_mostly = 0; ++static unsigned int bufsize __read_mostly = 4096; ++static const char procname[] = "tcpwave"; ++ ++module_param(init_burst, uint, 0644); ++MODULE_PARM_DESC(init_burst, "initial burst (segments)"); ++module_param(min_burst, uint, 0644); ++MODULE_PARM_DESC(min_burst, "minimum burst (segments)"); ++module_param(init_timer_ms, uint, 0644); ++MODULE_PARM_DESC(init_timer_ms, "initial timer (ms)"); ++module_param(beta_ms, uint, 0644); ++MODULE_PARM_DESC(beta_ms, "beta parameter (ms)"); ++module_param(port, int, 0); ++MODULE_PARM_DESC(port, "Port to match when logging (0=all)"); ++module_param(bufsize, uint, 0); ++MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); ++ ++/* Shift factor for the exponentially weighted average. */ ++#define AVG_SCALE 20 ++#define AVG_UNIT (1 << AVG_SCALE) ++ ++/* Taken from BBR */ ++#define BW_SCALE 24 ++#define BW_UNIT (1 << BW_SCALE) ++ ++/* Tell if the driver is initialized (init has been called) */ ++#define FLAG_INIT 0x1 ++/* Tell if, as sender, the driver is started (after TX_START) */ ++#define FLAG_START 0x2 ++/* If it's true, we save the sent size as a burst */ ++#define FLAG_SAVE 0x4 ++ ++/* Log struct */ ++struct wavetcp_log { ++ ktime_t tstamp; ++ union { ++ struct sockaddr raw; ++ struct sockaddr_in v4; ++ struct sockaddr_in6 v6; ++ } src, dst; ++ u32 tx_timer; ++ u16 burst; ++ u32 min_rtt; ++ u32 avg_rtt; ++ u32 max_rtt; ++}; ++ ++static struct { ++ spinlock_t lock; ++ wait_queue_head_t wait; ++ ktime_t start; ++ unsigned long head, tail; ++ struct wavetcp_log *log; ++} wavetcp_probe; ++ ++/* List for saving the size of sent burst over time */ ++struct wavetcp_burst_hist { ++ u16 size; /* The burst size */ ++ struct list_head list; /* Kernel list declaration */ ++}; ++ ++static bool test_flag(u8 flags, u8 value) ++{ ++ return (flags & value) == value; ++} ++ ++static void set_flag(u8 *flags, u8 value) ++{ ++ *flags |= value; ++} ++ ++static void clear_flag(u8 *flags, u8 value) ++{ ++ *flags &= ~(value); ++} ++ ++static bool ktime_is_null(ktime_t kt) ++{ ++ return ktime_compare(kt, ns_to_ktime(0)) == 0; ++} ++ ++/* TCP Wave private struct */ ++struct wavetcp { ++ u8 flags; /* The module flags */ ++ u32 tx_timer; /* The current transmission timer (us) */ ++ u8 burst; /* The current burst size (segments) */ ++ s8 delta_segments; /* Represents a delta from the burst size of segments sent */ ++ u16 pkts_acked; /* The segments acked in the round */ ++ u8 heuristic_scale; /* Heuristic scale, to divide the RTT */ ++ ktime_t previous_ack_t_disp; /* Previous ack_train_disp Value */ ++ ktime_t first_ack_time; /* First ACK time of the round */ ++ u32 backup_first_ack_time_us; /* Backup value of the first ack time */ ++ u32 first_rtt; /* First RTT of the round */ ++ u32 min_rtt; /* Minimum RTT of the round */ ++ u32 avg_rtt; /* Average RTT of the previous round */ ++ u32 max_rtt; /* Maximum RTT */ ++ u8 stab_factor; /* Stability factor */ ++ struct kmem_cache *cache; /* The memory cache for saving the burst sizes */ ++ struct wavetcp_burst_hist *history; /* The burst history */ ++ u16 sport; /* TCP Source Port of the connection */ ++}; ++ ++/* Called to setup Wave for the current socket after it enters the CONNECTED ++ * state (i.e., called after the SYN-ACK is received). The slow start should be ++ * 0 (see wavetcp_get_ssthresh) and we set the initial cwnd to the initial ++ * burst. ++ * ++ * After the ACK of the SYN-ACK is sent, the TCP will add a bit of delay to ++ * permit the queueing of data from the application, otherwise we will end up ++ * in a scattered situation (we have one segment -> send it -> no other segment, ++ * don't set the timer -> slightly after, another segment come and we loop). ++ * ++ * At the first expiration, the cwnd will be large enough to push init_burst ++ * segments out. ++ */ ++static void wavetcp_init(struct sock *sk) ++{ ++ struct wavetcp *ca = inet_csk_ca(sk); ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ ca->sport = ntohs(inet_sk(sk)->inet_sport); ++ ++ pr_debug("%u sport: %u [%s]\n", tcp_jiffies32, ca->sport, ++ __func__); ++ ++ /* Setting the initial Cwnd to 0 will not call the TX_START event */ ++ tp->snd_ssthresh = 0; ++ tp->snd_cwnd = init_burst; ++ ++ /* Used to avoid to take the SYN-ACK measurements */ ++ ca->flags = 0; ++ ca->flags = FLAG_INIT | FLAG_SAVE; ++ ++ ca->burst = init_burst; ++ ca->delta_segments = init_burst; ++ ca->tx_timer = init_timer_ms * USEC_PER_MSEC; ++ ca->first_ack_time = ns_to_ktime(0); ++ ca->backup_first_ack_time_us = 0; ++ ca->heuristic_scale = 0; ++ ca->first_rtt = 0; ++ ca->min_rtt = -1; /* a lot of time */ ++ ca->avg_rtt = 0; ++ ca->max_rtt = 0; ++ ca->stab_factor = 0; ++ ca->previous_ack_t_disp = ns_to_ktime(0); ++ ++ ca->history = kmalloc(sizeof(*ca->history), GFP_KERNEL); ++ ++ /* Init the history of bwnd */ ++ INIT_LIST_HEAD(&ca->history->list); ++ ++ /* Init our cache pool for the bwnd history */ ++ ca->cache = KMEM_CACHE(wavetcp_burst_hist, 0); ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++} ++ ++static void wavetcp_release(struct sock *sk) ++{ ++ struct wavetcp *ca = inet_csk_ca(sk); ++ struct wavetcp_burst_hist *tmp; ++ struct list_head *pos, *q; ++ ++ if (!test_flag(ca->flags, FLAG_INIT)) ++ return; ++ ++ pr_debug("%u sport: %u [%s]\n", tcp_jiffies32, ca->sport, ++ __func__); ++ ++ list_for_each_safe(pos, q, &ca->history->list) { ++ tmp = list_entry(pos, struct wavetcp_burst_hist, list); ++ list_del(pos); ++ kmem_cache_free(ca->cache, tmp); ++ } ++ ++ kfree(ca->history); ++ kmem_cache_destroy(ca->cache); ++} ++ ++static void wavetcp_print_history(struct wavetcp *ca) ++{ ++ struct wavetcp_burst_hist *tmp; ++ struct list_head *pos, *q; ++ ++ list_for_each_safe(pos, q, &ca->history->list) { ++ tmp = list_entry(pos, struct wavetcp_burst_hist, list); ++ pr_debug("[%s] %u\n", __func__, tmp->size); ++ } ++} ++ ++/* Please explain that we will be forever in congestion avoidance. */ ++static u32 wavetcp_recalc_ssthresh(struct sock *sk) ++{ ++ pr_debug("%u [%s]\n", tcp_jiffies32, __func__); ++ return 0; ++} ++ ++static void wavetcp_state(struct sock *sk, u8 new_state) ++{ ++ struct wavetcp *ca = inet_csk_ca(sk); ++ ++ if (!test_flag(ca->flags, FLAG_INIT)) ++ return; ++ ++ switch (new_state) { ++ case TCP_CA_Open: ++ pr_debug("%u sport: %u [%s] set CA_Open\n", tcp_jiffies32, ++ ca->sport, __func__); ++ /* We have fully recovered, so reset some variables */ ++ ca->delta_segments = 0; ++ break; ++ default: ++ pr_debug("%u sport: %u [%s] set state %u, ignored\n", ++ tcp_jiffies32, ca->sport, __func__, new_state); ++ } ++} ++ ++static u32 wavetcp_undo_cwnd(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ /* Not implemented yet. We stick to the decision made earlier */ ++ pr_debug("%u [%s]\n", tcp_jiffies32, __func__); ++ return tp->snd_cwnd; ++} ++ ++/* Add the size of the burst in the history of bursts */ ++static void wavetcp_insert_burst(struct wavetcp *ca, u32 burst) ++{ ++ struct wavetcp_burst_hist *cur; ++ ++ pr_debug("%u sport: %u [%s] adding %u segment in the history of burst\n", ++ tcp_jiffies32, ca->sport, __func__, burst); ++ ++ /* Take the memory from the pre-allocated pool */ ++ cur = (struct wavetcp_burst_hist *)kmem_cache_alloc(ca->cache, ++ GFP_KERNEL); ++ BUG_ON(!cur); ++ ++ cur->size = burst; ++ list_add_tail(&cur->list, &ca->history->list); ++} ++ ++static void wavetcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) ++{ ++ struct wavetcp *ca = inet_csk_ca(sk); ++ ++ if (!test_flag(ca->flags, FLAG_INIT)) ++ return; ++ ++ switch (event) { ++ case CA_EVENT_TX_START: ++ /* first transmit when no packets in flight */ ++ pr_debug("%u sport: %u [%s] TX_START\n", tcp_jiffies32, ++ ca->sport, __func__); ++ ++ set_flag(&ca->flags, FLAG_START); ++ ++ break; ++ default: ++ pr_debug("%u sport: %u [%s] got event %u, ignored\n", ++ tcp_jiffies32, ca->sport, __func__, event); ++ break; ++ } ++} ++ ++static void wavetcp_adj_mode(struct wavetcp *ca, ++ unsigned long delta_rtt) ++{ ++ ca->stab_factor = ca->avg_rtt / ca->tx_timer; ++ ++ ca->min_rtt = -1; /* a lot of time */ ++ ca->avg_rtt = ca->max_rtt; ++ ca->tx_timer = init_timer_ms * USEC_PER_MSEC; ++ ++ pr_debug("%u sport: %u [%s] stab_factor %u, timer %u us, avg_rtt %u us\n", ++ tcp_jiffies32, ca->sport, __func__, ca->stab_factor, ++ ca->tx_timer, ca->avg_rtt); ++} ++ ++static int wavetcp_probe_used(void) ++{ ++ return (wavetcp_probe.head - wavetcp_probe.tail) & (bufsize - 1); ++} ++ ++static int wavetcp_probe_avail(void) ++{ ++ return bufsize - wavetcp_probe_used() - 1; ++} ++ ++#define wavetcp_probe_copy_fl_to_si4(inet, si4, mem) \ ++ do { \ ++ si4.sin_family = AF_INET; \ ++ si4.sin_port = inet->inet_##mem##port; \ ++ si4.sin_addr.s_addr = inet->inet_##mem##addr; \ ++ } while (0) \ ++ ++static void wavetcp_tracking_mode(struct sock *sk, u64 delta_rtt, ++ ktime_t ack_train_disp) ++{ ++ const struct inet_sock *inet = inet_sk(sk); ++ struct wavetcp *ca = inet_csk_ca(sk); ++ ++ if (ktime_is_null(ack_train_disp)) { ++ pr_debug("%u sport: %u [%s] ack_train_disp is 0. Impossible to do tracking.\n", ++ tcp_jiffies32, ca->sport, __func__); ++ return; ++ } ++ ++ ca->tx_timer = (ktime_to_us(ack_train_disp) + (delta_rtt / 2)); ++ ++ if (ca->tx_timer == 0) { ++ pr_debug("%u sport: %u [%s] WARNING: tx timer is 0" ++ ", forcefully set it to 1000 us\n", ++ tcp_jiffies32, ca->sport, __func__); ++ ca->tx_timer = 1000; ++ } ++ ++ pr_debug("%u sport: %u [%s] tx timer is %u us\n", ++ tcp_jiffies32, ca->sport, __func__, ca->tx_timer); ++ ++ if (!enable_log) ++ return; ++ ++ if (port == 0 || ++ ntohs(inet->inet_dport) == port || ++ ntohs(inet->inet_sport) == port) { ++ ++ spin_lock(&wavetcp_probe.lock); ++ if (wavetcp_probe_avail() > 1) { ++ struct wavetcp_log *p = wavetcp_probe.log + wavetcp_probe.head; ++ p->tstamp = ktime_get(); ++ ++ switch (sk->sk_family) { ++ case AF_INET: ++ wavetcp_probe_copy_fl_to_si4(inet, p->src.v4, s); ++ wavetcp_probe_copy_fl_to_si4(inet, p->dst.v4, d); ++ break; ++ case AF_INET6: ++ memset(&p->src.v6, 0, sizeof(p->src.v6)); ++ memset(&p->dst.v6, 0, sizeof(p->dst.v6)); ++#if IS_ENABLED(CONFIG_IPV6) ++ p->src.v6.sin6_family = AF_INET6; ++ p->src.v6.sin6_port = inet->inet_sport; ++ p->src.v6.sin6_addr = inet6_sk(sk)->saddr; ++ ++ p->dst.v6.sin6_family = AF_INET6; ++ p->dst.v6.sin6_port = inet->inet_dport; ++ p->dst.v6.sin6_addr = sk->sk_v6_daddr; ++#endif ++ break; ++ default: ++ BUG(); ++ } ++ ++ p->tx_timer = ca->tx_timer; ++ p->burst = ca->burst; ++ p->min_rtt = ca->burst; ++ p->avg_rtt = ca->avg_rtt; ++ p->max_rtt = ca->max_rtt; ++ ++ wavetcp_probe.head = (wavetcp_probe.head + 1) & (bufsize - 1); ++ } ++ spin_unlock (&wavetcp_probe.lock); ++ ++ wake_up(&wavetcp_probe.wait); ++ } ++} ++ ++/* The weight a is: ++ * ++ * a = (first_rtt - min_rtt) / first_rtt ++ * ++ */ ++static u64 wavetcp_compute_weight(u32 first_rtt, u32 min_rtt) ++{ ++ u64 diff = first_rtt - min_rtt; ++ ++ diff = diff * AVG_UNIT; ++ ++ return diff / first_rtt; ++} ++ ++static ktime_t heuristic_ack_train_disp(struct wavetcp *ca, ++ const struct rate_sample *rs, ++ u32 burst) ++{ ++ ktime_t ack_train_disp = ns_to_ktime(0); ++ ktime_t interval = ns_to_ktime (0); ++ ktime_t backup_first_ack = ns_to_ktime(0); ++ ++ if (rs->interval_us <= 0) { ++ pr_debug("%u sport: %u [%s] WARNING is not possible " ++ "to heuristically calculate ack_train_disp, returning 0." ++ "Delivered %u, interval_us %li\n", ++ tcp_jiffies32, ca->sport, __func__, ++ rs->delivered, rs->interval_us); ++ return ack_train_disp; ++ } ++ ++ interval = ns_to_ktime(rs->interval_us * NSEC_PER_USEC); ++ backup_first_ack = ns_to_ktime(ca->backup_first_ack_time_us * NSEC_PER_USEC); ++ ++ /* ++ * The heuristic takes the RTT of the first ACK, the RTT of the ++ * latest ACK, and uses the difference as ack_train_disp. ++ * ++ * If the sample for the first and last ACK are the same (e.g., ++ * one ACK per burst) we use as the latest option the value of ++ * interval_us (which is the RTT). However, this value is ++ * exponentially lowered each time we don't have any valid ++ * sample (i.e., we perform a division by 2, by 4, and so on). ++ * The increased transmitted rate, if it is out of the capacity ++ * of the bottleneck, will be compensated by an higher ++ * delta_rtt, and so limited by the adjustment algorithm. This ++ * is a blind search, but we do not have any valid sample... ++ */ ++ if (ktime_compare(interval, backup_first_ack) > 0) { ++ /* first heuristic */ ++ ack_train_disp = ktime_sub(interval, backup_first_ack); ++ } else { ++ /* this branch avoids an overflow. However, reaching ++ * this point means that the ACK train is not aligned ++ * with the sent burst. ++ */ ++ ack_train_disp = ktime_sub(backup_first_ack, interval); ++ } ++ ++ if (ktime_is_null(ack_train_disp)) { ++ /* Blind search */ ++ u32 blind_interval_us = rs->interval_us >> ca->heuristic_scale; ++ ++ca->heuristic_scale; ++ ack_train_disp = ns_to_ktime(blind_interval_us * NSEC_PER_USEC); ++ pr_debug("%u sport: %u [%s] we received one BIG ack." ++ " Doing an heuristic with scale %u, interval_us" ++ " %li us, and setting ack_train_disp to %lli us\n", ++ tcp_jiffies32, ca->sport, __func__, ++ ca->heuristic_scale, rs->interval_us, ++ ktime_to_us(ack_train_disp)); ++ } else { ++ pr_debug("%u sport: %u [%s] we got the first ack with" ++ " interval %u us, the last (this) with interval %li us." ++ " Doing a substraction and setting ack_train_disp" ++ " to %lli us\n", ++ tcp_jiffies32, ca->sport, __func__, ++ ca->backup_first_ack_time_us, rs->interval_us, ++ ktime_to_us(ack_train_disp)); ++ } ++ ++ return ack_train_disp; ++} ++ ++static ktime_t filter_ack_train_disp(const struct wavetcp *ca, u64 delta_rtt_us, ++ ktime_t measured_ack_t_disp) ++{ ++ ktime_t filtered_ack_t_disp; ++ u64 alpha, left, right; ++ ++ alpha = (delta_rtt_us * AVG_UNIT) / (beta_ms * USEC_PER_MSEC); ++ left = ((AVG_UNIT - alpha) * ktime_to_us(ca->previous_ack_t_disp)) / AVG_UNIT; ++ right = (alpha * ktime_to_us(measured_ack_t_disp)) / AVG_UNIT; ++ ++ filtered_ack_t_disp = ns_to_ktime(((u32)left + (u32)right) * NSEC_PER_USEC); ++ ++ pr_debug("%u sport: %u [%s] AVG_UNIT %i delta_rtt %llu beta %i alpha %llu " ++ "measured_ack_train_disp %lli us prv_ack_train_disp %lli us left %llu right %llu, final %lli\n", ++ tcp_jiffies32, ca->sport, __func__, AVG_UNIT, delta_rtt_us, ++ beta_ms, alpha, ktime_to_us(measured_ack_t_disp), ++ ktime_to_us(ca->previous_ack_t_disp), ++ left, right, ktime_to_us(filtered_ack_t_disp)); ++ ++ return filtered_ack_t_disp; ++ ++} ++ ++static ktime_t calculate_ack_train_disp(struct wavetcp *ca, ++ const struct rate_sample *rs, ++ u32 burst, u64 delta_rtt_us) ++{ ++ ktime_t old_value_patd = ca->previous_ack_t_disp; ++ ktime_t ack_train_disp = ns_to_ktime(0); ++ ++ if (!ktime_is_null(ca->first_ack_time)) ++ ack_train_disp = ktime_sub(ktime_get(), ca->first_ack_time); ++ ++ if (ktime_is_null(ca->previous_ack_t_disp) && ++ ktime_is_null(ack_train_disp)) { ++ /* We received a cumulative ACK just after we sent the data, so ++ * the dispersion would be close to zero. Moreover, we don't ++ * have any valid sample from the past; in this case, we use ++ * an heuristic to calculate ack_train_disp. ++ */ ++ return heuristic_ack_train_disp(ca, rs, burst); ++ } ++ ++ /* resetting the heuristic scale because we have a real sample */ ++ ca->heuristic_scale = 0; ++ ++ if (ktime_is_null(ca->previous_ack_t_disp)) { ++ /* initialize the value */ ++ ca->previous_ack_t_disp = ack_train_disp; ++ } else if (ktime_compare(ack_train_disp, ca->previous_ack_t_disp) > 0) { ++ /* filter the measured value */ ++ return filter_ack_train_disp(ca, delta_rtt_us, ack_train_disp); ++ } else if (ktime_is_null(ack_train_disp)) { ++ /* Use the plain previous value */ ++ ack_train_disp = ca->previous_ack_t_disp; ++ } else { ++ /* In all other cases, update the previous value */ ++ ca->previous_ack_t_disp = ack_train_disp; ++ } ++ ++ pr_debug("%u sport: %u [%s] previous_ack_t_disp %lli us, measured ack_train_disp %lli us\n", ++ tcp_jiffies32, ca->sport, __func__, ++ ktime_to_us(old_value_patd), ++ ktime_to_us(ack_train_disp)); ++ ++ return ack_train_disp; ++} ++ ++static void calculate_avg_rtt(struct wavetcp *ca) ++{ ++ /* Why the if? ++ * ++ * a = (first_rtt - min_rtt) / first_rtt = 1 - (min_rtt/first_rtt) ++ * ++ * avg_rtt_0 = (1 - a) * first_rtt ++ * = (1 - (1 - (min_rtt/first_rtt))) * first_rtt ++ * = first_rtt - (first_rtt - min_rtt) ++ * = min_rtt ++ * ++ * ++ * And.. what happen in the else branch? We calculate first a (scaled by ++ * 1024), then do the substraction (1-a) by keeping in the consideration ++ * the scale, and in the end coming back to the result removing the ++ * scaling. ++ * ++ * We divide the equation ++ * ++ * AvgRtt = a * AvgRtt + (1-a)*Rtt ++ * ++ * in two part properly scaled, left and right, and then having a sum of ++ * the two parts to avoid (possible) overflow. ++ */ ++ if (ca->avg_rtt == 0) { ++ ca->avg_rtt = ca->min_rtt; ++ pr_debug("%u sport: %u [%s] calculated avg_rtt %u\n", tcp_jiffies32, ++ ca->sport, __func__, ca->avg_rtt); ++ } else if (ca->first_rtt > 0) { ++ u32 old_value = ca->avg_rtt; ++ u64 right; ++ u64 left; ++ u64 a; ++ ++ a = wavetcp_compute_weight(ca->first_rtt, ca->min_rtt); ++ ++ left = (a * ca->avg_rtt) / AVG_UNIT; ++ right = ((AVG_UNIT - a) * ca->first_rtt) / AVG_UNIT; ++ ++ ca->avg_rtt = (u32)left + (u32)right; ++ ++ pr_debug("%u sport: %u [%s] previous avg %u us, first_rtt %u us, " ++ "min %u us, a (shifted) %llu, calculated avg %u us\n", ++ tcp_jiffies32, ca->sport, __func__, ++ old_value, ca->first_rtt, ca->min_rtt, a, ca->avg_rtt); ++ } else { ++ pr_debug("%u sport: %u [%s] Can't calculate avg_rtt.\n", ++ tcp_jiffies32, ca->sport, __func__); ++ } ++} ++ ++static u64 calculate_delta_rtt(struct wavetcp *ca) ++{ ++ return ca->avg_rtt - ca->min_rtt; ++} ++ ++static void wavetcp_round_terminated(struct sock *sk, const struct rate_sample *rs, ++ u32 burst) ++{ ++ struct wavetcp *ca = inet_csk_ca(sk); ++ u64 delta_rtt_us; ++ ++ /* If the round terminates without a sample of RTT, use the average */ ++ if (ca->first_rtt == 0) { ++ ca->first_rtt = ca->avg_rtt; ++ pr_debug("%u sport: %u [%s] Using the average value for first_rtt %u\n", ++ tcp_jiffies32, ca->sport, __func__, ca->first_rtt); ++ } ++ ++ calculate_avg_rtt(ca); ++ ++ /* If we have to wait, let's wait */ ++ if (ca->stab_factor > 0) { ++ --ca->stab_factor; ++ pr_debug("%u sport: %u [%s] reached burst %u, not applying (stab left: %u)\n", ++ tcp_jiffies32, ca->sport, __func__, burst, ++ ca->stab_factor); ++ return; ++ } ++ ++ delta_rtt_us = calculate_delta_rtt(ca); ++ ++ pr_debug("%u sport: %u [%s] reached burst %u, drtt %llu\n", ++ tcp_jiffies32, ca->sport, __func__, burst, delta_rtt_us); ++ ++ /* delta_rtt_us is in us, beta_ms in ms */ ++ if (delta_rtt_us > beta_ms * USEC_PER_MSEC) ++ wavetcp_adj_mode(ca, delta_rtt_us); ++ else ++ wavetcp_tracking_mode(sk, delta_rtt_us, ++ calculate_ack_train_disp(ca, rs, burst, ++ delta_rtt_us)); ++} ++ ++static void wavetcp_cong_control(struct sock *sk, const struct rate_sample *rs) ++{ ++ struct wavetcp *ca = inet_csk_ca(sk); ++ struct wavetcp_burst_hist *tmp; ++ struct list_head *pos; ++ ++ if (!test_flag(ca->flags, FLAG_INIT)) ++ return; ++ ++ if (ca->backup_first_ack_time_us == 0 && rs->interval_us > 0) ++ ca->backup_first_ack_time_us = rs->interval_us; ++ ++ pos = ca->history->list.next; ++ tmp = list_entry(pos, struct wavetcp_burst_hist, list); ++ ++ if (tmp->size == 0) { ++ /* No burst in memory. Most likely we sent some segments out of ++ * the allowed window (e.g., loss probe) */ ++ pr_debug("%u sport: %u [%s] WARNING! empty burst\n", ++ tcp_jiffies32, ca->sport, __func__); ++ wavetcp_print_history(ca); ++ goto reset; ++ } ++ ++ pr_debug("%u sport: %u [%s] prior_delivered %u, delivered %i, interval_us %li, " ++ "rtt_us %li, losses %i, ack_sack %u, prior_in_flight %u, is_app %i," ++ " is_retrans %i\n", tcp_jiffies32, ca->sport, __func__, ++ rs->prior_delivered, rs->delivered, rs->interval_us, rs->rtt_us, ++ rs->losses, rs->acked_sacked, rs->prior_in_flight, ++ rs->is_app_limited, rs->is_retrans); ++ ++ /* Train management.*/ ++ ca->pkts_acked += rs->acked_sacked; ++ ++ if (ca->pkts_acked < tmp->size) ++ return; ++ ++ while (ca->pkts_acked >= tmp->size) { ++ /* Usually the burst end is also reflected in the rs->delivered ++ * variable. If this is not the case, and such variable is ++ * behind just for 1 segment, then do this experimental thing ++ * to re-allineate the burst with the rs->delivered variable. ++ * In the majority of cases, we went out of allineation because ++ * of a tail loss probe. ++ */ ++ if (rs->delivered + 1 == tmp->size) { ++ pr_debug("%u sport: %u [%s] highly experimental:" ++ " ignore 1 pkt. pkts_acked %u, delivered %u," ++ " burst %u\n", tcp_jiffies32, ca->sport, __func__, ++ ca->pkts_acked, rs->delivered, tmp->size); ++ ca->pkts_acked--; ++ return; ++ } ++ wavetcp_round_terminated(sk, rs, tmp->size); ++ ++ BUG_ON(ca->pkts_acked < tmp->size); ++ ++ ca->pkts_acked -= tmp->size; ++ ++ /* Delete the burst from the history */ ++ list_del(pos); ++ kmem_cache_free(ca->cache, tmp); ++ ++ /* Take next burst */ ++ pos = ca->history->list.next; ++ tmp = list_entry(pos, struct wavetcp_burst_hist, list); ++ ++ /* If we cycle, inside wavetcp_round_terminated we will take the ++ * Linux path instead of the wave path.. first_rtt will not be ++ * read, so don't waste a cycle to set it */ ++ ca->first_ack_time = ktime_get(); ++ ca->backup_first_ack_time_us = 0; ++ } ++ ++reset: ++ /* Reset the variables needed for the beginning of the next round*/ ++ ca->first_ack_time = ns_to_ktime(0); ++ ca->backup_first_ack_time_us = 0; ++ ca->first_rtt = 0; ++ pr_debug("%u sport: %u [%s] resetting RTT values for next round\n", ++ tcp_jiffies32, ca->sport, __func__); ++} ++ ++static void wavetcp_acce(struct wavetcp *ca, s32 rtt_us, u32 pkts_acked) ++{ ++ if (ktime_is_null(ca->first_ack_time)) { ++ ca->first_ack_time = ktime_get(); ++ pr_debug("%u sport: %u [%s] first ack of the train\n", ++ tcp_jiffies32, ca->sport, __func__); ++ } ++ ++ if (rtt_us <= 0) ++ return; ++ ++ if (ca->first_rtt == 0) { ++ ca->first_rtt = rtt_us; ++ ++ pr_debug("%u sport: %u [%s] first measurement rtt %i\n", ++ tcp_jiffies32, ca->sport, __func__, ++ ca->first_rtt); ++ } ++ ++ /* Check the minimum rtt we have seen */ ++ if (rtt_us < ca->min_rtt) { ++ ca->min_rtt = rtt_us; ++ pr_debug("%u sport: %u [%s] min rtt %u\n", tcp_jiffies32, ++ ca->sport, __func__, rtt_us); ++ } ++ ++ if (rtt_us > ca->max_rtt) ++ ca->max_rtt = rtt_us; ++} ++ ++/* Invoked each time we receive an ACK. Obviously, this function also gets ++ * called when we receive the SYN-ACK, but we ignore it thanks to the ++ * FLAG_INIT flag. ++ * ++ * We close the cwnd of the amount of segments acked, because we don't like ++ * sending out segments if the timer is not expired. Without doing this, we ++ * would end with cwnd - in_flight > 0. ++ */ ++static void wavetcp_acked(struct sock *sk, const struct ack_sample *sample) ++{ ++ struct wavetcp *ca = inet_csk_ca(sk); ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (!test_flag(ca->flags, FLAG_INIT)) ++ return; ++ ++ /* We can divide the ACCE function in two part: the first take care of ++ * the RTT, and the second of the train management. Here we could have ++ * pkts_acked == 0, but with RTT values (because the underlying TCP can ++ * identify what segment has been ACKed through the SACK option). In any ++ * case, therefore, we enter wavetcp_acce.*/ ++ wavetcp_acce(ca, sample->rtt_us, sample->pkts_acked); ++ ++ if (tp->snd_cwnd < sample->pkts_acked) { ++ /* We sent some scattered segments, so the burst segments and ++ * the ACK we get is not aligned. ++ */ ++ pr_debug("%u sport: %u [%s] delta_seg %i\n", ++ tcp_jiffies32, ca->sport, __func__, ++ ca->delta_segments); ++ ++ ca->delta_segments += sample->pkts_acked - tp->snd_cwnd; ++ } ++ ++ pr_debug("%u sport: %u [%s] pkts_acked %u, rtt_us %i, in_flight %u " ++ ", cwnd %u, seq ack %u, delta %i\n", ++ tcp_jiffies32, ca->sport, __func__, sample->pkts_acked, ++ sample->rtt_us, sample->in_flight, tp->snd_cwnd, tp->snd_una, ++ ca->delta_segments); ++ ++ /* Brutally set the cwnd in order to not let segment out */ ++ tp->snd_cwnd = tcp_packets_in_flight(tp); ++} ++ ++/* The TCP informs us that the timer is expired (or has never been set). We can ++ * infer the latter by the FLAG_STARTED flag: if it's false, don't increase the ++ * cwnd, because it is at its default value (init_burst) and we still have to ++ * transmit the first burst. ++ */ ++static void wavetcp_timer_expired(struct sock *sk) ++{ ++ struct wavetcp *ca = inet_csk_ca(sk); ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 current_burst = ca->burst; ++ ++ if (!test_flag(ca->flags, FLAG_START) || !test_flag(ca->flags, FLAG_INIT)) { ++ pr_debug("%u sport: %u [%s] returning because of flags, leaving cwnd %u\n", ++ tcp_jiffies32, ca->sport, __func__, tp->snd_cwnd); ++ return; ++ } ++ ++ pr_debug("%u sport: %u [%s] starting with delta %u current_burst %u\n", ++ tcp_jiffies32, ca->sport, __func__, ca->delta_segments, ++ current_burst); ++ ++ if (ca->delta_segments < 0) { ++ /* In the previous round, we sent more than the allowed burst, ++ * so reduce the current burst. ++ */ ++ BUG_ON(current_burst > ca->delta_segments); ++ current_burst += ca->delta_segments; /* please *reduce* */ ++ ++ /* Right now, we should send "current_burst" segments out */ ++ ++ if (tcp_packets_in_flight(tp) > tp->snd_cwnd) { ++ /* For some reasons (e.g., tcp loss probe) ++ * we sent something outside the allowed window. ++ * Add the amount of segments into the burst, in order ++ * to effectively send the previous "current_burst" ++ * segments, but without touching delta_segments. ++ */ ++ u32 diff = tcp_packets_in_flight(tp) - tp->snd_cwnd; ++ ++ current_burst += diff; ++ pr_debug("%u sport: %u [%s] adding %u to balance " ++ "segments sent out of window", tcp_jiffies32, ++ ca->sport, __func__, diff); ++ } ++ } ++ ++ ca->delta_segments = current_burst; ++ pr_debug("%u sport: %u [%s] setting delta_seg %u current burst %u\n", ++ tcp_jiffies32, ca->sport, __func__, ++ ca->delta_segments, current_burst); ++ ++ if (current_burst < min_burst) { ++ pr_debug("%u sport: %u [%s] WARNING !! not min_burst", ++ tcp_jiffies32, ca->sport, __func__); ++ ca->delta_segments += min_burst - current_burst; ++ current_burst = min_burst; ++ } ++ ++ tp->snd_cwnd += current_burst; ++ set_flag(&ca->flags, FLAG_SAVE); ++ ++ pr_debug("%u sport: %u [%s], increased window of %u segments, " ++ "total %u, delta %i, in_flight %u\n", ++ tcp_jiffies32, ca->sport, __func__, ca->burst, ++ tp->snd_cwnd, ca->delta_segments, tcp_packets_in_flight(tp)); ++ ++ if (tp->snd_cwnd - tcp_packets_in_flight(tp) > current_burst) { ++ pr_debug("%u sport: %u [%s] WARNING! " ++ " cwnd %u, in_flight %u, current burst %u\n", ++ tcp_jiffies32, ca->sport, __func__, ++ tp->snd_cwnd, tcp_packets_in_flight(tp), ++ current_burst); ++ } ++} ++ ++static u64 wavetcp_get_timer(struct sock *sk) ++{ ++ struct wavetcp *ca = inet_csk_ca(sk); ++ u64 timer; ++ ++ BUG_ON(!test_flag(ca->flags, FLAG_INIT)); ++ ++ timer = min_t(u64, ++ ca->tx_timer * NSEC_PER_USEC, ++ init_timer_ms * NSEC_PER_MSEC); ++ ++ pr_debug("%u sport: %u [%s] returning timer of %llu ns\n", ++ tcp_jiffies32, ca->sport, __func__, timer); ++ ++ return timer; ++} ++ ++static void wavetcp_segment_sent(struct sock *sk, u32 sent) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct wavetcp *ca = inet_csk_ca(sk); ++ ++ if (!test_flag(ca->flags, FLAG_START)) { ++ pr_debug ("%u sport: %u [%s] !START\n", ++ tcp_jiffies32, ca->sport, __func__); ++ return; ++ } ++ ++ /* Don't save if we sent less than min_burst. Helpful at the ++ * beginning, when there is only one data segment ready */ ++ if (test_flag(ca->flags, FLAG_SAVE) && sent > min_burst) { ++ wavetcp_insert_burst(ca, sent); ++ clear_flag(&ca->flags, FLAG_SAVE); ++ } else { ++ pr_debug("%u sport: %u [%s] not saving burst, sent %u\n", ++ tcp_jiffies32, ca->sport, __func__, sent); ++ } ++ ++ if (sent > ca->burst) { ++ pr_debug("%u sport: %u [%s] WARNING! sent %u, burst %u" ++ " cwnd %u delta_seg %i\n, TSO very probable", ++ tcp_jiffies32, ca->sport, __func__, sent, ++ ca->burst, tp->snd_cwnd, ca->delta_segments); ++ } ++ ++ ca->delta_segments -= sent; ++ ++ if (ca->delta_segments >= 0 && ++ ca->burst > sent && ++ tcp_packets_in_flight(tp) <= tp->snd_cwnd) { ++ /* Reduce the cwnd accordingly, because we didn't sent enough ++ * to cover it (we are app limited probably) */ ++ u32 diff = ca->burst - sent; ++ ++ if (tp->snd_cwnd >= diff) ++ tp->snd_cwnd -= diff; ++ else ++ tp->snd_cwnd = 0; ++ pr_debug("%u sport: %u [%s] reducing cwnd by %u, value %u\n", ++ tcp_jiffies32, ca->sport, __func__, ++ ca->burst - sent, tp->snd_cwnd); ++ } ++} ++ ++static size_t wavetcp_get_info(struct sock *sk, u32 ext, int *attr, ++ union tcp_cc_info *info) ++{ ++ pr_debug("%u [%s] ext=%u", tcp_jiffies32, __func__, ext); ++ ++ if (ext & (1 << (INET_DIAG_WAVEINFO - 1))) { ++ struct wavetcp *ca = inet_csk_ca(sk); ++ ++ memset(&info->wave, 0, sizeof(info->wave)); ++ info->wave.tx_timer = ca->tx_timer; ++ info->wave.burst = ca->burst; ++ info->wave.previous_ack_t_disp = ca->previous_ack_t_disp; ++ info->wave.min_rtt = ca->min_rtt; ++ info->wave.avg_rtt = ca->avg_rtt; ++ info->wave.max_rtt = ca->max_rtt; ++ *attr = INET_DIAG_WAVEINFO; ++ return (sizeof(info->wave)); ++ } ++ return 0; ++} ++ ++static void wavetcp_no_data(struct sock *sk) ++{ ++ pr_debug("%u [%s]\n", tcp_jiffies32, __func__); ++} ++ ++static u32 wavetcp_sndbuf_expand(struct sock *sk) ++{ ++ return 10; ++} ++ ++static u32 wavetcp_get_segs_per_round(struct sock *sk) ++{ ++ struct wavetcp *ca = inet_csk_ca(sk); ++ ++ return ca->burst; ++} ++ ++static struct tcp_congestion_ops wave_cong_tcp __read_mostly = { ++ .init = wavetcp_init, ++ .get_info = wavetcp_get_info, ++ .release = wavetcp_release, ++ .ssthresh = wavetcp_recalc_ssthresh, ++/* .cong_avoid = wavetcp_cong_avoid, */ ++ .cong_control = wavetcp_cong_control, ++ .set_state = wavetcp_state, ++ .undo_cwnd = wavetcp_undo_cwnd, ++ .cwnd_event = wavetcp_cwnd_event, ++ .pkts_acked = wavetcp_acked, ++ .sndbuf_expand = wavetcp_sndbuf_expand, ++ .owner = THIS_MODULE, ++ .name = "wave", ++ .get_pacing_time = wavetcp_get_timer, ++ .pacing_timer_expired = wavetcp_timer_expired, ++ .no_data_to_transmit = wavetcp_no_data, ++ .get_segs_per_round = wavetcp_get_segs_per_round, ++ .segments_sent = wavetcp_segment_sent, ++}; ++ ++ ++static int wavetcp_log_open(struct inode *inode, struct file *file) ++{ ++ /* Reset (empty) log */ ++ spin_lock_bh(&wavetcp_probe.lock); ++ wavetcp_probe.head = wavetcp_probe.tail = 0; ++ wavetcp_probe.start = ktime_get(); ++ spin_unlock_bh(&wavetcp_probe.lock); ++ ++ return 0; ++} ++ ++static int wavetcp_log_sprint(char *tbuf, int n) ++{ ++ const struct wavetcp_log *p = wavetcp_probe.log + wavetcp_probe.tail; ++ struct timespec64 ts = ktime_to_timespec64(ktime_sub(p->tstamp, ++ wavetcp_probe.start)); ++ ++ return scnprintf(tbuf, n, ++ "%lu.%09lu %pISpc %pISpc %u %u %u %u %u\n", ++ (unsigned long)ts.tv_sec, ++ (unsigned long)ts.tv_nsec, ++ &p->src, &p->dst, p->tx_timer, p->burst, ++ p->min_rtt, p->avg_rtt, p->max_rtt); ++} ++ ++static ssize_t wavetcp_log_read(struct file *file, char __user *buf, ++ size_t len, loff_t *ppos) ++{ ++ int error = 0; ++ size_t cnt = 0; ++ ++ if (!buf) ++ return -EINVAL; ++ ++ while (cnt < len) { ++ char tbuf[256]; ++ int width; ++ ++ /* Wait for data in buffer */ ++ error = wait_event_interruptible(wavetcp_probe.wait, ++ wavetcp_probe_used() > 0); ++ if (error) ++ break; ++ ++ spin_lock_bh(&wavetcp_probe.lock); ++ if (wavetcp_probe.head == wavetcp_probe.tail) { ++ /* multiple readers race? */ ++ spin_unlock_bh(&wavetcp_probe.lock); ++ continue; ++ } ++ ++ width = wavetcp_log_sprint(tbuf, sizeof(tbuf)); ++ ++ if (cnt + width < len) ++ wavetcp_probe.tail = (wavetcp_probe.tail + 1) & (bufsize - 1); ++ ++ spin_unlock_bh(&wavetcp_probe.lock); ++ ++ /* if record greater than space available ++ return partial buffer (so far) */ ++ if (cnt + width >= len) ++ break; ++ ++ if (copy_to_user(buf + cnt, tbuf, width)) ++ return -EFAULT; ++ cnt += width; ++ } ++ ++ return cnt == 0 ? error : cnt; ++} ++ ++static const struct file_operations tcpwave_fops = { ++ .owner = THIS_MODULE, ++ .open = wavetcp_log_open, ++ .read = wavetcp_log_read, ++ .llseek = noop_llseek, ++}; ++ ++static int __init wavetcp_register(void) ++{ ++ BUILD_BUG_ON(sizeof(struct wavetcp) > ICSK_CA_PRIV_SIZE); ++ ++ if (!enable_log) ++ return tcp_register_congestion_control(&wave_cong_tcp); ++ ++ /* wave log initialization */ ++ ++ init_waitqueue_head(&wavetcp_probe.wait); ++ spin_lock_init(&wavetcp_probe.lock); ++ ++ if (bufsize == 0) ++ return -EINVAL; ++ ++ bufsize = roundup_pow_of_two(bufsize); ++ wavetcp_probe.log = kcalloc(bufsize, sizeof(struct wavetcp_log), GFP_KERNEL); ++ ++ if (!wavetcp_probe.log) ++ goto leave; ++ ++ if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpwave_fops)) ++ goto freemem; ++ ++ return tcp_register_congestion_control(&wave_cong_tcp); ++ ++freemem: ++ kfree(wavetcp_probe.log); ++leave: ++ return -ENOMEM; ++} ++ ++static void __exit wavetcp_unregister(void) ++{ ++ if (enable_log) { ++ remove_proc_entry(procname, init_net.proc_net); ++ kfree(wavetcp_probe.log); ++ } ++ ++ tcp_unregister_congestion_control(&wave_cong_tcp); ++} ++ ++module_init(wavetcp_register); ++module_exit(wavetcp_unregister); ++ ++MODULE_AUTHOR("Natale Patriciello"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("WAVE TCP"); ++MODULE_VERSION("0.1"); |