path: root/tcp_wave.patch
author    JJD  2017-10-03 22:15:13 +0200
committer JJD  2017-10-03 22:15:13 +0200
commit    ec1bfe68a246e55ded1bd048a248b11010db030d (patch)
tree      20cf76c91cf5cf3b8f4e179aee5aa5632d239184 /tcp_wave.patch
download  aur-ec1bfe68a246e55ded1bd048a248b11010db030d.tar.gz
Initial commit. Linux Wave is a Linux kernel variant that contains the TCP Wave congestion control module.
Diffstat (limited to 'tcp_wave.patch')
-rw-r--r--  tcp_wave.patch  1699
1 files changed, 1699 insertions, 0 deletions
diff --git a/tcp_wave.patch b/tcp_wave.patch
new file mode 100644
index 000000000000..74b533c5c19c
--- /dev/null
+++ b/tcp_wave.patch
@@ -0,0 +1,1699 @@
+diff --git a/MAINTAINERS b/MAINTAINERS
+index 1c3feffb1c1c..34fe18d467cd 100644
+--- a/MAINTAINERS
++++ b/MAINTAINERS
+@@ -12724,6 +12724,12 @@ W: http://tcp-lp-mod.sourceforge.net/
+ S: Maintained
+ F: net/ipv4/tcp_lp.c
+
++TCP WAVE MODULE
++M: "Natale Patriciello" <natale.patriciello@gmail.com>
++W: http://tlcsat.uniroma2.it/tcpwave4linux/
++S: Maintained
++F: net/ipv4/tcp_wave.c
++
+ TDA10071 MEDIA DRIVER
+ M: Antti Palosaari <crope@iki.fi>
+ L: linux-media@vger.kernel.org
+diff --git a/include/net/tcp.h b/include/net/tcp.h
+index f642a39f9eee..955a5233d94e 100644
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -988,6 +988,16 @@ struct tcp_congestion_ops {
+ /* get info for inet_diag (optional) */
+ size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
+ union tcp_cc_info *info);
++ /* get the expiration time for the pacing timer (optional) */
++ u64 (*get_pacing_time)(struct sock *sk);
++ /* no data to transmit at the pacing timer expiration time (optional) */
++ void (*no_data_to_transmit)(struct sock *sk);
++ /* the pacing timer is expired (optional) */
++ void (*pacing_timer_expired)(struct sock *sk);
++ /* get the # segs to send out when the timer expires (optional) */
++ u32 (*get_segs_per_round)(struct sock *sk);
++ /* the TCP has sent some segments (optional) */
++ void (*segments_sent)(struct sock *sk, u32 sent);
+
+ char name[TCP_CA_NAME_MAX];
+ struct module *owner;
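
All five hooks above are optional. Below is a minimal sketch of how a congestion-control module could wire them up; the "example" names are hypothetical, only the hook signatures come from this hunk, and the mandatory tcp_congestion_ops callbacks are omitted:

static u64 example_get_pacing_time(struct sock *sk)
{
	/* Next pacing-timer expiration, in ns from now */
	return 200 * NSEC_PER_MSEC;
}

static u32 example_get_segs_per_round(struct sock *sk)
{
	return 10;	/* burst budget for this timer round */
}

static void example_segments_sent(struct sock *sk, u32 sent)
{
	/* Bookkeeping after the stack pushed out `sent` segments */
}

static struct tcp_congestion_ops example_ops __read_mostly = {
	/* mandatory callbacks (.ssthresh, .undo_cwnd, ...) omitted */
	.get_pacing_time	= example_get_pacing_time,
	.get_segs_per_round	= example_get_segs_per_round,
	.segments_sent		= example_segments_sent,
	.name			= "example",
	.owner			= THIS_MODULE,
};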
+diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
+index bbe201047df6..9e755cff2c3d 100644
+--- a/include/uapi/linux/inet_diag.h
++++ b/include/uapi/linux/inet_diag.h
+@@ -142,6 +142,7 @@ enum {
+ INET_DIAG_PAD,
+ INET_DIAG_MARK,
+ INET_DIAG_BBRINFO,
++ INET_DIAG_WAVEINFO,
+ __INET_DIAG_MAX,
+ };
+
+@@ -186,9 +187,21 @@ struct tcp_bbr_info {
+ __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */
+ };
+
++/* INET_DIAG_WAVEINFO */
++
++struct tcp_wave_info {
++ __u32 tx_timer;
++ __u16 burst;
++ __u32 previous_ack_t_disp;
++ __u32 min_rtt;
++ __u32 avg_rtt;
++ __u32 max_rtt;
++};
++
+ union tcp_cc_info {
+ struct tcpvegas_info vegas;
+ struct tcp_dctcp_info dctcp;
+ struct tcp_bbr_info bbr;
++ struct tcp_wave_info wave;
+ };
+ #endif /* _UAPI_INET_DIAG_H_ */
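
With the union extended, userspace can read the new member through the pre-existing TCP_CC_INFO socket option, which ends up in the congestion module's get_info() hook (wavetcp_get_info() further below). A minimal sketch, assuming a patched kernel and a socket running "wave"; error handling is trimmed:

#include <linux/inet_diag.h>
#include <netinet/in.h>
#include <netinet/tcp.h>	/* TCP_CC_INFO, glibc >= 2.22 */
#include <stdio.h>
#include <sys/socket.h>

static void print_wave_info(int fd)
{
	union tcp_cc_info info;
	socklen_t len = sizeof(info);

	if (getsockopt(fd, IPPROTO_TCP, TCP_CC_INFO, &info, &len) == 0 &&
	    len >= sizeof(info.wave))
		printf("tx_timer=%u us burst=%u rtt min/avg/max=%u/%u/%u us\n",
		       info.wave.tx_timer, info.wave.burst,
		       info.wave.min_rtt, info.wave.avg_rtt,
		       info.wave.max_rtt);
}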
+diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
+index 91a2557942fa..de23b3a04b98 100644
+--- a/net/ipv4/Kconfig
++++ b/net/ipv4/Kconfig
+@@ -492,6 +492,18 @@ config TCP_CONG_BIC
+ increase provides TCP friendliness.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+
++config TCP_CONG_WAVE
++ tristate "Wave TCP"
++ default m
++ ---help---
++ TCP Wave (TCPW) replaces the window-based transmission paradigm of the
++ standard TCP with a burst-based transmission, the ACK-clock scheduling
++ with a self-managed timer and the RTT-based congestion control loop with
++ an Ack-based Capacity and Congestion Estimation (ACCE) module. In
+ non-technical terms, it sends data down the stack when its internal
+ timer expires, and the timing of the received ACKs contributes to
++ updating this timer regularly.
++
+ config TCP_CONG_CUBIC
+ tristate "CUBIC TCP"
+ default y
+@@ -690,6 +702,9 @@ choice
+ config DEFAULT_CUBIC
+ bool "Cubic" if TCP_CONG_CUBIC=y
+
++ config DEFAULT_WAVE
++ bool "Wave" if TCP_CONG_WAVE=y
++
+ config DEFAULT_HTCP
+ bool "Htcp" if TCP_CONG_HTCP=y
+
+@@ -729,6 +744,7 @@ config DEFAULT_TCP_CONG
+ string
+ default "bic" if DEFAULT_BIC
+ default "cubic" if DEFAULT_CUBIC
++ default "wave" if DEFAULT_WAVE
+ default "htcp" if DEFAULT_HTCP
+ default "hybla" if DEFAULT_HYBLA
+ default "vegas" if DEFAULT_VEGAS
+diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
+index afcb435adfbe..e82ba69b19a9 100644
+--- a/net/ipv4/Makefile
++++ b/net/ipv4/Makefile
+@@ -47,6 +47,7 @@ obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
+ obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+ obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
+ obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
++obj-$(CONFIG_TCP_CONG_WAVE) += tcp_wave.o
+ obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
+ obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
+ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
+@@ -61,5 +62,8 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
+ obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+ obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+
++CFLAGS_tcp_wave.o := -DDEBUG
++CFLAGS_tcp_output.o := -DDEBUG
++
+ obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
+ xfrm4_output.o xfrm4_protocol.o
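
The per-file -DDEBUG flags matter because pr_debug() otherwise compiles to nothing. Roughly, include/linux/printk.h behaves like the sketch below (a paraphrase, not the literal kernel macro), so defining DEBUG for tcp_wave.o and tcp_output.o builds their pr_debug() calls in unconditionally:

#ifdef DEBUG
#define pr_debug_sketch(fmt, ...) \
	printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else /* no DEBUG and no CONFIG_DYNAMIC_DEBUG: discarded at compile time */
#define pr_debug_sketch(fmt, ...) \
	no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif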
+diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
+index a3e91b552edc..6c1f384b3ba2 100644
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -994,9 +994,9 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+
+ if (forced_push(tp)) {
+ tcp_mark_push(tp, skb);
+- __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
+- } else if (skb == tcp_send_head(sk))
+ tcp_push_one(sk, mss_now);
++ } else if (skb == tcp_send_head(sk))
++ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
+ continue;
+
+ wait_for_sndbuf:
+@@ -1340,9 +1340,9 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+
+ if (forced_push(tp)) {
+ tcp_mark_push(tp, skb);
+- __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
+- } else if (skb == tcp_send_head(sk))
+ tcp_push_one(sk, mss_now);
++ } else if (skb == tcp_send_head(sk))
++ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
+ continue;
+
+ wait_for_sndbuf:
+diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
+index b7661a68d498..ef9dbd0af283 100644
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -42,6 +42,22 @@
+ #include <linux/gfp.h>
+ #include <linux/module.h>
+
++static const char *header_flags[5] = { "[SYN]", "[SYN|ACK]",
++ "[ACK]", "[FIN|ACK]", "[UNK]" };
++static inline const char *print_tcp_header_flags(__u8 flags)
++{
++ if (flags & TCPHDR_SYN && !(flags & TCPHDR_ACK))
++ return header_flags[0];
++ else if (flags & TCPHDR_SYN && flags & TCPHDR_ACK)
++ return header_flags[1];
++ else if (flags & TCPHDR_FIN)
++ return header_flags[3];
++ else if (flags & TCPHDR_ACK)
++ return header_flags[2];
++ else
++ return header_flags[4];
++}
++
+ /* People can turn this off for buggy TCP's found in printers etc. */
+ int sysctl_tcp_retrans_collapse __read_mostly = 1;
+
+@@ -742,6 +758,7 @@ static void tcp_tsq_handler(struct sock *sk)
+ tp->snd_cwnd > tcp_packets_in_flight(tp))
+ tcp_xmit_retransmit_queue(sk);
+
++ pr_debug("%u [tcp_tsq_handler]\n", tcp_jiffies32);
+ tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
+ 0, GFP_ATOMIC);
+ }
+@@ -950,22 +967,38 @@ static bool tcp_needs_internal_pacing(const struct sock *sk)
+ return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
+ }
+
++static bool tcp_pacing_timer_check(const struct sock *sk)
++{
++ return hrtimer_active(&tcp_sk(sk)->pacing_timer);
++}
++
+ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+ {
++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+ u64 len_ns;
+- u32 rate;
+
+ if (!tcp_needs_internal_pacing(sk))
+ return;
+- rate = sk->sk_pacing_rate;
+- if (!rate || rate == ~0U)
+- return;
+-
+- /* Should account for header sizes as sch_fq does,
+- * but lets make things simple.
+- */
+- len_ns = (u64)skb->len * NSEC_PER_SEC;
+- do_div(len_ns, rate);
++
++ if (ca_ops && ca_ops->get_pacing_time) {
++ if (tcp_pacing_timer_check(sk))
++ return;
++
++ len_ns = ca_ops->get_pacing_time(sk);
++ } else {
++ u32 rate = sk->sk_pacing_rate;
++ if (!rate || rate == ~0U)
++ return;
++
++ /* Should account for header sizes as sch_fq does,
++ * but lets make things simple.
++ */
++ len_ns = (u64)skb->len * NSEC_PER_SEC;
++ do_div(len_ns, rate);
++ }
++
++ pr_debug("%u [%s] len_ns=%llu\n", tcp_jiffies32, __func__, len_ns);
++
+ hrtimer_start(&tcp_sk(sk)->pacing_timer,
+ ktime_add_ns(ktime_get(), len_ns),
+ HRTIMER_MODE_ABS_PINNED);
+@@ -994,6 +1027,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+ struct tcp_md5sig_key *md5;
+ struct tcphdr *th;
+ int err;
++ u8 flags;
+
+ BUG_ON(!skb || !tcp_skb_pcount(skb));
+ tp = tcp_sk(sk);
+@@ -1062,6 +1096,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+ th->check = 0;
+ th->urg_ptr = 0;
+
++ flags = tcb->tcp_flags;
++
+ /* The urg_mode check is necessary during a below snd_una win probe */
+ if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
+ if (before(tp->snd_up, tcb->seq + 0x10000)) {
+@@ -1122,6 +1158,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+
+ err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
+
++ pr_debug("%u [tcp_transmit_skb] seq=%u, ack=%u, window=%u, len=%u flags=%s err=%i \n",
++ tcp_jiffies32, ntohl(th->seq), ntohl(th->ack_seq),
++ ntohs(th->window), skb->len, print_tcp_header_flags(flags), err);
++
+ if (likely(err <= 0))
+ return err;
+
+@@ -2135,6 +2175,7 @@ static int tcp_mtu_probe(struct sock *sk)
+ /* We're ready to send. If this fails, the probe will
+ * be resegmented into mss-sized pieces by tcp_write_xmit().
+ */
++ pr_debug("%u [tcp_mtu_probe] sending a probe\n", tcp_jiffies32);
+ if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
+ /* Decrement cwnd here because we are sending
+ * effectively two packets. */
+@@ -2154,7 +2195,7 @@ static int tcp_mtu_probe(struct sock *sk)
+ static bool tcp_pacing_check(const struct sock *sk)
+ {
+ return tcp_needs_internal_pacing(sk) &&
+- hrtimer_active(&tcp_sk(sk)->pacing_timer);
++ tcp_pacing_timer_check(sk);
+ }
+
+ /* TCP Small Queues :
+@@ -2258,6 +2299,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
+ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ int push_one, gfp_t gfp)
+ {
++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ unsigned int tso_segs, sent_pkts;
+@@ -2265,6 +2307,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ int result;
+ bool is_cwnd_limited = false, is_rwnd_limited = false;
+ u32 max_segs;
++ u32 pacing_allowed_segs = 0;
++ bool notify = false;
+
+ sent_pkts = 0;
+
+@@ -2280,11 +2324,33 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+
+ max_segs = tcp_tso_segs(sk, mss_now);
+ tcp_mstamp_refresh(tp);
++
++ if (!tcp_pacing_timer_check(sk)) {
++ pacing_allowed_segs = 1;
++ if (ca_ops) {
++ if (ca_ops->pacing_timer_expired) {
++ ca_ops->pacing_timer_expired(sk);
++ notify = true;
++ }
++ if (ca_ops->get_segs_per_round)
++ pacing_allowed_segs = ca_ops->get_segs_per_round(sk);
++ }
++ } else {
++ pr_debug("%u [%s] timer running\n", tcp_jiffies32, __func__);
++ }
++
+ while ((skb = tcp_send_head(sk))) {
+ unsigned int limit;
+
+- if (tcp_pacing_check(sk))
++ pr_debug("%u [%s] allowed=%u sent=%u, inflight=%u, cwnd=%u\n", tcp_jiffies32, __func__,
++ pacing_allowed_segs, sent_pkts, tcp_packets_in_flight(tp),
++ tp->snd_cwnd);
++
++ if (sent_pkts > pacing_allowed_segs) {
++ pr_debug("%u [%s] BREAK for sent\n", tcp_jiffies32,
++ __func__);
+ break;
++ }
+
+ tso_segs = tcp_init_tso_segs(skb, mss_now);
+ BUG_ON(!tso_segs);
+@@ -2292,33 +2358,42 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+ /* "skb_mstamp" is used as a start point for the retransmit timer */
+ skb->skb_mstamp = tp->tcp_mstamp;
++ pr_debug("%u [%s] 1", tcp_jiffies32, __func__);
+ goto repair; /* Skip network transmission */
+ }
+
+ cwnd_quota = tcp_cwnd_test(tp, skb);
+ if (!cwnd_quota) {
+- if (push_one == 2)
++ if (push_one == 2) {
+ /* Force out a loss probe pkt. */
++ pr_debug("%u [%s] 2", tcp_jiffies32, __func__);
+ cwnd_quota = 1;
+- else
++ } else {
++ pr_debug("%u [%s] 3", tcp_jiffies32, __func__);
+ break;
++ }
+ }
+
+ if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
+ is_rwnd_limited = true;
++ pr_debug("%u [%s] 4", tcp_jiffies32, __func__);
+ break;
+ }
+
+ if (tso_segs == 1) {
+ if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
+ (tcp_skb_is_last(sk, skb) ?
+- nonagle : TCP_NAGLE_PUSH))))
++ nonagle : TCP_NAGLE_PUSH)))) {
++ pr_debug("%u [%s] 5", tcp_jiffies32, __func__);
+ break;
++ }
+ } else {
+ if (!push_one &&
+ tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
+- max_segs))
++ max_segs)) {
++ pr_debug("%u [%s] 6", tcp_jiffies32, __func__);
+ break;
++ }
+ }
+
+ limit = mss_now;
+@@ -2330,16 +2405,22 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ nonagle);
+
+ if (skb->len > limit &&
+- unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
++ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) {
++ pr_debug("%u [%s] 7", tcp_jiffies32, __func__);
+ break;
++ }
+
+ if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+ clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
+- if (tcp_small_queue_check(sk, skb, 0))
++ if (tcp_small_queue_check(sk, skb, 0)) {
++ pr_debug("%u [%s] 8", tcp_jiffies32, __func__);
+ break;
++ }
+
+- if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
++ if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) {
++ pr_debug("%u [%s] 9", tcp_jiffies32, __func__);
+ break;
++ }
+
+ repair:
+ /* Advance the send_head. This one is sent out.
+@@ -2350,10 +2431,19 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+ tcp_minshall_update(tp, mss_now, skb);
+ sent_pkts += tcp_skb_pcount(skb);
+
+- if (push_one)
++ if (push_one) {
++ pr_debug("%u [%s] 10", tcp_jiffies32, __func__);
+ break;
++ }
+ }
+
++ if (!tcp_send_head(sk)) {
++ pr_debug("%u [%s] no skb in queue\n", tcp_jiffies32, __func__);
++ }
++
++ if (ca_ops && notify && ca_ops->segments_sent)
++ ca_ops->segments_sent(sk, sent_pkts);
++
+ if (is_rwnd_limited)
+ tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
+ else
+@@ -2447,6 +2537,7 @@ void tcp_send_loss_probe(struct sock *sk)
+ if (skb) {
+ if (tcp_snd_wnd_test(tp, skb, mss)) {
+ pcount = tp->packets_out;
++ pr_debug("%u [tcp_send_loss_probe]\n", tcp_jiffies32);
+ tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+ if (tp->packets_out > pcount)
+ goto probe_sent;
+@@ -2503,6 +2594,8 @@ void tcp_send_loss_probe(struct sock *sk)
+ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
+ int nonagle)
+ {
++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
++
+ /* If we are closed, the bytes will have to remain here.
+ * In time closedown will finish, we empty the write queue and
+ * all will be happy.
+@@ -2513,6 +2606,10 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
+ if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
+ sk_gfp_mask(sk, GFP_ATOMIC)))
+ tcp_check_probe_timer(sk);
++
++ if (!tcp_send_head(sk) && ca_ops && ca_ops->no_data_to_transmit) {
++ ca_ops->no_data_to_transmit(sk);
++ }
+ }
+
+ /* Send _single_ skb sitting at the send head. This function requires
+@@ -2522,9 +2619,13 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
+ {
+ struct sk_buff *skb = tcp_send_head(sk);
+
+- BUG_ON(!skb || skb->len < mss_now);
++ /* Don't force out data that is not meaningful (less than one MSS) */
++ if (!skb || skb->len < mss_now)
++ return;
+
++ pr_debug("%u [tcp_push_one] Pushing directly\n", tcp_jiffies32);
+ tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
++ pr_debug("%u [tcp_push_one] End of untimed push\n", tcp_jiffies32);
+ }
+
+ /* This function returns the amount that we can raise the
+@@ -2868,9 +2969,11 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
+
+ skb->skb_mstamp = tp->tcp_mstamp;
+ nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
++ pr_debug("%u [tcp_retransmit_skb] retransmit\n", tcp_jiffies32);
+ err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+ -ENOBUFS;
+ } else {
++ pr_debug("%u [tcp_retransmit_skb] retransmit\n", tcp_jiffies32);
+ err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ }
+
+@@ -3084,6 +3187,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+ TCPHDR_ACK | TCPHDR_RST);
+ tcp_mstamp_refresh(tcp_sk(sk));
+ /* Send it off. */
++ pr_debug("%u [tcp_send_active_reset]\n", tcp_jiffies32);
+ if (tcp_transmit_skb(sk, skb, 0, priority))
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
+ }
+@@ -3120,6 +3224,7 @@ int tcp_send_synack(struct sock *sk)
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
+ tcp_ecn_send_synack(sk, skb);
+ }
++ pr_debug("%u [tcp_send_synack]\n", tcp_jiffies32);
+ return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ }
+
+@@ -3399,6 +3504,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
+ if (syn_data->len)
+ tcp_chrono_start(sk, TCP_CHRONO_BUSY);
+
++ pr_debug("%u [tcp_send_syn_data]\n", tcp_jiffies32);
+ err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
+
+ syn->skb_mstamp = syn_data->skb_mstamp;
+@@ -3420,6 +3526,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
+ /* Send a regular SYN with Fast Open cookie request option */
+ if (fo->cookie.len > 0)
+ fo->cookie.len = 0;
++ pr_debug("%u [tcp_send_syn_data] fallback \n", tcp_jiffies32);
+ err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
+ if (err)
+ tp->syn_fastopen = 0;
+@@ -3458,6 +3565,7 @@ int tcp_connect(struct sock *sk)
+ tcp_ecn_send_syn(sk, buff);
+
+ /* Send off SYN; include data in Fast Open. */
++ pr_debug("%u [tcp_connect]\n", tcp_jiffies32);
+ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
+ tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
+ if (err == -ECONNREFUSED)
+@@ -3572,6 +3680,7 @@ void tcp_send_ack(struct sock *sk)
+ skb_set_tcp_pure_ack(buff);
+
+ /* Send it off, this clears delayed acks for us. */
++ pr_debug("%u [tcp_send_ack]\n", tcp_jiffies32);
+ tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0);
+ }
+ EXPORT_SYMBOL_GPL(tcp_send_ack);
+@@ -3606,6 +3715,8 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
+ */
+ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
+ NET_INC_STATS(sock_net(sk), mib);
++
++ pr_debug("%u [tcp_xmit_probe_skb]\n", tcp_jiffies32);
+ return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
+ }
+
+@@ -3651,6 +3762,7 @@ int tcp_write_wakeup(struct sock *sk, int mib)
+ tcp_set_skb_tso_segs(skb, mss);
+
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
++ pr_debug("%u [tcp_write_wakeup]\n", tcp_jiffies32);
+ err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+ if (!err)
+ tcp_event_new_data_sent(sk, skb);
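
Taken together, the tcp_output.c changes establish a per-round contract between tcp_write_xmit() and the congestion module. A condensed sketch of that contract (not the literal kernel code) follows:

static void write_xmit_round_sketch(struct sock *sk)
{
	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
	u32 allowed = 1, sent = 0;
	bool notify = false;

	if (!hrtimer_active(&tcp_sk(sk)->pacing_timer)) {
		/* Timer expired (or never armed): a new round begins */
		if (ca_ops && ca_ops->pacing_timer_expired) {
			ca_ops->pacing_timer_expired(sk);
			notify = true;
		}
		if (ca_ops && ca_ops->get_segs_per_round)
			allowed = ca_ops->get_segs_per_round(sk);
	}

	/* ... transmit queued skbs, breaking out once sent > allowed;
	 * tcp_internal_pacing() re-arms the timer via get_pacing_time() ...
	 */

	if (notify && ca_ops->segments_sent)
		ca_ops->segments_sent(sk, sent);	/* close the round */
}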
+diff --git a/net/ipv4/tcp_wave.c b/net/ipv4/tcp_wave.c
+new file mode 100644
+index 000000000000..9b48cbd84fe1
+--- /dev/null
++++ b/net/ipv4/tcp_wave.c
+@@ -0,0 +1,1136 @@
++/*
++ * TCP Wave
++ *
++ * Copyright 2017 Natale Patriciello <natale.patriciello@gmail.com>
++ *
++ * This program is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program. If not, see <http://www.gnu.org/licenses/>.
++ *
++ */
++
++#define pr_fmt(fmt) "WAVE: " fmt
++
++#include <linux/module.h>
++#include <net/net_namespace.h>
++#include <net/tcp.h>
++#include <linux/inet_diag.h>
++#include <linux/slab.h>
++#include <linux/proc_fs.h>
++
++#ifdef DEBUG
++static bool enable_log = true;
++#else
++static bool enable_log = false;
++#endif
++
++static uint init_burst __read_mostly = 10;
++static uint min_burst __read_mostly = 3;
++static uint init_timer_ms __read_mostly = 200;
++static uint beta_ms __read_mostly = 150;
++static int port __read_mostly = 0;
++static unsigned int bufsize __read_mostly = 4096;
++static const char procname[] = "tcpwave";
++
++module_param(init_burst, uint, 0644);
++MODULE_PARM_DESC(init_burst, "initial burst (segments)");
++module_param(min_burst, uint, 0644);
++MODULE_PARM_DESC(min_burst, "minimum burst (segments)");
++module_param(init_timer_ms, uint, 0644);
++MODULE_PARM_DESC(init_timer_ms, "initial timer (ms)");
++module_param(beta_ms, uint, 0644);
++MODULE_PARM_DESC(beta_ms, "beta parameter (ms)");
++module_param(port, int, 0);
++MODULE_PARM_DESC(port, "Port to match when logging (0=all)");
++module_param(bufsize, uint, 0);
++MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
++
++/* Shift factor for the exponentially weighted average. */
++#define AVG_SCALE 20
++#define AVG_UNIT (1 << AVG_SCALE)
++
++/* Taken from BBR */
++#define BW_SCALE 24
++#define BW_UNIT (1 << BW_SCALE)
++
++/* Tell if the driver is initialized (init has been called) */
++#define FLAG_INIT 0x1
++/* Tell if, as sender, the driver is started (after TX_START) */
++#define FLAG_START 0x2
++/* If it's true, we save the sent size as a burst */
++#define FLAG_SAVE 0x4
++
++/* Log struct */
++struct wavetcp_log {
++ ktime_t tstamp;
++ union {
++ struct sockaddr raw;
++ struct sockaddr_in v4;
++ struct sockaddr_in6 v6;
++ } src, dst;
++ u32 tx_timer;
++ u16 burst;
++ u32 min_rtt;
++ u32 avg_rtt;
++ u32 max_rtt;
++};
++
++static struct {
++ spinlock_t lock;
++ wait_queue_head_t wait;
++ ktime_t start;
++ unsigned long head, tail;
++ struct wavetcp_log *log;
++} wavetcp_probe;
++
++/* List for saving the size of sent burst over time */
++struct wavetcp_burst_hist {
++ u16 size; /* The burst size */
++ struct list_head list; /* Kernel list declaration */
++};
++
++static bool test_flag(u8 flags, u8 value)
++{
++ return (flags & value) == value;
++}
++
++static void set_flag(u8 *flags, u8 value)
++{
++ *flags |= value;
++}
++
++static void clear_flag(u8 *flags, u8 value)
++{
++ *flags &= ~(value);
++}
++
++static bool ktime_is_null(ktime_t kt)
++{
++ return ktime_compare(kt, ns_to_ktime(0)) == 0;
++}
++
++/* TCP Wave private struct */
++struct wavetcp {
++ u8 flags; /* The module flags */
++ u32 tx_timer; /* The current transmission timer (us) */
++ u8 burst; /* The current burst size (segments) */
++ s8 delta_segments; /* Represents a delta from the burst size of segments sent */
++ u16 pkts_acked; /* The segments acked in the round */
++ u8 heuristic_scale; /* Heuristic scale, to divide the RTT */
++ ktime_t previous_ack_t_disp; /* Previous ack_train_disp Value */
++ ktime_t first_ack_time; /* First ACK time of the round */
++ u32 backup_first_ack_time_us; /* Backup value of the first ack time */
++ u32 first_rtt; /* First RTT of the round */
++ u32 min_rtt; /* Minimum RTT of the round */
++ u32 avg_rtt; /* Average RTT of the previous round */
++ u32 max_rtt; /* Maximum RTT */
++ u8 stab_factor; /* Stability factor */
++ struct kmem_cache *cache; /* The memory cache for saving the burst sizes */
++ struct wavetcp_burst_hist *history; /* The burst history */
++ u16 sport; /* TCP Source Port of the connection */
++};
++
++/* Called to set up Wave for the current socket after it enters the CONNECTED
++ * state (i.e., called after the SYN-ACK is received). The slow start should be
++ * 0 (see wavetcp_get_ssthresh) and we set the initial cwnd to the initial
++ * burst.
++ *
++ * After the ACK of the SYN-ACK is sent, the TCP will add a bit of delay to
++ * permit the queueing of data from the application, otherwise we will end up
++ * in a scattered situation (we have one segment -> send it -> no other segment,
++ * don't set the timer -> slightly after, another segment comes and we loop).
++ *
++ * At the first expiration, the cwnd will be large enough to push init_burst
++ * segments out.
++ */
++static void wavetcp_init(struct sock *sk)
++{
++ struct wavetcp *ca = inet_csk_ca(sk);
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ ca->sport = ntohs(inet_sk(sk)->inet_sport);
++
++ pr_debug("%u sport: %u [%s]\n", tcp_jiffies32, ca->sport,
++ __func__);
++
++ /* Setting the initial Cwnd to 0 will not call the TX_START event */
++ tp->snd_ssthresh = 0;
++ tp->snd_cwnd = init_burst;
++
++ /* Used to avoid taking the SYN-ACK measurements */
++ ca->flags = 0;
++ ca->flags = FLAG_INIT | FLAG_SAVE;
++
++ ca->burst = init_burst;
++ ca->delta_segments = init_burst;
++ ca->tx_timer = init_timer_ms * USEC_PER_MSEC;
++ ca->first_ack_time = ns_to_ktime(0);
++ ca->backup_first_ack_time_us = 0;
++ ca->heuristic_scale = 0;
++ ca->first_rtt = 0;
++ ca->min_rtt = -1; /* a lot of time */
++ ca->avg_rtt = 0;
++ ca->max_rtt = 0;
++ ca->stab_factor = 0;
++ ca->previous_ack_t_disp = ns_to_ktime(0);
++
++ ca->history = kmalloc(sizeof(*ca->history), GFP_KERNEL);
++
++ /* Init the history of bwnd */
++ INIT_LIST_HEAD(&ca->history->list);
++
++ /* Init our cache pool for the bwnd history */
++ ca->cache = KMEM_CACHE(wavetcp_burst_hist, 0);
++
++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
++}
++
++static void wavetcp_release(struct sock *sk)
++{
++ struct wavetcp *ca = inet_csk_ca(sk);
++ struct wavetcp_burst_hist *tmp;
++ struct list_head *pos, *q;
++
++ if (!test_flag(ca->flags, FLAG_INIT))
++ return;
++
++ pr_debug("%u sport: %u [%s]\n", tcp_jiffies32, ca->sport,
++ __func__);
++
++ list_for_each_safe(pos, q, &ca->history->list) {
++ tmp = list_entry(pos, struct wavetcp_burst_hist, list);
++ list_del(pos);
++ kmem_cache_free(ca->cache, tmp);
++ }
++
++ kfree(ca->history);
++ kmem_cache_destroy(ca->cache);
++}
++
++static void wavetcp_print_history(struct wavetcp *ca)
++{
++ struct wavetcp_burst_hist *tmp;
++ struct list_head *pos, *q;
++
++ list_for_each_safe(pos, q, &ca->history->list) {
++ tmp = list_entry(pos, struct wavetcp_burst_hist, list);
++ pr_debug("[%s] %u\n", __func__, tmp->size);
++ }
++}
++
++/* Returning 0 means that we will be forever in congestion avoidance. */
++static u32 wavetcp_recalc_ssthresh(struct sock *sk)
++{
++ pr_debug("%u [%s]\n", tcp_jiffies32, __func__);
++ return 0;
++}
++
++static void wavetcp_state(struct sock *sk, u8 new_state)
++{
++ struct wavetcp *ca = inet_csk_ca(sk);
++
++ if (!test_flag(ca->flags, FLAG_INIT))
++ return;
++
++ switch (new_state) {
++ case TCP_CA_Open:
++ pr_debug("%u sport: %u [%s] set CA_Open\n", tcp_jiffies32,
++ ca->sport, __func__);
++ /* We have fully recovered, so reset some variables */
++ ca->delta_segments = 0;
++ break;
++ default:
++ pr_debug("%u sport: %u [%s] set state %u, ignored\n",
++ tcp_jiffies32, ca->sport, __func__, new_state);
++ }
++}
++
++static u32 wavetcp_undo_cwnd(struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ /* Not implemented yet. We stick to the decision made earlier */
++ pr_debug("%u [%s]\n", tcp_jiffies32, __func__);
++ return tp->snd_cwnd;
++}
++
++/* Add the size of the burst in the history of bursts */
++static void wavetcp_insert_burst(struct wavetcp *ca, u32 burst)
++{
++ struct wavetcp_burst_hist *cur;
++
++ pr_debug("%u sport: %u [%s] adding %u segment in the history of burst\n",
++ tcp_jiffies32, ca->sport, __func__, burst);
++
++ /* Take the memory from the pre-allocated pool */
++ cur = kmem_cache_alloc(ca->cache, GFP_ATOMIC);
++ BUG_ON(!cur);
++
++ cur->size = burst;
++ list_add_tail(&cur->list, &ca->history->list);
++}
++
++static void wavetcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
++{
++ struct wavetcp *ca = inet_csk_ca(sk);
++
++ if (!test_flag(ca->flags, FLAG_INIT))
++ return;
++
++ switch (event) {
++ case CA_EVENT_TX_START:
++ /* first transmit when no packets in flight */
++ pr_debug("%u sport: %u [%s] TX_START\n", tcp_jiffies32,
++ ca->sport, __func__);
++
++ set_flag(&ca->flags, FLAG_START);
++
++ break;
++ default:
++ pr_debug("%u sport: %u [%s] got event %u, ignored\n",
++ tcp_jiffies32, ca->sport, __func__, event);
++ break;
++ }
++}
++
++static void wavetcp_adj_mode(struct wavetcp *ca,
++ unsigned long delta_rtt)
++{
++ ca->stab_factor = ca->avg_rtt / ca->tx_timer;
++
++ ca->min_rtt = -1; /* a lot of time */
++ ca->avg_rtt = ca->max_rtt;
++ ca->tx_timer = init_timer_ms * USEC_PER_MSEC;
++
++ pr_debug("%u sport: %u [%s] stab_factor %u, timer %u us, avg_rtt %u us\n",
++ tcp_jiffies32, ca->sport, __func__, ca->stab_factor,
++ ca->tx_timer, ca->avg_rtt);
++}
++
++static int wavetcp_probe_used(void)
++{
++ return (wavetcp_probe.head - wavetcp_probe.tail) & (bufsize - 1);
++}
++
++static int wavetcp_probe_avail(void)
++{
++ return bufsize - wavetcp_probe_used() - 1;
++}
++
++#define wavetcp_probe_copy_fl_to_si4(inet, si4, mem) \
++ do { \
++ si4.sin_family = AF_INET; \
++ si4.sin_port = inet->inet_##mem##port; \
++ si4.sin_addr.s_addr = inet->inet_##mem##addr; \
++ } while (0) \
++
++static void wavetcp_tracking_mode(struct sock *sk, u64 delta_rtt,
++ ktime_t ack_train_disp)
++{
++ const struct inet_sock *inet = inet_sk(sk);
++ struct wavetcp *ca = inet_csk_ca(sk);
++
++ if (ktime_is_null(ack_train_disp)) {
++ pr_debug("%u sport: %u [%s] ack_train_disp is 0. Impossible to do tracking.\n",
++ tcp_jiffies32, ca->sport, __func__);
++ return;
++ }
++
++ ca->tx_timer = (ktime_to_us(ack_train_disp) + (delta_rtt / 2));
++
++ if (ca->tx_timer == 0) {
++ pr_debug("%u sport: %u [%s] WARNING: tx timer is 0"
++ ", forcefully set it to 1000 us\n",
++ tcp_jiffies32, ca->sport, __func__);
++ ca->tx_timer = 1000;
++ }
++
++ pr_debug("%u sport: %u [%s] tx timer is %u us\n",
++ tcp_jiffies32, ca->sport, __func__, ca->tx_timer);
++
++ if (!enable_log)
++ return;
++
++ if (port == 0 ||
++ ntohs(inet->inet_dport) == port ||
++ ntohs(inet->inet_sport) == port) {
++
++ spin_lock(&wavetcp_probe.lock);
++ if (wavetcp_probe_avail() > 1) {
++ struct wavetcp_log *p = wavetcp_probe.log + wavetcp_probe.head;
++ p->tstamp = ktime_get();
++
++ switch (sk->sk_family) {
++ case AF_INET:
++ wavetcp_probe_copy_fl_to_si4(inet, p->src.v4, s);
++ wavetcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);
++ break;
++ case AF_INET6:
++ memset(&p->src.v6, 0, sizeof(p->src.v6));
++ memset(&p->dst.v6, 0, sizeof(p->dst.v6));
++#if IS_ENABLED(CONFIG_IPV6)
++ p->src.v6.sin6_family = AF_INET6;
++ p->src.v6.sin6_port = inet->inet_sport;
++ p->src.v6.sin6_addr = inet6_sk(sk)->saddr;
++
++ p->dst.v6.sin6_family = AF_INET6;
++ p->dst.v6.sin6_port = inet->inet_dport;
++ p->dst.v6.sin6_addr = sk->sk_v6_daddr;
++#endif
++ break;
++ default:
++ BUG();
++ }
++
++ p->tx_timer = ca->tx_timer;
++ p->burst = ca->burst;
++ p->min_rtt = ca->min_rtt;
++ p->avg_rtt = ca->avg_rtt;
++ p->max_rtt = ca->max_rtt;
++
++ wavetcp_probe.head = (wavetcp_probe.head + 1) & (bufsize - 1);
++ }
++ spin_unlock (&wavetcp_probe.lock);
++
++ wake_up(&wavetcp_probe.wait);
++ }
++}
++
++/* The weight a is:
++ *
++ * a = (first_rtt - min_rtt) / first_rtt
++ *
++ */
++static u64 wavetcp_compute_weight(u32 first_rtt, u32 min_rtt)
++{
++ u64 diff = first_rtt - min_rtt;
++
++ diff = diff * AVG_UNIT;
++
++ return diff / first_rtt;
++}
++
++static ktime_t heuristic_ack_train_disp(struct wavetcp *ca,
++ const struct rate_sample *rs,
++ u32 burst)
++{
++ ktime_t ack_train_disp = ns_to_ktime(0);
++ ktime_t interval = ns_to_ktime(0);
++ ktime_t backup_first_ack = ns_to_ktime(0);
++
++ if (rs->interval_us <= 0) {
++ pr_debug("%u sport: %u [%s] WARNING is not possible "
++ "to heuristically calculate ack_train_disp, returning 0."
++ "Delivered %u, interval_us %li\n",
++ tcp_jiffies32, ca->sport, __func__,
++ rs->delivered, rs->interval_us);
++ return ack_train_disp;
++ }
++
++ interval = ns_to_ktime(rs->interval_us * NSEC_PER_USEC);
++ backup_first_ack = ns_to_ktime(ca->backup_first_ack_time_us * NSEC_PER_USEC);
++
++ /*
++ * The heuristic takes the RTT of the first ACK, the RTT of the
++ * latest ACK, and uses the difference as ack_train_disp.
++ *
++ * If the sample for the first and last ACK are the same (e.g.,
++ * one ACK per burst) we use as the latest option the value of
++ * interval_us (which is the RTT). However, this value is
++ * exponentially lowered each time we don't have any valid
++ * sample (i.e., we perform a division by 2, by 4, and so on).
++ * The increased transmission rate, if it exceeds the capacity
++ * of the bottleneck, will be compensated for by a higher
++ * delta_rtt, and so limited by the adjustment algorithm. This
++ * is a blind search, but we do not have any valid sample...
++ */
++ if (ktime_compare(interval, backup_first_ack) > 0) {
++ /* first heuristic */
++ ack_train_disp = ktime_sub(interval, backup_first_ack);
++ } else {
++ /* this branch avoids an overflow. However, reaching
++ * this point means that the ACK train is not aligned
++ * with the sent burst.
++ */
++ ack_train_disp = ktime_sub(backup_first_ack, interval);
++ }
++
++ if (ktime_is_null(ack_train_disp)) {
++ /* Blind search */
++ u32 blind_interval_us = rs->interval_us >> ca->heuristic_scale;
++ ++ca->heuristic_scale;
++ ack_train_disp = ns_to_ktime(blind_interval_us * NSEC_PER_USEC);
++ pr_debug("%u sport: %u [%s] we received one BIG ack."
++ " Doing an heuristic with scale %u, interval_us"
++ " %li us, and setting ack_train_disp to %lli us\n",
++ tcp_jiffies32, ca->sport, __func__,
++ ca->heuristic_scale, rs->interval_us,
++ ktime_to_us(ack_train_disp));
++ } else {
++ pr_debug("%u sport: %u [%s] we got the first ack with"
++ " interval %u us, the last (this) with interval %li us."
++ " Doing a substraction and setting ack_train_disp"
++ " to %lli us\n",
++ tcp_jiffies32, ca->sport, __func__,
++ ca->backup_first_ack_time_us, rs->interval_us,
++ ktime_to_us(ack_train_disp));
++ }
++
++ return ack_train_disp;
++}
++
++static ktime_t filter_ack_train_disp(const struct wavetcp *ca, u64 delta_rtt_us,
++ ktime_t measured_ack_t_disp)
++{
++ ktime_t filtered_ack_t_disp;
++ u64 alpha, left, right;
++
++ alpha = (delta_rtt_us * AVG_UNIT) / (beta_ms * USEC_PER_MSEC);
++ left = ((AVG_UNIT - alpha) * ktime_to_us(ca->previous_ack_t_disp)) / AVG_UNIT;
++ right = (alpha * ktime_to_us(measured_ack_t_disp)) / AVG_UNIT;
++
++ filtered_ack_t_disp = ns_to_ktime(((u32)left + (u32)right) * NSEC_PER_USEC);
++
++ pr_debug("%u sport: %u [%s] AVG_UNIT %i delta_rtt %llu beta %i alpha %llu "
++ "measured_ack_train_disp %lli us prv_ack_train_disp %lli us left %llu right %llu, final %lli\n",
++ tcp_jiffies32, ca->sport, __func__, AVG_UNIT, delta_rtt_us,
++ beta_ms, alpha, ktime_to_us(measured_ack_t_disp),
++ ktime_to_us(ca->previous_ack_t_disp),
++ left, right, ktime_to_us(filtered_ack_t_disp));
++
++ return filtered_ack_t_disp;
++
++}
++
++static ktime_t calculate_ack_train_disp(struct wavetcp *ca,
++ const struct rate_sample *rs,
++ u32 burst, u64 delta_rtt_us)
++{
++ ktime_t old_value_patd = ca->previous_ack_t_disp;
++ ktime_t ack_train_disp = ns_to_ktime(0);
++
++ if (!ktime_is_null(ca->first_ack_time))
++ ack_train_disp = ktime_sub(ktime_get(), ca->first_ack_time);
++
++ if (ktime_is_null(ca->previous_ack_t_disp) &&
++ ktime_is_null(ack_train_disp)) {
++ /* We received a cumulative ACK just after we sent the data, so
++ * the dispersion would be close to zero. Moreover, we don't
++ * have any valid sample from the past; in this case, we use
++ * a heuristic to calculate ack_train_disp.
++ */
++ return heuristic_ack_train_disp(ca, rs, burst);
++ }
++
++ /* resetting the heuristic scale because we have a real sample */
++ ca->heuristic_scale = 0;
++
++ if (ktime_is_null(ca->previous_ack_t_disp)) {
++ /* initialize the value */
++ ca->previous_ack_t_disp = ack_train_disp;
++ } else if (ktime_compare(ack_train_disp, ca->previous_ack_t_disp) > 0) {
++ /* filter the measured value */
++ return filter_ack_train_disp(ca, delta_rtt_us, ack_train_disp);
++ } else if (ktime_is_null(ack_train_disp)) {
++ /* Use the plain previous value */
++ ack_train_disp = ca->previous_ack_t_disp;
++ } else {
++ /* In all other cases, update the previous value */
++ ca->previous_ack_t_disp = ack_train_disp;
++ }
++
++ pr_debug("%u sport: %u [%s] previous_ack_t_disp %lli us, measured ack_train_disp %lli us\n",
++ tcp_jiffies32, ca->sport, __func__,
++ ktime_to_us(old_value_patd),
++ ktime_to_us(ack_train_disp));
++
++ return ack_train_disp;
++}
++
++static void calculate_avg_rtt(struct wavetcp *ca)
++{
++ /* Why the if?
++ *
++ * a = (first_rtt - min_rtt) / first_rtt = 1 - (min_rtt/first_rtt)
++ *
++ * avg_rtt_0 = (1 - a) * first_rtt
++ * = (1 - (1 - (min_rtt/first_rtt))) * first_rtt
++ * = first_rtt - (first_rtt - min_rtt)
++ * = min_rtt
++ *
++ *
++ * And what happens in the else branch? We first calculate a (scaled by
++ * AVG_UNIT), then perform the subtraction (1 - a) taking the scale into
++ * account, and at the end remove the scaling from the result.
++ *
++ * We divide the equation
++ *
++ * AvgRtt = a * AvgRtt + (1 - a) * Rtt
++ *
++ * into two properly scaled parts, left and right, and then sum the two
++ * parts to avoid a (possible) overflow.
++ */
++ if (ca->avg_rtt == 0) {
++ ca->avg_rtt = ca->min_rtt;
++ pr_debug("%u sport: %u [%s] calculated avg_rtt %u\n", tcp_jiffies32,
++ ca->sport, __func__, ca->avg_rtt);
++ } else if (ca->first_rtt > 0) {
++ u32 old_value = ca->avg_rtt;
++ u64 right;
++ u64 left;
++ u64 a;
++
++ a = wavetcp_compute_weight(ca->first_rtt, ca->min_rtt);
++
++ left = (a * ca->avg_rtt) / AVG_UNIT;
++ right = ((AVG_UNIT - a) * ca->first_rtt) / AVG_UNIT;
++
++ ca->avg_rtt = (u32)left + (u32)right;
++
++ pr_debug("%u sport: %u [%s] previous avg %u us, first_rtt %u us, "
++ "min %u us, a (shifted) %llu, calculated avg %u us\n",
++ tcp_jiffies32, ca->sport, __func__,
++ old_value, ca->first_rtt, ca->min_rtt, a, ca->avg_rtt);
++ } else {
++ pr_debug("%u sport: %u [%s] Can't calculate avg_rtt.\n",
++ tcp_jiffies32, ca->sport, __func__);
++ }
++}
++
++static u64 calculate_delta_rtt(struct wavetcp *ca)
++{
++ return ca->avg_rtt - ca->min_rtt;
++}
++
++static void wavetcp_round_terminated(struct sock *sk, const struct rate_sample *rs,
++ u32 burst)
++{
++ struct wavetcp *ca = inet_csk_ca(sk);
++ u64 delta_rtt_us;
++
++ /* If the round terminates without a sample of RTT, use the average */
++ if (ca->first_rtt == 0) {
++ ca->first_rtt = ca->avg_rtt;
++ pr_debug("%u sport: %u [%s] Using the average value for first_rtt %u\n",
++ tcp_jiffies32, ca->sport, __func__, ca->first_rtt);
++ }
++
++ calculate_avg_rtt(ca);
++
++ /* If we have to wait, let's wait */
++ if (ca->stab_factor > 0) {
++ --ca->stab_factor;
++ pr_debug("%u sport: %u [%s] reached burst %u, not applying (stab left: %u)\n",
++ tcp_jiffies32, ca->sport, __func__, burst,
++ ca->stab_factor);
++ return;
++ }
++
++ delta_rtt_us = calculate_delta_rtt(ca);
++
++ pr_debug("%u sport: %u [%s] reached burst %u, drtt %llu\n",
++ tcp_jiffies32, ca->sport, __func__, burst, delta_rtt_us);
++
++ /* delta_rtt_us is in us, beta_ms in ms */
++ if (delta_rtt_us > beta_ms * USEC_PER_MSEC)
++ wavetcp_adj_mode(ca, delta_rtt_us);
++ else
++ wavetcp_tracking_mode(sk, delta_rtt_us,
++ calculate_ack_train_disp(ca, rs, burst,
++ delta_rtt_us));
++}
++
++static void wavetcp_cong_control(struct sock *sk, const struct rate_sample *rs)
++{
++ struct wavetcp *ca = inet_csk_ca(sk);
++ struct wavetcp_burst_hist *tmp;
++ struct list_head *pos;
++
++ if (!test_flag(ca->flags, FLAG_INIT))
++ return;
++
++ if (ca->backup_first_ack_time_us == 0 && rs->interval_us > 0)
++ ca->backup_first_ack_time_us = rs->interval_us;
++
++ pos = ca->history->list.next;
++ tmp = list_entry(pos, struct wavetcp_burst_hist, list);
++
++ if (tmp->size == 0) {
++ /* No burst in memory. Most likely we sent some segments out of
++ * the allowed window (e.g., loss probe) */
++ pr_debug("%u sport: %u [%s] WARNING! empty burst\n",
++ tcp_jiffies32, ca->sport, __func__);
++ wavetcp_print_history(ca);
++ goto reset;
++ }
++
++ pr_debug("%u sport: %u [%s] prior_delivered %u, delivered %i, interval_us %li, "
++ "rtt_us %li, losses %i, ack_sack %u, prior_in_flight %u, is_app %i,"
++ " is_retrans %i\n", tcp_jiffies32, ca->sport, __func__,
++ rs->prior_delivered, rs->delivered, rs->interval_us, rs->rtt_us,
++ rs->losses, rs->acked_sacked, rs->prior_in_flight,
++ rs->is_app_limited, rs->is_retrans);
++
++ /* Train management.*/
++ ca->pkts_acked += rs->acked_sacked;
++
++ if (ca->pkts_acked < tmp->size)
++ return;
++
++ while (ca->pkts_acked >= tmp->size) {
++ /* Usually the burst end is also reflected in the rs->delivered
++ * variable. If this is not the case, and that variable is
++ * behind by just 1 segment, then do this experimental thing
++ * to re-align the burst with the rs->delivered variable.
++ * In the majority of cases, we went out of alignment because
++ * of a tail loss probe.
++ */
++ if (rs->delivered + 1 == tmp->size) {
++ pr_debug("%u sport: %u [%s] highly experimental:"
++ " ignore 1 pkt. pkts_acked %u, delivered %u,"
++ " burst %u\n", tcp_jiffies32, ca->sport, __func__,
++ ca->pkts_acked, rs->delivered, tmp->size);
++ ca->pkts_acked--;
++ return;
++ }
++ wavetcp_round_terminated(sk, rs, tmp->size);
++
++ BUG_ON(ca->pkts_acked < tmp->size);
++
++ ca->pkts_acked -= tmp->size;
++
++ /* Delete the burst from the history */
++ list_del(pos);
++ kmem_cache_free(ca->cache, tmp);
++
++ /* Take next burst */
++ pos = ca->history->list.next;
++ tmp = list_entry(pos, struct wavetcp_burst_hist, list);
++
++ /* If we cycle, inside wavetcp_round_terminated we will take the
++ * Linux path instead of the wave path; first_rtt will not be
++ * read, so don't waste a cycle setting it */
++ ca->first_ack_time = ktime_get();
++ ca->backup_first_ack_time_us = 0;
++ }
++
++reset:
++ /* Reset the variables needed for the beginning of the next round*/
++ ca->first_ack_time = ns_to_ktime(0);
++ ca->backup_first_ack_time_us = 0;
++ ca->first_rtt = 0;
++ pr_debug("%u sport: %u [%s] resetting RTT values for next round\n",
++ tcp_jiffies32, ca->sport, __func__);
++}
++
++static void wavetcp_acce(struct wavetcp *ca, s32 rtt_us, u32 pkts_acked)
++{
++ if (ktime_is_null(ca->first_ack_time)) {
++ ca->first_ack_time = ktime_get();
++ pr_debug("%u sport: %u [%s] first ack of the train\n",
++ tcp_jiffies32, ca->sport, __func__);
++ }
++
++ if (rtt_us <= 0)
++ return;
++
++ if (ca->first_rtt == 0) {
++ ca->first_rtt = rtt_us;
++
++ pr_debug("%u sport: %u [%s] first measurement rtt %i\n",
++ tcp_jiffies32, ca->sport, __func__,
++ ca->first_rtt);
++ }
++
++ /* Check the minimum rtt we have seen */
++ if (rtt_us < ca->min_rtt) {
++ ca->min_rtt = rtt_us;
++ pr_debug("%u sport: %u [%s] min rtt %u\n", tcp_jiffies32,
++ ca->sport, __func__, rtt_us);
++ }
++
++ if (rtt_us > ca->max_rtt)
++ ca->max_rtt = rtt_us;
++}
++
++/* Invoked each time we receive an ACK. Obviously, this function also gets
++ * called when we receive the SYN-ACK, but we ignore it thanks to the
++ * FLAG_INIT flag.
++ *
++ * We shrink the cwnd by the number of segments acked, because we don't want
++ * to send out segments before the timer has expired. Without doing this, we
++ * would end up with cwnd - in_flight > 0.
++ */
++static void wavetcp_acked(struct sock *sk, const struct ack_sample *sample)
++{
++ struct wavetcp *ca = inet_csk_ca(sk);
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ if (!test_flag(ca->flags, FLAG_INIT))
++ return;
++
++ /* We can divide the ACCE function into two parts: the first takes care of
++ * the RTT, and the second of the train management. Here we could have
++ * pkts_acked == 0, but with RTT values (because the underlying TCP can
++ * identify which segment has been ACKed through the SACK option). In any
++ * case, therefore, we enter wavetcp_acce. */
++ wavetcp_acce(ca, sample->rtt_us, sample->pkts_acked);
++
++ if (tp->snd_cwnd < sample->pkts_acked) {
++ /* We sent some scattered segments, so the burst segments and
++ * the ACKs we get are not aligned.
++ */
++ pr_debug("%u sport: %u [%s] delta_seg %i\n",
++ tcp_jiffies32, ca->sport, __func__,
++ ca->delta_segments);
++
++ ca->delta_segments += sample->pkts_acked - tp->snd_cwnd;
++ }
++
++ pr_debug("%u sport: %u [%s] pkts_acked %u, rtt_us %i, in_flight %u "
++ ", cwnd %u, seq ack %u, delta %i\n",
++ tcp_jiffies32, ca->sport, __func__, sample->pkts_acked,
++ sample->rtt_us, sample->in_flight, tp->snd_cwnd, tp->snd_una,
++ ca->delta_segments);
++
++ /* Brutally set the cwnd in order to not let segment out */
++ tp->snd_cwnd = tcp_packets_in_flight(tp);
++}
++
++/* The TCP informs us that the timer is expired (or has never been set). We can
++ * infer the latter from the FLAG_START flag: if it's false, don't increase the
++ * cwnd, because it is at its default value (init_burst) and we still have to
++ * transmit the first burst.
++ */
++static void wavetcp_timer_expired(struct sock *sk)
++{
++ struct wavetcp *ca = inet_csk_ca(sk);
++ struct tcp_sock *tp = tcp_sk(sk);
++ u32 current_burst = ca->burst;
++
++ if (!test_flag(ca->flags, FLAG_START) || !test_flag(ca->flags, FLAG_INIT)) {
++ pr_debug("%u sport: %u [%s] returning because of flags, leaving cwnd %u\n",
++ tcp_jiffies32, ca->sport, __func__, tp->snd_cwnd);
++ return;
++ }
++
++ pr_debug("%u sport: %u [%s] starting with delta %u current_burst %u\n",
++ tcp_jiffies32, ca->sport, __func__, ca->delta_segments,
++ current_burst);
++
++ if (ca->delta_segments < 0) {
++ /* In the previous round, we sent more than the allowed burst,
++ * so reduce the current burst.
++ */
++ BUG_ON(current_burst > ca->delta_segments);
++ current_burst += ca->delta_segments; /* please *reduce* */
++
++ /* Right now, we should send "current_burst" segments out */
++
++ if (tcp_packets_in_flight(tp) > tp->snd_cwnd) {
++ /* For some reason (e.g., a TCP loss probe)
++ * we sent something outside the allowed window.
++ * Add the amount of segments into the burst, in order
++ * to effectively send the previous "current_burst"
++ * segments, but without touching delta_segments.
++ */
++ u32 diff = tcp_packets_in_flight(tp) - tp->snd_cwnd;
++
++ current_burst += diff;
++ pr_debug("%u sport: %u [%s] adding %u to balance "
++ "segments sent out of window", tcp_jiffies32,
++ ca->sport, __func__, diff);
++ }
++ }
++
++ ca->delta_segments = current_burst;
++ pr_debug("%u sport: %u [%s] setting delta_seg %u current burst %u\n",
++ tcp_jiffies32, ca->sport, __func__,
++ ca->delta_segments, current_burst);
++
++ if (current_burst < min_burst) {
++ pr_debug("%u sport: %u [%s] WARNING !! not min_burst",
++ tcp_jiffies32, ca->sport, __func__);
++ ca->delta_segments += min_burst - current_burst;
++ current_burst = min_burst;
++ }
++
++ tp->snd_cwnd += current_burst;
++ set_flag(&ca->flags, FLAG_SAVE);
++
++ pr_debug("%u sport: %u [%s], increased window of %u segments, "
++ "total %u, delta %i, in_flight %u\n",
++ tcp_jiffies32, ca->sport, __func__, ca->burst,
++ tp->snd_cwnd, ca->delta_segments, tcp_packets_in_flight(tp));
++
++ if (tp->snd_cwnd - tcp_packets_in_flight(tp) > current_burst) {
++ pr_debug("%u sport: %u [%s] WARNING! "
++ " cwnd %u, in_flight %u, current burst %u\n",
++ tcp_jiffies32, ca->sport, __func__,
++ tp->snd_cwnd, tcp_packets_in_flight(tp),
++ current_burst);
++ }
++}
++
++static u64 wavetcp_get_timer(struct sock *sk)
++{
++ struct wavetcp *ca = inet_csk_ca(sk);
++ u64 timer;
++
++ BUG_ON(!test_flag(ca->flags, FLAG_INIT));
++
++ timer = min_t(u64,
++ ca->tx_timer * NSEC_PER_USEC,
++ init_timer_ms * NSEC_PER_MSEC);
++
++ pr_debug("%u sport: %u [%s] returning timer of %llu ns\n",
++ tcp_jiffies32, ca->sport, __func__, timer);
++
++ return timer;
++}
++
++static void wavetcp_segment_sent(struct sock *sk, u32 sent)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct wavetcp *ca = inet_csk_ca(sk);
++
++ if (!test_flag(ca->flags, FLAG_START)) {
++ pr_debug ("%u sport: %u [%s] !START\n",
++ tcp_jiffies32, ca->sport, __func__);
++ return;
++ }
++
++ /* Don't save if we sent less than min_burst. Helpful at the
++ * beginning, when there is only one data segment ready */
++ if (test_flag(ca->flags, FLAG_SAVE) && sent > min_burst) {
++ wavetcp_insert_burst(ca, sent);
++ clear_flag(&ca->flags, FLAG_SAVE);
++ } else {
++ pr_debug("%u sport: %u [%s] not saving burst, sent %u\n",
++ tcp_jiffies32, ca->sport, __func__, sent);
++ }
++
++ if (sent > ca->burst) {
++ pr_debug("%u sport: %u [%s] WARNING! sent %u, burst %u"
++ " cwnd %u delta_seg %i\n, TSO very probable",
++ tcp_jiffies32, ca->sport, __func__, sent,
++ ca->burst, tp->snd_cwnd, ca->delta_segments);
++ }
++
++ ca->delta_segments -= sent;
++
++ if (ca->delta_segments >= 0 &&
++ ca->burst > sent &&
++ tcp_packets_in_flight(tp) <= tp->snd_cwnd) {
++ /* Reduce the cwnd accordingly, because we didn't send enough
++ * to cover it (we are probably app-limited) */
++ u32 diff = ca->burst - sent;
++
++ if (tp->snd_cwnd >= diff)
++ tp->snd_cwnd -= diff;
++ else
++ tp->snd_cwnd = 0;
++ pr_debug("%u sport: %u [%s] reducing cwnd by %u, value %u\n",
++ tcp_jiffies32, ca->sport, __func__,
++ ca->burst - sent, tp->snd_cwnd);
++ }
++}
++
++static size_t wavetcp_get_info(struct sock *sk, u32 ext, int *attr,
++ union tcp_cc_info *info)
++{
++ pr_debug("%u [%s] ext=%u", tcp_jiffies32, __func__, ext);
++
++ if (ext & (1 << (INET_DIAG_WAVEINFO - 1))) {
++ struct wavetcp *ca = inet_csk_ca(sk);
++
++ memset(&info->wave, 0, sizeof(info->wave));
++ info->wave.tx_timer = ca->tx_timer;
++ info->wave.burst = ca->burst;
++ info->wave.previous_ack_t_disp = ca->previous_ack_t_disp;
++ info->wave.min_rtt = ca->min_rtt;
++ info->wave.avg_rtt = ca->avg_rtt;
++ info->wave.max_rtt = ca->max_rtt;
++ *attr = INET_DIAG_WAVEINFO;
++ return (sizeof(info->wave));
++ }
++ return 0;
++}
++
++static void wavetcp_no_data(struct sock *sk)
++{
++ pr_debug("%u [%s]\n", tcp_jiffies32, __func__);
++}
++
++static u32 wavetcp_sndbuf_expand(struct sock *sk)
++{
++ return 10;
++}
++
++static u32 wavetcp_get_segs_per_round(struct sock *sk)
++{
++ struct wavetcp *ca = inet_csk_ca(sk);
++
++ return ca->burst;
++}
++
++static struct tcp_congestion_ops wave_cong_tcp __read_mostly = {
++ .init = wavetcp_init,
++ .get_info = wavetcp_get_info,
++ .release = wavetcp_release,
++ .ssthresh = wavetcp_recalc_ssthresh,
++/* .cong_avoid = wavetcp_cong_avoid, */
++ .cong_control = wavetcp_cong_control,
++ .set_state = wavetcp_state,
++ .undo_cwnd = wavetcp_undo_cwnd,
++ .cwnd_event = wavetcp_cwnd_event,
++ .pkts_acked = wavetcp_acked,
++ .sndbuf_expand = wavetcp_sndbuf_expand,
++ .owner = THIS_MODULE,
++ .name = "wave",
++ .get_pacing_time = wavetcp_get_timer,
++ .pacing_timer_expired = wavetcp_timer_expired,
++ .no_data_to_transmit = wavetcp_no_data,
++ .get_segs_per_round = wavetcp_get_segs_per_round,
++ .segments_sent = wavetcp_segment_sent,
++};
++
++
++static int wavetcp_log_open(struct inode *inode, struct file *file)
++{
++ /* Reset (empty) log */
++ spin_lock_bh(&wavetcp_probe.lock);
++ wavetcp_probe.head = wavetcp_probe.tail = 0;
++ wavetcp_probe.start = ktime_get();
++ spin_unlock_bh(&wavetcp_probe.lock);
++
++ return 0;
++}
++
++static int wavetcp_log_sprint(char *tbuf, int n)
++{
++ const struct wavetcp_log *p = wavetcp_probe.log + wavetcp_probe.tail;
++ struct timespec64 ts = ktime_to_timespec64(ktime_sub(p->tstamp,
++ wavetcp_probe.start));
++
++ return scnprintf(tbuf, n,
++ "%lu.%09lu %pISpc %pISpc %u %u %u %u %u\n",
++ (unsigned long)ts.tv_sec,
++ (unsigned long)ts.tv_nsec,
++ &p->src, &p->dst, p->tx_timer, p->burst,
++ p->min_rtt, p->avg_rtt, p->max_rtt);
++}
++
++static ssize_t wavetcp_log_read(struct file *file, char __user *buf,
++ size_t len, loff_t *ppos)
++{
++ int error = 0;
++ size_t cnt = 0;
++
++ if (!buf)
++ return -EINVAL;
++
++ while (cnt < len) {
++ char tbuf[256];
++ int width;
++
++ /* Wait for data in buffer */
++ error = wait_event_interruptible(wavetcp_probe.wait,
++ wavetcp_probe_used() > 0);
++ if (error)
++ break;
++
++ spin_lock_bh(&wavetcp_probe.lock);
++ if (wavetcp_probe.head == wavetcp_probe.tail) {
++ /* multiple readers race? */
++ spin_unlock_bh(&wavetcp_probe.lock);
++ continue;
++ }
++
++ width = wavetcp_log_sprint(tbuf, sizeof(tbuf));
++
++ if (cnt + width < len)
++ wavetcp_probe.tail = (wavetcp_probe.tail + 1) & (bufsize - 1);
++
++ spin_unlock_bh(&wavetcp_probe.lock);
++
++ /* if record greater than space available
++ return partial buffer (so far) */
++ if (cnt + width >= len)
++ break;
++
++ if (copy_to_user(buf + cnt, tbuf, width))
++ return -EFAULT;
++ cnt += width;
++ }
++
++ return cnt == 0 ? error : cnt;
++}
++
++static const struct file_operations tcpwave_fops = {
++ .owner = THIS_MODULE,
++ .open = wavetcp_log_open,
++ .read = wavetcp_log_read,
++ .llseek = noop_llseek,
++};
++
++static int __init wavetcp_register(void)
++{
++ BUILD_BUG_ON(sizeof(struct wavetcp) > ICSK_CA_PRIV_SIZE);
++
++ if (!enable_log)
++ return tcp_register_congestion_control(&wave_cong_tcp);
++
++ /* wave log initialization */
++
++ init_waitqueue_head(&wavetcp_probe.wait);
++ spin_lock_init(&wavetcp_probe.lock);
++
++ if (bufsize == 0)
++ return -EINVAL;
++
++ bufsize = roundup_pow_of_two(bufsize);
++ wavetcp_probe.log = kcalloc(bufsize, sizeof(struct wavetcp_log), GFP_KERNEL);
++
++ if (!wavetcp_probe.log)
++ goto leave;
++
++ if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpwave_fops))
++ goto freemem;
++
++ return tcp_register_congestion_control(&wave_cong_tcp);
++
++freemem:
++ kfree(wavetcp_probe.log);
++leave:
++ return -ENOMEM;
++}
++
++static void __exit wavetcp_unregister(void)
++{
++ if (enable_log) {
++ remove_proc_entry(procname, init_net.proc_net);
++ kfree(wavetcp_probe.log);
++ }
++
++ tcp_unregister_congestion_control(&wave_cong_tcp);
++}
++
++module_init(wavetcp_register);
++module_exit(wavetcp_unregister);
++
++MODULE_AUTHOR("Natale Patriciello");
++MODULE_LICENSE("GPL");
++MODULE_DESCRIPTION("WAVE TCP");
++MODULE_VERSION("0.1");
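
For completeness, a userspace sketch of opting a socket into the module once it is loaded; TCP_CONGESTION is the standard socket option for this, and "wave" matches the .name field registered above (system-wide selection goes through the Kconfig default or net.ipv4.tcp_congestion_control):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>

/* Switch an existing TCP socket to TCP Wave; returns 0 on success. */
static int use_tcp_wave(int fd)
{
	static const char name[] = "wave";

	return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
			  name, strlen(name));
}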