/*
 * Copyright (c) 2013-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>

#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>

#if INET6
#include <netinet/ip6.h>
#endif /* INET6 */

#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_cc.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_seq.h>
#include <kern/task.h>
#include <libkern/OSAtomic.h>

static int tcp_cubic_init(struct tcpcb *tp);
static int tcp_cubic_cleanup(struct tcpcb *tp);
static void tcp_cubic_cwnd_init_or_reset(struct tcpcb *tp);
static void tcp_cubic_congestion_avd(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_ack_rcvd(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_pre_fr(struct tcpcb *tp);
static void tcp_cubic_post_fr(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_after_timeout(struct tcpcb *tp);
static int tcp_cubic_delay_ack(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_switch_cc(struct tcpcb *tp, uint16_t old_cc_index);
static uint32_t tcp_cubic_update(struct tcpcb *tp, u_int32_t rtt);
static uint32_t tcp_cubic_tcpwin(struct tcpcb *tp, struct tcphdr *th);
static inline void tcp_cubic_clear_state(struct tcpcb *tp);

extern float cbrtf(float x);

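/*
 * Callback table for the Cubic congestion-control algorithm. The TCP
 * congestion-control framework invokes each entry at the corresponding
 * connection event (init/cleanup, ack processing, entering and leaving
 * fast recovery, timeouts, and algorithm switches).
 */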
struct tcp_cc_algo tcp_cc_cubic = {
	.name = "cubic",
	.init = tcp_cubic_init,
	.cleanup = tcp_cubic_cleanup,
	.cwnd_init = tcp_cubic_cwnd_init_or_reset,
	.congestion_avd = tcp_cubic_congestion_avd,
	.ack_rcvd = tcp_cubic_ack_rcvd,
	.pre_fr = tcp_cubic_pre_fr,
	.post_fr = tcp_cubic_post_fr,
	.after_idle = tcp_cubic_cwnd_init_or_reset,
	.after_timeout = tcp_cubic_after_timeout,
	.delay_ack = tcp_cubic_delay_ack,
	.switch_to = tcp_cubic_switch_cc
};

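/*
 * Cubic constants: on packet loss the congestion window is reduced by
 * tcp_cubic_backoff (20%), i.e. beta = 0.8 of the window is retained.
 * tcp_cubic_coeff is the scaling constant C of the cubic window
 * equation, and tcp_cubic_fast_convergence_factor is the additional
 * reduction applied to cub_last_max under fast convergence (see
 * tcp_cubic_pre_fr()).
 */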
const float tcp_cubic_backoff = 0.2; /* multiplicative decrease factor */
const float tcp_cubic_coeff = 0.4;
const float tcp_cubic_fast_convergence_factor = 0.875;

SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_tcp_friendliness, CTLFLAG_RW | CTLFLAG_LOCKED,
    static int, tcp_cubic_tcp_friendliness, 0, "Enable TCP friendliness");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_fast_convergence, CTLFLAG_RW | CTLFLAG_LOCKED,
    static int, tcp_cubic_fast_convergence, 0, "Enable fast convergence");

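/*
 * When non-zero, tcp_cubic_use_minrtt acts as a lower bound on the rtt
 * term that tcp_cubic_update() adds to the elapsed epoch time (see the
 * max() there), expressed in the same units as the rtt.
 */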
SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_use_minrtt, CTLFLAG_RW | CTLFLAG_LOCKED,
    static int, tcp_cubic_use_minrtt, 0, "use a min of 5 sec rtt");

static int tcp_cubic_init(struct tcpcb *tp)
{
	OSIncrementAtomic((volatile SInt32 *)&tcp_cc_cubic.num_sockets);

	VERIFY(tp->t_ccstate != NULL);
	tcp_cubic_clear_state(tp);
	return (0);
}

static int tcp_cubic_cleanup(struct tcpcb *tp)
{
#pragma unused(tp)
	OSDecrementAtomic((volatile SInt32 *)&tcp_cc_cubic.num_sockets);
	return (0);
}

/*
 * Initialize the congestion window at the beginning of a connection or
 * after idle time.
 */
static void tcp_cubic_cwnd_init_or_reset(struct tcpcb *tp)
{
	VERIFY(tp->t_ccstate != NULL);

	tcp_cubic_clear_state(tp);
	tcp_cc_cwnd_init_or_reset(tp);
	tp->t_pipeack = 0;
	tcp_clear_pipeack_state(tp);

	/* Start counting bytes for RFC 3465 again */
	tp->t_bytes_acked = 0;

	/*
	 * The slow-start threshold could get initialized to a lower value
	 * when there is a cached value in the route metrics. In that case,
	 * the connection can enter congestion avoidance without any packet
	 * loss and Cubic will enter steady state too early. It is better
	 * to always probe to find the initial slow-start threshold.
	 */
	if (tp->t_inpcb->inp_stat->txbytes <= TCP_CC_CWND_INIT_BYTES &&
	    tp->snd_ssthresh < (TCP_MAXWIN << TCP_MAX_WINSHIFT))
		tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;

	/* Initialize cubic last max to the same value as ssthresh */
	tp->t_ccstate->cub_last_max = tp->snd_ssthresh;
}

/*
 * Compute the target congestion window for the next RTT according to
 * the cubic equation when an ack is received:
 *
 *	W(t) = C(t-K)^3 + W(last_max)
 */
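/*
 * Illustrative numbers (not from the source): if the window backed off
 * to 40 segments below cub_last_max, then with C = 0.4,
 *
 *	K = cbrt(40 / 0.4) = cbrt(100) ~= 4.64 seconds,
 *
 * so the window is projected to regain the previous maximum about
 * 4.6 seconds into the epoch.
 */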
static uint32_t
tcp_cubic_update(struct tcpcb *tp, u_int32_t rtt)
{
	float K, var;
	u_int32_t elapsed_time, win;

	win = min(tp->snd_cwnd, tp->snd_wnd);
	if (tp->t_ccstate->cub_last_max == 0)
		tp->t_ccstate->cub_last_max = tp->snd_ssthresh;

	if (tp->t_ccstate->cub_epoch_start == 0) {
		/*
		 * This is the beginning of a new epoch, initialize some of
		 * the variables that we need to use for computing the
		 * congestion window later.
		 */
		tp->t_ccstate->cub_epoch_start = tcp_now;
		if (tp->t_ccstate->cub_epoch_start == 0)
			tp->t_ccstate->cub_epoch_start = 1;
		if (win < tp->t_ccstate->cub_last_max) {

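			/*
			 * The cube-root computation below uses floating
			 * point; this assertion appears to guard against
			 * running it outside the kernel task's context.
			 */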
			VERIFY(current_task() == kernel_task);

			/*
			 * Compute the cubic epoch period; this is the time
			 * period that the window will take to increase to
			 * last_max again after backoff due to loss.
			 */
			K = (tp->t_ccstate->cub_last_max - win)
			    / tp->t_maxseg / tcp_cubic_coeff;
			K = cbrtf(K);
			tp->t_ccstate->cub_epoch_period = K * TCP_RETRANSHZ;
			/* Origin point */
			tp->t_ccstate->cub_origin_point =
			    tp->t_ccstate->cub_last_max;
		} else {
			tp->t_ccstate->cub_epoch_period = 0;
			tp->t_ccstate->cub_origin_point = win;
		}
		tp->t_ccstate->cub_target_win = 0;
	}

	VERIFY(tp->t_ccstate->cub_origin_point > 0);
	/*
	 * Compute the target window for the next RTT using the smoothed RTT
	 * as an estimate for the next RTT.
	 */
	elapsed_time = timer_diff(tcp_now, 0,
	    tp->t_ccstate->cub_epoch_start, 0);

	if (tcp_cubic_use_minrtt)
		elapsed_time += max(tcp_cubic_use_minrtt, rtt);
	else
		elapsed_time += rtt;
	var = (elapsed_time - tp->t_ccstate->cub_epoch_period) / TCP_RETRANSHZ;
	var = var * var * var * (tcp_cubic_coeff * tp->t_maxseg);

	tp->t_ccstate->cub_target_win =
	    (u_int32_t)(tp->t_ccstate->cub_origin_point + var);
	return (tp->t_ccstate->cub_target_win);
}

/*
 * Standard TCP utilizes bandwidth well on low-RTT, low-BDP connections
 * even when there is some packet loss. Enabling TCP mode helps Cubic
 * achieve that level of utilization.
 *
 * But if the path has a bottleneck link with a fixed-size queue and
 * fixed bandwidth, Cubic's steady-state behavior helps to reduce packet
 * loss at that link. Using the average and mean absolute deviation of
 * W(lastmax), we try to detect whether the congestion window is close to
 * the bottleneck bandwidth. In that case, disabling TCP mode helps to
 * minimize packet loss at that link.
 *
 * Disable TCP mode if W(lastmax) (the window at which the previous packet
 * loss happened) is within a small range of the computed average.
 */
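/*
 * In other words: TCP mode stays enabled for non-realtime flows whose
 * mean absolute deviation of W(lastmax) exceeds two segments, since a
 * widely scattered loss point suggests the flow is not parked at a
 * fixed-size bottleneck queue.
 */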
#define	TCP_CUBIC_ENABLE_TCPMODE(_tp_) \
	((!soissrcrealtime((_tp_)->t_inpcb->inp_socket) && \
	(_tp_)->t_ccstate->cub_mean_dev > ((_tp_)->t_maxseg << 1)) ? 1 : 0)

/*
 * Compute the window growth as if standard TCP (AIMD) congestion
 * avoidance were used, with an additive increase of one packet per RTT.
 *
 * The TCP window at time t can be estimated with the following equation,
 * where beta is the fraction of the window retained after a backoff
 * (1 - tcp_cubic_backoff = 0.8):
 *
 *	W(t) <- Wmax * beta + 3 * ((1 - beta)/(1 + beta)) * t/RTT
 */
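/*
 * The implementation below approximates this by growing cub_tcp_win by
 * one segment for every window's worth of bytes acknowledged, i.e. an
 * additive increase of roughly one MSS per RTT.
 */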
static uint32_t
tcp_cubic_tcpwin(struct tcpcb *tp, struct tcphdr *th)
{
	if (tp->t_ccstate->cub_tcp_win == 0) {
		tp->t_ccstate->cub_tcp_win = min(tp->snd_cwnd, tp->snd_wnd);
		tp->t_ccstate->cub_tcp_bytes_acked = 0;
	} else {
		tp->t_ccstate->cub_tcp_bytes_acked += BYTES_ACKED(th, tp);
		if (tp->t_ccstate->cub_tcp_bytes_acked >=
		    tp->t_ccstate->cub_tcp_win) {
			tp->t_ccstate->cub_tcp_bytes_acked -=
			    tp->t_ccstate->cub_tcp_win;
			tp->t_ccstate->cub_tcp_win += tp->t_maxseg;
		}
	}
	return (tp->t_ccstate->cub_tcp_win);
}

/*
 * Handle an in-sequence ack during the congestion avoidance phase.
 */
static void
tcp_cubic_congestion_avd(struct tcpcb *tp, struct tcphdr *th)
{
	u_int32_t cubic_target_win, tcp_win, rtt;

	/* Do not increase the congestion window in the non-validated phase */
	if (tcp_cc_is_cwnd_nonvalidated(tp) != 0)
		return;

	tp->t_bytes_acked += BYTES_ACKED(th, tp);

	rtt = get_base_rtt(tp);
	/*
	 * First compute the cubic window. If the cubic variables are not
	 * initialized (after coming out of recovery), this call will
	 * initialize them.
	 */
	cubic_target_win = tcp_cubic_update(tp, rtt);

	/* Compute the TCP window if a multiplicative decrease of 0.2 is used */
	tcp_win = tcp_cubic_tcpwin(tp, th);

	if (tp->snd_cwnd < tcp_win &&
	    (tcp_cubic_tcp_friendliness == 1 ||
	    TCP_CUBIC_ENABLE_TCPMODE(tp))) {
		/* this connection is in the TCP-friendly region */
		if (tp->t_bytes_acked >= tp->snd_cwnd) {
			tp->t_bytes_acked -= tp->snd_cwnd;
			tp->snd_cwnd = min(tcp_win, TCP_MAXWIN << tp->snd_scale);
		}
	} else {
		if (cubic_target_win > tp->snd_cwnd) {
			/*
			 * The target window is computed for the next RTT.
			 * To reach this value, cwnd will have to be updated
			 * one segment at a time. Compute how many bytes
			 * need to be acknowledged before we can increase
			 * the cwnd by one segment.
			 */
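			/*
			 * Illustrative numbers: with cwnd = 100*MSS and a
			 * target of 110*MSS, incr_win = 100*MSS*MSS / (10*MSS)
			 * = 10*MSS, so cwnd gains one segment per ten
			 * segments acknowledged and reaches the target in
			 * roughly one RTT.
			 */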
			u_int64_t incr_win;
			incr_win = tp->snd_cwnd * tp->t_maxseg;
			incr_win /= (cubic_target_win - tp->snd_cwnd);
			if (incr_win > 0 &&
			    tp->t_bytes_acked >= incr_win) {
				tp->t_bytes_acked -= incr_win;
				tp->snd_cwnd =
				    min((tp->snd_cwnd + tp->t_maxseg),
				    TCP_MAXWIN << tp->snd_scale);
			}
		}
	}
}

static void
tcp_cubic_ack_rcvd(struct tcpcb *tp, struct tcphdr *th)
{
	/* Do not increase the congestion window in the non-validated phase */
	if (tcp_cc_is_cwnd_nonvalidated(tp) != 0)
		return;

	if (tp->snd_cwnd >= tp->snd_ssthresh) {
		/* Congestion avoidance phase */
		tcp_cubic_congestion_avd(tp, th);
	} else {
		/*
		 * Use 2*SMSS as the limit on the increment, as suggested
		 * by RFC 3465 section 2.3.
		 */
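		/*
		 * For example, a single ack that newly covers ten segments
		 * grows cwnd by at most 2*SMSS when rfc3465_lim2 is in
		 * effect, and by at most one SMSS otherwise.
		 */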
		uint32_t acked, abc_lim, incr;

		acked = BYTES_ACKED(th, tp);
		abc_lim = (tcp_do_rfc3465_lim2 &&
		    tp->snd_nxt == tp->snd_max) ?
		    2 * tp->t_maxseg : tp->t_maxseg;
		incr = min(acked, abc_lim);

		tp->snd_cwnd += incr;
		tp->snd_cwnd = min(tp->snd_cwnd,
		    TCP_MAXWIN << tp->snd_scale);
	}
}

static void
tcp_cubic_pre_fr(struct tcpcb *tp)
{
	u_int32_t win, avg;
	int32_t dev;
	tp->t_ccstate->cub_epoch_start = 0;
	tp->t_ccstate->cub_tcp_win = 0;
	tp->t_ccstate->cub_target_win = 0;
	tp->t_ccstate->cub_tcp_bytes_acked = 0;

	win = min(tp->snd_cwnd, tp->snd_wnd);
	if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
		tp->t_lossflightsize = tp->snd_max - tp->snd_una;
		win = (max(tp->t_pipeack, tp->t_lossflightsize)) >> 1;
	} else {
		tp->t_lossflightsize = 0;
	}
	/*
	 * Note the congestion window at which packet loss occurred as
	 * cub_last_max.
	 *
	 * If the congestion window is less than the last max window when
	 * loss occurred, it indicates that capacity available in the
	 * network has gone down. This can happen if a new flow has started
	 * and it is capturing some of the bandwidth. To reach convergence
	 * quickly, back off a little more. Disable fast convergence to
	 * disable this behavior.
	 */
	if (win < tp->t_ccstate->cub_last_max &&
	    tcp_cubic_fast_convergence == 1)
		tp->t_ccstate->cub_last_max = (u_int32_t)(win *
		    tcp_cubic_fast_convergence_factor);
	else
		tp->t_ccstate->cub_last_max = win;

	if (tp->t_ccstate->cub_last_max == 0) {
		/*
		 * If last_max is zero because snd_wnd is zero or for
		 * any other reason, initialize it to the amount of data
		 * in flight.
		 */
		tp->t_ccstate->cub_last_max = tp->snd_max - tp->snd_una;
	}

	/*
	 * Compute the average and mean absolute deviation of the
	 * window at which packet loss occurred.
	 */
	if (tp->t_ccstate->cub_avg_lastmax == 0) {
		tp->t_ccstate->cub_avg_lastmax = tp->t_ccstate->cub_last_max;
	} else {
		/*
		 * The average is computed by taking 63 parts of
		 * history and one part of the most recent value.
		 */
		avg = tp->t_ccstate->cub_avg_lastmax;
		avg = (avg << 6) - avg;
		tp->t_ccstate->cub_avg_lastmax =
		    (avg + tp->t_ccstate->cub_last_max) >> 6;
	}
	/* Calculate the deviation from the average */
	dev = tp->t_ccstate->cub_avg_lastmax - tp->t_ccstate->cub_last_max;

	/* Take the absolute value */
	if (dev < 0)
		dev = -dev;

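	/*
	 * cub_mean_dev is a smoothed estimate with a gain of 1/16:
	 * mean_dev <- (15 * mean_dev + |dev|) / 16.
	 */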
	if (tp->t_ccstate->cub_mean_dev == 0) {
		tp->t_ccstate->cub_mean_dev = dev;
	} else {
		dev = dev + ((tp->t_ccstate->cub_mean_dev << 4)
		    - tp->t_ccstate->cub_mean_dev);
		tp->t_ccstate->cub_mean_dev = dev >> 4;
	}

	/* Back off the congestion window by the tcp_cubic_backoff factor */
	win = (u_int32_t)(win - (win * tcp_cubic_backoff));
	win = (win / tp->t_maxseg);
	if (win < 2)
		win = 2;
	tp->snd_ssthresh = win * tp->t_maxseg;
	tcp_cc_resize_sndbuf(tp);
}

static void
tcp_cubic_post_fr(struct tcpcb *tp, struct tcphdr *th)
{
	uint32_t flight_size = 0;

	if (SEQ_LEQ(th->th_ack, tp->snd_max))
		flight_size = tp->snd_max - th->th_ack;

	if (SACK_ENABLED(tp) && tp->t_lossflightsize > 0) {
		u_int32_t total_rxt_size = 0, ncwnd;
		/*
		 * When SACK is enabled, the number of retransmitted bytes
		 * can be counted more accurately.
		 */
		total_rxt_size = tcp_rxtseg_total_size(tp);
		ncwnd = max(tp->t_pipeack, tp->t_lossflightsize);
		if (total_rxt_size <= ncwnd) {
			ncwnd = ncwnd - total_rxt_size;
		}

		/*
		 * To avoid sending a large burst at the end of recovery,
		 * set a max limit on ncwnd.
		 */
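		/*
		 * With the cap at 64 segments followed by the halving
		 * below, ncwnd contributes at most 32 segments here.
		 */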
		ncwnd = min(ncwnd, (tp->t_maxseg << 6));
		ncwnd = ncwnd >> 1;
		flight_size = max(ncwnd, flight_size);
	}
	/*
	 * Complete ack. The current window was inflated for fast recovery.
	 * It has to be deflated post recovery.
	 *
	 * Window inflation should have left us with approximately
	 * snd_ssthresh outstanding data. If the flight size is zero or one
	 * segment, make the congestion window at least as big as 2 segments
	 * to avoid delayed acknowledgements. This is according to RFC 6582.
	 */
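	/*
	 * For example, if nothing is outstanding (flight_size == 0), cwnd
	 * is set to 2 * t_maxseg rather than collapsing to zero.
	 */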
	if (flight_size < tp->snd_ssthresh)
		tp->snd_cwnd = max(flight_size, tp->t_maxseg)
		    + tp->t_maxseg;
	else
		tp->snd_cwnd = tp->snd_ssthresh;
	tp->t_ccstate->cub_tcp_win = 0;
	tp->t_ccstate->cub_target_win = 0;
	tp->t_ccstate->cub_tcp_bytes_acked = 0;
}

static void
tcp_cubic_after_timeout(struct tcpcb *tp)
{
	VERIFY(tp->t_ccstate != NULL);

	/*
	 * Avoid adjusting the congestion window due to SYN retransmissions.
	 * If more than one byte (i.e. more than just the SYN) is
	 * outstanding, the window still needs to be adjusted.
	 */
	if (tp->t_state < TCPS_ESTABLISHED &&
	    ((int)(tp->snd_max - tp->snd_una) <= 1))
		return;

	if (!IN_FASTRECOVERY(tp)) {
		tcp_cubic_clear_state(tp);
		tcp_cubic_pre_fr(tp);
	}

	/*
	 * Close the congestion window down to one segment as a retransmit
	 * timeout might indicate severe congestion.
	 */
	tp->snd_cwnd = tp->t_maxseg;
}

static int
tcp_cubic_delay_ack(struct tcpcb *tp, struct tcphdr *th)
{
	return (tcp_cc_delay_ack(tp, th));
}

/*
 * When switching from a different congestion-control algorithm, it is
 * better for Cubic to start fresh. The state required for the Cubic
 * calculation might be stale and might not represent the current state
 * of the network. Starting as a new connection lets it probe and learn
 * the existing network conditions.
 */
static void
tcp_cubic_switch_cc(struct tcpcb *tp, uint16_t old_cc_index)
{
#pragma unused(old_cc_index)
	tcp_cubic_cwnd_init_or_reset(tp);

	OSIncrementAtomic((volatile SInt32 *)&tcp_cc_cubic.num_sockets);
}

static inline void tcp_cubic_clear_state(struct tcpcb *tp)
{
	tp->t_ccstate->cub_last_max = 0;
	tp->t_ccstate->cub_epoch_start = 0;
	tp->t_ccstate->cub_origin_point = 0;
	tp->t_ccstate->cub_tcp_win = 0;
	tp->t_ccstate->cub_tcp_bytes_acked = 0;
	tp->t_ccstate->cub_epoch_period = 0;
	tp->t_ccstate->cub_target_win = 0;
}