/*
 * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
 * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.10 2001/07/07 04:30:38 silby Exp $
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#define _IP_VHL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>

#include <net/route.h>
#include <net/ntstat.h>
#include <net/if_var.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/dlil.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/in_tclass.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <mach/sdt.h>
#if INET6
#include <netinet6/in6_pcb.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_cache.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_cc.h>
#if TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <sys/kdebug.h>

#if IPSEC
#include <netinet6/ipsec.h>
#endif /* IPSEC */

#if CONFIG_MACF_NET
#include <security/mac_framework.h>
#endif /* CONFIG_MACF_NET */

#include <netinet/lro_ext.h>
#if MPTCP
#include <netinet/mptcp_var.h>
#include <netinet/mptcp.h>
#include <netinet/mptcp_opt.h>
#endif

#include <corecrypto/ccaes.h>

#define DBG_LAYER_BEG		NETDBG_CODE(DBG_NETTCP, 1)
#define DBG_LAYER_END		NETDBG_CODE(DBG_NETTCP, 3)
#define DBG_FNC_TCP_OUTPUT	NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1)

SYSCTL_SKMEM_TCP_INT(OID_AUTO, path_mtu_discovery,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, path_mtu_discovery, 1,
    "Enable Path MTU Discovery");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, slowstart_flightsize,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, ss_fltsz, 1,
    "Slow start flight size");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, local_slowstart_flightsize,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, ss_fltsz_local, 8,
    "Slow start flight size for local networks");

int tcp_do_tso = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_do_tso, 0, "Enable TCP Segmentation Offload");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, ecn_setup_percentage,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_ecn_setup_percentage, 100,
    "Max ECN setup percentage");

static int
sysctl_change_ecn_setting SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
    int i, err = 0, changed = 0;
    struct ifnet *ifp;

    err = sysctl_io_number(req, tcp_ecn_outbound, sizeof(int32_t),
        &i, &changed);
    if (err != 0 || req->newptr == USER_ADDR_NULL)
        return (err);

    if (changed) {
        if ((tcp_ecn_outbound == 0 || tcp_ecn_outbound == 1) &&
            (i == 0 || i == 1)) {
            tcp_ecn_outbound = i;
            SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_initiate_out,
                tcp_ecn_outbound);
            return (err);
        }
        if (tcp_ecn_outbound == 2 && (i == 0 || i == 1)) {
            /*
             * Reset ECN enable flags on non-cellular
             * interfaces so that the system default will
             * take over.
             */
            ifnet_head_lock_shared();
            TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
                if (!IFNET_IS_CELLULAR(ifp)) {
                    ifnet_lock_exclusive(ifp);
                    ifp->if_eflags &= ~IFEF_ECN_DISABLE;
                    ifp->if_eflags &= ~IFEF_ECN_ENABLE;
                    ifnet_lock_done(ifp);
                }
            }
            ifnet_head_done();
        } else {
            /*
             * Set ECN enable flags on non-cellular
             * interfaces.
             */
            ifnet_head_lock_shared();
            TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
                if (!IFNET_IS_CELLULAR(ifp)) {
                    ifnet_lock_exclusive(ifp);
                    ifp->if_eflags |= IFEF_ECN_ENABLE;
                    ifp->if_eflags &= ~IFEF_ECN_DISABLE;
                    ifnet_lock_done(ifp);
                }
            }
            ifnet_head_done();
        }
        tcp_ecn_outbound = i;
        SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_initiate_out,
            tcp_ecn_outbound);
    }
    /* Change the inbound setting as well, since the work is done */
    if (i == 2 || tcp_ecn_inbound == 2) {
        tcp_ecn_inbound = i;
        SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_negotiate_in,
            tcp_ecn_inbound);
    }
    return (err);
}
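
/*
 * A sketch of the handler semantics above (hypothetical sysctl(8)
 * invocations, for illustration only):
 *
 *   sysctl net.inet.tcp.ecn_initiate_out=1   # initiate ECN on outbound
 *   sysctl net.inet.tcp.ecn_initiate_out=0   # do not initiate ECN
 *   sysctl net.inet.tcp.ecn_initiate_out=2   # also marks non-cellular
 *                                            # interfaces IFEF_ECN_ENABLE
 *
 * Moving from 2 back to 0 or 1 clears the per-interface override flags
 * so the system default governs again, and the inbound setting is kept
 * in sync whenever either side is at the default value of 2.
 */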

int tcp_ecn_outbound = 2;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn_initiate_out,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_outbound, 0,
    sysctl_change_ecn_setting, "IU",
    "Initiate ECN for outbound connections");

int tcp_ecn_inbound = 2;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn_negotiate_in,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_inbound, 0,
    sysctl_change_ecn_setting, "IU",
    "Negotiate ECN for inbound connections");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, packetchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_packet_chaining, 50,
    "Enable TCP output packet chaining");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, socket_unlocked_on_output,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_output_unlocked, 1,
    "Unlock TCP when sending packets down to IP");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, rfc3390,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_do_rfc3390, 1,
    "Calculate initial slowstart cwnd depending on MSS");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, min_iaj_win,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_min_iaj_win, MIN_IAJ_WIN,
    "Minimum recv win based on inter-packet arrival jitter");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, acc_iaj_react_limit,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_acc_iaj_react_limit,
    ACC_IAJ_REACT_LIMIT, "Accumulated IAJ when receiver starts to react");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, doautosndbuf,
    CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_do_autosendbuf, 1,
    "Enable send socket buffer auto-tuning");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, autosndbufinc,
    CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_autosndbuf_inc,
    8 * 1024, "Increment in send socket buffer size");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, autosndbufmax,
    CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_autosndbuf_max, 512 * 1024,
    "Maximum send socket buffer size");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, ack_prioritize,
    CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_prioritize_acks, 1,
    "Prioritize pure acks");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, rtt_recvbg,
    CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_use_rtt_recvbg, 1,
    "Use RTT for bg recv algorithm");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, recv_throttle_minwin,
    CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_recv_throttle_minwin, 16 * 1024,
    "Minimum recv win for throttling");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, enable_tlp,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    int32_t, tcp_enable_tlp, 1, "Enable Tail loss probe");

static int32_t packchain_newlist = 0;
static int32_t packchain_looped = 0;
static int32_t packchain_sent = 0;

/* temporary: for testing */
#if IPSEC
extern int ipsec_bypass;
#endif

extern int slowlink_wsize;	/* window correction for slow links */
#if IPFIREWALL
extern int fw_enable;		/* firewall check for packet chaining */
extern int fw_bypass;		/* firewall check: disable packet chaining if there are rules */
#endif /* IPFIREWALL */

extern u_int32_t dlil_filter_disable_tso_count;
extern u_int32_t kipf_count;

static int tcp_ip_output(struct socket *, struct tcpcb *, struct mbuf *,
    int, struct mbuf *, int, int, boolean_t);
static struct mbuf *tcp_send_lroacks(struct tcpcb *tp, struct mbuf *m,
    struct tcphdr *th);
static int tcp_recv_throttle(struct tcpcb *tp);

static int32_t
tcp_tfo_check(struct tcpcb *tp, int32_t len)
{
    struct socket *so = tp->t_inpcb->inp_socket;
    unsigned int optlen = 0;
    unsigned int cookie_len;

    if (tp->t_flags & TF_NOOPT)
        goto fallback;

    if ((so->so_flags1 & SOF1_DATA_AUTHENTICATED) &&
        !(tp->t_flagsext & TF_FASTOPEN_HEUR))
        return (len);

    if (!tcp_heuristic_do_tfo(tp)) {
        tp->t_tfo_stats |= TFO_S_HEURISTICS_DISABLE;
        tcpstat.tcps_tfo_heuristics_disable++;
        goto fallback;
    }

    if (so->so_flags1 & SOF1_DATA_AUTHENTICATED)
        return (len);

    optlen += TCPOLEN_MAXSEG;

    if (tp->t_flags & TF_REQ_SCALE)
        optlen += 4;

#if MPTCP
    if ((so->so_flags & SOF_MP_SUBFLOW) && mptcp_enable &&
        tp->t_rxtshift <= mptcp_mpcap_retries)
        optlen += sizeof(struct mptcp_mpcapable_opt_common) +
            sizeof(mptcp_key_t);
#endif /* MPTCP */

    if (tp->t_flags & TF_REQ_TSTMP)
        optlen += TCPOLEN_TSTAMP_APPA;

    if (SACK_ENABLED(tp))
        optlen += TCPOLEN_SACK_PERMITTED;

    /* Now, decide whether to use TFO or not */

    /* Don't even bother trying if there is no space at all... */
    if (MAX_TCPOPTLEN - optlen < TCPOLEN_FASTOPEN_REQ)
        goto fallback;

    cookie_len = tcp_cache_get_cookie_len(tp);
    if (cookie_len == 0)
        /* No cookie, so we request one */
        return (0);

    /* There is not enough space for the cookie, so we cannot do TFO */
    if (MAX_TCPOPTLEN - optlen < cookie_len)
        goto fallback;

    /* Do not send SYN+data if there is more in the queue than MSS */
    if (so->so_snd.sb_cc > (tp->t_maxopd - MAX_TCPOPTLEN))
        goto fallback;

    /* Ok, everything looks good. We can go on and do TFO */
    return (len);

fallback:
    tp->t_flagsext &= ~TF_FASTOPEN;
    return (0);
}
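
/*
 * In short, tcp_tfo_check() returns the number of bytes that may ride on
 * the SYN: 'len' when a valid cookie fits next to the other options, 0
 * when we only want to request a cookie with a bare SYN, and 0 with
 * TF_FASTOPEN cleared when TFO must be abandoned for this connection.
 */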

/* Returns the number of bytes written to the TCP option-space */
static unsigned
tcp_tfo_write_cookie_rep(struct tcpcb *tp, unsigned optlen, u_char *opt)
{
    u_char out[CCAES_BLOCK_SIZE];
    unsigned ret = 0;
    u_char *bp;

    if ((MAX_TCPOPTLEN - optlen) <
        (TCPOLEN_FASTOPEN_REQ + TFO_COOKIE_LEN_DEFAULT))
        return (ret);

    tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out));

    bp = opt + optlen;

    *bp++ = TCPOPT_FASTOPEN;
    *bp++ = 2 + TFO_COOKIE_LEN_DEFAULT;
    memcpy(bp, out, TFO_COOKIE_LEN_DEFAULT);
    ret += 2 + TFO_COOKIE_LEN_DEFAULT;

    tp->t_tfo_stats |= TFO_S_COOKIE_SENT;
    tcpstat.tcps_tfo_cookie_sent++;

    return (ret);
}

static unsigned
tcp_tfo_write_cookie(struct tcpcb *tp, unsigned optlen, int32_t len,
    u_char *opt)
{
    u_int8_t tfo_len = MAX_TCPOPTLEN - optlen - TCPOLEN_FASTOPEN_REQ;
    struct socket *so = tp->t_inpcb->inp_socket;
    unsigned ret = 0;
    int res;
    u_char *bp;

    if (so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
        /* If there is some data, let's track it */
        if (len > 0) {
            tp->t_tfo_stats |= TFO_S_SYN_DATA_SENT;
            tcpstat.tcps_tfo_syn_data_sent++;
        }

        return (0);
    }

    bp = opt + optlen;

    /*
     * The cookie will be copied in the appropriate place within the
     * TCP-option space. That way we avoid the need for an intermediate
     * variable.
     */
    res = tcp_cache_get_cookie(tp, bp + TCPOLEN_FASTOPEN_REQ, &tfo_len);
    if (res == 0) {
        *bp++ = TCPOPT_FASTOPEN;
        *bp++ = TCPOLEN_FASTOPEN_REQ;
        ret += TCPOLEN_FASTOPEN_REQ;

        tp->t_tfo_flags |= TFO_F_COOKIE_REQ;

        tp->t_tfo_stats |= TFO_S_COOKIE_REQ;
        tcpstat.tcps_tfo_cookie_req++;
    } else {
        *bp++ = TCPOPT_FASTOPEN;
        *bp++ = TCPOLEN_FASTOPEN_REQ + tfo_len;

        ret += TCPOLEN_FASTOPEN_REQ + tfo_len;

        tp->t_tfo_flags |= TFO_F_COOKIE_SENT;

        /* If there is some data, let's track it */
        if (len > 0) {
            tp->t_tfo_stats |= TFO_S_SYN_DATA_SENT;
            tcpstat.tcps_tfo_syn_data_sent++;
        }
    }

    return (ret);
}
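
/*
 * On the wire (per RFC 7413), both paths above emit the same option kind:
 * a TCPOPT_FASTOPEN option of length TCPOLEN_FASTOPEN_REQ with no payload
 * requests a cookie from the peer, while the same option carrying
 * 'tfo_len' cookie bytes presents a previously cached cookie so that the
 * SYN may carry data.
 */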

static inline bool
tcp_send_ecn_flags_on_syn(struct tcpcb *tp, struct socket *so)
{
    return (!((tp->ecn_flags & TE_SETUPSENT) ||
        (so->so_flags & SOF_MP_SUBFLOW) ||
        (tp->t_flagsext & TF_FASTOPEN)));
}
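
/*
 * That is, the ECN-setup flags go out on a SYN only when no setup has
 * been attempted yet, the socket is not an MPTCP subflow, and TFO is not
 * in use on this connection; the latter two presumably keep the handshake
 * options simple and unambiguous.
 */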

void
tcp_set_ecn(struct tcpcb *tp, struct ifnet *ifp)
{
    boolean_t inbound;

    /*
     * Socket option has precedence
     */
    if (tp->ecn_flags & TE_ECN_MODE_ENABLE) {
        tp->ecn_flags |= TE_ENABLE_ECN;
        goto check_heuristic;
    }

    if (tp->ecn_flags & TE_ECN_MODE_DISABLE) {
        tp->ecn_flags &= ~TE_ENABLE_ECN;
        return;
    }
    /*
     * Per-interface setting comes next
     */
    if (ifp != NULL) {
        if (ifp->if_eflags & IFEF_ECN_ENABLE) {
            tp->ecn_flags |= TE_ENABLE_ECN;
            goto check_heuristic;
        }

        if (ifp->if_eflags & IFEF_ECN_DISABLE) {
            tp->ecn_flags &= ~TE_ENABLE_ECN;
            return;
        }
    }
    /*
     * System-wide settings come last
     */
    inbound = (tp->t_inpcb->inp_socket->so_head != NULL);
    if ((inbound && tcp_ecn_inbound == 1) ||
        (!inbound && tcp_ecn_outbound == 1)) {
        tp->ecn_flags |= TE_ENABLE_ECN;
        goto check_heuristic;
    } else {
        tp->ecn_flags &= ~TE_ENABLE_ECN;
    }

    return;

check_heuristic:
    if (!tcp_heuristic_do_ecn(tp))
        tp->ecn_flags &= ~TE_ENABLE_ECN;

    /*
     * If the interface setting, system-level setting and heuristics
     * all allow ECN, enable it only on a randomly selected fraction
     * of connections, controlled by tcp_ecn_setup_percentage.
     */
    if ((tp->ecn_flags & (TE_ECN_MODE_ENABLE | TE_ECN_MODE_DISABLE
        | TE_ENABLE_ECN)) == TE_ENABLE_ECN) {
        /*
         * Use the random value in iss for randomizing
         * this selection
         */
        if ((tp->iss % 100) >= tcp_ecn_setup_percentage)
            tp->ecn_flags &= ~TE_ENABLE_ECN;
    }
}
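
/*
 * Worked example of the precedence above: a connection whose socket set
 * TE_ECN_MODE_DISABLE never uses ECN, even on an IFEF_ECN_ENABLE
 * interface; with no socket override, an interface flag wins over the
 * ecn_initiate_out/ecn_negotiate_in defaults; and whatever survives that
 * chain must still pass the heuristics and the iss-based percentage gate.
 */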

/*
 * Tcp output routine: figure out what should be sent and send it.
 *
 * Returns:	0			Success
 *		EADDRNOTAVAIL
 *		ENOBUFS
 *		EMSGSIZE
 *		EHOSTUNREACH
 *		ENETDOWN
 *	ip_output_list:ENOMEM
 *	ip_output_list:EADDRNOTAVAIL
 *	ip_output_list:ENETUNREACH
 *	ip_output_list:EHOSTUNREACH
 *	ip_output_list:EACCES
 *	ip_output_list:EMSGSIZE
 *	ip_output_list:ENOBUFS
 *	ip_output_list:???		[ignorable: mostly IPSEC/firewall/DLIL]
 *	ip6_output_list:EINVAL
 *	ip6_output_list:EOPNOTSUPP
 *	ip6_output_list:EHOSTUNREACH
 *	ip6_output_list:EADDRNOTAVAIL
 *	ip6_output_list:ENETUNREACH
 *	ip6_output_list:EMSGSIZE
 *	ip6_output_list:ENOBUFS
 *	ip6_output_list:???		[ignorable: mostly IPSEC/firewall/DLIL]
 */
int
tcp_output(struct tcpcb *tp)
{
    struct inpcb *inp = tp->t_inpcb;
    struct socket *so = inp->inp_socket;
    int32_t len, recwin, sendwin, off;
    int flags, error;
    struct mbuf *m;
    struct ip *ip = NULL;
    struct ipovly *ipov = NULL;
#if INET6
    struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
    struct tcphdr *th;
    u_char opt[TCP_MAXOLEN];
    unsigned ipoptlen, optlen, hdrlen;
    int idle, sendalot, lost = 0;
    int i, sack_rxmit;
    int tso = 0;
    int sack_bytes_rxmt;
    tcp_seq old_snd_nxt = 0;
    struct sackhole *p;
#if IPSEC
    unsigned ipsec_optlen = 0;
#endif /* IPSEC */
    int idle_time = 0;
    struct mbuf *packetlist = NULL;
    struct mbuf *tp_inp_options = inp->inp_depend4.inp4_options;
#if INET6
    int isipv6 = inp->inp_vflag & INP_IPV6;
#else
    int isipv6 = 0;
#endif
    short packchain_listadd = 0;
    int so_options = so->so_options;
    struct rtentry *rt;
    u_int32_t svc_flags = 0, allocated_len;
    u_int32_t lro_ackmore = (tp->t_lropktlen != 0) ? 1 : 0;
    struct mbuf *mnext = NULL;
    int sackoptlen = 0;
#if MPTCP
    boolean_t mptcp_acknow;
#endif /* MPTCP */
    boolean_t cell = FALSE;
    boolean_t wifi = FALSE;
    boolean_t wired = FALSE;
    boolean_t sack_rescue_rxt = FALSE;
    int sotc = so->so_traffic_class;

    /*
     * Determine length of data that should be transmitted,
     * and flags that will be used.
     * If there is some data or critical controls (SYN, RST)
     * to send, then transmit; otherwise, investigate further.
     */
    idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);

    /*
     * Since idle_time is a signed integer, the following integer
     * subtraction takes care of the wrap around of tcp_now.
     */
    idle_time = tcp_now - tp->t_rcvtime;
    if (idle && idle_time >= TCP_IDLETIMEOUT(tp)) {
        if (CC_ALGO(tp)->after_idle != NULL &&
            (tp->tcp_cc_index != TCP_CC_ALGO_CUBIC_INDEX ||
            idle_time >= TCP_CC_CWND_NONVALIDATED_PERIOD)) {
            CC_ALGO(tp)->after_idle(tp);
            tcp_ccdbg_trace(tp, NULL, TCP_CC_IDLE_TIMEOUT);
        }

        /*
         * Do some other tasks that need to be done after
         * idle time
         */
        if (!SLIST_EMPTY(&tp->t_rxt_segments))
            tcp_rxtseg_clean(tp);

        /* If stretch ack was auto-disabled, re-evaluate it */
        tcp_cc_after_idle_stretchack(tp);
    }
    tp->t_flags &= ~TF_LASTIDLE;
    if (idle) {
        if (tp->t_flags & TF_MORETOCOME) {
            tp->t_flags |= TF_LASTIDLE;
            idle = 0;
        }
    }
#if MPTCP
    if (tp->t_mpflags & TMPF_RESET) {
        tcp_check_timer_state(tp);
        /*
         * Once a RST has been sent for an MPTCP subflow,
         * the subflow socket stays around until deleted.
         * No packets, not even a FIN, may be sent after the RST.
         */
        return (0);
    }
#endif /* MPTCP */

again:
#if MPTCP
    mptcp_acknow = FALSE;
#endif

    KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

#if INET6
    if (isipv6) {
        KERNEL_DEBUG(DBG_LAYER_BEG,
            ((inp->inp_fport << 16) | inp->inp_lport),
            (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
            (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
            sendalot, 0, 0);
    } else
#endif
    {
        KERNEL_DEBUG(DBG_LAYER_BEG,
            ((inp->inp_fport << 16) | inp->inp_lport),
            (((inp->inp_laddr.s_addr & 0xffff) << 16) |
            (inp->inp_faddr.s_addr & 0xffff)),
            sendalot, 0, 0);
    }
    /*
     * If the route generation id changed, we need to check that our
     * local (source) IP address is still valid.  If it isn't, either
     * return an error or silently do nothing (assuming the address will
     * come back before the TCP connection times out).
     */
    rt = inp->inp_route.ro_rt;
    if (rt != NULL && ROUTE_UNUSABLE(&tp->t_inpcb->inp_route)) {
        struct ifnet *ifp;
        struct in_ifaddr *ia = NULL;
        struct in6_ifaddr *ia6 = NULL;
        int found_srcaddr = 0;

        /* disable multipages at the socket */
        somultipages(so, FALSE);

        /* Disable TSO for the socket until we know more */
        tp->t_flags &= ~TF_TSO;

        soif2kcl(so, FALSE);

        if (isipv6) {
            ia6 = ifa_foraddr6(&inp->in6p_laddr);
            if (ia6 != NULL)
                found_srcaddr = 1;
        } else {
            ia = ifa_foraddr(inp->inp_laddr.s_addr);
            if (ia != NULL)
                found_srcaddr = 1;
        }

        /* check that the source address is still valid */
        if (found_srcaddr == 0) {
            soevent(so,
                (SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR));

            if (tp->t_state >= TCPS_CLOSE_WAIT) {
                tcp_drop(tp, EADDRNOTAVAIL);
                return (EADDRNOTAVAIL);
            }

            /*
             * Set the retransmit timer if it wasn't set, and
             * reset the persist timer and shift register, as
             * the advertised peer window may not be valid
             * anymore.
             */
            if (!tp->t_timer[TCPT_REXMT]) {
                tp->t_timer[TCPT_REXMT] =
                    OFFSET_FROM_START(tp, tp->t_rxtcur);
                if (tp->t_timer[TCPT_PERSIST]) {
                    tp->t_timer[TCPT_PERSIST] = 0;
                    tp->t_persist_stop = 0;
                    TCP_RESET_REXMT_STATE(tp);
                }
            }

            if (tp->t_pktlist_head != NULL)
                m_freem_list(tp->t_pktlist_head);
            TCP_PKTLIST_CLEAR(tp);

            /* drop connection if source address isn't available */
            if (so->so_flags & SOF_NOADDRAVAIL) {
                tcp_drop(tp, EADDRNOTAVAIL);
                return (EADDRNOTAVAIL);
            } else {
                tcp_check_timer_state(tp);
                /*
                 * Silently ignore and keep the data in the
                 * socket: the address may come back.
                 */
                return (0);
            }
        }
        if (ia != NULL)
            IFA_REMREF(&ia->ia_ifa);

        if (ia6 != NULL)
            IFA_REMREF(&ia6->ia_ifa);

        /*
         * Address is still valid; check for multipages capability
         * again in case the outgoing interface has changed.
         */
        RT_LOCK(rt);
        if ((ifp = rt->rt_ifp) != NULL) {
            somultipages(so, (ifp->if_hwassist & IFNET_MULTIPAGES));
            tcp_set_tso(tp, ifp);
            soif2kcl(so, (ifp->if_eflags & IFEF_2KCL));
            tcp_set_ecn(tp, ifp);
        }
        if (rt->rt_flags & RTF_UP)
            RT_GENID_SYNC(rt);
        /*
         * See if we should do MTU discovery. Don't do it if:
         * 1) it is disabled via the sysctl
         * 2) the route isn't up
         * 3) the MTU is locked (if it is, then discovery
         *    has been disabled)
         */
        if (!path_mtu_discovery || ((rt != NULL) &&
            (!(rt->rt_flags & RTF_UP) ||
            (rt->rt_rmx.rmx_locks & RTV_MTU))))
            tp->t_flags &= ~TF_PMTUD;
        else
            tp->t_flags |= TF_PMTUD;

        RT_UNLOCK(rt);
    }

    if (rt != NULL) {
        cell = IFNET_IS_CELLULAR(rt->rt_ifp);
        wifi = (!cell && IFNET_IS_WIFI(rt->rt_ifp));
        wired = (!wifi && IFNET_IS_WIRED(rt->rt_ifp));
    }

    /*
     * If we've recently taken a timeout, snd_max will be greater than
     * snd_nxt.  There may be SACK information that allows us to avoid
     * resending already delivered data.  Adjust snd_nxt accordingly.
     */
    if (SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
        tcp_sack_adjust(tp);
    sendalot = 0;
    off = tp->snd_nxt - tp->snd_una;
    sendwin = min(tp->snd_wnd, tp->snd_cwnd);

    if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
        sendwin = min(sendwin, slowlink_wsize);

    flags = tcp_outflags[tp->t_state];
    /*
     * Send any SACK-generated retransmissions.  If we're explicitly
     * trying to send out new data (when sendalot is 1), bypass this
     * function.  If we retransmit in fast recovery mode, decrement
     * snd_cwnd, since we're replacing a (future) new transmission
     * with a retransmission now, and we previously incremented
     * snd_cwnd in tcp_input().
     */
    /*
     * If we are still in SACK recovery, reset the rxmit flag to zero.
     */
    sack_rxmit = 0;
    sack_bytes_rxmt = 0;
    len = 0;
    p = NULL;
    if (SACK_ENABLED(tp) && IN_FASTRECOVERY(tp) &&
        (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
        int32_t cwin;

        cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
        if (cwin < 0)
            cwin = 0;
        /* Do not retransmit SACK segments beyond snd_recover */
        if (SEQ_GT(p->end, tp->snd_recover)) {
            /*
             * (At least) part of sack hole extends beyond
             * snd_recover. Check to see if we can rexmit data
             * for this hole.
             */
            if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
                /*
                 * Can't rexmit any more data for this hole.
                 * That data will be rexmitted in the next
                 * sack recovery episode, when snd_recover
                 * moves past p->rxmit.
                 */
                p = NULL;
                goto after_sack_rexmit;
            } else
                /* Can rexmit part of the current hole */
                len = ((int32_t)min(cwin,
                    tp->snd_recover - p->rxmit));
        } else {
            len = ((int32_t)min(cwin, p->end - p->rxmit));
        }
        if (len > 0) {
            off = p->rxmit - tp->snd_una;
            sack_rxmit = 1;
            sendalot = 1;
            tcpstat.tcps_sack_rexmits++;
            tcpstat.tcps_sack_rexmit_bytes +=
                min(len, tp->t_maxseg);
        } else {
            len = 0;
        }
    }
after_sack_rexmit:
    /*
     * Get standard flags, and add SYN or FIN if requested by 'hidden'
     * state flags.
     */
    if (tp->t_flags & TF_NEEDFIN)
        flags |= TH_FIN;
    if (tp->t_flags & TF_NEEDSYN)
        flags |= TH_SYN;

    /*
     * If in persist timeout with window of 0, send 1 byte.
     * Otherwise, if window is small but nonzero
     * and timer expired, we will send what we can
     * and go to transmit state.
     */
    if (tp->t_flagsext & TF_FORCE) {
        if (sendwin == 0) {
            /*
             * If we still have some data to send, then
             * clear the FIN bit.  Usually this would
             * happen below when it realizes that we
             * aren't sending all the data.  However,
             * if we have exactly 1 byte of unsent data,
             * then it won't clear the FIN bit below,
             * and if we are in persist state, we wind
             * up sending the packet without recording
             * that we sent the FIN bit.
             *
             * We can't just blindly clear the FIN bit,
             * because if we don't have any more data
             * to send then the probe will be the FIN
             * itself.
             */
            if (off < so->so_snd.sb_cc)
                flags &= ~TH_FIN;
            sendwin = 1;
        } else {
            tp->t_timer[TCPT_PERSIST] = 0;
            tp->t_persist_stop = 0;
            TCP_RESET_REXMT_STATE(tp);
        }
    }

    /*
     * If snd_nxt == snd_max and we have transmitted a FIN, the
     * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
     * a negative length.  This can also occur when TCP opens up
     * its congestion window while receiving additional duplicate
     * acks after fast-retransmit because TCP will reset snd_nxt
     * to snd_max after the fast-retransmit.
     *
     * In the normal retransmit-FIN-only case, however, snd_nxt will
     * be set to snd_una, the offset will be 0, and the length may
     * wind up 0.
     *
     * If sack_rxmit is true we are retransmitting from the scoreboard
     * in which case len is already set.
     */
    if (sack_rxmit == 0) {
        if (sack_bytes_rxmt == 0) {
            len = min(so->so_snd.sb_cc, sendwin) - off;
        } else {
            int32_t cwin;

            cwin = tp->snd_cwnd -
                (tp->snd_nxt - tp->sack_newdata) -
                sack_bytes_rxmt;
            if (cwin < 0)
                cwin = 0;
            /*
             * We are inside of a SACK recovery episode and are
             * sending new data, having retransmitted all the
             * data possible in the scoreboard.
             */
            len = min(so->so_snd.sb_cc, tp->snd_wnd) - off;
            /*
             * Don't remove this (len > 0) check!
             * We explicitly check for len > 0 here (although it
             * isn't really necessary), to work around a gcc
             * optimization issue - to force gcc to compute
             * len above. Without this check, the computation
             * of len is bungled by the optimizer.
             */
            if (len > 0) {
                len = imin(len, cwin);
            } else {
                len = 0;
            }
            /*
             * At this point SACK recovery can not send any
             * data from scoreboard or any new data. Check
             * if we can do a rescue retransmit towards the
             * tail end of recovery window.
             */
            if (len == 0 && cwin > 0 &&
                SEQ_LT(tp->snd_fack, tp->snd_recover) &&
                !(tp->t_flagsext & TF_RESCUE_RXT)) {
                len = min((tp->snd_recover - tp->snd_fack),
                    tp->t_maxseg);
                len = imin(len, cwin);
                old_snd_nxt = tp->snd_nxt;
                sack_rescue_rxt = TRUE;
                tp->snd_nxt = tp->snd_recover - len;
                /*
                 * If FIN has been sent, snd_max
                 * must have been advanced to cover it.
                 */
                if ((tp->t_flags & TF_SENTFIN) &&
                    tp->snd_max == tp->snd_recover)
                    tp->snd_nxt--;

                off = tp->snd_nxt - tp->snd_una;
                sendalot = 0;
                tp->t_flagsext |= TF_RESCUE_RXT;
            }
        }
    }

    /*
     * Lop off SYN bit if it has already been sent.  However, if this
     * is SYN-SENT state and if segment contains data and if we don't
     * know that foreign host supports TAO, suppress sending segment.
     */
    if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
        if (tp->t_state != TCPS_SYN_RECEIVED || tfo_enabled(tp))
            flags &= ~TH_SYN;
        off--;
        len++;
        if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
            while (inp->inp_sndinprog_cnt == 0 &&
                tp->t_pktlist_head != NULL) {
                packetlist = tp->t_pktlist_head;
                packchain_listadd = tp->t_lastchain;
                packchain_sent++;
                TCP_PKTLIST_CLEAR(tp);

                error = tcp_ip_output(so, tp, packetlist,
                    packchain_listadd, tp_inp_options,
                    (so_options & SO_DONTROUTE),
                    (sack_rxmit || (sack_bytes_rxmt != 0)),
                    isipv6);
            }

            /*
             * tcp was closed while we were in ip,
             * resume close
             */
            if (inp->inp_sndinprog_cnt == 0 &&
                (tp->t_flags & TF_CLOSING)) {
                tp->t_flags &= ~TF_CLOSING;
                (void) tcp_close(tp);
            } else {
                tcp_check_timer_state(tp);
            }
            KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,
                0, 0, 0, 0, 0);
            return (0);
        }
    }

    /*
     * Be careful not to send data and/or FIN on SYN segments.
     * This measure is needed to prevent interoperability problems
     * with not fully conformant TCP implementations.
     *
     * In case of TFO, we handle the setting of the len in
     * tcp_tfo_check. In case TFO is not enabled, never ever send
     * SYN+data.
     */
    if ((flags & TH_SYN) && !tfo_enabled(tp)) {
        len = 0;
        flags &= ~TH_FIN;
    }

    if ((flags & TH_SYN) && tp->t_state <= TCPS_SYN_SENT && tfo_enabled(tp))
        len = tcp_tfo_check(tp, len);

    /*
     * The check here used to be (len < 0). Sometimes len is zero
     * when the congestion window is closed and we need to check
     * if persist timer has to be set in that case. But don't set
     * persist until connection is established.
     */
    if (len <= 0 && !(flags & TH_SYN)) {
        /*
         * If FIN has been sent but not acked,
         * but we haven't been called to retransmit,
         * len will be < 0.  Otherwise, window shrank
         * after we sent into it.  If window shrank to 0,
         * cancel pending retransmit, pull snd_nxt back
         * to (closed) window, and set the persist timer
         * if it isn't already going.  If the window didn't
         * close completely, just wait for an ACK.
         */
        len = 0;
        if (sendwin == 0) {
            tp->t_timer[TCPT_REXMT] = 0;
            tp->t_timer[TCPT_PTO] = 0;
            TCP_RESET_REXMT_STATE(tp);
            tp->snd_nxt = tp->snd_una;
            off = 0;
            if (tp->t_timer[TCPT_PERSIST] == 0)
                tcp_setpersist(tp);
        }
    }

    /*
     * Automatic sizing of send socket buffer.  Increase the send
     * socket buffer size if all of the following criteria are met:
     *	1. the receiver has enough buffer space for this data
     *	2. send buffer is filled to 7/8th with data (so we actually
     *	   have data to make use of it);
     *	3. our send window (slow start and congestion controlled) is
     *	   larger than sent but unacknowledged data in send buffer.
     */
    if (tcp_do_autosendbuf == 1 &&
        !INP_WAIT_FOR_IF_FEEDBACK(inp) && !IN_FASTRECOVERY(tp) &&
        (so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
        tcp_cansbgrow(&so->so_snd)) {
        if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
            so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
            sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
            if (sbreserve(&so->so_snd,
                min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
                tcp_autosndbuf_max)) == 1) {
                so->so_snd.sb_idealsize = so->so_snd.sb_hiwat;
            }
        }
    }

    /*
     * Truncate to the maximum segment length or enable TCP Segmentation
     * Offloading (if supported by hardware) and ensure that FIN is removed
     * if the length no longer contains the last data byte.
     *
     * TSO may only be used if we are in a pure bulk sending state.
     * The presence of TCP-MD5, SACK retransmits, SACK advertisements,
     * ipfw rules and IP options, as well as disabling hardware checksum
     * offload, prevent using TSO.  With TSO the TCP header is the same
     * (except for the sequence number) for all generated packets.  This
     * makes it impossible to transmit any options which vary per generated
     * segment or packet.
     *
     * The length of TSO bursts is limited to TCP_MAXWIN.  That limit and
     * removal of FIN (if not already caught here) are handled later after
     * the exact length of the TCP options are known.
     */
#if IPSEC
    /*
     * Pre-calculate here as we save another lookup into the darknesses
     * of IPsec that way and can actually decide if TSO is ok.
     */
    if (ipsec_bypass == 0)
        ipsec_optlen = ipsec_hdrsiz_tcp(tp);
#endif
    if (len > tp->t_maxseg) {
        if ((tp->t_flags & TF_TSO) && tcp_do_tso && hwcksum_tx &&
            ip_use_randomid && kipf_count == 0 &&
            dlil_filter_disable_tso_count == 0 &&
            tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
            sack_bytes_rxmt == 0 &&
            inp->inp_options == NULL &&
            inp->in6p_options == NULL
#if IPSEC
            && ipsec_optlen == 0
#endif
#if IPFIREWALL
            && (fw_enable == 0 || fw_bypass)
#endif
            ) {
            tso = 1;
            sendalot = 0;
        } else {
            len = tp->t_maxseg;
            sendalot = 1;
            tso = 0;
        }
    }
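
    /*
     * Net effect of the block above: when every TSO precondition holds,
     * the oversized 'len' is handed to the hardware in one burst
     * (tso = 1); otherwise 'len' is clamped to one MSS and sendalot
     * makes tcp_output() loop to emit the remainder as separate
     * segments.
     */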

    /* Send one segment or less as a tail loss probe */
    if (tp->t_flagsext & TF_SENT_TLPROBE) {
        len = min(len, tp->t_maxseg);
        sendalot = 0;
        tso = 0;
    }

#if MPTCP
    if ((so->so_flags & SOF_MP_SUBFLOW) &&
        !(tp->t_mpflags & TMPF_TCP_FALLBACK)) {
        int newlen = len;
        if (tp->t_state >= TCPS_ESTABLISHED &&
            (tp->t_mpflags & TMPF_SND_MPPRIO ||
            tp->t_mpflags & TMPF_SND_REM_ADDR ||
            tp->t_mpflags & TMPF_SND_MPFAIL ||
            tp->t_mpflags & TMPF_SND_KEYS ||
            tp->t_mpflags & TMPF_SND_JACK)) {
            if (len > 0) {
                len = 0;
            }
            /*
             * On a new subflow, don't try to send again, because
             * we are still waiting for the fourth ack.
             */
            if (!(tp->t_mpflags & TMPF_PREESTABLISHED))
                sendalot = 1;
            mptcp_acknow = TRUE;
        } else {
            mptcp_acknow = FALSE;
        }
        /*
         * The contiguous bytes in the subflow socket buffer can be
         * discontiguous at the MPTCP level. Since only one DSS
         * option can be sent in one packet, reduce length to match
         * the contiguous MPTCP level. Set sendalot to send remainder.
         */
        if (len > 0)
            newlen = mptcp_adj_sendlen(so, off);
        if (newlen < len) {
            len = newlen;
            sendalot = 1;
        }
    }
#endif /* MPTCP */

    /*
     * If the socket is capable of doing unordered send,
     * pull the amount of data that can be sent from the
     * unordered priority queues to the serial queue in
     * the socket buffer. If bytes are not yet available
     * in the highest priority message, we may not be able
     * to send any new data.
     */
    if (so->so_flags & SOF_ENABLE_MSGS) {
        if ((off + len) >
            so->so_msg_state->msg_serial_bytes) {
            sbpull_unordered_data(so, off, len);

            /* check if len needs to be modified */
            if ((off + len) >
                so->so_msg_state->msg_serial_bytes) {
                len = so->so_msg_state->msg_serial_bytes - off;
                if (len <= 0) {
                    len = 0;
                    tcpstat.tcps_msg_sndwaithipri++;
                }
            }
        }
    }

    if (sack_rxmit) {
        if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
            flags &= ~TH_FIN;
    } else {
        if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
            flags &= ~TH_FIN;
    }
    /*
     * Compare available window to amount of window
     * known to peer (as advertised window less
     * next expected input).  If the difference is at least two
     * max size segments, or at least 25% of the maximum possible
     * window, then want to send a window update to peer.
     * Skip this if the connection is in T/TCP half-open state.
     */
    recwin = tcp_sbspace(tp);
#if MPTCP
    if (so->so_flags & SOF_MP_SUBFLOW) {
        struct mptcb *mp_tp = tptomptp(tp);

        if (mp_tp != NULL) {
            mpte_lock_assert_held(mp_tp->mpt_mpte);
            recwin = imin(recwin, mptcp_sbspace(mp_tp));
        }
    }
#endif

    if (recwin < (int32_t)(so->so_rcv.sb_hiwat / 4) &&
        recwin < (int)tp->t_maxseg)
        recwin = 0;

#if TRAFFIC_MGT
    if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(so)) {
        if (recwin > 0 && tcp_recv_throttle(tp)) {
            uint32_t min_iaj_win = tcp_min_iaj_win * tp->t_maxseg;
            uint32_t bg_rwintop = tp->rcv_adv;
            if (SEQ_LT(bg_rwintop, tp->rcv_nxt + min_iaj_win))
                bg_rwintop = tp->rcv_nxt + min_iaj_win;
            recwin = imin((int32_t)(bg_rwintop - tp->rcv_nxt),
                recwin);
            if (recwin < 0)
                recwin = 0;
        }
    }
#endif /* TRAFFIC_MGT */

    if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
        recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale);

    /*
     * MPTCP needs to be able to announce a smaller window than previously,
     * because the other subflow may have filled up the available window
     * space. So we have to be able to go backwards and announce a smaller
     * window.
     */
    if (!(so->so_flags & SOF_MP_SUBFLOW) &&
        recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt))
        recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt);

    /*
     * Sender silly window avoidance.  We transmit under the following
     * conditions when len is non-zero:
     *
     *	- we've timed out (e.g. persist timer)
     *	- we need to retransmit
     *	- we have a full segment (or more with TSO)
     *	- this is the last buffer in a write()/send() and we are
     *	  either idle or running NODELAY
     *	- we have more than 1/2 the maximum send window's worth of
     *	  data (receiver may be limiting the window size)
     */
    if (len) {
        if (tp->t_flagsext & TF_FORCE)
            goto send;
        if (SEQ_LT(tp->snd_nxt, tp->snd_max))
            goto send;
        if (sack_rxmit)
            goto send;

        /*
         * Send new data on the connection only if it is
         * not flow controlled
         */
        if (!INP_WAIT_FOR_IF_FEEDBACK(inp) ||
            tp->t_state != TCPS_ESTABLISHED) {
            if (len >= tp->t_maxseg)
                goto send;

            if (!(tp->t_flags & TF_MORETOCOME) &&
                (idle || tp->t_flags & TF_NODELAY ||
                (tp->t_flags & TF_MAXSEGSNT) ||
                ALLOW_LIMITED_TRANSMIT(tp)) &&
                (tp->t_flags & TF_NOPUSH) == 0 &&
                (len + off >= so->so_snd.sb_cc ||
                /*
                 * MPTCP needs to respect the DSS-mappings. So, it
                 * may be sending data that *could* have been
                 * coalesced, but cannot because of
                 * mptcp_adj_sendlen().
                 */
                so->so_flags & SOF_MP_SUBFLOW))
                goto send;
            if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
                goto send;
        } else {
            tcpstat.tcps_fcholdpacket++;
        }
    }
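
    /*
     * Worked example of the rules above: a 1-byte write on an idle
     * connection (or one with TF_NODELAY set) passes the "last buffer
     * in a write()" clause because len + off reaches so_snd.sb_cc, so
     * it goes out immediately; the same byte written while earlier
     * data is still unacknowledged on a Nagle-enabled connection is
     * typically held until an ACK arrives (unless another clause, such
     * as TF_MAXSEGSNT or limited transmit, applies).
     */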

    if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
        /*
         * "adv" is the amount we can increase the window,
         * taking into account that we are limited by
         * TCP_MAXWIN << tp->rcv_scale.
         */
        int32_t adv, oldwin = 0;
        adv = imin(recwin, (int)TCP_MAXWIN << tp->rcv_scale) -
            (tp->rcv_adv - tp->rcv_nxt);

        if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
            oldwin = tp->rcv_adv - tp->rcv_nxt;

        if (adv >= (int32_t) (2 * tp->t_maxseg)) {
            /*
             * Update only if the resulting scaled value of
             * the window changed, or if there is a change in
             * the sequence since the last ack. This avoids
             * what appears as dupe ACKS (see rdar://5640997)
             *
             * If streaming is detected avoid sending too many
             * window updates. We will depend on the delack
             * timer to send a window update when needed.
             */
            if (!(tp->t_flags & TF_STRETCHACK) &&
                (tp->last_ack_sent != tp->rcv_nxt ||
                ((oldwin + adv) >> tp->rcv_scale) >
                (oldwin >> tp->rcv_scale))) {
                goto send;
            }
        }
        if (4 * adv >= (int32_t) so->so_rcv.sb_hiwat)
            goto send;

        /*
         * Make sure that the delayed ack timer is set if
         * we delayed sending a window update because of
         * streaming detection.
         */
        if ((tp->t_flags & TF_STRETCHACK) &&
            !(tp->t_flags & TF_DELACK)) {
            tp->t_flags |= TF_DELACK;
            tp->t_timer[TCPT_DELACK] =
                OFFSET_FROM_START(tp, tcp_delack);
        }
    }

    /*
     * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
     * is also a catch-all for the retransmit timer timeout case.
     */
    if (tp->t_flags & TF_ACKNOW)
        goto send;
    if ((flags & TH_RST) ||
        ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
        goto send;
    if (SEQ_GT(tp->snd_up, tp->snd_una))
        goto send;
#if MPTCP
    if (mptcp_acknow)
        goto send;
#endif /* MPTCP */
    /*
     * If our state indicates that FIN should be sent
     * and we have not yet done so, then we need to send.
     */
    if ((flags & TH_FIN) &&
        (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una))
        goto send;
    /*
     * In SACK, it is possible for tcp_output to fail to send a segment
     * after the retransmission timer has been turned off.  Make sure
     * that the retransmission timer is set.
     */
    if (SACK_ENABLED(tp) && (tp->t_state >= TCPS_ESTABLISHED) &&
        SEQ_GT(tp->snd_max, tp->snd_una) &&
        tp->t_timer[TCPT_REXMT] == 0 &&
        tp->t_timer[TCPT_PERSIST] == 0) {
        tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp,
            tp->t_rxtcur);
        goto just_return;
    }
    /*
     * TCP window updates are not reliable, rather a polling protocol
     * using ``persist'' packets is used to ensure receipt of window
     * updates.  The three ``states'' for the output side are:
     *	idle			not doing retransmits or persists
     *	persisting		to move a small or zero window
     *	(re)transmitting	and thereby not persisting
     *
     * tp->t_timer[TCPT_PERSIST]
     *	is set when we are in persist state.
     * tp->t_force
     *	is set when we are called to send a persist packet.
     * tp->t_timer[TCPT_REXMT]
     *	is set when we are retransmitting
     * The output side is idle when both timers are zero.
     *
     * If send window is too small, there is data to transmit, and no
     * retransmit or persist is pending, then go to persist state.
     * If nothing happens soon, send when timer expires:
     * if window is nonzero, transmit what we can,
     * otherwise force out a byte.
     */
    if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
        tp->t_timer[TCPT_PERSIST] == 0) {
        TCP_RESET_REXMT_STATE(tp);
        tcp_setpersist(tp);
    }
just_return:
    /*
     * If there is no reason to send a segment, just return.
     * But if there are some packets left in the packet list,
     * send them now.
     */
    while (inp->inp_sndinprog_cnt == 0 &&
        tp->t_pktlist_head != NULL) {
        packetlist = tp->t_pktlist_head;
        packchain_listadd = tp->t_lastchain;
        packchain_sent++;
        TCP_PKTLIST_CLEAR(tp);

        error = tcp_ip_output(so, tp, packetlist,
            packchain_listadd,
            tp_inp_options, (so_options & SO_DONTROUTE),
            (sack_rxmit || (sack_bytes_rxmt != 0)), isipv6);
    }
    /* tcp was closed while we were in ip; resume close */
    if (inp->inp_sndinprog_cnt == 0 &&
        (tp->t_flags & TF_CLOSING)) {
        tp->t_flags &= ~TF_CLOSING;
        (void) tcp_close(tp);
    } else {
        tcp_check_timer_state(tp);
    }
    KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
    return (0);

send:
    /*
     * Set TF_MAXSEGSNT flag if the segment size is greater than
     * the max segment size.
     */
    if (len > 0) {
        if (len >= tp->t_maxseg)
            tp->t_flags |= TF_MAXSEGSNT;
        else
            tp->t_flags &= ~TF_MAXSEGSNT;
    }
    /*
     * Before ESTABLISHED, force sending of initial options
     * unless TCP set not to do any options.
     * NOTE: we assume that the IP/TCP header plus TCP options
     * always fit in a single mbuf, leaving room for a maximum
     * link header, i.e.
     *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
     */
    optlen = 0;
#if INET6
    if (isipv6)
        hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
    else
#endif
        hdrlen = sizeof (struct tcpiphdr);
    if (flags & TH_SYN) {
        tp->snd_nxt = tp->iss;
        if ((tp->t_flags & TF_NOOPT) == 0) {
            u_short mss;

            opt[0] = TCPOPT_MAXSEG;
            opt[1] = TCPOLEN_MAXSEG;
            mss = htons((u_short) tcp_mssopt(tp));
            (void)memcpy(opt + 2, &mss, sizeof(mss));
            optlen = TCPOLEN_MAXSEG;

            if ((tp->t_flags & TF_REQ_SCALE) &&
                ((flags & TH_ACK) == 0 ||
                (tp->t_flags & TF_RCVD_SCALE))) {
                *((u_int32_t *)(void *)(opt + optlen)) = htonl(
                    TCPOPT_NOP << 24 |
                    TCPOPT_WINDOW << 16 |
                    TCPOLEN_WINDOW << 8 |
                    tp->request_r_scale);
                optlen += 4;
            }
#if MPTCP
            if (mptcp_enable && (so->so_flags & SOF_MP_SUBFLOW)) {
                optlen = mptcp_setup_syn_opts(so, opt, optlen);
            }
#endif /* MPTCP */
        }
    }
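
    /*
     * At this point a SYN carries, in order: an MSS option (4 bytes),
     * optionally a NOP-padded window-scale option (4 bytes), and, for
     * MPTCP subflows, the MP_CAPABLE/MP_JOIN material (as applicable)
     * added by mptcp_setup_syn_opts(); 'optlen' tracks the bytes
     * consumed out of the opt[] array (bounded by TCP_MAXOLEN).
     */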
| 1508 | |
| 1509 | /* |
| 1510 | * Send a timestamp and echo-reply if this is a SYN and our side |
| 1511 | * wants to use timestamps (TF_REQ_TSTMP is set) or both our side |
| 1512 | * and our peer have sent timestamps in our SYN's. |
| 1513 | */ |
| 1514 | if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && |
| 1515 | (flags & TH_RST) == 0 && |
| 1516 | ((flags & TH_ACK) == 0 || |
| 1517 | (tp->t_flags & TF_RCVD_TSTMP))) { |
| 1518 | u_int32_t *lp = (u_int32_t *)(void *)(opt + optlen); |
| 1519 | |
| 1520 | /* Form timestamp option as shown in appendix A of RFC 1323. */ |
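		/*
		 * TCPOPT_TSTAMP_HDR packs NOP, NOP, kind and length into
		 * one aligned word; the two 32-bit values that follow
		 * carry our clock (tcp_now) and the peer's echoed
		 * timestamp, 12 bytes (TCPOLEN_TSTAMP_APPA) in all.
		 */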
| 1521 | *lp++ = htonl(TCPOPT_TSTAMP_HDR); |
| 1522 | *lp++ = htonl(tcp_now); |
| 1523 | *lp = htonl(tp->ts_recent); |
| 1524 | optlen += TCPOLEN_TSTAMP_APPA; |
| 1525 | } |
| 1526 | |
| 1527 | /* Note the timestamp for receive buffer autosizing */ |
| 1528 | if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) |
| 1529 | tp->rfbuf_ts = tcp_now; |
| 1530 | |
| 1531 | if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == 0)) { |
| 1532 | /* |
| 1533 | * Tack on the SACK permitted option *last*. |
| 1534 | * And do padding of options after tacking this on. |
| 1535 | * This is because of MSS, TS, WinScale and Signatures are |
| 1536 | * all present, we have just 2 bytes left for the SACK |
| 1537 | * permitted option, which is just enough. |
| 1538 | */ |
| 1539 | /* |
| 1540 | * If this is the first SYN of connection (not a SYN |
| 1541 | * ACK), include SACK permitted option. If this is a |
| 1542 | * SYN ACK, include SACK permitted option if peer has |
| 1543 | * already done so. This is only for active connect, |
| 1544 | * since the syncache takes care of the passive connect. |
| 1545 | */ |
| 1546 | if ((flags & TH_SYN) && |
| 1547 | (!(flags & TH_ACK) || (tp->t_flags & TF_SACK_PERMIT))) { |
| 1548 | u_char *bp; |
| 1549 | bp = (u_char *)opt + optlen; |
| 1550 | |
| 1551 | *bp++ = TCPOPT_SACK_PERMITTED; |
| 1552 | *bp++ = TCPOLEN_SACK_PERMITTED; |
| 1553 | optlen += TCPOLEN_SACK_PERMITTED; |
| 1554 | } |
| 1555 | } |
| 1556 | #if MPTCP |
| 1557 | if (so->so_flags & SOF_MP_SUBFLOW) { |
| 1558 | /* |
| 1559 | * Its important to piggyback acks with data as ack only packets |
| 1560 | * may get lost and data packets that don't send Data ACKs |
| 1561 | * still advance the subflow level ACK and therefore make it |
| 1562 | * hard for the remote end to recover in low cwnd situations. |
| 1563 | */ |
| 1564 | if (len != 0) { |
| 1565 | tp->t_mpflags |= (TMPF_SEND_DSN | |
| 1566 | TMPF_MPTCP_ACKNOW); |
| 1567 | } else { |
| 1568 | tp->t_mpflags |= TMPF_MPTCP_ACKNOW; |
| 1569 | } |
| 1570 | optlen = mptcp_setup_opts(tp, off, &opt[0], optlen, flags, |
| 1571 | len, &mptcp_acknow); |
| 1572 | tp->t_mpflags &= ~TMPF_SEND_DSN; |
| 1573 | } |
| 1574 | #endif /* MPTCP */ |
| 1575 | |
| 1576 | if (tfo_enabled(tp) && !(tp->t_flags & TF_NOOPT) && |
| 1577 | (flags & (TH_SYN | TH_ACK)) == TH_SYN) |
| 1578 | optlen += tcp_tfo_write_cookie(tp, optlen, len, opt); |
| 1579 | |
| 1580 | if (tfo_enabled(tp) && |
| 1581 | (flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK) && |
| 1582 | (tp->t_tfo_flags & TFO_F_OFFER_COOKIE)) |
| 1583 | optlen += tcp_tfo_write_cookie_rep(tp, optlen, opt); |
| 1584 | |
| 1585 | if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == 0)) { |
| 1586 | /* |
| 1587 | * Send SACKs if necessary. This should be the last |
| 1588 | * option processed. Only as many SACKs are sent as |
| 1589 | * are permitted by the maximum options size. |
| 1590 | * |
| 1591 | * In general, SACK blocks consume 8*n+2 bytes. |
| 1592 | * So a full size SACK blocks option is 34 bytes |
| 1593 | * (to generate 4 SACK blocks). At a minimum, |
| 1594 | * we need 10 bytes (to generate 1 SACK block). |
| 1595 | * If TCP Timestamps (12 bytes) and TCP Signatures |
| 1596 | * (18 bytes) are both present, we'll just have |
| 1597 | * 10 bytes for SACK options 40 - (12 + 18). |
| 1598 | */ |
| 1599 | if (TCPS_HAVEESTABLISHED(tp->t_state) && |
| 1600 | (tp->t_flags & TF_SACK_PERMIT) && |
| 1601 | (tp->rcv_numsacks > 0 || TCP_SEND_DSACK_OPT(tp)) && |
| 1602 | MAX_TCPOPTLEN - optlen - 2 >= TCPOLEN_SACK) { |
| 1603 | int nsack, padlen; |
| 1604 | u_char *bp = (u_char *)opt + optlen; |
| 1605 | u_int32_t *lp; |
| 1606 | |
| 1607 | nsack = (MAX_TCPOPTLEN - optlen - 2) / TCPOLEN_SACK; |
| 1608 | nsack = min(nsack, (tp->rcv_numsacks + |
| 1609 | (TCP_SEND_DSACK_OPT(tp) ? 1 : 0))); |
| 1610 | sackoptlen = (2 + nsack * TCPOLEN_SACK); |
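			/*
			 * e.g. with only timestamps present (optlen == 12),
			 * nsack is capped at (40 - 12 - 2) / 8 == 3 blocks
			 * and sackoptlen would be 2 + 3 * 8 == 26 bytes.
			 */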
| 1611 | |
| 1612 | /* |
| 1613 | * First we need to pad options so that the |
| 1614 | * SACK blocks can start at a 4-byte boundary |
| 1615 | * (sack option and length are at a 2 byte offset). |
| 1616 | */ |
| 1617 | padlen = (MAX_TCPOPTLEN - optlen - sackoptlen) % 4; |
| 1618 | optlen += padlen; |
| 1619 | while (padlen-- > 0) |
| 1620 | *bp++ = TCPOPT_NOP; |
| 1621 | |
| 1622 | tcpstat.tcps_sack_send_blocks++; |
| 1623 | *bp++ = TCPOPT_SACK; |
| 1624 | *bp++ = sackoptlen; |
| 1625 | lp = (u_int32_t *)(void *)bp; |
| 1626 | |
| 1627 | /* |
| 1628 | * First block of SACK option should represent |
| 1629 | * DSACK. Prefer to send SACK information if there |
| 1630 | * is space for only one SACK block. This will |
| 1631 | * allow for faster recovery. |
| 1632 | */ |
| 1633 | if (TCP_SEND_DSACK_OPT(tp) && nsack > 0 && |
| 1634 | (tp->rcv_numsacks == 0 || nsack > 1)) { |
| 1635 | *lp++ = htonl(tp->t_dsack_lseq); |
| 1636 | *lp++ = htonl(tp->t_dsack_rseq); |
| 1637 | tcpstat.tcps_dsack_sent++; |
| 1638 | tp->t_dsack_sent++; |
| 1639 | nsack--; |
| 1640 | } |
| 1641 | VERIFY(nsack == 0 || tp->rcv_numsacks >= nsack); |
| 1642 | for (i = 0; i < nsack; i++) { |
| 1643 | struct sackblk sack = tp->sackblks[i]; |
| 1644 | *lp++ = htonl(sack.start); |
| 1645 | *lp++ = htonl(sack.end); |
| 1646 | } |
| 1647 | optlen += sackoptlen; |
| 1648 | } |
| 1649 | } |
| 1650 | |
	/* Pad TCP options to a 4-byte boundary */
| 1652 | if (optlen < MAX_TCPOPTLEN && (optlen % sizeof(u_int32_t))) { |
| 1653 | int pad = sizeof(u_int32_t) - (optlen % sizeof(u_int32_t)); |
| 1654 | u_char *bp = (u_char *)opt + optlen; |
| 1655 | |
| 1656 | optlen += pad; |
| 1657 | while (pad) { |
| 1658 | *bp++ = TCPOPT_EOL; |
| 1659 | pad--; |
| 1660 | } |
| 1661 | } |
| 1662 | |
| 1663 | /* |
| 1664 | * RFC 3168 states that: |
| 1665 | * - If you ever sent an ECN-setup SYN/SYN-ACK you must be prepared |
| 1666 | * to handle the TCP ECE flag, even if you also later send a |
| 1667 | * non-ECN-setup SYN/SYN-ACK. |
| 1668 | * - If you ever send a non-ECN-setup SYN/SYN-ACK, you must not set |
| 1669 | * the ip ECT flag. |
| 1670 | * |
	 * It is not clear how the ECE flag would ever be set if you never
	 * set the IP ECT flag on outbound packets. All the same, we use
	 * TE_SETUPSENT to indicate that we have committed to handling
	 * the TCP ECE flag correctly, and TE_SENDIPECT to indicate
	 * whether or not we should set the IP ECT flag on outbound
	 * packets.
	 *
	 * For a SYN-ACK, send an ECN setup SYN-ACK.
| 1678 | */ |
| 1679 | if ((flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK) && |
| 1680 | (tp->ecn_flags & TE_ENABLE_ECN)) { |
| 1681 | if (tp->ecn_flags & TE_SETUPRECEIVED) { |
| 1682 | if (tcp_send_ecn_flags_on_syn(tp, so)) { |
| 1683 | /* |
| 1684 | * Setting TH_ECE makes this an ECN-setup |
| 1685 | * SYN-ACK |
| 1686 | */ |
| 1687 | flags |= TH_ECE; |
| 1688 | |
| 1689 | /* |
| 1690 | * Record that we sent the ECN-setup and |
| 1691 | * default to setting IP ECT. |
| 1692 | */ |
| 1693 | tp->ecn_flags |= (TE_SETUPSENT|TE_SENDIPECT); |
| 1694 | tcpstat.tcps_ecn_server_setup++; |
| 1695 | tcpstat.tcps_ecn_server_success++; |
| 1696 | } else { |
| 1697 | /* |
| 1698 | * We sent an ECN-setup SYN-ACK but it was |
| 1699 | * dropped. Fallback to non-ECN-setup |
| 1700 | * SYN-ACK and clear flag to indicate that |
| 1701 | * we should not send data with IP ECT set |
| 1702 | * |
| 1703 | * Pretend we didn't receive an |
| 1704 | * ECN-setup SYN. |
| 1705 | * |
| 1706 | * We already incremented the counter |
| 1707 | * assuming that the ECN setup will |
| 1708 | * succeed. Decrementing here |
| 1709 | * tcps_ecn_server_success to correct it. |
| 1710 | */ |
| 1711 | if (tp->ecn_flags & TE_SETUPSENT) { |
| 1712 | tcpstat.tcps_ecn_lost_synack++; |
| 1713 | tcpstat.tcps_ecn_server_success--; |
| 1714 | tp->ecn_flags |= TE_LOST_SYNACK; |
| 1715 | } |
| 1716 | |
| 1717 | tp->ecn_flags &= |
| 1718 | ~(TE_SETUPRECEIVED | TE_SENDIPECT | |
| 1719 | TE_SENDCWR); |
| 1720 | } |
| 1721 | } |
| 1722 | } else if ((flags & (TH_SYN | TH_ACK)) == TH_SYN && |
| 1723 | (tp->ecn_flags & TE_ENABLE_ECN)) { |
| 1724 | if (tcp_send_ecn_flags_on_syn(tp, so)) { |
| 1725 | /* |
| 1726 | * Setting TH_ECE and TH_CWR makes this an |
| 1727 | * ECN-setup SYN |
| 1728 | */ |
| 1729 | flags |= (TH_ECE | TH_CWR); |
| 1730 | tcpstat.tcps_ecn_client_setup++; |
| 1731 | tp->ecn_flags |= TE_CLIENT_SETUP; |
| 1732 | |
| 1733 | /* |
| 1734 | * Record that we sent the ECN-setup and default to |
| 1735 | * setting IP ECT. |
| 1736 | */ |
| 1737 | tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT); |
| 1738 | } else { |
| 1739 | /* |
| 1740 | * We sent an ECN-setup SYN but it was dropped. |
| 1741 | * Fall back to non-ECN and clear flag indicating |
| 1742 | * we should send data with IP ECT set. |
| 1743 | */ |
| 1744 | if (tp->ecn_flags & TE_SETUPSENT) { |
| 1745 | tcpstat.tcps_ecn_lost_syn++; |
| 1746 | tp->ecn_flags |= TE_LOST_SYN; |
| 1747 | } |
| 1748 | tp->ecn_flags &= ~TE_SENDIPECT; |
| 1749 | } |
| 1750 | } |
| 1751 | |
| 1752 | /* |
| 1753 | * Check if we should set the TCP CWR flag. |
| 1754 | * CWR flag is sent when we reduced the congestion window because |
| 1755 | * we received a TCP ECE or we performed a fast retransmit. We |
| 1756 | * never set the CWR flag on retransmitted packets. We only set |
| 1757 | * the CWR flag on data packets. Pure acks don't have this set. |
| 1758 | */ |
| 1759 | if ((tp->ecn_flags & TE_SENDCWR) != 0 && len != 0 && |
| 1760 | !SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) { |
| 1761 | flags |= TH_CWR; |
| 1762 | tp->ecn_flags &= ~TE_SENDCWR; |
| 1763 | } |
| 1764 | |
| 1765 | /* |
| 1766 | * Check if we should set the TCP ECE flag. |
| 1767 | */ |
| 1768 | if ((tp->ecn_flags & TE_SENDECE) != 0 && len == 0) { |
| 1769 | flags |= TH_ECE; |
| 1770 | tcpstat.tcps_ecn_sent_ece++; |
| 1771 | } |
| 1772 | |
| 1773 | |
| 1774 | hdrlen += optlen; |
| 1775 | |
| 1776 | /* Reset DSACK sequence numbers */ |
| 1777 | tp->t_dsack_lseq = 0; |
| 1778 | tp->t_dsack_rseq = 0; |
| 1779 | |
| 1780 | #if INET6 |
| 1781 | if (isipv6) |
| 1782 | ipoptlen = ip6_optlen(inp); |
| 1783 | else |
| 1784 | #endif |
| 1785 | { |
| 1786 | if (tp_inp_options) { |
| 1787 | ipoptlen = tp_inp_options->m_len - |
| 1788 | offsetof(struct ipoption, ipopt_list); |
| 1789 | } else { |
| 1790 | ipoptlen = 0; |
| 1791 | } |
| 1792 | } |
| 1793 | #if IPSEC |
| 1794 | ipoptlen += ipsec_optlen; |
| 1795 | #endif |
| 1796 | |
| 1797 | /* |
| 1798 | * Adjust data length if insertion of options will |
| 1799 | * bump the packet length beyond the t_maxopd length. |
| 1800 | * Clear the FIN bit because we cut off the tail of |
| 1801 | * the segment. |
| 1802 | * |
| 1803 | * When doing TSO limit a burst to TCP_MAXWIN minus the |
| 1804 | * IP, TCP and Options length to keep ip->ip_len from |
| 1805 | * overflowing. Prevent the last segment from being |
| 1806 | * fractional thus making them all equal sized and set |
| 1807 | * the flag to continue sending. TSO is disabled when |
| 1808 | * IP options or IPSEC are present. |
| 1809 | */ |
| 1810 | if (len + optlen + ipoptlen > tp->t_maxopd) { |
| 1811 | /* |
| 1812 | * If there is still more to send, |
| 1813 | * don't close the connection. |
| 1814 | */ |
| 1815 | flags &= ~TH_FIN; |
| 1816 | if (tso) { |
| 1817 | int32_t tso_maxlen; |
| 1818 | |
| 1819 | tso_maxlen = tp->tso_max_segment_size ? |
| 1820 | tp->tso_max_segment_size : TCP_MAXWIN; |
| 1821 | |
| 1822 | if (len > tso_maxlen - hdrlen - optlen) { |
| 1823 | len = tso_maxlen - hdrlen - optlen; |
| 1824 | len = len - (len % (tp->t_maxopd - optlen)); |
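			/*
			 * e.g. with tso_maxlen 65535, hdrlen + optlen 84
			 * and an effective payload of 1448 per segment,
			 * len is first capped at 65451 and then rounded
			 * down to 65160, i.e. 45 full-sized segments.
			 */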
| 1825 | sendalot = 1; |
| 1826 | } else if (tp->t_flags & TF_NEEDFIN) { |
| 1827 | sendalot = 1; |
| 1828 | } |
| 1829 | } else { |
| 1830 | len = tp->t_maxopd - optlen - ipoptlen; |
| 1831 | sendalot = 1; |
| 1832 | } |
| 1833 | } |
| 1834 | |
| 1835 | if (max_linkhdr + hdrlen > MCLBYTES) |
| 1836 | panic("tcphdr too big" ); |
| 1837 | |
	/*
	 * Check if there is enough data in the send socket
	 * buffer to start measuring bandwidth.
	 */
| 1841 | if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 && |
| 1842 | (tp->t_bwmeas != NULL) && |
| 1843 | (tp->t_flagsext & TF_BWMEAS_INPROGRESS) == 0) { |
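		/*
		 * Measure over the smallest of the data not yet sent
		 * (sb_cc minus the outstanding snd_max - snd_una bytes),
		 * the congestion window and the send window, clamped
		 * below by bw_minsize and above by bw_maxsize.
		 */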
| 1844 | tp->t_bwmeas->bw_size = min(min( |
| 1845 | (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)), |
| 1846 | tp->snd_cwnd), tp->snd_wnd); |
| 1847 | if (tp->t_bwmeas->bw_minsize > 0 && |
| 1848 | tp->t_bwmeas->bw_size < tp->t_bwmeas->bw_minsize) |
| 1849 | tp->t_bwmeas->bw_size = 0; |
| 1850 | if (tp->t_bwmeas->bw_maxsize > 0) |
| 1851 | tp->t_bwmeas->bw_size = min(tp->t_bwmeas->bw_size, |
| 1852 | tp->t_bwmeas->bw_maxsize); |
| 1853 | if (tp->t_bwmeas->bw_size > 0) { |
| 1854 | tp->t_flagsext |= TF_BWMEAS_INPROGRESS; |
| 1855 | tp->t_bwmeas->bw_start = tp->snd_max; |
| 1856 | tp->t_bwmeas->bw_ts = tcp_now; |
| 1857 | } |
| 1858 | } |
| 1859 | |
| 1860 | VERIFY(inp->inp_flowhash != 0); |
| 1861 | /* |
| 1862 | * Grab a header mbuf, attaching a copy of data to |
| 1863 | * be transmitted, and initialize the header from |
| 1864 | * the template for sends on this connection. |
| 1865 | */ |
| 1866 | if (len) { |
| 1867 | tp->t_pmtud_lastseg_size = len + optlen + ipoptlen; |
| 1868 | if ((tp->t_flagsext & TF_FORCE) && len == 1) |
| 1869 | tcpstat.tcps_sndprobe++; |
| 1870 | else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { |
| 1871 | tcpstat.tcps_sndrexmitpack++; |
| 1872 | tcpstat.tcps_sndrexmitbyte += len; |
| 1873 | if (nstat_collect) { |
| 1874 | nstat_route_tx(inp->inp_route.ro_rt, 1, |
| 1875 | len, NSTAT_TX_FLAG_RETRANSMIT); |
| 1876 | INP_ADD_STAT(inp, cell, wifi, wired, |
| 1877 | txpackets, 1); |
| 1878 | INP_ADD_STAT(inp, cell, wifi, wired, |
| 1879 | txbytes, len); |
| 1880 | tp->t_stat.txretransmitbytes += len; |
| 1881 | tp->t_stat.rxmitpkts++; |
| 1882 | } |
| 1883 | } else { |
| 1884 | tcpstat.tcps_sndpack++; |
| 1885 | tcpstat.tcps_sndbyte += len; |
| 1886 | |
| 1887 | if (nstat_collect) { |
| 1888 | INP_ADD_STAT(inp, cell, wifi, wired, |
| 1889 | txpackets, 1); |
| 1890 | INP_ADD_STAT(inp, cell, wifi, wired, |
| 1891 | txbytes, len); |
| 1892 | } |
| 1893 | inp_decr_sndbytes_unsent(so, len); |
| 1894 | } |
| 1895 | inp_set_activity_bitmap(inp); |
| 1896 | #if MPTCP |
| 1897 | if (tp->t_mpflags & TMPF_MPTCP_TRUE) { |
| 1898 | tcpstat.tcps_mp_sndpacks++; |
| 1899 | tcpstat.tcps_mp_sndbytes += len; |
| 1900 | } |
| 1901 | #endif /* MPTCP */ |
| 1902 | /* |
| 1903 | * try to use the new interface that allocates all |
| 1904 | * the necessary mbuf hdrs under 1 mbuf lock and |
| 1905 | * avoids rescanning the socket mbuf list if |
| 1906 | * certain conditions are met. This routine can't |
| 1907 | * be used in the following cases... |
| 1908 | * 1) the protocol headers exceed the capacity of |
| 1909 | * of a single mbuf header's data area (no cluster attached) |
| 1910 | * 2) the length of the data being transmitted plus |
| 1911 | * the protocol headers fits into a single mbuf header's |
| 1912 | * data area (no cluster attached) |
| 1913 | */ |
| 1914 | m = NULL; |
| 1915 | |
| 1916 | /* minimum length we are going to allocate */ |
| 1917 | allocated_len = MHLEN; |
| 1918 | if (MHLEN < hdrlen + max_linkhdr) { |
| 1919 | MGETHDR(m, M_DONTWAIT, MT_HEADER); |
| 1920 | if (m == NULL) { |
| 1921 | error = ENOBUFS; |
| 1922 | goto out; |
| 1923 | } |
| 1924 | MCLGET(m, M_DONTWAIT); |
| 1925 | if ((m->m_flags & M_EXT) == 0) { |
| 1926 | m_freem(m); |
| 1927 | error = ENOBUFS; |
| 1928 | goto out; |
| 1929 | } |
| 1930 | m->m_data += max_linkhdr; |
| 1931 | m->m_len = hdrlen; |
| 1932 | allocated_len = MCLBYTES; |
| 1933 | } |
| 1934 | if (len <= allocated_len - hdrlen - max_linkhdr) { |
| 1935 | if (m == NULL) { |
| 1936 | VERIFY(allocated_len <= MHLEN); |
| 1937 | MGETHDR(m, M_DONTWAIT, MT_HEADER); |
| 1938 | if (m == NULL) { |
| 1939 | error = ENOBUFS; |
| 1940 | goto out; |
| 1941 | } |
| 1942 | m->m_data += max_linkhdr; |
| 1943 | m->m_len = hdrlen; |
| 1944 | } |
		/* make sure we still have data left to be sent at this point */
| 1946 | if (so->so_snd.sb_mb == NULL || off < 0) { |
| 1947 | if (m != NULL) m_freem(m); |
| 1948 | error = 0; /* should we return an error? */ |
| 1949 | goto out; |
| 1950 | } |
| 1951 | m_copydata(so->so_snd.sb_mb, off, (int) len, |
| 1952 | mtod(m, caddr_t) + hdrlen); |
| 1953 | m->m_len += len; |
| 1954 | } else { |
| 1955 | uint32_t copymode; |
| 1956 | /* |
| 1957 | * Retain packet header metadata at the socket |
			 * buffer if this is an MPTCP subflow,
| 1959 | * otherwise move it. |
| 1960 | */ |
| 1961 | copymode = M_COPYM_MOVE_HDR; |
| 1962 | #if MPTCP |
| 1963 | if (so->so_flags & SOF_MP_SUBFLOW) { |
| 1964 | copymode = M_COPYM_NOOP_HDR; |
| 1965 | } |
| 1966 | #endif /* MPTCP */ |
| 1967 | if (m != NULL) { |
| 1968 | m->m_next = m_copym_mode(so->so_snd.sb_mb, |
| 1969 | off, (int)len, M_DONTWAIT, copymode); |
| 1970 | if (m->m_next == NULL) { |
| 1971 | (void) m_free(m); |
| 1972 | error = ENOBUFS; |
| 1973 | goto out; |
| 1974 | } |
| 1975 | } else { |
| 1976 | /* |
| 1977 | * make sure we still have data left |
| 1978 | * to be sent at this point |
| 1979 | */ |
| 1980 | if (so->so_snd.sb_mb == NULL) { |
| 1981 | error = 0; /* should we return an error? */ |
| 1982 | goto out; |
| 1983 | } |
| 1984 | |
| 1985 | /* |
| 1986 | * m_copym_with_hdrs will always return the |
| 1987 | * last mbuf pointer and the offset into it that |
				 * it acted on to fulfill the current request,
| 1989 | * whether a valid 'hint' was passed in or not. |
| 1990 | */ |
| 1991 | if ((m = m_copym_with_hdrs(so->so_snd.sb_mb, |
| 1992 | off, len, M_DONTWAIT, NULL, NULL, |
| 1993 | copymode)) == NULL) { |
| 1994 | error = ENOBUFS; |
| 1995 | goto out; |
| 1996 | } |
| 1997 | m->m_data += max_linkhdr; |
| 1998 | m->m_len = hdrlen; |
| 1999 | } |
| 2000 | } |
| 2001 | /* |
| 2002 | * If we're sending everything we've got, set PUSH. |
| 2003 | * (This will keep happy those implementations which only |
| 2004 | * give data to the user when a buffer fills or |
| 2005 | * a PUSH comes in.) |
| 2006 | * |
| 2007 | * On SYN-segments we should not add the PUSH-flag. |
| 2008 | */ |
| 2009 | if (off + len == so->so_snd.sb_cc && !(flags & TH_SYN)) |
| 2010 | flags |= TH_PUSH; |
| 2011 | } else { |
| 2012 | if (tp->t_flags & TF_ACKNOW) |
| 2013 | tcpstat.tcps_sndacks++; |
| 2014 | else if (flags & (TH_SYN|TH_FIN|TH_RST)) |
| 2015 | tcpstat.tcps_sndctrl++; |
| 2016 | else if (SEQ_GT(tp->snd_up, tp->snd_una)) |
| 2017 | tcpstat.tcps_sndurg++; |
| 2018 | else |
| 2019 | tcpstat.tcps_sndwinup++; |
| 2020 | |
| 2021 | MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */ |
| 2022 | if (m == NULL) { |
| 2023 | error = ENOBUFS; |
| 2024 | goto out; |
| 2025 | } |
| 2026 | if (MHLEN < (hdrlen + max_linkhdr)) { |
| 2027 | MCLGET(m, M_DONTWAIT); |
| 2028 | if ((m->m_flags & M_EXT) == 0) { |
| 2029 | m_freem(m); |
| 2030 | error = ENOBUFS; |
| 2031 | goto out; |
| 2032 | } |
| 2033 | } |
| 2034 | m->m_data += max_linkhdr; |
| 2035 | m->m_len = hdrlen; |
| 2036 | } |
| 2037 | m->m_pkthdr.rcvif = 0; |
| 2038 | #if CONFIG_MACF_NET |
| 2039 | mac_mbuf_label_associate_inpcb(inp, m); |
| 2040 | #endif |
| 2041 | #if INET6 |
| 2042 | if (isipv6) { |
| 2043 | ip6 = mtod(m, struct ip6_hdr *); |
| 2044 | th = (struct tcphdr *)(void *)(ip6 + 1); |
| 2045 | tcp_fillheaders(tp, ip6, th); |
| 2046 | if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len && |
| 2047 | !SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) { |
| 2048 | ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); |
| 2049 | } |
| 2050 | svc_flags |= PKT_SCF_IPV6; |
| 2051 | #if PF_ECN |
| 2052 | m_pftag(m)->pftag_hdr = (void *)ip6; |
| 2053 | m_pftag(m)->pftag_flags |= PF_TAG_HDR_INET6; |
| 2054 | #endif /* PF_ECN */ |
| 2055 | } else |
| 2056 | #endif /* INET6 */ |
| 2057 | { |
| 2058 | ip = mtod(m, struct ip *); |
| 2059 | ipov = (struct ipovly *)ip; |
| 2060 | th = (struct tcphdr *)(void *)(ip + 1); |
| 2061 | /* this picks up the pseudo header (w/o the length) */ |
| 2062 | tcp_fillheaders(tp, ip, th); |
| 2063 | if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len && |
| 2064 | !SEQ_LT(tp->snd_nxt, tp->snd_max) && |
| 2065 | !sack_rxmit && !(flags & TH_SYN)) { |
| 2066 | ip->ip_tos |= IPTOS_ECN_ECT0; |
| 2067 | } |
| 2068 | #if PF_ECN |
| 2069 | m_pftag(m)->pftag_hdr = (void *)ip; |
| 2070 | m_pftag(m)->pftag_flags |= PF_TAG_HDR_INET; |
| 2071 | #endif /* PF_ECN */ |
| 2072 | } |
| 2073 | |
| 2074 | /* |
| 2075 | * Fill in fields, remembering maximum advertised |
| 2076 | * window for use in delaying messages about window sizes. |
| 2077 | * If resending a FIN, be sure not to use a new sequence number. |
| 2078 | */ |
| 2079 | if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) && |
| 2080 | tp->snd_nxt == tp->snd_max) |
| 2081 | tp->snd_nxt--; |
| 2082 | /* |
| 2083 | * If we are doing retransmissions, then snd_nxt will |
| 2084 | * not reflect the first unsent octet. For ACK only |
| 2085 | * packets, we do not want the sequence number of the |
| 2086 | * retransmitted packet, we want the sequence number |
| 2087 | * of the next unsent octet. So, if there is no data |
| 2088 | * (and no SYN or FIN), use snd_max instead of snd_nxt |
| 2089 | * when filling in ti_seq. But if we are in persist |
| 2090 | * state, snd_max might reflect one byte beyond the |
| 2091 | * right edge of the window, so use snd_nxt in that |
| 2092 | * case, since we know we aren't doing a retransmission. |
| 2093 | * (retransmit and persist are mutually exclusive...) |
| 2094 | * |
| 2095 | * Note the state of this retransmit segment to detect spurious |
| 2096 | * retransmissions. |
| 2097 | */ |
| 2098 | if (sack_rxmit == 0) { |
| 2099 | if (len || (flags & (TH_SYN|TH_FIN)) || |
| 2100 | tp->t_timer[TCPT_PERSIST]) { |
| 2101 | th->th_seq = htonl(tp->snd_nxt); |
| 2102 | if (len > 0) { |
| 2103 | m->m_pkthdr.tx_start_seq = tp->snd_nxt; |
| 2104 | m->m_pkthdr.pkt_flags |= PKTF_START_SEQ; |
| 2105 | } |
| 2106 | if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { |
| 2107 | if (SACK_ENABLED(tp) && len > 1) { |
| 2108 | tcp_rxtseg_insert(tp, tp->snd_nxt, |
| 2109 | (tp->snd_nxt + len - 1)); |
| 2110 | } |
| 2111 | if (len > 0) |
| 2112 | m->m_pkthdr.pkt_flags |= |
| 2113 | PKTF_TCP_REXMT; |
| 2114 | } |
| 2115 | } else { |
| 2116 | th->th_seq = htonl(tp->snd_max); |
| 2117 | } |
| 2118 | } else { |
| 2119 | th->th_seq = htonl(p->rxmit); |
| 2120 | if (len > 0) { |
| 2121 | m->m_pkthdr.pkt_flags |= |
| 2122 | (PKTF_TCP_REXMT | PKTF_START_SEQ); |
| 2123 | m->m_pkthdr.tx_start_seq = p->rxmit; |
| 2124 | } |
| 2125 | tcp_rxtseg_insert(tp, p->rxmit, (p->rxmit + len - 1)); |
| 2126 | p->rxmit += len; |
| 2127 | tp->sackhint.sack_bytes_rexmit += len; |
| 2128 | } |
| 2129 | th->th_ack = htonl(tp->rcv_nxt); |
| 2130 | tp->last_ack_sent = tp->rcv_nxt; |
| 2131 | if (optlen) { |
| 2132 | bcopy(opt, th + 1, optlen); |
| 2133 | th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; |
| 2134 | } |
| 2135 | th->th_flags = flags; |
| 2136 | th->th_win = htons((u_short) (recwin>>tp->rcv_scale)); |
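	/*
	 * e.g. a 262144-byte receive window with rcv_scale 3 is
	 * advertised on the wire as 32768; the peer scales it back
	 * up by rcv_scale on receipt.
	 */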
| 2137 | if (recwin > 0 && SEQ_LT(tp->rcv_adv, tp->rcv_nxt + recwin)) |
| 2138 | tp->rcv_adv = tp->rcv_nxt + recwin; |
| 2139 | |
| 2140 | /* |
| 2141 | * Adjust the RXWIN0SENT flag - indicate that we have advertised |
| 2142 | * a 0 window. This may cause the remote transmitter to stall. This |
| 2143 | * flag tells soreceive() to disable delayed acknowledgements when |
| 2144 | * draining the buffer. This can occur if the receiver is attempting |
	 * to read more data than can be buffered prior to transmitting on
| 2146 | * the connection. |
| 2147 | */ |
| 2148 | if (th->th_win == 0) |
| 2149 | tp->t_flags |= TF_RXWIN0SENT; |
| 2150 | else |
| 2151 | tp->t_flags &= ~TF_RXWIN0SENT; |
| 2152 | if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { |
| 2153 | th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); |
| 2154 | th->th_flags |= TH_URG; |
| 2155 | } else { |
| 2156 | /* |
| 2157 | * If no urgent pointer to send, then we pull |
| 2158 | * the urgent pointer to the left edge of the send window |
| 2159 | * so that it doesn't drift into the send window on sequence |
| 2160 | * number wraparound. |
| 2161 | */ |
| 2162 | tp->snd_up = tp->snd_una; /* drag it along */ |
| 2163 | } |
| 2164 | |
| 2165 | /* |
| 2166 | * Put TCP length in extended header, and then |
| 2167 | * checksum extended header and data. |
| 2168 | */ |
| 2169 | m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ |
| 2170 | |
| 2171 | /* |
| 2172 | * If this is potentially the last packet on the stream, then mark |
| 2173 | * it in order to enable some optimizations in the underlying |
| 2174 | * layers |
| 2175 | */ |
| 2176 | if (tp->t_state != TCPS_ESTABLISHED && |
| 2177 | (tp->t_state == TCPS_CLOSING || tp->t_state == TCPS_TIME_WAIT |
| 2178 | || tp->t_state == TCPS_LAST_ACK || (th->th_flags & TH_RST))) |
| 2179 | m->m_pkthdr.pkt_flags |= PKTF_LAST_PKT; |
| 2180 | |
| 2181 | #if INET6 |
| 2182 | if (isipv6) { |
| 2183 | /* |
| 2184 | * ip6_plen is not need to be filled now, and will be filled |
| 2185 | * in ip6_output. |
| 2186 | */ |
| 2187 | m->m_pkthdr.csum_flags = CSUM_TCPIPV6; |
| 2188 | m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); |
| 2189 | if (len + optlen) |
| 2190 | th->th_sum = in_addword(th->th_sum, |
| 2191 | htons((u_short)(optlen + len))); |
| 2192 | } |
| 2193 | else |
| 2194 | #endif /* INET6 */ |
| 2195 | { |
| 2196 | m->m_pkthdr.csum_flags = CSUM_TCP; |
| 2197 | m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); |
| 2198 | if (len + optlen) |
| 2199 | th->th_sum = in_addword(th->th_sum, |
| 2200 | htons((u_short)(optlen + len))); |
| 2201 | } |
| 2202 | |
| 2203 | /* |
| 2204 | * Enable TSO and specify the size of the segments. |
| 2205 | * The TCP pseudo header checksum is always provided. |
| 2206 | */ |
| 2207 | if (tso) { |
| 2208 | #if INET6 |
| 2209 | if (isipv6) |
| 2210 | m->m_pkthdr.csum_flags |= CSUM_TSO_IPV6; |
| 2211 | else |
| 2212 | #endif /* INET6 */ |
| 2213 | m->m_pkthdr.csum_flags |= CSUM_TSO_IPV4; |
| 2214 | |
| 2215 | m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; |
| 2216 | } else { |
| 2217 | m->m_pkthdr.tso_segsz = 0; |
| 2218 | } |
| 2219 | |
| 2220 | /* |
| 2221 | * In transmit state, time the transmission and arrange for |
| 2222 | * the retransmit. In persist state, just set snd_max. |
| 2223 | */ |
| 2224 | if (!(tp->t_flagsext & TF_FORCE) |
| 2225 | || tp->t_timer[TCPT_PERSIST] == 0) { |
| 2226 | tcp_seq startseq = tp->snd_nxt; |
| 2227 | |
| 2228 | /* |
| 2229 | * Advance snd_nxt over sequence space of this segment. |
| 2230 | */ |
| 2231 | if (flags & (TH_SYN|TH_FIN)) { |
| 2232 | if (flags & TH_SYN) |
| 2233 | tp->snd_nxt++; |
| 2234 | if ((flags & TH_FIN) && |
| 2235 | !(tp->t_flags & TF_SENTFIN)) { |
| 2236 | tp->snd_nxt++; |
| 2237 | tp->t_flags |= TF_SENTFIN; |
| 2238 | } |
| 2239 | } |
| 2240 | if (sack_rxmit) |
| 2241 | goto timer; |
| 2242 | if (sack_rescue_rxt == TRUE) { |
| 2243 | tp->snd_nxt = old_snd_nxt; |
| 2244 | sack_rescue_rxt = FALSE; |
| 2245 | tcpstat.tcps_pto_in_recovery++; |
| 2246 | } else { |
| 2247 | tp->snd_nxt += len; |
| 2248 | } |
| 2249 | if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { |
| 2250 | tp->snd_max = tp->snd_nxt; |
| 2251 | tp->t_sndtime = tcp_now; |
| 2252 | /* |
| 2253 | * Time this transmission if not a retransmission and |
| 2254 | * not currently timing anything. |
| 2255 | */ |
| 2256 | if (tp->t_rtttime == 0) { |
| 2257 | tp->t_rtttime = tcp_now; |
| 2258 | tp->t_rtseq = startseq; |
| 2259 | tcpstat.tcps_segstimed++; |
| 2260 | |
| 2261 | /* update variables related to pipe ack */ |
| 2262 | tp->t_pipeack_lastuna = tp->snd_una; |
| 2263 | } |
| 2264 | } |
| 2265 | |
| 2266 | /* |
| 2267 | * Set retransmit timer if not currently set, |
| 2268 | * and not doing an ack or a keep-alive probe. |
| 2269 | */ |
| 2270 | timer: |
| 2271 | if (tp->t_timer[TCPT_REXMT] == 0 && |
| 2272 | ((sack_rxmit && tp->snd_nxt != tp->snd_max) || |
| 2273 | tp->snd_nxt != tp->snd_una || (flags & TH_FIN))) { |
| 2274 | if (tp->t_timer[TCPT_PERSIST]) { |
| 2275 | tp->t_timer[TCPT_PERSIST] = 0; |
| 2276 | tp->t_persist_stop = 0; |
| 2277 | TCP_RESET_REXMT_STATE(tp); |
| 2278 | } |
| 2279 | tp->t_timer[TCPT_REXMT] = |
| 2280 | OFFSET_FROM_START(tp, tp->t_rxtcur); |
| 2281 | } |
| 2282 | |
| 2283 | /* |
| 2284 | * Set tail loss probe timeout if new data is being |
| 2285 | * transmitted. This will be supported only when |
| 2286 | * SACK option is enabled on a connection. |
| 2287 | * |
| 2288 | * Every time new data is sent PTO will get reset. |
| 2289 | */ |
| 2290 | if (tcp_enable_tlp && len != 0 && tp->t_state == TCPS_ESTABLISHED && |
| 2291 | SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) && |
| 2292 | tp->snd_nxt == tp->snd_max && |
| 2293 | SEQ_GT(tp->snd_nxt, tp->snd_una) && |
| 2294 | tp->t_rxtshift == 0 && |
| 2295 | (tp->t_flagsext & (TF_SENT_TLPROBE|TF_PKTS_REORDERED)) == 0) { |
| 2296 | u_int32_t pto, srtt; |
| 2297 | |
| 2298 | /* |
| 2299 | * Using SRTT alone to set PTO can cause spurious |
| 2300 | * retransmissions on wireless networks where there |
| 2301 | * is a lot of variance in RTT. Taking variance |
| 2302 | * into account will avoid this. |
| 2303 | */ |
| 2304 | srtt = tp->t_srtt >> TCP_RTT_SHIFT; |
| 2305 | pto = ((TCP_REXMTVAL(tp)) * 3) >> 1; |
			pto = max(2 * srtt, pto);
| 2307 | if ((tp->snd_max - tp->snd_una) == tp->t_maxseg) |
| 2308 | pto = max(pto, |
| 2309 | (((3 * pto) >> 2) + tcp_delack * 2)); |
| 2310 | else |
| 2311 | pto = max(10, pto); |
| 2312 | |
| 2313 | /* if RTO is less than PTO, choose RTO instead */ |
| 2314 | if (tp->t_rxtcur < pto) |
| 2315 | pto = tp->t_rxtcur; |
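			/*
			 * Illustrative numbers (assuming millisecond
			 * ticks): with a smoothed RTT of 100ms and
			 * TCP_REXMTVAL of 200ms, pto starts as 300ms and
			 * max(2 * srtt, pto) leaves it at 300ms; with
			 * exactly one segment outstanding the delayed-ack
			 * term can raise it further, and the clamp above
			 * never lets it exceed the RTO.
			 */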
| 2316 | |
| 2317 | tp->t_timer[TCPT_PTO] = OFFSET_FROM_START(tp, pto); |
| 2318 | } |
| 2319 | } else { |
| 2320 | /* |
| 2321 | * Persist case, update snd_max but since we are in |
| 2322 | * persist mode (no window) we do not update snd_nxt. |
| 2323 | */ |
| 2324 | int xlen = len; |
| 2325 | if (flags & TH_SYN) |
| 2326 | ++xlen; |
| 2327 | if ((flags & TH_FIN) && |
| 2328 | !(tp->t_flags & TF_SENTFIN)) { |
| 2329 | ++xlen; |
| 2330 | tp->t_flags |= TF_SENTFIN; |
| 2331 | } |
| 2332 | if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { |
| 2333 | tp->snd_max = tp->snd_nxt + len; |
| 2334 | tp->t_sndtime = tcp_now; |
| 2335 | } |
| 2336 | } |
| 2337 | |
| 2338 | #if TCPDEBUG |
| 2339 | /* |
| 2340 | * Trace. |
| 2341 | */ |
| 2342 | if (so_options & SO_DEBUG) |
| 2343 | tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); |
| 2344 | #endif |
| 2345 | |
| 2346 | /* |
| 2347 | * Fill in IP length and desired time to live and |
| 2348 | * send to IP level. There should be a better way |
| 2349 | * to handle ttl and tos; we could keep them in |
| 2350 | * the template, but need a way to checksum without them. |
| 2351 | */ |
| 2352 | #if INET6 |
| 2353 | /* |
| 2354 | * m->m_pkthdr.len should have been set before cksum calcuration, |
| 2355 | * because in6_cksum() need it. |
| 2356 | */ |
| 2357 | if (isipv6) { |
| 2358 | /* |
| 2359 | * we separately set hoplimit for every segment, since the |
| 2360 | * user might want to change the value via setsockopt. |
| 2361 | * Also, desired default hop limit might be changed via |
| 2362 | * Neighbor Discovery. |
| 2363 | */ |
| 2364 | ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ? |
| 2365 | inp->in6p_route.ro_rt->rt_ifp : NULL); |
| 2366 | |
| 2367 | /* TODO: IPv6 IP6TOS_ECT bit on */ |
| 2368 | KERNEL_DEBUG(DBG_LAYER_BEG, |
| 2369 | ((inp->inp_fport << 16) | inp->inp_lport), |
| 2370 | (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | |
| 2371 | (inp->in6p_faddr.s6_addr16[0] & 0xffff)), |
| 2372 | sendalot,0,0); |
| 2373 | } else |
| 2374 | #endif /* INET6 */ |
| 2375 | { |
| 2376 | ip->ip_len = m->m_pkthdr.len; |
| 2377 | ip->ip_ttl = inp->inp_ip_ttl; /* XXX */ |
| 2378 | ip->ip_tos |= (inp->inp_ip_tos & ~IPTOS_ECN_MASK);/* XXX */ |
| 2379 | KERNEL_DEBUG(DBG_LAYER_BEG, |
| 2380 | ((inp->inp_fport << 16) | inp->inp_lport), |
| 2381 | (((inp->inp_laddr.s_addr & 0xffff) << 16) | |
| 2382 | (inp->inp_faddr.s_addr & 0xffff)), 0,0,0); |
| 2383 | } |
| 2384 | |
| 2385 | /* |
| 2386 | * See if we should do MTU discovery. |
| 2387 | * Look at the flag updated on the following criterias: |
| 2388 | * 1) Path MTU discovery is authorized by the sysctl |
| 2389 | * 2) The route isn't set yet (unlikely but could happen) |
| 2390 | * 3) The route is up |
| 2391 | * 4) the MTU is not locked (if it is, then discovery has been |
| 2392 | * disabled for that route) |
| 2393 | */ |
| 2394 | #if INET6 |
| 2395 | if (!isipv6) |
| 2396 | #endif /* INET6 */ |
| 2397 | if (path_mtu_discovery && (tp->t_flags & TF_PMTUD)) |
| 2398 | ip->ip_off |= IP_DF; |
| 2399 | |
| 2400 | #if NECP |
| 2401 | { |
| 2402 | necp_kernel_policy_id policy_id; |
| 2403 | necp_kernel_policy_id skip_policy_id; |
| 2404 | u_int32_t route_rule_id; |
| 2405 | if (!necp_socket_is_allowed_to_send_recv(inp, &policy_id, &route_rule_id, &skip_policy_id)) { |
| 2406 | m_freem(m); |
| 2407 | error = EHOSTUNREACH; |
| 2408 | goto out; |
| 2409 | } |
| 2410 | necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id, skip_policy_id); |
| 2411 | |
| 2412 | if (net_qos_policy_restricted != 0) { |
| 2413 | necp_socket_update_qos_marking(inp, inp->inp_route.ro_rt, |
| 2414 | NULL, route_rule_id); |
| 2415 | } |
| 2416 | } |
| 2417 | #endif /* NECP */ |
| 2418 | |
| 2419 | #if IPSEC |
| 2420 | if (inp->inp_sp != NULL) |
| 2421 | ipsec_setsocket(m, so); |
| 2422 | #endif /*IPSEC*/ |
| 2423 | |
| 2424 | /* |
| 2425 | * The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active. |
| 2426 | */ |
| 2427 | lost = 0; |
| 2428 | |
| 2429 | /* |
| 2430 | * Embed the flow hash in pkt hdr and mark the packet as |
| 2431 | * capable of flow controlling |
| 2432 | */ |
| 2433 | m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB; |
| 2434 | m->m_pkthdr.pkt_flowid = inp->inp_flowhash; |
| 2435 | m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | PKTF_FLOW_ADV); |
| 2436 | m->m_pkthdr.pkt_proto = IPPROTO_TCP; |
| 2437 | m->m_pkthdr.tx_tcp_pid = so->last_pid; |
| 2438 | if (so->so_flags & SOF_DELEGATED) |
| 2439 | m->m_pkthdr.tx_tcp_e_pid = so->e_pid; |
| 2440 | else |
| 2441 | m->m_pkthdr.tx_tcp_e_pid = 0; |
| 2442 | |
| 2443 | m->m_nextpkt = NULL; |
| 2444 | |
| 2445 | if (inp->inp_last_outifp != NULL && |
| 2446 | !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) { |
		/*
		 * Hint to prioritize this packet if
		 * 1. the packet has no data,
		 * 2. the interface supports the transmit-start model
		 *    and did not disable ACK prioritization,
		 * 3. only the ACK flag is set, and
		 * 4. there is no outstanding data on this connection.
		 */
| 2454 | if (tcp_prioritize_acks != 0 && len == 0 && |
| 2455 | (inp->inp_last_outifp->if_eflags & |
| 2456 | (IFEF_TXSTART | IFEF_NOACKPRI)) == IFEF_TXSTART) { |
| 2457 | if (th->th_flags == TH_ACK && |
| 2458 | tp->snd_una == tp->snd_max && |
| 2459 | tp->t_timer[TCPT_REXMT] == 0) |
| 2460 | svc_flags |= PKT_SCF_TCP_ACK; |
| 2461 | if (th->th_flags & TH_SYN) |
| 2462 | svc_flags |= PKT_SCF_TCP_SYN; |
| 2463 | } |
| 2464 | set_packet_service_class(m, so, sotc, svc_flags); |
| 2465 | } else { |
| 2466 | /* |
| 2467 | * Optimization for loopback just set the mbuf |
| 2468 | * service class |
| 2469 | */ |
| 2470 | (void) m_set_service_class(m, so_tc2msc(sotc)); |
| 2471 | } |
| 2472 | |
| 2473 | tp->t_pktlist_sentlen += len; |
| 2474 | tp->t_lastchain++; |
| 2475 | |
| 2476 | #if INET6 |
| 2477 | if (isipv6) { |
| 2478 | DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, inp, |
| 2479 | struct ip6 *, ip6, struct tcpcb *, tp, struct tcphdr *, |
| 2480 | th); |
| 2481 | } else |
| 2482 | #endif /* INET6 */ |
| 2483 | { |
| 2484 | DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, inp, |
| 2485 | struct ip *, ip, struct tcpcb *, tp, struct tcphdr *, th); |
| 2486 | } |
| 2487 | |
| 2488 | if (tp->t_pktlist_head != NULL) { |
| 2489 | tp->t_pktlist_tail->m_nextpkt = m; |
| 2490 | tp->t_pktlist_tail = m; |
| 2491 | } else { |
| 2492 | packchain_newlist++; |
| 2493 | tp->t_pktlist_head = tp->t_pktlist_tail = m; |
| 2494 | } |
| 2495 | |
| 2496 | if ((lro_ackmore) && (!sackoptlen) && (!tp->t_timer[TCPT_PERSIST]) && |
| 2497 | ((th->th_flags & TH_ACK) == TH_ACK) && (!len) && |
| 2498 | (tp->t_state == TCPS_ESTABLISHED)) { |
| 2499 | /* For a pure ACK, see if you need to send more of them */ |
| 2500 | mnext = tcp_send_lroacks(tp, m, th); |
| 2501 | if (mnext) { |
| 2502 | tp->t_pktlist_tail->m_nextpkt = mnext; |
| 2503 | if (mnext->m_nextpkt == NULL) { |
| 2504 | tp->t_pktlist_tail = mnext; |
| 2505 | tp->t_lastchain++; |
| 2506 | } else { |
| 2507 | struct mbuf *tail, *next; |
| 2508 | next = mnext->m_nextpkt; |
| 2509 | tail = next->m_nextpkt; |
| 2510 | while (tail) { |
| 2511 | next = tail; |
| 2512 | tail = tail->m_nextpkt; |
| 2513 | tp->t_lastchain++; |
| 2514 | } |
| 2515 | tp->t_pktlist_tail = next; |
| 2516 | } |
| 2517 | } |
| 2518 | } |
| 2519 | |
| 2520 | if (sendalot == 0 || (tp->t_state != TCPS_ESTABLISHED) || |
| 2521 | (tp->snd_cwnd <= (tp->snd_wnd / 8)) || |
| 2522 | (tp->t_flags & TF_ACKNOW) || |
| 2523 | (tp->t_flagsext & TF_FORCE) || |
| 2524 | tp->t_lastchain >= tcp_packet_chaining) { |
| 2525 | error = 0; |
| 2526 | while (inp->inp_sndinprog_cnt == 0 && |
| 2527 | tp->t_pktlist_head != NULL) { |
| 2528 | packetlist = tp->t_pktlist_head; |
| 2529 | packchain_listadd = tp->t_lastchain; |
| 2530 | packchain_sent++; |
| 2531 | lost = tp->t_pktlist_sentlen; |
| 2532 | TCP_PKTLIST_CLEAR(tp); |
| 2533 | |
| 2534 | error = tcp_ip_output(so, tp, packetlist, |
| 2535 | packchain_listadd, tp_inp_options, |
| 2536 | (so_options & SO_DONTROUTE), |
| 2537 | (sack_rxmit || (sack_bytes_rxmt != 0)), isipv6); |
| 2538 | if (error) { |
| 2539 | /* |
| 2540 | * Take into account the rest of unsent |
| 2541 | * packets in the packet list for this tcp |
| 2542 | * into "lost", since we're about to free |
| 2543 | * the whole list below. |
| 2544 | */ |
| 2545 | lost += tp->t_pktlist_sentlen; |
| 2546 | break; |
| 2547 | } else { |
| 2548 | lost = 0; |
| 2549 | } |
| 2550 | } |
| 2551 | /* tcp was closed while we were in ip; resume close */ |
| 2552 | if (inp->inp_sndinprog_cnt == 0 && |
| 2553 | (tp->t_flags & TF_CLOSING)) { |
| 2554 | tp->t_flags &= ~TF_CLOSING; |
| 2555 | (void) tcp_close(tp); |
| 2556 | return (0); |
| 2557 | } |
| 2558 | } else { |
| 2559 | error = 0; |
| 2560 | packchain_looped++; |
| 2561 | tcpstat.tcps_sndtotal++; |
| 2562 | |
| 2563 | goto again; |
| 2564 | } |
| 2565 | if (error) { |
| 2566 | /* |
| 2567 | * Assume that the packets were lost, so back out the |
| 2568 | * sequence number advance, if any. Note that the "lost" |
| 2569 | * variable represents the amount of user data sent during |
| 2570 | * the recent call to ip_output_list() plus the amount of |
| 2571 | * user data in the packet list for this tcp at the moment. |
| 2572 | */ |
| 2573 | if (!(tp->t_flagsext & TF_FORCE) |
| 2574 | || tp->t_timer[TCPT_PERSIST] == 0) { |
| 2575 | /* |
| 2576 | * No need to check for TH_FIN here because |
| 2577 | * the TF_SENTFIN flag handles that case. |
| 2578 | */ |
| 2579 | if ((flags & TH_SYN) == 0) { |
| 2580 | if (sack_rxmit) { |
| 2581 | if (SEQ_GT((p->rxmit - lost), |
| 2582 | tp->snd_una)) { |
| 2583 | p->rxmit -= lost; |
| 2584 | } else { |
| 2585 | lost = p->rxmit - tp->snd_una; |
| 2586 | p->rxmit = tp->snd_una; |
| 2587 | } |
| 2588 | tp->sackhint.sack_bytes_rexmit -= lost; |
| 2589 | } else { |
| 2590 | if (SEQ_GT((tp->snd_nxt - lost), |
| 2591 | tp->snd_una)) |
| 2592 | tp->snd_nxt -= lost; |
| 2593 | else |
| 2594 | tp->snd_nxt = tp->snd_una; |
| 2595 | } |
| 2596 | } |
| 2597 | } |
| 2598 | out: |
| 2599 | if (tp->t_pktlist_head != NULL) |
| 2600 | m_freem_list(tp->t_pktlist_head); |
| 2601 | TCP_PKTLIST_CLEAR(tp); |
| 2602 | |
| 2603 | if (error == ENOBUFS) { |
| 2604 | /* |
| 2605 | * Set retransmit timer if not currently set |
| 2606 | * when we failed to send a segment that can be |
| 2607 | * retransmitted (i.e. not pure ack or rst) |
| 2608 | */ |
| 2609 | if (!tp->t_timer[TCPT_REXMT] && |
| 2610 | !tp->t_timer[TCPT_PERSIST] && |
| 2611 | (len != 0 || (flags & (TH_SYN | TH_FIN)) != 0 || |
| 2612 | so->so_snd.sb_cc > 0)) |
| 2613 | tp->t_timer[TCPT_REXMT] = |
| 2614 | OFFSET_FROM_START(tp, tp->t_rxtcur); |
| 2615 | tp->snd_cwnd = tp->t_maxseg; |
| 2616 | tp->t_bytes_acked = 0; |
| 2617 | tcp_check_timer_state(tp); |
| 2618 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); |
| 2619 | |
| 2620 | tcp_ccdbg_trace(tp, NULL, TCP_CC_OUTPUT_ERROR); |
| 2621 | return (0); |
| 2622 | } |
| 2623 | if (error == EMSGSIZE) { |
| 2624 | /* |
| 2625 | * ip_output() will have already fixed the route |
| 2626 | * for us. tcp_mtudisc() will, as its last action, |
| 2627 | * initiate retransmission, so it is important to |
| 2628 | * not do so here. |
| 2629 | * |
		 * If TSO was active, we either got an interface
		 * without TSO capabilities or TSO was turned off.
		 * Disable it for this connection too and immediately
		 * retry with MSS-sized segments generated by this
		 * function.
| 2635 | */ |
| 2636 | if (tso) |
| 2637 | tp->t_flags &= ~TF_TSO; |
| 2638 | |
| 2639 | tcp_mtudisc(inp, 0); |
| 2640 | tcp_check_timer_state(tp); |
| 2641 | |
| 2642 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); |
		return (0);
| 2644 | } |
| 2645 | /* |
| 2646 | * Unless this is due to interface restriction policy, |
| 2647 | * treat EHOSTUNREACH/ENETDOWN as a soft error. |
| 2648 | */ |
| 2649 | if ((error == EHOSTUNREACH || error == ENETDOWN) && |
| 2650 | TCPS_HAVERCVDSYN(tp->t_state) && |
| 2651 | !inp_restricted_send(inp, inp->inp_last_outifp)) { |
| 2652 | tp->t_softerror = error; |
| 2653 | error = 0; |
| 2654 | } |
| 2655 | tcp_check_timer_state(tp); |
| 2656 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); |
| 2657 | return (error); |
| 2658 | } |
| 2659 | |
| 2660 | tcpstat.tcps_sndtotal++; |
| 2661 | |
| 2662 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,0,0,0,0,0); |
| 2663 | if (sendalot) |
| 2664 | goto again; |
| 2665 | |
| 2666 | tcp_check_timer_state(tp); |
| 2667 | return (0); |
| 2668 | } |
| 2669 | |
| 2670 | static int |
| 2671 | tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, |
| 2672 | int cnt, struct mbuf *opt, int flags, int sack_in_progress, boolean_t isipv6) |
| 2673 | { |
| 2674 | int error = 0; |
| 2675 | boolean_t chain; |
| 2676 | boolean_t unlocked = FALSE; |
| 2677 | boolean_t ifdenied = FALSE; |
| 2678 | struct inpcb *inp = tp->t_inpcb; |
| 2679 | struct ip_out_args ipoa; |
| 2680 | struct route ro; |
| 2681 | struct ifnet *outif = NULL; |
| 2682 | |
| 2683 | bzero(&ipoa, sizeof(ipoa)); |
| 2684 | ipoa.ipoa_boundif = IFSCOPE_NONE; |
| 2685 | ipoa.ipoa_flags = IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR; |
| 2686 | ipoa.ipoa_sotc = SO_TC_UNSPEC; |
| 2687 | ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC; |
| 2688 | #if INET6 |
| 2689 | struct ip6_out_args ip6oa; |
| 2690 | struct route_in6 ro6; |
| 2691 | |
| 2692 | bzero(&ip6oa, sizeof(ip6oa)); |
| 2693 | ip6oa.ip6oa_boundif = IFSCOPE_NONE; |
| 2694 | ip6oa.ip6oa_flags = IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR; |
| 2695 | ip6oa.ip6oa_sotc = SO_TC_UNSPEC; |
| 2696 | ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC; |
| 2697 | |
| 2698 | struct flowadv *adv = |
| 2699 | (isipv6 ? &ip6oa.ip6oa_flowadv : &ipoa.ipoa_flowadv); |
| 2700 | #else /* INET6 */ |
| 2701 | struct flowadv *adv = &ipoa.ipoa_flowadv; |
| 2702 | #endif /* !INET6 */ |
| 2703 | |
| 2704 | /* If socket was bound to an ifindex, tell ip_output about it */ |
| 2705 | if (inp->inp_flags & INP_BOUND_IF) { |
| 2706 | #if INET6 |
| 2707 | if (isipv6) { |
| 2708 | ip6oa.ip6oa_boundif = inp->inp_boundifp->if_index; |
| 2709 | ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; |
| 2710 | } else |
| 2711 | #endif /* INET6 */ |
| 2712 | { |
| 2713 | ipoa.ipoa_boundif = inp->inp_boundifp->if_index; |
| 2714 | ipoa.ipoa_flags |= IPOAF_BOUND_IF; |
| 2715 | } |
| 2716 | } |
| 2717 | |
| 2718 | if (INP_NO_CELLULAR(inp)) { |
| 2719 | #if INET6 |
| 2720 | if (isipv6) |
| 2721 | ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR; |
| 2722 | else |
| 2723 | #endif /* INET6 */ |
| 2724 | ipoa.ipoa_flags |= IPOAF_NO_CELLULAR; |
| 2725 | } |
| 2726 | if (INP_NO_EXPENSIVE(inp)) { |
| 2727 | #if INET6 |
| 2728 | if (isipv6) |
| 2729 | ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE; |
| 2730 | else |
| 2731 | #endif /* INET6 */ |
| 2732 | ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE; |
| 2733 | |
| 2734 | } |
| 2735 | if (INP_AWDL_UNRESTRICTED(inp)) { |
| 2736 | #if INET6 |
| 2737 | if (isipv6) |
| 2738 | ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED; |
| 2739 | else |
| 2740 | #endif /* INET6 */ |
| 2741 | ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED; |
| 2742 | |
| 2743 | } |
| 2744 | #if INET6 |
| 2745 | if (INP_INTCOPROC_ALLOWED(inp) && isipv6) { |
| 2746 | ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED; |
| 2747 | } |
| 2748 | if (isipv6) { |
| 2749 | ip6oa.ip6oa_sotc = so->so_traffic_class; |
| 2750 | ip6oa.ip6oa_netsvctype = so->so_netsvctype; |
| 2751 | } else |
| 2752 | #endif /* INET6 */ |
| 2753 | { |
| 2754 | ipoa.ipoa_sotc = so->so_traffic_class; |
| 2755 | ipoa.ipoa_netsvctype = so->so_netsvctype; |
| 2756 | } |
| 2757 | if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED)) { |
| 2758 | #if INET6 |
| 2759 | if (isipv6) |
| 2760 | ip6oa.ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED; |
| 2761 | else |
| 2762 | #endif /* INET6 */ |
| 2763 | ipoa.ipoa_flags |= IPOAF_QOSMARKING_ALLOWED; |
| 2764 | } |
| 2765 | #if INET6 |
| 2766 | if (isipv6) |
| 2767 | flags |= IPV6_OUTARGS; |
| 2768 | else |
| 2769 | #endif /* INET6 */ |
| 2770 | flags |= IP_OUTARGS; |
| 2771 | |
| 2772 | /* Copy the cached route and take an extra reference */ |
| 2773 | #if INET6 |
| 2774 | if (isipv6) |
| 2775 | in6p_route_copyout(inp, &ro6); |
| 2776 | else |
| 2777 | #endif /* INET6 */ |
| 2778 | inp_route_copyout(inp, &ro); |
| 2779 | |
| 2780 | /* |
| 2781 | * Make sure ACK/DELACK conditions are cleared before |
| 2782 | * we unlock the socket. |
| 2783 | */ |
| 2784 | tp->last_ack_sent = tp->rcv_nxt; |
| 2785 | tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); |
| 2786 | tp->t_timer[TCPT_DELACK] = 0; |
| 2787 | tp->t_unacksegs = 0; |
| 2788 | |
| 2789 | /* Increment the count of outstanding send operations */ |
| 2790 | inp->inp_sndinprog_cnt++; |
| 2791 | |
| 2792 | /* |
| 2793 | * If allowed, unlock TCP socket while in IP |
| 2794 | * but only if the connection is established and |
| 2795 | * in a normal mode where reentrancy on the tcpcb won't be |
| 2796 | * an issue: |
| 2797 | * - there is no SACK episode |
| 2798 | * - we're not in Fast Recovery mode |
	 * - we're not sending from an upcall.
| 2800 | */ |
| 2801 | if (tcp_output_unlocked && !so->so_upcallusecount && |
| 2802 | (tp->t_state == TCPS_ESTABLISHED) && (sack_in_progress == 0) && |
| 2803 | !IN_FASTRECOVERY(tp) && !(so->so_flags & SOF_MP_SUBFLOW)) { |
| 2804 | |
| 2805 | unlocked = TRUE; |
| 2806 | socket_unlock(so, 0); |
| 2807 | } |
| 2808 | |
| 2809 | /* |
| 2810 | * Don't send down a chain of packets when: |
| 2811 | * - TCP chaining is disabled |
| 2812 | * - there is an IPsec rule set |
| 2813 | * - there is a non default rule set for the firewall |
| 2814 | */ |
| 2815 | |
| 2816 | chain = tcp_packet_chaining > 1 |
| 2817 | #if IPSEC |
| 2818 | && ipsec_bypass |
| 2819 | #endif |
| 2820 | #if IPFIREWALL |
| 2821 | && (fw_enable == 0 || fw_bypass) |
| 2822 | #endif |
| 2823 | ; // I'm important, not extraneous |
| 2824 | |
| 2825 | |
| 2826 | while (pkt != NULL) { |
| 2827 | struct mbuf *npkt = pkt->m_nextpkt; |
| 2828 | |
| 2829 | if (!chain) { |
| 2830 | pkt->m_nextpkt = NULL; |
| 2831 | /* |
| 2832 | * If we are not chaining, make sure to set the packet |
| 2833 | * list count to 0 so that IP takes the right path; |
| 2834 | * this is important for cases such as IPSec where a |
| 2835 | * single mbuf might result in multiple mbufs as part |
| 2836 | * of the encapsulation. If a non-zero count is passed |
| 2837 | * down to IP, the head of the chain might change and |
| 2838 | * we could end up skipping it (thus generating bogus |
| 2839 | * packets). Fixing it in IP would be desirable, but |
| 2840 | * for now this would do it. |
| 2841 | */ |
| 2842 | cnt = 0; |
| 2843 | } |
| 2844 | #if INET6 |
| 2845 | if (isipv6) { |
| 2846 | error = ip6_output_list(pkt, cnt, |
| 2847 | inp->in6p_outputopts, &ro6, flags, NULL, NULL, |
| 2848 | &ip6oa); |
| 2849 | ifdenied = (ip6oa.ip6oa_retflags & IP6OARF_IFDENIED); |
| 2850 | } else { |
| 2851 | #endif /* INET6 */ |
| 2852 | error = ip_output_list(pkt, cnt, opt, &ro, flags, NULL, |
| 2853 | &ipoa); |
| 2854 | ifdenied = (ipoa.ipoa_retflags & IPOARF_IFDENIED); |
| 2855 | } |
| 2856 | |
| 2857 | if (chain || error) { |
| 2858 | /* |
| 2859 | * If we sent down a chain then we are done since |
| 2860 | * the callee had taken care of everything; else |
| 2861 | * we need to free the rest of the chain ourselves. |
| 2862 | */ |
| 2863 | if (!chain) |
| 2864 | m_freem_list(npkt); |
| 2865 | break; |
| 2866 | } |
| 2867 | pkt = npkt; |
| 2868 | } |
| 2869 | |
| 2870 | if (unlocked) |
| 2871 | socket_lock(so, 0); |
| 2872 | |
| 2873 | /* |
| 2874 | * Enter flow controlled state if the connection is established |
| 2875 | * and is not in recovery. Flow control is allowed only if there |
| 2876 | * is outstanding data. |
| 2877 | * |
| 2878 | * A connection will enter suspended state even if it is in |
| 2879 | * recovery. |
| 2880 | */ |
| 2881 | if (((adv->code == FADV_FLOW_CONTROLLED && !IN_FASTRECOVERY(tp)) || |
| 2882 | adv->code == FADV_SUSPENDED) && |
| 2883 | !(tp->t_flags & TF_CLOSING) && |
| 2884 | tp->t_state == TCPS_ESTABLISHED && |
| 2885 | SEQ_GT(tp->snd_max, tp->snd_una)) { |
| 2886 | int rc; |
| 2887 | rc = inp_set_fc_state(inp, adv->code); |
| 2888 | |
| 2889 | if (rc == 1) |
| 2890 | tcp_ccdbg_trace(tp, NULL, |
| 2891 | ((adv->code == FADV_FLOW_CONTROLLED) ? |
| 2892 | TCP_CC_FLOW_CONTROL : TCP_CC_SUSPEND)); |
| 2893 | } |
| 2894 | |
| 2895 | /* |
| 2896 | * When an interface queue gets suspended, some of the |
| 2897 | * packets are dropped. Return ENOBUFS, to update the |
| 2898 | * pcb state. |
| 2899 | */ |
| 2900 | if (adv->code == FADV_SUSPENDED) |
| 2901 | error = ENOBUFS; |
| 2902 | |
| 2903 | VERIFY(inp->inp_sndinprog_cnt > 0); |
	if (--inp->inp_sndinprog_cnt == 0)
| 2905 | inp->inp_flags &= ~(INP_FC_FEEDBACK); |
| 2906 | |
| 2907 | #if INET6 |
| 2908 | if (isipv6) { |
| 2909 | if (ro6.ro_rt != NULL) |
| 2910 | outif = ro6.ro_rt->rt_ifp; |
| 2911 | } else |
| 2912 | #endif /* INET6 */ |
| 2913 | if (ro.ro_rt != NULL) |
| 2914 | outif = ro.ro_rt->rt_ifp; |
| 2915 | |
| 2916 | if (outif != NULL && outif != inp->inp_last_outifp && |
| 2917 | so->so_snd.sb_cc > 0) { |
| 2918 | /* Update the send byte count */ |
| 2919 | if (so->so_snd.sb_flags & SB_SNDBYTE_CNT) { |
| 2920 | inp_decr_sndbytes_total(so, so->so_snd.sb_cc); |
| 2921 | inp_decr_sndbytes_allunsent(so, tp->snd_una); |
| 2922 | so->so_snd.sb_flags &= ~SB_SNDBYTE_CNT; |
| 2923 | } |
		inp->inp_last_outifp = outif;
	}
| 2927 | |
| 2928 | if (error != 0 && ifdenied && |
| 2929 | (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp))) |
| 2930 | soevent(so, |
| 2931 | (SO_FILT_HINT_LOCKED|SO_FILT_HINT_IFDENIED)); |
| 2932 | |
| 2933 | /* Synchronize cached PCB route & options */ |
| 2934 | #if INET6 |
| 2935 | if (isipv6) |
| 2936 | in6p_route_copyin(inp, &ro6); |
| 2937 | else |
| 2938 | #endif /* INET6 */ |
| 2939 | inp_route_copyin(inp, &ro); |
| 2940 | |
| 2941 | if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift == 0 && |
| 2942 | tp->t_inpcb->inp_route.ro_rt != NULL) { |
		/*
		 * If we found the route and there is an RTT on it,
		 * reset the retransmit timer.
		 */
| 2946 | tcp_getrt_rtt(tp, tp->t_inpcb->in6p_route.ro_rt); |
| 2947 | tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); |
| 2948 | } |
| 2949 | return (error); |
| 2950 | } |
| 2951 | |
| 2952 | int tcptv_persmin_val = TCPTV_PERSMIN; |
| 2953 | |
| 2954 | void |
| 2955 | tcp_setpersist(struct tcpcb *tp) |
| 2956 | { |
| 2957 | int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; |
| 2958 | |
	/*
	 * If a PERSIST_TIMER option was set, we will limit the time
	 * the persist timer is active for that connection in order
	 * to avoid a DoS through zero window probes; see
	 * rdar://5805356.
	 */
| 2964 | |
| 2965 | if ((tp->t_persist_timeout != 0) && |
| 2966 | (tp->t_timer[TCPT_PERSIST] == 0) && |
| 2967 | (tp->t_persist_stop == 0)) { |
| 2968 | tp->t_persist_stop = tcp_now + tp->t_persist_timeout; |
| 2969 | } |
| 2970 | |
| 2971 | /* |
| 2972 | * Start/restart persistance timer. |
| 2973 | */ |
| 2974 | TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], |
| 2975 | t * tcp_backoff[tp->t_rxtshift], |
| 2976 | tcptv_persmin_val, TCPTV_PERSMAX, 0); |
| 2977 | tp->t_timer[TCPT_PERSIST] = OFFSET_FROM_START(tp, tp->t_timer[TCPT_PERSIST]); |
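	/*
	 * e.g. with t == 500ms and t_rxtshift == 2 (tcp_backoff[2]
	 * == 4), the timer is armed for 2 seconds, subject to the
	 * [tcptv_persmin_val, TCPTV_PERSMAX] bounds.
	 */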
| 2978 | |
| 2979 | if (tp->t_rxtshift < TCP_MAXRXTSHIFT) |
| 2980 | tp->t_rxtshift++; |
| 2981 | } |
| 2982 | |
| 2983 | /* |
| 2984 | * Send as many acks as data coalesced. Every other packet when stretch |
| 2985 | * ACK is not enabled. Every 8 packets, if stretch ACK is enabled. |
| 2986 | */ |
| 2987 | static struct mbuf* |
| 2988 | tcp_send_lroacks(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th) |
| 2989 | { |
| 2990 | struct mbuf *mnext = NULL, *ack_chain = NULL, *tail = NULL; |
| 2991 | int count = 0; |
| 2992 | tcp_seq org_ack = ntohl(th->th_ack); |
| 2993 | tcp_seq prev_ack = 0; |
| 2994 | int tack_offset = 28; /* IPv6 and IP options not supported */ |
| 2995 | int twin_offset = 34; /* IPv6 and IP options not supported */ |
| 2996 | int ack_size = (tp->t_flags & TF_STRETCHACK) ? |
| 2997 | (maxseg_unacked * tp->t_maxseg) : (tp->t_maxseg << 1); |
| 2998 | int segs_acked = (tp->t_flags & TF_STRETCHACK) ? maxseg_unacked : 2; |
| 2999 | struct mbuf *prev_ack_pkt = NULL; |
| 3000 | struct socket *so = tp->t_inpcb->inp_socket; |
| 3001 | unsigned short winsz = ntohs(th->th_win); |
| 3002 | unsigned int scaled_win = winsz<<tp->rcv_scale; |
| 3003 | tcp_seq win_rtedge = org_ack + scaled_win; |
| 3004 | |
| 3005 | count = tp->t_lropktlen/tp->t_maxseg; |
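	/*
	 * count is the number of coalesced segments; the loop below
	 * walks backwards from the coalesced ACK number, emitting one
	 * duplicate ACK per ack_size bytes (every other segment
	 * normally, every maxseg_unacked segments with stretch ACK).
	 */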
| 3006 | |
| 3007 | prev_ack = (org_ack - tp->t_lropktlen) + ack_size; |
| 3008 | if (prev_ack < org_ack) { |
| 3009 | ack_chain = m_dup(m, M_DONTWAIT); |
| 3010 | if (ack_chain) { |
| 3011 | th->th_ack = htonl(prev_ack); |
| 3012 | /* Keep adv window constant for duplicated ACK packets */ |
| 3013 | scaled_win = win_rtedge - prev_ack; |
| 3014 | if (scaled_win > (int32_t)(TCP_MAXWIN << tp->rcv_scale)) |
| 3015 | scaled_win = (int32_t)(TCP_MAXWIN << tp->rcv_scale); |
| 3016 | th->th_win = htons(scaled_win>>tp->rcv_scale); |
| 3017 | if (lrodebug == 5) { |
| 3018 | printf("%s: win = %d winsz = %d sc = %d" |
| 3019 | " lro_len %d %d\n" , |
| 3020 | __func__, scaled_win>>tp->rcv_scale, winsz, |
| 3021 | tp->rcv_scale, tp->t_lropktlen, count); |
| 3022 | } |
| 3023 | tail = ack_chain; |
| 3024 | count -= segs_acked; /* accounts for prev_ack packet */ |
| 3025 | count = (count <= segs_acked) ? 0 : count - segs_acked; |
| 3026 | tcpstat.tcps_sndacks++; |
| 3027 | so_tc_update_stats(m, so, m_get_service_class(m)); |
| 3028 | } else { |
| 3029 | return NULL; |
| 3030 | } |
| 3031 | } |
| 3032 | else { |
| 3033 | tp->t_lropktlen = 0; |
| 3034 | return NULL; |
| 3035 | } |
| 3036 | |
| 3037 | prev_ack_pkt = ack_chain; |
| 3038 | |
| 3039 | while (count > 0) { |
| 3040 | if ((prev_ack + ack_size) < org_ack) { |
| 3041 | prev_ack += ack_size; |
| 3042 | } else { |
| 3043 | /* |
| 3044 | * The last ACK sent must have the ACK number that TCP |
| 3045 | * thinks is the last sent ACK number. |
| 3046 | */ |
| 3047 | prev_ack = org_ack; |
| 3048 | } |
| 3049 | mnext = m_dup(prev_ack_pkt, M_DONTWAIT); |
| 3050 | if (mnext) { |
| 3051 | /* Keep adv window constant for duplicated ACK packets */ |
| 3052 | scaled_win = win_rtedge - prev_ack; |
| 3053 | if (scaled_win > (int32_t)(TCP_MAXWIN << tp->rcv_scale)) |
| 3054 | scaled_win = (int32_t)(TCP_MAXWIN << tp->rcv_scale); |
| 3055 | winsz = htons(scaled_win>>tp->rcv_scale); |
| 3056 | if (lrodebug == 5) { |
| 3057 | printf("%s: winsz = %d ack %x count %d\n" , |
| 3058 | __func__, scaled_win>>tp->rcv_scale, |
| 3059 | prev_ack, count); |
| 3060 | } |
| 3061 | bcopy(&winsz, mtod(prev_ack_pkt, caddr_t) + twin_offset, 2); |
| 3062 | HTONL(prev_ack); |
| 3063 | bcopy(&prev_ack, mtod(prev_ack_pkt, caddr_t) + tack_offset, 4); |
| 3064 | NTOHL(prev_ack); |
| 3065 | tail->m_nextpkt = mnext; |
| 3066 | tail = mnext; |
| 3067 | count -= segs_acked; |
| 3068 | tcpstat.tcps_sndacks++; |
| 3069 | so_tc_update_stats(m, so, m_get_service_class(m)); |
| 3070 | } else { |
| 3071 | if (lrodebug == 5) { |
| 3072 | printf("%s: failed to alloc mbuf.\n" , __func__); |
| 3073 | } |
| 3074 | break; |
| 3075 | } |
| 3076 | prev_ack_pkt = mnext; |
| 3077 | } |
| 3078 | tp->t_lropktlen = 0; |
| 3079 | return ack_chain; |
| 3080 | } |
| 3081 | |
| 3082 | static int |
tcp_recv_throttle(struct tcpcb *tp)
| 3084 | { |
| 3085 | uint32_t base_rtt, newsize; |
| 3086 | struct sockbuf *sbrcv = &tp->t_inpcb->inp_socket->so_rcv; |
| 3087 | |
| 3088 | if (tcp_use_rtt_recvbg == 1 && |
| 3089 | TSTMP_SUPPORTED(tp)) { |
| 3090 | /* |
| 3091 | * Timestamps are supported on this connection. Use |
| 3092 | * RTT to look for an increase in latency. |
| 3093 | */ |
| 3094 | |
| 3095 | /* |
| 3096 | * If the connection is already being throttled, leave it |
| 3097 | * in that state until rtt comes closer to base rtt |
| 3098 | */ |
| 3099 | if (tp->t_flagsext & TF_RECV_THROTTLE) |
| 3100 | return (1); |
| 3101 | |
| 3102 | base_rtt = get_base_rtt(tp); |
| 3103 | |
| 3104 | if (base_rtt != 0 && tp->t_rttcur != 0) { |
| 3105 | /* |
| 3106 | * if latency increased on a background flow, |
| 3107 | * return 1 to start throttling. |
| 3108 | */ |
| 3109 | if (tp->t_rttcur > (base_rtt + target_qdelay)) { |
| 3110 | tp->t_flagsext |= TF_RECV_THROTTLE; |
| 3111 | if (tp->t_recv_throttle_ts == 0) |
| 3112 | tp->t_recv_throttle_ts = tcp_now; |
| 3113 | /* |
| 3114 | * Reduce the recv socket buffer size to |
				 * minimize latency.
| 3116 | */ |
| 3117 | if (sbrcv->sb_idealsize > |
| 3118 | tcp_recv_throttle_minwin) { |
| 3119 | newsize = sbrcv->sb_idealsize >> 1; |
| 3120 | /* Set a minimum of 16 K */ |
| 3121 | newsize = |
| 3122 | max(newsize, |
| 3123 | tcp_recv_throttle_minwin); |
| 3124 | sbrcv->sb_idealsize = newsize; |
| 3125 | } |
| 3126 | return (1); |
| 3127 | } else { |
| 3128 | return (0); |
| 3129 | } |
| 3130 | } |
| 3131 | } |
| 3132 | |
| 3133 | /* |
| 3134 | * Timestamps are not supported or there is no good RTT |
| 3135 | * measurement. Use IPDV in this case. |
| 3136 | */ |
| 3137 | if (tp->acc_iaj > tcp_acc_iaj_react_limit) |
| 3138 | return (1); |
| 3139 | |
| 3140 | return (0); |
| 3141 | } |
| 3142 | |