/*
 * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
 * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.10 2001/07/07 04:30:38 silby Exp $
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#define _IP_VHL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>

#include <net/route.h>
#include <net/ntstat.h>
#include <net/if_var.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/dlil.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/in_tclass.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <mach/sdt.h>
#if INET6
#include <netinet6/in6_pcb.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_cache.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_cc.h>
#if TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <sys/kdebug.h>

#if IPSEC
#include <netinet6/ipsec.h>
#endif /* IPSEC */

#if CONFIG_MACF_NET
#include <security/mac_framework.h>
#endif /* CONFIG_MACF_NET */

#include <netinet/lro_ext.h>
#if MPTCP
#include <netinet/mptcp_var.h>
#include <netinet/mptcp.h>
#include <netinet/mptcp_opt.h>
#endif

#include <corecrypto/ccaes.h>

#define DBG_LAYER_BEG		NETDBG_CODE(DBG_NETTCP, 1)
#define DBG_LAYER_END		NETDBG_CODE(DBG_NETTCP, 3)
#define DBG_FNC_TCP_OUTPUT	NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1)

SYSCTL_SKMEM_TCP_INT(OID_AUTO, path_mtu_discovery,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, path_mtu_discovery, 1,
    "Enable Path MTU Discovery");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, slowstart_flightsize,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, ss_fltsz, 1,
    "Slow start flight size");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, local_slowstart_flightsize,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, ss_fltsz_local, 8,
    "Slow start flight size for local networks");

int tcp_do_tso = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_do_tso, 0, "Enable TCP Segmentation Offload");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, ecn_setup_percentage,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_ecn_setup_percentage, 100,
    "Max ECN setup percentage");

static int
sysctl_change_ecn_setting SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
    int i, err = 0, changed = 0;
    struct ifnet *ifp;

    err = sysctl_io_number(req, tcp_ecn_outbound, sizeof(int32_t),
        &i, &changed);
    if (err != 0 || req->newptr == USER_ADDR_NULL)
        return (err);

    if (changed) {
        if ((tcp_ecn_outbound == 0 || tcp_ecn_outbound == 1) &&
            (i == 0 || i == 1)) {
            tcp_ecn_outbound = i;
            SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_initiate_out,
                tcp_ecn_outbound);
            return (err);
        }
        if (tcp_ecn_outbound == 2 && (i == 0 || i == 1)) {
            /*
             * Reset ECN enable flags on non-cellular
             * interfaces so that the system default will
             * take over.
             */
            ifnet_head_lock_shared();
            TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
                if (!IFNET_IS_CELLULAR(ifp)) {
                    ifnet_lock_exclusive(ifp);
                    ifp->if_eflags &= ~IFEF_ECN_DISABLE;
                    ifp->if_eflags &= ~IFEF_ECN_ENABLE;
                    ifnet_lock_done(ifp);
                }
            }
            ifnet_head_done();
        } else {
            /*
             * Set ECN enable flags on non-cellular
             * interfaces.
             */
            ifnet_head_lock_shared();
            TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
                if (!IFNET_IS_CELLULAR(ifp)) {
                    ifnet_lock_exclusive(ifp);
                    ifp->if_eflags |= IFEF_ECN_ENABLE;
                    ifp->if_eflags &= ~IFEF_ECN_DISABLE;
                    ifnet_lock_done(ifp);
                }
            }
            ifnet_head_done();
        }
        tcp_ecn_outbound = i;
        SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_initiate_out,
            tcp_ecn_outbound);
    }
    /* Change the inbound setting as well, since the work is done */
    if (i == 2 || tcp_ecn_inbound == 2) {
        tcp_ecn_inbound = i;
        SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_negotiate_in,
            tcp_ecn_inbound);
    }
    return (err);
}
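
/*
 * A sketch of the handler semantics above (hypothetical sysctl(8)
 * invocations, for illustration only):
 *
 *   sysctl net.inet.tcp.ecn_initiate_out=1   # initiate ECN on outbound
 *   sysctl net.inet.tcp.ecn_initiate_out=0   # do not initiate ECN
 *   sysctl net.inet.tcp.ecn_initiate_out=2   # also marks non-cellular
 *                                            # interfaces IFEF_ECN_ENABLE
 *
 * Moving from 2 back to 0 or 1 clears the per-interface override flags
 * so the system default governs again, and the inbound setting is kept
 * in sync whenever either side is at the default value of 2.
 */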

int tcp_ecn_outbound = 2;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn_initiate_out,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_outbound, 0,
    sysctl_change_ecn_setting, "IU",
    "Initiate ECN for outbound connections");

int tcp_ecn_inbound = 2;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn_negotiate_in,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_inbound, 0,
    sysctl_change_ecn_setting, "IU",
    "Negotiate ECN for inbound connections");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, packetchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_packet_chaining, 50,
    "Enable TCP output packet chaining");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, socket_unlocked_on_output,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_output_unlocked, 1,
    "Unlock TCP when sending packets down to IP");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, rfc3390,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_do_rfc3390, 1,
    "Calculate initial slowstart cwnd depending on MSS");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, min_iaj_win,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_min_iaj_win, MIN_IAJ_WIN,
    "Minimum recv win based on inter-packet arrival jitter");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, acc_iaj_react_limit,
    CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_acc_iaj_react_limit,
    ACC_IAJ_REACT_LIMIT, "Accumulated IAJ when receiver starts to react");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, doautosndbuf,
    CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_do_autosendbuf, 1,
    "Enable send socket buffer auto-tuning");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, autosndbufinc,
    CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_autosndbuf_inc,
    8 * 1024, "Increment in send socket buffer size");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, autosndbufmax,
    CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_autosndbuf_max, 512 * 1024,
    "Maximum send socket buffer size");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, ack_prioritize,
    CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_prioritize_acks, 1,
    "Prioritize pure acks");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, rtt_recvbg,
    CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_use_rtt_recvbg, 1,
    "Use RTT for bg recv algorithm");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, recv_throttle_minwin,
    CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_recv_throttle_minwin, 16 * 1024,
    "Minimum recv win for throttling");

SYSCTL_SKMEM_TCP_INT(OID_AUTO, enable_tlp,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    int32_t, tcp_enable_tlp, 1, "Enable Tail loss probe");

static int32_t packchain_newlist = 0;
static int32_t packchain_looped = 0;
static int32_t packchain_sent = 0;

/* temporary: for testing */
#if IPSEC
extern int ipsec_bypass;
#endif

extern int slowlink_wsize;	/* window correction for slow links */
#if IPFIREWALL
extern int fw_enable;		/* firewall check for packet chaining */
extern int fw_bypass;		/* firewall check: disable packet chaining if there are rules */
#endif /* IPFIREWALL */

extern u_int32_t dlil_filter_disable_tso_count;
extern u_int32_t kipf_count;

static int tcp_ip_output(struct socket *, struct tcpcb *, struct mbuf *,
    int, struct mbuf *, int, int, boolean_t);
static struct mbuf *tcp_send_lroacks(struct tcpcb *tp, struct mbuf *m,
    struct tcphdr *th);
static int tcp_recv_throttle(struct tcpcb *tp);

static int32_t
tcp_tfo_check(struct tcpcb *tp, int32_t len)
{
    struct socket *so = tp->t_inpcb->inp_socket;
    unsigned int optlen = 0;
    unsigned int cookie_len;

    if (tp->t_flags & TF_NOOPT)
        goto fallback;

    if ((so->so_flags1 & SOF1_DATA_AUTHENTICATED) &&
        !(tp->t_flagsext & TF_FASTOPEN_HEUR))
        return (len);

    if (!tcp_heuristic_do_tfo(tp)) {
        tp->t_tfo_stats |= TFO_S_HEURISTICS_DISABLE;
        tcpstat.tcps_tfo_heuristics_disable++;
        goto fallback;
    }

    if (so->so_flags1 & SOF1_DATA_AUTHENTICATED)
        return (len);

    optlen += TCPOLEN_MAXSEG;

    if (tp->t_flags & TF_REQ_SCALE)
        optlen += 4;

#if MPTCP
    if ((so->so_flags & SOF_MP_SUBFLOW) && mptcp_enable &&
        tp->t_rxtshift <= mptcp_mpcap_retries)
        optlen += sizeof(struct mptcp_mpcapable_opt_common) +
            sizeof(mptcp_key_t);
#endif /* MPTCP */

    if (tp->t_flags & TF_REQ_TSTMP)
        optlen += TCPOLEN_TSTAMP_APPA;

    if (SACK_ENABLED(tp))
        optlen += TCPOLEN_SACK_PERMITTED;

    /* Now, decide whether to use TFO or not */

    /* Don't even bother trying if there is no space at all... */
    if (MAX_TCPOPTLEN - optlen < TCPOLEN_FASTOPEN_REQ)
        goto fallback;

    cookie_len = tcp_cache_get_cookie_len(tp);
    if (cookie_len == 0)
        /* No cookie, so we request one */
        return (0);

    /* There is not enough space for the cookie, so we cannot do TFO */
    if (MAX_TCPOPTLEN - optlen < cookie_len)
        goto fallback;

    /* Do not send SYN+data if there is more in the queue than MSS */
    if (so->so_snd.sb_cc > (tp->t_maxopd - MAX_TCPOPTLEN))
        goto fallback;

    /* Ok, everything looks good. We can go on and do TFO */
    return (len);

fallback:
    tp->t_flagsext &= ~TF_FASTOPEN;
    return (0);
}
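
/*
 * In short, tcp_tfo_check() returns the number of bytes that may ride on
 * the SYN: 'len' when a valid cookie fits next to the other options, 0
 * when we only want to request a cookie with a bare SYN, and 0 with
 * TF_FASTOPEN cleared when TFO must be abandoned for this connection.
 */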

/* Returns the number of bytes written to the TCP option-space */
static unsigned
tcp_tfo_write_cookie_rep(struct tcpcb *tp, unsigned optlen, u_char *opt)
{
    u_char out[CCAES_BLOCK_SIZE];
    unsigned ret = 0;
    u_char *bp;

    if ((MAX_TCPOPTLEN - optlen) <
        (TCPOLEN_FASTOPEN_REQ + TFO_COOKIE_LEN_DEFAULT))
        return (ret);

    tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out));

    bp = opt + optlen;

    *bp++ = TCPOPT_FASTOPEN;
    *bp++ = 2 + TFO_COOKIE_LEN_DEFAULT;
    memcpy(bp, out, TFO_COOKIE_LEN_DEFAULT);
    ret += 2 + TFO_COOKIE_LEN_DEFAULT;

    tp->t_tfo_stats |= TFO_S_COOKIE_SENT;
    tcpstat.tcps_tfo_cookie_sent++;

    return (ret);
}

static unsigned
tcp_tfo_write_cookie(struct tcpcb *tp, unsigned optlen, int32_t len,
    u_char *opt)
{
    u_int8_t tfo_len = MAX_TCPOPTLEN - optlen - TCPOLEN_FASTOPEN_REQ;
    struct socket *so = tp->t_inpcb->inp_socket;
    unsigned ret = 0;
    int res;
    u_char *bp;

    if (so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
        /* If there is some data, let's track it */
        if (len > 0) {
            tp->t_tfo_stats |= TFO_S_SYN_DATA_SENT;
            tcpstat.tcps_tfo_syn_data_sent++;
        }

        return (0);
    }

    bp = opt + optlen;

    /*
     * The cookie will be copied in the appropriate place within the
     * TCP-option space. That way we avoid the need for an intermediate
     * variable.
     */
    res = tcp_cache_get_cookie(tp, bp + TCPOLEN_FASTOPEN_REQ, &tfo_len);
    if (res == 0) {
        *bp++ = TCPOPT_FASTOPEN;
        *bp++ = TCPOLEN_FASTOPEN_REQ;
        ret += TCPOLEN_FASTOPEN_REQ;

        tp->t_tfo_flags |= TFO_F_COOKIE_REQ;

        tp->t_tfo_stats |= TFO_S_COOKIE_REQ;
        tcpstat.tcps_tfo_cookie_req++;
    } else {
        *bp++ = TCPOPT_FASTOPEN;
        *bp++ = TCPOLEN_FASTOPEN_REQ + tfo_len;

        ret += TCPOLEN_FASTOPEN_REQ + tfo_len;

        tp->t_tfo_flags |= TFO_F_COOKIE_SENT;

        /* If there is some data, let's track it */
        if (len > 0) {
            tp->t_tfo_stats |= TFO_S_SYN_DATA_SENT;
            tcpstat.tcps_tfo_syn_data_sent++;
        }
    }

    return (ret);
}
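
/*
 * On the wire (per RFC 7413), both paths above emit the same option kind:
 * a TCPOPT_FASTOPEN option of length TCPOLEN_FASTOPEN_REQ with no payload
 * requests a cookie from the peer, while the same option carrying
 * 'tfo_len' cookie bytes presents a previously cached cookie so that the
 * SYN may carry data.
 */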

static inline bool
tcp_send_ecn_flags_on_syn(struct tcpcb *tp, struct socket *so)
{
    return (!((tp->ecn_flags & TE_SETUPSENT) ||
        (so->so_flags & SOF_MP_SUBFLOW) ||
        (tp->t_flagsext & TF_FASTOPEN)));
}
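
/*
 * That is, the ECN-setup flags go out on a SYN only when no setup has
 * been attempted yet, the socket is not an MPTCP subflow, and TFO is not
 * in use on this connection; the latter two presumably keep the handshake
 * options simple and unambiguous.
 */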

void
tcp_set_ecn(struct tcpcb *tp, struct ifnet *ifp)
{
    boolean_t inbound;

    /*
     * Socket option has precedence
     */
    if (tp->ecn_flags & TE_ECN_MODE_ENABLE) {
        tp->ecn_flags |= TE_ENABLE_ECN;
        goto check_heuristic;
    }

    if (tp->ecn_flags & TE_ECN_MODE_DISABLE) {
        tp->ecn_flags &= ~TE_ENABLE_ECN;
        return;
    }
    /*
     * Per-interface setting comes next
     */
    if (ifp != NULL) {
        if (ifp->if_eflags & IFEF_ECN_ENABLE) {
            tp->ecn_flags |= TE_ENABLE_ECN;
            goto check_heuristic;
        }

        if (ifp->if_eflags & IFEF_ECN_DISABLE) {
            tp->ecn_flags &= ~TE_ENABLE_ECN;
            return;
        }
    }
    /*
     * System-wide settings come last
     */
    inbound = (tp->t_inpcb->inp_socket->so_head != NULL);
    if ((inbound && tcp_ecn_inbound == 1) ||
        (!inbound && tcp_ecn_outbound == 1)) {
        tp->ecn_flags |= TE_ENABLE_ECN;
        goto check_heuristic;
    } else {
        tp->ecn_flags &= ~TE_ENABLE_ECN;
    }

    return;

check_heuristic:
    if (!tcp_heuristic_do_ecn(tp))
        tp->ecn_flags &= ~TE_ENABLE_ECN;

    /*
     * If the interface setting, system-level setting and heuristics
     * all allow ECN, enable it only on a randomly selected fraction
     * of connections, controlled by tcp_ecn_setup_percentage.
     */
    if ((tp->ecn_flags & (TE_ECN_MODE_ENABLE | TE_ECN_MODE_DISABLE
        | TE_ENABLE_ECN)) == TE_ENABLE_ECN) {
        /*
         * Use the random value in iss for randomizing
         * this selection
         */
        if ((tp->iss % 100) >= tcp_ecn_setup_percentage)
            tp->ecn_flags &= ~TE_ENABLE_ECN;
    }
}
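
/*
 * Worked example of the precedence above: a connection whose socket set
 * TE_ECN_MODE_DISABLE never uses ECN, even on an IFEF_ECN_ENABLE
 * interface; with no socket override, an interface flag wins over the
 * ecn_initiate_out/ecn_negotiate_in defaults; and whatever survives that
 * chain must still pass the heuristics and the iss-based percentage gate.
 */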

/*
 * Tcp output routine: figure out what should be sent and send it.
 *
 * Returns:	0			Success
 *		EADDRNOTAVAIL
 *		ENOBUFS
 *		EMSGSIZE
 *		EHOSTUNREACH
 *		ENETDOWN
 *	ip_output_list:ENOMEM
 *	ip_output_list:EADDRNOTAVAIL
 *	ip_output_list:ENETUNREACH
 *	ip_output_list:EHOSTUNREACH
 *	ip_output_list:EACCES
 *	ip_output_list:EMSGSIZE
 *	ip_output_list:ENOBUFS
 *	ip_output_list:???		[ignorable: mostly IPSEC/firewall/DLIL]
 *	ip6_output_list:EINVAL
 *	ip6_output_list:EOPNOTSUPP
 *	ip6_output_list:EHOSTUNREACH
 *	ip6_output_list:EADDRNOTAVAIL
 *	ip6_output_list:ENETUNREACH
 *	ip6_output_list:EMSGSIZE
 *	ip6_output_list:ENOBUFS
 *	ip6_output_list:???		[ignorable: mostly IPSEC/firewall/DLIL]
 */
int
tcp_output(struct tcpcb *tp)
{
    struct inpcb *inp = tp->t_inpcb;
    struct socket *so = inp->inp_socket;
    int32_t len, recwin, sendwin, off;
    int flags, error;
    struct mbuf *m;
    struct ip *ip = NULL;
    struct ipovly *ipov = NULL;
#if INET6
    struct ip6_hdr *ip6 = NULL;
#endif /* INET6 */
    struct tcphdr *th;
    u_char opt[TCP_MAXOLEN];
    unsigned ipoptlen, optlen, hdrlen;
    int idle, sendalot, lost = 0;
    int i, sack_rxmit;
    int tso = 0;
    int sack_bytes_rxmt;
    tcp_seq old_snd_nxt = 0;
    struct sackhole *p;
#if IPSEC
    unsigned ipsec_optlen = 0;
#endif /* IPSEC */
    int idle_time = 0;
    struct mbuf *packetlist = NULL;
    struct mbuf *tp_inp_options = inp->inp_depend4.inp4_options;
#if INET6
    int isipv6 = inp->inp_vflag & INP_IPV6;
#else
    int isipv6 = 0;
#endif
    short packchain_listadd = 0;
    int so_options = so->so_options;
    struct rtentry *rt;
    u_int32_t svc_flags = 0, allocated_len;
    u_int32_t lro_ackmore = (tp->t_lropktlen != 0) ? 1 : 0;
    struct mbuf *mnext = NULL;
    int sackoptlen = 0;
#if MPTCP
    boolean_t mptcp_acknow;
#endif /* MPTCP */
    boolean_t cell = FALSE;
    boolean_t wifi = FALSE;
    boolean_t wired = FALSE;
    boolean_t sack_rescue_rxt = FALSE;
    int sotc = so->so_traffic_class;

    /*
     * Determine length of data that should be transmitted,
     * and flags that will be used.
     * If there is some data or critical controls (SYN, RST)
     * to send, then transmit; otherwise, investigate further.
     */
    idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);

    /*
     * Since idle_time is a signed integer, the following integer
     * subtraction takes care of the wrap around of tcp_now.
     */
    idle_time = tcp_now - tp->t_rcvtime;
    if (idle && idle_time >= TCP_IDLETIMEOUT(tp)) {
        if (CC_ALGO(tp)->after_idle != NULL &&
            (tp->tcp_cc_index != TCP_CC_ALGO_CUBIC_INDEX ||
            idle_time >= TCP_CC_CWND_NONVALIDATED_PERIOD)) {
            CC_ALGO(tp)->after_idle(tp);
            tcp_ccdbg_trace(tp, NULL, TCP_CC_IDLE_TIMEOUT);
        }

        /*
         * Do some other tasks that need to be done after
         * idle time
         */
        if (!SLIST_EMPTY(&tp->t_rxt_segments))
            tcp_rxtseg_clean(tp);

        /* If stretch ack was auto-disabled, re-evaluate it */
        tcp_cc_after_idle_stretchack(tp);
    }
    tp->t_flags &= ~TF_LASTIDLE;
    if (idle) {
        if (tp->t_flags & TF_MORETOCOME) {
            tp->t_flags |= TF_LASTIDLE;
            idle = 0;
        }
    }
#if MPTCP
    if (tp->t_mpflags & TMPF_RESET) {
        tcp_check_timer_state(tp);
        /*
         * Once a RST has been sent for an MPTCP subflow,
         * the subflow socket stays around until deleted.
         * No packets, not even a FIN, may be sent after the RST.
         */
        return (0);
    }
#endif /* MPTCP */

again:
#if MPTCP
    mptcp_acknow = FALSE;
#endif

    KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

#if INET6
    if (isipv6) {
        KERNEL_DEBUG(DBG_LAYER_BEG,
            ((inp->inp_fport << 16) | inp->inp_lport),
            (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
            (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
            sendalot, 0, 0);
    } else
#endif
    {
        KERNEL_DEBUG(DBG_LAYER_BEG,
            ((inp->inp_fport << 16) | inp->inp_lport),
            (((inp->inp_laddr.s_addr & 0xffff) << 16) |
            (inp->inp_faddr.s_addr & 0xffff)),
            sendalot, 0, 0);
    }
    /*
     * If the route generation id changed, we need to check that our
     * local (source) IP address is still valid.  If it isn't, either
     * return an error or silently do nothing (assuming the address will
     * come back before the TCP connection times out).
     */
    rt = inp->inp_route.ro_rt;
    if (rt != NULL && ROUTE_UNUSABLE(&tp->t_inpcb->inp_route)) {
        struct ifnet *ifp;
        struct in_ifaddr *ia = NULL;
        struct in6_ifaddr *ia6 = NULL;
        int found_srcaddr = 0;

        /* disable multipages at the socket */
        somultipages(so, FALSE);

        /* Disable TSO for the socket until we know more */
        tp->t_flags &= ~TF_TSO;

        soif2kcl(so, FALSE);

        if (isipv6) {
            ia6 = ifa_foraddr6(&inp->in6p_laddr);
            if (ia6 != NULL)
                found_srcaddr = 1;
        } else {
            ia = ifa_foraddr(inp->inp_laddr.s_addr);
            if (ia != NULL)
                found_srcaddr = 1;
        }

        /* check that the source address is still valid */
        if (found_srcaddr == 0) {
            soevent(so,
                (SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR));

            if (tp->t_state >= TCPS_CLOSE_WAIT) {
                tcp_drop(tp, EADDRNOTAVAIL);
                return (EADDRNOTAVAIL);
            }

            /*
             * Set the retransmit timer if it wasn't set, and
             * reset the persist timer and shift register, as
             * the advertised peer window may not be valid
             * anymore.
             */
            if (!tp->t_timer[TCPT_REXMT]) {
                tp->t_timer[TCPT_REXMT] =
                    OFFSET_FROM_START(tp, tp->t_rxtcur);
                if (tp->t_timer[TCPT_PERSIST]) {
                    tp->t_timer[TCPT_PERSIST] = 0;
                    tp->t_persist_stop = 0;
                    TCP_RESET_REXMT_STATE(tp);
                }
            }

            if (tp->t_pktlist_head != NULL)
                m_freem_list(tp->t_pktlist_head);
            TCP_PKTLIST_CLEAR(tp);

            /* drop connection if source address isn't available */
            if (so->so_flags & SOF_NOADDRAVAIL) {
                tcp_drop(tp, EADDRNOTAVAIL);
                return (EADDRNOTAVAIL);
            } else {
                tcp_check_timer_state(tp);
                /*
                 * Silently ignore and keep the data in the
                 * socket: the address may come back.
                 */
                return (0);
            }
        }
        if (ia != NULL)
            IFA_REMREF(&ia->ia_ifa);

        if (ia6 != NULL)
            IFA_REMREF(&ia6->ia_ifa);

        /*
         * Address is still valid; check for multipages capability
         * again in case the outgoing interface has changed.
         */
        RT_LOCK(rt);
        if ((ifp = rt->rt_ifp) != NULL) {
            somultipages(so, (ifp->if_hwassist & IFNET_MULTIPAGES));
            tcp_set_tso(tp, ifp);
            soif2kcl(so, (ifp->if_eflags & IFEF_2KCL));
            tcp_set_ecn(tp, ifp);
        }
        if (rt->rt_flags & RTF_UP)
            RT_GENID_SYNC(rt);
        /*
         * See if we should do MTU discovery. Don't do it if:
         * 1) it is disabled via the sysctl
         * 2) the route isn't up
         * 3) the MTU is locked (if it is, then discovery
         *    has been disabled)
         */
        if (!path_mtu_discovery || ((rt != NULL) &&
            (!(rt->rt_flags & RTF_UP) ||
            (rt->rt_rmx.rmx_locks & RTV_MTU))))
            tp->t_flags &= ~TF_PMTUD;
        else
            tp->t_flags |= TF_PMTUD;

        RT_UNLOCK(rt);
    }

    if (rt != NULL) {
        cell = IFNET_IS_CELLULAR(rt->rt_ifp);
        wifi = (!cell && IFNET_IS_WIFI(rt->rt_ifp));
        wired = (!wifi && IFNET_IS_WIRED(rt->rt_ifp));
    }

    /*
     * If we've recently taken a timeout, snd_max will be greater than
     * snd_nxt.  There may be SACK information that allows us to avoid
     * resending already delivered data.  Adjust snd_nxt accordingly.
     */
    if (SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
        tcp_sack_adjust(tp);
    sendalot = 0;
    off = tp->snd_nxt - tp->snd_una;
    sendwin = min(tp->snd_wnd, tp->snd_cwnd);

    if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0)
        sendwin = min(sendwin, slowlink_wsize);

    flags = tcp_outflags[tp->t_state];
    /*
     * Send any SACK-generated retransmissions.  If we're explicitly
     * trying to send out new data (when sendalot is 1), bypass this
     * function.  If we retransmit in fast recovery mode, decrement
     * snd_cwnd, since we're replacing a (future) new transmission
     * with a retransmission now, and we previously incremented
     * snd_cwnd in tcp_input().
     */
    /*
     * If we are still in SACK recovery, reset the rxmit flag to zero.
     */
    sack_rxmit = 0;
    sack_bytes_rxmt = 0;
    len = 0;
    p = NULL;
    if (SACK_ENABLED(tp) && IN_FASTRECOVERY(tp) &&
        (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
        int32_t cwin;

        cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
        if (cwin < 0)
            cwin = 0;
        /* Do not retransmit SACK segments beyond snd_recover */
        if (SEQ_GT(p->end, tp->snd_recover)) {
            /*
             * (At least) part of sack hole extends beyond
             * snd_recover. Check to see if we can rexmit data
             * for this hole.
             */
            if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
                /*
                 * Can't rexmit any more data for this hole.
                 * That data will be rexmitted in the next
                 * sack recovery episode, when snd_recover
                 * moves past p->rxmit.
                 */
                p = NULL;
                goto after_sack_rexmit;
            } else
                /* Can rexmit part of the current hole */
                len = ((int32_t)min(cwin,
                    tp->snd_recover - p->rxmit));
        } else {
            len = ((int32_t)min(cwin, p->end - p->rxmit));
        }
        if (len > 0) {
            off = p->rxmit - tp->snd_una;
            sack_rxmit = 1;
            sendalot = 1;
            tcpstat.tcps_sack_rexmits++;
            tcpstat.tcps_sack_rexmit_bytes +=
                min(len, tp->t_maxseg);
        } else {
            len = 0;
        }
    }
after_sack_rexmit:
    /*
     * Get standard flags, and add SYN or FIN if requested by 'hidden'
     * state flags.
     */
    if (tp->t_flags & TF_NEEDFIN)
        flags |= TH_FIN;
    if (tp->t_flags & TF_NEEDSYN)
        flags |= TH_SYN;

    /*
     * If in persist timeout with window of 0, send 1 byte.
     * Otherwise, if window is small but nonzero
     * and timer expired, we will send what we can
     * and go to transmit state.
     */
    if (tp->t_flagsext & TF_FORCE) {
        if (sendwin == 0) {
            /*
             * If we still have some data to send, then
             * clear the FIN bit.  Usually this would
             * happen below when it realizes that we
             * aren't sending all the data.  However,
             * if we have exactly 1 byte of unsent data,
             * then it won't clear the FIN bit below,
             * and if we are in persist state, we wind
             * up sending the packet without recording
             * that we sent the FIN bit.
             *
             * We can't just blindly clear the FIN bit,
             * because if we don't have any more data
             * to send then the probe will be the FIN
             * itself.
             */
            if (off < so->so_snd.sb_cc)
                flags &= ~TH_FIN;
            sendwin = 1;
        } else {
            tp->t_timer[TCPT_PERSIST] = 0;
            tp->t_persist_stop = 0;
            TCP_RESET_REXMT_STATE(tp);
        }
    }

    /*
     * If snd_nxt == snd_max and we have transmitted a FIN, the
     * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
     * a negative length.  This can also occur when TCP opens up
     * its congestion window while receiving additional duplicate
     * acks after fast-retransmit because TCP will reset snd_nxt
     * to snd_max after the fast-retransmit.
     *
     * In the normal retransmit-FIN-only case, however, snd_nxt will
     * be set to snd_una, the offset will be 0, and the length may
     * wind up 0.
     *
     * If sack_rxmit is true we are retransmitting from the scoreboard
     * in which case len is already set.
     */
    if (sack_rxmit == 0) {
        if (sack_bytes_rxmt == 0) {
            len = min(so->so_snd.sb_cc, sendwin) - off;
        } else {
            int32_t cwin;

            cwin = tp->snd_cwnd -
                (tp->snd_nxt - tp->sack_newdata) -
                sack_bytes_rxmt;
            if (cwin < 0)
                cwin = 0;
            /*
             * We are inside of a SACK recovery episode and are
             * sending new data, having retransmitted all the
             * data possible in the scoreboard.
             */
            len = min(so->so_snd.sb_cc, tp->snd_wnd) - off;
            /*
             * Don't remove this (len > 0) check!
             * We explicitly check for len > 0 here (although it
             * isn't really necessary), to work around a gcc
             * optimization issue - to force gcc to compute
             * len above. Without this check, the computation
             * of len is bungled by the optimizer.
             */
            if (len > 0) {
                len = imin(len, cwin);
            } else {
                len = 0;
            }
            /*
             * At this point SACK recovery can not send any
             * data from scoreboard or any new data. Check
             * if we can do a rescue retransmit towards the
             * tail end of recovery window.
             */
            if (len == 0 && cwin > 0 &&
                SEQ_LT(tp->snd_fack, tp->snd_recover) &&
                !(tp->t_flagsext & TF_RESCUE_RXT)) {
                len = min((tp->snd_recover - tp->snd_fack),
                    tp->t_maxseg);
                len = imin(len, cwin);
                old_snd_nxt = tp->snd_nxt;
                sack_rescue_rxt = TRUE;
                tp->snd_nxt = tp->snd_recover - len;
                /*
                 * If FIN has been sent, snd_max
                 * must have been advanced to cover it.
                 */
                if ((tp->t_flags & TF_SENTFIN) &&
                    tp->snd_max == tp->snd_recover)
                    tp->snd_nxt--;

                off = tp->snd_nxt - tp->snd_una;
                sendalot = 0;
                tp->t_flagsext |= TF_RESCUE_RXT;
            }
        }
    }

    /*
     * Lop off SYN bit if it has already been sent.  However, if this
     * is SYN-SENT state and if segment contains data and if we don't
     * know that foreign host supports TAO, suppress sending segment.
     */
    if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
        if (tp->t_state != TCPS_SYN_RECEIVED || tfo_enabled(tp))
            flags &= ~TH_SYN;
        off--;
        len++;
        if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
            while (inp->inp_sndinprog_cnt == 0 &&
                tp->t_pktlist_head != NULL) {
                packetlist = tp->t_pktlist_head;
                packchain_listadd = tp->t_lastchain;
                packchain_sent++;
                TCP_PKTLIST_CLEAR(tp);

                error = tcp_ip_output(so, tp, packetlist,
                    packchain_listadd, tp_inp_options,
                    (so_options & SO_DONTROUTE),
                    (sack_rxmit || (sack_bytes_rxmt != 0)),
                    isipv6);
            }

            /*
             * tcp was closed while we were in ip,
             * resume close
             */
            if (inp->inp_sndinprog_cnt == 0 &&
                (tp->t_flags & TF_CLOSING)) {
                tp->t_flags &= ~TF_CLOSING;
                (void) tcp_close(tp);
            } else {
                tcp_check_timer_state(tp);
            }
            KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,
                0, 0, 0, 0, 0);
            return (0);
        }
    }

    /*
     * Be careful not to send data and/or FIN on SYN segments.
     * This measure is needed to prevent interoperability problems
     * with not fully conformant TCP implementations.
     *
     * In case of TFO, we handle the setting of the len in
     * tcp_tfo_check. In case TFO is not enabled, never ever send
     * SYN+data.
     */
    if ((flags & TH_SYN) && !tfo_enabled(tp)) {
        len = 0;
        flags &= ~TH_FIN;
    }

    if ((flags & TH_SYN) && tp->t_state <= TCPS_SYN_SENT && tfo_enabled(tp))
        len = tcp_tfo_check(tp, len);

    /*
     * The check here used to be (len < 0). Sometimes len is zero
     * when the congestion window is closed and we need to check
     * if persist timer has to be set in that case. But don't set
     * persist until connection is established.
     */
    if (len <= 0 && !(flags & TH_SYN)) {
        /*
         * If FIN has been sent but not acked,
         * but we haven't been called to retransmit,
         * len will be < 0.  Otherwise, window shrank
         * after we sent into it.  If window shrank to 0,
         * cancel pending retransmit, pull snd_nxt back
         * to (closed) window, and set the persist timer
         * if it isn't already going.  If the window didn't
         * close completely, just wait for an ACK.
         */
        len = 0;
        if (sendwin == 0) {
            tp->t_timer[TCPT_REXMT] = 0;
            tp->t_timer[TCPT_PTO] = 0;
            TCP_RESET_REXMT_STATE(tp);
            tp->snd_nxt = tp->snd_una;
            off = 0;
            if (tp->t_timer[TCPT_PERSIST] == 0)
                tcp_setpersist(tp);
        }
    }

    /*
     * Automatic sizing of send socket buffer.  Increase the send
     * socket buffer size if all of the following criteria are met:
     *	1. the receiver has enough buffer space for this data
     *	2. send buffer is filled to 7/8th with data (so we actually
     *	   have data to make use of it);
     *	3. our send window (slow start and congestion controlled) is
     *	   larger than sent but unacknowledged data in send buffer.
     */
    if (tcp_do_autosendbuf == 1 &&
        !INP_WAIT_FOR_IF_FEEDBACK(inp) && !IN_FASTRECOVERY(tp) &&
        (so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE &&
        tcp_cansbgrow(&so->so_snd)) {
        if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
            so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
            sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
            if (sbreserve(&so->so_snd,
                min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
                tcp_autosndbuf_max)) == 1) {
                so->so_snd.sb_idealsize = so->so_snd.sb_hiwat;
            }
        }
    }

    /*
     * Truncate to the maximum segment length or enable TCP Segmentation
     * Offloading (if supported by hardware) and ensure that FIN is removed
     * if the length no longer contains the last data byte.
     *
     * TSO may only be used if we are in a pure bulk sending state.
     * The presence of TCP-MD5, SACK retransmits, SACK advertisements,
     * ipfw rules and IP options, as well as disabling hardware checksum
     * offload, prevent using TSO.  With TSO the TCP header is the same
     * (except for the sequence number) for all generated packets.  This
     * makes it impossible to transmit any options which vary per generated
     * segment or packet.
     *
     * The length of TSO bursts is limited to TCP_MAXWIN.  That limit and
     * removal of FIN (if not already caught here) are handled later after
     * the exact length of the TCP options are known.
     */
#if IPSEC
    /*
     * Pre-calculate here as we save another lookup into the darknesses
     * of IPsec that way and can actually decide if TSO is ok.
     */
    if (ipsec_bypass == 0)
        ipsec_optlen = ipsec_hdrsiz_tcp(tp);
#endif
    if (len > tp->t_maxseg) {
        if ((tp->t_flags & TF_TSO) && tcp_do_tso && hwcksum_tx &&
            ip_use_randomid && kipf_count == 0 &&
            dlil_filter_disable_tso_count == 0 &&
            tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
            sack_bytes_rxmt == 0 &&
            inp->inp_options == NULL &&
            inp->in6p_options == NULL
#if IPSEC
            && ipsec_optlen == 0
#endif
#if IPFIREWALL
            && (fw_enable == 0 || fw_bypass)
#endif
            ) {
            tso = 1;
            sendalot = 0;
        } else {
            len = tp->t_maxseg;
            sendalot = 1;
            tso = 0;
        }
    }
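
    /*
     * Net effect of the block above: when every TSO precondition holds,
     * the oversized 'len' is handed to the hardware in one burst
     * (tso = 1); otherwise 'len' is clamped to one MSS and sendalot
     * makes tcp_output() loop to emit the remainder as separate
     * segments.
     */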

    /* Send one segment or less as a tail loss probe */
    if (tp->t_flagsext & TF_SENT_TLPROBE) {
        len = min(len, tp->t_maxseg);
        sendalot = 0;
        tso = 0;
    }

#if MPTCP
    if ((so->so_flags & SOF_MP_SUBFLOW) &&
        !(tp->t_mpflags & TMPF_TCP_FALLBACK)) {
        int newlen = len;
        if (tp->t_state >= TCPS_ESTABLISHED &&
            (tp->t_mpflags & TMPF_SND_MPPRIO ||
            tp->t_mpflags & TMPF_SND_REM_ADDR ||
            tp->t_mpflags & TMPF_SND_MPFAIL ||
            tp->t_mpflags & TMPF_SND_KEYS ||
            tp->t_mpflags & TMPF_SND_JACK)) {
            if (len > 0) {
                len = 0;
            }
            /*
             * On a new subflow, don't try to send again, because
             * we are still waiting for the fourth ack.
             */
            if (!(tp->t_mpflags & TMPF_PREESTABLISHED))
                sendalot = 1;
            mptcp_acknow = TRUE;
        } else {
            mptcp_acknow = FALSE;
        }
        /*
         * The contiguous bytes in the subflow socket buffer can be
         * discontiguous at the MPTCP level. Since only one DSS
         * option can be sent in one packet, reduce length to match
         * the contiguous MPTCP level. Set sendalot to send remainder.
         */
        if (len > 0)
            newlen = mptcp_adj_sendlen(so, off);
        if (newlen < len) {
            len = newlen;
            sendalot = 1;
        }
    }
#endif /* MPTCP */

    /*
     * If the socket is capable of doing unordered send,
     * pull the amount of data that can be sent from the
     * unordered priority queues to the serial queue in
     * the socket buffer. If bytes are not yet available
     * in the highest priority message, we may not be able
     * to send any new data.
     */
    if (so->so_flags & SOF_ENABLE_MSGS) {
        if ((off + len) >
            so->so_msg_state->msg_serial_bytes) {
            sbpull_unordered_data(so, off, len);

            /* check if len needs to be modified */
            if ((off + len) >
                so->so_msg_state->msg_serial_bytes) {
                len = so->so_msg_state->msg_serial_bytes - off;
                if (len <= 0) {
                    len = 0;
                    tcpstat.tcps_msg_sndwaithipri++;
                }
            }
        }
    }

    if (sack_rxmit) {
        if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
            flags &= ~TH_FIN;
    } else {
        if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
            flags &= ~TH_FIN;
    }
    /*
     * Compare available window to amount of window
     * known to peer (as advertised window less
     * next expected input).  If the difference is at least two
     * max size segments, or at least 25% of the maximum possible
     * window, then want to send a window update to peer.
     * Skip this if the connection is in T/TCP half-open state.
     */
    recwin = tcp_sbspace(tp);
#if MPTCP
    if (so->so_flags & SOF_MP_SUBFLOW) {
        struct mptcb *mp_tp = tptomptp(tp);

        if (mp_tp != NULL) {
            mpte_lock_assert_held(mp_tp->mpt_mpte);
            recwin = imin(recwin, mptcp_sbspace(mp_tp));
        }
    }
#endif

    if (recwin < (int32_t)(so->so_rcv.sb_hiwat / 4) &&
        recwin < (int)tp->t_maxseg)
        recwin = 0;

#if TRAFFIC_MGT
    if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(so)) {
        if (recwin > 0 && tcp_recv_throttle(tp)) {
            uint32_t min_iaj_win = tcp_min_iaj_win * tp->t_maxseg;
            uint32_t bg_rwintop = tp->rcv_adv;
            if (SEQ_LT(bg_rwintop, tp->rcv_nxt + min_iaj_win))
                bg_rwintop = tp->rcv_nxt + min_iaj_win;
            recwin = imin((int32_t)(bg_rwintop - tp->rcv_nxt),
                recwin);
            if (recwin < 0)
                recwin = 0;
        }
    }
#endif /* TRAFFIC_MGT */

    if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale))
        recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale);

    /*
     * MPTCP needs to be able to announce a smaller window than previously,
     * because the other subflow may have filled up the available window
     * space. So we have to be able to go backwards and announce a smaller
     * window.
     */
    if (!(so->so_flags & SOF_MP_SUBFLOW) &&
        recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt))
        recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt);

    /*
     * Sender silly window avoidance.  We transmit under the following
     * conditions when len is non-zero:
     *
     *	- we've timed out (e.g. persist timer)
     *	- we need to retransmit
     *	- we have a full segment (or more with TSO)
     *	- this is the last buffer in a write()/send() and we are
     *	  either idle or running NODELAY
     *	- we have more than 1/2 the maximum send window's worth of
     *	  data (receiver may be limiting the window size)
     */
    if (len) {
        if (tp->t_flagsext & TF_FORCE)
            goto send;
        if (SEQ_LT(tp->snd_nxt, tp->snd_max))
            goto send;
        if (sack_rxmit)
            goto send;

        /*
         * Send new data on the connection only if it is
         * not flow controlled
         */
        if (!INP_WAIT_FOR_IF_FEEDBACK(inp) ||
            tp->t_state != TCPS_ESTABLISHED) {
            if (len >= tp->t_maxseg)
                goto send;

            if (!(tp->t_flags & TF_MORETOCOME) &&
                (idle || tp->t_flags & TF_NODELAY ||
                (tp->t_flags & TF_MAXSEGSNT) ||
                ALLOW_LIMITED_TRANSMIT(tp)) &&
                (tp->t_flags & TF_NOPUSH) == 0 &&
                (len + off >= so->so_snd.sb_cc ||
                /*
                 * MPTCP needs to respect the DSS-mappings. So, it
                 * may be sending data that *could* have been
                 * coalesced, but cannot because of
                 * mptcp_adj_sendlen().
                 */
                so->so_flags & SOF_MP_SUBFLOW))
                goto send;
            if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
                goto send;
        } else {
            tcpstat.tcps_fcholdpacket++;
        }
    }
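
    /*
     * Worked example of the rules above: a 1-byte write on an idle
     * connection (or one with TF_NODELAY set) passes the "last buffer
     * in a write()" clause because len + off reaches so_snd.sb_cc, so
     * it goes out immediately; the same byte written while earlier
     * data is still unacknowledged on a Nagle-enabled connection is
     * typically held until an ACK arrives (unless another clause, such
     * as TF_MAXSEGSNT or limited transmit, applies).
     */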

    if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
        /*
         * "adv" is the amount we can increase the window,
         * taking into account that we are limited by
         * TCP_MAXWIN << tp->rcv_scale.
         */
        int32_t adv, oldwin = 0;
        adv = imin(recwin, (int)TCP_MAXWIN << tp->rcv_scale) -
            (tp->rcv_adv - tp->rcv_nxt);

        if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
            oldwin = tp->rcv_adv - tp->rcv_nxt;

        if (adv >= (int32_t) (2 * tp->t_maxseg)) {
            /*
             * Update only if the resulting scaled value of
             * the window changed, or if there is a change in
             * the sequence since the last ack. This avoids
             * what appears as dupe ACKS (see rdar://5640997)
             *
             * If streaming is detected avoid sending too many
             * window updates. We will depend on the delack
             * timer to send a window update when needed.
             */
            if (!(tp->t_flags & TF_STRETCHACK) &&
                (tp->last_ack_sent != tp->rcv_nxt ||
                ((oldwin + adv) >> tp->rcv_scale) >
                (oldwin >> tp->rcv_scale))) {
                goto send;
            }
        }
        if (4 * adv >= (int32_t) so->so_rcv.sb_hiwat)
            goto send;

        /*
         * Make sure that the delayed ack timer is set if
         * we delayed sending a window update because of
         * streaming detection.
         */
        if ((tp->t_flags & TF_STRETCHACK) &&
            !(tp->t_flags & TF_DELACK)) {
            tp->t_flags |= TF_DELACK;
            tp->t_timer[TCPT_DELACK] =
                OFFSET_FROM_START(tp, tcp_delack);
        }
    }

    /*
     * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
     * is also a catch-all for the retransmit timer timeout case.
     */
    if (tp->t_flags & TF_ACKNOW)
        goto send;
    if ((flags & TH_RST) ||
        ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
        goto send;
    if (SEQ_GT(tp->snd_up, tp->snd_una))
        goto send;
#if MPTCP
    if (mptcp_acknow)
        goto send;
#endif /* MPTCP */
    /*
     * If our state indicates that FIN should be sent
     * and we have not yet done so, then we need to send.
     */
    if ((flags & TH_FIN) &&
        (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una))
        goto send;
    /*
     * In SACK, it is possible for tcp_output to fail to send a segment
     * after the retransmission timer has been turned off.  Make sure
     * that the retransmission timer is set.
     */
    if (SACK_ENABLED(tp) && (tp->t_state >= TCPS_ESTABLISHED) &&
        SEQ_GT(tp->snd_max, tp->snd_una) &&
        tp->t_timer[TCPT_REXMT] == 0 &&
        tp->t_timer[TCPT_PERSIST] == 0) {
        tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp,
            tp->t_rxtcur);
        goto just_return;
    }
    /*
     * TCP window updates are not reliable, rather a polling protocol
     * using ``persist'' packets is used to ensure receipt of window
     * updates.  The three ``states'' for the output side are:
     *	idle			not doing retransmits or persists
     *	persisting		to move a small or zero window
     *	(re)transmitting	and thereby not persisting
     *
     * tp->t_timer[TCPT_PERSIST]
     *	is set when we are in persist state.
     * tp->t_force
     *	is set when we are called to send a persist packet.
     * tp->t_timer[TCPT_REXMT]
     *	is set when we are retransmitting
     * The output side is idle when both timers are zero.
     *
     * If send window is too small, there is data to transmit, and no
     * retransmit or persist is pending, then go to persist state.
     * If nothing happens soon, send when timer expires:
     * if window is nonzero, transmit what we can,
     * otherwise force out a byte.
     */
    if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
        tp->t_timer[TCPT_PERSIST] == 0) {
        TCP_RESET_REXMT_STATE(tp);
        tcp_setpersist(tp);
    }
just_return:
    /*
     * If there is no reason to send a segment, just return.
     * But if there are some packets left in the packet list,
     * send them now.
     */
    while (inp->inp_sndinprog_cnt == 0 &&
        tp->t_pktlist_head != NULL) {
        packetlist = tp->t_pktlist_head;
        packchain_listadd = tp->t_lastchain;
        packchain_sent++;
        TCP_PKTLIST_CLEAR(tp);

        error = tcp_ip_output(so, tp, packetlist,
            packchain_listadd,
            tp_inp_options, (so_options & SO_DONTROUTE),
            (sack_rxmit || (sack_bytes_rxmt != 0)), isipv6);
    }
    /* tcp was closed while we were in ip; resume close */
    if (inp->inp_sndinprog_cnt == 0 &&
        (tp->t_flags & TF_CLOSING)) {
        tp->t_flags &= ~TF_CLOSING;
        (void) tcp_close(tp);
    } else {
        tcp_check_timer_state(tp);
    }
    KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
    return (0);

send:
    /*
     * Set TF_MAXSEGSNT flag if the segment size is greater than
     * the max segment size.
     */
    if (len > 0) {
        if (len >= tp->t_maxseg)
            tp->t_flags |= TF_MAXSEGSNT;
        else
            tp->t_flags &= ~TF_MAXSEGSNT;
    }
    /*
     * Before ESTABLISHED, force sending of initial options
     * unless TCP set not to do any options.
     * NOTE: we assume that the IP/TCP header plus TCP options
     * always fit in a single mbuf, leaving room for a maximum
     * link header, i.e.
     *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
     */
    optlen = 0;
#if INET6
    if (isipv6)
        hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
    else
#endif
        hdrlen = sizeof (struct tcpiphdr);
    if (flags & TH_SYN) {
        tp->snd_nxt = tp->iss;
        if ((tp->t_flags & TF_NOOPT) == 0) {
            u_short mss;

            opt[0] = TCPOPT_MAXSEG;
            opt[1] = TCPOLEN_MAXSEG;
            mss = htons((u_short) tcp_mssopt(tp));
            (void)memcpy(opt + 2, &mss, sizeof(mss));
            optlen = TCPOLEN_MAXSEG;

            if ((tp->t_flags & TF_REQ_SCALE) &&
                ((flags & TH_ACK) == 0 ||
                (tp->t_flags & TF_RCVD_SCALE))) {
                *((u_int32_t *)(void *)(opt + optlen)) = htonl(
                    TCPOPT_NOP << 24 |
                    TCPOPT_WINDOW << 16 |
                    TCPOLEN_WINDOW << 8 |
                    tp->request_r_scale);
                optlen += 4;
            }
#if MPTCP
            if (mptcp_enable && (so->so_flags & SOF_MP_SUBFLOW)) {
                optlen = mptcp_setup_syn_opts(so, opt, optlen);
            }
#endif /* MPTCP */
        }
    }
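
    /*
     * At this point a SYN carries, in order: an MSS option (4 bytes),
     * optionally a NOP-padded window-scale option (4 bytes), and, for
     * MPTCP subflows, the MP_CAPABLE/MP_JOIN material (as applicable)
     * added by mptcp_setup_syn_opts(); 'optlen' tracks the bytes
     * consumed out of the opt[] array (bounded by TCP_MAXOLEN).
     */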
| 1508 | |
| 1509 | /* |
| 1510 | * Send a timestamp and echo-reply if this is a SYN and our side |
| 1511 | * wants to use timestamps (TF_REQ_TSTMP is set) or both our side |
| 1512 | * and our peer have sent timestamps in our SYN's. |
| 1513 | */ |
| 1514 | if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && |
| 1515 | (flags & TH_RST) == 0 && |
| 1516 | ((flags & TH_ACK) == 0 || |
| 1517 | (tp->t_flags & TF_RCVD_TSTMP))) { |
| 1518 | u_int32_t *lp = (u_int32_t *)(void *)(opt + optlen); |
| 1519 | |
| 1520 | /* Form timestamp option as shown in appendix A of RFC 1323. */ |
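		/*
		 * TCPOPT_TSTAMP_HDR packs NOP, NOP, kind and length into
		 * one aligned word; the two 32-bit values that follow
		 * carry our clock (tcp_now) and the peer's echoed
		 * timestamp, 12 bytes (TCPOLEN_TSTAMP_APPA) in all.
		 */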
| 1521 | *lp++ = htonl(TCPOPT_TSTAMP_HDR); |
| 1522 | *lp++ = htonl(tcp_now); |
| 1523 | *lp = htonl(tp->ts_recent); |
| 1524 | optlen += TCPOLEN_TSTAMP_APPA; |
| 1525 | } |
| 1526 | |
| 1527 | /* Note the timestamp for receive buffer autosizing */ |
| 1528 | if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) |
| 1529 | tp->rfbuf_ts = tcp_now; |
| 1530 | |
| 1531 | if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == 0)) { |
| 1532 | /* |
| 1533 | * Tack on the SACK permitted option *last*. |
| 1534 | * And do padding of options after tacking this on. |
| 1535 | * This is because of MSS, TS, WinScale and Signatures are |
| 1536 | * all present, we have just 2 bytes left for the SACK |
| 1537 | * permitted option, which is just enough. |
| 1538 | */ |
| 1539 | /* |
| 1540 | * If this is the first SYN of connection (not a SYN |
| 1541 | * ACK), include SACK permitted option. If this is a |
| 1542 | * SYN ACK, include SACK permitted option if peer has |
| 1543 | * already done so. This is only for active connect, |
| 1544 | * since the syncache takes care of the passive connect. |
| 1545 | */ |
| 1546 | if ((flags & TH_SYN) && |
| 1547 | (!(flags & TH_ACK) || (tp->t_flags & TF_SACK_PERMIT))) { |
| 1548 | u_char *bp; |
| 1549 | bp = (u_char *)opt + optlen; |
| 1550 | |
| 1551 | *bp++ = TCPOPT_SACK_PERMITTED; |
| 1552 | *bp++ = TCPOLEN_SACK_PERMITTED; |
| 1553 | optlen += TCPOLEN_SACK_PERMITTED; |
| 1554 | } |
| 1555 | } |
| 1556 | #if MPTCP |
| 1557 | if (so->so_flags & SOF_MP_SUBFLOW) { |
| 1558 | /* |
| 1559 | * Its important to piggyback acks with data as ack only packets |
| 1560 | * may get lost and data packets that don't send Data ACKs |
| 1561 | * still advance the subflow level ACK and therefore make it |
| 1562 | * hard for the remote end to recover in low cwnd situations. |
| 1563 | */ |
| 1564 | if (len != 0) { |
| 1565 | tp->t_mpflags |= (TMPF_SEND_DSN | |
| 1566 | TMPF_MPTCP_ACKNOW); |
| 1567 | } else { |
| 1568 | tp->t_mpflags |= TMPF_MPTCP_ACKNOW; |
| 1569 | } |
| 1570 | optlen = mptcp_setup_opts(tp, off, &opt[0], optlen, flags, |
| 1571 | len, &mptcp_acknow); |
| 1572 | tp->t_mpflags &= ~TMPF_SEND_DSN; |
| 1573 | } |
| 1574 | #endif /* MPTCP */ |
| 1575 | |
| 1576 | if (tfo_enabled(tp) && !(tp->t_flags & TF_NOOPT) && |
| 1577 | (flags & (TH_SYN | TH_ACK)) == TH_SYN) |
| 1578 | optlen += tcp_tfo_write_cookie(tp, optlen, len, opt); |
| 1579 | |
| 1580 | if (tfo_enabled(tp) && |
| 1581 | (flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK) && |
| 1582 | (tp->t_tfo_flags & TFO_F_OFFER_COOKIE)) |
| 1583 | optlen += tcp_tfo_write_cookie_rep(tp, optlen, opt); |
| 1584 | |
| 1585 | if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == 0)) { |
| 1586 | /* |
| 1587 | * Send SACKs if necessary. This should be the last |
| 1588 | * option processed. Only as many SACKs are sent as |
| 1589 | * are permitted by the maximum options size. |
| 1590 | * |
| 1591 | * In general, SACK blocks consume 8*n+2 bytes. |
| 1592 | * So a full size SACK blocks option is 34 bytes |
| 1593 | * (to generate 4 SACK blocks). At a minimum, |
| 1594 | * we need 10 bytes (to generate 1 SACK block). |
| 1595 | * If TCP Timestamps (12 bytes) and TCP Signatures |
| 1596 | * (18 bytes) are both present, we'll just have |
| 1597 | * 10 bytes for SACK options 40 - (12 + 18). |
| 1598 | */ |
| 1599 | if (TCPS_HAVEESTABLISHED(tp->t_state) && |
| 1600 | (tp->t_flags & TF_SACK_PERMIT) && |
| 1601 | (tp->rcv_numsacks > 0 || TCP_SEND_DSACK_OPT(tp)) && |
| 1602 | MAX_TCPOPTLEN - optlen - 2 >= TCPOLEN_SACK) { |
| 1603 | int nsack, padlen; |
| 1604 | u_char *bp = (u_char *)opt + optlen; |
| 1605 | u_int32_t *lp; |
| 1606 | |
| 1607 | nsack = (MAX_TCPOPTLEN - optlen - 2) / TCPOLEN_SACK; |
| 1608 | nsack = min(nsack, (tp->rcv_numsacks + |
| 1609 | (TCP_SEND_DSACK_OPT(tp) ? 1 : 0))); |
| 1610 | sackoptlen = (2 + nsack * TCPOLEN_SACK); |
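			/*
			 * e.g. with only timestamps present (optlen == 12),
			 * nsack is capped at (40 - 12 - 2) / 8 == 3 blocks
			 * and sackoptlen would be 2 + 3 * 8 == 26 bytes.
			 */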
| 1611 | |
| 1612 | /* |
| 1613 | * First we need to pad options so that the |
| 1614 | * SACK blocks can start at a 4-byte boundary |
| 1615 | * (sack option and length are at a 2 byte offset). |
| 1616 | */ |
| 1617 | padlen = (MAX_TCPOPTLEN - optlen - sackoptlen) % 4; |
| 1618 | optlen += padlen; |
| 1619 | while (padlen-- > 0) |
| 1620 | *bp++ = TCPOPT_NOP; |
| 1621 | |
| 1622 | tcpstat.tcps_sack_send_blocks++; |
| 1623 | *bp++ = TCPOPT_SACK; |
| 1624 | *bp++ = sackoptlen; |
| 1625 | lp = (u_int32_t *)(void *)bp; |
| 1626 | |
| 1627 | /* |
| 1628 | * First block of SACK option should represent |
| 1629 | * DSACK. Prefer to send SACK information if there |
| 1630 | * is space for only one SACK block. This will |
| 1631 | * allow for faster recovery. |
| 1632 | */ |
| 1633 | if (TCP_SEND_DSACK_OPT(tp) && nsack > 0 && |
| 1634 | (tp->rcv_numsacks == 0 || nsack > 1)) { |
| 1635 | *lp++ = htonl(tp->t_dsack_lseq); |
| 1636 | *lp++ = htonl(tp->t_dsack_rseq); |
| 1637 | tcpstat.tcps_dsack_sent++; |
| 1638 | tp->t_dsack_sent++; |
| 1639 | nsack--; |
| 1640 | } |
| 1641 | VERIFY(nsack == 0 || tp->rcv_numsacks >= nsack); |
| 1642 | for (i = 0; i < nsack; i++) { |
| 1643 | struct sackblk sack = tp->sackblks[i]; |
| 1644 | *lp++ = htonl(sack.start); |
| 1645 | *lp++ = htonl(sack.end); |
| 1646 | } |
| 1647 | optlen += sackoptlen; |
| 1648 | } |
| 1649 | } |
| 1650 | |
	/* Pad TCP options to a 4-byte boundary */
| 1652 | if (optlen < MAX_TCPOPTLEN && (optlen % sizeof(u_int32_t))) { |
| 1653 | int pad = sizeof(u_int32_t) - (optlen % sizeof(u_int32_t)); |
| 1654 | u_char *bp = (u_char *)opt + optlen; |
| 1655 | |
| 1656 | optlen += pad; |
| 1657 | while (pad) { |
| 1658 | *bp++ = TCPOPT_EOL; |
| 1659 | pad--; |
| 1660 | } |
| 1661 | } |
| 1662 | |
| 1663 | /* |
| 1664 | * RFC 3168 states that: |
| 1665 | * - If you ever sent an ECN-setup SYN/SYN-ACK you must be prepared |
| 1666 | * to handle the TCP ECE flag, even if you also later send a |
| 1667 | * non-ECN-setup SYN/SYN-ACK. |
| 1668 | * - If you ever send a non-ECN-setup SYN/SYN-ACK, you must not set |
| 1669 | * the ip ECT flag. |
| 1670 | * |
	 * It is not clear how the ECE flag would ever be set if you never
	 * set the IP ECT flag on outbound packets. All the same, we use
	 * TE_SETUPSENT to indicate that we have committed to handling
	 * the TCP ECE flag correctly, and TE_SENDIPECT to indicate
	 * whether or not we should set the IP ECT flag on outbound
	 * packets.
	 *
	 * For a SYN-ACK, send an ECN setup SYN-ACK.
| 1678 | */ |
| 1679 | if ((flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK) && |
| 1680 | (tp->ecn_flags & TE_ENABLE_ECN)) { |
| 1681 | if (tp->ecn_flags & TE_SETUPRECEIVED) { |
| 1682 | if (tcp_send_ecn_flags_on_syn(tp, so)) { |
| 1683 | /* |
| 1684 | * Setting TH_ECE makes this an ECN-setup |
| 1685 | * SYN-ACK |
| 1686 | */ |
| 1687 | flags |= TH_ECE; |
| 1688 | |
| 1689 | /* |
| 1690 | * Record that we sent the ECN-setup and |
| 1691 | * default to setting IP ECT. |
| 1692 | */ |
| 1693 | tp->ecn_flags |= (TE_SETUPSENT|TE_SENDIPECT); |
| 1694 | tcpstat.tcps_ecn_server_setup++; |
| 1695 | tcpstat.tcps_ecn_server_success++; |
| 1696 | } else { |
| 1697 | /* |
| 1698 | * We sent an ECN-setup SYN-ACK but it was |
| 1699 | * dropped. Fallback to non-ECN-setup |
| 1700 | * SYN-ACK and clear flag to indicate that |
| 1701 | * we should not send data with IP ECT set |
| 1702 | * |
| 1703 | * Pretend we didn't receive an |
| 1704 | * ECN-setup SYN. |
| 1705 | * |
| 1706 | * We already incremented the counter |
| 1707 | * assuming that the ECN setup will |
| 1708 | * succeed. Decrementing here |
| 1709 | * tcps_ecn_server_success to correct it. |
| 1710 | */ |
| 1711 | if (tp->ecn_flags & TE_SETUPSENT) { |
| 1712 | tcpstat.tcps_ecn_lost_synack++; |
| 1713 | tcpstat.tcps_ecn_server_success--; |
| 1714 | tp->ecn_flags |= TE_LOST_SYNACK; |
| 1715 | } |
| 1716 | |
| 1717 | tp->ecn_flags &= |
| 1718 | ~(TE_SETUPRECEIVED | TE_SENDIPECT | |
| 1719 | TE_SENDCWR); |
| 1720 | } |
| 1721 | } |
| 1722 | } else if ((flags & (TH_SYN | TH_ACK)) == TH_SYN && |
| 1723 | (tp->ecn_flags & TE_ENABLE_ECN)) { |
| 1724 | if (tcp_send_ecn_flags_on_syn(tp, so)) { |
| 1725 | /* |
| 1726 | * Setting TH_ECE and TH_CWR makes this an |
| 1727 | * ECN-setup SYN |
| 1728 | */ |
| 1729 | flags |= (TH_ECE | TH_CWR); |
| 1730 | tcpstat.tcps_ecn_client_setup++; |
| 1731 | tp->ecn_flags |= TE_CLIENT_SETUP; |
| 1732 | |
| 1733 | /* |
| 1734 | * Record that we sent the ECN-setup and default to |
| 1735 | * setting IP ECT. |
| 1736 | */ |
| 1737 | tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT); |
| 1738 | } else { |
| 1739 | /* |
| 1740 | * We sent an ECN-setup SYN but it was dropped. |
| 1741 | * Fall back to non-ECN and clear flag indicating |
| 1742 | * we should send data with IP ECT set. |
| 1743 | */ |
| 1744 | if (tp->ecn_flags & TE_SETUPSENT) { |
| 1745 | tcpstat.tcps_ecn_lost_syn++; |
| 1746 | tp->ecn_flags |= TE_LOST_SYN; |
| 1747 | } |
| 1748 | tp->ecn_flags &= ~TE_SENDIPECT; |
| 1749 | } |
| 1750 | } |
| 1751 | |
| 1752 | /* |
| 1753 | * Check if we should set the TCP CWR flag. |
| 1754 | * CWR flag is sent when we reduced the congestion window because |
| 1755 | * we received a TCP ECE or we performed a fast retransmit. We |
| 1756 | * never set the CWR flag on retransmitted packets. We only set |
| 1757 | * the CWR flag on data packets. Pure acks don't have this set. |
| 1758 | */ |
| 1759 | if ((tp->ecn_flags & TE_SENDCWR) != 0 && len != 0 && |
| 1760 | !SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) { |
| 1761 | flags |= TH_CWR; |
| 1762 | tp->ecn_flags &= ~TE_SENDCWR; |
| 1763 | } |
| 1764 | |
| 1765 | /* |
| 1766 | * Check if we should set the TCP ECE flag. |
| 1767 | */ |
| 1768 | if ((tp->ecn_flags & TE_SENDECE) != 0 && len == 0) { |
| 1769 | flags |= TH_ECE; |
| 1770 | tcpstat.tcps_ecn_sent_ece++; |
| 1771 | } |
| 1772 | |
| 1773 | |
| 1774 | hdrlen += optlen; |
| 1775 | |
| 1776 | /* Reset DSACK sequence numbers */ |
| 1777 | tp->t_dsack_lseq = 0; |
| 1778 | tp->t_dsack_rseq = 0; |
| 1779 | |
| 1780 | #if INET6 |
| 1781 | if (isipv6) |
| 1782 | ipoptlen = ip6_optlen(inp); |
| 1783 | else |
| 1784 | #endif |
| 1785 | { |
| 1786 | if (tp_inp_options) { |
| 1787 | ipoptlen = tp_inp_options->m_len - |
| 1788 | offsetof(struct ipoption, ipopt_list); |
| 1789 | } else { |
| 1790 | ipoptlen = 0; |
| 1791 | } |
| 1792 | } |
| 1793 | #if IPSEC |
| 1794 | ipoptlen += ipsec_optlen; |
| 1795 | #endif |
| 1796 | |
| 1797 | /* |
| 1798 | * Adjust data length if insertion of options will |
| 1799 | * bump the packet length beyond the t_maxopd length. |
| 1800 | * Clear the FIN bit because we cut off the tail of |
| 1801 | * the segment. |
| 1802 | * |
| 1803 | * When doing TSO limit a burst to TCP_MAXWIN minus the |
| 1804 | * IP, TCP and Options length to keep ip->ip_len from |
| 1805 | * overflowing. Prevent the last segment from being |
| 1806 | * fractional thus making them all equal sized and set |
| 1807 | * the flag to continue sending. TSO is disabled when |
| 1808 | * IP options or IPSEC are present. |
| 1809 | */ |
| 1810 | if (len + optlen + ipoptlen > tp->t_maxopd) { |
| 1811 | /* |
| 1812 | * If there is still more to send, |
| 1813 | * don't close the connection. |
| 1814 | */ |
| 1815 | flags &= ~TH_FIN; |
| 1816 | if (tso) { |
| 1817 | int32_t tso_maxlen; |
| 1818 | |
| 1819 | tso_maxlen = tp->tso_max_segment_size ? |
| 1820 | tp->tso_max_segment_size : TCP_MAXWIN; |
| 1821 | |
| 1822 | if (len > tso_maxlen - hdrlen - optlen) { |
| 1823 | len = tso_maxlen - hdrlen - optlen; |
| 1824 | len = len - (len % (tp->t_maxopd - optlen)); |
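			/*
			 * e.g. with tso_maxlen 65535, hdrlen + optlen 84
			 * and an effective payload of 1448 per segment,
			 * len is first capped at 65451 and then rounded
			 * down to 65160, i.e. 45 full-sized segments.
			 */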
| 1825 | sendalot = 1; |
| 1826 | } else if (tp->t_flags & TF_NEEDFIN) { |
| 1827 | sendalot = 1; |
| 1828 | } |
| 1829 | } else { |
| 1830 | len = tp->t_maxopd - optlen - ipoptlen; |
| 1831 | sendalot = 1; |
| 1832 | } |
| 1833 | } |
| 1834 | |
| 1835 | if (max_linkhdr + hdrlen > MCLBYTES) |
| 1836 | panic("tcphdr too big" ); |
| 1837 | |
	/*
	 * Check if there is enough data in the send socket
	 * buffer to start measuring bandwidth.
	 */
| 1841 | if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 && |
| 1842 | (tp->t_bwmeas != NULL) && |
| 1843 | (tp->t_flagsext & TF_BWMEAS_INPROGRESS) == 0) { |
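		/*
		 * Measure over the smallest of the data not yet sent
		 * (sb_cc minus the outstanding snd_max - snd_una bytes),
		 * the congestion window and the send window, clamped
		 * below by bw_minsize and above by bw_maxsize.
		 */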
| 1844 | tp->t_bwmeas->bw_size = min(min( |
| 1845 | (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)), |
| 1846 | tp->snd_cwnd), tp->snd_wnd); |
| 1847 | if (tp->t_bwmeas->bw_minsize > 0 && |
| 1848 | tp->t_bwmeas->bw_size < tp->t_bwmeas->bw_minsize) |
| 1849 | tp->t_bwmeas->bw_size = 0; |
| 1850 | if (tp->t_bwmeas->bw_maxsize > 0) |
| 1851 | tp->t_bwmeas->bw_size = min(tp->t_bwmeas->bw_size, |
| 1852 | tp->t_bwmeas->bw_maxsize); |
| 1853 | if (tp->t_bwmeas->bw_size > 0) { |
| 1854 | tp->t_flagsext |= TF_BWMEAS_INPROGRESS; |
| 1855 | tp->t_bwmeas->bw_start = tp->snd_max; |
| 1856 | tp->t_bwmeas->bw_ts = tcp_now; |
| 1857 | } |
| 1858 | } |
| 1859 | |
| 1860 | VERIFY(inp->inp_flowhash != 0); |
| 1861 | /* |
| 1862 | * Grab a header mbuf, attaching a copy of data to |
| 1863 | * be transmitted, and initialize the header from |
| 1864 | * the template for sends on this connection. |
| 1865 | */ |
| 1866 | if (len) { |
| 1867 | tp->t_pmtud_lastseg_size = len + optlen + ipoptlen; |
| 1868 | if ((tp->t_flagsext & TF_FORCE) && len == 1) |
| 1869 | tcpstat.tcps_sndprobe++; |
| 1870 | else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { |
| 1871 | tcpstat.tcps_sndrexmitpack++; |
| 1872 | tcpstat.tcps_sndrexmitbyte += len; |
| 1873 | if (nstat_collect) { |
| 1874 | nstat_route_tx(inp->inp_route.ro_rt, 1, |
| 1875 | len, NSTAT_TX_FLAG_RETRANSMIT); |
| 1876 | INP_ADD_STAT(inp, cell, wifi, wired, |
| 1877 | txpackets, 1); |
| 1878 | INP_ADD_STAT(inp, cell, wifi, wired, |
| 1879 | txbytes, len); |
| 1880 | tp->t_stat.txretransmitbytes += len; |
| 1881 | tp->t_stat.rxmitpkts++; |
| 1882 | } |
| 1883 | } else { |
| 1884 | tcpstat.tcps_sndpack++; |
| 1885 | tcpstat.tcps_sndbyte += len; |
| 1886 | |
| 1887 | if (nstat_collect) { |
| 1888 | INP_ADD_STAT(inp, cell, wifi, wired, |
| 1889 | txpackets, 1); |
| 1890 | INP_ADD_STAT(inp, cell, wifi, wired, |
| 1891 | txbytes, len); |
| 1892 | } |
| 1893 | inp_decr_sndbytes_unsent(so, len); |
| 1894 | } |
| 1895 | inp_set_activity_bitmap(inp); |
| 1896 | #if MPTCP |
| 1897 | if (tp->t_mpflags & TMPF_MPTCP_TRUE) { |
| 1898 | tcpstat.tcps_mp_sndpacks++; |
| 1899 | tcpstat.tcps_mp_sndbytes += len; |
| 1900 | } |
| 1901 | #endif /* MPTCP */ |
| 1902 | /* |
| 1903 | * try to use the new interface that allocates all |
| 1904 | * the necessary mbuf hdrs under 1 mbuf lock and |
| 1905 | * avoids rescanning the socket mbuf list if |
| 1906 | * certain conditions are met. This routine can't |
| 1907 | * be used in the following cases... |
| 1908 | * 1) the protocol headers exceed the capacity of |
| 1909 | * of a single mbuf header's data area (no cluster attached) |
| 1910 | * 2) the length of the data being transmitted plus |
| 1911 | * the protocol headers fits into a single mbuf header's |
| 1912 | * data area (no cluster attached) |
| 1913 | */ |
| 1914 | m = NULL; |
| 1915 | |
| 1916 | /* minimum length we are going to allocate */ |
| 1917 | allocated_len = MHLEN; |
| 1918 | if (MHLEN < hdrlen + max_linkhdr) { |
| 1919 | MGETHDR(m, M_DONTWAIT, MT_HEADER); |
| 1920 | if (m == NULL) { |
| 1921 | error = ENOBUFS; |
| 1922 | goto out; |
| 1923 | } |
| 1924 | MCLGET(m, M_DONTWAIT); |
| 1925 | if ((m->m_flags & M_EXT) == 0) { |
| 1926 | m_freem(m); |
| 1927 | error = ENOBUFS; |
| 1928 | goto out; |
| 1929 | } |
| 1930 | m->m_data += max_linkhdr; |
| 1931 | m->m_len = hdrlen; |
| 1932 | allocated_len = MCLBYTES; |
| 1933 | } |
| 1934 | if (len <= allocated_len - hdrlen - max_linkhdr) { |
| 1935 | if (m == NULL) { |
| 1936 | VERIFY(allocated_len <= MHLEN); |
| 1937 | MGETHDR(m, M_DONTWAIT, MT_HEADER); |
| 1938 | if (m == NULL) { |
| 1939 | error = ENOBUFS; |
| 1940 | goto out; |
| 1941 | } |
| 1942 | m->m_data += max_linkhdr; |
| 1943 | m->m_len = hdrlen; |
| 1944 | } |
		/* make sure we still have data left to be sent at this point */
| 1946 | if (so->so_snd.sb_mb == NULL || off < 0) { |
| 1947 | if (m != NULL) m_freem(m); |
| 1948 | error = 0; /* should we return an error? */ |
| 1949 | goto out; |
| 1950 | } |
| 1951 | m_copydata(so->so_snd.sb_mb, off, (int) len, |
| 1952 | mtod(m, caddr_t) + hdrlen); |
| 1953 | m->m_len += len; |
| 1954 | } else { |
| 1955 | uint32_t copymode; |
| 1956 | /* |
| 1957 | * Retain packet header metadata at the socket |
			 * buffer if this is an MPTCP subflow,
| 1959 | * otherwise move it. |
| 1960 | */ |
| 1961 | copymode = M_COPYM_MOVE_HDR; |
| 1962 | #if MPTCP |
| 1963 | if (so->so_flags & SOF_MP_SUBFLOW) { |
| 1964 | copymode = M_COPYM_NOOP_HDR; |
| 1965 | } |
| 1966 | #endif /* MPTCP */ |
| 1967 | if (m != NULL) { |
| 1968 | m->m_next = m_copym_mode(so->so_snd.sb_mb, |
| 1969 | off, (int)len, M_DONTWAIT, copymode); |
| 1970 | if (m->m_next == NULL) { |
| 1971 | (void) m_free(m); |
| 1972 | error = ENOBUFS; |
| 1973 | goto out; |
| 1974 | } |
| 1975 | } else { |
| 1976 | /* |
| 1977 | * make sure we still have data left |
| 1978 | * to be sent at this point |
| 1979 | */ |
| 1980 | if (so->so_snd.sb_mb == NULL) { |
| 1981 | error = 0; /* should we return an error? */ |
| 1982 | goto out; |
| 1983 | } |
| 1984 | |
| 1985 | /* |
| 1986 | * m_copym_with_hdrs will always return the |
| 1987 | * last mbuf pointer and the offset into it that |
				 * it acted on to fulfill the current request,
| 1989 | * whether a valid 'hint' was passed in or not. |
| 1990 | */ |
| 1991 | if ((m = m_copym_with_hdrs(so->so_snd.sb_mb, |
| 1992 | off, len, M_DONTWAIT, NULL, NULL, |
| 1993 | copymode)) == NULL) { |
| 1994 | error = ENOBUFS; |
| 1995 | goto out; |
| 1996 | } |
| 1997 | m->m_data += max_linkhdr; |
| 1998 | m->m_len = hdrlen; |
| 1999 | } |
| 2000 | } |
| 2001 | /* |
| 2002 | * If we're sending everything we've got, set PUSH. |
| 2003 | * (This will keep happy those implementations which only |
| 2004 | * give data to the user when a buffer fills or |
| 2005 | * a PUSH comes in.) |
| 2006 | * |
| 2007 | * On SYN-segments we should not add the PUSH-flag. |
| 2008 | */ |
| 2009 | if (off + len == so->so_snd.sb_cc && !(flags & TH_SYN)) |
| 2010 | flags |= TH_PUSH; |
| 2011 | } else { |
| 2012 | if (tp->t_flags & TF_ACKNOW) |
| 2013 | tcpstat.tcps_sndacks++; |
| 2014 | else if (flags & (TH_SYN|TH_FIN|TH_RST)) |
| 2015 | tcpstat.tcps_sndctrl++; |
| 2016 | else if (SEQ_GT(tp->snd_up, tp->snd_una)) |
| 2017 | tcpstat.tcps_sndurg++; |
| 2018 | else |
| 2019 | tcpstat.tcps_sndwinup++; |
| 2020 | |
| 2021 | MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */ |
| 2022 | if (m == NULL) { |
| 2023 | error = ENOBUFS; |
| 2024 | goto out; |
| 2025 | } |
| 2026 | if (MHLEN < (hdrlen + max_linkhdr)) { |
| 2027 | MCLGET(m, M_DONTWAIT); |
| 2028 | if ((m->m_flags & M_EXT) == 0) { |
| 2029 | m_freem(m); |
| 2030 | error = ENOBUFS; |
| 2031 | goto out; |
| 2032 | } |
| 2033 | } |
| 2034 | m->m_data += max_linkhdr; |
| 2035 | m->m_len = hdrlen; |
| 2036 | } |
| 2037 | m->m_pkthdr.rcvif = 0; |
| 2038 | #if CONFIG_MACF_NET |
| 2039 | mac_mbuf_label_associate_inpcb(inp, m); |
| 2040 | #endif |
| 2041 | #if INET6 |
| 2042 | if (isipv6) { |
| 2043 | ip6 = mtod(m, struct ip6_hdr *); |
| 2044 | th = (struct tcphdr *)(void *)(ip6 + 1); |
| 2045 | tcp_fillheaders(tp, ip6, th); |
| 2046 | if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len && |
| 2047 | !SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) { |
| 2048 | ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); |
| 2049 | } |
| 2050 | svc_flags |= PKT_SCF_IPV6; |
| 2051 | #if PF_ECN |
| 2052 | m_pftag(m)->pftag_hdr = (void *)ip6; |
| 2053 | m_pftag(m)->pftag_flags |= PF_TAG_HDR_INET6; |
| 2054 | #endif /* PF_ECN */ |
| 2055 | } else |
| 2056 | #endif /* INET6 */ |
| 2057 | { |
| 2058 | ip = mtod(m, struct ip *); |
| 2059 | ipov = (struct ipovly *)ip; |
| 2060 | th = (struct tcphdr *)(void *)(ip + 1); |
| 2061 | /* this picks up the pseudo header (w/o the length) */ |
| 2062 | tcp_fillheaders(tp, ip, th); |
| 2063 | if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len && |
| 2064 | !SEQ_LT(tp->snd_nxt, tp->snd_max) && |
| 2065 | !sack_rxmit && !(flags & TH_SYN)) { |
| 2066 | ip->ip_tos |= IPTOS_ECN_ECT0; |
| 2067 | } |
| 2068 | #if PF_ECN |
| 2069 | m_pftag(m)->pftag_hdr = (void *)ip; |
| 2070 | m_pftag(m)->pftag_flags |= PF_TAG_HDR_INET; |
| 2071 | #endif /* PF_ECN */ |
| 2072 | } |
| 2073 | |
| 2074 | /* |
| 2075 | * Fill in fields, remembering maximum advertised |
| 2076 | * window for use in delaying messages about window sizes. |
| 2077 | * If resending a FIN, be sure not to use a new sequence number. |
| 2078 | */ |
| 2079 | if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) && |
| 2080 | tp->snd_nxt == tp->snd_max) |
| 2081 | tp->snd_nxt--; |
| 2082 | /* |
| 2083 | * If we are doing retransmissions, then snd_nxt will |
| 2084 | * not reflect the first unsent octet. For ACK only |
| 2085 | * packets, we do not want the sequence number of the |
| 2086 | * retransmitted packet, we want the sequence number |
| 2087 | * of the next unsent octet. So, if there is no data |
| 2088 | * (and no SYN or FIN), use snd_max instead of snd_nxt |
| 2089 | * when filling in ti_seq. But if we are in persist |
| 2090 | * state, snd_max might reflect one byte beyond the |
| 2091 | * right edge of the window, so use snd_nxt in that |
| 2092 | * case, since we know we aren't doing a retransmission. |
| 2093 | * (retransmit and persist are mutually exclusive...) |
| 2094 | * |
| 2095 | * Note the state of this retransmit segment to detect spurious |
| 2096 | * retransmissions. |
| 2097 | */ |
| 2098 | if (sack_rxmit == 0) { |
| 2099 | if (len || (flags & (TH_SYN|TH_FIN)) || |
| 2100 | tp->t_timer[TCPT_PERSIST]) { |
| 2101 | th->th_seq = htonl(tp->snd_nxt); |
| 2102 | if (len > 0) { |
| 2103 | m->m_pkthdr.tx_start_seq = tp->snd_nxt; |
| 2104 | m->m_pkthdr.pkt_flags |= PKTF_START_SEQ; |
| 2105 | } |
| 2106 | if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { |
| 2107 | if (SACK_ENABLED(tp) && len > 1) { |
| 2108 | tcp_rxtseg_insert(tp, tp->snd_nxt, |
| 2109 | (tp->snd_nxt + len - 1)); |
| 2110 | } |
| 2111 | if (len > 0) |
| 2112 | m->m_pkthdr.pkt_flags |= |
| 2113 | PKTF_TCP_REXMT; |
| 2114 | } |
| 2115 | } else { |
| 2116 | th->th_seq = htonl(tp->snd_max); |
| 2117 | } |
| 2118 | } else { |
| 2119 | th->th_seq = htonl(p->rxmit); |
| 2120 | if (len > 0) { |
| 2121 | m->m_pkthdr.pkt_flags |= |
| 2122 | (PKTF_TCP_REXMT | PKTF_START_SEQ); |
| 2123 | m->m_pkthdr.tx_start_seq = p->rxmit; |
| 2124 | } |
| 2125 | tcp_rxtseg_insert(tp, p->rxmit, (p->rxmit + len - 1)); |
| 2126 | p->rxmit += len; |
| 2127 | tp->sackhint.sack_bytes_rexmit += len; |
| 2128 | } |
| 2129 | th->th_ack = htonl(tp->rcv_nxt); |
| 2130 | tp->last_ack_sent = tp->rcv_nxt; |
| 2131 | if (optlen) { |
| 2132 | bcopy(opt, th + 1, optlen); |
| 2133 | th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; |
| 2134 | } |
| 2135 | th->th_flags = flags; |
| 2136 | th->th_win = htons((u_short) (recwin>>tp->rcv_scale)); |
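	/*
	 * e.g. a 262144-byte receive window with rcv_scale 3 is
	 * advertised on the wire as 32768; the peer scales it back
	 * up by rcv_scale on receipt.
	 */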
| 2137 | if (recwin > 0 && SEQ_LT(tp->rcv_adv, tp->rcv_nxt + recwin)) |
| 2138 | tp->rcv_adv = tp->rcv_nxt + recwin; |
| 2139 | |
| 2140 | /* |
| 2141 | * Adjust the RXWIN0SENT flag - indicate that we have advertised |
| 2142 | * a 0 window. This may cause the remote transmitter to stall. This |
| 2143 | * flag tells soreceive() to disable delayed acknowledgements when |
| 2144 | * draining the buffer. This can occur if the receiver is attempting |
	 * to read more data than can be buffered prior to transmitting on
| 2146 | * the connection. |
| 2147 | */ |
| 2148 | if (th->th_win == 0) |
| 2149 | tp->t_flags |= TF_RXWIN0SENT; |
| 2150 | else |
| 2151 | tp->t_flags &= ~TF_RXWIN0SENT; |
| 2152 | if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { |
| 2153 | th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); |
| 2154 | th->th_flags |= TH_URG; |
| 2155 | } else { |
| 2156 | /* |
| 2157 | * If no urgent pointer to send, then we pull |
| 2158 | * the urgent pointer to the left edge of the send window |
| 2159 | * so that it doesn't drift into the send window on sequence |
| 2160 | * number wraparound. |
| 2161 | */ |
| 2162 | tp->snd_up = tp->snd_una; /* drag it along */ |
| 2163 | } |
| 2164 | |
| 2165 | /* |
| 2166 | * Put TCP length in extended header, and then |
| 2167 | * checksum extended header and data. |
| 2168 | */ |
| 2169 | m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ |
| 2170 | |
| 2171 | /* |
| 2172 | * If this is potentially the last packet on the stream, then mark |
| 2173 | * it in order to enable some optimizations in the underlying |
| 2174 | * layers |
| 2175 | */ |
| 2176 | if (tp->t_state != TCPS_ESTABLISHED && |
| 2177 | (tp->t_state == TCPS_CLOSING || tp->t_state == TCPS_TIME_WAIT |
| 2178 | || tp->t_state == TCPS_LAST_ACK || (th->th_flags & TH_RST))) |
| 2179 | m->m_pkthdr.pkt_flags |= PKTF_LAST_PKT; |
| 2180 | |
| 2181 | #if INET6 |
| 2182 | if (isipv6) { |
| 2183 | /* |
| 2184 | * ip6_plen is not need to be filled now, and will be filled |
| 2185 | * in ip6_output. |
| 2186 | */ |
| 2187 | m->m_pkthdr.csum_flags = CSUM_TCPIPV6; |
| 2188 | m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); |
| 2189 | if (len + optlen) |
| 2190 | th->th_sum = in_addword(th->th_sum, |
| 2191 | htons((u_short)(optlen + len))); |
| 2192 | } |
| 2193 | else |
| 2194 | #endif /* INET6 */ |
| 2195 | { |
| 2196 | m->m_pkthdr.csum_flags = CSUM_TCP; |
| 2197 | m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); |
| 2198 | if (len + optlen) |
| 2199 | th->th_sum = in_addword(th->th_sum, |
| 2200 | htons((u_short)(optlen + len))); |
| 2201 | } |
| 2202 | |
| 2203 | /* |
| 2204 | * Enable TSO and specify the size of the segments. |
| 2205 | * The TCP pseudo header checksum is always provided. |
| 2206 | */ |
| 2207 | if (tso) { |
| 2208 | #if INET6 |
| 2209 | if (isipv6) |
| 2210 | m->m_pkthdr.csum_flags |= CSUM_TSO_IPV6; |
| 2211 | else |
| 2212 | #endif /* INET6 */ |
| 2213 | m->m_pkthdr.csum_flags |= CSUM_TSO_IPV4; |
| 2214 | |
| 2215 | m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; |
| 2216 | } else { |
| 2217 | m->m_pkthdr.tso_segsz = 0; |
| 2218 | } |
| 2219 | |
| 2220 | /* |
| 2221 | * In transmit state, time the transmission and arrange for |
| 2222 | * the retransmit. In persist state, just set snd_max. |
| 2223 | */ |
| 2224 | if (!(tp->t_flagsext & TF_FORCE) |
| 2225 | || tp->t_timer[TCPT_PERSIST] == 0) { |
| 2226 | tcp_seq startseq = tp->snd_nxt; |
| 2227 | |
| 2228 | /* |
| 2229 | * Advance snd_nxt over sequence space of this segment. |
| 2230 | */ |
| 2231 | if (flags & (TH_SYN|TH_FIN)) { |
| 2232 | if (flags & TH_SYN) |
| 2233 | tp->snd_nxt++; |
| 2234 | if ((flags & TH_FIN) && |
| 2235 | !(tp->t_flags & TF_SENTFIN)) { |
| 2236 | tp->snd_nxt++; |
| 2237 | tp->t_flags |= TF_SENTFIN; |
| 2238 | } |
| 2239 | } |
| 2240 | if (sack_rxmit) |
| 2241 | goto timer; |
| 2242 | if (sack_rescue_rxt == TRUE) { |
| 2243 | tp->snd_nxt = old_snd_nxt; |
| 2244 | sack_rescue_rxt = FALSE; |
| 2245 | tcpstat.tcps_pto_in_recovery++; |
| 2246 | } else { |
| 2247 | tp->snd_nxt += len; |
| 2248 | } |
| 2249 | if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { |
| 2250 | tp->snd_max = tp->snd_nxt; |
| 2251 | tp->t_sndtime = tcp_now; |
| 2252 | /* |
| 2253 | * Time this transmission if not a retransmission and |
| 2254 | * not currently timing anything. |
| 2255 | */ |
| 2256 | if (tp->t_rtttime == 0) { |
| 2257 | tp->t_rtttime = tcp_now; |
| 2258 | tp->t_rtseq = startseq; |
| 2259 | tcpstat.tcps_segstimed++; |
| 2260 | |
| 2261 | /* update variables related to pipe ack */ |
| 2262 | tp->t_pipeack_lastuna = tp->snd_una; |
| 2263 | } |
| 2264 | } |
| 2265 | |
| 2266 | /* |
| 2267 | * Set retransmit timer if not currently set, |
| 2268 | * and not doing an ack or a keep-alive probe. |
| 2269 | */ |
| 2270 | timer: |
| 2271 | if (tp->t_timer[TCPT_REXMT] == 0 && |
| 2272 | ((sack_rxmit && tp->snd_nxt != tp->snd_max) || |
| 2273 | tp->snd_nxt != tp->snd_una || (flags & TH_FIN))) { |
| 2274 | if (tp->t_timer[TCPT_PERSIST]) { |
| 2275 | tp->t_timer[TCPT_PERSIST] = 0; |
| 2276 | tp->t_persist_stop = 0; |
| 2277 | TCP_RESET_REXMT_STATE(tp); |
| 2278 | } |
| 2279 | tp->t_timer[TCPT_REXMT] = |
| 2280 | OFFSET_FROM_START(tp, tp->t_rxtcur); |
| 2281 | } |
| 2282 | |
| 2283 | /* |
| 2284 | * Set tail loss probe timeout if new data is being |
| 2285 | * transmitted. This will be supported only when |
| 2286 | * SACK option is enabled on a connection. |
| 2287 | * |
| 2288 | * Every time new data is sent PTO will get reset. |
| 2289 | */ |
| 2290 | if (tcp_enable_tlp && len != 0 && tp->t_state == TCPS_ESTABLISHED && |
| 2291 | SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) && |
| 2292 | tp->snd_nxt == tp->snd_max && |
| 2293 | SEQ_GT(tp->snd_nxt, tp->snd_una) && |
| 2294 | tp->t_rxtshift == 0 && |
| 2295 | (tp->t_flagsext & (TF_SENT_TLPROBE|TF_PKTS_REORDERED)) == 0) { |
| 2296 | u_int32_t pto, srtt; |
| 2297 | |
| 2298 | /* |
| 2299 | * Using SRTT alone to set PTO can cause spurious |
| 2300 | * retransmissions on wireless networks where there |
| 2301 | * is a lot of variance in RTT. Taking variance |
| 2302 | * into account will avoid this. |
| 2303 | */ |
| 2304 | srtt = tp->t_srtt >> TCP_RTT_SHIFT; |
| 2305 | pto = ((TCP_REXMTVAL(tp)) * 3) >> 1; |
			pto = max(2 * srtt, pto);
| 2307 | if ((tp->snd_max - tp->snd_una) == tp->t_maxseg) |
| 2308 | pto = max(pto, |
| 2309 | (((3 * pto) >> 2) + tcp_delack * 2)); |
| 2310 | else |
| 2311 | pto = max(10, pto); |
| 2312 | |
| 2313 | /* if RTO is less than PTO, choose RTO instead */ |
| 2314 | if (tp->t_rxtcur < pto) |
| 2315 | pto = tp->t_rxtcur; |
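			/*
			 * Illustrative numbers (assuming millisecond
			 * ticks): with a smoothed RTT of 100ms and
			 * TCP_REXMTVAL of 200ms, pto starts as 300ms and
			 * max(2 * srtt, pto) leaves it at 300ms; with
			 * exactly one segment outstanding the delayed-ack
			 * term can raise it further, and the clamp above
			 * never lets it exceed the RTO.
			 */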
| 2316 | |
| 2317 | tp->t_timer[TCPT_PTO] = OFFSET_FROM_START(tp, pto); |
| 2318 | } |
| 2319 | } else { |
| 2320 | /* |
| 2321 | * Persist case, update snd_max but since we are in |
| 2322 | * persist mode (no window) we do not update snd_nxt. |
| 2323 | */ |
| 2324 | int xlen = len; |
| 2325 | if (flags & TH_SYN) |
| 2326 | ++xlen; |
| 2327 | if ((flags & TH_FIN) && |
| 2328 | !(tp->t_flags & TF_SENTFIN)) { |
| 2329 | ++xlen; |
| 2330 | tp->t_flags |= TF_SENTFIN; |
| 2331 | } |
| 2332 | if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { |
| 2333 | tp->snd_max = tp->snd_nxt + len; |
| 2334 | tp->t_sndtime = tcp_now; |
| 2335 | } |
| 2336 | } |
| 2337 | |
| 2338 | #if TCPDEBUG |
| 2339 | /* |
| 2340 | * Trace. |
| 2341 | */ |
| 2342 | if (so_options & SO_DEBUG) |
| 2343 | tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); |
| 2344 | #endif |
| 2345 | |
| 2346 | /* |
| 2347 | * Fill in IP length and desired time to live and |
| 2348 | * send to IP level. There should be a better way |
| 2349 | * to handle ttl and tos; we could keep them in |
| 2350 | * the template, but need a way to checksum without them. |
| 2351 | */ |
| 2352 | #if INET6 |
| 2353 | /* |
| 2354 | * m->m_pkthdr.len should have been set before cksum calcuration, |
| 2355 | * because in6_cksum() need it. |
| 2356 | */ |
| 2357 | if (isipv6) { |
| 2358 | /* |
| 2359 | * we separately set hoplimit for every segment, since the |
| 2360 | * user might want to change the value via setsockopt. |
| 2361 | * Also, desired default hop limit might be changed via |
| 2362 | * Neighbor Discovery. |
| 2363 | */ |
| 2364 | ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ? |
| 2365 | inp->in6p_route.ro_rt->rt_ifp : NULL); |
| 2366 | |
| 2367 | /* TODO: IPv6 IP6TOS_ECT bit on */ |
| 2368 | KERNEL_DEBUG(DBG_LAYER_BEG, |
| 2369 | ((inp->inp_fport << 16) | inp->inp_lport), |
| 2370 | (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | |
| 2371 | (inp->in6p_faddr.s6_addr16[0] & 0xffff)), |
| 2372 | sendalot,0,0); |
| 2373 | } else |
| 2374 | #endif /* INET6 */ |
| 2375 | { |
| 2376 | ip->ip_len = m->m_pkthdr.len; |
| 2377 | ip->ip_ttl = inp->inp_ip_ttl; /* XXX */ |
| 2378 | ip->ip_tos |= (inp->inp_ip_tos & ~IPTOS_ECN_MASK);/* XXX */ |
| 2379 | KERNEL_DEBUG(DBG_LAYER_BEG, |
| 2380 | ((inp->inp_fport << 16) | inp->inp_lport), |
| 2381 | (((inp->inp_laddr.s_addr & 0xffff) << 16) | |
| 2382 | (inp->inp_faddr.s_addr & 0xffff)), 0,0,0); |
| 2383 | } |
| 2384 | |
| 2385 | /* |
| 2386 | * See if we should do MTU discovery. |
| 2387 | * Look at the flag updated on the following criterias: |
| 2388 | * 1) Path MTU discovery is authorized by the sysctl |
| 2389 | * 2) The route isn't set yet (unlikely but could happen) |
| 2390 | * 3) The route is up |
| 2391 | * 4) the MTU is not locked (if it is, then discovery has been |
| 2392 | * disabled for that route) |
| 2393 | */ |
| 2394 | #if INET6 |
| 2395 | if (!isipv6) |
| 2396 | #endif /* INET6 */ |
| 2397 | if (path_mtu_discovery && (tp->t_flags & TF_PMTUD)) |
| 2398 | ip->ip_off |= IP_DF; |
| 2399 | |
| 2400 | #if NECP |
| 2401 | { |
| 2402 | necp_kernel_policy_id policy_id; |
| 2403 | necp_kernel_policy_id skip_policy_id; |
| 2404 | u_int32_t route_rule_id; |
| 2405 | if (!necp_socket_is_allowed_to_send_recv(inp, &policy_id, &route_rule_id, &skip_policy_id)) { |
| 2406 | m_freem(m); |
| 2407 | error = EHOSTUNREACH; |
| 2408 | goto out; |
| 2409 | } |
| 2410 | necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id, skip_policy_id); |
| 2411 | |
| 2412 | if (net_qos_policy_restricted != 0) { |
| 2413 | necp_socket_update_qos_marking(inp, inp->inp_route.ro_rt, |
| 2414 | NULL, route_rule_id); |
| 2415 | } |
| 2416 | } |
| 2417 | #endif /* NECP */ |
| 2418 | |
| 2419 | #if IPSEC |
| 2420 | if (inp->inp_sp != NULL) |
| 2421 | ipsec_setsocket(m, so); |
| 2422 | #endif /*IPSEC*/ |
| 2423 | |
| 2424 | /* |
| 2425 | * The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active. |
| 2426 | */ |
| 2427 | lost = 0; |
| 2428 | |
| 2429 | /* |
| 2430 | * Embed the flow hash in pkt hdr and mark the packet as |
| 2431 | * capable of flow controlling |
| 2432 | */ |
| 2433 | m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB; |
| 2434 | m->m_pkthdr.pkt_flowid = inp->inp_flowhash; |
| 2435 | m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | PKTF_FLOW_ADV); |
| 2436 | m->m_pkthdr.pkt_proto = IPPROTO_TCP; |
| 2437 | m->m_pkthdr.tx_tcp_pid = so->last_pid; |
| 2438 | if (so->so_flags & SOF_DELEGATED) |
| 2439 | m->m_pkthdr.tx_tcp_e_pid = so->e_pid; |
| 2440 | else |
| 2441 | m->m_pkthdr.tx_tcp_e_pid = 0; |
| 2442 | |
| 2443 | m->m_nextpkt = NULL; |
| 2444 | |
| 2445 | if (inp->inp_last_outifp != NULL && |
| 2446 | !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) { |
		/*
		 * Hint to prioritize this packet if
		 * 1. the packet has no data,
		 * 2. the interface supports the transmit-start model
		 *    and did not disable ACK prioritization,
		 * 3. only the ACK flag is set, and
		 * 4. there is no outstanding data on this connection.
		 */
| 2454 | if (tcp_prioritize_acks != 0 && len == 0 && |
| 2455 | (inp->inp_last_outifp->if_eflags & |
| 2456 | (IFEF_TXSTART | IFEF_NOACKPRI)) == IFEF_TXSTART) { |
| 2457 | if (th->th_flags == TH_ACK && |
| 2458 | tp->snd_una == tp->snd_max && |
| 2459 | tp->t_timer[TCPT_REXMT] == 0) |
| 2460 | svc_flags |= PKT_SCF_TCP_ACK; |
| 2461 | if (th->th_flags & TH_SYN) |
| 2462 | svc_flags |= PKT_SCF_TCP_SYN; |
| 2463 | } |
| 2464 | set_packet_service_class(m, so, sotc, svc_flags); |
| 2465 | } else { |
| 2466 | /* |
| 2467 | * Optimization for loopback just set the mbuf |
| 2468 | * service class |
| 2469 | */ |
| 2470 | (void) m_set_service_class(m, so_tc2msc(sotc)); |
| 2471 | } |
| 2472 | |
| 2473 | tp->t_pktlist_sentlen += len; |
| 2474 | tp->t_lastchain++; |
| 2475 | |
| 2476 | #if INET6 |
| 2477 | if (isipv6) { |
| 2478 | DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, inp, |
| 2479 | struct ip6 *, ip6, struct tcpcb *, tp, struct tcphdr *, |
| 2480 | th); |
| 2481 | } else |
| 2482 | #endif /* INET6 */ |
| 2483 | { |
| 2484 | DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, inp, |
| 2485 | struct ip *, ip, struct tcpcb *, tp, struct tcphdr *, th); |
| 2486 | } |
| 2487 | |
| 2488 | if (tp->t_pktlist_head != NULL) { |
| 2489 | tp->t_pktlist_tail->m_nextpkt = m; |
| 2490 | tp->t_pktlist_tail = m; |
| 2491 | } else { |
| 2492 | packchain_newlist++; |
| 2493 | tp->t_pktlist_head = tp->t_pktlist_tail = m; |
| 2494 | } |
| 2495 | |
| 2496 | if ((lro_ackmore) && (!sackoptlen) && (!tp->t_timer[TCPT_PERSIST]) && |
| 2497 | ((th->th_flags & TH_ACK) == TH_ACK) && (!len) && |
| 2498 | (tp->t_state == TCPS_ESTABLISHED)) { |
| 2499 | /* For a pure ACK, see if you need to send more of them */ |
| 2500 | mnext = tcp_send_lroacks(tp, m, th); |
| 2501 | if (mnext) { |
| 2502 | tp->t_pktlist_tail->m_nextpkt = mnext; |
| 2503 | if (mnext->m_nextpkt == NULL) { |
| 2504 | tp->t_pktlist_tail = mnext; |
| 2505 | tp->t_lastchain++; |
| 2506 | } else { |
| 2507 | struct mbuf *tail, *next; |
| 2508 | next = mnext->m_nextpkt; |
| 2509 | tail = next->m_nextpkt; |
| 2510 | while (tail) { |
| 2511 | next = tail; |
| 2512 | tail = tail->m_nextpkt; |
| 2513 | tp->t_lastchain++; |
| 2514 | } |
| 2515 | tp->t_pktlist_tail = next; |
| 2516 | } |
| 2517 | } |
| 2518 | } |
| 2519 | |
| 2520 | if (sendalot == 0 || (tp->t_state != TCPS_ESTABLISHED) || |
| 2521 | (tp->snd_cwnd <= (tp->snd_wnd / 8)) || |
| 2522 | (tp->t_flags & TF_ACKNOW) || |
| 2523 | (tp->t_flagsext & TF_FORCE) || |
| 2524 | tp->t_lastchain >= tcp_packet_chaining) { |
| 2525 | error = 0; |
| 2526 | while (inp->inp_sndinprog_cnt == 0 && |
| 2527 | tp->t_pktlist_head != NULL) { |
| 2528 | packetlist = tp->t_pktlist_head; |
| 2529 | packchain_listadd = tp->t_lastchain; |
| 2530 | packchain_sent++; |
| 2531 | lost = tp->t_pktlist_sentlen; |
| 2532 | TCP_PKTLIST_CLEAR(tp); |
| 2533 | |
| 2534 | error = tcp_ip_output(so, tp, packetlist, |
| 2535 | packchain_listadd, tp_inp_options, |
| 2536 | (so_options & SO_DONTROUTE), |
| 2537 | (sack_rxmit || (sack_bytes_rxmt != 0)), isipv6); |
| 2538 | if (error) { |
| 2539 | /* |
| 2540 | * Take into account the rest of unsent |
| 2541 | * packets in the packet list for this tcp |
| 2542 | * into "lost", since we're about to free |
| 2543 | * the whole list below. |
| 2544 | */ |
| 2545 | lost += tp->t_pktlist_sentlen; |
| 2546 | break; |
| 2547 | } else { |
| 2548 | lost = 0; |
| 2549 | } |
| 2550 | } |
| 2551 | /* tcp was closed while we were in ip; resume close */ |
| 2552 | if (inp->inp_sndinprog_cnt == 0 && |
| 2553 | (tp->t_flags & TF_CLOSING)) { |
| 2554 | tp->t_flags &= ~TF_CLOSING; |
| 2555 | (void) tcp_close(tp); |
| 2556 | return (0); |
| 2557 | } |
| 2558 | } else { |
| 2559 | error = 0; |
| 2560 | packchain_looped++; |
| 2561 | tcpstat.tcps_sndtotal++; |
| 2562 | |
| 2563 | goto again; |
| 2564 | } |
| 2565 | if (error) { |
| 2566 | /* |
| 2567 | * Assume that the packets were lost, so back out the |
| 2568 | * sequence number advance, if any. Note that the "lost" |
| 2569 | * variable represents the amount of user data sent during |
| 2570 | * the recent call to ip_output_list() plus the amount of |
| 2571 | * user data in the packet list for this tcp at the moment. |
| 2572 | */ |
| 2573 | if (!(tp->t_flagsext & TF_FORCE) |
| 2574 | || tp->t_timer[TCPT_PERSIST] == 0) { |
| 2575 | /* |
| 2576 | * No need to check for TH_FIN here because |
| 2577 | * the TF_SENTFIN flag handles that case. |
| 2578 | */ |
| 2579 | if ((flags & TH_SYN) == 0) { |
| 2580 | if (sack_rxmit) { |
| 2581 | if (SEQ_GT((p->rxmit - lost), |
| 2582 | tp->snd_una)) { |
| 2583 | p->rxmit -= lost; |
| 2584 | } else { |
| 2585 | lost = p->rxmit - tp->snd_una; |
| 2586 | p->rxmit = tp->snd_una; |
| 2587 | } |
| 2588 | tp->sackhint.sack_bytes_rexmit -= lost; |
| 2589 | } else { |
| 2590 | if (SEQ_GT((tp->snd_nxt - lost), |
| 2591 | tp->snd_una)) |
| 2592 | tp->snd_nxt -= lost; |
| 2593 | else |
| 2594 | tp->snd_nxt = tp->snd_una; |
| 2595 | } |
| 2596 | } |
| 2597 | } |
| 2598 | out: |
| 2599 | if (tp->t_pktlist_head != NULL) |
| 2600 | m_freem_list(tp->t_pktlist_head); |
| 2601 | TCP_PKTLIST_CLEAR(tp); |
| 2602 | |
| 2603 | if (error == ENOBUFS) { |
| 2604 | /* |
| 2605 | * Set retransmit timer if not currently set |
| 2606 | * when we failed to send a segment that can be |
| 2607 | * retransmitted (i.e. not pure ack or rst) |
| 2608 | */ |
| 2609 | if (!tp->t_timer[TCPT_REXMT] && |
| 2610 | !tp->t_timer[TCPT_PERSIST] && |
| 2611 | (len != 0 || (flags & (TH_SYN | TH_FIN)) != 0 || |
| 2612 | so->so_snd.sb_cc > 0)) |
| 2613 | tp->t_timer[TCPT_REXMT] = |
| 2614 | OFFSET_FROM_START(tp, tp->t_rxtcur); |
| 2615 | tp->snd_cwnd = tp->t_maxseg; |
| 2616 | tp->t_bytes_acked = 0; |
| 2617 | tcp_check_timer_state(tp); |
| 2618 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); |
| 2619 | |
| 2620 | tcp_ccdbg_trace(tp, NULL, TCP_CC_OUTPUT_ERROR); |
| 2621 | return (0); |
| 2622 | } |
| 2623 | if (error == EMSGSIZE) { |
| 2624 | /* |
| 2625 | * ip_output() will have already fixed the route |
| 2626 | * for us. tcp_mtudisc() will, as its last action, |
| 2627 | * initiate retransmission, so it is important to |
| 2628 | * not do so here. |
| 2629 | * |
		 * If TSO was active, we either got an interface
		 * without TSO capabilities or TSO was turned off.
		 * Disable it for this connection too and immediately
		 * retry with MSS-sized segments generated by this
		 * function.
| 2635 | */ |
| 2636 | if (tso) |
| 2637 | tp->t_flags &= ~TF_TSO; |
| 2638 | |
| 2639 | tcp_mtudisc(inp, 0); |
| 2640 | tcp_check_timer_state(tp); |
| 2641 | |
| 2642 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); |
		return (0);
| 2644 | } |
| 2645 | /* |
| 2646 | * Unless this is due to interface restriction policy, |
| 2647 | * treat EHOSTUNREACH/ENETDOWN as a soft error. |
| 2648 | */ |
| 2649 | if ((error == EHOSTUNREACH || error == ENETDOWN) && |
| 2650 | TCPS_HAVERCVDSYN(tp->t_state) && |
| 2651 | !inp_restricted_send(inp, inp->inp_last_outifp)) { |
| 2652 | tp->t_softerror = error; |
| 2653 | error = 0; |
| 2654 | } |
| 2655 | tcp_check_timer_state(tp); |
| 2656 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); |
| 2657 | return (error); |
| 2658 | } |
| 2659 | |
| 2660 | tcpstat.tcps_sndtotal++; |
| 2661 | |
| 2662 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,0,0,0,0,0); |
| 2663 | if (sendalot) |
| 2664 | goto again; |
| 2665 | |
| 2666 | tcp_check_timer_state(tp); |
| 2667 | return (0); |
| 2668 | } |
| 2669 | |
| 2670 | static int |
| 2671 | tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, |
| 2672 | int cnt, struct mbuf *opt, int flags, int sack_in_progress, boolean_t isipv6) |
| 2673 | { |
| 2674 | int error = 0; |
| 2675 | boolean_t chain; |
| 2676 | boolean_t unlocked = FALSE; |
| 2677 | boolean_t ifdenied = FALSE; |
| 2678 | struct inpcb *inp = tp->t_inpcb; |
| 2679 | struct ip_out_args ipoa; |
| 2680 | struct route ro; |
| 2681 | struct ifnet *outif = NULL; |
| 2682 | |
| 2683 | bzero(&ipoa, sizeof(ipoa)); |
| 2684 | ipoa.ipoa_boundif = IFSCOPE_NONE; |
| 2685 | ipoa.ipoa_flags = IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR; |
| 2686 | ipoa.ipoa_sotc = SO_TC_UNSPEC; |
| 2687 | ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC; |
| 2688 | #if INET6 |
| 2689 | struct ip6_out_args ip6oa; |
| 2690 | struct route_in6 ro6; |
| 2691 | |
| 2692 | bzero(&ip6oa, sizeof(ip6oa)); |
| 2693 | ip6oa.ip6oa_boundif = IFSCOPE_NONE; |
| 2694 | ip6oa.ip6oa_flags = IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR; |
| 2695 | ip6oa.ip6oa_sotc = SO_TC_UNSPEC; |
| 2696 | ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC; |
| 2697 | |
| 2698 | struct flowadv *adv = |
| 2699 | (isipv6 ? &ip6oa.ip6oa_flowadv : &ipoa.ipoa_flowadv); |
| 2700 | #else /* INET6 */ |
| 2701 | struct flowadv *adv = &ipoa.ipoa_flowadv; |
| 2702 | #endif /* !INET6 */ |
| 2703 | |
| 2704 | /* If socket was bound to an ifindex, tell ip_output about it */ |
| 2705 | if (inp->inp_flags & INP_BOUND_IF) { |
| 2706 | #if INET6 |
| 2707 | if (isipv6) { |
| 2708 | ip6oa.ip6oa_boundif = inp->inp_boundifp->if_index; |
| 2709 | ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF; |
| 2710 | } else |
| 2711 | #endif /* INET6 */ |
| 2712 | { |
| 2713 | ipoa.ipoa_boundif = inp->inp_boundifp->if_index; |
| 2714 | ipoa.ipoa_flags |= IPOAF_BOUND_IF; |
| 2715 | } |
| 2716 | } |
| 2717 | |
| 2718 | if (INP_NO_CELLULAR(inp)) { |
| 2719 | #if INET6 |
| 2720 | if (isipv6) |
| 2721 | ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR; |
| 2722 | else |
| 2723 | #endif /* INET6 */ |
| 2724 | ipoa.ipoa_flags |= IPOAF_NO_CELLULAR; |
| 2725 | } |
| 2726 | if (INP_NO_EXPENSIVE(inp)) { |
| 2727 | #if INET6 |
| 2728 | if (isipv6) |
| 2729 | ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE; |
| 2730 | else |
| 2731 | #endif /* INET6 */ |
| 2732 | ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE; |
| 2733 | |
| 2734 | } |
| 2735 | if (INP_AWDL_UNRESTRICTED(inp)) { |
| 2736 | #if INET6 |
| 2737 | if (isipv6) |
| 2738 | ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED; |
| 2739 | else |
| 2740 | #endif /* INET6 */ |
| 2741 | ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED; |
| 2742 | |
| 2743 | } |
| 2744 | #if INET6 |
| 2745 | if (INP_INTCOPROC_ALLOWED(inp) && isipv6) { |
| 2746 | ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED; |
| 2747 | } |
| 2748 | if (isipv6) { |
| 2749 | ip6oa.ip6oa_sotc = so->so_traffic_class; |
| 2750 | ip6oa.ip6oa_netsvctype = so->so_netsvctype; |
| 2751 | } else |
| 2752 | #endif /* INET6 */ |
| 2753 | { |
| 2754 | ipoa.ipoa_sotc = so->so_traffic_class; |
| 2755 | ipoa.ipoa_netsvctype = so->so_netsvctype; |
| 2756 | } |
| 2757 | if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED)) { |
| 2758 | #if INET6 |
| 2759 | if (isipv6) |
| 2760 | ip6oa.ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED; |
| 2761 | else |
| 2762 | #endif /* INET6 */ |
| 2763 | ipoa.ipoa_flags |= IPOAF_QOSMARKING_ALLOWED; |
| 2764 | } |
| 2765 | #if INET6 |
| 2766 | if (isipv6) |
| 2767 | flags |= IPV6_OUTARGS; |
| 2768 | else |
| 2769 | #endif /* INET6 */ |
| 2770 | flags |= IP_OUTARGS; |
| 2771 | |
| 2772 | /* Copy the cached route and take an extra reference */ |
| 2773 | #if INET6 |
| 2774 | if (isipv6) |
| 2775 | in6p_route_copyout(inp, &ro6); |
| 2776 | else |
| 2777 | #endif /* INET6 */ |
| 2778 | inp_route_copyout(inp, &ro); |
| 2779 | |
| 2780 | /* |
| 2781 | * Make sure ACK/DELACK conditions are cleared before |
| 2782 | * we unlock the socket. |
| 2783 | */ |
| 2784 | tp->last_ack_sent = tp->rcv_nxt; |
| 2785 | tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); |
| 2786 | tp->t_timer[TCPT_DELACK] = 0; |
| 2787 | tp->t_unacksegs = 0; |
| 2788 | |
| 2789 | /* Increment the count of outstanding send operations */ |
| 2790 | inp->inp_sndinprog_cnt++; |
| 2791 | |
| 2792 | /* |
| 2793 | * If allowed, unlock TCP socket while in IP |
| 2794 | * but only if the connection is established and |
| 2795 | * in a normal mode where reentrancy on the tcpcb won't be |
| 2796 | * an issue: |
| 2797 | * - there is no SACK episode |
| 2798 | * - we're not in Fast Recovery mode |
	 * - we're not sending from an upcall.
| 2800 | */ |
| 2801 | if (tcp_output_unlocked && !so->so_upcallusecount && |
| 2802 | (tp->t_state == TCPS_ESTABLISHED) && (sack_in_progress == 0) && |
| 2803 | !IN_FASTRECOVERY(tp) && !(so->so_flags & SOF_MP_SUBFLOW)) { |
| 2804 | |
| 2805 | unlocked = TRUE; |
| 2806 | socket_unlock(so, 0); |
| 2807 | } |
| 2808 | |
| 2809 | /* |
| 2810 | * Don't send down a chain of packets when: |
| 2811 | * - TCP chaining is disabled |
| 2812 | * - there is an IPsec rule set |
| 2813 | * - there is a non default rule set for the firewall |
| 2814 | */ |
| 2815 | |
| 2816 | chain = tcp_packet_chaining > 1 |
| 2817 | #if IPSEC |
| 2818 | && ipsec_bypass |
| 2819 | #endif |
| 2820 | #if IPFIREWALL |
| 2821 | && (fw_enable == 0 || fw_bypass) |
| 2822 | #endif |
| 2823 | ; // I'm important, not extraneous |
| 2824 | |
| 2825 | |
| 2826 | while (pkt != NULL) { |
| 2827 | struct mbuf *npkt = pkt->m_nextpkt; |
| 2828 | |
| 2829 | if (!chain) { |
| 2830 | pkt->m_nextpkt = NULL; |
| 2831 | /* |
| 2832 | * If we are not chaining, make sure to set the packet |
| 2833 | * list count to 0 so that IP takes the right path; |
| 2834 | * this is important for cases such as IPSec where a |
| 2835 | * single mbuf might result in multiple mbufs as part |
| 2836 | * of the encapsulation. If a non-zero count is passed |
| 2837 | * down to IP, the head of the chain might change and |
| 2838 | * we could end up skipping it (thus generating bogus |
| 2839 | * packets). Fixing it in IP would be desirable, but |
| 2840 | * for now this would do it. |
| 2841 | */ |
| 2842 | cnt = 0; |
| 2843 | } |
| 2844 | #if INET6 |
| 2845 | if (isipv6) { |
| 2846 | error = ip6_output_list(pkt, cnt, |
| 2847 | inp->in6p_outputopts, &ro6, flags, NULL, NULL, |
| 2848 | &ip6oa); |
| 2849 | ifdenied = (ip6oa.ip6oa_retflags & IP6OARF_IFDENIED); |
| 2850 | } else { |
| 2851 | #endif /* INET6 */ |
| 2852 | error = ip_output_list(pkt, cnt, opt, &ro, flags, NULL, |
| 2853 | &ipoa); |
| 2854 | ifdenied = (ipoa.ipoa_retflags & IPOARF_IFDENIED); |
| 2855 | } |
| 2856 | |
| 2857 | if (chain || error) { |
| 2858 | /* |
| 2859 | * If we sent down a chain then we are done since |
| 2860 | * the callee had taken care of everything; else |
| 2861 | * we need to free the rest of the chain ourselves. |
| 2862 | */ |
| 2863 | if (!chain) |
| 2864 | m_freem_list(npkt); |
| 2865 | break; |
| 2866 | } |
| 2867 | pkt = npkt; |
| 2868 | } |
| 2869 | |
| 2870 | if (unlocked) |
| 2871 | socket_lock(so, 0); |
| 2872 | |
| 2873 | /* |
| 2874 | * Enter flow controlled state if the connection is established |
| 2875 | * and is not in recovery. Flow control is allowed only if there |
| 2876 | * is outstanding data. |
| 2877 | * |
| 2878 | * A connection will enter suspended state even if it is in |
| 2879 | * recovery. |
| 2880 | */ |
| 2881 | if (((adv->code == FADV_FLOW_CONTROLLED && !IN_FASTRECOVERY(tp)) || |
| 2882 | adv->code == FADV_SUSPENDED) && |
| 2883 | !(tp->t_flags & TF_CLOSING) && |
| 2884 | tp->t_state == TCPS_ESTABLISHED && |
| 2885 | SEQ_GT(tp->snd_max, tp->snd_una)) { |
| 2886 | int rc; |
| 2887 | rc = inp_set_fc_state(inp, adv->code); |
| 2888 | |
| 2889 | if (rc == 1) |
| 2890 | tcp_ccdbg_trace(tp, NULL, |
| 2891 | ((adv->code == FADV_FLOW_CONTROLLED) ? |
| 2892 | TCP_CC_FLOW_CONTROL : TCP_CC_SUSPEND)); |
| 2893 | } |
| 2894 | |
| 2895 | /* |
| 2896 | * When an interface queue gets suspended, some of the |
| 2897 | * packets are dropped. Return ENOBUFS, to update the |
| 2898 | * pcb state. |
| 2899 | */ |
| 2900 | if (adv->code == FADV_SUSPENDED) |
| 2901 | error = ENOBUFS; |
| 2902 | |
| 2903 | VERIFY(inp->inp_sndinprog_cnt > 0); |
	if (--inp->inp_sndinprog_cnt == 0)
| 2905 | inp->inp_flags &= ~(INP_FC_FEEDBACK); |
| 2906 | |
| 2907 | #if INET6 |
| 2908 | if (isipv6) { |
| 2909 | if (ro6.ro_rt != NULL) |
| 2910 | outif = ro6.ro_rt->rt_ifp; |
| 2911 | } else |
| 2912 | #endif /* INET6 */ |
| 2913 | if (ro.ro_rt != NULL) |
| 2914 | outif = ro.ro_rt->rt_ifp; |
| 2915 | |
| 2916 | if (outif != NULL && outif != inp->inp_last_outifp && |
| 2917 | so->so_snd.sb_cc > 0) { |
| 2918 | /* Update the send byte count */ |
| 2919 | if (so->so_snd.sb_flags & SB_SNDBYTE_CNT) { |
| 2920 | inp_decr_sndbytes_total(so, so->so_snd.sb_cc); |
| 2921 | inp_decr_sndbytes_allunsent(so, tp->snd_una); |
| 2922 | so->so_snd.sb_flags &= ~SB_SNDBYTE_CNT; |
| 2923 | } |
		inp->inp_last_outifp = outif;
	}
| 2927 | |
| 2928 | if (error != 0 && ifdenied && |
| 2929 | (INP_NO_CELLULAR(inp) || INP_NO_EXPENSIVE(inp))) |
| 2930 | soevent(so, |
| 2931 | (SO_FILT_HINT_LOCKED|SO_FILT_HINT_IFDENIED)); |
| 2932 | |
| 2933 | /* Synchronize cached PCB route & options */ |
| 2934 | #if INET6 |
| 2935 | if (isipv6) |
| 2936 | in6p_route_copyin(inp, &ro6); |
| 2937 | else |
| 2938 | #endif /* INET6 */ |
| 2939 | inp_route_copyin(inp, &ro); |
| 2940 | |
| 2941 | if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift == 0 && |
| 2942 | tp->t_inpcb->inp_route.ro_rt != NULL) { |
		/*
		 * If we found the route and there is an RTT on it,
		 * reset the retransmit timer.
		 */
| 2946 | tcp_getrt_rtt(tp, tp->t_inpcb->in6p_route.ro_rt); |
| 2947 | tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); |
| 2948 | } |
| 2949 | return (error); |
| 2950 | } |
| 2951 | |
| 2952 | int tcptv_persmin_val = TCPTV_PERSMIN; |
| 2953 | |
| 2954 | void |
| 2955 | tcp_setpersist(struct tcpcb *tp) |
| 2956 | { |
| 2957 | int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; |
| 2958 | |
	/*
	 * If a PERSIST_TIMER option was set, we will limit the time
	 * the persist timer is active for that connection in order
	 * to avoid a DoS through zero window probes; see
	 * rdar://5805356.
	 */
| 2964 | |
| 2965 | if ((tp->t_persist_timeout != 0) && |
| 2966 | (tp->t_timer[TCPT_PERSIST] == 0) && |
| 2967 | (tp->t_persist_stop == 0)) { |
| 2968 | tp->t_persist_stop = tcp_now + tp->t_persist_timeout; |
| 2969 | } |
| 2970 | |
| 2971 | /* |
| 2972 | * Start/restart persistance timer. |
| 2973 | */ |
| 2974 | TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], |
| 2975 | t * tcp_backoff[tp->t_rxtshift], |
| 2976 | tcptv_persmin_val, TCPTV_PERSMAX, 0); |
| 2977 | tp->t_timer[TCPT_PERSIST] = OFFSET_FROM_START(tp, tp->t_timer[TCPT_PERSIST]); |
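	/*
	 * e.g. with t == 500ms and t_rxtshift == 2 (tcp_backoff[2]
	 * == 4), the timer is armed for 2 seconds, subject to the
	 * [tcptv_persmin_val, TCPTV_PERSMAX] bounds.
	 */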
| 2978 | |
| 2979 | if (tp->t_rxtshift < TCP_MAXRXTSHIFT) |
| 2980 | tp->t_rxtshift++; |
| 2981 | } |
| 2982 | |
| 2983 | /* |
| 2984 | * Send as many acks as data coalesced. Every other packet when stretch |
| 2985 | * ACK is not enabled. Every 8 packets, if stretch ACK is enabled. |
| 2986 | */ |
| 2987 | static struct mbuf* |
| 2988 | tcp_send_lroacks(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th) |
| 2989 | { |
| 2990 | struct mbuf *mnext = NULL, *ack_chain = NULL, *tail = NULL; |
| 2991 | int count = 0; |
| 2992 | tcp_seq org_ack = ntohl(th->th_ack); |
| 2993 | tcp_seq prev_ack = 0; |
| 2994 | int tack_offset = 28; /* IPv6 and IP options not supported */ |
| 2995 | int twin_offset = 34; /* IPv6 and IP options not supported */ |
| 2996 | int ack_size = (tp->t_flags & TF_STRETCHACK) ? |
| 2997 | (maxseg_unacked * tp->t_maxseg) : (tp->t_maxseg << 1); |
| 2998 | int segs_acked = (tp->t_flags & TF_STRETCHACK) ? maxseg_unacked : 2; |
| 2999 | struct mbuf *prev_ack_pkt = NULL; |
| 3000 | struct socket *so = tp->t_inpcb->inp_socket; |
| 3001 | unsigned short winsz = ntohs(th->th_win); |
| 3002 | unsigned int scaled_win = winsz<<tp->rcv_scale; |
| 3003 | tcp_seq win_rtedge = org_ack + scaled_win; |
| 3004 | |
| 3005 | count = tp->t_lropktlen/tp->t_maxseg; |
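	/*
	 * count is the number of coalesced segments; the loop below
	 * walks backwards from the coalesced ACK number, emitting one
	 * duplicate ACK per ack_size bytes (every other segment
	 * normally, every maxseg_unacked segments with stretch ACK).
	 */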
| 3006 | |
| 3007 | prev_ack = (org_ack - tp->t_lropktlen) + ack_size; |
| 3008 | if (prev_ack < org_ack) { |
| 3009 | ack_chain = m_dup(m, M_DONTWAIT); |
| 3010 | if (ack_chain) { |
| 3011 | th->th_ack = htonl(prev_ack); |
| 3012 | /* Keep adv window constant for duplicated ACK packets */ |
| 3013 | scaled_win = win_rtedge - prev_ack; |
| 3014 | if (scaled_win > (int32_t)(TCP_MAXWIN << tp->rcv_scale)) |
| 3015 | scaled_win = (int32_t)(TCP_MAXWIN << tp->rcv_scale); |
| 3016 | th->th_win = htons(scaled_win>>tp->rcv_scale); |
| 3017 | if (lrodebug == 5) { |
| 3018 | printf("%s: win = %d winsz = %d sc = %d" |
| 3019 | " lro_len %d %d\n" , |
| 3020 | __func__, scaled_win>>tp->rcv_scale, winsz, |
| 3021 | tp->rcv_scale, tp->t_lropktlen, count); |
| 3022 | } |
| 3023 | tail = ack_chain; |
| 3024 | count -= segs_acked; /* accounts for prev_ack packet */ |
| 3025 | count = (count <= segs_acked) ? 0 : count - segs_acked; |
| 3026 | tcpstat.tcps_sndacks++; |
| 3027 | so_tc_update_stats(m, so, m_get_service_class(m)); |
| 3028 | } else { |
| 3029 | return NULL; |
| 3030 | } |
| 3031 | } |
| 3032 | else { |
| 3033 | tp->t_lropktlen = 0; |
| 3034 | return NULL; |
| 3035 | } |
| 3036 | |
| 3037 | prev_ack_pkt = ack_chain; |
| 3038 | |
| 3039 | while (count > 0) { |
| 3040 | if ((prev_ack + ack_size) < org_ack) { |
| 3041 | prev_ack += ack_size; |
| 3042 | } else { |
| 3043 | /* |
| 3044 | * The last ACK sent must have the ACK number that TCP |
| 3045 | * thinks is the last sent ACK number. |
| 3046 | */ |
| 3047 | prev_ack = org_ack; |
| 3048 | } |
| 3049 | mnext = m_dup(prev_ack_pkt, M_DONTWAIT); |
| 3050 | if (mnext) { |
| 3051 | /* Keep adv window constant for duplicated ACK packets */ |
| 3052 | scaled_win = win_rtedge - prev_ack; |
| 3053 | if (scaled_win > (int32_t)(TCP_MAXWIN << tp->rcv_scale)) |
| 3054 | scaled_win = (int32_t)(TCP_MAXWIN << tp->rcv_scale); |
| 3055 | winsz = htons(scaled_win>>tp->rcv_scale); |
| 3056 | if (lrodebug == 5) { |
| 3057 | printf("%s: winsz = %d ack %x count %d\n" , |
| 3058 | __func__, scaled_win>>tp->rcv_scale, |
| 3059 | prev_ack, count); |
| 3060 | } |
| 3061 | bcopy(&winsz, mtod(prev_ack_pkt, caddr_t) + twin_offset, 2); |
| 3062 | HTONL(prev_ack); |
| 3063 | bcopy(&prev_ack, mtod(prev_ack_pkt, caddr_t) + tack_offset, 4); |
| 3064 | NTOHL(prev_ack); |
| 3065 | tail->m_nextpkt = mnext; |
| 3066 | tail = mnext; |
| 3067 | count -= segs_acked; |
| 3068 | tcpstat.tcps_sndacks++; |
| 3069 | so_tc_update_stats(m, so, m_get_service_class(m)); |
| 3070 | } else { |
| 3071 | if (lrodebug == 5) { |
| 3072 | printf("%s: failed to alloc mbuf.\n" , __func__); |
| 3073 | } |
| 3074 | break; |
| 3075 | } |
| 3076 | prev_ack_pkt = mnext; |
| 3077 | } |
| 3078 | tp->t_lropktlen = 0; |
| 3079 | return ack_chain; |
| 3080 | } |
| 3081 | |
| 3082 | static int |
tcp_recv_throttle(struct tcpcb *tp)
| 3084 | { |
| 3085 | uint32_t base_rtt, newsize; |
| 3086 | struct sockbuf *sbrcv = &tp->t_inpcb->inp_socket->so_rcv; |
| 3087 | |
| 3088 | if (tcp_use_rtt_recvbg == 1 && |
| 3089 | TSTMP_SUPPORTED(tp)) { |
| 3090 | /* |
| 3091 | * Timestamps are supported on this connection. Use |
| 3092 | * RTT to look for an increase in latency. |
| 3093 | */ |
| 3094 | |
| 3095 | /* |
| 3096 | * If the connection is already being throttled, leave it |
| 3097 | * in that state until rtt comes closer to base rtt |
| 3098 | */ |
| 3099 | if (tp->t_flagsext & TF_RECV_THROTTLE) |
| 3100 | return (1); |
| 3101 | |
| 3102 | base_rtt = get_base_rtt(tp); |
| 3103 | |
| 3104 | if (base_rtt != 0 && tp->t_rttcur != 0) { |
| 3105 | /* |
| 3106 | * if latency increased on a background flow, |
| 3107 | * return 1 to start throttling. |
| 3108 | */ |
| 3109 | if (tp->t_rttcur > (base_rtt + target_qdelay)) { |
| 3110 | tp->t_flagsext |= TF_RECV_THROTTLE; |
| 3111 | if (tp->t_recv_throttle_ts == 0) |
| 3112 | tp->t_recv_throttle_ts = tcp_now; |
| 3113 | /* |
| 3114 | * Reduce the recv socket buffer size to |
				 * minimize latency.
| 3116 | */ |
| 3117 | if (sbrcv->sb_idealsize > |
| 3118 | tcp_recv_throttle_minwin) { |
| 3119 | newsize = sbrcv->sb_idealsize >> 1; |
| 3120 | /* Set a minimum of 16 K */ |
| 3121 | newsize = |
| 3122 | max(newsize, |
| 3123 | tcp_recv_throttle_minwin); |
| 3124 | sbrcv->sb_idealsize = newsize; |
| 3125 | } |
| 3126 | return (1); |
| 3127 | } else { |
| 3128 | return (0); |
| 3129 | } |
| 3130 | } |
| 3131 | } |
| 3132 | |
| 3133 | /* |
| 3134 | * Timestamps are not supported or there is no good RTT |
| 3135 | * measurement. Use IPDV in this case. |
| 3136 | */ |
| 3137 | if (tp->acc_iaj > tcp_acc_iaj_react_limit) |
| 3138 | return (1); |
| 3139 | |
| 3140 | return (0); |
| 3141 | } |
| 3142 | |