Line data Source code
1 : /* $OpenBSD: tcp_timer.c,v 1.67 2018/06/11 07:40:26 bluhm Exp $ */
2 : /* $NetBSD: tcp_timer.c,v 1.14 1996/02/13 23:44:09 christos Exp $ */
3 :
4 : /*
5 : * Copyright (c) 1982, 1986, 1988, 1990, 1993
6 : * The Regents of the University of California. All rights reserved.
7 : *
8 : * Redistribution and use in source and binary forms, with or without
9 : * modification, are permitted provided that the following conditions
10 : * are met:
11 : * 1. Redistributions of source code must retain the above copyright
12 : * notice, this list of conditions and the following disclaimer.
13 : * 2. Redistributions in binary form must reproduce the above copyright
14 : * notice, this list of conditions and the following disclaimer in the
15 : * documentation and/or other materials provided with the distribution.
16 : * 3. Neither the name of the University nor the names of its contributors
17 : * may be used to endorse or promote products derived from this software
18 : * without specific prior written permission.
19 : *
20 : * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 : * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 : * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 : * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 : * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 : * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 : * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 : * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 : * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 : * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 : * SUCH DAMAGE.
31 : *
32 : * @(#)tcp_timer.c 8.1 (Berkeley) 6/10/93
33 : */
34 :
35 : #include <sys/param.h>
36 : #include <sys/systm.h>
37 : #include <sys/mbuf.h>
38 : #include <sys/socket.h>
39 : #include <sys/socketvar.h>
40 : #include <sys/protosw.h>
41 : #include <sys/kernel.h>
42 : #include <sys/pool.h>
43 :
44 : #include <net/route.h>
45 :
46 : #include <netinet/in.h>
47 : #include <netinet/ip.h>
48 : #include <netinet/in_pcb.h>
49 : #include <netinet/ip_var.h>
50 : #include <netinet/tcp.h>
51 : #include <netinet/tcp_fsm.h>
52 : #include <netinet/tcp_timer.h>
53 : #include <netinet/tcp_var.h>
54 : #include <netinet/tcp_debug.h>
55 : #include <netinet/ip_icmp.h>
56 : #include <netinet/tcp_seq.h>
57 :
/* Send keepalive probes on all connections, not only SO_KEEPALIVE ones. */
int tcp_always_keepalive;
/* Idle time before the first keepalive probe is sent. */
int tcp_keepidle;
/* Interval between successive keepalive probes. */
int tcp_keepintvl;
int tcp_maxpersistidle; /* max idle time in persist */
/* Max idle budget for keepalive/2MSL drop decisions; recomputed in
 * tcp_slowtimo() as TCPTV_KEEPCNT * tcp_keepintvl. */
int tcp_maxidle;

/*
 * Time to delay the ACK. This is initialized in tcp_init(), unless
 * its patched.
 */
int tcp_delack_msecs;

void tcp_timer_rexmt(void *);
void tcp_timer_persist(void *);
void tcp_timer_keep(void *);
void tcp_timer_2msl(void *);
void tcp_timer_reaper(void *);
void tcp_timer_delack(void *);

/*
 * Timer callout dispatch table; order must match the TCPT_* timer
 * indices used with tp->t_timer[].
 */
const tcp_timer_func_t tcp_timer_funcs[TCPT_NTIMERS] = {
	tcp_timer_rexmt,
	tcp_timer_persist,
	tcp_timer_keep,
	tcp_timer_2msl,
	tcp_timer_reaper,
	tcp_timer_delack,
};
85 :
86 : /*
87 : * Timer state initialization, called from tcp_init().
88 : */
89 : void
90 0 : tcp_timer_init(void)
91 : {
92 :
93 0 : if (tcp_keepidle == 0)
94 0 : tcp_keepidle = TCPTV_KEEP_IDLE;
95 :
96 0 : if (tcp_keepintvl == 0)
97 0 : tcp_keepintvl = TCPTV_KEEPINTVL;
98 :
99 0 : if (tcp_maxpersistidle == 0)
100 0 : tcp_maxpersistidle = TCPTV_KEEP_IDLE;
101 :
102 0 : if (tcp_delack_msecs == 0)
103 0 : tcp_delack_msecs = TCP_DELACK_MSECS;
104 0 : }
105 :
/*
 * Callout to process delayed ACKs for a TCPCB.
 *
 * Forces an immediate ACK (TF_ACKNOW) and runs tcp_output() under the
 * net lock.  With SO_DEBUG set on the socket, the pre-timer state is
 * recorded and traced afterwards.
 */
void
tcp_timer_delack(void *arg)
{
	struct tcpcb *otp = NULL, *tp = arg;
	short ostate;

	/*
	 * If tcp_output() wasn't able to transmit the ACK
	 * for whatever reason, it will restart the delayed
	 * ACK callout.
	 */
	NET_LOCK();
	/* Ignore canceled timeouts or timeouts that have been rescheduled. */
	if (!ISSET((tp)->t_flags, TF_TMR_DELACK) ||
	    timeout_pending(&tp->t_timer[TCPT_DELACK]))
		goto out;
	CLR((tp)->t_flags, TF_TMR_DELACK);

	if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}
	/* Demand an ACK on the next output pass. */
	tp->t_flags |= TF_ACKNOW;
	(void) tcp_output(tp);
	if (otp)
		tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_DELACK, 0);
 out:
	NET_UNLOCK();
}
138 :
139 : /*
140 : * Tcp protocol timeout routine called every 500 ms.
141 : * Updates the timers in all active tcb's and
142 : * causes finite state machine actions if timers expire.
143 : */
144 : void
145 0 : tcp_slowtimo(void)
146 : {
147 0 : NET_LOCK();
148 :
149 0 : tcp_maxidle = TCPTV_KEEPCNT * tcp_keepintvl;
150 0 : tcp_iss += TCP_ISSINCR2/PR_SLOWHZ; /* increment iss */
151 0 : tcp_now++; /* for timestamps */
152 :
153 0 : NET_UNLOCK();
154 0 : }
155 :
156 : /*
157 : * Cancel all timers for TCP tp.
158 : */
159 : void
160 0 : tcp_canceltimers(struct tcpcb *tp)
161 : {
162 : int i;
163 :
164 0 : for (i = 0; i < TCPT_NTIMERS; i++)
165 0 : TCP_TIMER_DISARM(tp, i);
166 0 : }
167 :
/*
 * Exponential retransmit backoff multipliers, indexed by t_rxtshift
 * (0..TCP_MAXRXTSHIFT); growth is capped at 64x.
 */
int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

int tcp_totbackoff = 511; /* sum of tcp_backoff[] */
172 :
173 : /*
174 : * TCP timer processing.
175 : */
176 :
177 : void tcp_timer_freesack(struct tcpcb *);
178 :
179 : void
180 0 : tcp_timer_freesack(struct tcpcb *tp)
181 : {
182 : struct sackhole *p, *q;
183 : /*
184 : * Free SACK holes for 2MSL and REXMT timers.
185 : */
186 0 : q = tp->snd_holes;
187 0 : while (q != NULL) {
188 : p = q;
189 0 : q = q->next;
190 0 : pool_put(&sackhl_pool, p);
191 : }
192 0 : tp->snd_holes = 0;
193 0 : }
194 :
/*
 * Retransmission timer has fired.  Either resolve a pending path-MTU
 * probe, or back off the retransmit timer, shrink the congestion
 * window, and retransmit from snd_una.  Drops the connection once
 * TCP_MAXRXTSHIFT consecutive timeouts have occurred.
 */
void
tcp_timer_rexmt(void *arg)
{
	struct tcpcb *otp = NULL, *tp = arg;
	uint32_t rto;
	short ostate;

	NET_LOCK();
	/* Ignore canceled timeouts or timeouts that have been rescheduled. */
	if (!ISSET((tp)->t_flags, TF_TMR_REXMT) ||
	    timeout_pending(&tp->t_timer[TCPT_REXMT]))
		goto out;
	CLR((tp)->t_flags, TF_TMR_REXMT);

	/*
	 * A PMTU probe covering the unacked range timed out: treat it
	 * as a "packet too big" indication rather than loss.
	 */
	if ((tp->t_flags & TF_PMTUD_PEND) && tp->t_inpcb &&
	    SEQ_GEQ(tp->t_pmtud_th_seq, tp->snd_una) &&
	    SEQ_LT(tp->t_pmtud_th_seq, (int)(tp->snd_una + tp->t_maxseg))) {
		struct sockaddr_in sin;
		struct icmp icmp;

		tp->t_flags &= ~TF_PMTUD_PEND;

		/* XXX create fake icmp message with relevant entries */
		icmp.icmp_nextmtu = tp->t_pmtud_nextmtu;
		icmp.icmp_ip.ip_len = tp->t_pmtud_ip_len;
		icmp.icmp_ip.ip_hl = tp->t_pmtud_ip_hl;
		icmp.icmp_ip.ip_dst = tp->t_inpcb->inp_faddr;
		icmp_mtudisc(&icmp, tp->t_inpcb->inp_rtableid);

		/*
		 * Notify all connections to the same peer about
		 * new mss and trigger retransmit.
		 */
		bzero(&sin, sizeof(sin));
		sin.sin_len = sizeof(sin);
		sin.sin_family = AF_INET;
		sin.sin_addr = tp->t_inpcb->inp_faddr;
		in_pcbnotifyall(&tcbtable, sintosa(&sin),
		    tp->t_inpcb->inp_rtableid, EMSGSIZE, tcp_mtudisc);
		goto out;
	}

	tcp_timer_freesack(tp);
	/* Too many consecutive timeouts: give up and drop the connection. */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		tcpstat_inc(tcps_timeoutdrop);
		tp = tcp_drop(tp, tp->t_softerror ?
		    tp->t_softerror : ETIMEDOUT);
		goto out;
	}
	if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}
	tcpstat_inc(tcps_rexmttimeo);
	/* Back off the RTO exponentially, clamped to the valid range. */
	rto = TCP_REXMTVAL(tp);
	if (rto < tp->t_rttmin)
		rto = tp->t_rttmin;
	TCPT_RANGESET(tp->t_rxtcur,
	    rto * tcp_backoff[tp->t_rxtshift],
	    tp->t_rttmin, TCPTV_REXMTMAX);
	TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

	/*
	 * If we are losing and we are trying path MTU discovery,
	 * try turning it off. This will avoid black holes in
	 * the network which suppress or fail to send "packet
	 * too big" ICMP messages. We should ideally do
	 * lots more sophisticated searching to find the right
	 * value here...
	 */
	if (ip_mtudisc && tp->t_inpcb &&
	    TCPS_HAVEESTABLISHED(tp->t_state) &&
	    tp->t_rxtshift > TCP_MAXRXTSHIFT / 6) {
		struct inpcb *inp = tp->t_inpcb;
		struct rtentry *rt = NULL;

		/* No data to send means path mtu is not a problem */
		if (!inp->inp_socket->so_snd.sb_cc)
			goto leave;

		rt = in_pcbrtentry(inp);
		/* Check if path MTU discovery is disabled already */
		if (rt && (rt->rt_flags & RTF_HOST) &&
		    (rt->rt_locks & RTV_MTU))
			goto leave;

		rt = NULL;
		switch(tp->pf) {
#ifdef INET6
		case PF_INET6:
			/*
			 * We can not turn off path MTU for IPv6.
			 * Do nothing for now, maybe lower to
			 * minimum MTU.
			 */
			break;
#endif
		case PF_INET:
			rt = icmp_mtudisc_clone(inp->inp_faddr,
			    inp->inp_rtableid);
			break;
		}
		if (rt != NULL) {
			/* Disable path MTU discovery */
			if ((rt->rt_locks & RTV_MTU) == 0) {
				rt->rt_locks |= RTV_MTU;
				in_rtchange(inp, 0);
			}

			rtfree(rt);
		}
 leave:
		;
	}

	/*
	 * If losing, let the lower level know and try for
	 * a better route. Also, if we backed off this far,
	 * our srtt estimate is probably bogus. Clobber it
	 * so we'll take the next rtt measurement as our srtt;
	 * move the current srtt into rttvar to keep the current
	 * retransmit times until then.
	 */
	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
		in_losing(tp->t_inpcb);
		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
		tp->t_srtt = 0;
	}
	/* Retransmit from the oldest unacknowledged byte. */
	tp->snd_nxt = tp->snd_una;
	/*
	 * Note: We overload snd_last to function also as the
	 * snd_last variable described in RFC 2582
	 */
	tp->snd_last = tp->snd_max;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;
#ifdef TCP_ECN
	/*
	 * if ECN is enabled, there might be a broken firewall which
	 * blocks ecn packets. fall back to non-ecn.
	 */
	if ((tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED)
	    && tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
		tp->t_flags |= TF_DISABLE_ECN;
#endif
	/*
	 * Close the congestion window down to one segment
	 * (we'll open it by one segment for each ack we get).
	 * Since we probably have a window's worth of unacked
	 * data accumulated, this "slow start" keeps us from
	 * dumping all that data as back-to-back packets (which
	 * might overwhelm an intermediate gateway).
	 *
	 * There are two phases to the opening: Initially we
	 * open by one mss on each ack. This makes the window
	 * size increase exponentially with time. If the
	 * window is larger than the path can handle, this
	 * exponential growth results in dropped packet(s)
	 * almost immediately. To get more time between
	 * drops but still "push" the network to take advantage
	 * of improving conditions, we switch from exponential
	 * to linear window opening at some threshold size.
	 * For a threshold, we use half the current window
	 * size, truncated to a multiple of the mss.
	 *
	 * (the minimum cwnd that will give us exponential
	 * growth is 2 mss. We don't allow the threshold
	 * to go below this.)
	 */
	{
	u_long win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
	if (win < 2)
		win = 2;
	tp->snd_cwnd = tp->t_maxseg;
	tp->snd_ssthresh = win * tp->t_maxseg;
	tp->t_dupacks = 0;
#ifdef TCP_ECN
	tp->snd_last = tp->snd_max;
	tp->t_flags |= TF_SEND_CWR;
#endif
#if 1 /* TCP_ECN */
	tcpstat_inc(tcps_cwr_timeout);
#endif
	}
	(void) tcp_output(tp);
	if (otp)
		tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_REXMT, 0);
 out:
	NET_UNLOCK();
}
388 :
/*
 * Persist timer has fired: force transmission of a window probe so a
 * zero-window peer must respond, or drop the connection after a full
 * backoff with no responses.  Skipped while the retransmit timer is
 * armed, since REXMT handling takes precedence.
 */
void
tcp_timer_persist(void *arg)
{
	struct tcpcb *otp = NULL, *tp = arg;
	uint32_t rto;
	short ostate;

	NET_LOCK();
	/* Ignore canceled timeouts or timeouts that have been rescheduled. */
	if (!ISSET((tp)->t_flags, TF_TMR_PERSIST) ||
	    timeout_pending(&tp->t_timer[TCPT_PERSIST]))
		goto out;
	CLR((tp)->t_flags, TF_TMR_PERSIST);

	/* Retransmit timer is armed; let it drive recovery instead. */
	if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
		goto out;

	if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}
	tcpstat_inc(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed. After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	rto = TCP_REXMTVAL(tp);
	if (rto < tp->t_rttmin)
		rto = tp->t_rttmin;
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    ((tcp_now - tp->t_rcvtime) >= tcp_maxpersistidle ||
	    (tcp_now - tp->t_rcvtime) >= rto * tcp_totbackoff)) {
		tcpstat_inc(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	/* Rearm the persist timer, then force out one probe segment. */
	tcp_setpersist(tp);
	tp->t_force = 1;
	(void) tcp_output(tp);
	tp->t_force = 0;
	if (otp)
		tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_PERSIST, 0);
 out:
	NET_UNLOCK();
}
437 :
/*
 * Keepalive timer has fired.  For established connections with
 * keepalives enabled (per-socket SO_KEEPALIVE or the global
 * tcp_always_keepalive), send a probe or drop the connection once the
 * idle time exceeds tcp_keepidle + tcp_maxidle.  Connections that
 * never reached ESTABLISHED are dropped immediately.
 */
void
tcp_timer_keep(void *arg)
{
	struct tcpcb *otp = NULL, *tp = arg;
	short ostate;

	NET_LOCK();
	/* Ignore canceled timeouts or timeouts that have been rescheduled. */
	if (!ISSET((tp)->t_flags, TF_TMR_KEEP) ||
	    timeout_pending(&tp->t_timer[TCPT_KEEP]))
		goto out;
	CLR((tp)->t_flags, TF_TMR_KEEP);

	if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}
	tcpstat_inc(tcps_keeptimeo);
	/* Connection-establishment timer: never completed the handshake. */
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		goto dropit;
	if ((tcp_always_keepalive ||
	    tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		/* Probes went unanswered past the idle budget: drop. */
		if ((tcp_maxidle > 0) &&
		    ((tcp_now - tp->t_rcvtime) >= tcp_keepidle + tcp_maxidle))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		tcpstat_inc(tcps_keepprobe);
		tcp_respond(tp, mtod(tp->t_template, caddr_t),
		    NULL, tp->rcv_nxt, tp->snd_una - 1, 0, 0);
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepintvl);
	} else
		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
	if (otp)
		tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_KEEP, 0);
 out:
	NET_UNLOCK();
	return;

 dropit:
	tcpstat_inc(tcps_keepdrops);
	tp = tcp_drop(tp, ETIMEDOUT);
	NET_UNLOCK();
}
493 :
/*
 * 2MSL timer has fired.  In TIME_WAIT, or once the connection has been
 * idle past tcp_maxidle, close the connection; otherwise rearm the
 * timer (used as the FIN_WAIT_2 shortening timer).  Also releases any
 * remaining SACK holes.
 */
void
tcp_timer_2msl(void *arg)
{
	struct tcpcb *otp = NULL, *tp = arg;
	short ostate;

	NET_LOCK();
	/* Ignore canceled timeouts or timeouts that have been rescheduled. */
	if (!ISSET((tp)->t_flags, TF_TMR_2MSL) ||
	    timeout_pending(&tp->t_timer[TCPT_2MSL]))
		goto out;
	CLR((tp)->t_flags, TF_TMR_2MSL);

	if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) {
		otp = tp;
		ostate = tp->t_state;
	}
	tcp_timer_freesack(tp);

	/* Not TIME_WAIT and still within the idle budget: keep waiting. */
	if (tp->t_state != TCPS_TIME_WAIT &&
	    ((tcp_maxidle == 0) || ((tcp_now - tp->t_rcvtime) <= tcp_maxidle)))
		TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_keepintvl);
	else
		tp = tcp_close(tp);
	if (otp)
		tcp_trace(TA_TIMER, ostate, tp, otp, NULL, TCPT_2MSL, 0);
 out:
	NET_UNLOCK();
}
523 :
524 : void
525 0 : tcp_timer_reaper(void *arg)
526 : {
527 0 : struct tcpcb *tp = arg;
528 :
529 : /*
530 : * This timer is necessary to delay the pool_put() after all timers
531 : * have finished, even if they were sleeping to grab the net lock.
532 : * Putting the pool_put() in a timer is sufficinet as all timers run
533 : * from the same timeout thread. Note that neither softnet thread nor
534 : * user process may access the tcpcb after arming the reaper timer.
535 : * Freeing may run in parallel as it does not grab the net lock.
536 : */
537 0 : pool_put(&tcpcb_pool, tp);
538 0 : tcpstat_inc(tcps_closed);
539 0 : }
|