Line data Source code
1 : /* $OpenBSD: uipc_socket.c,v 1.227 2018/08/21 12:34:11 bluhm Exp $ */
2 : /* $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $ */
3 :
4 : /*
5 : * Copyright (c) 1982, 1986, 1988, 1990, 1993
6 : * The Regents of the University of California. All rights reserved.
7 : *
8 : * Redistribution and use in source and binary forms, with or without
9 : * modification, are permitted provided that the following conditions
10 : * are met:
11 : * 1. Redistributions of source code must retain the above copyright
12 : * notice, this list of conditions and the following disclaimer.
13 : * 2. Redistributions in binary form must reproduce the above copyright
14 : * notice, this list of conditions and the following disclaimer in the
15 : * documentation and/or other materials provided with the distribution.
16 : * 3. Neither the name of the University nor the names of its contributors
17 : * may be used to endorse or promote products derived from this software
18 : * without specific prior written permission.
19 : *
20 : * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 : * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 : * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 : * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 : * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 : * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 : * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 : * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 : * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 : * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 : * SUCH DAMAGE.
31 : *
32 : * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
33 : */
34 :
35 : #include <sys/param.h>
36 : #include <sys/systm.h>
37 : #include <sys/proc.h>
38 : #include <sys/file.h>
39 : #include <sys/filedesc.h>
40 : #include <sys/malloc.h>
41 : #include <sys/mbuf.h>
42 : #include <sys/domain.h>
43 : #include <sys/kernel.h>
44 : #include <sys/event.h>
45 : #include <sys/protosw.h>
46 : #include <sys/socket.h>
47 : #include <sys/unpcb.h>
48 : #include <sys/socketvar.h>
49 : #include <sys/signalvar.h>
50 : #include <net/if.h>
51 : #include <sys/pool.h>
52 : #include <sys/atomic.h>
53 : #include <sys/rwlock.h>
54 :
55 : #ifdef DDB
56 : #include <machine/db_machdep.h>
57 : #endif
58 :
      : /* Forward declarations for helpers defined later in this file. */
59 : void sbsync(struct sockbuf *, struct mbuf *);
60 :
      : /* Socket splicing (SOCKET_SPLICE) support routines. */
61 : int sosplice(struct socket *, int, off_t, struct timeval *);
62 : void sounsplice(struct socket *, struct socket *, int);
63 : void soidle(void *);
64 : void sotask(void *);
65 : void soreaper(void *);
66 : void soput(void *);
67 : int somove(struct socket *, int);
68 :
      : /* kqueue filter hooks for socket read, write and listen events. */
69 : void filt_sordetach(struct knote *kn);
70 : int filt_soread(struct knote *kn, long hint);
71 : void filt_sowdetach(struct knote *kn);
72 : int filt_sowrite(struct knote *kn, long hint);
73 : int filt_solisten(struct knote *kn, long hint);
74 :
75 : struct filterops solisten_filtops =
76 : { 1, NULL, filt_sordetach, filt_solisten };
77 : struct filterops soread_filtops =
78 : { 1, NULL, filt_sordetach, filt_soread };
79 : struct filterops sowrite_filtops =
80 : { 1, NULL, filt_sowdetach, filt_sowrite };
81 :
82 :
      : /* Compile-time fallback lower bound for the listen(2) backlog. */
83 : #ifndef SOMINCONN
84 : #define SOMINCONN 80
85 : #endif /* SOMINCONN */
86 :
      : /* Run-time tunables clamping the listen(2) backlog in solisten(). */
87 : int somaxconn = SOMAXCONN;
88 : int sominconn = SOMINCONN;
89 :
      : /* Allocation pools for struct socket and struct sosplice. */
90 : struct pool socket_pool;
91 : #ifdef SOCKET_SPLICE
92 : struct pool sosplice_pool;
93 : struct taskq *sosplice_taskq;
94 : struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
95 : #endif
96 :
      : /*
      :  * Initialize the socket layer's allocation pools.
      :  * NOTE(review): presumably called once during kernel startup —
      :  * confirm against the caller, which is outside this file.
      :  */
97 : void
98 0 : soinit(void)
99 : {
100 0 : pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
101 : "sockpl", NULL);
102 : #ifdef SOCKET_SPLICE
103 0 : pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
104 : "sosppl", NULL);
105 : #endif
106 0 : }
107 :
108 : /*
109 : * Socket operation routines.
110 : * These routines are called by the routines in
111 : * sys_socket.c or from a system process, and
112 : * implement the semantics of socket operations by
113 : * switching out to the protocol specific routines.
114 : */
      : /*
      :  * Create a new socket of the given domain, type and protocol and
      :  * return it via *aso.  Returns 0 on success or an errno value
      :  * (EPROTONOSUPPORT, EPROTOTYPE, or whatever pr_attach reports).
      :  */
115 : int
116 0 : socreate(int dom, struct socket **aso, int type, int proto)
117 : {
118 0 : struct proc *p = curproc; /* XXX */
119 : const struct protosw *prp;
120 : struct socket *so;
121 : int error, s;
122 :
      : /* Look up the protocol switch entry, by protocol if given, else by type. */
123 0 : if (proto)
124 0 : prp = pffindproto(dom, proto, type);
125 : else
126 0 : prp = pffindtype(dom, type);
127 0 : if (prp == NULL || prp->pr_attach == NULL)
128 0 : return (EPROTONOSUPPORT);
129 0 : if (prp->pr_type != type)
130 0 : return (EPROTOTYPE);
      : /* Allocate a zeroed socket and record the creator's credentials. */
131 0 : so = pool_get(&socket_pool, PR_WAITOK | PR_ZERO);
132 0 : TAILQ_INIT(&so->so_q0);
133 0 : TAILQ_INIT(&so->so_q);
134 0 : so->so_type = type;
135 0 : if (suser(p) == 0)
136 0 : so->so_state = SS_PRIV;
137 0 : so->so_ruid = p->p_ucred->cr_ruid;
138 0 : so->so_euid = p->p_ucred->cr_uid;
139 0 : so->so_rgid = p->p_ucred->cr_rgid;
140 0 : so->so_egid = p->p_ucred->cr_gid;
141 0 : so->so_cpid = p->p_p->ps_pid;
142 0 : so->so_proto = prp;
143 :
      : /* Attach the protocol under the socket lock. */
144 0 : s = solock(so);
145 0 : error = (*prp->pr_attach)(so, proto);
146 0 : if (error) {
147 0 : so->so_state |= SS_NOFDREF;
148 : /* sofree() calls sounlock(). */
149 0 : sofree(so, s);
150 0 : return (error);
151 : }
152 0 : sounlock(so, s);
153 0 : *aso = so;
154 0 : return (0);
155 0 : }
156 :
      : /*
      :  * Bind a name to a socket by forwarding PRU_BIND to the protocol.
      :  * Caller must hold the socket lock (asserted below).
      :  */
157 : int
158 0 : sobind(struct socket *so, struct mbuf *nam, struct proc *p)
159 : {
160 : int error;
161 :
162 0 : soassertlocked(so);
163 :
164 0 : error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, p);
165 0 : return (error);
166 : }
167 :
      : /*
      :  * Prepare a socket to accept connections (listen(2) backend).
      :  * The requested backlog is clamped to [sominconn, somaxconn].
      :  * Fails with EOPNOTSUPP on connected/connecting or spliced sockets.
      :  */
168 : int
169 0 : solisten(struct socket *so, int backlog)
170 : {
171 : int s, error;
172 :
173 0 : if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
174 0 : return (EOPNOTSUPP);
175 : #ifdef SOCKET_SPLICE
176 0 : if (isspliced(so) || issplicedback(so))
177 0 : return (EOPNOTSUPP);
178 : #endif /* SOCKET_SPLICE */
179 0 : s = solock(so);
180 0 : error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL,
181 0 : curproc);
182 0 : if (error) {
183 0 : sounlock(so, s);
184 0 : return (error);
185 : }
      : /* Only mark SO_ACCEPTCONN if no connection is already queued. */
186 0 : if (TAILQ_FIRST(&so->so_q) == NULL)
187 0 : so->so_options |= SO_ACCEPTCONN;
188 0 : if (backlog < 0 || backlog > somaxconn)
189 0 : backlog = somaxconn;
190 0 : if (backlog < sominconn)
191 0 : backlog = sominconn;
192 0 : so->so_qlimit = backlog;
193 0 : sounlock(so, s);
194 0 : return (0);
195 0 : }
196 :
      : /*
      :  * Release a socket that has no remaining references.
      :  * Must be entered with the socket lock held; ALWAYS releases the
      :  * lock before returning (callers rely on this — see socreate()
      :  * and soclose()).  Bails out early if the socket still has a pcb,
      :  * still has a file reference, or sits on a listener's queue.
      :  */
197 : void
198 0 : sofree(struct socket *so, int s)
199 : {
200 0 : soassertlocked(so);
201 :
202 0 : if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
203 0 : sounlock(so, s);
204 0 : return;
205 : }
206 0 : if (so->so_head) {
207 : /*
208 : * We must not decommission a socket that's on the accept(2)
209 : * queue. If we do, then accept(2) may hang after select(2)
210 : * indicated that the listening socket was ready.
211 : */
212 0 : if (!soqremque(so, 0)) {
213 0 : sounlock(so, s);
214 0 : return;
215 : }
216 : }
217 : #ifdef SOCKET_SPLICE
      : /* Tear down any splice in either direction before freeing. */
218 0 : if (so->so_sp) {
219 0 : if (issplicedback(so))
220 0 : sounsplice(so->so_sp->ssp_soback, so,
221 0 : so->so_sp->ssp_soback != so);
222 0 : if (isspliced(so))
223 0 : sounsplice(so, so->so_sp->ssp_socket, 0);
224 : }
225 : #endif /* SOCKET_SPLICE */
226 0 : sbrelease(so, &so->so_snd);
227 0 : sorflush(so);
228 0 : sounlock(so, s);
229 : #ifdef SOCKET_SPLICE
      : /* Defer the actual free to soreaper() via the (reused) idle timeout. */
230 0 : if (so->so_sp) {
231 : /* Reuse splice idle, sounsplice() has been called before. */
232 0 : timeout_set_proc(&so->so_sp->ssp_idleto, soreaper, so);
233 0 : timeout_add(&so->so_sp->ssp_idleto, 0);
234 0 : } else
235 : #endif /* SOCKET_SPLICE */
236 : {
237 0 : pool_put(&socket_pool, so);
238 : }
239 0 : }
240 :
241 : /*
242 : * Close a socket on last file table reference removal.
243 : * Initiate disconnect if connected.
244 : * Free socket when disconnect complete.
245 : */
      : /*
      :  * Close a socket: abort queued (incomplete and completed) connections
      :  * on a listener, disconnect if connected — honouring SO_LINGER by
      :  * sleeping until disconnect completes or the linger timer fires —
      :  * then detach the protocol and free via sofree(), which drops the
      :  * socket lock taken here.
      :  */
246 : int
247 0 : soclose(struct socket *so, int flags)
248 : {
249 : struct socket *so2;
250 : int s, error = 0;
251 :
252 0 : s = solock(so);
      : /* A listener drags down everything still parked on its queues. */
253 0 : if (so->so_options & SO_ACCEPTCONN) {
254 0 : while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
255 0 : (void) soqremque(so2, 0);
256 0 : (void) soabort(so2);
257 : }
258 0 : while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
259 0 : (void) soqremque(so2, 1);
260 0 : (void) soabort(so2);
261 : }
262 : }
263 0 : if (so->so_pcb == NULL)
264 : goto discard;
265 0 : if (so->so_state & SS_ISCONNECTED) {
266 0 : if ((so->so_state & SS_ISDISCONNECTING) == 0) {
267 0 : error = sodisconnect(so);
268 0 : if (error)
269 : goto drop;
270 : }
      : /* SO_LINGER: block (interruptibly) until disconnected or timeout. */
271 0 : if (so->so_options & SO_LINGER) {
272 0 : if ((so->so_state & SS_ISDISCONNECTING) &&
273 0 : (flags & MSG_DONTWAIT))
274 : goto drop;
275 0 : while (so->so_state & SS_ISCONNECTED) {
276 0 : error = sosleep(so, &so->so_timeo,
277 : PSOCK | PCATCH, "netcls",
278 0 : so->so_linger * hz);
279 0 : if (error)
280 : break;
281 : }
282 : }
283 : }
284 : drop:
      : /* Detach the protocol; keep the first error seen. */
285 0 : if (so->so_pcb) {
286 : int error2;
287 0 : KASSERT(so->so_proto->pr_detach);
288 0 : error2 = (*so->so_proto->pr_detach)(so);
289 0 : if (error == 0)
290 0 : error = error2;
291 0 : }
292 : discard:
293 0 : if (so->so_state & SS_NOFDREF)
294 0 : panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
295 0 : so->so_state |= SS_NOFDREF;
296 : /* sofree() calls sounlock(). */
297 0 : sofree(so, s);
298 0 : return (error);
299 : }
300 :
      : /*
      :  * Forcibly tear down a socket by forwarding PRU_ABORT to the
      :  * protocol.  Caller must hold the socket lock.
      :  */
301 : int
302 0 : soabort(struct socket *so)
303 : {
304 0 : soassertlocked(so);
305 :
306 0 : return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, NULL, NULL,
307 0 : curproc);
308 : }
309 :
      : /*
      :  * Accept a connection: clear SS_NOFDREF (the socket gains a file
      :  * reference) and ask the protocol for the peer's name via
      :  * PRU_ACCEPT.  Returns ECONNABORTED if the connection was already
      :  * disconnected and the protocol aborts on accept-after-disconnect
      :  * (PR_ABRTACPTDIS).  Caller must hold the socket lock.
      :  */
310 : int
311 0 : soaccept(struct socket *so, struct mbuf *nam)
312 : {
313 : int error = 0;
314 :
315 0 : soassertlocked(so);
316 :
317 0 : if ((so->so_state & SS_NOFDREF) == 0)
318 0 : panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
319 0 : so->so_state &= ~SS_NOFDREF;
320 0 : if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
321 0 : (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
322 0 : error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, NULL,
323 0 : nam, NULL, curproc);
324 : else
325 : error = ECONNABORTED;
326 0 : return (error);
327 : }
328 :
      : /*
      :  * Initiate a connection to the address in 'nam' via PRU_CONNECT.
      :  * Caller must hold the socket lock.  EOPNOTSUPP on listeners;
      :  * EISCONN if already connected on a connection-oriented protocol
      :  * (or if the implicit disconnect of a datagram socket fails).
      :  */
329 : int
330 0 : soconnect(struct socket *so, struct mbuf *nam)
331 : {
332 : int error;
333 :
334 0 : soassertlocked(so);
335 :
336 0 : if (so->so_options & SO_ACCEPTCONN)
337 0 : return (EOPNOTSUPP);
338 : /*
339 : * If protocol is connection-based, can only connect once.
340 : * Otherwise, if connected, try to disconnect first.
341 : * This allows user to disconnect by connecting to, e.g.,
342 : * a null address.
343 : */
344 0 : if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
345 0 : ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
346 0 : (error = sodisconnect(so))))
347 0 : error = EISCONN;
348 : else
349 0 : error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
350 0 : NULL, nam, NULL, curproc);
351 0 : return (error);
352 0 : }
353 :
      : /*
      :  * Connect two sockets to each other (socketpair(2) backend) by
      :  * forwarding PRU_CONNECT2 on so1 with so2 smuggled through the
      :  * mbuf argument.  Takes and releases so1's lock here.
      :  */
354 : int
355 0 : soconnect2(struct socket *so1, struct socket *so2)
356 : {
357 : int s, error;
358 :
359 0 : s = solock(so1);
360 0 : error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL,
361 0 : (struct mbuf *)so2, NULL, curproc);
362 0 : sounlock(so1, s);
363 0 : return (error);
364 : }
365 :
      : /*
      :  * Start disconnecting a connected socket via PRU_DISCONNECT.
      :  * Caller must hold the socket lock.  ENOTCONN if not connected,
      :  * EALREADY if a disconnect is already in progress.
      :  */
366 : int
367 0 : sodisconnect(struct socket *so)
368 : {
369 : int error;
370 :
371 0 : soassertlocked(so);
372 :
373 0 : if ((so->so_state & SS_ISCONNECTED) == 0)
374 0 : return (ENOTCONN);
375 0 : if (so->so_state & SS_ISDISCONNECTING)
376 0 : return (EALREADY);
377 0 : error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, NULL, NULL,
378 0 : NULL, curproc);
379 0 : return (error);
380 0 : }
381 :
382 : int m_getuio(struct mbuf **, int, long, struct uio *);
383 :
      : /* Map MSG_DONTWAIT to the corresponding sblock() wait flag. */
384 : #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
385 : /*
386 : * Send on a socket.
387 : * If send must go all at once and message is larger than
388 : * send buffering, then hard error.
389 : * Lock against other senders.
390 : * If must go all at once and not enough room now, then
391 : * inform user that this would block and do nothing.
392 : * Otherwise, if nonblocking, send as much as possible.
393 : * The data to be sent is described by "uio" if nonzero,
394 : * otherwise by the mbuf chain "top" (which must be null
395 : * if uio is not). Data provided in mbuf chain must be small
396 : * enough to send all at once.
397 : *
398 : * Returns nonzero on error, timeout or signal; callers
399 : * must check for short counts if EINTR/ERESTART are returned.
400 : * Data and control buffers are freed on return.
401 : */
      : /*
      :  * See the block comment above for the full contract.  Locking note:
      :  * the socket lock is held across the state checks but dropped
      :  * around m_getuio() (which sleeps copying user data); the
      :  * send-buffer sblock() keeps other senders out meanwhile.
      :  */
402 : int
403 0 : sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
404 : struct mbuf *control, int flags)
405 : {
406 : long space, clen = 0;
407 : size_t resid;
408 : int error, s;
409 0 : int atomic = sosendallatonce(so) || top;
410 :
411 0 : if (uio)
412 0 : resid = uio->uio_resid;
413 : else
414 0 : resid = top->m_pkthdr.len;
415 : /* MSG_EOR on a SOCK_STREAM socket is invalid. */
416 0 : if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
417 0 : m_freem(top);
418 0 : m_freem(control);
419 0 : return (EINVAL);
420 : }
421 0 : if (uio && uio->uio_procp)
422 0 : uio->uio_procp->p_ru.ru_msgsnd++;
423 0 : if (control) {
424 : /*
425 : * In theory clen should be unsigned (since control->m_len is).
426 : * However, space must be signed, as it might be less than 0
427 : * if we over-committed, and we must use a signed comparison
428 : * of space and clen.
429 : */
430 0 : clen = control->m_len;
431 : /* reserve extra space for AF_UNIX's internalize */
432 0 : if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
433 0 : clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
434 0 : mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
435 0 : clen = CMSG_SPACE(
436 : (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
437 : (sizeof(struct fdpass) / sizeof(int)));
438 : }
439 :
440 : #define snderr(errno) { error = errno; goto release; }
441 :
442 0 : s = solock(so);
443 : restart:
444 0 : if ((error = sblock(so, &so->so_snd, SBLOCKWAIT(flags))) != 0)
445 : goto out;
446 0 : so->so_state |= SS_ISSENDING;
447 0 : do {
      : /* Re-validate socket state on every pass — we may have slept. */
448 0 : if (so->so_state & SS_CANTSENDMORE)
449 0 : snderr(EPIPE);
450 0 : if (so->so_error) {
451 : error = so->so_error;
452 0 : so->so_error = 0;
453 0 : snderr(error);
454 : }
455 0 : if ((so->so_state & SS_ISCONNECTED) == 0) {
456 0 : if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
457 0 : if (!(resid == 0 && clen != 0))
458 0 : snderr(ENOTCONN);
459 0 : } else if (addr == 0)
460 0 : snderr(EDESTADDRREQ);
461 : }
462 0 : space = sbspace(so, &so->so_snd);
463 0 : if (flags & MSG_OOB)
464 0 : space += 1024;
465 0 : if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
466 0 : if (atomic && resid > so->so_snd.sb_hiwat)
467 0 : snderr(EMSGSIZE);
468 : } else {
469 0 : if (clen > so->so_snd.sb_hiwat ||
470 0 : (atomic && resid > so->so_snd.sb_hiwat - clen))
471 0 : snderr(EMSGSIZE);
472 : }
      : /* Not enough room: either fail (MSG_DONTWAIT) or wait and restart. */
473 0 : if (space < clen ||
474 0 : (space - clen < resid &&
475 0 : (atomic || space < so->so_snd.sb_lowat))) {
476 0 : if (flags & MSG_DONTWAIT)
477 0 : snderr(EWOULDBLOCK);
478 0 : sbunlock(so, &so->so_snd);
479 0 : error = sbwait(so, &so->so_snd);
480 0 : so->so_state &= ~SS_ISSENDING;
481 0 : if (error)
482 : goto out;
483 0 : goto restart;
484 : }
485 : space -= clen;
486 0 : do {
487 0 : if (uio == NULL) {
488 : /*
489 : * Data is prepackaged in "top".
490 : */
491 : resid = 0;
492 0 : if (flags & MSG_EOR)
493 0 : top->m_flags |= M_EOR;
494 : } else {
      : /* Drop the socket lock while copying in user data. */
495 0 : sounlock(so, s);
496 0 : error = m_getuio(&top, atomic, space, uio);
497 0 : s = solock(so);
498 0 : if (error)
499 : goto release;
500 0 : space -= top->m_pkthdr.len;
501 0 : resid = uio->uio_resid;
502 0 : if (flags & MSG_EOR)
503 0 : top->m_flags |= M_EOR;
504 : }
505 0 : if (resid == 0)
506 0 : so->so_state &= ~SS_ISSENDING;
507 0 : if (top && so->so_options & SO_ZEROIZE)
508 0 : top->m_flags |= M_ZEROIZE;
      : /* Protocol consumes top/control even on error; don't free them again. */
509 0 : error = (*so->so_proto->pr_usrreq)(so,
510 0 : (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
511 0 : top, addr, control, curproc);
512 : clen = 0;
513 : control = NULL;
514 0 : top = NULL;
515 0 : if (error)
516 : goto release;
517 0 : } while (resid && space > 0);
518 0 : } while (resid);
519 :
520 : release:
521 0 : so->so_state &= ~SS_ISSENDING;
522 0 : sbunlock(so, &so->so_snd);
523 : out:
524 0 : sounlock(so, s);
525 0 : m_freem(top);
526 0 : m_freem(control);
527 0 : return (error);
528 0 : }
529 :
      : /*
      :  * Copy user data described by 'uio' into a freshly built mbuf
      :  * chain, at most 'space' bytes, returning the chain via *mp.
      :  * Uses clusters for payloads >= MINCLSIZE and, for atomic
      :  * (datagram) sends, leaves max_hdr bytes of leading room in the
      :  * first mbuf for protocol headers.  Returns 0 or a uiomove error
      :  * (the partial chain is freed on error).
      :  */
530 : int
531 0 : m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
532 : {
533 0 : struct mbuf *m, *top = NULL;
534 : struct mbuf **nextp = &top;
535 : u_long len, mlen;
536 0 : size_t resid = uio->uio_resid;
537 : int error;
538 :
539 0 : do {
      : /* First mbuf of the chain carries the packet header. */
540 0 : if (top == NULL) {
541 0 : MGETHDR(m, M_WAIT, MT_DATA);
542 : mlen = MHLEN;
543 0 : m->m_pkthdr.len = 0;
544 0 : m->m_pkthdr.ph_ifidx = 0;
545 0 : } else {
546 0 : MGET(m, M_WAIT, MT_DATA);
547 : mlen = MLEN;
548 : }
549 : /* chain mbuf together */
550 0 : *nextp = m;
551 0 : nextp = &m->m_next;
552 :
553 0 : resid = ulmin(resid, space);
554 0 : if (resid >= MINCLSIZE) {
      : /* Try a large cluster first, fall back to MCLBYTES, then to
      :  * the mbuf's internal storage (nopages). */
555 0 : MCLGETI(m, M_NOWAIT, NULL, ulmin(resid, MAXMCLBYTES));
556 0 : if ((m->m_flags & M_EXT) == 0)
557 0 : MCLGETI(m, M_NOWAIT, NULL, MCLBYTES);
558 0 : if ((m->m_flags & M_EXT) == 0)
559 : goto nopages;
560 0 : mlen = m->m_ext.ext_size;
561 0 : len = ulmin(mlen, resid);
562 : /*
563 : * For datagram protocols, leave room
564 : * for protocol headers in first mbuf.
565 : */
566 0 : if (atomic && m == top && len < mlen - max_hdr)
567 0 : m->m_data += max_hdr;
568 : } else {
569 : nopages:
570 0 : len = ulmin(mlen, resid);
571 : /*
572 : * For datagram protocols, leave room
573 : * for protocol headers in first mbuf.
574 : */
575 0 : if (atomic && m == top && len < mlen - max_hdr)
576 0 : MH_ALIGN(m, len);
577 : }
578 :
579 0 : error = uiomove(mtod(m, caddr_t), len, uio);
580 0 : if (error) {
581 0 : m_freem(top);
582 0 : return (error);
583 : }
584 :
585 : /* adjust counters */
586 0 : resid = uio->uio_resid;
587 0 : space -= len;
588 0 : m->m_len = len;
589 0 : top->m_pkthdr.len += len;
590 :
591 : /* Is there more space and more data? */
592 0 : } while (space > 0 && resid > 0);
593 :
594 0 : *mp = top;
595 0 : return 0;
596 0 : }
597 :
598 : /*
599 : * Following replacement or removal of the first mbuf on the first
600 : * mbuf chain of a socket buffer, push necessary state changes back
601 : * into the socket buffer so that other consumers see the values
602 : * consistently. 'nextrecord' is the callers locally stored value of
603 : * the original value of sb->sb_mb->m_nextpkt which must be restored
604 : * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL.
605 : */
      : /* See the block comment above for the contract of 'nextrecord'. */
606 : void
607 0 : sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
608 : {
609 :
610 : /*
611 : * First, update for the new value of nextrecord. If necessary,
612 : * make it the first record.
613 : */
614 0 : if (sb->sb_mb != NULL)
615 0 : sb->sb_mb->m_nextpkt = nextrecord;
616 : else
617 0 : sb->sb_mb = nextrecord;
618 :
619 : /*
620 : * Now update any dependent socket buffer fields to reflect
621 : * the new state. This is an inline of SB_EMPTY_FIXUP, with
622 : * the addition of a second clause that takes care of the
623 : * case where sb_mb has been updated, but remains the last
624 : * record.
625 : */
626 0 : if (sb->sb_mb == NULL) {
627 0 : sb->sb_mbtail = NULL;
628 0 : sb->sb_lastrecord = NULL;
629 0 : } else if (sb->sb_mb->m_nextpkt == NULL)
630 0 : sb->sb_lastrecord = sb->sb_mb;
631 0 : }
632 :
633 : /*
634 : * Implement receive operations on a socket.
635 : * We depend on the way that records are added to the sockbuf
636 : * by sbappend*. In particular, each record (mbufs linked through m_next)
637 : * must begin with an address if the protocol so specifies,
638 : * followed by an optional mbuf or mbufs containing ancillary data,
639 : * and then zero or more mbufs of data.
640 : * In order to avoid blocking network for the entire time here, we release
641 : * the solock() while doing the actual copy to user space.
642 : * Although the sockbuf is locked, new data may still be appended,
643 : * and thus we must maintain consistency of the sockbuf during that time.
644 : *
645 : * The caller may receive the data as a single mbuf chain by supplying
646 : * an mbuf **mp0 for use in returning the chain. The uio is then used
647 : * only for the count in uio_resid.
648 : */
      : /*
      :  * See the block comment above for the full contract.  Locking note:
      :  * the socket lock is held except around uiomove(), where it is
      :  * dropped; the receive-buffer sblock() keeps other readers out and
      :  * the local 'nextrecord' cache is re-synced via sbsync() whenever
      :  * the lead mbuf changes.
      :  */
649 : int
650 0 : soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
651 : struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
652 : socklen_t controllen)
653 : {
654 : struct mbuf *m, **mp;
655 : struct mbuf *cm;
656 : u_long len, offset, moff;
657 : int flags, error, s, type, uio_error = 0;
658 0 : const struct protosw *pr = so->so_proto;
659 : struct mbuf *nextrecord;
660 0 : size_t resid, orig_resid = uio->uio_resid;
661 :
662 : mp = mp0;
663 0 : if (paddr)
664 0 : *paddr = 0;
665 0 : if (controlp)
666 0 : *controlp = 0;
667 0 : if (flagsp)
668 0 : flags = *flagsp &~ MSG_EOR;
669 : else
670 : flags = 0;
      : /* MSG_OOB is handled entirely here: fetch OOB data and copy out. */
671 0 : if (flags & MSG_OOB) {
672 0 : m = m_get(M_WAIT, MT_DATA);
673 0 : s = solock(so);
674 0 : error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
675 0 : (struct mbuf *)(long)(flags & MSG_PEEK), NULL, curproc);
676 0 : sounlock(so, s);
677 0 : if (error)
678 : goto bad;
679 0 : do {
680 0 : error = uiomove(mtod(m, caddr_t),
681 0 : ulmin(uio->uio_resid, m->m_len), uio);
682 0 : m = m_free(m);
683 0 : } while (uio->uio_resid && error == 0 && m);
684 : bad:
685 0 : m_freem(m);
686 0 : return (error);
687 : }
688 0 : if (mp)
689 0 : *mp = NULL;
690 :
691 0 : s = solock(so);
692 : restart:
693 0 : if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
694 0 : sounlock(so, s);
695 0 : return (error);
696 : }
697 :
698 0 : m = so->so_rcv.sb_mb;
699 : #ifdef SOCKET_SPLICE
700 0 : if (isspliced(so))
701 0 : m = NULL;
702 : #endif /* SOCKET_SPLICE */
703 : /*
704 : * If we have less data than requested, block awaiting more
705 : * (subject to any timeout) if:
706 : * 1. the current count is less than the low water mark,
707 : * 2. MSG_WAITALL is set, and it is possible to do the entire
708 : * receive operation at once if we block (resid <= hiwat), or
709 : * 3. MSG_DONTWAIT is not set.
710 : * If MSG_WAITALL is set but resid is larger than the receive buffer,
711 : * we have to do the receive in sections, and thus risk returning
712 : * a short count if a timeout or signal occurs after we start.
713 : */
714 0 : if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
715 0 : so->so_rcv.sb_cc < uio->uio_resid) &&
716 0 : (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
717 0 : ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
718 0 : m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
719 : #ifdef DIAGNOSTIC
720 0 : if (m == NULL && so->so_rcv.sb_cc)
721 : #ifdef SOCKET_SPLICE
722 0 : if (!isspliced(so))
723 : #endif /* SOCKET_SPLICE */
724 0 : panic("receive 1: so %p, so_type %d, sb_cc %lu",
725 0 : so, so->so_type, so->so_rcv.sb_cc);
726 : #endif
727 0 : if (so->so_error) {
728 0 : if (m)
729 : goto dontblock;
730 : error = so->so_error;
731 0 : if ((flags & MSG_PEEK) == 0)
732 0 : so->so_error = 0;
733 : goto release;
734 : }
735 0 : if (so->so_state & SS_CANTRCVMORE) {
736 0 : if (m)
737 : goto dontblock;
738 0 : else if (so->so_rcv.sb_cc == 0)
739 : goto release;
740 : }
741 0 : for (; m; m = m->m_next)
742 0 : if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
743 0 : m = so->so_rcv.sb_mb;
744 0 : goto dontblock;
745 : }
746 0 : if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
747 0 : (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
748 : error = ENOTCONN;
749 0 : goto release;
750 : }
751 0 : if (uio->uio_resid == 0 && controlp == NULL)
752 : goto release;
753 0 : if (flags & MSG_DONTWAIT) {
754 : error = EWOULDBLOCK;
755 0 : goto release;
756 : }
757 : SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
758 : SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
      : /* Wait for data to arrive, then re-evaluate from scratch. */
759 0 : sbunlock(so, &so->so_rcv);
760 0 : error = sbwait(so, &so->so_rcv);
761 0 : if (error) {
762 0 : sounlock(so, s);
763 0 : return (error);
764 : }
765 0 : goto restart;
766 : }
767 : dontblock:
768 : /*
769 : * On entry here, m points to the first record of the socket buffer.
770 : * From this point onward, we maintain 'nextrecord' as a cache of the
771 : * pointer to the next record in the socket buffer. We must keep the
772 : * various socket buffer pointers and local stack versions of the
773 : * pointers in sync, pushing out modifications before operations that
774 : * may sleep, and re-reading them afterwards.
775 : *
776 : * Otherwise, we will race with the network stack appending new data
777 : * or records onto the socket buffer by using inconsistent/stale
778 : * versions of the field, possibly resulting in socket buffer
779 : * corruption.
780 : */
781 0 : if (uio->uio_procp)
782 0 : uio->uio_procp->p_ru.ru_msgrcv++;
783 0 : KASSERT(m == so->so_rcv.sb_mb);
784 : SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
785 : SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
786 0 : nextrecord = m->m_nextpkt;
      : /* Address-bearing protocols: first mbuf of the record is the name. */
787 0 : if (pr->pr_flags & PR_ADDR) {
788 : #ifdef DIAGNOSTIC
789 0 : if (m->m_type != MT_SONAME)
790 0 : panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
791 0 : so, so->so_type, m, m->m_type);
792 : #endif
793 : orig_resid = 0;
794 0 : if (flags & MSG_PEEK) {
795 0 : if (paddr)
796 0 : *paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
797 0 : m = m->m_next;
798 0 : } else {
799 0 : sbfree(&so->so_rcv, m);
800 0 : if (paddr) {
801 0 : *paddr = m;
802 0 : so->so_rcv.sb_mb = m->m_next;
803 0 : m->m_next = 0;
804 0 : m = so->so_rcv.sb_mb;
805 0 : } else {
806 0 : so->so_rcv.sb_mb = m_free(m);
807 : m = so->so_rcv.sb_mb;
808 : }
809 0 : sbsync(&so->so_rcv, nextrecord);
810 : }
811 : }
      : /* Consume any leading control (MT_CONTROL) mbufs of the record. */
812 0 : while (m && m->m_type == MT_CONTROL && error == 0) {
813 0 : if (flags & MSG_PEEK) {
814 0 : if (controlp)
815 0 : *controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
816 0 : m = m->m_next;
817 0 : } else {
818 0 : sbfree(&so->so_rcv, m);
819 0 : so->so_rcv.sb_mb = m->m_next;
820 0 : m->m_nextpkt = m->m_next = NULL;
821 : cm = m;
822 0 : m = so->so_rcv.sb_mb;
823 0 : sbsync(&so->so_rcv, nextrecord);
824 0 : if (controlp) {
825 0 : if (pr->pr_domain->dom_externalize &&
826 0 : mtod(cm, struct cmsghdr *)->cmsg_type ==
827 : SCM_RIGHTS) {
828 : error =
829 0 : (*pr->pr_domain->dom_externalize)
830 : (cm, controllen, flags);
831 0 : }
832 0 : *controlp = cm;
833 0 : } else {
834 : /*
835 : * Dispose of any SCM_RIGHTS message that went
836 : * through the read path rather than recv.
837 : */
838 0 : if (pr->pr_domain->dom_dispose &&
839 0 : mtod(cm, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
840 0 : pr->pr_domain->dom_dispose(cm);
841 0 : m_free(cm);
842 : }
843 : }
844 0 : if (m != NULL)
845 0 : nextrecord = so->so_rcv.sb_mb->m_nextpkt;
846 : else
847 : nextrecord = so->so_rcv.sb_mb;
848 0 : if (controlp) {
849 : orig_resid = 0;
850 0 : controlp = &(*controlp)->m_next;
851 0 : }
852 : }
853 :
854 : /* If m is non-NULL, we have some data to read. */
855 0 : if (m) {
856 0 : type = m->m_type;
857 0 : if (type == MT_OOBDATA)
858 0 : flags |= MSG_OOB;
859 0 : if (m->m_flags & M_BCAST)
860 0 : flags |= MSG_BCAST;
861 0 : if (m->m_flags & M_MCAST)
862 0 : flags |= MSG_MCAST;
863 : }
864 : SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
865 : SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
866 :
867 : moff = 0;
868 : offset = 0;
      : /* Main copy loop: move data mbufs out to the uio (or to *mp). */
869 0 : while (m && uio->uio_resid > 0 && error == 0) {
870 0 : if (m->m_type == MT_OOBDATA) {
871 0 : if (type != MT_OOBDATA)
872 : break;
873 0 : } else if (type == MT_OOBDATA)
874 : break;
875 : #ifdef DIAGNOSTIC
876 0 : else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
877 0 : panic("receive 3: so %p, so_type %d, m %p, m_type %d",
878 0 : so, so->so_type, m, m->m_type);
879 : #endif
880 0 : so->so_state &= ~SS_RCVATMARK;
881 0 : len = uio->uio_resid;
882 0 : if (so->so_oobmark && len > so->so_oobmark - offset)
883 0 : len = so->so_oobmark - offset;
884 0 : if (len > m->m_len - moff)
885 0 : len = m->m_len - moff;
886 : /*
887 : * If mp is set, just pass back the mbufs.
888 : * Otherwise copy them out via the uio, then free.
889 : * Sockbuf must be consistent here (points to current mbuf,
890 : * it points to next record) when we drop priority;
891 : * we must note any additions to the sockbuf when we
892 : * block interrupts again.
893 : */
894 0 : if (mp == NULL && uio_error == 0) {
895 : SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
896 : SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
897 0 : resid = uio->uio_resid;
898 0 : sounlock(so, s);
899 0 : uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
900 0 : s = solock(so);
901 0 : if (uio_error)
902 0 : uio->uio_resid = resid - len;
903 : } else
904 0 : uio->uio_resid -= len;
905 0 : if (len == m->m_len - moff) {
906 0 : if (m->m_flags & M_EOR)
907 0 : flags |= MSG_EOR;
908 0 : if (flags & MSG_PEEK) {
909 0 : m = m->m_next;
910 : moff = 0;
911 0 : } else {
912 0 : nextrecord = m->m_nextpkt;
913 0 : sbfree(&so->so_rcv, m);
914 0 : if (mp) {
915 0 : *mp = m;
916 0 : mp = &m->m_next;
917 0 : so->so_rcv.sb_mb = m = m->m_next;
918 0 : *mp = NULL;
919 0 : } else {
920 0 : so->so_rcv.sb_mb = m_free(m);
921 : m = so->so_rcv.sb_mb;
922 : }
923 : /*
924 : * If m != NULL, we also know that
925 : * so->so_rcv.sb_mb != NULL.
926 : */
927 0 : KASSERT(so->so_rcv.sb_mb == m);
928 0 : if (m) {
929 0 : m->m_nextpkt = nextrecord;
930 0 : if (nextrecord == NULL)
931 0 : so->so_rcv.sb_lastrecord = m;
932 : } else {
933 0 : so->so_rcv.sb_mb = nextrecord;
934 0 : SB_EMPTY_FIXUP(&so->so_rcv);
935 : }
936 : SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
937 : SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
938 : }
939 : } else {
      : /* Partial mbuf consumed: trim it in place (unless peeking). */
940 0 : if (flags & MSG_PEEK)
941 0 : moff += len;
942 : else {
943 0 : if (mp)
944 0 : *mp = m_copym(m, 0, len, M_WAIT);
945 0 : m->m_data += len;
946 0 : m->m_len -= len;
947 0 : so->so_rcv.sb_cc -= len;
948 0 : so->so_rcv.sb_datacc -= len;
949 : }
950 : }
951 0 : if (so->so_oobmark) {
952 0 : if ((flags & MSG_PEEK) == 0) {
953 0 : so->so_oobmark -= len;
954 0 : if (so->so_oobmark == 0) {
955 0 : so->so_state |= SS_RCVATMARK;
956 0 : break;
957 : }
958 : } else {
959 0 : offset += len;
960 0 : if (offset == so->so_oobmark)
961 : break;
962 : }
963 : }
964 0 : if (flags & MSG_EOR)
965 : break;
966 : /*
967 : * If the MSG_WAITALL flag is set (for non-atomic socket),
968 : * we must not quit until "uio->uio_resid == 0" or an error
969 : * termination. If a signal/timeout occurs, return
970 : * with a short count but without error.
971 : * Keep sockbuf locked against other readers.
972 : */
973 0 : while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
974 0 : !sosendallatonce(so) && !nextrecord) {
975 0 : if (so->so_error || so->so_state & SS_CANTRCVMORE)
976 : break;
977 : SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
978 : SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
979 0 : error = sbwait(so, &so->so_rcv);
      : /* Per the comment above: short count, no error, on interruption. */
980 0 : if (error) {
981 0 : sbunlock(so, &so->so_rcv);
982 0 : sounlock(so, s);
983 0 : return (0);
984 : }
985 0 : if ((m = so->so_rcv.sb_mb) != NULL)
986 0 : nextrecord = m->m_nextpkt;
987 : }
988 : }
989 :
      : /* Atomic protocols: drop the unread remainder of the record. */
990 0 : if (m && pr->pr_flags & PR_ATOMIC) {
991 0 : flags |= MSG_TRUNC;
992 0 : if ((flags & MSG_PEEK) == 0)
993 0 : (void) sbdroprecord(&so->so_rcv);
994 : }
995 0 : if ((flags & MSG_PEEK) == 0) {
996 0 : if (m == NULL) {
997 : /*
998 : * First part is an inline SB_EMPTY_FIXUP(). Second
999 : * part makes sure sb_lastrecord is up-to-date if
1000 : * there is still data in the socket buffer.
1001 : */
1002 0 : so->so_rcv.sb_mb = nextrecord;
1003 0 : if (so->so_rcv.sb_mb == NULL) {
1004 0 : so->so_rcv.sb_mbtail = NULL;
1005 0 : so->so_rcv.sb_lastrecord = NULL;
1006 0 : } else if (nextrecord->m_nextpkt == NULL)
1007 0 : so->so_rcv.sb_lastrecord = nextrecord;
1008 : }
1009 : SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
1010 : SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
1011 0 : if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
1012 0 : (*pr->pr_usrreq)(so, PRU_RCVD, NULL,
1013 0 : (struct mbuf *)(long)flags, NULL, curproc);
1014 : }
      : /* Nothing was transferred and more may come: try again. */
1015 0 : if (orig_resid == uio->uio_resid && orig_resid &&
1016 0 : (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
1017 0 : sbunlock(so, &so->so_rcv);
1018 0 : goto restart;
1019 : }
1020 :
1021 0 : if (uio_error)
1022 0 : error = uio_error;
1023 :
1024 0 : if (flagsp)
1025 0 : *flagsp |= flags;
1026 : release:
1027 0 : sbunlock(so, &so->so_rcv);
1028 0 : sounlock(so, s);
1029 0 : return (error);
1030 0 : }
1031 :
1032 : int
1033 0 : soshutdown(struct socket *so, int how)
1034 : {
1035 0 : const struct protosw *pr = so->so_proto;
1036 : int s, error = 0;
1037 :
1038 0 : s = solock(so);
1039 0 : switch (how) {
1040 : case SHUT_RD:
1041 0 : sorflush(so);
1042 0 : break;
1043 : case SHUT_RDWR:
1044 0 : sorflush(so);
1045 : /* FALLTHROUGH */
1046 : case SHUT_WR:
1047 0 : error = (*pr->pr_usrreq)(so, PRU_SHUTDOWN, NULL, NULL, NULL,
1048 0 : curproc);
1049 0 : break;
1050 : default:
1051 : error = EINVAL;
1052 0 : break;
1053 : }
1054 0 : sounlock(so, s);
1055 :
1056 0 : return (error);
1057 : }
1058 :
/*
 * Flush the receive buffer of a socket: mark the socket as unable to
 * receive more data and dispose of everything still queued.
 *
 * The buffer contents are moved into a stack copy (aso) before the
 * real sockbuf is zeroed, so any rights (passed file descriptors,
 * PR_RIGHTS) can be disposed of and the mbuf chain released after the
 * live buffer is already back in a consistent, empty state.
 */
void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	const struct protosw *pr = so->so_proto;
	struct socket aso;
	int error;

	/* Take the buffer lock uninterruptibly; failure is not an option. */
	sb->sb_flags |= SB_NOINTR;
	error = sblock(so, sb, M_WAITOK);
	/* with SB_NOINTR and M_WAITOK sblock() must not fail */
	KASSERT(error == 0);
	socantrcvmore(so);
	sbunlock(so, sb);
	/* Move the buffer aside, then clear the original in place. */
	aso.so_proto = pr;
	aso.so_rcv = *sb;
	memset(&sb->sb_startzero, 0,
	    (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
	/* Let the domain drop any rights still riding in the old chain. */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(aso.so_rcv.sb_mb);
	sbrelease(&aso, &aso.so_rcv);
}
1081 :
1082 : #ifdef SOCKET_SPLICE
1083 :
1084 : #define so_splicelen so_sp->ssp_len
1085 : #define so_splicemax so_sp->ssp_max
1086 : #define so_idletv so_sp->ssp_idletv
1087 : #define so_idleto so_sp->ssp_idleto
1088 : #define so_splicetask so_sp->ssp_task
1089 :
1090 : int
1091 0 : sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
1092 : {
1093 0 : struct file *fp;
1094 : struct socket *sosp;
1095 : struct sosplice *sp;
1096 : struct taskq *tq;
1097 : int error = 0;
1098 :
1099 0 : soassertlocked(so);
1100 :
1101 0 : if (sosplice_taskq == NULL) {
1102 0 : rw_enter_write(&sosplice_lock);
1103 0 : if (sosplice_taskq == NULL) {
1104 0 : tq = taskq_create("sosplice", 1, IPL_SOFTNET,
1105 : TASKQ_MPSAFE);
1106 : /* Ensure the taskq is fully visible to other CPUs. */
1107 0 : membar_producer();
1108 0 : sosplice_taskq = tq;
1109 0 : }
1110 0 : rw_exit_write(&sosplice_lock);
1111 0 : }
1112 0 : if (sosplice_taskq == NULL)
1113 0 : return (ENOMEM);
1114 :
1115 0 : if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
1116 0 : return (EPROTONOSUPPORT);
1117 0 : if (so->so_options & SO_ACCEPTCONN)
1118 0 : return (EOPNOTSUPP);
1119 0 : if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1120 0 : (so->so_proto->pr_flags & PR_CONNREQUIRED))
1121 0 : return (ENOTCONN);
1122 0 : if (so->so_sp == NULL) {
1123 0 : sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
1124 0 : if (so->so_sp == NULL)
1125 0 : so->so_sp = sp;
1126 : else
1127 0 : pool_put(&sosplice_pool, sp);
1128 : }
1129 :
1130 : /* If no fd is given, unsplice by removing existing link. */
1131 0 : if (fd < 0) {
1132 : /* Lock receive buffer. */
1133 0 : if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) {
1134 0 : return (error);
1135 : }
1136 0 : if (so->so_sp->ssp_socket)
1137 0 : sounsplice(so, so->so_sp->ssp_socket, 1);
1138 0 : sbunlock(so, &so->so_rcv);
1139 0 : return (0);
1140 : }
1141 :
1142 0 : if (max && max < 0)
1143 0 : return (EINVAL);
1144 :
1145 0 : if (tv && (tv->tv_sec < 0 || tv->tv_usec < 0))
1146 0 : return (EINVAL);
1147 :
1148 : /* Find sosp, the drain socket where data will be spliced into. */
1149 0 : if ((error = getsock(curproc, fd, &fp)) != 0)
1150 0 : return (error);
1151 0 : sosp = fp->f_data;
1152 0 : if (sosp->so_sp == NULL) {
1153 0 : sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
1154 0 : if (sosp->so_sp == NULL)
1155 0 : sosp->so_sp = sp;
1156 : else
1157 0 : pool_put(&sosplice_pool, sp);
1158 : }
1159 :
1160 : /* Lock both receive and send buffer. */
1161 0 : if ((error = sblock(so, &so->so_rcv, M_WAITOK)) != 0) {
1162 : goto frele;
1163 : }
1164 0 : if ((error = sblock(so, &sosp->so_snd, M_WAITOK)) != 0) {
1165 0 : sbunlock(so, &so->so_rcv);
1166 0 : goto frele;
1167 : }
1168 :
1169 0 : if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
1170 : error = EBUSY;
1171 0 : goto release;
1172 : }
1173 0 : if (sosp->so_proto->pr_usrreq != so->so_proto->pr_usrreq) {
1174 : error = EPROTONOSUPPORT;
1175 0 : goto release;
1176 : }
1177 0 : if (sosp->so_options & SO_ACCEPTCONN) {
1178 : error = EOPNOTSUPP;
1179 0 : goto release;
1180 : }
1181 0 : if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
1182 : error = ENOTCONN;
1183 0 : goto release;
1184 : }
1185 :
1186 : /* Splice so and sosp together. */
1187 0 : so->so_sp->ssp_socket = sosp;
1188 0 : sosp->so_sp->ssp_soback = so;
1189 0 : so->so_splicelen = 0;
1190 0 : so->so_splicemax = max;
1191 0 : if (tv)
1192 0 : so->so_idletv = *tv;
1193 : else
1194 0 : timerclear(&so->so_idletv);
1195 0 : timeout_set_proc(&so->so_idleto, soidle, so);
1196 0 : task_set(&so->so_splicetask, sotask, so);
1197 :
1198 : /*
1199 : * To prevent softnet interrupt from calling somove() while
1200 : * we sleep, the socket buffers are not marked as spliced yet.
1201 : */
1202 0 : if (somove(so, M_WAIT)) {
1203 0 : so->so_rcv.sb_flags |= SB_SPLICE;
1204 0 : sosp->so_snd.sb_flags |= SB_SPLICE;
1205 0 : }
1206 :
1207 : release:
1208 0 : sbunlock(sosp, &sosp->so_snd);
1209 0 : sbunlock(so, &so->so_rcv);
1210 : frele:
1211 0 : FRELE(fp, curproc);
1212 0 : return (error);
1213 0 : }
1214 :
/*
 * Dissolve the splice between source socket "so" and drain socket
 * "sosp".  Pending splice work is cancelled, the SB_SPLICE flags are
 * cleared and the cross references between the two sockets removed.
 * If "wakeup" is set and data is ready, readers of "so" are notified,
 * as queued data is delivered to userland again from now on.
 */
void
sounsplice(struct socket *so, struct socket *sosp, int wakeup)
{
	soassertlocked(so);

	/* Cancel deferred work before tearing down the link. */
	task_del(sosplice_taskq, &so->so_splicetask);
	timeout_del(&so->so_idleto);
	sosp->so_snd.sb_flags &= ~SB_SPLICE;
	so->so_rcv.sb_flags &= ~SB_SPLICE;
	so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
	if (wakeup && soreadable(so))
		sorwakeup(so);
}
1228 :
1229 : void
1230 0 : soidle(void *arg)
1231 : {
1232 0 : struct socket *so = arg;
1233 : int s;
1234 :
1235 0 : s = solock(so);
1236 0 : if (so->so_rcv.sb_flags & SB_SPLICE) {
1237 0 : so->so_error = ETIMEDOUT;
1238 0 : sounsplice(so, so->so_sp->ssp_socket, 1);
1239 0 : }
1240 0 : sounlock(so, s);
1241 0 : }
1242 :
1243 : void
1244 0 : sotask(void *arg)
1245 : {
1246 0 : struct socket *so = arg;
1247 : int s;
1248 :
1249 0 : s = solock(so);
1250 0 : if (so->so_rcv.sb_flags & SB_SPLICE) {
1251 : /*
1252 : * We may not sleep here as sofree() and unsplice() may be
1253 : * called from softnet interrupt context. This would remove
1254 : * the socket during somove().
1255 : */
1256 0 : somove(so, M_DONTWAIT);
1257 0 : }
1258 0 : sounlock(so, s);
1259 :
1260 : /* Avoid user land starvation. */
1261 0 : yield();
1262 0 : }
1263 :
1264 : /*
1265 : * The socket splicing task or idle timeout may sleep while grabbing the net
1266 : * lock. As sofree() can be called anytime, sotask() or soidle() could access
1267 : * the socket memory of a freed socket after wakeup. So delay the pool_put()
1268 : * after all pending socket splicing tasks or timeouts have finished. Do this
1269 : * by scheduling it on the same threads.
1270 : */
1271 : void
1272 0 : soreaper(void *arg)
1273 : {
1274 0 : struct socket *so = arg;
1275 :
1276 : /* Reuse splice task, sounsplice() has been called before. */
1277 0 : task_set(&so->so_sp->ssp_task, soput, so);
1278 0 : task_add(sosplice_taskq, &so->so_sp->ssp_task);
1279 0 : }
1280 :
1281 : void
1282 0 : soput(void *arg)
1283 : {
1284 0 : struct socket *so = arg;
1285 :
1286 0 : pool_put(&sosplice_pool, so->so_sp);
1287 0 : pool_put(&socket_pool, so);
1288 0 : }
1289 :
1290 : /*
1291 : * Move data from receive buffer of spliced source socket to send
1292 : * buffer of drain socket. Try to move as much as possible in one
1293 : * big chunk. It is a TCP only implementation.
1294 : * Return value 0 means splicing has been finished, 1 continue.
1295 : */
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0;
	unsigned int state;

	soassertlocked(so);

	/* One iteration per record; loop via "goto nextpkt" below. */
nextpkt:
	if (so->so_error) {
		error = so->so_error;
		goto release;
	}
	if (sosp->so_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}
	/*
	 * ETIMEDOUT, EFBIG and ELOOP are splice-internal conditions on
	 * the drain side and must not abort the transfer here.
	 */
	if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
	    sosp->so_error != EFBIG && sosp->so_error != ELOOP) {
		error = sosp->so_error;
		goto release;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	space = sbspace(sosp, &sosp->so_snd);
	/* Grant some slack so urgent data can reach the oob mark. */
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	sosp->so_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	if (m == NULL) {
		/* Record carried no data at all; drop it and retry. */
		sbdroprecord(&so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb)
			(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
			    NULL, NULL, NULL);
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve splicing with error if loop is detected by counter.
	 */
	if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) {
		error = ELOOP;
		goto release;
	}

	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		/* Atomic protocols move whole packets only. */
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(&so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(&so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/* Take at most len mbufs out of receive buffer. */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && so->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			*mp = so->so_rcv.sb_mb;
			sbfree(&so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(&so->so_rcv);
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD && so->so_pcb)
		(so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
		    NULL, NULL, NULL);

	/* Receive buffer did shrink by len bytes, adjust oob. */
	state = so->so_state;
	so->so_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((state & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (state & SS_RCVATMARK) {
			/* The oob byte is the first byte of m. */
			o = m_get(wait, MT_DATA);
			state &= ~SS_RCVATMARK;
		} else if (oobmark) {
			/* Send the plain data up to the mark first. */
			o = m_split(m, oobmark, wait);
			if (o) {
				error = (*sosp->so_proto->pr_usrreq)(sosp,
				    PRU_SEND, m, NULL, NULL, NULL);
				if (error) {
					if (sosp->so_state & SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			/* Forward the single oob byte via PRU_SENDOOB. */
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);
			error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SENDOOB,
			    o, NULL, NULL, NULL);
			if (error) {
				if (sosp->so_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					state |= SS_RCVATMARK;
			}
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_state &= ~SS_ISSENDING;
	error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SEND, m, NULL, NULL,
	    NULL);
	if (error) {
		if (sosp->so_state & SS_CANTSENDMORE)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

	/* Common exit: decide whether the splice stays alive. */
release:
	sosp->so_state &= ~SS_ISSENDING;
	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		so->so_error = error;
	if (((so->so_state & SS_CANTRCVMORE) && so->so_rcv.sb_cc == 0) ||
	    (sosp->so_state & SS_CANTSENDMORE) || maxreached || error) {
		sounsplice(so, sosp, 1);
		return (0);
	}
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}
1570 :
1571 : #endif /* SOCKET_SPLICE */
1572 :
/*
 * Wake up consumers of the receive buffer.  For a spliced socket the
 * data is forwarded to the drain socket instead of waking userland
 * readers; upcalls and select/poll wakeups are suppressed then.
 */
void
sorwakeup(struct socket *so)
{
	soassertlocked(so);

#ifdef SOCKET_SPLICE
	if (so->so_rcv.sb_flags & SB_SPLICE) {
		/*
		 * TCP has a sendbuffer that can handle multiple packets
		 * at once.  So queue the stream a bit to accumulate data.
		 * The sosplice thread will call somove() later and send
		 * the packets calling tcp_output() only once.
		 * In the UDP case, send out the packets immediately.
		 * Using a thread would make things slower.
		 */
		if (so->so_proto->pr_flags & PR_WANTRCVD)
			task_add(sosplice_taskq, &so->so_splicetask);
		else
			somove(so, M_DONTWAIT);
	}
	if (isspliced(so))
		return;
#endif
	sowakeup(so, &so->so_rcv);
	if (so->so_upcall)
		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
}
1600 :
1601 : void
1602 0 : sowwakeup(struct socket *so)
1603 : {
1604 0 : soassertlocked(so);
1605 :
1606 : #ifdef SOCKET_SPLICE
1607 0 : if (so->so_snd.sb_flags & SB_SPLICE)
1608 0 : task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask);
1609 : #endif
1610 0 : sowakeup(so, &so->so_snd);
1611 0 : }
1612 :
/*
 * Set a socket option.  Options at a level other than SOL_SOCKET are
 * passed straight to the protocol's ctloutput handler.  Socket-level
 * options are handled here; on success they are additionally offered
 * to the protocol so it can mirror relevant state.
 *
 * "m" carries the option value from userland and may be NULL.
 * Must be called with the socket lock held.  Returns 0 or an errno.
 */
int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	soassertlocked(so);

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		/* SO_BINDANY is privileged; check before anything else. */
		switch (optname) {
		case SO_BINDANY:
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			break;
		}

		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* FALLTHROUGH */

		/*
		 * Boolean options: the first int of the mbuf switches the
		 * corresponding bit in so_options on or off.  For SO_LINGER
		 * (falling through from above) that int is l_onoff.
		 */
		case SO_BINDANY:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_DONTROUTE:
			/* Accepted only when turned off; not implemented. */
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			cnt = *mtod(m, int *);
			/* Clamp non-positive values to a minimum of 1. */
			if ((long)cnt <= 0)
				cnt = 1;
			switch (optname) {

			case SO_SNDBUF:
				if (so->so_state & SS_CANTSENDMORE)
					return (EINVAL);
				if (sbcheckreserve(cnt, so->so_snd.sb_wat) ||
				    sbreserve(so, &so->so_snd, cnt))
					return (ENOBUFS);
				so->so_snd.sb_wat = cnt;
				break;

			case SO_RCVBUF:
				if (so->so_state & SS_CANTRCVMORE)
					return (EINVAL);
				if (sbcheckreserve(cnt, so->so_rcv.sb_wat) ||
				    sbreserve(so, &so->so_rcv, cnt))
					return (ENOBUFS);
				so->so_rcv.sb_wat = cnt;
				break;

			/* Low-water marks are capped at the high-water mark. */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (cnt > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : cnt;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (cnt > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : cnt;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			int val;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			/* Timeouts are stored in ticks; sb_timeo is short. */
			val = tvtohz(&tv);
			if (val > USHRT_MAX)
				return (EDOM);

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		case SO_RTABLE:
			/* Routing table selection is protocol business. */
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				struct domain *dom = so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				return (error);
			}
			error = ENOPROTOOPT;
			break;

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			/*
			 * Three forms: no mbuf unsplices, a plain int gives
			 * the drain fd, a full struct splice adds a byte
			 * maximum and an idle timeout.
			 */
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				return (EINVAL);
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Give the protocol a chance to mirror socket-level state. */
		if (error == 0 && so->so_proto->pr_ctloutput) {
			(*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
		}
	}

	return (error);
}
1784 :
/*
 * Get a socket option.  Options at a level other than SOL_SOCKET are
 * passed to the protocol's ctloutput handler; socket-level options are
 * answered from the socket itself.  The result is written into "m"
 * with m_len set to the size of the returned value.
 *
 * Must be called with the socket lock held.  Returns 0 or an errno.
 */
int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	soassertlocked(so);

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			if (error)
				return (error);
			return (0);
		} else
			return (ENOPROTOOPT);
	} else {
		/* Most answers are a single int; cases below may override. */
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		/* Boolean options mirror the bit from so_options. */
		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the pending error also clears it. */
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval tv;
			/* sb_timeo is stored in ticks; convert back. */
			int val = (optname == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			tv.tv_sec = val / hz;
			tv.tv_usec = (val % hz) * tick;
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				struct domain *dom = so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			/* Report the number of bytes spliced so far. */
			m->m_len = sizeof(off_t);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				/* Only available once the peer sent its ids. */
				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					break;
				}
				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}
1919 :
/*
 * Out-of-band data has arrived on "so": signal the owning process or
 * process group with SIGURG and wake up any select/poll waiters on
 * the receive buffer.
 */
void
sohasoutofband(struct socket *so)
{
	KERNEL_LOCK();
	csignal(so->so_pgid, SIGURG, so->so_siguid, so->so_sigeuid);
	selwakeup(&so->so_rcv.sb_sel);
	KERNEL_UNLOCK();
}
1928 :
1929 : int
1930 0 : soo_kqfilter(struct file *fp, struct knote *kn)
1931 : {
1932 0 : struct socket *so = kn->kn_fp->f_data;
1933 : struct sockbuf *sb;
1934 :
1935 0 : KERNEL_ASSERT_LOCKED();
1936 :
1937 0 : switch (kn->kn_filter) {
1938 : case EVFILT_READ:
1939 0 : if (so->so_options & SO_ACCEPTCONN)
1940 0 : kn->kn_fop = &solisten_filtops;
1941 : else
1942 0 : kn->kn_fop = &soread_filtops;
1943 0 : sb = &so->so_rcv;
1944 0 : break;
1945 : case EVFILT_WRITE:
1946 0 : kn->kn_fop = &sowrite_filtops;
1947 0 : sb = &so->so_snd;
1948 0 : break;
1949 : default:
1950 0 : return (EINVAL);
1951 : }
1952 :
1953 0 : SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
1954 0 : sb->sb_flagsintr |= SB_KNOTE;
1955 :
1956 0 : return (0);
1957 0 : }
1958 :
1959 : void
1960 0 : filt_sordetach(struct knote *kn)
1961 : {
1962 0 : struct socket *so = kn->kn_fp->f_data;
1963 :
1964 0 : KERNEL_ASSERT_LOCKED();
1965 :
1966 0 : SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
1967 0 : if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
1968 0 : so->so_rcv.sb_flagsintr &= ~SB_KNOTE;
1969 0 : }
1970 :
/*
 * kqueue read filter: publish the amount of buffered data in kn_data
 * and decide whether the socket is readable.  A spliced socket never
 * reports readable, its data is diverted to the drain socket.
 */
int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
	if (isspliced(so)) {
		rv = 0;
	} else
#endif /* SOCKET_SPLICE */
	if (so->so_state & SS_CANTRCVMORE) {
		/* Peer closed: report EOF plus any pending error. */
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {	/* temporary udp error */
		rv = 1;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		/* User-supplied low-water mark overrides the socket's. */
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_rcv.sb_lowat);
	}

	return rv;
}
1997 :
1998 : void
1999 0 : filt_sowdetach(struct knote *kn)
2000 : {
2001 0 : struct socket *so = kn->kn_fp->f_data;
2002 :
2003 0 : KERNEL_ASSERT_LOCKED();
2004 :
2005 0 : SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
2006 0 : if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
2007 0 : so->so_snd.sb_flagsintr &= ~SB_KNOTE;
2008 0 : }
2009 :
/*
 * kqueue write filter: publish the free send buffer space in kn_data
 * and decide whether the socket is writable.
 */
int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int rv;

	kn->kn_data = sbspace(so, &so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		/* Send side closed: report EOF plus any pending error. */
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		rv = 1;
	} else if (so->so_error) {	/* temporary udp error */
		rv = 1;
	} else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		/* Connection-oriented socket that is not yet connected. */
		rv = 0;
	} else if (kn->kn_sfflags & NOTE_LOWAT) {
		/* User-supplied low-water mark overrides the socket's. */
		rv = (kn->kn_data >= kn->kn_sdata);
	} else {
		rv = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (rv);
}
2034 :
2035 : int
2036 0 : filt_solisten(struct knote *kn, long hint)
2037 : {
2038 0 : struct socket *so = kn->kn_fp->f_data;
2039 :
2040 0 : kn->kn_data = so->so_qlen;
2041 :
2042 0 : return (kn->kn_data != 0);
2043 : }
2044 :
2045 : #ifdef DDB
2046 : void
2047 : sobuf_print(struct sockbuf *,
2048 : int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));
2049 :
/*
 * DDB helper: dump all fields of a socket buffer through the given
 * kernel-printf-like callback.
 */
void
sobuf_print(struct sockbuf *sb,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("\tsb_cc: %lu\n", sb->sb_cc);
	(*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
	(*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
	(*pr)("\tsb_wat: %lu\n", sb->sb_wat);
	(*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
	(*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
	(*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
	(*pr)("\tsb_mb: %p\n", sb->sb_mb);
	(*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
	(*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
	(*pr)("\tsb_sel: ...\n");
	(*pr)("\tsb_flagsintr: %d\n", sb->sb_flagsintr);
	(*pr)("\tsb_flags: %i\n", sb->sb_flags);
	(*pr)("\tsb_timeo: %i\n", sb->sb_timeo);
}
2069 :
/*
 * DDB helper: dump a socket, its splice state (if any) and both of
 * its buffers through the given kernel-printf-like callback.
 */
void
so_print(void *v,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct socket *so = v;

	(*pr)("socket %p\n", so);
	(*pr)("so_type: %i\n", so->so_type);
	(*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
	(*pr)("so_linger: %i\n", so->so_linger);
	(*pr)("so_state: 0x%04x\n", so->so_state);
	(*pr)("so_pcb: %p\n", so->so_pcb);
	(*pr)("so_proto: %p\n", so->so_proto);

	/* Accept queue state (for listening sockets). */
	(*pr)("so_head: %p\n", so->so_head);
	(*pr)("so_onq: %p\n", so->so_onq);
	(*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
	(*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
	(*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
	(*pr)("so_q0len: %i\n", so->so_q0len);
	(*pr)("so_qlen: %i\n", so->so_qlen);
	(*pr)("so_qlimit: %i\n", so->so_qlimit);
	(*pr)("so_timeo: %i\n", so->so_timeo);
	(*pr)("so_pgid: %i\n", so->so_pgid);
	(*pr)("so_siguid: %i\n", so->so_siguid);
	(*pr)("so_sigeuid: %i\n", so->so_sigeuid);
	(*pr)("so_obmark: %lu\n", so->so_oobmark);

	/* Splice state, only present if sosplice() was ever used. */
	(*pr)("so_sp: %p\n", so->so_sp);
	if (so->so_sp != NULL) {
		(*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
		(*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
		(*pr)("\tssp_len: %lld\n",
		    (unsigned long long)so->so_sp->ssp_len);
		(*pr)("\tssp_max: %lld\n",
		    (unsigned long long)so->so_sp->ssp_max);
		(*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
		    so->so_sp->ssp_idletv.tv_usec);
		(*pr)("\tssp_idleto: %spending (@%i)\n",
		    timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
		    so->so_sp->ssp_idleto.to_time);
	}

	(*pr)("so_rcv:\n");
	sobuf_print(&so->so_rcv, pr);
	(*pr)("so_snd:\n");
	sobuf_print(&so->so_snd, pr);

	(*pr)("so_upcall: %p so_upcallarg: %p\n",
	    so->so_upcall, so->so_upcallarg);

	(*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
	(*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
	(*pr)("so_cpid: %d\n", so->so_cpid);
}
2125 : #endif
2126 :
|