uipc_mbuf.c source code [codebrowser/bsd/kern/uipc_mbuf.c]

1	/*
2	* Copyright (c) 1998-2018 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28	/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29	/*
30	* Copyright (c) 1982, 1986, 1988, 1991, 1993
31	* The Regents of the University of California. All rights reserved.
32	*
33	* Redistribution and use in source and binary forms, with or without
34	* modification, are permitted provided that the following conditions
35	* are met:
36	* 1. Redistributions of source code must retain the above copyright
37	* notice, this list of conditions and the following disclaimer.
38	* 2. Redistributions in binary form must reproduce the above copyright
39	* notice, this list of conditions and the following disclaimer in the
40	* documentation and/or other materials provided with the distribution.
41	* 3. All advertising materials mentioning features or use of this software
42	* must display the following acknowledgement:
43	* This product includes software developed by the University of
44	* California, Berkeley and its contributors.
45	* 4. Neither the name of the University nor the names of its contributors
46	* may be used to endorse or promote products derived from this software
47	* without specific prior written permission.
48	*
49	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59	* SUCH DAMAGE.
60	*
61	* @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62	*/
63	/*
64	* NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65	* support for mandatory and extensible security protections. This notice
66	* is included in support of clause 2.2 (b) of the Apple Public License,
67	* Version 2.0.
68	*/
69
70	#include <sys/param.h>
71	#include <sys/systm.h>
72	#include <sys/malloc.h>
73	#include <sys/mbuf.h>
74	#include <sys/kernel.h>
75	#include <sys/sysctl.h>
76	#include <sys/syslog.h>
77	#include <sys/protosw.h>
78	#include <sys/domain.h>
79	#include <sys/queue.h>
80	#include <sys/proc.h>
81
82	#include <dev/random/randomdev.h>
83
84	#include <kern/kern_types.h>
85	#include <kern/simple_lock.h>
86	#include <kern/queue.h>
87	#include <kern/sched_prim.h>
88	#include <kern/backtrace.h>
89	#include <kern/cpu_number.h>
90	#include <kern/zalloc.h>
91
92	#include <libkern/OSAtomic.h>
93	#include <libkern/OSDebug.h>
94	#include <libkern/libkern.h>
95
96	#include <os/log.h>
97
98	#include <IOKit/IOMapper.h>
99
100	#include <machine/limits.h>
101	#include <machine/machine_routines.h>
102
103	#if CONFIG_MACF_NET
104	#include <security/mac_framework.h>
105	#endif /* MAC_NET */
106
107	#include <sys/mcache.h>
108	#include <net/ntstat.h>
109
110	/*
111	* MBUF IMPLEMENTATION NOTES.
112	*
113	* There is a total of 5 per-CPU caches:
114	*
115	* MC_MBUF:
116	* This is a cache of rudimentary objects of MSIZE in size; each
117	* object represents an mbuf structure. This cache preserves only
118	* the m_type field of the mbuf during its transactions.
119	*
120	* MC_CL:
121	* This is a cache of rudimentary objects of MCLBYTES in size; each
122	* object represents a mcluster structure. This cache does not
123	* preserve the contents of the objects during its transactions.
124	*
125	* MC_BIGCL:
126	* This is a cache of rudimentary objects of MBIGCLBYTES in size; each
127	* object represents a mbigcluster structure. This cache does not
128	* preserve the contents of the objects during its transaction.
129	*
130	* MC_MBUF_CL:
131	* This is a cache of mbufs each having a cluster attached to it.
132	* It is backed by MC_MBUF and MC_CL rudimentary caches. Several
133	* fields of the mbuf related to the external cluster are preserved
134	* during transactions.
135	*
136	* MC_MBUF_BIGCL:
137	* This is a cache of mbufs each having a big cluster attached to it.
138	* It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
139	* fields of the mbuf related to the external cluster are preserved
140	* during transactions.
141	*
142	* OBJECT ALLOCATION:
143	*
144	* Allocation requests are handled first at the per-CPU (mcache) layer
145	* before falling back to the slab layer. Performance is optimal when
146	* the request is satisfied at the CPU layer because global data/lock
147	* never gets accessed. When the slab layer is entered for allocation,
148	* the slab freelist will be checked first for available objects before
149	* the VM backing store is invoked. Slab layer operations are serialized
150	* for all of the caches as the mbuf global lock is held most of the time.
151	* Allocation paths are different depending on the class of objects:
152	*
153	* a. Rudimentary object:
154	*
155	* { m_get_common(), m_clattach(), m_mclget(),
156	* m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
157	* composite object allocation }
158	*                 |       ^
159	*                 |       |
160	*                 |       +-----------------------+
161	*                 v                               |
162	*  mcache_alloc/mcache_alloc_ext()        mbuf_slab_audit()
163	*                 |                               ^
164	*                 v                               |
165	*            [CPU cache] -------> (found?) -------+
166	*                 |                               |
167	*                 v                               |
168	*         mbuf_slab_alloc()                       |
169	*                 |                               |
170	*                 v                               |
171	* +---------> [freelist] -------> (found?) -------+
172	* |               |
173	* |               v
174	* |           m_clalloc()
175	* |               |
176	* |               v
177	* +---<<---- kmem_mb_alloc()
178	*
179	* b. Composite object:
180	*
181	* { m_getpackets_internal(), m_allocpacket_internal() }
182	*                 |       ^
183	*                 |       |
184	*                 |       +------ (done) ---------+
185	*                 v                               |
186	*  mcache_alloc/mcache_alloc_ext()       mbuf_cslab_audit()
187	*                 |                               ^
188	*                 v                               |
189	*            [CPU cache] -------> (found?) -------+
190	*                 |                               |
191	*                 v                               |
192	*        mbuf_cslab_alloc()                       |
193	*                 |                               |
194	*                 v                               |
195	*             [freelist] -------> (found?) -------+
196	*                 |                               |
197	*                 v                               |
198	*       (rudimentary object)                      |
199	*  mcache_alloc/mcache_alloc_ext()   ------>>-----+
200	*
201	* Auditing notes: If auditing is enabled, buffers will be subjected to
202	* integrity checks by the audit routine. This is done by verifying their
203	* contents against DEADBEEF (free) pattern before returning them to caller.
204	* As part of this step, the routine will also record the transaction and
205	* pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
206	* also restore any constructed data structure fields if necessary.
207	*
208	* OBJECT DEALLOCATION:
209	*
210	* Freeing an object simply involves placing it into the CPU cache; this
211	* pollutes the cache to benefit subsequent allocations. The slab layer
212	* will only be entered if the object is to be purged out of the cache.
213	* During normal operations, this happens only when the CPU layer resizes
214	* its bucket while it's adjusting to the allocation load. Deallocation
215	* paths are different depending on the class of objects:
216	*
217	* a. Rudimentary object:
218	*
219	* { m_free(), m_freem_list(), composite object deallocation }
220	*                 |       ^
221	*                 |       |
222	*                 |       +------ (done) ---------+
223	*                 v                               |
224	*   mcache_free/mcache_free_ext()                 |
225	*                 |                               |
226	*                 v                               |
227	*         mbuf_slab_audit()                       |
228	*                 |                               |
229	*                 v                               |
230	*            [CPU cache] ---> (not purging?) -----+
231	*                 |                               |
232	*                 v                               |
233	*         mbuf_slab_free()                        |
234	*                 |                               |
235	*                 v                               |
236	*             [freelist] ----------->>------------+
237	*  (objects get purged to VM only on demand)
238	*
239	* b. Composite object:
240	*
241	* { m_free(), m_freem_list() }
242	*                 |       ^
243	*                 |       |
244	*                 |       +------ (done) ---------+
245	*                 v                               |
246	*   mcache_free/mcache_free_ext()                 |
247	*                 |                               |
248	*                 v                               |
249	*        mbuf_cslab_audit()                       |
250	*                 |                               |
251	*                 v                               |
252	*            [CPU cache] ---> (not purging?) -----+
253	*                 |                               |
254	*                 v                               |
255	*         mbuf_cslab_free()                       |
256	*                 |                               |
257	*                 v                               |
258	*             [freelist] ---> (not purging?) -----+
259	*                 |                               |
260	*                 v                               |
261	*       (rudimentary object)                      |
262	*    mcache_free/mcache_free_ext() ------->>------+
263	*
264	* Auditing notes: If auditing is enabled, the audit routine will save
265	* any constructed data structure fields (if necessary) before filling the
266	* contents of the buffers with DEADBEEF (free) pattern and recording the
267	* transaction. Buffers that are freed (whether at CPU or slab layer) are
268	* expected to contain the free pattern.
269	*
270	* DEBUGGING:
271	*
272	* Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
273	* translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
274	* the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
275	* i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
276	* detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
277	* "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
278	*
279	* Each object is associated with exactly one mcache_audit_t structure that
280	* contains the information related to its last buffer transaction. Given
281	* an address of an object, the audit structure can be retrieved by finding
282	* the position of the object relevant to the base address of the cluster:
283	*
284	* +------------+                    +=============+
285	* | mbuf addr  |                    | mclaudit[i] |
286	* +------------+                    +=============+
287	*       |                           | cl_audit[0] |
288	* i = MTOBG(addr)                   +-------------+
289	*       |                 +----->   | cl_audit[1] | -----> mcache_audit_t
290	* b = BGTOM(i)            |         +-------------+
291	*       |                 |         |     ...     |
292	* x = MCLIDX(b, addr)     |         +-------------+
293	*       |                 |         | cl_audit[7] |
294	*       +-----------------+         +-------------+
295	*                   (e.g. x == 1)
296	*
297	* The mclaudit[] array is allocated at initialization time, but its contents
298	* get populated when the corresponding cluster is created. Because a page
299	* can be turned into NMBPG number of mbufs, we preserve enough space for the
300	* mbufs so that there is a 1-to-1 mapping between them. A page that never
301	* gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
302	* remaining entries unused. For 16KB cluster, only one entry from the first
303	* page is allocated and used for the entire object.
304	*/
305
306	/ TODO: should be in header file /
307	/ kernel translater /
308	extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int, kern_return_t *);
309	extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
310	extern vm_map_t mb_map; / special map /
311
/*
 * Bookkeeping for failed kernel memory allocations: failure counters,
 * the timestamp and size of the failures, and a per-error-code histogram.
 * mb_kmem_stats[] has one slot per entry of mb_kmem_stats_labels[]; the
 * two arrays must be kept the same length.
 */
static uint32_t mb_kmem_contig_failed;
static uint32_t mb_kmem_failed;
static uint32_t mb_kmem_one_failed;
/* Timestamp of allocation failures. */
static uint64_t mb_kmem_contig_failed_ts;
static uint64_t mb_kmem_failed_ts;
static uint64_t mb_kmem_one_failed_ts;
static uint64_t mb_kmem_contig_failed_size;
static uint64_t mb_kmem_failed_size;
static uint32_t mb_kmem_stats[6];
static const char *mb_kmem_stats_labels[] = { "INVALID_ARGUMENT",
					      "INVALID_ADDRESS",
					      "RESOURCE_SHORTAGE",
					      "NO_SPACE",
					      "KERN_FAILURE",
					      "OTHERS" };
328
329	/ Global lock /
330	decl_lck_mtx_data(static, mbuf_mlock_data);
331	static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
332	static lck_attr_t *mbuf_mlock_attr;
333	static lck_grp_t *mbuf_mlock_grp;
334	static lck_grp_attr_t *mbuf_mlock_grp_attr;
335
336	/ Back-end (common) layer /
337	static uint64_t mb_expand_cnt;
338	static uint64_t mb_expand_cl_cnt;
339	static uint64_t mb_expand_cl_total;
340	static uint64_t mb_expand_bigcl_cnt;
341	static uint64_t mb_expand_bigcl_total;
342	static uint64_t mb_expand_16kcl_cnt;
343	static uint64_t mb_expand_16kcl_total;
344	static boolean_t mbuf_worker_needs_wakeup; / wait channel for mbuf worker /
345	static uint32_t mbuf_worker_run_cnt;
346	static uint64_t mbuf_worker_last_runtime;
347	static uint64_t mbuf_drain_last_runtime;
348	static int mbuf_worker_ready; / worker thread is runnable /
349	static int ncpu; / number of CPUs /
350	static ppnum_t mcl_paddr; /* Array of cluster physical addresses /
351	static ppnum_t mcl_pages; / Size of array (# physical pages) /
352	static ppnum_t mcl_paddr_base; / Handle returned by IOMapper::iovmAlloc() /
353	static mcache_t ref_cache; /* Cache of cluster reference & flags /
354	static mcache_t mcl_audit_con_cache; /* Audit contents cache /
355	static unsigned int mbuf_debug; / patchable mbuf mcache flags /
356	static unsigned int mb_normalized; / number of packets "normalized" /
357
358	#define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
359	#define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
360
/*
 * mbuf object classes.  The first four (MC_MBUF .. MC_16KCL) are the
 * rudimentary classes; the remaining ones are composite (mbuf with an
 * attached cluster) classes.  The enumerator values are used as indices,
 * so their order must match the order of entries in mbuf_table[].
 */
typedef enum {
	MC_MBUF = 0,	/* Regular mbuf */
	MC_CL,		/* Cluster */
	MC_BIGCL,	/* Large (4KB) cluster */
	MC_16KCL,	/* Jumbo (16KB) cluster */
	MC_MBUF_CL,	/* mbuf + cluster */
	MC_MBUF_BIGCL,	/* mbuf + large (4KB) cluster */
	MC_MBUF_16KCL	/* mbuf + jumbo (16KB) cluster */
} mbuf_class_t;

#define	MBUF_CLASS_MIN		MC_MBUF
#define	MBUF_CLASS_MAX		MC_MBUF_16KCL
#define	MBUF_CLASS_LAST		MC_16KCL	/* last rudimentary class */
/* True if `c' lies within [MBUF_CLASS_MIN, MBUF_CLASS_MAX]. */
#define	MBUF_CLASS_VALID(c) \
	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
/* True if `c' is greater than the last rudimentary class. */
#define	MBUF_CLASS_COMPOSITE(c) \
	((int)(c) > MBUF_CLASS_LAST)


/*
 * mbuf specific mcache allocation request flags.
 */
#define	MCR_COMP	MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
384
385	/*
386	* Per-cluster slab structure.
387	*
388	* A slab is a cluster control structure that contains one or more object
389	* chunks; the available chunks are chained in the slab's freelist (sl_head).
390	* Each time a chunk is taken out of the slab, the slab's reference count
391	* gets incremented. When all chunks have been taken out, the empty slab
392	* gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
393	* returned to a slab causes the slab's reference count to be decremented;
394	* it also causes the slab to be reinserted back to class's slab list, if
395	* it's not already done.
396	*
397	* Compartmentalizing of the object chunks into slabs allows us to easily
398	* merge one or more slabs together when the adjacent slabs are idle, as
399	* well as to convert or move a slab from one class to another; e.g. the
400	* mbuf cluster slab can be converted to a regular cluster slab when all
401	* mbufs in the slab have been freed.
402	*
403	* A slab may also span across multiple clusters for chunks larger than
404	* a cluster's size. In this case, only the slab of the first cluster is
405	* used. The rest of the slabs are marked with SLF_PARTIAL to indicate
406	* that they are part of the larger slab.
407	*
408	* Each slab controls a page of memory.
409	*/
410	typedef struct mcl_slab {
411	struct mcl_slab sl_next; /* neighboring slab /
412	u_int8_t sl_class; / controlling mbuf class /
413	int8_t sl_refcnt; / outstanding allocations /
414	int8_t sl_chunks; / chunks (bufs) in this slab /
415	u_int16_t sl_flags; / slab flags (see below) /
416	u_int16_t sl_len; / slab length /
417	void sl_base; /* base of allocated memory /
418	void sl_head; /* first free buffer /
419	TAILQ_ENTRY(mcl_slab) sl_link; / next/prev slab on freelist /
420	} mcl_slab_t;
421
422	#define SLF_MAPPED 0x0001 /* backed by a mapped page */
423	#define SLF_PARTIAL 0x0002 /* part of another slab */
424	#define SLF_DETACHED 0x0004 /* not in slab freelist */
425
426	/*
427	* The array of slabs are broken into groups of arrays per 1MB of kernel
428	* memory to reduce the footprint. Each group is allocated on demand
429	* whenever a new piece of memory mapped in from the VM crosses the 1MB
430	* boundary.
431	*/
432	#define NSLABSPMB ((1 << MBSHIFT) >> PAGE_SHIFT)
433
434	typedef struct mcl_slabg {
435	mcl_slab_t slg_slab; /* group of slabs /
436	} mcl_slabg_t;
437
438	/*
439	* Number of slabs needed to control a 16KB cluster object.
440	*/
441	#define NSLABSP16KB (M16KCLBYTES >> PAGE_SHIFT)
442
443	/*
444	* Per-cluster audit structure.
445	*/
446	typedef struct {
447	mcache_audit_t *cl_audit; /* array of audits /
448	} mcl_audit_t;
449
450	typedef struct {
451	struct thread msa_thread; /* thread doing transaction /
452	struct thread msa_pthread; /* previous transaction thread /
453	uint32_t msa_tstamp; / transaction timestamp (ms) /
454	uint32_t msa_ptstamp; / prev transaction timestamp (ms) /
455	uint16_t msa_depth; / pc stack depth /
456	uint16_t msa_pdepth; / previous transaction pc stack /
457	void *msa_stack[MCACHE_STACK_DEPTH];
458	void *msa_pstack[MCACHE_STACK_DEPTH];
459	} mcl_scratch_audit_t;
460
461	typedef struct {
462	/*
463	* Size of data from the beginning of an mbuf that covers m_hdr,
464	* pkthdr and m_ext structures. If auditing is enabled, we allocate
465	* a shadow mbuf structure of this size inside each audit structure,
466	* and the contents of the real mbuf gets copied into it when the mbuf
467	* is freed. This allows us to pattern-fill the mbuf for integrity
468	* check, and to preserve any constructed mbuf fields (e.g. mbuf +
469	* cluster cache case). Note that we don't save the contents of
470	* clusters when they are freed; we simply pattern-fill them.
471	*/
472	u_int8_t sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
473	mcl_scratch_audit_t sc_scratch __attribute__((aligned(`8`)));
474	} mcl_saved_contents_t;
475
476	#define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t))
477
478	#define MCA_SAVED_MBUF_PTR(_mca) \
479	((struct mbuf )(void )((mcl_saved_contents_t *) \
480	(_mca)->mca_contents)->sc_mbuf)
481	#define MCA_SAVED_MBUF_SIZE \
482	(sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
483	#define MCA_SAVED_SCRATCH_PTR(_mca) \
484	(&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
485
486	/*
487	* mbuf specific mcache audit flags
488	*/
489	#define MB_INUSE 0x01 /* object has not been returned to slab */
490	#define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
491	#define MB_SCVALID 0x04 /* object has valid saved contents */
492
493	/*
494	* Each of the following two arrays hold up to nmbclusters elements.
495	*/
496	static mcl_audit_t mclaudit; /* array of cluster audit information /
497	static unsigned int maxclaudit; / max # of entries in audit table /
498	static mcl_slabg_t *slabstbl; /* cluster slabs table /
499	static unsigned int maxslabgrp; / max # of entries in slabs table /
500	static unsigned int slabgrp; / # of entries in slabs table /
501
/*
 * Globals.  These have external linkage and are therefore visible to
 * other translation units.
 */
int nclusters;			/* # of clusters for non-jumbo (legacy) sizes */
int njcl;			/* # of clusters for jumbo sizes */
int njclbytes;			/* size of a jumbo cluster */
unsigned char *mbutl;		/* first mapped cluster address */
unsigned char *embutl;		/* ending virtual address of mclusters */
int _max_linkhdr;		/* largest link-level header */
int _max_protohdr;		/* largest protocol header */
int max_hdr;			/* largest link+protocol header */
int max_datalen;		/* MHLEN - max_hdr */
512
513	static boolean_t mclverify; / debug: pattern-checking /
514	static boolean_t mcltrace; / debug: stack tracing /
515	static boolean_t mclfindleak; / debug: leak detection /
516	static boolean_t mclexpleak; / debug: expose leak info to user space /
517
518	static struct timeval mb_start; / beginning of time /
519
520	/ mbuf leak detection variables /
521	static struct mleak_table mleak_table;
522	static mleak_stat_t *mleak_stat;
523
524	#define MLEAK_STAT_SIZE(n) \
525	__builtin_offsetof(mleak_stat_t, ml_trace[n])
526
527	struct mallocation {
528	mcache_obj_t element; /* the alloc'ed element, NULL if unused /
529	u_int32_t trace_index; / mtrace index for corresponding backtrace /
530	u_int32_t count; / How many objects were requested /
531	u_int64_t hitcount; / for determining hash effectiveness /
532	};
533
534	struct mtrace {
535	u_int64_t collisions;
536	u_int64_t hitcount;
537	u_int64_t allocs;
538	u_int64_t depth;
539	uintptr_t addr[MLEAK_STACK_DEPTH];
540	};
541
/* Size must be a power of two for the zhash to be able to just mask off bits */
#define	MLEAK_ALLOCATION_MAP_NUM	512
#define	MLEAK_TRACE_MAP_NUM		256

/*
 * Sample factor for how often to record a trace.  This is overwritable
 * by the boot-arg mleak_sample_factor.
 */
#define	MLEAK_SAMPLE_FACTOR	500

/*
 * Number of top leakers recorded.
 */
#define	MLEAK_NUM_TRACES		5

/*
 * Column spacing and table headers for the leak report.
 *
 * NOTE(review): the whitespace inside the string literals below looks
 * like it has been collapsed (single spaces between columns); confirm
 * the intended column padding before relying on the report layout.
 */
#define	MB_LEAK_SPACING_64 " "
#define	MB_LEAK_SPACING_32 " "


#define	MB_LEAK_HDR_32	"\n\
trace [1] trace [2] trace [3] trace [4] trace [5] \n\
---------- ---------- ---------- ---------- ---------- \n\
"

#define	MB_LEAK_HDR_64	"\n\
trace [1] trace [2] trace [3] \
trace [4] trace [5] \n\
------------------ ------------------ ------------------ \
------------------ ------------------ \n\
"
572
573	static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
574	static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
575
576	/ Hashmaps of allocations and their corresponding traces /
577	static struct mallocation *mleak_allocations;
578	static struct mtrace *mleak_traces;
579	static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
580
581	/ Lock to protect mleak tables from concurrent modification /
582	decl_lck_mtx_data(static, mleak_lock_data);
583	static lck_mtx_t *mleak_lock = &mleak_lock_data;
584	static lck_attr_t *mleak_lock_attr;
585	static lck_grp_t *mleak_lock_grp;
586	static lck_grp_attr_t *mleak_lock_grp_attr;
587
588	/ Failed large allocations. /
589	struct mtracelarge {
590	uint64_t size;
591	uint64_t depth;
592	uintptr_t addr[MLEAK_STACK_DEPTH];
593	};
594
595	#define MTRACELARGE_NUM_TRACES 5
596	static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES];
597
598	static void mtracelarge_register(size_t size);
599
600	/ Lock to protect the completion callback table /
601	static lck_grp_attr_t *mbuf_tx_compl_tbl_lck_grp_attr = NULL;
602	static lck_attr_t *mbuf_tx_compl_tbl_lck_attr = NULL;
603	static lck_grp_t *mbuf_tx_compl_tbl_lck_grp = NULL;
604	decl_lck_rw_data(, mbuf_tx_compl_tbl_lck_rw_data);
605	lck_rw_t *mbuf_tx_compl_tbl_lock = &mbuf_tx_compl_tbl_lck_rw_data;
606
607	extern u_int32_t high_sb_max;
608
/*
 * The minimum number of objects that are allocated, to start.  The big
 * and 16KB cluster minimums are derived from MINCL (1/2 and 1/4 of it).
 */
#define	MINCL		32
#define	MINBIGCL	(MINCL >> 1)
#define	MIN16KCL	(MINCL >> 2)

/* Low watermarks (only map in pages once free counts go below) */
#define	MBIGCL_LOWAT	MINBIGCL
#define	M16KCL_LOWAT	MIN16KCL
617
618	typedef struct {
619	mbuf_class_t mtbl_class; / class type /
620	mcache_t mtbl_cache; /* mcache for this buffer class /
621	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; / slab list /
622	mcache_obj_t mtbl_cobjlist; /* composite objects freelist /
623	mb_class_stat_t mtbl_stats; /* statistics fetchable via sysctl /
624	u_int32_t mtbl_maxsize; / maximum buffer size /
625	int mtbl_minlimit; / minimum allowed /
626	int mtbl_maxlimit; / maximum allowed /
627	u_int32_t mtbl_wantpurge; / purge during next reclaim /
628	uint32_t mtbl_avgtotal; / average total on iOS /
629	u_int32_t mtbl_expand; / worker should expand the class /
630	} mbuf_table_t;
631
632	#define m_class(c) mbuf_table[c].mtbl_class
633	#define m_cache(c) mbuf_table[c].mtbl_cache
634	#define m_slablist(c) mbuf_table[c].mtbl_slablist
635	#define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
636	#define m_maxsize(c) mbuf_table[c].mtbl_maxsize
637	#define m_minlimit(c) mbuf_table[c].mtbl_minlimit
638	#define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
639	#define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
640	#define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
641	#define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
642	#define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
643	#define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
644	#define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
645	#define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
646	#define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
647	#define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
648	#define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
649	#define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
650	#define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
651	#define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
652	#define m_peak(c) mbuf_table[c].mtbl_stats->mbcl_peak_reported
653	#define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
654	#define m_region_expand(c) mbuf_table[c].mtbl_expand
655
656	static mbuf_table_t mbuf_table[] = {
657	/*
658	* The caches for mbufs, regular clusters and big clusters.
659	* The average total values were based on data gathered by actual
660	* usage patterns on iOS.
661	*/
662	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
663	NULL, NULL, `0`, `0`, `0`, `0`, `3000`, `0` },
664	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
665	NULL, NULL, `0`, `0`, `0`, `0`, `2000`, `0` },
666	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
667	NULL, NULL, `0`, `0`, `0`, `0`, `1000`, `0` },
668	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
669	NULL, NULL, `0`, `0`, `0`, `0`, `200`, `0` },
670	/*
671	* The following are special caches; they serve as intermediate
672	* caches backed by the above rudimentary caches. Each object
673	* in the cache is an mbuf with a cluster attached to it. Unlike
674	* the above caches, these intermediate caches do not directly
675	* deal with the slab structures; instead, the constructed
676	* cached elements are simply stored in the freelists.
677	*/
678	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, `0`, `0`, `0`, `0`, `2000`, `0` },
679	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, `0`, `0`, `0`, `0`, `1000`, `0` },
680	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, `0`, `0`, `0`, `0`, `200`, `0` },
681	};
682
/*
 * Number of elements in array `a'.  Only meaningful for true arrays;
 * applied to a pointer it silently yields the wrong value.
 */
#define	NELEM(a)	(sizeof (a) / sizeof ((a)[0]))
684
685
686	static uint32_t
687	m_avgtotal(mbuf_class_t c)
688	{
689	return (mbuf_table[c].mtbl_avgtotal);
690	}
691
692	static void mb_waitchan = &mbuf_table; /* wait channel for all caches /
693	static int mb_waiters; / number of waiters /
694
695	boolean_t mb_peak_newreport = FALSE;
696	boolean_t mb_peak_firstreport = FALSE;
697
698	/ generate a report by default after 1 week of uptime /
699	#define MBUF_PEAK_FIRST_REPORT_THRESHOLD 604800
700
701	#define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
702	static struct timeval mb_wdtstart; / watchdog start timestamp /
703	static char *mbuf_dump_buf;
704
705	#define MBUF_DUMP_BUF_SIZE 4096
706
707	/*
708	* mbuf watchdog is enabled by default on embedded platforms. It is
709	* also toggleable via the kern.ipc.mb_watchdog sysctl.
710	* Garbage collection is also enabled by default on embedded platforms.
711	* mb_drain_maxint controls the amount of time to wait (in seconds) before
712	* consecutive calls to mbuf_drain().
713	*/
/*
 * Compile-time defaults: when CONFIG_EMBEDDED is set the watchdog is
 * on and mb_drain_maxint is 60; otherwise both default to 0.
 */
#if CONFIG_EMBEDDED
static unsigned int mb_watchdog = 1;
static unsigned int mb_drain_maxint = 60;
#else
static unsigned int mb_watchdog = 0;
static unsigned int mb_drain_maxint = 0;
#endif /* CONFIG_EMBEDDED */
721
722	uintptr_t mb_obscure_extfree __attribute__((visibility("hidden")));
723	uintptr_t mb_obscure_extref __attribute__((visibility("hidden")));
724
725	/ Red zone /
726	static u_int32_t mb_redzone_cookie;
727	static void m_redzone_init(struct mbuf *);
728	static void m_redzone_verify(struct mbuf *m);
729
730	/ The following are used to serialize m_clalloc() /
731	static boolean_t mb_clalloc_busy;
732	static void *mb_clalloc_waitchan = &mb_clalloc_busy;
733	static int mb_clalloc_waiters;
734
735	static void mbuf_mtypes_sync(boolean_t);
736	static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
737	static void mbuf_stat_sync(void);
738	static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
739	static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
740	static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
741	static char mbuf_dump(void*);
742	static void mbuf_table_init(void);
743	static inline void m_incref(struct mbuf *);
744	static inline u_int16_t m_decref(struct mbuf *);
745	static int m_clalloc(const u_int32_t, const int, const u_int32_t);
746	static void mbuf_worker_thread_init(void);
747	static mcache_obj_t slab_alloc(mbuf_class_t, int*);
748	static void slab_free(mbuf_class_t, mcache_obj_t *);
749	static unsigned int mbuf_slab_alloc(void , mcache_obj_t **,
750	unsigned int, int);
751	static void mbuf_slab_free(void , mcache_obj_t , int);
752	static void mbuf_slab_audit(void , mcache_obj_t , boolean_t);
753	static void mbuf_slab_notify(void *, u_int32_t);
754	static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
755	unsigned int);
756	static unsigned int cslab_free(mbuf_class_t, mcache_obj_t , int*);
757	static unsigned int mbuf_cslab_alloc(void , mcache_obj_t **,
758	unsigned int, int);
759	static void mbuf_cslab_free(void , mcache_obj_t , int);
760	static void mbuf_cslab_audit(void , mcache_obj_t , boolean_t);
761	static int freelist_populate(mbuf_class_t, unsigned int, int);
762	static void freelist_init(mbuf_class_t);
763	static boolean_t mbuf_cached_above(mbuf_class_t, int);
764	static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
765	static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
766	static int m_howmany(int, size_t);
767	static void mbuf_worker_thread(void);
768	static void mbuf_watchdog(void);
769	static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
770
771	static void mcl_audit_init(void , mcache_audit_t , mcache_obj_t *,
772	size_t, unsigned int);
773	static void mcl_audit_free(void , unsigned* int);
774	static mcache_audit_t mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t );
775	static void mcl_audit_mbuf(mcache_audit_t , void* *, boolean_t, boolean_t);
776	static void mcl_audit_cluster(mcache_audit_t , void* *, size_t, boolean_t,
777	boolean_t);
778	static void mcl_audit_restore_mbuf(struct mbuf , mcache_audit_t , boolean_t);
779	static void mcl_audit_save_mbuf(struct mbuf , mcache_audit_t );
780	static void mcl_audit_scratch(mcache_audit_t *);
781	static void mcl_audit_mcheck_panic(struct mbuf *);
782	static void mcl_audit_verify_nextptr(void , mcache_audit_t );
783
784	static void mleak_activate(void);
785	static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
786	static boolean_t mleak_log(uintptr_t , mcache_obj_t , uint32_t, int);
787	static void mleak_free(mcache_obj_t *);
788	static void mleak_sort_traces(void);
789	static void mleak_update_stats(void);
790
791	static mcl_slab_t slab_get(void* *);
792	static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
793	void , void* , unsigned* int, int, int);
794	static void slab_insert(mcl_slab_t *, mbuf_class_t);
795	static void slab_remove(mcl_slab_t *, mbuf_class_t);
796	static boolean_t slab_inrange(mcl_slab_t , void* *);
797	static void slab_nextptr_panic(mcl_slab_t , void* *);
798	static void slab_detach(mcl_slab_t *);
799	static boolean_t slab_is_detached(mcl_slab_t *);
800
801	static int m_copyback0(struct mbuf *, int, int, const* void , int, int*);
802	static struct mbuf m_split0(struct* mbuf , int, int, int*);
803	__private_extern__ void mbuf_report_peak_usage(void);
804	static boolean_t mbuf_report_usage(mbuf_class_t);
/*
 * Watchdog logging is compiled in only for DEBUG and DEVELOPMENT kernels;
 * elsewhere mbwdog_logger() expands to an empty statement.
 */
#if DEBUG || DEVELOPMENT
#define	mbwdog_logger(fmt, ...)	_mbwdog_logger(__func__, __LINE__, fmt, ## __VA_ARGS__)
static void _mbwdog_logger(const char *func, const int line, const char *fmt, ...);
static char *mbwdog_logging;
const unsigned mbwdog_logging_size = 4096;
static size_t mbwdog_logging_used;
#else
#define	mbwdog_logger(fmt, ...)	do { } while (0)
#endif
static void mbuf_drain_locked(boolean_t);
815
/* flags for m_copyback0 */
#define	M_COPYBACK0_COPYBACK	0x0001	/* copyback from cp */
#define	M_COPYBACK0_PRESERVE	0x0002	/* preserve original data */
#define	M_COPYBACK0_COW		0x0004	/* do copy-on-write */
#define	M_COPYBACK0_EXTEND	0x0008	/* extend chain */
821
/*
 * This flag is set for all mbufs that come out of and into the composite
 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 * are marked with such a flag have clusters attached to them, and will be
 * treated differently when they are freed; instead of being placed back
 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 * are placed back into the appropriate composite cache's freelist, and the
 * actual freeing is deferred until the composite objects are purged.  At
 * such a time, this flag will be cleared from the mbufs and the objects
 * will be freed into their own separate freelists.
 */
#define	EXTF_COMPOSITE	0x1

/*
 * This flag indicates that the external cluster is read-only, i.e. it is
 * or was referred to by more than one mbuf.  Once set, this flag is never
 * cleared.
 */
#define	EXTF_READONLY	0x2
/*
 * This flag indicates that the external cluster is paired with the mbuf.
 * Pairing implies an external free routine defined which will be invoked
 * when the reference count drops to the minimum at m_free time.  This
 * flag is never cleared.
 */
#define	EXTF_PAIRED	0x4

/* All external-cluster flag bits defined above. */
#define	EXTF_MASK	\
	(EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED)

/* Accessors for the fields of the structure returned by m_get_rfa(m). */
#define	MEXT_MINREF(m)		((m_get_rfa(m))->minref)
#define	MEXT_REF(m)		((m_get_rfa(m))->refcnt)
#define	MEXT_PREF(m)		((m_get_rfa(m))->prefcnt)
#define	MEXT_FLAGS(m)		((m_get_rfa(m))->flags)
#define	MEXT_PRIV(m)		((m_get_rfa(m))->priv)
#define	MEXT_PMBUF(m)		((m_get_rfa(m))->paired)
#define	MEXT_TOKEN(m)		((m_get_rfa(m))->ext_token)
/*
 * True when the reference count is at its minimum and, of the EXTF_MASK
 * bits, only EXTF_COMPOSITE is set.
 */
#define	MBUF_IS_COMPOSITE(m)						\
	(MEXT_REF(m) == MEXT_MINREF(m) &&				\
	(MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
/*
 * This macro can be used to test if the mbuf is paired to an external
 * cluster.  The test for MEXT_PMBUF being equal to the mbuf in subject
 * is important, as EXTF_PAIRED alone is insufficient since it is immutable,
 * and thus survives calls to m_free_paired.
 */
#define	MBUF_IS_PAIRED(m)						\
	(((m)->m_flags & M_EXT) &&					\
	(MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED &&			\
	MEXT_PMBUF(m) == (m))
872
/*
 * Macros used to verify the integrity of the mbuf.
 *
 * _MCHECK panics when the mbuf is neither of type MT_FREE nor paired;
 * with auditing enabled (mclaudit != NULL) the panic is delegated to
 * mcl_audit_mcheck_panic().
 */
#define	_MCHECK(m) {							\
	if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) {		\
		if (mclaudit == NULL)					\
			panic("MCHECK: m_type=%d m=%p",			\
			    (u_int16_t)(m)->m_type, m);			\
		else							\
			mcl_audit_mcheck_panic(m);			\
	}								\
}

/* True when addr lies within [mbutl, embutl). */
#define	MBUF_IN_MAP(addr)						\
	((unsigned char *)(addr) >= mbutl &&				\
	(unsigned char *)(addr) < embutl)

/* Panic when addr is outside the range tested by MBUF_IN_MAP(). */
#define	MRANGE(addr) {							\
	if (!MBUF_IN_MAP(addr))						\
		panic("MRANGE: address out of range 0x%p", addr);	\
}

/*
 * Macro version of mtod.
 */
#define	MTOD(m, t)	((t)((m)->m_data))

/*
 * Macros to obtain page index given a base cluster address.
 * The argument is parenthesized so that expression arguments
 * (e.g. "p + 1") are evaluated before the cast or shift applies.
 */
#define	MTOPG(x)	(((unsigned char *)(x) - mbutl) >> PAGE_SHIFT)
#define	PGTOM(x)	(mbutl + ((x) << PAGE_SHIFT))

/*
 * Macro to find the mbuf index relative to a base.
 */
#define	MBPAGEIDX(c, m) \
	(((unsigned char *)(m) - (unsigned char *)(c)) >> MSIZESHIFT)

/*
 * Same thing for 2KB cluster index.
 */
#define	CLPAGEIDX(c, m) \
	(((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)

/*
 * Macro to find 4KB cluster index relative to a base
 */
#define	BCLPAGEIDX(c, m) \
	(((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)
923
/*
 * Macros used during mbuf and cluster initialization.
 */

/* Reset the packet-header fields of an mbuf to their initial values. */
#define	MBUF_INIT_PKTHDR(m) {						\
	(m)->m_pkthdr.rcvif = NULL;					\
	(m)->m_pkthdr.pkt_hdr = NULL;					\
	(m)->m_pkthdr.len = 0;						\
	(m)->m_pkthdr.csum_flags = 0;					\
	(m)->m_pkthdr.csum_data = 0;					\
	(m)->m_pkthdr.vlan_tag = 0;					\
	m_classifier_init(m, 0);					\
	m_tag_init(m, 1);						\
	m_scratch_init(m);						\
	m_redzone_init(m);						\
}

/*
 * Initialize an mbuf of the given type after verifying it with _MCHECK.
 * A non-zero pkthdr points m_data at m_pktdat, sets M_PKTHDR and
 * initializes the packet header; otherwise m_data points at m_dat and
 * no flags are set.
 */
#define	MBUF_INIT(m, pkthdr, type) {					\
	_MCHECK(m);							\
	(m)->m_next = (m)->m_nextpkt = NULL;				\
	(m)->m_len = 0;							\
	(m)->m_type = (type);						\
	if ((pkthdr) == 0) {						\
		(m)->m_data = (m)->m_dat;				\
		(m)->m_flags = 0;					\
	} else {							\
		(m)->m_data = (m)->m_pktdat;				\
		(m)->m_flags = M_PKTHDR;				\
		MBUF_INIT_PKTHDR(m);					\
	}								\
}

/*
 * Attach an external buffer to an mbuf: point m_data/ext_buf at buf,
 * set M_EXT, record the free routine and its argument via m_set_ext(),
 * and fill in the reference-tracking fields reached through rfa.
 */
#define	MEXT_INIT(m, buf, size, free, arg, rfa, min, ref, pref, flag,	\
	priv, pm) {							\
	(m)->m_data = (m)->m_ext.ext_buf = (buf);			\
	(m)->m_flags |= M_EXT;						\
	m_set_ext((m), (rfa), (free), (arg));				\
	(m)->m_ext.ext_size = (size);					\
	MEXT_MINREF(m) = (min);						\
	MEXT_REF(m) = (ref);						\
	MEXT_PREF(m) = (pref);						\
	MEXT_FLAGS(m) = (flag);						\
	MEXT_PRIV(m) = (priv);						\
	MEXT_PMBUF(m) = (pm);						\
}

/* MEXT_INIT specializations for the 2KB, 4KB and 16KB cluster classes. */
#define	MBUF_CL_INIT(m, buf, rfa, ref, flag)				\
	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0,		\
	    ref, 0, flag, 0, NULL)

#define	MBUF_BIGCL_INIT(m, buf, rfa, ref, flag)				\
	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0,	\
	    ref, 0, flag, 0, NULL)

#define	MBUF_16KCL_INIT(m, buf, rfa, ref, flag)				\
	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0,	\
	    ref, 0, flag, 0, NULL)

/*
 * Macro to convert BSD malloc sleep flag to mcache's
 */
#define	MSLEEPF(f)	((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
985
986	/*
987	* The structure that holds all mbuf class statistics exportable via sysctl.
988	* Similar to mbstat structure, the mb_stat structure is protected by the
989	* global mbuf lock. It contains additional information about the classes
990	* that allows for a more accurate view of the state of the allocator.
991	*/
992	struct mb_stat *mb_stat;
993	struct omb_stat omb_stat; /* For backwards compatibility /
994
995	#define MB_STAT_SIZE(n) \
996	__builtin_offsetof(mb_stat_t, mbs_class[n])
997	#define OMB_STAT_SIZE(n) \
998	((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
999
1000	/*
1001	* The legacy structure holding all of the mbuf allocation statistics.
1002	* The actual statistics used by the kernel are stored in the mbuf_table
1003	* instead, and are updated atomically while the global mbuf lock is held.
1004	* They are mirrored in mbstat to support legacy applications (e.g. netstat).
1005	* Unlike before, the kernel no longer relies on the contents of mbstat for
1006	* its operations (e.g. cluster expansion) because the structure is exposed
1007	* to outside and could possibly be modified, therefore making it unsafe.
1008	* With the exception of the mbstat.m_mtypes array (see below), all of the
1009	* statistics are updated as they change.
1010	*/
1011	struct mbstat mbstat;
1012
1013	#define MBSTAT_MTYPES_MAX \
1014	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
1015
1016	/*
1017	* Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
1018	* atomically and stored in a per-CPU structure which is lock-free; this is
1019	* done in order to avoid writing to the global mbstat data structure which
1020	* would cause false sharing. During sysctl request for kern.ipc.mbstat,
1021	* the statistics across all CPUs will be converged into the mbstat.m_mtypes
1022	* array and returned to the application. Any updates for types greater or
1023	* equal than MT_MAX would be done atomically to the mbstat; this slows down
1024	* performance but is okay since the kernel uses only up to MT_MAX-1 while
1025	* anything beyond that (up to type 255) is considered a corner case.
1026	*/
1027	typedef struct {
1028	unsigned int cpu_mtypes[MT_MAX];
1029	} __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;
1030
1031	typedef struct {
1032	mtypes_cpu_t mbs_cpu[`1`];
1033	} mbuf_mtypes_t;
1034
1035	static mbuf_mtypes_t mbuf_mtypes; /* per-CPU statistics /
1036
1037	#define MBUF_MTYPES_SIZE(n) \
1038	((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
1039
1040	#define MTYPES_CPU(p) \
1041	((mtypes_cpu_t )(void )((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
1042
1043	#define mtype_stat_add(type, n) { \
1044	if ((unsigned)(type) < MT_MAX) { \
1045	mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
1046	atomic_add_32(&mbs->cpu_mtypes[type], n); \
1047	} else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
1048	atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \
1049	} \
1050	}
1051
1052	#define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
1053	#define mtype_stat_inc(t) mtype_stat_add(t, 1)
1054	#define mtype_stat_dec(t) mtype_stat_sub(t, 1)
1055
1056	static void
1057	mbuf_mtypes_sync(boolean_t locked)
1058	{
1059	int m, n;
1060	mtypes_cpu_t mtc;
1061
1062	if (locked)
1063	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1064
1065	bzero(&mtc, sizeof (mtc));
1066	for (m = `0`; m < ncpu; m++) {
1067	mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
1068	mtypes_cpu_t temp;
1069
1070	bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
1071	sizeof (temp.cpu_mtypes));
1072
1073	for (n = `0`; n < MT_MAX; n++)
1074	mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
1075	}
1076	if (!locked)
1077	lck_mtx_lock(mbuf_mlock);
1078	for (n = `0`; n < MT_MAX; n++)
1079	mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
1080	if (!locked)
1081	lck_mtx_unlock(mbuf_mlock);
1082	}
1083
1084	static int
1085	mbstat_sysctl SYSCTL_HANDLER_ARGS
1086	{
1087	#pragma unused(oidp, arg1, arg2)
1088	mbuf_mtypes_sync(FALSE);
1089
1090	return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
1091	}
1092
1093	static void
1094	mbuf_stat_sync(void)
1095	{
1096	mb_class_stat_t *sp;
1097	mcache_cpu_t *ccp;
1098	mcache_t *cp;
1099	int k, m, bktsize;
1100
1101	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1102
1103	for (k = `0`; k < NELEM(mbuf_table); k++) {
1104	cp = m_cache(k);
1105	ccp = &cp->mc_cpu[`0`];
1106	bktsize = ccp->cc_bktsize;
1107	sp = mbuf_table[k].mtbl_stats;
1108
1109	if (cp->mc_flags & MCF_NOCPUCACHE)
1110	sp->mbcl_mc_state = MCS_DISABLED;
1111	else if (cp->mc_purge_cnt > `0`)
1112	sp->mbcl_mc_state = MCS_PURGING;
1113	else if (bktsize == `0`)
1114	sp->mbcl_mc_state = MCS_OFFLINE;
1115	else
1116	sp->mbcl_mc_state = MCS_ONLINE;
1117
1118	sp->mbcl_mc_cached = `0`;
1119	for (m = `0`; m < ncpu; m++) {
1120	ccp = &cp->mc_cpu[m];
1121	if (ccp->cc_objs > `0`)
1122	sp->mbcl_mc_cached += ccp->cc_objs;
1123	if (ccp->cc_pobjs > `0`)
1124	sp->mbcl_mc_cached += ccp->cc_pobjs;
1125	}
1126	sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
1127	sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1128	sp->mbcl_infree;
1129
1130	sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1131	sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1132	sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1133
1134	/ Calculate total count specific to each class /
1135	sp->mbcl_ctotal = sp->mbcl_total;
1136	switch (m_class(k)) {
1137	case MC_MBUF:
1138	/ Deduct mbufs used in composite caches /
1139	sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1140	m_total(MC_MBUF_BIGCL));
1141	break;
1142
1143	case MC_CL:
1144	/ Deduct clusters used in composite cache /
1145	sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1146	break;
1147
1148	case MC_BIGCL:
1149	/ Deduct clusters used in composite cache /
1150	sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1151	break;
1152
1153	case MC_16KCL:
1154	/ Deduct clusters used in composite cache /
1155	sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1156	break;
1157
1158	default:
1159	break;
1160	}
1161	}
1162	}
1163
1164	static int
1165	mb_stat_sysctl SYSCTL_HANDLER_ARGS
1166	{
1167	#pragma unused(oidp, arg1, arg2)
1168	void *statp;
1169	int k, statsz, proc64 = proc_is64bit(req->p);
1170
1171	lck_mtx_lock(mbuf_mlock);
1172	mbuf_stat_sync();
1173
1174	if (!proc64) {
1175	struct omb_class_stat *oc;
1176	struct mb_class_stat *c;
1177
1178	omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1179	oc = &omb_stat->mbs_class[`0`];
1180	c = &mb_stat->mbs_class[`0`];
1181	for (k = `0`; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1182	(void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1183	"%s", c->mbcl_cname);
1184	oc->mbcl_size = c->mbcl_size;
1185	oc->mbcl_total = c->mbcl_total;
1186	oc->mbcl_active = c->mbcl_active;
1187	oc->mbcl_infree = c->mbcl_infree;
1188	oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1189	oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1190	oc->mbcl_free_cnt = c->mbcl_free_cnt;
1191	oc->mbcl_notified = c->mbcl_notified;
1192	oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1193	oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1194	oc->mbcl_ctotal = c->mbcl_ctotal;
1195	oc->mbcl_release_cnt = c->mbcl_release_cnt;
1196	oc->mbcl_mc_state = c->mbcl_mc_state;
1197	oc->mbcl_mc_cached = c->mbcl_mc_cached;
1198	oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1199	oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1200	oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1201	}
1202	statp = omb_stat;
1203	statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1204	} else {
1205	statp = mb_stat;
1206	statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1207	}
1208
1209	lck_mtx_unlock(mbuf_mlock);
1210
1211	return (SYSCTL_OUT(req, statp, statsz));
1212	}
1213
1214	static int
1215	mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1216	{
1217	#pragma unused(oidp, arg1, arg2)
1218	int i;
1219
1220	/ Ensure leak tracing turned on /
1221	if (!mclfindleak \|\| !mclexpleak)
1222	return (ENXIO);
1223
1224	lck_mtx_lock(mleak_lock);
1225	mleak_update_stats();
1226	i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1227	lck_mtx_unlock(mleak_lock);
1228
1229	return (i);
1230	}
1231
1232	static int
1233	mleak_table_sysctl SYSCTL_HANDLER_ARGS
1234	{
1235	#pragma unused(oidp, arg1, arg2)
1236	int i = `0`;
1237
1238	/ Ensure leak tracing turned on /
1239	if (!mclfindleak \|\| !mclexpleak)
1240	return (ENXIO);
1241
1242	lck_mtx_lock(mleak_lock);
1243	i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1244	lck_mtx_unlock(mleak_lock);
1245
1246	return (i);
1247	}
1248
1249	static inline void
1250	m_incref(struct mbuf *m)
1251	{
1252	UInt16 old, new;
1253	volatile UInt16 addr = (volatile* UInt16 *)&MEXT_REF(m);
1254
1255	do {
1256	old = *addr;
1257	new = old + `1`;
1258	ASSERT(new != `0`);
1259	} while (!OSCompareAndSwap16(old, new, addr));
1260
1261	/*
1262	* If cluster is shared, mark it with (sticky) EXTF_READONLY;
1263	* we don't clear the flag when the refcount goes back to the
1264	* minimum, to simplify code calling m_mclhasreference().
1265	*/
1266	if (new > (MEXT_MINREF(m) + `1`) && !(MEXT_FLAGS(m) & EXTF_READONLY))
1267	(void) OSBitOrAtomic16(EXTF_READONLY, &MEXT_FLAGS(m));
1268	}
1269
1270	static inline u_int16_t
1271	m_decref(struct mbuf *m)
1272	{
1273	UInt16 old, new;
1274	volatile UInt16 addr = (volatile* UInt16 *)&MEXT_REF(m);
1275
1276	do {
1277	old = *addr;
1278	new = old - `1`;
1279	ASSERT(old != `0`);
1280	} while (!OSCompareAndSwap16(old, new, addr));
1281
1282	return (new);
1283	}
1284
1285	static void
1286	mbuf_table_init(void)
1287	{
1288	unsigned int b, c, s;
1289	int m, config_mbuf_jumbo = `0`;
1290
1291	MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1292	M_TEMP, M_WAITOK \| M_ZERO);
1293	VERIFY(omb_stat != NULL);
1294
1295	MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1296	M_TEMP, M_WAITOK \| M_ZERO);
1297	VERIFY(mb_stat != NULL);
1298
1299	mb_stat->mbs_cnt = NELEM(mbuf_table);
1300	for (m = `0`; m < NELEM(mbuf_table); m++)
1301	mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1302
1303	#if CONFIG_MBUF_JUMBO
1304	config_mbuf_jumbo = `1`;
1305	#endif /* CONFIG_MBUF_JUMBO */
1306
1307	if (config_mbuf_jumbo == `1` \|\| PAGE_SIZE == M16KCLBYTES) {
1308	/*
1309	* Set aside 1/3 of the mbuf cluster map for jumbo
1310	* clusters; we do this only on platforms where jumbo
1311	* cluster pool is enabled.
1312	*/
1313	njcl = nmbclusters / `3`;
1314	njclbytes = M16KCLBYTES;
1315	}
1316
1317	/*
1318	* nclusters holds both the 2KB and 4KB pools, so ensure it's
1319	* a multiple of 4KB clusters.
1320	*/
1321	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
1322	if (njcl > `0`) {
1323	/*
1324	* Each jumbo cluster takes 8 2KB clusters, so make
1325	* sure that the pool size is evenly divisible by 8;
1326	* njcl is in 2KB unit, hence treated as such.
1327	*/
1328	njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);
1329
1330	/ Update nclusters with rounded down value of njcl /
1331	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
1332	}
1333
1334	/*
1335	* njcl is valid only on platforms with 16KB jumbo clusters or
1336	* with 16KB pages, where it is configured to 1/3 of the pool
1337	* size. On these platforms, the remaining is used for 2KB
1338	* and 4KB clusters. On platforms without 16KB jumbo clusters,
1339	* the entire pool is used for both 2KB and 4KB clusters. A 4KB
1340	* cluster can either be splitted into 16 mbufs, or into 2 2KB
1341	* clusters.
1342	*
1343	* +---+---+------------ ... -----------+------- ... -------+
1344	* \| c \| b \| s \| njcl \|
1345	* +---+---+------------ ... -----------+------- ... -------+
1346	*
1347	* 1/32th of the shared region is reserved for pure 2KB and 4KB
1348	* clusters (1/64th each.)
1349	*/
1350	c = P2ROUNDDOWN((nclusters >> `6`), NCLPG); / in 2KB unit /
1351	b = P2ROUNDDOWN((nclusters >> (`6` + NCLPBGSHIFT)), NBCLPG); / in 4KB unit /
1352	s = nclusters - (c + (b << NCLPBGSHIFT)); / in 2KB unit /
1353
1354	/*
1355	* 1/64th (c) is reserved for 2KB clusters.
1356	*/
1357	m_minlimit(MC_CL) = c;
1358	m_maxlimit(MC_CL) = s + c; / in 2KB unit /
1359	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1360	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1361
1362	/*
1363	* Another 1/64th (b) of the map is reserved for 4KB clusters.
1364	* It cannot be turned into 2KB clusters or mbufs.
1365	*/
1366	m_minlimit(MC_BIGCL) = b;
1367	m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; / in 4KB unit /
1368	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1369	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1370
1371	/*
1372	* The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
1373	*/
1374	m_minlimit(MC_MBUF) = `0`;
1375	m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT); / in mbuf unit /
1376	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1377	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1378
1379	/*
1380	* Set limits for the composite classes.
1381	*/
1382	m_minlimit(MC_MBUF_CL) = `0`;
1383	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1384	m_maxsize(MC_MBUF_CL) = MCLBYTES;
1385	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1386	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1387
1388	m_minlimit(MC_MBUF_BIGCL) = `0`;
1389	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1390	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1391	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1392	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1393
1394	/*
1395	* And for jumbo classes.
1396	*/
1397	m_minlimit(MC_16KCL) = `0`;
1398	m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); / in 16KB unit /
1399	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1400	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1401
1402	m_minlimit(MC_MBUF_16KCL) = `0`;
1403	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1404	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1405	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1406	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1407
1408	/*
1409	* Initialize the legacy mbstat structure.
1410	*/
1411	bzero(&mbstat, sizeof (mbstat));
1412	mbstat.m_msize = m_maxsize(MC_MBUF);
1413	mbstat.m_mclbytes = m_maxsize(MC_CL);
1414	mbstat.m_minclsize = MINCLSIZE;
1415	mbstat.m_mlen = MLEN;
1416	mbstat.m_mhlen = MHLEN;
1417	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1418	}
1419
1420	#if defined(__LP64__)
1421	typedef struct ncl_tbl {
1422	uint64_t nt_maxmem; / memory (sane) size /
1423	uint32_t nt_mbpool; / mbuf pool size /
1424	} ncl_tbl_t;
1425
1426	/ Non-server /
1427	static ncl_tbl_t ncl_table[] = {
1428	{ (`1ULL` << GBSHIFT) / 1 GB /, (`64` << MBSHIFT) / 64 MB / },
1429	{ (`1ULL` << (GBSHIFT + `3`)) / 8 GB /, (`96` << MBSHIFT) / 96 MB / },
1430	{ (`1ULL` << (GBSHIFT + `4`)) / 16 GB /, (`128` << MBSHIFT) / 128 MB / },
1431	{ `0`, `0` }
1432	};
1433
1434	/ Server /
1435	static ncl_tbl_t ncl_table_srv[] = {
1436	{ (`1ULL` << GBSHIFT) / 1 GB /, (`96` << MBSHIFT) / 96 MB / },
1437	{ (`1ULL` << (GBSHIFT + `2`)) / 4 GB /, (`128` << MBSHIFT) / 128 MB / },
1438	{ (`1ULL` << (GBSHIFT + `3`)) / 8 GB /, (`160` << MBSHIFT) / 160 MB / },
1439	{ (`1ULL` << (GBSHIFT + `4`)) / 16 GB /, (`192` << MBSHIFT) / 192 MB / },
1440	{ (`1ULL` << (GBSHIFT + `5`)) / 32 GB /, (`256` << MBSHIFT) / 256 MB / },
1441	{ (`1ULL` << (GBSHIFT + `6`)) / 64 GB /, (`384` << MBSHIFT) / 384 MB / },
1442	{ `0`, `0` }
1443	};
1444	#endif /* __LP64__ */
1445
1446	__private_extern__ unsigned int
1447	mbuf_default_ncl(int server, uint64_t mem)
1448	{
1449	#if !defined(__LP64__)
1450	#pragma unused(server)
1451	unsigned int n;
1452	/*
1453	* 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1454	*/
1455	if ((n = ((mem / `16`) / MCLBYTES)) > `32768`)
1456	n = `32768`;
1457	#else
1458	unsigned int n, i;
1459	ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1460	/*
1461	* 64-bit kernel (mbuf pool size based on table).
1462	*/
1463	n = tbl[`0`].nt_mbpool;
1464	for (i = `0`; tbl[i].nt_mbpool != `0`; i++) {
1465	if (mem < tbl[i].nt_maxmem)
1466	break;
1467	n = tbl[i].nt_mbpool;
1468	}
1469	n >>= MCLSHIFT;
1470	#endif /* !__LP64__ */
1471	return (n);
1472	}
1473
1474	__private_extern__ void
1475	mbinit(void)
1476	{
1477	unsigned int m;
1478	unsigned int initmcl = `0`;
1479	void *buf;
1480	thread_t thread = THREAD_NULL;
1481
1482	microuptime(&mb_start);
1483
1484	/*
1485	* These MBUF_ values must be equal to their private counterparts.
1486	*/
1487	_CASSERT(MBUF_EXT == M_EXT);
1488	_CASSERT(MBUF_PKTHDR == M_PKTHDR);
1489	_CASSERT(MBUF_EOR == M_EOR);
1490	_CASSERT(MBUF_LOOP == M_LOOP);
1491	_CASSERT(MBUF_BCAST == M_BCAST);
1492	_CASSERT(MBUF_MCAST == M_MCAST);
1493	_CASSERT(MBUF_FRAG == M_FRAG);
1494	_CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1495	_CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1496	_CASSERT(MBUF_PROMISC == M_PROMISC);
1497	_CASSERT(MBUF_HASFCS == M_HASFCS);
1498
1499	_CASSERT(MBUF_TYPE_FREE == MT_FREE);
1500	_CASSERT(MBUF_TYPE_DATA == MT_DATA);
1501	_CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1502	_CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1503	_CASSERT(MBUF_TYPE_PCB == MT_PCB);
1504	_CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1505	_CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1506	_CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1507	_CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1508	_CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1509	_CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1510	_CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1511	_CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1512	_CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1513	_CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1514
1515	_CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1516	_CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1517	_CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
1518	_CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1519	_CASSERT(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
1520	_CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1521	_CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1522	_CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1523	_CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1524	_CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1525	_CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1526	_CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1527	_CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1528	_CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1529
1530	_CASSERT(MBUF_WAITOK == M_WAIT);
1531	_CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1532	_CASSERT(MBUF_COPYALL == M_COPYALL);
1533
1534	_CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1535	_CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1536	_CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1537	_CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1538	_CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1539	_CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1540	_CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1541	_CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1542	_CASSERT(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI);
1543	_CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1544	_CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1545
1546	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1547	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1548	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1549	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1550
1551	/ Module specific scratch space (32-bit alignment requirement) /
1552	_CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
1553	sizeof (uint32_t)));
1554
1555	/ Initialize random red zone cookie value /
1556	_CASSERT(sizeof (mb_redzone_cookie) ==
1557	sizeof (((struct pkthdr *)`0`)->redzone));
1558	read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));
1559	read_random(&mb_obscure_extref, sizeof (mb_obscure_extref));
1560	read_random(&mb_obscure_extfree, sizeof (mb_obscure_extfree));
1561	mb_obscure_extref \|= `0x3`;
1562	mb_obscure_extfree \|= `0x3`;
1563
1564	/ Make sure we don't save more than we should /
1565	_CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));
1566
1567	if (nmbclusters == `0`)
1568	nmbclusters = NMBCLUSTERS;
1569
1570	/ This should be a sane (at least even) value by now /
1571	VERIFY(nmbclusters != `0` && !(nmbclusters & `0x1`));
1572
1573	/ Setup the mbuf table /
1574	mbuf_table_init();
1575
1576	/ Global lock for common layer /
1577	mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1578	mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1579	mbuf_mlock_attr = lck_attr_alloc_init();
1580	lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1581
1582	/*
1583	* Allocate cluster slabs table:
1584	*
1585	* maxslabgrp = (N * 2048) / (1024 * 1024)
1586	*
1587	* Where N is nmbclusters rounded up to the nearest 512. This yields
1588	* mcl_slab_g_t units, each one representing a MB of memory.
1589	*/
1590	maxslabgrp =
1591	(P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
1592	MALLOC(slabstbl, mcl_slabg_t *, maxslabgrp sizeof (mcl_slabg_t *),
1593	M_TEMP, M_WAITOK \| M_ZERO);
1594	VERIFY(slabstbl != NULL);
1595
1596	/*
1597	* Allocate audit structures, if needed:
1598	*
1599	* maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
1600	*
1601	* This yields mcl_audit_t units, each one representing a page.
1602	*/
1603	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1604	mbuf_debug \|= mcache_getflags();
1605	if (mbuf_debug & MCF_DEBUG) {
1606	int l;
1607	mcl_audit_t *mclad;
1608	maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
1609	MALLOC(mclaudit, mcl_audit_t , maxclaudit sizeof (*mclaudit),
1610	M_TEMP, M_WAITOK \| M_ZERO);
1611	VERIFY(mclaudit != NULL);
1612	for (l = `0`, mclad = mclaudit; l < maxclaudit; l++) {
1613	MALLOC(mclad[l].cl_audit, mcache_audit_t **,
1614	NMBPG * sizeof(mcache_audit_t *),
1615	M_TEMP, M_WAITOK \| M_ZERO);
1616	VERIFY(mclad[l].cl_audit != NULL);
1617	}
1618
1619	mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1620	AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), `0`, MCR_SLEEP);
1621	VERIFY(mcl_audit_con_cache != NULL);
1622	}
1623	mclverify = (mbuf_debug & MCF_VERIFY);
1624	mcltrace = (mbuf_debug & MCF_TRACE);
1625	mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1626	mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1627
1628	/ Enable mbuf leak logging, with a lock to protect the tables /
1629
1630	mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1631	mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1632	mleak_lock_attr = lck_attr_alloc_init();
1633	lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1634
1635	mleak_activate();
1636
1637	/*
1638	* Allocate structure for per-CPU statistics that's aligned
1639	* on the CPU cache boundary; this code assumes that we never
1640	* uninitialize this framework, since the original address
1641	* before alignment is not saved.
1642	*/
1643	ncpu = ml_get_max_cpus();
1644	MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
1645	M_TEMP, M_WAITOK);
1646	VERIFY(buf != NULL);
1647
1648	mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
1649	CPU_CACHE_LINE_SIZE);
1650	bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1651
1652	/ Calculate the number of pages assigned to the cluster pool /
1653	mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
1654	MALLOC(mcl_paddr, ppnum_t , mcl_pages sizeof (ppnum_t),
1655	M_TEMP, M_WAITOK);
1656	VERIFY(mcl_paddr != NULL);
1657
1658	/ Register with the I/O Bus mapper /
1659	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1660	bzero((char )mcl_paddr, mcl_pages sizeof (ppnum_t));
1661
1662	embutl = (mbutl + (nmbclusters * MCLBYTES));
1663	VERIFY(((embutl - mbutl) % MBIGCLBYTES) == `0`);
1664
1665	/ Prime up the freelist /
1666	PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1667	if (initmcl != `0`) {
1668	initmcl >>= NCLPBGSHIFT; / become a 4K unit /
1669	if (initmcl > m_maxlimit(MC_BIGCL))
1670	initmcl = m_maxlimit(MC_BIGCL);
1671	}
1672	if (initmcl < m_minlimit(MC_BIGCL))
1673	initmcl = m_minlimit(MC_BIGCL);
1674
1675	lck_mtx_lock(mbuf_mlock);
1676
1677	/*
1678	* For classes with non-zero minimum limits, populate their freelists
1679	* so that m_total(class) is at least m_minlimit(class).
1680	*/
1681	VERIFY(m_total(MC_BIGCL) == `0` && m_minlimit(MC_BIGCL) != `0`);
1682	freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1683	VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1684	freelist_init(m_class(MC_CL));
1685
1686	for (m = `0`; m < NELEM(mbuf_table); m++) {
1687	/ Make sure we didn't miss any /
1688	VERIFY(m_minlimit(m_class(m)) == `0` \|\|
1689	m_total(m_class(m)) >= m_minlimit(m_class(m)));
1690
1691	/ populate the initial sizes and report from there on /
1692	m_peak(m_class(m)) = m_total(m_class(m));
1693	}
1694	mb_peak_newreport = FALSE;
1695
1696	lck_mtx_unlock(mbuf_mlock);
1697
1698	(void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1699	NULL, &thread);
1700	thread_deallocate(thread);
1701
1702	ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1703	`0`, `0`, MCR_SLEEP);
1704
1705	/ Create the cache for each class /
1706	for (m = `0`; m < NELEM(mbuf_table); m++) {
1707	void allocfunc, freefunc, auditfunc, logfunc;
1708	u_int32_t flags;
1709
1710	flags = mbuf_debug;
1711	if (m_class(m) == MC_MBUF_CL \|\| m_class(m) == MC_MBUF_BIGCL \|\|
1712	m_class(m) == MC_MBUF_16KCL) {
1713	allocfunc = mbuf_cslab_alloc;
1714	freefunc = mbuf_cslab_free;
1715	auditfunc = mbuf_cslab_audit;
1716	logfunc = mleak_logger;
1717	} else {
1718	allocfunc = mbuf_slab_alloc;
1719	freefunc = mbuf_slab_free;
1720	auditfunc = mbuf_slab_audit;
1721	logfunc = mleak_logger;
1722	}
1723
1724	/*
1725	* Disable per-CPU caches for jumbo classes if there
1726	* is no jumbo cluster pool available in the system.
1727	* The cache itself is still created (but will never
1728	* be populated) since it simplifies the code.
1729	*/
1730	if ((m_class(m) == MC_MBUF_16KCL \|\| m_class(m) == MC_16KCL) &&
1731	njcl == `0`)
1732	flags \|= MCF_NOCPUCACHE;
1733
1734	if (!mclfindleak)
1735	flags \|= MCF_NOLEAKLOG;
1736
1737	m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1738	allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1739	(void *)(uintptr_t)m, flags, MCR_SLEEP);
1740	}
1741
1742	/*
1743	* Set the max limit on sb_max to be 1/16 th of the size of
1744	* memory allocated for mbuf clusters.
1745	*/
1746	high_sb_max = (nmbclusters << (MCLSHIFT - `4`));
1747	if (high_sb_max < sb_max) {
1748	/ sb_max is too large for this configuration, scale it down /
1749	if (high_sb_max > (`1` << MBSHIFT)) {
1750	/ We have atleast 16 M of mbuf pool /
1751	sb_max = high_sb_max;
1752	} else if ((nmbclusters << MCLSHIFT) > (`1` << MBSHIFT)) {
1753	/*
1754	* If we have more than 1M of mbufpool, cap the size of
1755	* max sock buf at 1M
1756	*/
1757	sb_max = high_sb_max = (`1` << MBSHIFT);
1758	} else {
1759	sb_max = high_sb_max;
1760	}
1761	}
1762
1763	/ allocate space for mbuf_dump_buf /
1764	MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1765	VERIFY(mbuf_dump_buf != NULL);
1766
1767	if (mbuf_debug & MCF_DEBUG) {
1768	printf("%s: MLEN %d, MHLEN %d\n", __func__,
1769	(int)_MLEN, (int)_MHLEN);
1770	}
1771
1772	printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
1773	(nmbclusters << MCLSHIFT) >> MBSHIFT,
1774	(nclusters << MCLSHIFT) >> MBSHIFT,
1775	(njcl << MCLSHIFT) >> MBSHIFT);
1776
1777	/ initialize lock form tx completion callback table /
1778	mbuf_tx_compl_tbl_lck_grp_attr = lck_grp_attr_alloc_init();
1779	if (mbuf_tx_compl_tbl_lck_grp_attr == NULL) {
1780	panic("%s: lck_grp_attr_alloc_init failed", __func__);
1781	/ NOTREACHED /
1782	}
1783	mbuf_tx_compl_tbl_lck_grp = lck_grp_alloc_init("mbuf_tx_compl_tbl",
1784	mbuf_tx_compl_tbl_lck_grp_attr);
1785	if (mbuf_tx_compl_tbl_lck_grp == NULL) {
1786	panic("%s: lck_grp_alloc_init failed", __func__);
1787	/ NOTREACHED /
1788	}
1789	mbuf_tx_compl_tbl_lck_attr = lck_attr_alloc_init();
1790	if (mbuf_tx_compl_tbl_lck_attr == NULL) {
1791	panic("%s: lck_attr_alloc_init failed", __func__);
1792	/ NOTREACHED /
1793	}
1794	lck_rw_init(mbuf_tx_compl_tbl_lock, mbuf_tx_compl_tbl_lck_grp,
1795	mbuf_tx_compl_tbl_lck_attr);
1796
1797	}
1798
1799	/*
1800	* Obtain a slab of object(s) from the class's freelist.
1801	*/
1802	static mcache_obj_t *
1803	slab_alloc(mbuf_class_t class, int wait)
1804	{
1805	mcl_slab_t *sp;
1806	mcache_obj_t *buf;
1807
1808	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1809
1810	/ This should always be NULL for us /
1811	VERIFY(m_cobjlist(class) == NULL);
1812
1813	/*
1814	* Treat composite objects as having longer lifespan by using
1815	* a slab from the reverse direction, in hoping that this could
1816	* reduce the probability of fragmentation for slabs that hold
1817	* more than one buffer chunks (e.g. mbuf slabs). For other
1818	* slabs, this probably doesn't make much of a difference.
1819	*/
1820	if ((class == MC_MBUF \|\| class == MC_CL \|\| class == MC_BIGCL)
1821	&& (wait & MCR_COMP))
1822	sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1823	else
1824	sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1825
1826	if (sp == NULL) {
1827	VERIFY(m_infree(class) == `0` && m_slab_cnt(class) == `0`);
1828	/ The slab list for this class is empty /
1829	return (NULL);
1830	}
1831
1832	VERIFY(m_infree(class) > `0`);
1833	VERIFY(!slab_is_detached(sp));
1834	VERIFY(sp->sl_class == class &&
1835	(sp->sl_flags & (SLF_MAPPED \| SLF_PARTIAL)) == SLF_MAPPED);
1836	buf = sp->sl_head;
1837	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1838	sp->sl_head = buf->obj_next;
1839	/ Increment slab reference /
1840	sp->sl_refcnt++;
1841
1842	VERIFY(sp->sl_head != NULL \|\| sp->sl_refcnt == sp->sl_chunks);
1843
1844	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1845	slab_nextptr_panic(sp, sp->sl_head);
1846	/ In case sl_head is in the map but not in the slab /
1847	VERIFY(slab_inrange(sp, sp->sl_head));
1848	/ NOTREACHED /
1849	}
1850
1851	if (mclaudit != NULL) {
1852	mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1853	mca->mca_uflags = `0`;
1854	/ Save contents on mbuf objects only /
1855	if (class == MC_MBUF)
1856	mca->mca_uflags \|= MB_SCVALID;
1857	}
1858
1859	if (class == MC_CL) {
1860	mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1861	/*
1862	* A 2K cluster slab can have at most NCLPG references.
1863	*/
1864	VERIFY(sp->sl_refcnt >= `1` && sp->sl_refcnt <= NCLPG &&
1865	sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
1866	VERIFY(sp->sl_refcnt < NCLPG \|\| sp->sl_head == NULL);
1867	} else if (class == MC_BIGCL) {
1868	mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1869	m_infree(MC_MBUF_BIGCL);
1870	/*
1871	* A 4K cluster slab can have NBCLPG references.
1872	*/
1873	VERIFY(sp->sl_refcnt >= `1` && sp->sl_chunks == NBCLPG &&
1874	sp->sl_len == PAGE_SIZE &&
1875	(sp->sl_refcnt < NBCLPG \|\| sp->sl_head == NULL));
1876	} else if (class == MC_16KCL) {
1877	mcl_slab_t *nsp;
1878	int k;
1879
1880	--m_infree(MC_16KCL);
1881	VERIFY(sp->sl_refcnt == `1` && sp->sl_chunks == `1` &&
1882	sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1883	/*
1884	* Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1885	* A 16KB big cluster takes NSLABSP16KB slabs, each having at
1886	* most 1 reference.
1887	*/
1888	for (nsp = sp, k = `1`; k < NSLABSP16KB; k++) {
1889	nsp = nsp->sl_next;
1890	/ Next slab must already be present /
1891	VERIFY(nsp != NULL);
1892	nsp->sl_refcnt++;
1893	VERIFY(!slab_is_detached(nsp));
1894	VERIFY(nsp->sl_class == MC_16KCL &&
1895	nsp->sl_flags == (SLF_MAPPED \| SLF_PARTIAL) &&
1896	nsp->sl_refcnt == `1` && nsp->sl_chunks == `0` &&
1897	nsp->sl_len == `0` && nsp->sl_base == sp->sl_base &&
1898	nsp->sl_head == NULL);
1899	}
1900	} else {
1901	VERIFY(class == MC_MBUF);
1902	--m_infree(MC_MBUF);
1903	/*
1904	* If auditing is turned on, this check is
1905	* deferred until later in mbuf_slab_audit().
1906	*/
1907	if (mclaudit == NULL)
1908	_MCHECK((struct mbuf *)buf);
1909	/*
1910	* Since we have incremented the reference count above,
1911	* an mbuf slab (formerly a 4KB cluster slab that was cut
1912	* up into mbufs) must have a reference count between 1
1913	* and NMBPG at this point.
1914	*/
1915	VERIFY(sp->sl_refcnt >= `1` && sp->sl_refcnt <= NMBPG &&
1916	sp->sl_chunks == NMBPG &&
1917	sp->sl_len == PAGE_SIZE);
1918	VERIFY(sp->sl_refcnt < NMBPG \|\| sp->sl_head == NULL);
1919	}
1920
1921	/ If empty, remove this slab from the class's freelist /
1922	if (sp->sl_head == NULL) {
1923	VERIFY(class != MC_MBUF \|\| sp->sl_refcnt == NMBPG);
1924	VERIFY(class != MC_CL \|\| sp->sl_refcnt == NCLPG);
1925	VERIFY(class != MC_BIGCL \|\| sp->sl_refcnt == NBCLPG);
1926	slab_remove(sp, class);
1927	}
1928
1929	return (buf);
1930	}
1931
1932	/*
1933	* Place a slab of object(s) back into a class's slab list.
1934	*/
1935	static void
1936	slab_free(mbuf_class_t class, mcache_obj_t *buf)
1937	{
1938	mcl_slab_t *sp;
1939	boolean_t reinit_supercl = false;
1940	mbuf_class_t super_class;
1941
1942	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1943
1944	VERIFY(class != MC_16KCL \|\| njcl > `0`);
1945	VERIFY(buf->obj_next == NULL);
1946
1947	/*
1948	* Synchronizing with m_clalloc, as it reads m_total, while we here
1949	* are modifying m_total.
1950	*/
1951	while (mb_clalloc_busy) {
1952	mb_clalloc_waiters++;
1953	(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
1954	(PZERO-`1`), "m_clalloc", NULL);
1955	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1956	}
1957
1958	/ We are busy now; tell everyone else to go away /
1959	mb_clalloc_busy = TRUE;
1960
1961	sp = slab_get(buf);
1962	VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1963	(sp->sl_flags & (SLF_MAPPED \| SLF_PARTIAL)) == SLF_MAPPED);
1964
1965	/ Decrement slab reference /
1966	sp->sl_refcnt--;
1967
1968	if (class == MC_CL) {
1969	VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1970	/*
1971	* A slab that has been splitted for 2KB clusters can have
1972	* at most 1 outstanding reference at this point.
1973	*/
1974	VERIFY(sp->sl_refcnt >= `0` && sp->sl_refcnt <= (NCLPG - `1`) &&
1975	sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
1976	VERIFY(sp->sl_refcnt < (NCLPG - `1`) \|\|
1977	(slab_is_detached(sp) && sp->sl_head == NULL));
1978	} else if (class == MC_BIGCL) {
1979	VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1980
1981	/ A 4KB cluster slab can have NBCLPG references at most /
1982	VERIFY(sp->sl_refcnt >= `0` && sp->sl_chunks == NBCLPG);
1983	VERIFY(sp->sl_refcnt < (NBCLPG - `1`) \|\|
1984	(slab_is_detached(sp) && sp->sl_head == NULL));
1985	} else if (class == MC_16KCL) {
1986	mcl_slab_t *nsp;
1987	int k;
1988	/*
1989	* A 16KB cluster takes NSLABSP16KB slabs, all must
1990	* now have 0 reference.
1991	*/
1992	VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
1993	VERIFY(sp->sl_refcnt == `0` && sp->sl_chunks == `1` &&
1994	sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1995	VERIFY(slab_is_detached(sp));
1996	for (nsp = sp, k = `1`; k < NSLABSP16KB; k++) {
1997	nsp = nsp->sl_next;
1998	/ Next slab must already be present /
1999	VERIFY(nsp != NULL);
2000	nsp->sl_refcnt--;
2001	VERIFY(slab_is_detached(nsp));
2002	VERIFY(nsp->sl_class == MC_16KCL &&
2003	(nsp->sl_flags & (SLF_MAPPED \| SLF_PARTIAL)) &&
2004	nsp->sl_refcnt == `0` && nsp->sl_chunks == `0` &&
2005	nsp->sl_len == `0` && nsp->sl_base == sp->sl_base &&
2006	nsp->sl_head == NULL);
2007	}
2008	} else {
2009	/*
2010	* A slab that has been splitted for mbufs has at most
2011	* NMBPG reference counts. Since we have decremented
2012	* one reference above, it must now be between 0 and
2013	* NMBPG-1.
2014	*/
2015	VERIFY(class == MC_MBUF);
2016	VERIFY(sp->sl_refcnt >= `0` &&
2017	sp->sl_refcnt <= (NMBPG - `1`) &&
2018	sp->sl_chunks == NMBPG &&
2019	sp->sl_len == PAGE_SIZE);
2020	VERIFY(sp->sl_refcnt < (NMBPG - `1`) \|\|
2021	(slab_is_detached(sp) && sp->sl_head == NULL));
2022	}
2023
2024	/*
2025	* When auditing is enabled, ensure that the buffer still
2026	* contains the free pattern. Otherwise it got corrupted
2027	* while at the CPU cache layer.
2028	*/
2029	if (mclaudit != NULL) {
2030	mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
2031	if (mclverify) {
2032	mcache_audit_free_verify(mca, buf, `0`,
2033	m_maxsize(class));
2034	}
2035	mca->mca_uflags &= ~MB_SCVALID;
2036	}
2037
2038	if (class == MC_CL) {
2039	mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
2040	buf->obj_next = sp->sl_head;
2041	} else if (class == MC_BIGCL) {
2042	mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
2043	m_infree(MC_MBUF_BIGCL);
2044	buf->obj_next = sp->sl_head;
2045	} else if (class == MC_16KCL) {
2046	++m_infree(MC_16KCL);
2047	} else {
2048	++m_infree(MC_MBUF);
2049	buf->obj_next = sp->sl_head;
2050	}
2051	sp->sl_head = buf;
2052
2053	/*
2054	* If a slab has been split to either one which holds 2KB clusters,
2055	* or one which holds mbufs, turn it back to one which holds a
2056	* 4 or 16 KB cluster depending on the page size.
2057	*/
2058	if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
2059	super_class = MC_BIGCL;
2060	} else {
2061	VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
2062	super_class = MC_16KCL;
2063	}
2064	if (class == MC_MBUF && sp->sl_refcnt == `0` &&
2065	m_total(class) >= (m_minlimit(class) + NMBPG) &&
2066	m_total(super_class) < m_maxlimit(super_class)) {
2067	int i = NMBPG;
2068
2069	m_total(MC_MBUF) -= NMBPG;
2070	mbstat.m_mbufs = m_total(MC_MBUF);
2071	m_infree(MC_MBUF) -= NMBPG;
2072	mtype_stat_add(MT_FREE, -((unsigned)NMBPG));
2073
2074	while (i--) {
2075	struct mbuf *m = sp->sl_head;
2076	VERIFY(m != NULL);
2077	sp->sl_head = m->m_next;
2078	m->m_next = NULL;
2079	}
2080	reinit_supercl = true;
2081	} else if (class == MC_CL && sp->sl_refcnt == `0` &&
2082	m_total(class) >= (m_minlimit(class) + NCLPG) &&
2083	m_total(super_class) < m_maxlimit(super_class)) {
2084	int i = NCLPG;
2085
2086	m_total(MC_CL) -= NCLPG;
2087	mbstat.m_clusters = m_total(MC_CL);
2088	m_infree(MC_CL) -= NCLPG;
2089
2090	while (i--) {
2091	union mcluster *c = sp->sl_head;
2092	VERIFY(c != NULL);
2093	sp->sl_head = c->mcl_next;
2094	c->mcl_next = NULL;
2095	}
2096	reinit_supercl = true;
2097	} else if (class == MC_BIGCL && super_class != MC_BIGCL &&
2098	sp->sl_refcnt == `0` &&
2099	m_total(class) >= (m_minlimit(class) + NBCLPG) &&
2100	m_total(super_class) < m_maxlimit(super_class)) {
2101	int i = NBCLPG;
2102
2103	VERIFY(super_class == MC_16KCL);
2104	m_total(MC_BIGCL) -= NBCLPG;
2105	mbstat.m_bigclusters = m_total(MC_BIGCL);
2106	m_infree(MC_BIGCL) -= NBCLPG;
2107
2108	while (i--) {
2109	union mbigcluster *bc = sp->sl_head;
2110	VERIFY(bc != NULL);
2111	sp->sl_head = bc->mbc_next;
2112	bc->mbc_next = NULL;
2113	}
2114	reinit_supercl = true;
2115	}
2116
2117	if (reinit_supercl) {
2118	VERIFY(sp->sl_head == NULL);
2119	VERIFY(m_total(class) >= m_minlimit(class));
2120	slab_remove(sp, class);
2121
2122	/ Reinitialize it as a cluster for the super class /
2123	m_total(super_class)++;
2124	m_infree(super_class)++;
2125	VERIFY(sp->sl_flags == (SLF_MAPPED \| SLF_DETACHED) &&
2126	sp->sl_len == PAGE_SIZE && sp->sl_refcnt == `0`);
2127
2128	slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
2129	sp->sl_base, PAGE_SIZE, `0`, `1`);
2130	if (mclverify)
2131	mcache_set_pattern(MCACHE_FREE_PATTERN,
2132	(caddr_t)sp->sl_base, sp->sl_len);
2133	((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;
2134
2135	if (super_class == MC_BIGCL) {
2136	mbstat.m_bigclusters = m_total(MC_BIGCL);
2137	mbstat.m_bigclfree = m_infree(MC_BIGCL) +
2138	m_infree(MC_MBUF_BIGCL);
2139	}
2140
2141	VERIFY(slab_is_detached(sp));
2142	VERIFY(m_total(super_class) <= m_maxlimit(super_class));
2143
2144	/ And finally switch class /
2145	class = super_class;
2146	}
2147
2148	/ Reinsert the slab to the class's slab list /
2149	if (slab_is_detached(sp))
2150	slab_insert(sp, class);
2151
2152	/ We're done; let others enter /
2153	mb_clalloc_busy = FALSE;
2154	if (mb_clalloc_waiters > `0`) {
2155	mb_clalloc_waiters = `0`;
2156	wakeup(mb_clalloc_waitchan);
2157	}
2158	}
2159
2160	/*
2161	* Common allocator for rudimentary objects called by the CPU cache layer
2162	* during an allocation request whenever there is no available element in the
2163	* bucket layer. It returns one or more elements from the appropriate global
2164	* freelist. If the freelist is empty, it will attempt to populate it and
2165	* retry the allocation.
2166	*/
2167	static unsigned int
2168	mbuf_slab_alloc(void arg, mcache_obj_t *plist, unsigned* int num, int wait)
2169	{
2170	mbuf_class_t class = (mbuf_class_t)arg;
2171	unsigned int need = num;
2172	mcache_obj_t *list = plist;
2173
2174	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2175	ASSERT(need > `0`);
2176
2177	lck_mtx_lock(mbuf_mlock);
2178
2179	for (;;) {
2180	if ((*list = slab_alloc(class, wait)) != NULL) {
2181	(*list)->obj_next = NULL;
2182	list = plist = &(list)->obj_next;
2183
2184	if (--need == `0`) {
2185	/*
2186	* If the number of elements in freelist has
2187	* dropped below low watermark, asynchronously
2188	* populate the freelist now rather than doing
2189	* it later when we run out of elements.
2190	*/
2191	if (!mbuf_cached_above(class, wait) &&
2192	m_infree(class) < (m_total(class) >> `5`)) {
2193	(void) freelist_populate(class, `1`,
2194	M_DONTWAIT);
2195	}
2196	break;
2197	}
2198	} else {
2199	VERIFY(m_infree(class) == `0` \|\| class == MC_CL);
2200
2201	(void) freelist_populate(class, `1`,
2202	(wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2203
2204	if (m_infree(class) > `0`)
2205	continue;
2206
2207	/ Check if there's anything at the cache layer /
2208	if (mbuf_cached_above(class, wait))
2209	break;
2210
2211	/ watchdog checkpoint /
2212	mbuf_watchdog();
2213
2214	/ We have nothing and cannot block; give up /
2215	if (wait & MCR_NOSLEEP) {
2216	if (!(wait & MCR_TRYHARD)) {
2217	m_fail_cnt(class)++;
2218	mbstat.m_drops++;
2219	break;
2220	}
2221	}
2222
2223	/*
2224	* If the freelist is still empty and the caller is
2225	* willing to be blocked, sleep on the wait channel
2226	* until an element is available. Otherwise, if
2227	* MCR_TRYHARD is set, do our best to satisfy the
2228	* request without having to go to sleep.
2229	*/
2230	if (mbuf_worker_ready &&
2231	mbuf_sleep(class, need, wait))
2232	break;
2233
2234	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2235	}
2236	}
2237
2238	m_alloc_cnt(class) += num - need;
2239	lck_mtx_unlock(mbuf_mlock);
2240
2241	return (num - need);
2242	}
2243
2244	/*
2245	* Common de-allocator for rudimentary objects called by the CPU cache
2246	* layer when one or more elements need to be returned to the appropriate
2247	* global freelist.
2248	*/
2249	static void
2250	mbuf_slab_free(void arg, mcache_obj_t list, __unused int purged)
2251	{
2252	mbuf_class_t class = (mbuf_class_t)arg;
2253	mcache_obj_t *nlist;
2254	unsigned int num = `0`;
2255	int w;
2256
2257	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2258
2259	lck_mtx_lock(mbuf_mlock);
2260
2261	for (;;) {
2262	nlist = list->obj_next;
2263	list->obj_next = NULL;
2264	slab_free(class, list);
2265	++num;
2266	if ((list = nlist) == NULL)
2267	break;
2268	}
2269	m_free_cnt(class) += num;
2270
2271	if ((w = mb_waiters) > `0`)
2272	mb_waiters = `0`;
2273	if (w) {
2274	mbwdog_logger("waking up all threads");
2275	}
2276	lck_mtx_unlock(mbuf_mlock);
2277
2278	if (w != `0`)
2279	wakeup(mb_waitchan);
2280	}
2281
2282	/*
2283	* Common auditor for rudimentary objects called by the CPU cache layer
2284	* during an allocation or free request. For the former, this is called
2285	* after the objects are obtained from either the bucket or slab layer
2286	* and before they are returned to the caller. For the latter, this is
2287	* called immediately during free and before placing the objects into
2288	* the bucket or slab layer.
2289	*/
2290	static void
2291	mbuf_slab_audit(void arg, mcache_obj_t list, boolean_t alloc)
2292	{
2293	mbuf_class_t class = (mbuf_class_t)arg;
2294	mcache_audit_t *mca;
2295
2296	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2297
2298	while (list != NULL) {
2299	lck_mtx_lock(mbuf_mlock);
2300	mca = mcl_audit_buf2mca(class, list);
2301
2302	/ Do the sanity checks /
2303	if (class == MC_MBUF) {
2304	mcl_audit_mbuf(mca, list, FALSE, alloc);
2305	ASSERT(mca->mca_uflags & MB_SCVALID);
2306	} else {
2307	mcl_audit_cluster(mca, list, m_maxsize(class),
2308	alloc, TRUE);
2309	ASSERT(!(mca->mca_uflags & MB_SCVALID));
2310	}
2311	/ Record this transaction /
2312	if (mcltrace)
2313	mcache_buffer_log(mca, list, m_cache(class), &mb_start);
2314
2315	if (alloc)
2316	mca->mca_uflags \|= MB_INUSE;
2317	else
2318	mca->mca_uflags &= ~MB_INUSE;
2319	/ Unpair the object (unconditionally) /
2320	mca->mca_uptr = NULL;
2321	lck_mtx_unlock(mbuf_mlock);
2322
2323	list = list->obj_next;
2324	}
2325	}
2326
2327	/*
2328	* Common notify routine for all caches. It is called by mcache when
2329	* one or more objects get freed. We use this indication to trigger
2330	* the wakeup of any sleeping threads so that they can retry their
2331	* allocation requests.
2332	*/
2333	static void
2334	mbuf_slab_notify(void *arg, u_int32_t reason)
2335	{
2336	mbuf_class_t class = (mbuf_class_t)arg;
2337	int w;
2338
2339	ASSERT(MBUF_CLASS_VALID(class));
2340
2341	if (reason != MCN_RETRYALLOC)
2342	return;
2343
2344	lck_mtx_lock(mbuf_mlock);
2345	if ((w = mb_waiters) > `0`) {
2346	m_notified(class)++;
2347	mb_waiters = `0`;
2348	}
2349	if (w) {
2350	mbwdog_logger("waking up all threads");
2351	}
2352	lck_mtx_unlock(mbuf_mlock);
2353
2354	if (w != `0`)
2355	wakeup(mb_waitchan);
2356	}
2357
2358	/*
2359	* Obtain object(s) from the composite class's freelist.
2360	*/
2361	static unsigned int
2362	cslab_alloc(mbuf_class_t class, mcache_obj_t **plist, unsigned* int num)
2363	{
2364	unsigned int need = num;
2365	mcl_slab_t sp, clsp, *nsp;
2366	struct mbuf *m;
2367	mcache_obj_t *list = plist;
2368	void *cl;
2369
2370	VERIFY(need > `0`);
2371	VERIFY(class != MC_MBUF_16KCL \|\| njcl > `0`);
2372	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2373
2374	/ Get what we can from the freelist /
2375	while ((*list = m_cobjlist(class)) != NULL) {
2376	MRANGE(*list);
2377
2378	m = (struct mbuf )list;
2379	sp = slab_get(m);
2380	cl = m->m_ext.ext_buf;
2381	clsp = slab_get(cl);
2382	VERIFY(m->m_flags == M_EXT && cl != NULL);
2383	VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));
2384
2385	if (class == MC_MBUF_CL) {
2386	VERIFY(clsp->sl_refcnt >= `1` &&
2387	clsp->sl_refcnt <= NCLPG);
2388	} else {
2389	VERIFY(clsp->sl_refcnt >= `1` &&
2390	clsp->sl_refcnt <= NBCLPG);
2391	}
2392
2393	if (class == MC_MBUF_16KCL) {
2394	int k;
2395	for (nsp = clsp, k = `1`; k < NSLABSP16KB; k++) {
2396	nsp = nsp->sl_next;
2397	/ Next slab must already be present /
2398	VERIFY(nsp != NULL);
2399	VERIFY(nsp->sl_refcnt == `1`);
2400	}
2401	}
2402
2403	if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2404	!MBUF_IN_MAP(m_cobjlist(class))) {
2405	slab_nextptr_panic(sp, m_cobjlist(class));
2406	/ NOTREACHED /
2407	}
2408	(*list)->obj_next = NULL;
2409	list = plist = &(list)->obj_next;
2410
2411	if (--need == `0`)
2412	break;
2413	}
2414	m_infree(class) -= (num - need);
2415
2416	return (num - need);
2417	}
2418
2419	/*
2420	* Place object(s) back into a composite class's freelist.
2421	*/
2422	static unsigned int
2423	cslab_free(mbuf_class_t class, mcache_obj_t list, int* purged)
2424	{
2425	mcache_obj_t o, tail;
2426	unsigned int num = `0`;
2427	struct mbuf m, ms;
2428	mcache_audit_t *mca = NULL;
2429	mcache_obj_t *ref_list = NULL;
2430	mcl_slab_t clsp, nsp;
2431	void *cl;
2432	mbuf_class_t cl_class;
2433
2434	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2435	VERIFY(class != MC_MBUF_16KCL \|\| njcl > `0`);
2436	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2437
2438	if (class == MC_MBUF_CL) {
2439	cl_class = MC_CL;
2440	} else if (class == MC_MBUF_BIGCL) {
2441	cl_class = MC_BIGCL;
2442	} else {
2443	VERIFY(class == MC_MBUF_16KCL);
2444	cl_class = MC_16KCL;
2445	}
2446
2447	o = tail = list;
2448
2449	while ((m = ms = (struct mbuf *)o) != NULL) {
2450	mcache_obj_t rfa, nexto = o->obj_next;
2451
2452	/ Do the mbuf sanity checks /
2453	if (mclaudit != NULL) {
2454	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2455	if (mclverify) {
2456	mcache_audit_free_verify(mca, m, `0`,
2457	m_maxsize(MC_MBUF));
2458	}
2459	ms = MCA_SAVED_MBUF_PTR(mca);
2460	}
2461
2462	/ Do the cluster sanity checks /
2463	cl = ms->m_ext.ext_buf;
2464	clsp = slab_get(cl);
2465	if (mclverify) {
2466	size_t size = m_maxsize(cl_class);
2467	mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2468	(mcache_obj_t *)cl), cl, `0`, size);
2469	}
2470	VERIFY(ms->m_type == MT_FREE);
2471	VERIFY(ms->m_flags == M_EXT);
2472	VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2473	if (cl_class == MC_CL) {
2474	VERIFY(clsp->sl_refcnt >= `1` &&
2475	clsp->sl_refcnt <= NCLPG);
2476	} else {
2477	VERIFY(clsp->sl_refcnt >= `1` &&
2478	clsp->sl_refcnt <= NBCLPG);
2479	}
2480	if (cl_class == MC_16KCL) {
2481	int k;
2482	for (nsp = clsp, k = `1`; k < NSLABSP16KB; k++) {
2483	nsp = nsp->sl_next;
2484	/ Next slab must already be present /
2485	VERIFY(nsp != NULL);
2486	VERIFY(nsp->sl_refcnt == `1`);
2487	}
2488	}
2489
2490	/*
2491	* If we're asked to purge, restore the actual mbuf using
2492	* contents of the shadow structure (if auditing is enabled)
2493	* and clear EXTF_COMPOSITE flag from the mbuf, as we are
2494	* about to free it and the attached cluster into their caches.
2495	*/
2496	if (purged) {
2497	/ Restore constructed mbuf fields /
2498	if (mclaudit != NULL)
2499	mcl_audit_restore_mbuf(m, mca, TRUE);
2500
2501	MEXT_MINREF(m) = `0`;
2502	MEXT_REF(m) = `0`;
2503	MEXT_PREF(m) = `0`;
2504	MEXT_FLAGS(m) = `0`;
2505	MEXT_PRIV(m) = `0`;
2506	MEXT_PMBUF(m) = NULL;
2507	MEXT_TOKEN(m) = `0`;
2508
2509	rfa = (mcache_obj_t )(void* *)m_get_rfa(m);
2510	m_set_ext(m, NULL, NULL, NULL);
2511	rfa->obj_next = ref_list;
2512	ref_list = rfa;
2513
2514	m->m_type = MT_FREE;
2515	m->m_flags = m->m_len = `0`;
2516	m->m_next = m->m_nextpkt = NULL;
2517
2518	/ Save mbuf fields and make auditing happy /
2519	if (mclaudit != NULL)
2520	mcl_audit_mbuf(mca, o, FALSE, FALSE);
2521
2522	VERIFY(m_total(class) > `0`);
2523	m_total(class)--;
2524
2525	/ Free the mbuf /
2526	o->obj_next = NULL;
2527	slab_free(MC_MBUF, o);
2528
2529	/ And free the cluster /
2530	((mcache_obj_t *)cl)->obj_next = NULL;
2531	if (class == MC_MBUF_CL)
2532	slab_free(MC_CL, cl);
2533	else if (class == MC_MBUF_BIGCL)
2534	slab_free(MC_BIGCL, cl);
2535	else
2536	slab_free(MC_16KCL, cl);
2537	}
2538
2539	++num;
2540	tail = o;
2541	o = nexto;
2542	}
2543
2544	if (!purged) {
2545	tail->obj_next = m_cobjlist(class);
2546	m_cobjlist(class) = list;
2547	m_infree(class) += num;
2548	} else if (ref_list != NULL) {
2549	mcache_free_ext(ref_cache, ref_list);
2550	}
2551
2552	return (num);
2553	}
2554
2555	/*
2556	* Common allocator for composite objects called by the CPU cache layer
2557	* during an allocation request whenever there is no available element in
2558	* the bucket layer. It returns one or more composite elements from the
2559	* appropriate global freelist. If the freelist is empty, it will attempt
2560	* to obtain the rudimentary objects from their caches and construct them
2561	* into composite mbuf + cluster objects.
2562	*/
2563	static unsigned int
2564	mbuf_cslab_alloc(void arg, mcache_obj_t *plist, unsigned* int needed,
2565	int wait)
2566	{
2567	mbuf_class_t class = (mbuf_class_t)arg;
2568	mbuf_class_t cl_class = `0`;
2569	unsigned int num = `0`, cnum = `0`, want = needed;
2570	mcache_obj_t *ref_list = NULL;
2571	mcache_obj_t *mp_list = NULL;
2572	mcache_obj_t *clp_list = NULL;
2573	mcache_obj_t **list;
2574	struct ext_ref *rfa;
2575	struct mbuf *m;
2576	void *cl;
2577
2578	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2579	ASSERT(needed > `0`);
2580
2581	VERIFY(class != MC_MBUF_16KCL \|\| njcl > `0`);
2582
2583	/ There should not be any slab for this class /
2584	VERIFY(m_slab_cnt(class) == `0` &&
2585	m_slablist(class).tqh_first == NULL &&
2586	m_slablist(class).tqh_last == NULL);
2587
2588	lck_mtx_lock(mbuf_mlock);
2589
2590	/ Try using the freelist first /
2591	num = cslab_alloc(class, plist, needed);
2592	list = *plist;
2593	if (num == needed) {
2594	m_alloc_cnt(class) += num;
2595	lck_mtx_unlock(mbuf_mlock);
2596	return (needed);
2597	}
2598
2599	lck_mtx_unlock(mbuf_mlock);
2600
2601	/*
2602	* We could not satisfy the request using the freelist alone;
2603	* allocate from the appropriate rudimentary caches and use
2604	* whatever we can get to construct the composite objects.
2605	*/
2606	needed -= num;
2607
2608	/*
2609	* Mark these allocation requests as coming from a composite cache.
2610	* Also, if the caller is willing to be blocked, mark the request
2611	* with MCR_FAILOK such that we don't end up sleeping at the mbuf
2612	* slab layer waiting for the individual object when one or more
2613	* of the already-constructed composite objects are available.
2614	*/
2615	wait \|= MCR_COMP;
2616	if (!(wait & MCR_NOSLEEP))
2617	wait \|= MCR_FAILOK;
2618
2619	/ allocate mbufs /
2620	needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2621	if (needed == `0`) {
2622	ASSERT(mp_list == NULL);
2623	goto fail;
2624	}
2625
2626	/ allocate clusters /
2627	if (class == MC_MBUF_CL) {
2628	cl_class = MC_CL;
2629	} else if (class == MC_MBUF_BIGCL) {
2630	cl_class = MC_BIGCL;
2631	} else {
2632	VERIFY(class == MC_MBUF_16KCL);
2633	cl_class = MC_16KCL;
2634	}
2635	needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2636	if (needed == `0`) {
2637	ASSERT(clp_list == NULL);
2638	goto fail;
2639	}
2640
2641	needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2642	if (needed == `0`) {
2643	ASSERT(ref_list == NULL);
2644	goto fail;
2645	}
2646
2647	/*
2648	* By this time "needed" is MIN(mbuf, cluster, ref). Any left
2649	* overs will get freed accordingly before we return to caller.
2650	*/
2651	for (cnum = `0`; cnum < needed; cnum++) {
2652	struct mbuf *ms;
2653
2654	m = ms = (struct mbuf *)mp_list;
2655	mp_list = mp_list->obj_next;
2656
2657	cl = clp_list;
2658	clp_list = clp_list->obj_next;
2659	((mcache_obj_t *)cl)->obj_next = NULL;
2660
2661	rfa = (struct ext_ref *)ref_list;
2662	ref_list = ref_list->obj_next;
2663	((mcache_obj_t )(void* *)rfa)->obj_next = NULL;
2664
2665	/*
2666	* If auditing is enabled, construct the shadow mbuf
2667	* in the audit structure instead of in the actual one.
2668	* mbuf_cslab_audit() will take care of restoring the
2669	* contents after the integrity check.
2670	*/
2671	if (mclaudit != NULL) {
2672	mcache_audit_t mca, cl_mca;
2673
2674	lck_mtx_lock(mbuf_mlock);
2675	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2676	ms = MCA_SAVED_MBUF_PTR(mca);
2677	cl_mca = mcl_audit_buf2mca(cl_class,
2678	(mcache_obj_t *)cl);
2679
2680	/*
2681	* Pair them up. Note that this is done at the time
2682	* the mbuf+cluster objects are constructed. This
2683	* information should be treated as "best effort"
2684	* debugging hint since more than one mbufs can refer
2685	* to a cluster. In that case, the cluster might not
2686	* be freed along with the mbuf it was paired with.
2687	*/
2688	mca->mca_uptr = cl_mca;
2689	cl_mca->mca_uptr = mca;
2690
2691	ASSERT(mca->mca_uflags & MB_SCVALID);
2692	ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2693	lck_mtx_unlock(mbuf_mlock);
2694
2695	/ Technically, they are in the freelist /
2696	if (mclverify) {
2697	size_t size;
2698
2699	mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2700	m_maxsize(MC_MBUF));
2701
2702	if (class == MC_MBUF_CL)
2703	size = m_maxsize(MC_CL);
2704	else if (class == MC_MBUF_BIGCL)
2705	size = m_maxsize(MC_BIGCL);
2706	else
2707	size = m_maxsize(MC_16KCL);
2708
2709	mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2710	size);
2711	}
2712	}
2713
2714	MBUF_INIT(ms, `0`, MT_FREE);
2715	if (class == MC_MBUF_16KCL) {
2716	MBUF_16KCL_INIT(ms, cl, rfa, `0`, EXTF_COMPOSITE);
2717	} else if (class == MC_MBUF_BIGCL) {
2718	MBUF_BIGCL_INIT(ms, cl, rfa, `0`, EXTF_COMPOSITE);
2719	} else {
2720	MBUF_CL_INIT(ms, cl, rfa, `0`, EXTF_COMPOSITE);
2721	}
2722	VERIFY(ms->m_flags == M_EXT);
2723	VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2724
2725	list = (mcache_obj_t )m;
2726	(*list)->obj_next = NULL;
2727	list = plist = &(list)->obj_next;
2728	}
2729
2730	fail:
2731	/*
2732	* Free up what's left of the above.
2733	*/
2734	if (mp_list != NULL)
2735	mcache_free_ext(m_cache(MC_MBUF), mp_list);
2736	if (clp_list != NULL)
2737	mcache_free_ext(m_cache(cl_class), clp_list);
2738	if (ref_list != NULL)
2739	mcache_free_ext(ref_cache, ref_list);
2740
2741	lck_mtx_lock(mbuf_mlock);
2742	if (num > `0` \|\| cnum > `0`) {
2743	m_total(class) += cnum;
2744	VERIFY(m_total(class) <= m_maxlimit(class));
2745	m_alloc_cnt(class) += num + cnum;
2746	}
2747	if ((num + cnum) < want)
2748	m_fail_cnt(class) += (want - (num + cnum));
2749	lck_mtx_unlock(mbuf_mlock);
2750
2751	return (num + cnum);
2752	}
2753
2754	/*
2755	* Common de-allocator for composite objects called by the CPU cache
2756	* layer when one or more elements need to be returned to the appropriate
2757	* global freelist.
2758	*/
2759	static void
2760	mbuf_cslab_free(void arg, mcache_obj_t list, int purged)
2761	{
2762	mbuf_class_t class = (mbuf_class_t)arg;
2763	unsigned int num;
2764	int w;
2765
2766	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2767
2768	lck_mtx_lock(mbuf_mlock);
2769
2770	num = cslab_free(class, list, purged);
2771	m_free_cnt(class) += num;
2772
2773	if ((w = mb_waiters) > `0`)
2774	mb_waiters = `0`;
2775	if (w) {
2776	mbwdog_logger("waking up all threads");
2777	}
2778
2779	lck_mtx_unlock(mbuf_mlock);
2780
2781	if (w != `0`)
2782	wakeup(mb_waitchan);
2783	}
2784
2785	/*
2786	* Common auditor for composite objects called by the CPU cache layer
2787	* during an allocation or free request. For the former, this is called
2788	* after the objects are obtained from either the bucket or slab layer
2789	* and before they are returned to the caller. For the latter, this is
2790	* called immediately during free and before placing the objects into
2791	* the bucket or slab layer.
2792	*/
2793	static void
2794	mbuf_cslab_audit(void arg, mcache_obj_t list, boolean_t alloc)
2795	{
2796	mbuf_class_t class = (mbuf_class_t)arg, cl_class;
2797	mcache_audit_t *mca;
2798	struct mbuf m, ms;
2799	mcl_slab_t clsp, nsp;
2800	size_t cl_size;
2801	void *cl;
2802
2803	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2804	if (class == MC_MBUF_CL)
2805	cl_class = MC_CL;
2806	else if (class == MC_MBUF_BIGCL)
2807	cl_class = MC_BIGCL;
2808	else
2809	cl_class = MC_16KCL;
2810	cl_size = m_maxsize(cl_class);
2811
2812	while ((m = ms = (struct mbuf *)list) != NULL) {
2813	lck_mtx_lock(mbuf_mlock);
2814	/ Do the mbuf sanity checks and record its transaction /
2815	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2816	mcl_audit_mbuf(mca, m, TRUE, alloc);
2817	if (mcltrace)
2818	mcache_buffer_log(mca, m, m_cache(class), &mb_start);
2819
2820	if (alloc)
2821	mca->mca_uflags \|= MB_COMP_INUSE;
2822	else
2823	mca->mca_uflags &= ~MB_COMP_INUSE;
2824
2825	/*
2826	* Use the shadow mbuf in the audit structure if we are
2827	* freeing, since the contents of the actual mbuf has been
2828	* pattern-filled by the above call to mcl_audit_mbuf().
2829	*/
2830	if (!alloc && mclverify)
2831	ms = MCA_SAVED_MBUF_PTR(mca);
2832
2833	/ Do the cluster sanity checks and record its transaction /
2834	cl = ms->m_ext.ext_buf;
2835	clsp = slab_get(cl);
2836	VERIFY(ms->m_flags == M_EXT && cl != NULL);
2837	VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2838	if (class == MC_MBUF_CL)
2839	VERIFY(clsp->sl_refcnt >= `1` &&
2840	clsp->sl_refcnt <= NCLPG);
2841	else
2842	VERIFY(clsp->sl_refcnt >= `1` &&
2843	clsp->sl_refcnt <= NBCLPG);
2844
2845	if (class == MC_MBUF_16KCL) {
2846	int k;
2847	for (nsp = clsp, k = `1`; k < NSLABSP16KB; k++) {
2848	nsp = nsp->sl_next;
2849	/ Next slab must already be present /
2850	VERIFY(nsp != NULL);
2851	VERIFY(nsp->sl_refcnt == `1`);
2852	}
2853	}
2854
2855
2856	mca = mcl_audit_buf2mca(cl_class, cl);
2857	mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
2858	if (mcltrace)
2859	mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
2860
2861	if (alloc)
2862	mca->mca_uflags \|= MB_COMP_INUSE;
2863	else
2864	mca->mca_uflags &= ~MB_COMP_INUSE;
2865	lck_mtx_unlock(mbuf_mlock);
2866
2867	list = list->obj_next;
2868	}
2869	}
2870
2871	static void
2872	m_vm_error_stats(uint32_t cnt, uint64_t ts, uint64_t *size,
2873	uint64_t alloc_size, kern_return_t error)
2874	{
2875
2876	cnt = cnt + `1`;
2877	*ts = net_uptime();
2878	if (size) {
2879	*size = alloc_size;
2880	}
2881	_CASSERT(sizeof(mb_kmem_stats) / sizeof(mb_kmem_stats[`0`]) ==
2882	sizeof(mb_kmem_stats_labels) / sizeof(mb_kmem_stats_labels[`0`]));
2883	switch (error) {
2884	case KERN_SUCCESS:
2885	break;
2886	case KERN_INVALID_ARGUMENT:
2887	mb_kmem_stats[`0`]++;
2888	break;
2889	case KERN_INVALID_ADDRESS:
2890	mb_kmem_stats[`1`]++;
2891	break;
2892	case KERN_RESOURCE_SHORTAGE:
2893	mb_kmem_stats[`2`]++;
2894	break;
2895	case KERN_NO_SPACE:
2896	mb_kmem_stats[`3`]++;
2897	break;
2898	case KERN_FAILURE:
2899	mb_kmem_stats[`4`]++;
2900	break;
2901	default:
2902	mb_kmem_stats[`5`]++;
2903	break;
2904	}
2905	}
2906
2907	/*
2908	* Allocate some number of mbuf clusters and place on cluster freelist.
2909	*/
2910	static int
2911	m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2912	{
2913	int i, count = `0`;
2914	vm_size_t size = `0`;
2915	int numpages = `0`, large_buffer;
2916	vm_offset_t page = `0`;
2917	mcache_audit_t *mca_list = NULL;
2918	mcache_obj_t *con_list = NULL;
2919	mcl_slab_t *sp;
2920	mbuf_class_t class;
2921	kern_return_t error;
2922
2923	/ Set if a buffer allocation needs allocation of multiple pages /
2924	large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
2925	PAGE_SIZE < M16KCLBYTES);
2926	VERIFY(bufsize == m_maxsize(MC_BIGCL) \|\|
2927	bufsize == m_maxsize(MC_16KCL));
2928
2929	VERIFY((bufsize == PAGE_SIZE) \|\|
2930	(bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));
2931
2932	if (bufsize == m_size(MC_BIGCL))
2933	class = MC_BIGCL;
2934	else
2935	class = MC_16KCL;
2936
2937	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2938
2939	/*
2940	* Multiple threads may attempt to populate the cluster map one
2941	* after another. Since we drop the lock below prior to acquiring
2942	* the physical page(s), our view of the cluster map may no longer
2943	* be accurate, and we could end up over-committing the pages beyond
2944	* the maximum allowed for each class. To prevent it, this entire
2945	* operation (including the page mapping) is serialized.
2946	*/
2947	while (mb_clalloc_busy) {
2948	mb_clalloc_waiters++;
2949	(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2950	(PZERO-`1`), "m_clalloc", NULL);
2951	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2952	}
2953
2954	/ We are busy now; tell everyone else to go away /
2955	mb_clalloc_busy = TRUE;
2956
2957	/*
2958	* Honor the caller's wish to block or not block. We have a way
2959	* to grow the pool asynchronously using the mbuf worker thread.
2960	*/
2961	i = m_howmany(num, bufsize);
2962	if (i <= `0` \|\| (wait & M_DONTWAIT))
2963	goto out;
2964
2965	lck_mtx_unlock(mbuf_mlock);
2966
2967	size = round_page(i * bufsize);
2968	page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
2969
2970	/*
2971	* If we did ask for "n" 16KB physically contiguous chunks
2972	* and didn't get them, then please try again without this
2973	* restriction.
2974	*/
2975	net_update_uptime();
2976	if (large_buffer && page == `0`) {
2977	m_vm_error_stats(&mb_kmem_contig_failed,
2978	&mb_kmem_contig_failed_ts,
2979	&mb_kmem_contig_failed_size,
2980	size, error);
2981	page = kmem_mb_alloc(mb_map, size, `0`, &error);
2982	}
2983
2984	if (page == `0`) {
2985	m_vm_error_stats(&mb_kmem_failed,
2986	&mb_kmem_failed_ts,
2987	&mb_kmem_failed_size,
2988	size, error);
2989	#if PAGE_SIZE == 4096
2990	if (bufsize == m_maxsize(MC_BIGCL)) {
2991	#else
2992	if (bufsize >= m_maxsize(MC_BIGCL)) {
2993	#endif
2994	/ Try for 1 page if failed /
2995	size = PAGE_SIZE;
2996	page = kmem_mb_alloc(mb_map, size, `0`, &error);
2997	if (page == `0`) {
2998	m_vm_error_stats(&mb_kmem_one_failed,
2999	&mb_kmem_one_failed_ts,
3000	NULL, size, error);
3001	}
3002	}
3003
3004	if (page == `0`) {
3005	lck_mtx_lock(mbuf_mlock);
3006	goto out;
3007	}
3008	}
3009
3010	VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
3011	numpages = size / PAGE_SIZE;
3012
3013	/ If auditing is enabled, allocate the audit structures now /
3014	if (mclaudit != NULL) {
3015	int needed;
3016
3017	/*
3018	* Yes, I realize this is a waste of memory for clusters
3019	* that never get transformed into mbufs, as we may end
3020	* up with NMBPG-1 unused audit structures per cluster.
3021	* But doing so tremendously simplifies the allocation
3022	* strategy, since at this point we are not holding the
3023	* mbuf lock and the caller is okay to be blocked.
3024	*/
3025	if (bufsize == PAGE_SIZE) {
3026	needed = numpages * NMBPG;
3027
3028	i = mcache_alloc_ext(mcl_audit_con_cache,
3029	&con_list, needed, MCR_SLEEP);
3030
3031	VERIFY(con_list != NULL && i == needed);
3032	} else {
3033	/*
3034	* if multiple 4K pages are being used for a
3035	* 16K cluster
3036	*/
3037	needed = numpages / NSLABSP16KB;
3038	}
3039
3040	i = mcache_alloc_ext(mcache_audit_cache,
3041	(mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
3042
3043	VERIFY(mca_list != NULL && i == needed);
3044	}
3045
3046	lck_mtx_lock(mbuf_mlock);
3047
3048	for (i = `0`; i < numpages; i++, page += PAGE_SIZE) {
3049	ppnum_t offset =
3050	((unsigned char *)page - mbutl) >> PAGE_SHIFT;
3051	ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
3052
3053	/*
3054	* If there is a mapper the appropriate I/O page is
3055	* returned; zero out the page to discard its past
3056	* contents to prevent exposing leftover kernel memory.
3057	*/
3058	VERIFY(offset < mcl_pages);
3059	if (mcl_paddr_base != `0`) {
3060	bzero((void *)(uintptr_t) page, PAGE_SIZE);
3061	new_page = IOMapperInsertPage(mcl_paddr_base,
3062	offset, new_page);
3063	}
3064	mcl_paddr[offset] = new_page;
3065
3066	/ Pattern-fill this fresh page /
3067	if (mclverify) {
3068	mcache_set_pattern(MCACHE_FREE_PATTERN,
3069	(caddr_t)page, PAGE_SIZE);
3070	}
3071	if (bufsize == PAGE_SIZE) {
3072	mcache_obj_t *buf;
3073	/ One for the entire page /
3074	sp = slab_get((void *)page);
3075	if (mclaudit != NULL) {
3076	mcl_audit_init((void *)page,
3077	&mca_list, &con_list,
3078	AUDIT_CONTENTS_SIZE, NMBPG);
3079	}
3080	VERIFY(sp->sl_refcnt == `0` && sp->sl_flags == `0`);
3081	slab_init(sp, class, SLF_MAPPED, (void *)page,
3082	(void *)page, PAGE_SIZE, `0`, `1`);
3083	buf = (mcache_obj_t *)page;
3084	buf->obj_next = NULL;
3085
3086	/ Insert this slab /
3087	slab_insert(sp, class);
3088
3089	/ Update stats now since slab_get drops the lock /
3090	++m_infree(class);
3091	++m_total(class);
3092	VERIFY(m_total(class) <= m_maxlimit(class));
3093	if (class == MC_BIGCL) {
3094	mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3095	m_infree(MC_MBUF_BIGCL);
3096	mbstat.m_bigclusters = m_total(MC_BIGCL);
3097	}
3098	++count;
3099	} else if ((bufsize > PAGE_SIZE) &&
3100	(i % NSLABSP16KB) == `0`) {
3101	union m16kcluster m16kcl = (union* m16kcluster *)page;
3102	mcl_slab_t *nsp;
3103	int k;
3104
3105	/ One for the entire 16KB /
3106	sp = slab_get(m16kcl);
3107	if (mclaudit != NULL)
3108	mcl_audit_init(m16kcl, &mca_list, NULL, `0`, `1`);
3109
3110	VERIFY(sp->sl_refcnt == `0` && sp->sl_flags == `0`);
3111	slab_init(sp, MC_16KCL, SLF_MAPPED,
3112	m16kcl, m16kcl, bufsize, `0`, `1`);
3113	m16kcl->m16kcl_next = NULL;
3114
3115	/*
3116	* 2nd-Nth page's slab is part of the first one,
3117	* where N is NSLABSP16KB.
3118	*/
3119	for (k = `1`; k < NSLABSP16KB; k++) {
3120	nsp = slab_get(((union mbigcluster *)page) + k);
3121	VERIFY(nsp->sl_refcnt == `0` &&
3122	nsp->sl_flags == `0`);
3123	slab_init(nsp, MC_16KCL,
3124	SLF_MAPPED \| SLF_PARTIAL,
3125	m16kcl, NULL, `0`, `0`, `0`);
3126	}
3127	/ Insert this slab /
3128	slab_insert(sp, MC_16KCL);
3129
3130	/ Update stats now since slab_get drops the lock /
3131	++m_infree(MC_16KCL);
3132	++m_total(MC_16KCL);
3133	VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3134	++count;
3135	}
3136	}
3137	VERIFY(mca_list == NULL && con_list == NULL);
3138
3139	if (!mb_peak_newreport && mbuf_report_usage(class))
3140	mb_peak_newreport = TRUE;
3141
3142	/ We're done; let others enter /
3143	mb_clalloc_busy = FALSE;
3144	if (mb_clalloc_waiters > `0`) {
3145	mb_clalloc_waiters = `0`;
3146	wakeup(mb_clalloc_waitchan);
3147	}
3148
3149	return (count);
3150	out:
3151	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3152
3153	mtracelarge_register(size);
3154
3155	/ We're done; let others enter /
3156	mb_clalloc_busy = FALSE;
3157	if (mb_clalloc_waiters > `0`) {
3158	mb_clalloc_waiters = `0`;
3159	wakeup(mb_clalloc_waitchan);
3160	}
3161
3162	/*
3163	* When non-blocking we kick a thread if we have to grow the
3164	* pool or if the number of free clusters is less than requested.
3165	*/
3166	if (i > `0` && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
3167	mbwdog_logger("waking up the worker thread to to grow %s by %d",
3168	m_cname(class), i);
3169	wakeup((caddr_t)&mbuf_worker_needs_wakeup);
3170	mbuf_worker_needs_wakeup = FALSE;
3171	}
3172	if (class == MC_BIGCL) {
3173	if (i > `0`) {
3174	/*
3175	* Remember total number of 4KB clusters needed
3176	* at this time.
3177	*/
3178	i += m_total(MC_BIGCL);
3179	if (i > m_region_expand(MC_BIGCL)) {
3180	m_region_expand(MC_BIGCL) = i;
3181	}
3182	}
3183	if (m_infree(MC_BIGCL) >= num)
3184	return (`1`);
3185	} else {
3186	if (i > `0`) {
3187	/*
3188	* Remember total number of 16KB clusters needed
3189	* at this time.
3190	*/
3191	i += m_total(MC_16KCL);
3192	if (i > m_region_expand(MC_16KCL)) {
3193	m_region_expand(MC_16KCL) = i;
3194	}
3195	}
3196	if (m_infree(MC_16KCL) >= num)
3197	return (`1`);
3198	}
3199	return (`0`);
3200	}
3201
3202	/*
3203	* Populate the global freelist of the corresponding buffer class.
3204	*/
3205	static int
3206	freelist_populate(mbuf_class_t class, unsigned int num, int wait)
3207	{
3208	mcache_obj_t *o = NULL;
3209	int i, numpages = `0`, count;
3210	mbuf_class_t super_class;
3211
3212	VERIFY(class == MC_MBUF \|\| class == MC_CL \|\| class == MC_BIGCL \|\|
3213	class == MC_16KCL);
3214
3215	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3216
3217	VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) \|\|
3218	PAGE_SIZE == m_maxsize(MC_16KCL));
3219
3220	if (m_maxsize(class) >= PAGE_SIZE)
3221	return(m_clalloc(num, wait, m_maxsize(class)) != `0`);
3222
3223	/*
3224	* The rest of the function will allocate pages and will slice
3225	* them up into the right size
3226	*/
3227
3228	numpages = (num * m_size(class) + PAGE_SIZE - `1`) / PAGE_SIZE;
3229
3230	/ Currently assume that pages are 4K or 16K /
3231	if (PAGE_SIZE == m_maxsize(MC_BIGCL))
3232	super_class = MC_BIGCL;
3233	else
3234	super_class = MC_16KCL;
3235
3236	i = m_clalloc(numpages, wait, m_maxsize(super_class));
3237
3238	/ how many objects will we cut the page into? /
3239	int numobj = PAGE_SIZE / m_maxsize(class);
3240
3241	for (count = `0`; count < numpages; count++) {
3242	/ respect totals, minlimit, maxlimit /
3243	if (m_total(super_class) <= m_minlimit(super_class) \|\|
3244	m_total(class) >= m_maxlimit(class))
3245	break;
3246
3247	if ((o = slab_alloc(super_class, wait)) == NULL)
3248	break;
3249
3250	struct mbuf m = (struct* mbuf *)o;
3251	union mcluster c = (union* mcluster *)o;
3252	union mbigcluster mbc = (union* mbigcluster *)o;
3253	mcl_slab_t *sp = slab_get(o);
3254	mcache_audit_t *mca = NULL;
3255
3256	/*
3257	* since one full page will be converted to MC_MBUF or
3258	* MC_CL, verify that the reference count will match that
3259	* assumption
3260	*/
3261	VERIFY(sp->sl_refcnt == `1` && slab_is_detached(sp));
3262	VERIFY((sp->sl_flags & (SLF_MAPPED \| SLF_PARTIAL)) == SLF_MAPPED);
3263	/*
3264	* Make sure that the cluster is unmolested
3265	* while in freelist
3266	*/
3267	if (mclverify) {
3268	mca = mcl_audit_buf2mca(super_class,
3269	(mcache_obj_t *)o);
3270	mcache_audit_free_verify(mca,
3271	(mcache_obj_t *)o, `0`, m_maxsize(super_class));
3272	}
3273
3274	/ Reinitialize it as an mbuf or 2K or 4K slab /
3275	slab_init(sp, class, sp->sl_flags,
3276	sp->sl_base, NULL, PAGE_SIZE, `0`, numobj);
3277
3278	VERIFY(sp->sl_head == NULL);
3279
3280	VERIFY(m_total(super_class) >= `1`);
3281	m_total(super_class)--;
3282
3283	if (super_class == MC_BIGCL)
3284	mbstat.m_bigclusters = m_total(MC_BIGCL);
3285
3286	m_total(class) += numobj;
3287	VERIFY(m_total(class) <= m_maxlimit(class));
3288	m_infree(class) += numobj;
3289
3290	if (!mb_peak_newreport && mbuf_report_usage(class))
3291	mb_peak_newreport = TRUE;
3292
3293	i = numobj;
3294	if (class == MC_MBUF) {
3295	mbstat.m_mbufs = m_total(MC_MBUF);
3296	mtype_stat_add(MT_FREE, NMBPG);
3297	while (i--) {
3298	/*
3299	* If auditing is enabled, construct the
3300	* shadow mbuf in the audit structure
3301	* instead of the actual one.
3302	* mbuf_slab_audit() will take care of
3303	* restoring the contents after the
3304	* integrity check.
3305	*/
3306	if (mclaudit != NULL) {
3307	struct mbuf *ms;
3308	mca = mcl_audit_buf2mca(MC_MBUF,
3309	(mcache_obj_t *)m);
3310	ms = MCA_SAVED_MBUF_PTR(mca);
3311	ms->m_type = MT_FREE;
3312	} else {
3313	m->m_type = MT_FREE;
3314	}
3315	m->m_next = sp->sl_head;
3316	sp->sl_head = (void *)m++;
3317	}
3318	} else if (class == MC_CL) { / MC_CL /
3319	mbstat.m_clfree =
3320	m_infree(MC_CL) + m_infree(MC_MBUF_CL);
3321	mbstat.m_clusters = m_total(MC_CL);
3322	while (i--) {
3323	c->mcl_next = sp->sl_head;
3324	sp->sl_head = (void *)c++;
3325	}
3326	} else {
3327	VERIFY(class == MC_BIGCL);
3328	mbstat.m_bigclusters = m_total(MC_BIGCL);
3329	mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3330	m_infree(MC_MBUF_BIGCL);
3331	while (i--) {
3332	mbc->mbc_next = sp->sl_head;
3333	sp->sl_head = (void *)mbc++;
3334	}
3335	}
3336
3337	/ Insert into the mbuf or 2k or 4k slab list /
3338	slab_insert(sp, class);
3339
3340	if ((i = mb_waiters) > `0`)
3341	mb_waiters = `0`;
3342	if (i != `0`) {
3343	mbwdog_logger("waking up all threads");
3344	wakeup(mb_waitchan);
3345	}
3346	}
3347	return (count != `0`);
3348	}
3349
3350	/*
3351	* For each class, initialize the freelist to hold m_minlimit() objects.
3352	*/
3353	static void
3354	freelist_init(mbuf_class_t class)
3355	{
3356	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3357
3358	VERIFY(class == MC_CL \|\| class == MC_BIGCL);
3359	VERIFY(m_total(class) == `0`);
3360	VERIFY(m_minlimit(class) > `0`);
3361
3362	while (m_total(class) < m_minlimit(class))
3363	(void) freelist_populate(class, m_minlimit(class), M_WAIT);
3364
3365	VERIFY(m_total(class) >= m_minlimit(class));
3366	}
3367
3368	/*
3369	* (Inaccurately) check if it might be worth a trip back to the
3370	* mcache layer due the availability of objects there. We'll
3371	* end up back here if there's nothing up there.
3372	*/
3373	static boolean_t
3374	mbuf_cached_above(mbuf_class_t class, int wait)
3375	{
3376	switch (class) {
3377	case MC_MBUF:
3378	if (wait & MCR_COMP)
3379	return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) \|\|
3380	!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3381	break;
3382
3383	case MC_CL:
3384	if (wait & MCR_COMP)
3385	return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3386	break;
3387
3388	case MC_BIGCL:
3389	if (wait & MCR_COMP)
3390	return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3391	break;
3392
3393	case MC_16KCL:
3394	if (wait & MCR_COMP)
3395	return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3396	break;
3397
3398	case MC_MBUF_CL:
3399	case MC_MBUF_BIGCL:
3400	case MC_MBUF_16KCL:
3401	break;
3402
3403	default:
3404	VERIFY(`0`);
3405	/ NOTREACHED /
3406	}
3407
3408	return (!mcache_bkt_isempty(m_cache(class)));
3409	}
3410
3411	/*
3412	* If possible, convert constructed objects to raw ones.
3413	*/
3414	static boolean_t
3415	mbuf_steal(mbuf_class_t class, unsigned int num)
3416	{
3417	mcache_obj_t *top = NULL;
3418	mcache_obj_t **list = &top;
3419	unsigned int tot = `0`;
3420
3421	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3422
3423	switch (class) {
3424	case MC_MBUF:
3425	case MC_CL:
3426	case MC_BIGCL:
3427	case MC_16KCL:
3428	return (FALSE);
3429
3430	case MC_MBUF_CL:
3431	case MC_MBUF_BIGCL:
3432	case MC_MBUF_16KCL:
3433	/ Get the required number of constructed objects if possible /
3434	if (m_infree(class) > m_minlimit(class)) {
3435	tot = cslab_alloc(class, &list,
3436	MIN(num, m_infree(class)));
3437	}
3438
3439	/ And destroy them to get back the raw objects /
3440	if (top != NULL)
3441	(void) cslab_free(class, top, `1`);
3442	break;
3443
3444	default:
3445	VERIFY(`0`);
3446	/ NOTREACHED /
3447	}
3448
3449	return (tot == num);
3450	}
3451
3452	static void
3453	m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3454	{
3455	int m, bmap = `0`;
3456
3457	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3458
3459	VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3460	VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3461	VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3462
3463	/*
3464	* This logic can be made smarter; for now, simply mark
3465	* all other related classes as potential victims.
3466	*/
3467	switch (class) {
3468	case MC_MBUF:
3469	m_wantpurge(MC_CL)++;
3470	m_wantpurge(MC_BIGCL)++;
3471	m_wantpurge(MC_MBUF_CL)++;
3472	m_wantpurge(MC_MBUF_BIGCL)++;
3473	break;
3474
3475	case MC_CL:
3476	m_wantpurge(MC_MBUF)++;
3477	m_wantpurge(MC_BIGCL)++;
3478	m_wantpurge(MC_MBUF_BIGCL)++;
3479	if (!comp)
3480	m_wantpurge(MC_MBUF_CL)++;
3481	break;
3482
3483	case MC_BIGCL:
3484	m_wantpurge(MC_MBUF)++;
3485	m_wantpurge(MC_CL)++;
3486	m_wantpurge(MC_MBUF_CL)++;
3487	if (!comp)
3488	m_wantpurge(MC_MBUF_BIGCL)++;
3489	break;
3490
3491	case MC_16KCL:
3492	if (!comp)
3493	m_wantpurge(MC_MBUF_16KCL)++;
3494	break;
3495
3496	default:
3497	VERIFY(`0`);
3498	/ NOTREACHED /
3499	}
3500
3501	/*
3502	* Run through each marked class and check if we really need to
3503	* purge (and therefore temporarily disable) the per-CPU caches
3504	* layer used by the class. If so, remember the classes since
3505	* we are going to drop the lock below prior to purging.
3506	*/
3507	for (m = `0`; m < NELEM(mbuf_table); m++) {
3508	if (m_wantpurge(m) > `0`) {
3509	m_wantpurge(m) = `0`;
3510	/*
3511	* Try hard to steal the required number of objects
3512	* from the freelist of other mbuf classes. Only
3513	* purge and disable the per-CPU caches layer when
3514	* we don't have enough; it's the last resort.
3515	*/
3516	if (!mbuf_steal(m, num))
3517	bmap \|= (`1` << m);
3518	}
3519	}
3520
3521	lck_mtx_unlock(mbuf_mlock);
3522
3523	if (bmap != `0`) {
3524	/ signal the domains to drain /
3525	net_drain_domains();
3526
3527	/ Sigh; we have no other choices but to ask mcache to purge /
3528	for (m = `0`; m < NELEM(mbuf_table); m++) {
3529	if ((bmap & (`1` << m)) &&
3530	mcache_purge_cache(m_cache(m), TRUE)) {
3531	lck_mtx_lock(mbuf_mlock);
3532	m_purge_cnt(m)++;
3533	mbstat.m_drain++;
3534	lck_mtx_unlock(mbuf_mlock);
3535	}
3536	}
3537	} else {
3538	/*
3539	* Request mcache to reap extra elements from all of its caches;
3540	* note that all reaps are serialized and happen only at a fixed
3541	* interval.
3542	*/
3543	mcache_reap();
3544	}
3545	lck_mtx_lock(mbuf_mlock);
3546	}
3547
3548	static inline struct mbuf *
3549	m_get_common(int wait, short type, int hdr)
3550	{
3551	struct mbuf *m;
3552	int mcflags = MSLEEPF(wait);
3553
3554	/ Is this due to a non-blocking retry? If so, then try harder /
3555	if (mcflags & MCR_NOSLEEP)
3556	mcflags \|= MCR_TRYHARD;
3557
3558	m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3559	if (m != NULL) {
3560	MBUF_INIT(m, hdr, type);
3561	mtype_stat_inc(type);
3562	mtype_stat_dec(MT_FREE);
3563	#if CONFIG_MACF_NET
3564	if (hdr && mac_init_mbuf(m, wait) != `0`) {
3565	m_free(m);
3566	return (NULL);
3567	}
3568	#endif /* MAC_NET */
3569	}
3570	return (m);
3571	}
3572
/*
 * Space allocation routines; these are also available as macros
 * for critical paths.
 *
 * The GET and RETRY forms expand to m_get_common() with the header
 * argument set to 0; the GETHDR and RETRYHDR forms set it to 1.  The
 * _MGET and _MGETHDR forms additionally assign the result to (m).
 */
#define	_M_GET(wait, type)	m_get_common((wait), (type), 0)
#define	_M_GETHDR(wait, type)	m_get_common((wait), (type), 1)
#define	_M_RETRY(wait, type)	_M_GET((wait), (type))
#define	_M_RETRYHDR(wait, type)	_M_GETHDR((wait), (type))
#define	_MGET(m, how, type)	((m) = _M_GET((how), (type)))
#define	_MGETHDR(m, how, type)	((m) = _M_GETHDR((how), (type)))
3583
/* Allocate an mbuf of the given type via _M_GET(); may return NULL. */
struct mbuf *
m_get(int wait, int type)
{
	struct mbuf *m = _M_GET(wait, type);

	return (m);
}
3589
/* Allocate an mbuf of the given type via _M_GETHDR(); may return NULL. */
struct mbuf *
m_gethdr(int wait, int type)
{
	struct mbuf *m = _M_GETHDR(wait, type);

	return (m);
}
3595
/* Retry entry point; expands to the same allocation as m_get(). */
struct mbuf *
m_retry(int wait, int type)
{
	struct mbuf *m = _M_RETRY(wait, type);

	return (m);
}
3601
/* Retry entry point; expands to the same allocation as m_gethdr(). */
struct mbuf *
m_retryhdr(int wait, int type)
{
	struct mbuf *m = _M_RETRYHDR(wait, type);

	return (m);
}
3607
3608	struct mbuf *
3609	m_getclr(int wait, int type)
3610	{
3611	struct mbuf *m;
3612
3613	_MGET(m, wait, type);
3614	if (m != NULL)
3615	bzero(MTOD(m, caddr_t), MLEN);
3616	return (m);
3617	}
3618
3619	static int
3620	m_free_paired(struct mbuf *m)
3621	{
3622	VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED));
3623
3624	membar_sync();
3625	if (MEXT_PMBUF(m) == m) {
3626	volatile UInt16 addr = (volatile* UInt16 *)&MEXT_PREF(m);
3627	int16_t oprefcnt, prefcnt;
3628
3629	/*
3630	* Paired ref count might be negative in case we lose
3631	* against another thread clearing MEXT_PMBUF, in the
3632	* event it occurs after the above memory barrier sync.
3633	* In that case just ignore as things have been unpaired.
3634	*/
3635	do {
3636	oprefcnt = *addr;
3637	prefcnt = oprefcnt - `1`;
3638	} while (!OSCompareAndSwap16(oprefcnt, prefcnt, addr));
3639
3640	if (prefcnt > `1`) {
3641	return (`1`);
3642	} else if (prefcnt == `1`) {
3643	(*(m_get_ext_free(m)))(m->m_ext.ext_buf,
3644	m->m_ext.ext_size, m_get_ext_arg(m));
3645	return (`1`);
3646	} else if (prefcnt == `0`) {
3647	VERIFY(MBUF_IS_PAIRED(m));
3648
3649	/*
3650	* Restore minref to its natural value, so that
3651	* the caller will be able to free the cluster
3652	* as appropriate.
3653	*/
3654	MEXT_MINREF(m) = `0`;
3655
3656	/*
3657	* Clear MEXT_PMBUF, but leave EXTF_PAIRED intact
3658	* as it is immutable. atomic_set_ptr also causes
3659	* memory barrier sync.
3660	*/
3661	atomic_set_ptr(&MEXT_PMBUF(m), NULL);
3662
3663	switch (m->m_ext.ext_size) {
3664	case MCLBYTES:
3665	m_set_ext(m, m_get_rfa(m), NULL, NULL);
3666	break;
3667
3668	case MBIGCLBYTES:
3669	m_set_ext(m, m_get_rfa(m), m_bigfree, NULL);
3670	break;
3671
3672	case M16KCLBYTES:
3673	m_set_ext(m, m_get_rfa(m), m_16kfree, NULL);
3674	break;
3675
3676	default:
3677	VERIFY(`0`);
3678	/ NOTREACHED /
3679	}
3680	}
3681	}
3682
3683	/*
3684	* Tell caller the unpair has occurred, and that the reference
3685	* count on the external cluster held for the paired mbuf should
3686	* now be dropped.
3687	*/
3688	return (`0`);
3689	}
3690
3691	struct mbuf *
3692	m_free(struct mbuf *m)
3693	{
3694	struct mbuf *n = m->m_next;
3695
3696	if (m->m_type == MT_FREE)
3697	panic("m_free: freeing an already freed mbuf");
3698
3699	if (m->m_flags & M_PKTHDR) {
3700	/ Check for scratch area overflow /
3701	m_redzone_verify(m);
3702	/ Free the aux data and tags if there is any /
3703	m_tag_delete_chain(m, NULL);
3704
3705	m_do_tx_compl_callback(m, NULL);
3706	}
3707
3708	if (m->m_flags & M_EXT) {
3709	u_int16_t refcnt;
3710	u_int32_t composite;
3711	m_ext_free_func_t m_free_func;
3712
3713	if (MBUF_IS_PAIRED(m) && m_free_paired(m))
3714	return (n);
3715
3716	refcnt = m_decref(m);
3717	composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3718	m_free_func = m_get_ext_free(m);
3719
3720	if (refcnt == MEXT_MINREF(m) && !composite) {
3721	if (m_free_func == NULL) {
3722	mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3723	} else if (m_free_func == m_bigfree) {
3724	mcache_free(m_cache(MC_BIGCL),
3725	m->m_ext.ext_buf);
3726	} else if (m_free_func == m_16kfree) {
3727	mcache_free(m_cache(MC_16KCL),
3728	m->m_ext.ext_buf);
3729	} else {
3730	(*m_free_func)(m->m_ext.ext_buf,
3731	m->m_ext.ext_size, m_get_ext_arg(m));
3732	}
3733	mcache_free(ref_cache, m_get_rfa(m));
3734	m_set_ext(m, NULL, NULL, NULL);
3735	} else if (refcnt == MEXT_MINREF(m) && composite) {
3736	VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
3737	VERIFY(m->m_type != MT_FREE);
3738
3739	mtype_stat_dec(m->m_type);
3740	mtype_stat_inc(MT_FREE);
3741
3742	m->m_type = MT_FREE;
3743	m->m_flags = M_EXT;
3744	m->m_len = `0`;
3745	m->m_next = m->m_nextpkt = NULL;
3746
3747	MEXT_FLAGS(m) &= ~EXTF_READONLY;
3748
3749	/ "Free" into the intermediate cache /
3750	if (m_free_func == NULL) {
3751	mcache_free(m_cache(MC_MBUF_CL), m);
3752	} else if (m_free_func == m_bigfree) {
3753	mcache_free(m_cache(MC_MBUF_BIGCL), m);
3754	} else {
3755	VERIFY(m_free_func == m_16kfree);
3756	mcache_free(m_cache(MC_MBUF_16KCL), m);
3757	}
3758	return (n);
3759	}
3760	}
3761
3762	if (m->m_type != MT_FREE) {
3763	mtype_stat_dec(m->m_type);
3764	mtype_stat_inc(MT_FREE);
3765	}
3766
3767	m->m_type = MT_FREE;
3768	m->m_flags = m->m_len = `0`;
3769	m->m_next = m->m_nextpkt = NULL;
3770
3771	mcache_free(m_cache(MC_MBUF), m);
3772
3773	return (n);
3774	}
3775
3776	__private_extern__ struct mbuf *
3777	m_clattach(struct mbuf m, int* type, caddr_t extbuf,
3778	void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3779	int wait, int pair)
3780	{
3781	struct ext_ref *rfa = NULL;
3782
3783	/*
3784	* If pairing is requested and an existing mbuf is provided, reject
3785	* it if it's already been paired to another cluster. Otherwise,
3786	* allocate a new one or free any existing below.
3787	*/
3788	if ((m != NULL && MBUF_IS_PAIRED(m)) \|\|
3789	(m == NULL && (m = _M_GETHDR(wait, type)) == NULL))
3790	return (NULL);
3791
3792	if (m->m_flags & M_EXT) {
3793	u_int16_t refcnt;
3794	u_int32_t composite;
3795	m_ext_free_func_t m_free_func;
3796
3797	refcnt = m_decref(m);
3798	composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3799	VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
3800	m_free_func = m_get_ext_free(m);
3801	if (refcnt == MEXT_MINREF(m) && !composite) {
3802	if (m_free_func == NULL) {
3803	mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3804	} else if (m_free_func == m_bigfree) {
3805	mcache_free(m_cache(MC_BIGCL),
3806	m->m_ext.ext_buf);
3807	} else if (m_free_func == m_16kfree) {
3808	mcache_free(m_cache(MC_16KCL),
3809	m->m_ext.ext_buf);
3810	} else {
3811	(*m_free_func)(m->m_ext.ext_buf,
3812	m->m_ext.ext_size, m_get_ext_arg(m));
3813	}
3814	/ Re-use the reference structure /
3815	rfa = m_get_rfa(m);
3816	} else if (refcnt == MEXT_MINREF(m) && composite) {
3817	VERIFY(m->m_type != MT_FREE);
3818
3819	mtype_stat_dec(m->m_type);
3820	mtype_stat_inc(MT_FREE);
3821
3822	m->m_type = MT_FREE;
3823	m->m_flags = M_EXT;
3824	m->m_len = `0`;
3825	m->m_next = m->m_nextpkt = NULL;
3826
3827	MEXT_FLAGS(m) &= ~EXTF_READONLY;
3828
3829	/ "Free" into the intermediate cache /
3830	if (m_free_func == NULL) {
3831	mcache_free(m_cache(MC_MBUF_CL), m);
3832	} else if (m_free_func == m_bigfree) {
3833	mcache_free(m_cache(MC_MBUF_BIGCL), m);
3834	} else {
3835	VERIFY(m_free_func == m_16kfree);
3836	mcache_free(m_cache(MC_MBUF_16KCL), m);
3837	}
3838	/*
3839	* Allocate a new mbuf, since we didn't divorce
3840	* the composite mbuf + cluster pair above.
3841	*/
3842	if ((m = _M_GETHDR(wait, type)) == NULL)
3843	return (NULL);
3844	}
3845	}
3846
3847	if (rfa == NULL &&
3848	(rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3849	m_free(m);
3850	return (NULL);
3851	}
3852
3853	if (!pair) {
3854	MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa,
3855	`0`, `1`, `0`, `0`, `0`, NULL);
3856	} else {
3857	MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
3858	`1`, `1`, `1`, EXTF_PAIRED, `0`, m);
3859	}
3860
3861	return (m);
3862	}
3863
3864	/*
3865	* Perform `fast' allocation mbuf clusters from a cache of recently-freed
3866	* clusters. (If the cache is empty, new clusters are allocated en-masse.)
3867	*/
3868	struct mbuf *
3869	m_getcl(int wait, int type, int flags)
3870	{
3871	struct mbuf *m;
3872	int mcflags = MSLEEPF(wait);
3873	int hdr = (flags & M_PKTHDR);
3874
3875	/ Is this due to a non-blocking retry? If so, then try harder /
3876	if (mcflags & MCR_NOSLEEP)
3877	mcflags \|= MCR_TRYHARD;
3878
3879	m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3880	if (m != NULL) {
3881	u_int16_t flag;
3882	struct ext_ref *rfa;
3883	void *cl;
3884
3885	VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3886	cl = m->m_ext.ext_buf;
3887	rfa = m_get_rfa(m);
3888
3889	ASSERT(cl != NULL && rfa != NULL);
3890	VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);
3891
3892	flag = MEXT_FLAGS(m);
3893
3894	MBUF_INIT(m, hdr, type);
3895	MBUF_CL_INIT(m, cl, rfa, `1`, flag);
3896
3897	mtype_stat_inc(type);
3898	mtype_stat_dec(MT_FREE);
3899	#if CONFIG_MACF_NET
3900	if (hdr && mac_init_mbuf(m, wait) != `0`) {
3901	m_freem(m);
3902	return (NULL);
3903	}
3904	#endif /* MAC_NET */
3905	}
3906	return (m);
3907	}
3908
3909	/ m_mclget() add an mbuf cluster to a normal mbuf /
3910	struct mbuf *
3911	m_mclget(struct mbuf m, int* wait)
3912	{
3913	struct ext_ref *rfa;
3914
3915	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3916	return (m);
3917
3918	m->m_ext.ext_buf = m_mclalloc(wait);
3919	if (m->m_ext.ext_buf != NULL) {
3920	MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, `1`, `0`);
3921	} else {
3922	mcache_free(ref_cache, rfa);
3923	}
3924	return (m);
3925	}
3926
3927	/ Allocate an mbuf cluster /
3928	caddr_t
3929	m_mclalloc(int wait)
3930	{
3931	int mcflags = MSLEEPF(wait);
3932
3933	/ Is this due to a non-blocking retry? If so, then try harder /
3934	if (mcflags & MCR_NOSLEEP)
3935	mcflags \|= MCR_TRYHARD;
3936
3937	return (mcache_alloc(m_cache(MC_CL), mcflags));
3938	}
3939
3940	/ Free an mbuf cluster /
3941	void
3942	m_mclfree(caddr_t p)
3943	{
3944	mcache_free(m_cache(MC_CL), p);
3945	}
3946
3947	/*
3948	* mcl_hasreference() checks if a cluster of an mbuf is referenced by
3949	* another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3950	*/
3951	int
3952	m_mclhasreference(struct mbuf *m)
3953	{
3954	if (!(m->m_flags & M_EXT))
3955	return (`0`);
3956
3957	ASSERT(m_get_rfa(m) != NULL);
3958
3959	return ((MEXT_FLAGS(m) & EXTF_READONLY) ? `1` : `0`);
3960	}
3961
3962	__private_extern__ caddr_t
3963	m_bigalloc(int wait)
3964	{
3965	int mcflags = MSLEEPF(wait);
3966
3967	/ Is this due to a non-blocking retry? If so, then try harder /
3968	if (mcflags & MCR_NOSLEEP)
3969	mcflags \|= MCR_TRYHARD;
3970
3971	return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3972	}
3973
3974	__private_extern__ void
3975	m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3976	{
3977	mcache_free(m_cache(MC_BIGCL), p);
3978	}
3979
3980	/ m_mbigget() add an 4KB mbuf cluster to a normal mbuf /
3981	__private_extern__ struct mbuf *
3982	m_mbigget(struct mbuf m, int* wait)
3983	{
3984	struct ext_ref *rfa;
3985
3986	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3987	return (m);
3988
3989	m->m_ext.ext_buf = m_bigalloc(wait);
3990	if (m->m_ext.ext_buf != NULL) {
3991	MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, `1`, `0`);
3992	} else {
3993	mcache_free(ref_cache, rfa);
3994	}
3995	return (m);
3996	}
3997
3998	__private_extern__ caddr_t
3999	m_16kalloc(int wait)
4000	{
4001	int mcflags = MSLEEPF(wait);
4002
4003	/ Is this due to a non-blocking retry? If so, then try harder /
4004	if (mcflags & MCR_NOSLEEP)
4005	mcflags \|= MCR_TRYHARD;
4006
4007	return (mcache_alloc(m_cache(MC_16KCL), mcflags));
4008	}
4009
4010	__private_extern__ void
4011	m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
4012	{
4013	mcache_free(m_cache(MC_16KCL), p);
4014	}
4015
4016	/ m_m16kget() add a 16KB mbuf cluster to a normal mbuf /
4017	__private_extern__ struct mbuf *
4018	m_m16kget(struct mbuf m, int* wait)
4019	{
4020	struct ext_ref *rfa;
4021
4022	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
4023	return (m);
4024
4025	m->m_ext.ext_buf = m_16kalloc(wait);
4026	if (m->m_ext.ext_buf != NULL) {
4027	MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, `1`, `0`);
4028	} else {
4029	mcache_free(ref_cache, rfa);
4030	}
4031	return (m);
4032	}
4033
4034	/*
4035	* "Move" mbuf pkthdr from "from" to "to".
4036	* "from" must have M_PKTHDR set, and "to" must be empty.
4037	*/
4038	void
4039	m_copy_pkthdr(struct mbuf to, struct* mbuf *from)
4040	{
4041	VERIFY(from->m_flags & M_PKTHDR);
4042
4043	/ Check for scratch area overflow /
4044	m_redzone_verify(from);
4045
4046	if (to->m_flags & M_PKTHDR) {
4047	/ Check for scratch area overflow /
4048	m_redzone_verify(to);
4049	/ We will be taking over the tags of 'to' /
4050	m_tag_delete_chain(to, NULL);
4051	}
4052	to->m_pkthdr = from->m_pkthdr; / especially tags /
4053	m_classifier_init(from, `0`); / purge classifier info /
4054	m_tag_init(from, `1`); / purge all tags from src /
4055	m_scratch_init(from); / clear src scratch area /
4056	to->m_flags = (from->m_flags & M_COPYFLAGS) \| (to->m_flags & M_EXT);
4057	if ((to->m_flags & M_EXT) == `0`)
4058	to->m_data = to->m_pktdat;
4059	m_redzone_init(to); / setup red zone on dst /
4060	}
4061
4062	/*
4063	* Duplicate "from"'s mbuf pkthdr in "to".
4064	* "from" must have M_PKTHDR set, and "to" must be empty.
4065	* In particular, this does a deep copy of the packet tags.
4066	*/
4067	static int
4068	m_dup_pkthdr(struct mbuf to, struct* mbuf from, int* how)
4069	{
4070	VERIFY(from->m_flags & M_PKTHDR);
4071
4072	/ Check for scratch area overflow /
4073	m_redzone_verify(from);
4074
4075	if (to->m_flags & M_PKTHDR) {
4076	/ Check for scratch area overflow /
4077	m_redzone_verify(to);
4078	/ We will be taking over the tags of 'to' /
4079	m_tag_delete_chain(to, NULL);
4080	}
4081	to->m_flags = (from->m_flags & M_COPYFLAGS) \| (to->m_flags & M_EXT);
4082	if ((to->m_flags & M_EXT) == `0`)
4083	to->m_data = to->m_pktdat;
4084	to->m_pkthdr = from->m_pkthdr;
4085	m_redzone_init(to); / setup red zone on dst /
4086	m_tag_init(to, `0`); / preserve dst static tags /
4087	return (m_tag_copy_chain(to, from, how));
4088	}
4089
4090	void
4091	m_copy_pftag(struct mbuf to, struct* mbuf *from)
4092	{
4093	memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag));
4094	#if PF_ECN
4095	m_pftag(to)->pftag_hdr = NULL;
4096	m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET\|PF_TAG_HDR_INET6);
4097	#endif /* PF_ECN */
4098	}
4099
4100	void
4101	m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
4102	{
4103	VERIFY(m->m_flags & M_PKTHDR);
4104
4105	m->m_pkthdr.pkt_proto = `0`;
4106	m->m_pkthdr.pkt_flowsrc = `0`;
4107	m->m_pkthdr.pkt_flowid = `0`;
4108	m->m_pkthdr.pkt_flags &= pktf_mask; / caller-defined mask /
4109	/ preserve service class and interface info for loopback packets /
4110	if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP))
4111	(void) m_set_service_class(m, MBUF_SC_BE);
4112	if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
4113	m->m_pkthdr.pkt_ifainfo = `0`;
4114	/*
4115	* Preserve timestamp if requested
4116	*/
4117	if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID))
4118	m->m_pkthdr.pkt_timestamp = `0`;
4119	}
4120
4121	void
4122	m_copy_classifier(struct mbuf to, struct* mbuf *from)
4123	{
4124	VERIFY(to->m_flags & M_PKTHDR);
4125	VERIFY(from->m_flags & M_PKTHDR);
4126
4127	to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
4128	to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
4129	to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
4130	to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
4131	(void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
4132	to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
4133	}
4134
4135	/*
4136	* Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4137	* if wantall is not set, return whatever number were available. Set up the
4138	* first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
4139	* are chained on the m_nextpkt field. Any packets requested beyond this
4140	* are chained onto the last packet header's m_next field. The size of
4141	* the cluster is controlled by the parameter bufsize.
4142	*/
4143	__private_extern__ struct mbuf *
4144	m_getpackets_internal(unsigned int num_needed, int* num_with_pkthdrs,
4145	int wait, int wantall, size_t bufsize)
4146	{
4147	struct mbuf *m;
4148	struct mbuf *np, top;
4149	unsigned int pnum, needed = *num_needed;
4150	mcache_obj_t *mp_list = NULL;
4151	int mcflags = MSLEEPF(wait);
4152	u_int16_t flag;
4153	struct ext_ref *rfa;
4154	mcache_t *cp;
4155	void *cl;
4156
4157	ASSERT(bufsize == m_maxsize(MC_CL) \|\|
4158	bufsize == m_maxsize(MC_BIGCL) \|\|
4159	bufsize == m_maxsize(MC_16KCL));
4160
4161	/*
4162	* Caller must first check for njcl because this
4163	* routine is internal and not exposed/used via KPI.
4164	*/
4165	VERIFY(bufsize != m_maxsize(MC_16KCL) \|\| njcl > `0`);
4166
4167	top = NULL;
4168	np = &top;
4169	pnum = `0`;
4170
4171	/*
4172	* The caller doesn't want all the requested buffers; only some.
4173	* Try hard to get what we can, but don't block. This effectively
4174	* overrides MCR_SLEEP, since this thread will not go to sleep
4175	* if we can't get all the buffers.
4176	*/
4177	if (!wantall \|\| (mcflags & MCR_NOSLEEP))
4178	mcflags \|= MCR_TRYHARD;
4179
4180	/ Allocate the composite mbuf + cluster elements from the cache /
4181	if (bufsize == m_maxsize(MC_CL))
4182	cp = m_cache(MC_MBUF_CL);
4183	else if (bufsize == m_maxsize(MC_BIGCL))
4184	cp = m_cache(MC_MBUF_BIGCL);
4185	else
4186	cp = m_cache(MC_MBUF_16KCL);
4187	needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
4188
4189	for (pnum = `0`; pnum < needed; pnum++) {
4190	m = (struct mbuf *)mp_list;
4191	mp_list = mp_list->obj_next;
4192
4193	VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4194	cl = m->m_ext.ext_buf;
4195	rfa = m_get_rfa(m);
4196
4197	ASSERT(cl != NULL && rfa != NULL);
4198	VERIFY(MBUF_IS_COMPOSITE(m));
4199
4200	flag = MEXT_FLAGS(m);
4201
4202	MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
4203	if (bufsize == m_maxsize(MC_16KCL)) {
4204	MBUF_16KCL_INIT(m, cl, rfa, `1`, flag);
4205	} else if (bufsize == m_maxsize(MC_BIGCL)) {
4206	MBUF_BIGCL_INIT(m, cl, rfa, `1`, flag);
4207	} else {
4208	MBUF_CL_INIT(m, cl, rfa, `1`, flag);
4209	}
4210
4211	if (num_with_pkthdrs > `0`) {
4212	--num_with_pkthdrs;
4213	#if CONFIG_MACF_NET
4214	if (mac_mbuf_label_init(m, wait) != `0`) {
4215	m_freem(m);
4216	break;
4217	}
4218	#endif /* MAC_NET */
4219	}
4220
4221	*np = m;
4222	if (num_with_pkthdrs > `0`)
4223	np = &m->m_nextpkt;
4224	else
4225	np = &m->m_next;
4226	}
4227	ASSERT(pnum != *num_needed \|\| mp_list == NULL);
4228	if (mp_list != NULL)
4229	mcache_free_ext(cp, mp_list);
4230
4231	if (pnum > `0`) {
4232	mtype_stat_add(MT_DATA, pnum);
4233	mtype_stat_sub(MT_FREE, pnum);
4234	}
4235
4236	if (wantall && (pnum != *num_needed)) {
4237	if (top != NULL)
4238	m_freem_list(top);
4239	return (NULL);
4240	}
4241
4242	if (pnum > *num_needed) {
4243	printf("%s: File a radar related to <rdar://10146739>. \
4244	needed = %u, pnum = %u, num_needed = %u \n",
4245	__func__, needed, pnum, *num_needed);
4246	}
4247
4248	*num_needed = pnum;
4249	return (top);
4250	}
4251
4252	/*
4253	* Return list of mbuf linked by m_nextpkt. Try for numlist, and if
4254	* wantall is not set, return whatever number were available. The size of
4255	* each mbuf in the list is controlled by the parameter packetlen. Each
4256	* mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf
4257	* in the chain is called a segment. If maxsegments is not null and the
4258	* value pointed to is not null, this specify the maximum number of segments
4259	* for a chain of mbufs. If maxsegments is zero or the value pointed to
4260	* is zero the caller does not have any restriction on the number of segments.
4261	* The actual number of segments of a mbuf chain is return in the value
4262	* pointed to by maxsegments.
4263	*/
4264	__private_extern__ struct mbuf *
4265	m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
4266	unsigned int maxsegments, int* wait, int wantall, size_t wantsize)
4267	{
4268	struct mbuf *np, top, *first = NULL;
4269	size_t bufsize, r_bufsize;
4270	unsigned int num = `0`;
4271	unsigned int nsegs = `0`;
4272	unsigned int needed, resid;
4273	int mcflags = MSLEEPF(wait);
4274	mcache_obj_t mp_list = NULL, rmp_list = NULL;
4275	mcache_t cp = NULL, rcp = NULL;
4276
4277	if (*numlist == `0`)
4278	return (NULL);
4279
4280	top = NULL;
4281	np = &top;
4282
4283	if (wantsize == `0`) {
4284	if (packetlen <= MINCLSIZE) {
4285	bufsize = packetlen;
4286	} else if (packetlen > m_maxsize(MC_CL)) {
4287	/ Use 4KB if jumbo cluster pool isn't available /
4288	if (packetlen <= m_maxsize(MC_BIGCL) \|\| njcl == `0`)
4289	bufsize = m_maxsize(MC_BIGCL);
4290	else
4291	bufsize = m_maxsize(MC_16KCL);
4292	} else {
4293	bufsize = m_maxsize(MC_CL);
4294	}
4295	} else if (wantsize == m_maxsize(MC_CL) \|\|
4296	wantsize == m_maxsize(MC_BIGCL) \|\|
4297	(wantsize == m_maxsize(MC_16KCL) && njcl > `0`)) {
4298	bufsize = wantsize;
4299	} else {
4300	return (NULL);
4301	}
4302
4303	if (bufsize <= MHLEN) {
4304	nsegs = `1`;
4305	} else if (bufsize <= MINCLSIZE) {
4306	if (maxsegments != NULL && *maxsegments == `1`) {
4307	bufsize = m_maxsize(MC_CL);
4308	nsegs = `1`;
4309	} else {
4310	nsegs = `2`;
4311	}
4312	} else if (bufsize == m_maxsize(MC_16KCL)) {
4313	VERIFY(njcl > `0`);
4314	nsegs = ((packetlen - `1`) >> M16KCLSHIFT) + `1`;
4315	} else if (bufsize == m_maxsize(MC_BIGCL)) {
4316	nsegs = ((packetlen - `1`) >> MBIGCLSHIFT) + `1`;
4317	} else {
4318	nsegs = ((packetlen - `1`) >> MCLSHIFT) + `1`;
4319	}
4320	if (maxsegments != NULL) {
4321	if (maxsegments && nsegs > maxsegments) {
4322	*maxsegments = nsegs;
4323	return (NULL);
4324	}
4325	*maxsegments = nsegs;
4326	}
4327
4328	/*
4329	* The caller doesn't want all the requested buffers; only some.
4330	* Try hard to get what we can, but don't block. This effectively
4331	* overrides MCR_SLEEP, since this thread will not go to sleep
4332	* if we can't get all the buffers.
4333	*/
4334	if (!wantall \|\| (mcflags & MCR_NOSLEEP))
4335	mcflags \|= MCR_TRYHARD;
4336
4337	/*
4338	* Simple case where all elements in the lists/chains are mbufs.
4339	* Unless bufsize is greater than MHLEN, each segment chain is made
4340	* up of exactly 1 mbuf. Otherwise, each segment chain is made up
4341	* of 2 mbufs; the second one is used for the residual data, i.e.
4342	* the remaining data that cannot fit into the first mbuf.
4343	*/
4344	if (bufsize <= MINCLSIZE) {
4345	/ Allocate the elements in one shot from the mbuf cache /
4346	ASSERT(bufsize <= MHLEN \|\| nsegs == `2`);
4347	cp = m_cache(MC_MBUF);
4348	needed = mcache_alloc_ext(cp, &mp_list,
4349	(numlist) nsegs, mcflags);
4350
4351	/*
4352	* The number of elements must be even if we are to use an
4353	* mbuf (instead of a cluster) to store the residual data.
4354	* If we couldn't allocate the requested number of mbufs,
4355	* trim the number down (if it's odd) in order to avoid
4356	* creating a partial segment chain.
4357	*/
4358	if (bufsize > MHLEN && (needed & `0x1`))
4359	needed--;
4360
4361	while (num < needed) {
4362	struct mbuf *m;
4363
4364	m = (struct mbuf *)mp_list;
4365	mp_list = mp_list->obj_next;
4366	ASSERT(m != NULL);
4367
4368	MBUF_INIT(m, `1`, MT_DATA);
4369	#if CONFIG_MACF_NET
4370	if (mac_init_mbuf(m, wait) != `0`) {
4371	m_free(m);
4372	break;
4373	}
4374	#endif /* MAC_NET */
4375	num++;
4376	if (bufsize > MHLEN) {
4377	/ A second mbuf for this segment chain /
4378	m->m_next = (struct mbuf *)mp_list;
4379	mp_list = mp_list->obj_next;
4380	ASSERT(m->m_next != NULL);
4381
4382	MBUF_INIT(m->m_next, `0`, MT_DATA);
4383	num++;
4384	}
4385	*np = m;
4386	np = &m->m_nextpkt;
4387	}
4388	ASSERT(num != *numlist \|\| mp_list == NULL);
4389
4390	if (num > `0`) {
4391	mtype_stat_add(MT_DATA, num);
4392	mtype_stat_sub(MT_FREE, num);
4393	}
4394	num /= nsegs;
4395
4396	/ We've got them all; return to caller /
4397	if (num == *numlist)
4398	return (top);
4399
4400	goto fail;
4401	}
4402
4403	/*
4404	* Complex cases where elements are made up of one or more composite
4405	* mbufs + cluster, depending on packetlen. Each N-segment chain can
4406	* be illustrated as follows:
4407	*
4408	* [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
4409	*
4410	* Every composite mbuf + cluster element comes from the intermediate
4411	* cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
4412	* the last composite element will come from the MC_MBUF_CL cache,
4413	* unless the residual data is larger than 2KB where we use the
4414	* big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
4415	* data is defined as extra data beyond the first element that cannot
4416	* fit into the previous element, i.e. there is no residual data if
4417	* the chain only has 1 segment.
4418	*/
4419	r_bufsize = bufsize;
4420	resid = packetlen > bufsize ? packetlen % bufsize : `0`;
4421	if (resid > `0`) {
4422	/ There is residual data; figure out the cluster size /
4423	if (wantsize == `0` && packetlen > MINCLSIZE) {
4424	/*
4425	* Caller didn't request that all of the segments
4426	* in the chain use the same cluster size; use the
4427	* smaller of the cluster sizes.
4428	*/
4429	if (njcl > `0` && resid > m_maxsize(MC_BIGCL))
4430	r_bufsize = m_maxsize(MC_16KCL);
4431	else if (resid > m_maxsize(MC_CL))
4432	r_bufsize = m_maxsize(MC_BIGCL);
4433	else
4434	r_bufsize = m_maxsize(MC_CL);
4435	} else {
4436	/ Use the same cluster size as the other segments /
4437	resid = `0`;
4438	}
4439	}
4440
4441	needed = *numlist;
4442	if (resid > `0`) {
4443	/*
4444	* Attempt to allocate composite mbuf + cluster elements for
4445	* the residual data in each chain; record the number of such
4446	* elements that can be allocated so that we know how many
4447	* segment chains we can afford to create.
4448	*/
4449	if (r_bufsize <= m_maxsize(MC_CL))
4450	rcp = m_cache(MC_MBUF_CL);
4451	else if (r_bufsize <= m_maxsize(MC_BIGCL))
4452	rcp = m_cache(MC_MBUF_BIGCL);
4453	else
4454	rcp = m_cache(MC_MBUF_16KCL);
4455	needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
4456
4457	if (needed == `0`)
4458	goto fail;
4459
4460	/ This is temporarily reduced for calculation /
4461	ASSERT(nsegs > `1`);
4462	nsegs--;
4463	}
4464
4465	/*
4466	* Attempt to allocate the rest of the composite mbuf + cluster
4467	* elements for the number of segment chains that we need.
4468	*/
4469	if (bufsize <= m_maxsize(MC_CL))
4470	cp = m_cache(MC_MBUF_CL);
4471	else if (bufsize <= m_maxsize(MC_BIGCL))
4472	cp = m_cache(MC_MBUF_BIGCL);
4473	else
4474	cp = m_cache(MC_MBUF_16KCL);
4475	needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
4476
4477	/ Round it down to avoid creating a partial segment chain /
4478	needed = (needed / nsegs) * nsegs;
4479	if (needed == `0`)
4480	goto fail;
4481
4482	if (resid > `0`) {
4483	/*
4484	* We're about to construct the chain(s); take into account
4485	* the number of segments we have created above to hold the
4486	* residual data for each chain, as well as restore the
4487	* original count of segments per chain.
4488	*/
4489	ASSERT(nsegs > `0`);
4490	needed += needed / nsegs;
4491	nsegs++;
4492	}
4493
4494	for (;;) {
4495	struct mbuf *m;
4496	u_int16_t flag;
4497	struct ext_ref *rfa;
4498	void *cl;
4499	int pkthdr;
4500	m_ext_free_func_t m_free_func;
4501
4502	++num;
4503	if (nsegs == `1` \|\| (num % nsegs) != `0` \|\| resid == `0`) {
4504	m = (struct mbuf *)mp_list;
4505	mp_list = mp_list->obj_next;
4506	} else {
4507	m = (struct mbuf *)rmp_list;
4508	rmp_list = rmp_list->obj_next;
4509	}
4510	m_free_func = m_get_ext_free(m);
4511	ASSERT(m != NULL);
4512	VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4513	VERIFY(m_free_func == NULL \|\| m_free_func == m_bigfree \|\|
4514	m_free_func == m_16kfree);
4515
4516	cl = m->m_ext.ext_buf;
4517	rfa = m_get_rfa(m);
4518
4519	ASSERT(cl != NULL && rfa != NULL);
4520	VERIFY(MBUF_IS_COMPOSITE(m));
4521
4522	flag = MEXT_FLAGS(m);
4523
4524	pkthdr = (nsegs == `1` \|\| (num % nsegs) == `1`);
4525	if (pkthdr)
4526	first = m;
4527	MBUF_INIT(m, pkthdr, MT_DATA);
4528	if (m_free_func == m_16kfree) {
4529	MBUF_16KCL_INIT(m, cl, rfa, `1`, flag);
4530	} else if (m_free_func == m_bigfree) {
4531	MBUF_BIGCL_INIT(m, cl, rfa, `1`, flag);
4532	} else {
4533	MBUF_CL_INIT(m, cl, rfa, `1`, flag);
4534	}
4535	#if CONFIG_MACF_NET
4536	if (pkthdr && mac_init_mbuf(m, wait) != `0`) {
4537	--num;
4538	m_freem(m);
4539	break;
4540	}
4541	#endif /* MAC_NET */
4542
4543	*np = m;
4544	if ((num % nsegs) == `0`)
4545	np = &first->m_nextpkt;
4546	else
4547	np = &m->m_next;
4548
4549	if (num == needed)
4550	break;
4551	}
4552
4553	if (num > `0`) {
4554	mtype_stat_add(MT_DATA, num);
4555	mtype_stat_sub(MT_FREE, num);
4556	}
4557
4558	num /= nsegs;
4559
4560	/ We've got them all; return to caller /
4561	if (num == *numlist) {
4562	ASSERT(mp_list == NULL && rmp_list == NULL);
4563	return (top);
4564	}
4565
4566	fail:
4567	/ Free up what's left of the above /
4568	if (mp_list != NULL)
4569	mcache_free_ext(cp, mp_list);
4570	if (rmp_list != NULL)
4571	mcache_free_ext(rcp, rmp_list);
4572	if (wantall && top != NULL) {
4573	m_freem(top);
4574	return (NULL);
4575	}
4576	*numlist = num;
4577	return (top);
4578	}
4579
4580	/*
4581	* Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated
4582	* packets on receive ring.
4583	*/
4584	__private_extern__ struct mbuf *
4585	m_getpacket_how(int wait)
4586	{
4587	unsigned int num_needed = `1`;
4588
4589	return (m_getpackets_internal(&num_needed, `1`, wait, `1`,
4590	m_maxsize(MC_CL)));
4591	}
4592
4593	/*
4594	* Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated
4595	* packets on receive ring.
4596	*/
4597	struct mbuf *
4598	m_getpacket(void)
4599	{
4600	unsigned int num_needed = `1`;
4601
4602	return (m_getpackets_internal(&num_needed, `1`, M_WAIT, `1`,
4603	m_maxsize(MC_CL)));
4604	}
4605
4606	/*
4607	* Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4608	* if this can't be met, return whatever number were available. Set up the
4609	* first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
4610	* are chained on the m_nextpkt field. Any packets requested beyond this are
4611	* chained onto the last packet header's m_next field.
4612	*/
4613	struct mbuf *
4614	m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4615	{
4616	unsigned int n = num_needed;
4617
4618	return (m_getpackets_internal(&n, num_with_pkthdrs, how, `0`,
4619	m_maxsize(MC_CL)));
4620	}
4621
4622	/*
4623	* Return a list of mbuf hdrs set up as packet hdrs chained together
4624	* on the m_nextpkt field
4625	*/
4626	struct mbuf *
4627	m_getpackethdrs(int num_needed, int how)
4628	{
4629	struct mbuf *m;
4630	struct mbuf *np, top;
4631
4632	top = NULL;
4633	np = &top;
4634
4635	while (num_needed--) {
4636	m = _M_RETRYHDR(how, MT_DATA);
4637	if (m == NULL)
4638	break;
4639
4640	*np = m;
4641	np = &m->m_nextpkt;
4642	}
4643
4644	return (top);
4645	}
4646
4647	/*
4648	* Free an mbuf list (m_nextpkt) while following m_next. Returns the count
4649	* for mbufs packets freed. Used by the drivers.
4650	*/
4651	int
4652	m_freem_list(struct mbuf *m)
4653	{
4654	struct mbuf *nextpkt;
4655	mcache_obj_t *mp_list = NULL;
4656	mcache_obj_t *mcl_list = NULL;
4657	mcache_obj_t *mbc_list = NULL;
4658	mcache_obj_t *m16k_list = NULL;
4659	mcache_obj_t *m_mcl_list = NULL;
4660	mcache_obj_t *m_mbc_list = NULL;
4661	mcache_obj_t *m_m16k_list = NULL;
4662	mcache_obj_t *ref_list = NULL;
4663	int pktcount = `0`;
4664	int mt_free = `0`, mt_data = `0`, mt_header = `0`, mt_soname = `0`, mt_tag = `0`;
4665
4666	while (m != NULL) {
4667	pktcount++;
4668
4669	nextpkt = m->m_nextpkt;
4670	m->m_nextpkt = NULL;
4671
4672	while (m != NULL) {
4673	struct mbuf *next = m->m_next;
4674	mcache_obj_t o, rfa;
4675	u_int32_t composite;
4676	u_int16_t refcnt;
4677	m_ext_free_func_t m_free_func;
4678
4679	if (m->m_type == MT_FREE)
4680	panic("m_free: freeing an already freed mbuf");
4681
4682	if (m->m_flags & M_PKTHDR) {
4683	/ Check for scratch area overflow /
4684	m_redzone_verify(m);
4685	/ Free the aux data and tags if there is any /
4686	m_tag_delete_chain(m, NULL);
4687	}
4688
4689	if (!(m->m_flags & M_EXT)) {
4690	mt_free++;
4691	goto simple_free;
4692	}
4693
4694	if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
4695	m = next;
4696	continue;
4697	}
4698
4699	mt_free++;
4700
4701	o = (mcache_obj_t )(void* *)m->m_ext.ext_buf;
4702	refcnt = m_decref(m);
4703	composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4704	m_free_func = m_get_ext_free(m);
4705	if (refcnt == MEXT_MINREF(m) && !composite) {
4706	if (m_free_func == NULL) {
4707	o->obj_next = mcl_list;
4708	mcl_list = o;
4709	} else if (m_free_func == m_bigfree) {
4710	o->obj_next = mbc_list;
4711	mbc_list = o;
4712	} else if (m_free_func == m_16kfree) {
4713	o->obj_next = m16k_list;
4714	m16k_list = o;
4715	} else {
4716	(*(m_free_func))((caddr_t)o,
4717	m->m_ext.ext_size,
4718	m_get_ext_arg(m));
4719	}
4720	rfa = (mcache_obj_t )(void* *)m_get_rfa(m);
4721	rfa->obj_next = ref_list;
4722	ref_list = rfa;
4723	m_set_ext(m, NULL, NULL, NULL);
4724	} else if (refcnt == MEXT_MINREF(m) && composite) {
4725	VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
4726	VERIFY(m->m_type != MT_FREE);
4727	/*
4728	* Amortize the costs of atomic operations
4729	* by doing them at the end, if possible.
4730	*/
4731	if (m->m_type == MT_DATA)
4732	mt_data++;
4733	else if (m->m_type == MT_HEADER)
4734	mt_header++;
4735	else if (m->m_type == MT_SONAME)
4736	mt_soname++;
4737	else if (m->m_type == MT_TAG)
4738	mt_tag++;
4739	else
4740	mtype_stat_dec(m->m_type);
4741
4742	m->m_type = MT_FREE;
4743	m->m_flags = M_EXT;
4744	m->m_len = `0`;
4745	m->m_next = m->m_nextpkt = NULL;
4746
4747	MEXT_FLAGS(m) &= ~EXTF_READONLY;
4748
4749	/ "Free" into the intermediate cache /
4750	o = (mcache_obj_t *)m;
4751	if (m_free_func == NULL) {
4752	o->obj_next = m_mcl_list;
4753	m_mcl_list = o;
4754	} else if (m_free_func == m_bigfree) {
4755	o->obj_next = m_mbc_list;
4756	m_mbc_list = o;
4757	} else {
4758	VERIFY(m_free_func == m_16kfree);
4759	o->obj_next = m_m16k_list;
4760	m_m16k_list = o;
4761	}
4762	m = next;
4763	continue;
4764	}
4765	simple_free:
4766	/*
4767	* Amortize the costs of atomic operations
4768	* by doing them at the end, if possible.
4769	*/
4770	if (m->m_type == MT_DATA)
4771	mt_data++;
4772	else if (m->m_type == MT_HEADER)
4773	mt_header++;
4774	else if (m->m_type == MT_SONAME)
4775	mt_soname++;
4776	else if (m->m_type == MT_TAG)
4777	mt_tag++;
4778	else if (m->m_type != MT_FREE)
4779	mtype_stat_dec(m->m_type);
4780
4781	m->m_type = MT_FREE;
4782	m->m_flags = m->m_len = `0`;
4783	m->m_next = m->m_nextpkt = NULL;
4784
4785	((mcache_obj_t *)m)->obj_next = mp_list;
4786	mp_list = (mcache_obj_t *)m;
4787
4788	m = next;
4789	}
4790
4791	m = nextpkt;
4792	}
4793
4794	if (mt_free > `0`)
4795	mtype_stat_add(MT_FREE, mt_free);
4796	if (mt_data > `0`)
4797	mtype_stat_sub(MT_DATA, mt_data);
4798	if (mt_header > `0`)
4799	mtype_stat_sub(MT_HEADER, mt_header);
4800	if (mt_soname > `0`)
4801	mtype_stat_sub(MT_SONAME, mt_soname);
4802	if (mt_tag > `0`)
4803	mtype_stat_sub(MT_TAG, mt_tag);
4804
4805	if (mp_list != NULL)
4806	mcache_free_ext(m_cache(MC_MBUF), mp_list);
4807	if (mcl_list != NULL)
4808	mcache_free_ext(m_cache(MC_CL), mcl_list);
4809	if (mbc_list != NULL)
4810	mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4811	if (m16k_list != NULL)
4812	mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4813	if (m_mcl_list != NULL)
4814	mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4815	if (m_mbc_list != NULL)
4816	mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4817	if (m_m16k_list != NULL)
4818	mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4819	if (ref_list != NULL)
4820	mcache_free_ext(ref_cache, ref_list);
4821
4822	return (pktcount);
4823	}
4824
4825	void
4826	m_freem(struct mbuf *m)
4827	{
4828	while (m != NULL)
4829	m = m_free(m);
4830	}
4831
4832	/*
4833	* Mbuffer utility routines.
4834	*/
4835	/*
4836	* Set the m_data pointer of a newly allocated mbuf to place an object of the
4837	* specified size at the end of the mbuf, longword aligned.
4838	*
4839	* NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
4840	* separate macros, each asserting that it was called at the proper moment.
4841	* This required callers to themselves test the storage type and call the
4842	* right one. Rather than require callers to be aware of those layout
4843	* decisions, we centralize here.
4844	*/
4845	void
4846	m_align(struct mbuf m, int* len)
4847	{
4848	int adjust = `0`;
4849
4850	/ At this point data must point to start /
4851	VERIFY(m->m_data == M_START(m));
4852	VERIFY(len >= `0`);
4853	VERIFY(len <= M_SIZE(m));
4854	adjust = M_SIZE(m) - len;
4855	m->m_data += adjust &~ (sizeof(long) - `1`);
4856	}
4857
4858	/*
4859	* Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4860	* copy junk along. Does not adjust packet header length.
4861	*/
4862	struct mbuf *
4863	m_prepend(struct mbuf m, int* len, int how)
4864	{
4865	struct mbuf *mn;
4866
4867	_MGET(mn, how, m->m_type);
4868	if (mn == NULL) {
4869	m_freem(m);
4870	return (NULL);
4871	}
4872	if (m->m_flags & M_PKTHDR) {
4873	M_COPY_PKTHDR(mn, m);
4874	m->m_flags &= ~M_PKTHDR;
4875	}
4876	mn->m_next = m;
4877	m = mn;
4878	if (m->m_flags & M_PKTHDR) {
4879	VERIFY(len <= MHLEN);
4880	MH_ALIGN(m, len);
4881	} else {
4882	VERIFY(len <= MLEN);
4883	M_ALIGN(m, len);
4884	}
4885	m->m_len = len;
4886	return (m);
4887	}
4888
4889	/*
4890	* Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4891	* chain, copy junk along, and adjust length.
4892	*/
4893	struct mbuf *
4894	m_prepend_2(struct mbuf m, int* len, int how, int align)
4895	{
4896	if (M_LEADINGSPACE(m) >= len &&
4897	(!align \|\| IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
4898	m->m_data -= len;
4899	m->m_len += len;
4900	} else {
4901	m = m_prepend(m, len, how);
4902	}
4903	if ((m) && (m->m_flags & M_PKTHDR))
4904	m->m_pkthdr.len += len;
4905	return (m);
4906	}
4907
4908	/*
4909	* Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4910	* continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
4911	* The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4912	*/
4913	int MCFail;
4914
4915	struct mbuf *
4916	m_copym_mode(struct mbuf m, int* off0, int len, int wait, uint32_t mode)
4917	{
4918	struct mbuf n, mhdr = NULL, **np;
4919	int off = off0;
4920	struct mbuf *top;
4921	int copyhdr = `0`;
4922
4923	if (off < `0` \|\| len < `0`)
4924	panic("m_copym: invalid offset %d or len %d", off, len);
4925
4926	VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
4927	mode != M_COPYM_MUST_MOVE_HDR) \|\| (m->m_flags & M_PKTHDR));
4928
4929	if ((off == `0` && (m->m_flags & M_PKTHDR)) \|\|
4930	mode == M_COPYM_MUST_COPY_HDR \|\| mode == M_COPYM_MUST_MOVE_HDR) {
4931	mhdr = m;
4932	copyhdr = `1`;
4933	}
4934
4935	while (off >= m->m_len) {
4936	if (m->m_next == NULL)
4937	panic("m_copym: invalid mbuf chain");
4938	off -= m->m_len;
4939	m = m->m_next;
4940	}
4941	np = &top;
4942	top = NULL;
4943
4944	while (len > `0`) {
4945	if (m == NULL) {
4946	if (len != M_COPYALL)
4947	panic("m_copym: len != M_COPYALL");
4948	break;
4949	}
4950
4951	if (copyhdr)
4952	n = _M_RETRYHDR(wait, m->m_type);
4953	else
4954	n = _M_RETRY(wait, m->m_type);
4955	*np = n;
4956
4957	if (n == NULL)
4958	goto nospace;
4959
4960	if (copyhdr != `0`) {
4961	if ((mode == M_COPYM_MOVE_HDR) \|\|
4962	(mode == M_COPYM_MUST_MOVE_HDR)) {
4963	M_COPY_PKTHDR(n, mhdr);
4964	} else if ((mode == M_COPYM_COPY_HDR) \|\|
4965	(mode == M_COPYM_MUST_COPY_HDR)) {
4966	if (m_dup_pkthdr(n, mhdr, wait) == `0`)
4967	goto nospace;
4968	}
4969	if (len == M_COPYALL)
4970	n->m_pkthdr.len -= off0;
4971	else
4972	n->m_pkthdr.len = len;
4973	copyhdr = `0`;
4974	/*
4975	* There is data to copy from the packet header mbuf
4976	* if it is empty or it is before the starting offset
4977	*/
4978	if (mhdr != m) {
4979	np = &n->m_next;
4980	continue;
4981	}
4982	}
4983	n->m_len = MIN(len, (m->m_len - off));
4984	if (m->m_flags & M_EXT) {
4985	n->m_ext = m->m_ext;
4986	m_incref(m);
4987	n->m_data = m->m_data + off;
4988	n->m_flags \|= M_EXT;
4989	} else {
4990	/*
4991	* Limit to the capacity of the destination
4992	*/
4993	if (n->m_flags & M_PKTHDR)
4994	n->m_len = MIN(n->m_len, MHLEN);
4995	else
4996	n->m_len = MIN(n->m_len, MLEN);
4997
4998	if (MTOD(n, char ) + n->m_len > ((char* *)n) + MSIZE)
4999	panic("%s n %p copy overflow",
5000	__func__, n);
5001
5002	bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
5003	(unsigned)n->m_len);
5004	}
5005	if (len != M_COPYALL)
5006	len -= n->m_len;
5007	off = `0`;
5008	m = m->m_next;
5009	np = &n->m_next;
5010	}
5011
5012	if (top == NULL)
5013	MCFail++;
5014
5015	return (top);
5016	nospace:
5017
5018	m_freem(top);
5019	MCFail++;
5020	return (NULL);
5021	}
5022
5023
5024	struct mbuf *
5025	m_copym(struct mbuf m, int* off0, int len, int wait)
5026	{
5027	return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR));
5028	}
5029
5030	/*
5031	* Equivalent to m_copym except that all necessary mbuf hdrs are allocated
5032	* within this routine also, the last mbuf and offset accessed are passed
5033	* out and can be passed back in to avoid having to rescan the entire mbuf
5034	* list (normally hung off of the socket)
5035	*/
5036	struct mbuf *
5037	m_copym_with_hdrs(struct mbuf m0, int* off0, int len0, int wait,
5038	struct mbuf *m_lastm, int* *m_off, uint32_t mode)
5039	{
5040	struct mbuf m = m0, n, **np = NULL;
5041	int off = off0, len = len0;
5042	struct mbuf *top = NULL;
5043	int mcflags = MSLEEPF(wait);
5044	int copyhdr = `0`;
5045	int type = `0`;
5046	mcache_obj_t *list = NULL;
5047	int needed = `0`;
5048
5049	if (off == `0` && (m->m_flags & M_PKTHDR))
5050	copyhdr = `1`;
5051
5052	if (m_lastm != NULL && *m_lastm != NULL) {
5053	m = *m_lastm;
5054	off = *m_off;
5055	} else {
5056	while (off >= m->m_len) {
5057	off -= m->m_len;
5058	m = m->m_next;
5059	}
5060	}
5061
5062	n = m;
5063	while (len > `0`) {
5064	needed++;
5065	ASSERT(n != NULL);
5066	len -= MIN(len, (n->m_len - ((needed == `1`) ? off : `0`)));
5067	n = n->m_next;
5068	}
5069	needed++;
5070	len = len0;
5071
5072	/*
5073	* If the caller doesn't want to be put to sleep, mark it with
5074	* MCR_TRYHARD so that we may reclaim buffers from other places
5075	* before giving up.
5076	*/
5077	if (mcflags & MCR_NOSLEEP)
5078	mcflags \|= MCR_TRYHARD;
5079
5080	if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
5081	mcflags) != needed)
5082	goto nospace;
5083
5084	needed = `0`;
5085	while (len > `0`) {
5086	n = (struct mbuf *)list;
5087	list = list->obj_next;
5088	ASSERT(n != NULL && m != NULL);
5089
5090	type = (top == NULL) ? MT_HEADER : m->m_type;
5091	MBUF_INIT(n, (top == NULL), type);
5092	#if CONFIG_MACF_NET
5093	if (top == NULL && mac_mbuf_label_init(n, wait) != `0`) {
5094	mtype_stat_inc(MT_HEADER);
5095	mtype_stat_dec(MT_FREE);
5096	m_free(n);
5097	goto nospace;
5098	}
5099	#endif /* MAC_NET */
5100
5101	if (top == NULL) {
5102	top = n;
5103	np = &top->m_next;
5104	continue;
5105	} else {
5106	needed++;
5107	*np = n;
5108	}
5109
5110	if (copyhdr) {
5111	if ((mode == M_COPYM_MOVE_HDR) \|\|
5112	(mode == M_COPYM_MUST_MOVE_HDR)) {
5113	M_COPY_PKTHDR(n, m);
5114	} else if ((mode == M_COPYM_COPY_HDR) \|\|
5115	(mode == M_COPYM_MUST_COPY_HDR)) {
5116	if (m_dup_pkthdr(n, m, wait) == `0`)
5117	goto nospace;
5118	}
5119	n->m_pkthdr.len = len;
5120	copyhdr = `0`;
5121	}
5122	n->m_len = MIN(len, (m->m_len - off));
5123
5124	if (m->m_flags & M_EXT) {
5125	n->m_ext = m->m_ext;
5126	m_incref(m);
5127	n->m_data = m->m_data + off;
5128	n->m_flags \|= M_EXT;
5129	} else {
5130	if (MTOD(n, char ) + n->m_len > ((char* *)n) + MSIZE)
5131	panic("%s n %p copy overflow",
5132	__func__, n);
5133
5134	bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
5135	(unsigned)n->m_len);
5136	}
5137	len -= n->m_len;
5138
5139	if (len == `0`) {
5140	if (m_lastm != NULL && m_off != NULL) {
5141	if ((off + n->m_len) == m->m_len) {
5142	*m_lastm = m->m_next;
5143	*m_off = `0`;
5144	} else {
5145	*m_lastm = m;
5146	*m_off = off + n->m_len;
5147	}
5148	}
5149	break;
5150	}
5151	off = `0`;
5152	m = m->m_next;
5153	np = &n->m_next;
5154	}
5155
5156	mtype_stat_inc(MT_HEADER);
5157	mtype_stat_add(type, needed);
5158	mtype_stat_sub(MT_FREE, needed + `1`);
5159
5160	ASSERT(list == NULL);
5161	return (top);
5162
5163	nospace:
5164	if (list != NULL)
5165	mcache_free_ext(m_cache(MC_MBUF), list);
5166	if (top != NULL)
5167	m_freem(top);
5168	MCFail++;
5169	return (NULL);
5170	}
5171
5172	/*
5173	* Copy data from an mbuf chain starting "off" bytes from the beginning,
5174	* continuing for "len" bytes, into the indicated buffer.
5175	*/
5176	void
5177	m_copydata(struct mbuf m, int* off, int len, void *vp)
5178	{
5179	int off0 = off, len0 = len;
5180	struct mbuf *m0 = m;
5181	unsigned count;
5182	char *cp = vp;
5183
5184	if (__improbable(off < `0` \|\| len < `0`)) {
5185	panic("%s: invalid offset %d or len %d", __func__, off, len);
5186	/ NOTREACHED /
5187	}
5188
5189	while (off > `0`) {
5190	if (__improbable(m == NULL)) {
5191	panic("%s: invalid mbuf chain %p [off %d, len %d]",
5192	__func__, m0, off0, len0);
5193	/ NOTREACHED /
5194	}
5195	if (off < m->m_len)
5196	break;
5197	off -= m->m_len;
5198	m = m->m_next;
5199	}
5200	while (len > `0`) {
5201	if (__improbable(m == NULL)) {
5202	panic("%s: invalid mbuf chain %p [off %d, len %d]",
5203	__func__, m0, off0, len0);
5204	/ NOTREACHED /
5205	}
5206	count = MIN(m->m_len - off, len);
5207	bcopy(MTOD(m, caddr_t) + off, cp, count);
5208	len -= count;
5209	cp += count;
5210	off = `0`;
5211	m = m->m_next;
5212	}
5213	}
5214
5215	/*
5216	* Concatenate mbuf chain n to m. Both chains must be of the same type
5217	* (e.g. MT_DATA). Any m_pkthdr is not updated.
5218	*/
5219	void
5220	m_cat(struct mbuf m, struct* mbuf *n)
5221	{
5222	while (m->m_next)
5223	m = m->m_next;
5224	while (n) {
5225	if ((m->m_flags & M_EXT) \|\|
5226	m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
5227	/ just join the two chains /
5228	m->m_next = n;
5229	return;
5230	}
5231	/ splat the data from one into the other /
5232	bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5233	(u_int)n->m_len);
5234	m->m_len += n->m_len;
5235	n = m_free(n);
5236	}
5237	}
5238
5239	void
5240	m_adj(struct mbuf mp, int* req_len)
5241	{
5242	int len = req_len;
5243	struct mbuf *m;
5244	int count;
5245
5246	if ((m = mp) == NULL)
5247	return;
5248	if (len >= `0`) {
5249	/*
5250	* Trim from head.
5251	*/
5252	while (m != NULL && len > `0`) {
5253	if (m->m_len <= len) {
5254	len -= m->m_len;
5255	m->m_len = `0`;
5256	m = m->m_next;
5257	} else {
5258	m->m_len -= len;
5259	m->m_data += len;
5260	len = `0`;
5261	}
5262	}
5263	m = mp;
5264	if (m->m_flags & M_PKTHDR)
5265	m->m_pkthdr.len -= (req_len - len);
5266	} else {
5267	/*
5268	* Trim from tail. Scan the mbuf chain,
5269	* calculating its length and finding the last mbuf.
5270	* If the adjustment only affects this mbuf, then just
5271	* adjust and return. Otherwise, rescan and truncate
5272	* after the remaining size.
5273	*/
5274	len = -len;
5275	count = `0`;
5276	for (;;) {
5277	count += m->m_len;
5278	if (m->m_next == (struct mbuf *)`0`)
5279	break;
5280	m = m->m_next;
5281	}
5282	if (m->m_len >= len) {
5283	m->m_len -= len;
5284	m = mp;
5285	if (m->m_flags & M_PKTHDR)
5286	m->m_pkthdr.len -= len;
5287	return;
5288	}
5289	count -= len;
5290	if (count < `0`)
5291	count = `0`;
5292	/*
5293	* Correct length for chain is "count".
5294	* Find the mbuf with last data, adjust its length,
5295	* and toss data from remaining mbufs on chain.
5296	*/
5297	m = mp;
5298	if (m->m_flags & M_PKTHDR)
5299	m->m_pkthdr.len = count;
5300	for (; m; m = m->m_next) {
5301	if (m->m_len >= count) {
5302	m->m_len = count;
5303	break;
5304	}
5305	count -= m->m_len;
5306	}
5307	while ((m = m->m_next))
5308	m->m_len = `0`;
5309	}
5310	}
5311
5312	/*
5313	* Rearange an mbuf chain so that len bytes are contiguous
5314	* and in the data area of an mbuf (so that mtod and dtom
5315	* will work for a structure of size len). Returns the resulting
5316	* mbuf chain on success, frees it and returns null on failure.
5317	* If there is room, it will add up to max_protohdr-len extra bytes to the
5318	* contiguous region in an attempt to avoid being called next time.
5319	*/
5320	int MPFail;
5321
5322	struct mbuf *
5323	m_pullup(struct mbuf n, int* len)
5324	{
5325	struct mbuf *m;
5326	int count;
5327	int space;
5328
5329	/ check invalid arguments /
5330	if (n == NULL) {
5331	panic("%s: n == NULL", __func__);
5332	}
5333	if (len < `0`) {
5334	os_log_info(OS_LOG_DEFAULT, "%s: failed negative len %d",
5335	__func__, len);
5336	goto bad;
5337	}
5338	if (len > MLEN) {
5339	os_log_info(OS_LOG_DEFAULT, "%s: failed len %d too big",
5340	__func__, len);
5341	goto bad;
5342	}
5343	if ((n->m_flags & M_EXT) == `0` &&
5344	n->m_data >= &n->m_dat[MLEN]) {
5345	os_log_info(OS_LOG_DEFAULT, "%s: m_data out of bounds",
5346	__func__);
5347	goto bad;
5348	}
5349
5350	/*
5351	* If first mbuf has no cluster, and has room for len bytes
5352	* without shifting current data, pullup into it,
5353	* otherwise allocate a new mbuf to prepend to the chain.
5354	*/
5355	if ((n->m_flags & M_EXT) == `0` &&
5356	len < &n->m_dat[MLEN] - n->m_data && n->m_next != NULL) {
5357	if (n->m_len >= len)
5358	return (n);
5359	m = n;
5360	n = n->m_next;
5361	len -= m->m_len;
5362	} else {
5363	if (len > MHLEN)
5364	goto bad;
5365	_MGET(m, M_DONTWAIT, n->m_type);
5366	if (m == `0`)
5367	goto bad;
5368	m->m_len = `0`;
5369	if (n->m_flags & M_PKTHDR) {
5370	M_COPY_PKTHDR(m, n);
5371	n->m_flags &= ~M_PKTHDR;
5372	}
5373	}
5374	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5375	do {
5376	count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
5377	bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5378	(unsigned)count);
5379	len -= count;
5380	m->m_len += count;
5381	n->m_len -= count;
5382	space -= count;
5383	if (n->m_len != `0`)
5384	n->m_data += count;
5385	else
5386	n = m_free(n);
5387	} while (len > `0` && n != NULL);
5388	if (len > `0`) {
5389	(void) m_free(m);
5390	goto bad;
5391	}
5392	m->m_next = n;
5393	return (m);
5394	bad:
5395	m_freem(n);
5396	MPFail++;
5397	return (`0`);
5398	}
5399
5400	/*
5401	* Like m_pullup(), except a new mbuf is always allocated, and we allow
5402	* the amount of empty space before the data in the new mbuf to be specified
5403	* (in the event that the caller expects to prepend later).
5404	*/
5405	__private_extern__ int MSFail = `0`;
5406
5407	__private_extern__ struct mbuf *
5408	m_copyup(struct mbuf n, int* len, int dstoff)
5409	{
5410	struct mbuf *m;
5411	int count, space;
5412
5413	if (len > (MHLEN - dstoff))
5414	goto bad;
5415	MGET(m, M_DONTWAIT, n->m_type);
5416	if (m == NULL)
5417	goto bad;
5418	m->m_len = `0`;
5419	if (n->m_flags & M_PKTHDR) {
5420	m_copy_pkthdr(m, n);
5421	n->m_flags &= ~M_PKTHDR;
5422	}
5423	m->m_data += dstoff;
5424	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5425	do {
5426	count = min(min(max(len, max_protohdr), space), n->m_len);
5427	memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
5428	(unsigned)count);
5429	len -= count;
5430	m->m_len += count;
5431	n->m_len -= count;
5432	space -= count;
5433	if (n->m_len)
5434	n->m_data += count;
5435	else
5436	n = m_free(n);
5437	} while (len > `0` && n);
5438	if (len > `0`) {
5439	(void) m_free(m);
5440	goto bad;
5441	}
5442	m->m_next = n;
5443	return (m);
5444	bad:
5445	m_freem(n);
5446	MSFail++;
5447	return (NULL);
5448	}
5449
/*
 * Partition an mbuf chain in two pieces, returning the tail --
 * all but the first len0 bytes.  In case of failure, it returns NULL and
 * attempts to restore the chain to its original state.
 *
 * This is m_split0() with packet-header handling enabled (copyhdr == 1).
 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
	return (m_split0(m0, len0, wait, 1));
}
5460
5461	static struct mbuf *
5462	m_split0(struct mbuf m0, int* len0, int wait, int copyhdr)
5463	{
5464	struct mbuf m, n;
5465	unsigned len = len0, remain;
5466
5467	/*
5468	* First iterate to the mbuf which contains the first byte of
5469	* data at offset len0
5470	*/
5471	for (m = m0; m && len > m->m_len; m = m->m_next)
5472	len -= m->m_len;
5473	if (m == NULL)
5474	return (NULL);
5475	/*
5476	* len effectively is now the offset in the current
5477	* mbuf where we have to perform split.
5478	*
5479	* remain becomes the tail length.
5480	* Note that len can also be == m->m_len
5481	*/
5482	remain = m->m_len - len;
5483
5484	/*
5485	* If current mbuf len contains the entire remaining offset len,
5486	* just make the second mbuf chain pointing to next mbuf onwards
5487	* and return after making necessary adjustments
5488	*/
5489	if (copyhdr && (m0->m_flags & M_PKTHDR) && remain == `0`) {
5490	_MGETHDR(n, wait, m0->m_type);
5491	if (n == NULL)
5492	return (NULL);
5493	n->m_next = m->m_next;
5494	m->m_next = NULL;
5495	n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
5496	n->m_pkthdr.len = m0->m_pkthdr.len - len0;
5497	m0->m_pkthdr.len = len0;
5498	return (n);
5499	} if (copyhdr && (m0->m_flags & M_PKTHDR)) {
5500	_MGETHDR(n, wait, m0->m_type);
5501	if (n == NULL)
5502	return (NULL);
5503	n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
5504	n->m_pkthdr.len = m0->m_pkthdr.len - len0;
5505	m0->m_pkthdr.len = len0;
5506
5507	/*
5508	* If current points to external storage
5509	* then it can be shared by making last mbuf
5510	* of head chain and first mbuf of current chain
5511	* pointing to different data offsets
5512	*/
5513	if (m->m_flags & M_EXT)
5514	goto extpacket;
5515	if (remain > MHLEN) {
5516	/ m can't be the lead packet /
5517	MH_ALIGN(n, `0`);
5518	n->m_next = m_split(m, len, wait);
5519	if (n->m_next == NULL) {
5520	(void) m_free(n);
5521	return (NULL);
5522	} else
5523	return (n);
5524	} else
5525	MH_ALIGN(n, remain);
5526	} else if (remain == `0`) {
5527	n = m->m_next;
5528	m->m_next = NULL;
5529	return (n);
5530	} else {
5531	_MGET(n, wait, m->m_type);
5532	if (n == NULL)
5533	return (NULL);
5534
5535	if ((m->m_flags & M_EXT) == `0`) {
5536	VERIFY(remain <= MLEN);
5537	M_ALIGN(n, remain);
5538	}
5539	}
5540	extpacket:
5541	if (m->m_flags & M_EXT) {
5542	n->m_flags \|= M_EXT;
5543	n->m_ext = m->m_ext;
5544	m_incref(m);
5545	n->m_data = m->m_data + len;
5546	} else {
5547	bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
5548	}
5549	n->m_len = remain;
5550	m->m_len = len;
5551	n->m_next = m->m_next;
5552	m->m_next = NULL;
5553	return (n);
5554	}
5555
5556	/*
5557	* Routine to copy from device local memory into mbufs.
5558	*/
5559	struct mbuf *
5560	m_devget(char buf, int* totlen, int off0, struct ifnet *ifp,
5561	void (copy)(const* void , void* *, size_t))
5562	{
5563	struct mbuf *m;
5564	struct mbuf top = NULL, *mp = &top;
5565	int off = off0, len;
5566	char *cp;
5567	char *epkt;
5568
5569	cp = buf;
5570	epkt = cp + totlen;
5571	if (off) {
5572	/*
5573	* If 'off' is non-zero, packet is trailer-encapsulated,
5574	* so we have to skip the type and length fields.
5575	*/
5576	cp += off + `2` * sizeof (u_int16_t);
5577	totlen -= `2` * sizeof (u_int16_t);
5578	}
5579	_MGETHDR(m, M_DONTWAIT, MT_DATA);
5580	if (m == NULL)
5581	return (NULL);
5582	m->m_pkthdr.rcvif = ifp;
5583	m->m_pkthdr.len = totlen;
5584	m->m_len = MHLEN;
5585
5586	while (totlen > `0`) {
5587	if (top != NULL) {
5588	_MGET(m, M_DONTWAIT, MT_DATA);
5589	if (m == NULL) {
5590	m_freem(top);
5591	return (NULL);
5592	}
5593	m->m_len = MLEN;
5594	}
5595	len = MIN(totlen, epkt - cp);
5596	if (len >= MINCLSIZE) {
5597	MCLGET(m, M_DONTWAIT);
5598	if (m->m_flags & M_EXT) {
5599	m->m_len = len = MIN(len, m_maxsize(MC_CL));
5600	} else {
5601	/ give up when it's out of cluster mbufs /
5602	if (top != NULL)
5603	m_freem(top);
5604	m_freem(m);
5605	return (NULL);
5606	}
5607	} else {
5608	/*
5609	* Place initial small packet/header at end of mbuf.
5610	*/
5611	if (len < m->m_len) {
5612	if (top == NULL &&
5613	len + max_linkhdr <= m->m_len)
5614	m->m_data += max_linkhdr;
5615	m->m_len = len;
5616	} else {
5617	len = m->m_len;
5618	}
5619	}
5620	if (copy)
5621	copy(cp, MTOD(m, caddr_t), (unsigned)len);
5622	else
5623	bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
5624	cp += len;
5625	*mp = m;
5626	mp = &m->m_next;
5627	totlen -= len;
5628	if (cp == epkt)
5629	cp = buf;
5630	}
5631	return (top);
5632	}
5633
5634	#ifndef MBUF_GROWTH_NORMAL_THRESH
5635	#define MBUF_GROWTH_NORMAL_THRESH 25
5636	#endif
5637
5638	/*
5639	* Cluster freelist allocation check.
5640	*/
5641	static int
5642	m_howmany(int num, size_t bufsize)
5643	{
5644	int i = `0`, j = `0`;
5645	u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5646	u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5647	u_int32_t sumclusters, freeclusters;
5648	u_int32_t percent_pool, percent_kmem;
5649	u_int32_t mb_growth, mb_growth_thresh;
5650
5651	VERIFY(bufsize == m_maxsize(MC_BIGCL) \|\|
5652	bufsize == m_maxsize(MC_16KCL));
5653
5654	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5655
5656	/ Numbers in 2K cluster units /
5657	m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
5658	m_clusters = m_total(MC_CL);
5659	m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
5660	m_16kclusters = m_total(MC_16KCL);
5661	sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5662
5663	m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
5664	m_clfree = m_infree(MC_CL);
5665	m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
5666	m_16kclfree = m_infree(MC_16KCL);
5667	freeclusters = m_mbfree + m_clfree + m_bigclfree;
5668
5669	/ Bail if we've maxed out the mbuf memory map /
5670	if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) \|\|
5671	(njcl > `0` && bufsize == m_maxsize(MC_16KCL) &&
5672	(m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
5673	mbwdog_logger("maxed out nclusters (%u >= %u) or njcl (%u >= %u)",
5674	sumclusters, nclusters,
5675	(m_16kclusters << NCLPJCLSHIFT), njcl);
5676	return (`0`);
5677	}
5678
5679	if (bufsize == m_maxsize(MC_BIGCL)) {
5680	/ Under minimum /
5681	if (m_bigclusters < m_minlimit(MC_BIGCL))
5682	return (m_minlimit(MC_BIGCL) - m_bigclusters);
5683
5684	percent_pool =
5685	((sumclusters - freeclusters) * `100`) / sumclusters;
5686	percent_kmem = (sumclusters * `100`) / nclusters;
5687
5688	/*
5689	* If a light/normal user, grow conservatively (75%)
5690	* If a heavy user, grow aggressively (50%)
5691	*/
5692	if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5693	mb_growth = MB_GROWTH_NORMAL;
5694	else
5695	mb_growth = MB_GROWTH_AGGRESSIVE;
5696
5697	if (percent_kmem < `5`) {
5698	/ For initial allocations /
5699	i = num;
5700	} else {
5701	/ Return if >= MBIGCL_LOWAT clusters available /
5702	if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5703	m_total(MC_BIGCL) >=
5704	MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
5705	return (`0`);
5706
5707	/ Ensure at least num clusters are accessible /
5708	if (num >= m_infree(MC_BIGCL))
5709	i = num - m_infree(MC_BIGCL);
5710	if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5711	j = num - (m_total(MC_BIGCL) -
5712	m_minlimit(MC_BIGCL));
5713
5714	i = MAX(i, j);
5715
5716	/*
5717	* Grow pool if percent_pool > 75 (normal growth)
5718	* or percent_pool > 50 (aggressive growth).
5719	*/
5720	mb_growth_thresh = `100` - (`100` / (`1` << mb_growth));
5721	if (percent_pool > mb_growth_thresh)
5722	j = ((sumclusters + num) >> mb_growth) -
5723	freeclusters;
5724	i = MAX(i, j);
5725	}
5726
5727	/ Check to ensure we didn't go over limits /
5728	if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5729	i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5730	if ((i << `1`) + sumclusters >= nclusters)
5731	i = (nclusters - sumclusters) >> `1`;
5732	VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5733	VERIFY(sumclusters + (i << `1`) <= nclusters);
5734
5735	} else { / 16K CL /
5736	VERIFY(njcl > `0`);
5737	/ Ensure at least num clusters are available /
5738	if (num >= m_16kclfree)
5739	i = num - m_16kclfree;
5740
5741	/ Always grow 16KCL pool aggressively /
5742	if (((m_16kclusters + num) >> `1`) > m_16kclfree)
5743	j = ((m_16kclusters + num) >> `1`) - m_16kclfree;
5744	i = MAX(i, j);
5745
5746	/ Check to ensure we don't go over limit /
5747	if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL))
5748	i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
5749	}
5750	return (i);
5751	}
5752	/*
5753	* Return the number of bytes in the mbuf chain, m.
5754	*/
5755	unsigned int
5756	m_length(struct mbuf *m)
5757	{
5758	struct mbuf *m0;
5759	unsigned int pktlen;
5760
5761	if (m->m_flags & M_PKTHDR)
5762	return (m->m_pkthdr.len);
5763
5764	pktlen = `0`;
5765	for (m0 = m; m0 != NULL; m0 = m0->m_next)
5766	pktlen += m0->m_len;
5767	return (pktlen);
5768	}
5769
5770	/*
5771	* Copy data from a buffer back into the indicated mbuf chain,
5772	* starting "off" bytes from the beginning, extending the mbuf
5773	* chain if necessary.
5774	*/
5775	void
5776	m_copyback(struct mbuf m0, int* off, int len, const void *cp)
5777	{
5778	#if DEBUG
5779	struct mbuf *origm = m0;
5780	int error;
5781	#endif /* DEBUG */
5782
5783	if (m0 == NULL)
5784	return;
5785
5786	#if DEBUG
5787	error =
5788	#endif /* DEBUG */
5789	m_copyback0(&m0, off, len, cp,
5790	M_COPYBACK0_COPYBACK \| M_COPYBACK0_EXTEND, M_DONTWAIT);
5791
5792	#if DEBUG
5793	if (error != `0` \|\| (m0 != NULL && origm != m0))
5794	panic("m_copyback");
5795	#endif /* DEBUG */
5796	}
5797
5798	struct mbuf *
5799	m_copyback_cow(struct mbuf m0, int* off, int len, const void cp, int* how)
5800	{
5801	int error;
5802
5803	/ don't support chain expansion /
5804	VERIFY(off + len <= m_length(m0));
5805
5806	error = m_copyback0(&m0, off, len, cp,
5807	M_COPYBACK0_COPYBACK \| M_COPYBACK0_COW, how);
5808	if (error) {
5809	/*
5810	* no way to recover from partial success.
5811	* just free the chain.
5812	*/
5813	m_freem(m0);
5814	return (NULL);
5815	}
5816	return (m0);
5817	}
5818
5819	/*
5820	* m_makewritable: ensure the specified range writable.
5821	*/
5822	int
5823	m_makewritable(struct mbuf *mp, int* off, int len, int how)
5824	{
5825	int error;
5826	#if DEBUG
5827	struct mbuf *n;
5828	int origlen, reslen;
5829
5830	origlen = m_length(*mp);
5831	#endif /* DEBUG */
5832
5833	#if 0 /* M_COPYALL is large enough */
5834	if (len == M_COPYALL)
5835	len = m_length(mp) - off; /* XXX /
5836	#endif
5837
5838	error = m_copyback0(mp, off, len, NULL,
5839	M_COPYBACK0_PRESERVE \| M_COPYBACK0_COW, how);
5840
5841	#if DEBUG
5842	reslen = `0`;
5843	for (n = *mp; n; n = n->m_next)
5844	reslen += n->m_len;
5845	if (origlen != reslen)
5846	panic("m_makewritable: length changed");
5847	if (((mp)->m_flags & M_PKTHDR) && reslen != (mp)->m_pkthdr.len)
5848	panic("m_makewritable: inconsist");
5849	#endif /* DEBUG */
5850
5851	return (error);
5852	}
5853
5854	static int
5855	m_copyback0(struct mbuf *mp0, int* off, int len, const void vp, int* flags,
5856	int how)
5857	{
5858	int mlen;
5859	struct mbuf m, n;
5860	struct mbuf **mp;
5861	int totlen = `0`;
5862	const char *cp = vp;
5863
5864	VERIFY(mp0 != NULL);
5865	VERIFY(*mp0 != NULL);
5866	VERIFY((flags & M_COPYBACK0_PRESERVE) == `0` \|\| cp == NULL);
5867	VERIFY((flags & M_COPYBACK0_COPYBACK) == `0` \|\| cp != NULL);
5868
5869	/*
5870	* we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5871	* assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5872	*/
5873
5874	VERIFY((~flags & (M_COPYBACK0_EXTEND\|M_COPYBACK0_COW)) != `0`);
5875
5876	mp = mp0;
5877	m = *mp;
5878	while (off > (mlen = m->m_len)) {
5879	off -= mlen;
5880	totlen += mlen;
5881	if (m->m_next == NULL) {
5882	int tspace;
5883	extend:
5884	if (!(flags & M_COPYBACK0_EXTEND))
5885	goto out;
5886
5887	/*
5888	* try to make some space at the end of "m".
5889	*/
5890
5891	mlen = m->m_len;
5892	if (off + len >= MINCLSIZE &&
5893	!(m->m_flags & M_EXT) && m->m_len == `0`) {
5894	MCLGET(m, how);
5895	}
5896	tspace = M_TRAILINGSPACE(m);
5897	if (tspace > `0`) {
5898	tspace = MIN(tspace, off + len);
5899	VERIFY(tspace > `0`);
5900	bzero(mtod(m, char *) + m->m_len,
5901	MIN(off, tspace));
5902	m->m_len += tspace;
5903	off += mlen;
5904	totlen -= mlen;
5905	continue;
5906	}
5907
5908	/*
5909	* need to allocate an mbuf.
5910	*/
5911
5912	if (off + len >= MINCLSIZE) {
5913	n = m_getcl(how, m->m_type, `0`);
5914	} else {
5915	n = _M_GET(how, m->m_type);
5916	}
5917	if (n == NULL) {
5918	goto out;
5919	}
5920	n->m_len = `0`;
5921	n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5922	bzero(mtod(n, char *), MIN(n->m_len, off));
5923	m->m_next = n;
5924	}
5925	mp = &m->m_next;
5926	m = m->m_next;
5927	}
5928	while (len > `0`) {
5929	mlen = m->m_len - off;
5930	if (mlen != `0` && m_mclhasreference(m)) {
5931	char *datap;
5932	int eatlen;
5933
5934	/*
5935	* this mbuf is read-only.
5936	* allocate a new writable mbuf and try again.
5937	*/
5938
5939	#if DIAGNOSTIC
5940	if (!(flags & M_COPYBACK0_COW))
5941	panic("m_copyback0: read-only");
5942	#endif /* DIAGNOSTIC */
5943
5944	/*
5945	* if we're going to write into the middle of
5946	* a mbuf, split it first.
5947	*/
5948	if (off > `0` && len < mlen) {
5949	n = m_split0(m, off, how, `0`);
5950	if (n == NULL)
5951	goto enobufs;
5952	m->m_next = n;
5953	mp = &m->m_next;
5954	m = n;
5955	off = `0`;
5956	continue;
5957	}
5958
5959	/*
5960	* XXX TODO coalesce into the trailingspace of
5961	* the previous mbuf when possible.
5962	*/
5963
5964	/*
5965	* allocate a new mbuf. copy packet header if needed.
5966	*/
5967	n = _M_GET(how, m->m_type);
5968	if (n == NULL)
5969	goto enobufs;
5970	if (off == `0` && (m->m_flags & M_PKTHDR)) {
5971	M_COPY_PKTHDR(n, m);
5972	n->m_len = MHLEN;
5973	} else {
5974	if (len >= MINCLSIZE)
5975	MCLGET(n, M_DONTWAIT);
5976	n->m_len =
5977	(n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5978	}
5979	if (n->m_len > len)
5980	n->m_len = len;
5981
5982	/*
5983	* free the region which has been overwritten.
5984	* copying data from old mbufs if requested.
5985	*/
5986	if (flags & M_COPYBACK0_PRESERVE)
5987	datap = mtod(n, char *);
5988	else
5989	datap = NULL;
5990	eatlen = n->m_len;
5991	VERIFY(off == `0` \|\| eatlen >= mlen);
5992	if (off > `0`) {
5993	VERIFY(len >= mlen);
5994	m->m_len = off;
5995	m->m_next = n;
5996	if (datap) {
5997	m_copydata(m, off, mlen, datap);
5998	datap += mlen;
5999	}
6000	eatlen -= mlen;
6001	mp = &m->m_next;
6002	m = m->m_next;
6003	}
6004	while (m != NULL && m_mclhasreference(m) &&
6005	n->m_type == m->m_type && eatlen > `0`) {
6006	mlen = MIN(eatlen, m->m_len);
6007	if (datap) {
6008	m_copydata(m, `0`, mlen, datap);
6009	datap += mlen;
6010	}
6011	m->m_data += mlen;
6012	m->m_len -= mlen;
6013	eatlen -= mlen;
6014	if (m->m_len == `0`)
6015	*mp = m = m_free(m);
6016	}
6017	if (eatlen > `0`)
6018	n->m_len -= eatlen;
6019	n->m_next = m;
6020	*mp = m = n;
6021	continue;
6022	}
6023	mlen = MIN(mlen, len);
6024	if (flags & M_COPYBACK0_COPYBACK) {
6025	bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
6026	cp += mlen;
6027	}
6028	len -= mlen;
6029	mlen += off;
6030	off = `0`;
6031	totlen += mlen;
6032	if (len == `0`)
6033	break;
6034	if (m->m_next == NULL) {
6035	goto extend;
6036	}
6037	mp = &m->m_next;
6038	m = m->m_next;
6039	}
6040	out:
6041	if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
6042	VERIFY(flags & M_COPYBACK0_EXTEND);
6043	m->m_pkthdr.len = totlen;
6044	}
6045
6046	return (`0`);
6047
6048	enobufs:
6049	return (ENOBUFS);
6050	}
6051
6052	uint64_t
6053	mcl_to_paddr(char *addr)
6054	{
6055	vm_offset_t base_phys;
6056
6057	if (!MBUF_IN_MAP(addr))
6058	return (`0`);
6059	base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
6060
6061	if (base_phys == `0`)
6062	return (`0`);
6063	return ((uint64_t)(ptoa_64(base_phys) \| ((uint64_t)addr & PAGE_MASK)));
6064	}
6065
6066	/*
6067	* Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
6068	* And really copy the thing. That way, we don't "precompute" checksums
6069	* for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
6070	* small packets, don't dup into a cluster. That way received packets
6071	* don't take up too much room in the sockbuf (cf. sbspace()).
6072	*/
6073	int MDFail;
6074
6075	struct mbuf *
6076	m_dup(struct mbuf m, int* how)
6077	{
6078	struct mbuf n, *np;
6079	struct mbuf *top;
6080	int copyhdr = `0`;
6081
6082	np = &top;
6083	top = NULL;
6084	if (m->m_flags & M_PKTHDR)
6085	copyhdr = `1`;
6086
6087	/*
6088	* Quick check: if we have one mbuf and its data fits in an
6089	* mbuf with packet header, just copy and go.
6090	*/
6091	if (m->m_next == NULL) {
6092	/ Then just move the data into an mbuf and be done... /
6093	if (copyhdr) {
6094	if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
6095	if ((n = _M_GETHDR(how, m->m_type)) == NULL)
6096	return (NULL);
6097	n->m_len = m->m_len;
6098	m_dup_pkthdr(n, m, how);
6099	bcopy(m->m_data, n->m_data, m->m_len);
6100	return (n);
6101	}
6102	} else if (m->m_len <= MLEN) {
6103	if ((n = _M_GET(how, m->m_type)) == NULL)
6104	return (NULL);
6105	bcopy(m->m_data, n->m_data, m->m_len);
6106	n->m_len = m->m_len;
6107	return (n);
6108	}
6109	}
6110	while (m != NULL) {
6111	#if BLUE_DEBUG
6112	printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
6113	m->m_data);
6114	#endif
6115	if (copyhdr)
6116	n = _M_GETHDR(how, m->m_type);
6117	else
6118	n = _M_GET(how, m->m_type);
6119	if (n == NULL)
6120	goto nospace;
6121	if (m->m_flags & M_EXT) {
6122	if (m->m_len <= m_maxsize(MC_CL))
6123	MCLGET(n, how);
6124	else if (m->m_len <= m_maxsize(MC_BIGCL))
6125	n = m_mbigget(n, how);
6126	else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > `0`)
6127	n = m_m16kget(n, how);
6128	if (!(n->m_flags & M_EXT)) {
6129	(void) m_free(n);
6130	goto nospace;
6131	}
6132	}
6133	*np = n;
6134	if (copyhdr) {
6135	/ Don't use M_COPY_PKTHDR: preserve m_data /
6136	m_dup_pkthdr(n, m, how);
6137	copyhdr = `0`;
6138	if (!(n->m_flags & M_EXT))
6139	n->m_data = n->m_pktdat;
6140	}
6141	n->m_len = m->m_len;
6142	/*
6143	* Get the dup on the same bdry as the original
6144	* Assume that the two mbufs have the same offset to data area
6145	* (up to word boundaries)
6146	*/
6147	bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
6148	m = m->m_next;
6149	np = &n->m_next;
6150	#if BLUE_DEBUG
6151	printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
6152	n->m_data);
6153	#endif
6154	}
6155
6156	if (top == NULL)
6157	MDFail++;
6158	return (top);
6159
6160	nospace:
6161	m_freem(top);
6162	MDFail++;
6163	return (NULL);
6164	}
6165
/*
 * True when mbuf "m" uses external storage (M_EXT) and its data
 * [m_data, m_data + m_len) extends past the end of the page that
 * m_data starts in.
 */
#define	MBUF_MULTIPAGES(m)						\
	(((m)->m_flags & M_EXT) &&					\
	((IS_P2ALIGNED((m)->m_data, PAGE_SIZE)				\
	&& (m)->m_len > PAGE_SIZE) ||					\
	(!IS_P2ALIGNED((m)->m_data, PAGE_SIZE) &&			\
	P2ROUNDUP((m)->m_data, PAGE_SIZE) < ((uintptr_t)(m)->m_data + (m)->m_len))))
6172
6173	static struct mbuf *
6174	m_expand(struct mbuf m, struct* mbuf **last)
6175	{
6176	struct mbuf *top = NULL;
6177	struct mbuf **nm = &top;
6178	uintptr_t data0, data;
6179	unsigned int len0, len;
6180
6181	VERIFY(MBUF_MULTIPAGES(m));
6182	VERIFY(m->m_next == NULL);
6183	data0 = (uintptr_t)m->m_data;
6184	len0 = m->m_len;
6185	*last = top;
6186
6187	for (;;) {
6188	struct mbuf *n;
6189
6190	data = data0;
6191	if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE)
6192	len = PAGE_SIZE;
6193	else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
6194	P2ROUNDUP(data, PAGE_SIZE) < (data + len0))
6195	len = P2ROUNDUP(data, PAGE_SIZE) - data;
6196	else
6197	len = len0;
6198
6199	VERIFY(len > `0`);
6200	VERIFY(m->m_flags & M_EXT);
6201	m->m_data = (void *)data;
6202	m->m_len = len;
6203
6204	nm = last = m;
6205	nm = &m->m_next;
6206	m->m_next = NULL;
6207
6208	data0 += len;
6209	len0 -= len;
6210	if (len0 == `0`)
6211	break;
6212
6213	n = _M_RETRY(M_DONTWAIT, MT_DATA);
6214	if (n == NULL) {
6215	m_freem(top);
6216	top = *last = NULL;
6217	break;
6218	}
6219
6220	n->m_ext = m->m_ext;
6221	m_incref(m);
6222	n->m_flags \|= M_EXT;
6223	m = n;
6224	}
6225	return (top);
6226	}
6227
6228	struct mbuf *
6229	m_normalize(struct mbuf *m)
6230	{
6231	struct mbuf *top = NULL;
6232	struct mbuf **nm = &top;
6233	boolean_t expanded = FALSE;
6234
6235	while (m != NULL) {
6236	struct mbuf *n;
6237
6238	n = m->m_next;
6239	m->m_next = NULL;
6240
6241	/ Does the data cross one or more page boundaries? /
6242	if (MBUF_MULTIPAGES(m)) {
6243	struct mbuf *last;
6244	if ((m = m_expand(m, &last)) == NULL) {
6245	m_freem(n);
6246	m_freem(top);
6247	top = NULL;
6248	break;
6249	}
6250	*nm = m;
6251	nm = &last->m_next;
6252	expanded = TRUE;
6253	} else {
6254	*nm = m;
6255	nm = &m->m_next;
6256	}
6257	m = n;
6258	}
6259	if (expanded)
6260	atomic_add_32(&mb_normalized, `1`);
6261	return (top);
6262	}
6263
6264	/*
6265	* Append the specified data to the indicated mbuf chain,
6266	* Extend the mbuf chain if the new data does not fit in
6267	* existing space.
6268	*
6269	* Return 1 if able to complete the job; otherwise 0.
6270	*/
6271	int
6272	m_append(struct mbuf m0, int* len, caddr_t cp)
6273	{
6274	struct mbuf m, n;
6275	int remainder, space;
6276
6277	for (m = m0; m->m_next != NULL; m = m->m_next)
6278	;
6279	remainder = len;
6280	space = M_TRAILINGSPACE(m);
6281	if (space > `0`) {
6282	/*
6283	* Copy into available space.
6284	*/
6285	if (space > remainder)
6286	space = remainder;
6287	bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
6288	m->m_len += space;
6289	cp += space;
6290	remainder -= space;
6291	}
6292	while (remainder > `0`) {
6293	/*
6294	* Allocate a new mbuf; could check space
6295	* and allocate a cluster instead.
6296	*/
6297	n = m_get(M_WAITOK, m->m_type);
6298	if (n == NULL)
6299	break;
6300	n->m_len = min(MLEN, remainder);
6301	bcopy(cp, mtod(n, caddr_t), n->m_len);
6302	cp += n->m_len;
6303	remainder -= n->m_len;
6304	m->m_next = n;
6305	m = n;
6306	}
6307	if (m0->m_flags & M_PKTHDR)
6308	m0->m_pkthdr.len += len - remainder;
6309	return (remainder == `0`);
6310	}
6311
6312	struct mbuf *
6313	m_last(struct mbuf *m)
6314	{
6315	while (m->m_next != NULL)
6316	m = m->m_next;
6317	return (m);
6318	}
6319
6320	unsigned int
6321	m_fixhdr(struct mbuf *m0)
6322	{
6323	u_int len;
6324
6325	VERIFY(m0->m_flags & M_PKTHDR);
6326
6327	len = m_length2(m0, NULL);
6328	m0->m_pkthdr.len = len;
6329	return (len);
6330	}
6331
6332	unsigned int
6333	m_length2(struct mbuf m0, struct* mbuf **last)
6334	{
6335	struct mbuf *m;
6336	u_int len;
6337
6338	len = `0`;
6339	for (m = m0; m != NULL; m = m->m_next) {
6340	len += m->m_len;
6341	if (m->m_next == NULL)
6342	break;
6343	}
6344	if (last != NULL)
6345	*last = m;
6346	return (len);
6347	}
6348
6349	/*
6350	* Defragment a mbuf chain, returning the shortest possible chain of mbufs
6351	* and clusters. If allocation fails and this cannot be completed, NULL will
6352	* be returned, but the passed in chain will be unchanged. Upon success,
6353	* the original chain will be freed, and the new chain will be returned.
6354	*
6355	* If a non-packet header is passed in, the original mbuf (chain?) will
6356	* be returned unharmed.
6357	*
6358	* If offset is specfied, the first mbuf in the chain will have a leading
6359	* space of the amount stated by the "off" parameter.
6360	*
6361	* This routine requires that the m_pkthdr.header field of the original
6362	* mbuf chain is cleared by the caller.
6363	*/
6364	struct mbuf *
6365	m_defrag_offset(struct mbuf m0, u_int32_t off, int* how)
6366	{
6367	struct mbuf m_new = NULL, m_final = NULL;
6368	int progress = `0`, length, pktlen;
6369
6370	if (!(m0->m_flags & M_PKTHDR))
6371	return (m0);
6372
6373	VERIFY(off < MHLEN);
6374	m_fixhdr(m0); / Needed sanity check /
6375
6376	pktlen = m0->m_pkthdr.len + off;
6377	if (pktlen > MHLEN)
6378	m_final = m_getcl(how, MT_DATA, M_PKTHDR);
6379	else
6380	m_final = m_gethdr(how, MT_DATA);
6381
6382	if (m_final == NULL)
6383	goto nospace;
6384
6385	if (off > `0`) {
6386	pktlen -= off;
6387	m_final->m_data += off;
6388	}
6389
6390	/*
6391	* Caller must have handled the contents pointed to by this
6392	* pointer before coming here, as otherwise it will point to
6393	* the original mbuf which will get freed upon success.
6394	*/
6395	VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
6396
6397	if (m_dup_pkthdr(m_final, m0, how) == `0`)
6398	goto nospace;
6399
6400	m_new = m_final;
6401
6402	while (progress < pktlen) {
6403	length = pktlen - progress;
6404	if (length > MCLBYTES)
6405	length = MCLBYTES;
6406	length -= ((m_new == m_final) ? off : `0`);
6407	if (length < `0`)
6408	goto nospace;
6409
6410	if (m_new == NULL) {
6411	if (length > MLEN)
6412	m_new = m_getcl(how, MT_DATA, `0`);
6413	else
6414	m_new = m_get(how, MT_DATA);
6415	if (m_new == NULL)
6416	goto nospace;
6417	}
6418
6419	m_copydata(m0, progress, length, mtod(m_new, caddr_t));
6420	progress += length;
6421	m_new->m_len = length;
6422	if (m_new != m_final)
6423	m_cat(m_final, m_new);
6424	m_new = NULL;
6425	}
6426	m_freem(m0);
6427	m0 = m_final;
6428	return (m0);
6429	nospace:
6430	if (m_final)
6431	m_freem(m_final);
6432	return (NULL);
6433	}
6434
/*
 * Defragment the chain `m0' with no leading space in the first mbuf;
 * equivalent to m_defrag_offset() with an offset of zero.
 */
struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
	struct mbuf *result;

	result = m_defrag_offset(m0, 0, how);
	return (result);
}
6440
6441	void
6442	m_mchtype(struct mbuf m, int* t)
6443	{
6444	mtype_stat_inc(t);
6445	mtype_stat_dec(m->m_type);
6446	(m)->m_type = t;
6447	}
6448
6449	void *
6450	m_mtod(struct mbuf *m)
6451	{
6452	return (MTOD(m, void *));
6453	}
6454
6455	struct mbuf *
6456	m_dtom(void *x)
6457	{
6458	return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-`1`)));
6459	}
6460
/*
 * Function form of the _MCHECK() macro: applies _MCHECK() to `m'.
 */
void
m_mcheck(struct mbuf *m)
{
	_MCHECK(m);
	return;
}
6466
6467	/*
6468	* Return a pointer to mbuf/offset of location in mbuf chain.
6469	*/
6470	struct mbuf *
6471	m_getptr(struct mbuf m, int* loc, int *off)
6472	{
6473
6474	while (loc >= `0`) {
6475	/ Normal end of search. /
6476	if (m->m_len > loc) {
6477	*off = loc;
6478	return (m);
6479	} else {
6480	loc -= m->m_len;
6481	if (m->m_next == NULL) {
6482	if (loc == `0`) {
6483	/ Point at the end of valid data. /
6484	*off = m->m_len;
6485	return (m);
6486	}
6487	return (NULL);
6488	}
6489	m = m->m_next;
6490	}
6491	}
6492	return (NULL);
6493	}
6494
6495	/*
6496	* Inform the corresponding mcache(s) that there's a waiter below.
6497	*/
6498	static void
6499	mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
6500	{
6501	mcache_waiter_inc(m_cache(class));
6502	if (comp) {
6503	if (class == MC_CL) {
6504	mcache_waiter_inc(m_cache(MC_MBUF_CL));
6505	} else if (class == MC_BIGCL) {
6506	mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6507	} else if (class == MC_16KCL) {
6508	mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6509	} else {
6510	mcache_waiter_inc(m_cache(MC_MBUF_CL));
6511	mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6512	}
6513	}
6514	}
6515
6516	/*
6517	* Inform the corresponding mcache(s) that there's no more waiter below.
6518	*/
6519	static void
6520	mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6521	{
6522	mcache_waiter_dec(m_cache(class));
6523	if (comp) {
6524	if (class == MC_CL) {
6525	mcache_waiter_dec(m_cache(MC_MBUF_CL));
6526	} else if (class == MC_BIGCL) {
6527	mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6528	} else if (class == MC_16KCL) {
6529	mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6530	} else {
6531	mcache_waiter_dec(m_cache(MC_MBUF_CL));
6532	mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6533	}
6534	}
6535	}
6536
6537	/*
6538	* Called during slab (blocking and non-blocking) allocation. If there
6539	* is at least one waiter, and the time since the first waiter is blocked
6540	* is greater than the watchdog timeout, panic the system.
6541	*/
6542	static void
6543	mbuf_watchdog(void)
6544	{
6545	struct timeval now;
6546	unsigned int since;
6547
6548	if (mb_waiters == `0` \|\| !mb_watchdog)
6549	return;
6550
6551	microuptime(&now);
6552	since = now.tv_sec - mb_wdtstart.tv_sec;
6553	if (since >= MB_WDT_MAXTIME) {
6554	panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
6555	mb_waiters, since, mbuf_dump());
6556	/ NOTREACHED /
6557	}
6558	}
6559
6560	/*
6561	* Called during blocking allocation. Returns TRUE if one or more objects
6562	* are available at the per-CPU caches layer and that allocation should be
6563	* retried at that level.
6564	*/
6565	static boolean_t
6566	mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
6567	{
6568	boolean_t mcache_retry = FALSE;
6569
6570	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6571
6572	/ Check if there's anything at the cache layer /
6573	if (mbuf_cached_above(class, wait)) {
6574	mcache_retry = TRUE;
6575	goto done;
6576	}
6577
6578	/ Nothing? Then try hard to get it from somewhere /
6579	m_reclaim(class, num, (wait & MCR_COMP));
6580
6581	/ We tried hard and got something? /
6582	if (m_infree(class) > `0`) {
6583	mbstat.m_wait++;
6584	goto done;
6585	} else if (mbuf_cached_above(class, wait)) {
6586	mbstat.m_wait++;
6587	mcache_retry = TRUE;
6588	goto done;
6589	} else if (wait & MCR_TRYHARD) {
6590	mcache_retry = TRUE;
6591	goto done;
6592	}
6593
6594	/*
6595	* There's really nothing for us right now; inform the
6596	* cache(s) that there is a waiter below and go to sleep.
6597	*/
6598	mbuf_waiter_inc(class, (wait & MCR_COMP));
6599
6600	VERIFY(!(wait & MCR_NOSLEEP));
6601
6602	/*
6603	* If this is the first waiter, arm the watchdog timer. Otherwise
6604	* check if we need to panic the system due to watchdog timeout.
6605	*/
6606	if (mb_waiters == `0`)
6607	microuptime(&mb_wdtstart);
6608	else
6609	mbuf_watchdog();
6610
6611	mb_waiters++;
6612	m_region_expand(class) += m_total(class) + num;
6613	/ wake up the worker thread /
6614	if (mbuf_worker_ready &&
6615	mbuf_worker_needs_wakeup) {
6616	wakeup((caddr_t)&mbuf_worker_needs_wakeup);
6617	mbuf_worker_needs_wakeup = FALSE;
6618	}
6619	mbwdog_logger("waiting (%d mbufs in class %s)", num, m_cname(class));
6620	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO-`1`), m_cname(class), NULL);
6621	mbwdog_logger("woke up (%d mbufs in class %s) ", num, m_cname(class));
6622
6623	/ We are now up; stop getting notified until next round /
6624	mbuf_waiter_dec(class, (wait & MCR_COMP));
6625
6626	/ We waited and got something /
6627	if (m_infree(class) > `0`) {
6628	mbstat.m_wait++;
6629	goto done;
6630	} else if (mbuf_cached_above(class, wait)) {
6631	mbstat.m_wait++;
6632	mcache_retry = TRUE;
6633	}
6634	done:
6635	return (mcache_retry);
6636	}
6637
6638	__attribute__((noreturn))
6639	static void
6640	mbuf_worker_thread(void)
6641	{
6642	int mbuf_expand;
6643
6644	while (`1`) {
6645	lck_mtx_lock(mbuf_mlock);
6646	mbwdog_logger("worker thread running");
6647	mbuf_worker_run_cnt++;
6648	mbuf_expand = `0`;
6649	/*
6650	* Allocations are based on page size, so if we have depleted
6651	* the reserved spaces, try to free mbufs from the major classes.
6652	*/
6653	#if PAGE_SIZE == 4096
6654	uint32_t m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
6655	uint32_t m_clusters = m_total(MC_CL);
6656	uint32_t m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
6657	uint32_t sumclusters = m_mbclusters + m_clusters + m_bigclusters;
6658	if (sumclusters >= nclusters) {
6659	mbwdog_logger("reclaiming bigcl");
6660	mbuf_drain_locked(TRUE);
6661	m_reclaim(MC_BIGCL, `4`, FALSE);
6662	}
6663	#else
6664	uint32_t m_16kclusters = m_total(MC_16KCL);
6665	if (njcl > `0` && (m_16kclusters << NCLPJCLSHIFT) >= njcl) {
6666	mbwdog_logger("reclaiming 16kcl");
6667	mbuf_drain_locked(TRUE);
6668	m_reclaim(MC_16KCL, `4`, FALSE);
6669	}
6670	#endif
6671	if (m_region_expand(MC_CL) > `0`) {
6672	int n;
6673	mb_expand_cl_cnt++;
6674	/ Adjust to current number of cluster in use /
6675	n = m_region_expand(MC_CL) -
6676	(m_total(MC_CL) - m_infree(MC_CL));
6677	if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6678	n = m_maxlimit(MC_CL) - m_total(MC_CL);
6679	if (n > `0`) {
6680	mb_expand_cl_total += n;
6681	}
6682	m_region_expand(MC_CL) = `0`;
6683
6684	if (n > `0`) {
6685	mbwdog_logger("expanding MC_CL by %d", n);
6686	freelist_populate(MC_CL, n, M_WAIT);
6687	}
6688	}
6689	if (m_region_expand(MC_BIGCL) > `0`) {
6690	int n;
6691	mb_expand_bigcl_cnt++;
6692	/ Adjust to current number of 4 KB cluster in use /
6693	n = m_region_expand(MC_BIGCL) -
6694	(m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6695	if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6696	n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
6697	if (n > `0`) {
6698	mb_expand_bigcl_total += n;
6699	}
6700	m_region_expand(MC_BIGCL) = `0`;
6701
6702	if (n > `0`) {
6703	mbwdog_logger("expanding MC_BIGCL by %d", n);
6704	freelist_populate(MC_BIGCL, n, M_WAIT);
6705	}
6706	}
6707	if (m_region_expand(MC_16KCL) > `0`) {
6708	int n;
6709	mb_expand_16kcl_cnt++;
6710	/ Adjust to current number of 16 KB cluster in use /
6711	n = m_region_expand(MC_16KCL) -
6712	(m_total(MC_16KCL) - m_infree(MC_16KCL));
6713	if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6714	n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6715	if (n > `0`) {
6716	mb_expand_16kcl_total += n;
6717	}
6718	m_region_expand(MC_16KCL) = `0`;
6719
6720	if (n > `0`) {
6721	mbwdog_logger("expanding MC_16KCL by %d", n);
6722	(void) freelist_populate(MC_16KCL, n, M_WAIT);
6723	}
6724	}
6725
6726	/*
6727	* Because we can run out of memory before filling the mbuf
6728	* map, we should not allocate more clusters than they are
6729	* mbufs -- otherwise we could have a large number of useless
6730	* clusters allocated.
6731	*/
6732	mbwdog_logger("totals: MC_MBUF %d MC_BIGCL %d MC_CL %d MC_16KCL %d",
6733	m_total(MC_MBUF), m_total(MC_BIGCL), m_total(MC_CL),
6734	m_total(MC_16KCL));
6735	uint32_t total_mbufs = m_total(MC_MBUF);
6736	uint32_t total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
6737	m_total(MC_16KCL);
6738	if (total_mbufs < total_clusters) {
6739	mbwdog_logger("expanding MC_MBUF by %d",
6740	total_clusters - total_mbufs);
6741	}
6742	while (total_mbufs < total_clusters) {
6743	mb_expand_cnt++;
6744	if (freelist_populate(MC_MBUF, `1`, M_WAIT) == `0`)
6745	break;
6746	total_mbufs = m_total(MC_MBUF);
6747	total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
6748	m_total(MC_16KCL);
6749	}
6750
6751	mbuf_worker_needs_wakeup = TRUE;
6752	/*
6753	* If there's a deadlock and we're not sending / receiving
6754	* packets, net_uptime() won't be updated. Update it here
6755	* so we are sure it's correct.
6756	*/
6757	net_update_uptime();
6758	mbuf_worker_last_runtime = net_uptime();
6759	assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
6760	THREAD_UNINT);
6761	mbwdog_logger("worker thread sleeping");
6762	lck_mtx_unlock(mbuf_mlock);
6763	(void) thread_block((thread_continue_t)mbuf_worker_thread);
6764	}
6765	}
6766
6767	__attribute__((noreturn))
6768	static void
6769	mbuf_worker_thread_init(void)
6770	{
6771	mbuf_worker_ready++;
6772	mbuf_worker_thread();
6773	}
6774
6775	static mcl_slab_t *
6776	slab_get(void *buf)
6777	{
6778	mcl_slabg_t *slg;
6779	unsigned int ix, k;
6780
6781	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6782
6783	VERIFY(MBUF_IN_MAP(buf));
6784	ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
6785	VERIFY(ix < maxslabgrp);
6786
6787	if ((slg = slabstbl[ix]) == NULL) {
6788	/*
6789	* In the current implementation, we never shrink the slabs
6790	* table; if we attempt to reallocate a cluster group when
6791	* it's already allocated, panic since this is a sign of a
6792	* memory corruption (slabstbl[ix] got nullified).
6793	*/
6794	++slabgrp;
6795	VERIFY(ix < slabgrp);
6796	/*
6797	* Slabs expansion can only be done single threaded; when
6798	* we get here, it must be as a result of m_clalloc() which
6799	* is serialized and therefore mb_clalloc_busy must be set.
6800	*/
6801	VERIFY(mb_clalloc_busy);
6802	lck_mtx_unlock(mbuf_mlock);
6803
6804	/ This is a new buffer; create the slabs group for it /
6805	MALLOC(slg, mcl_slabg_t , sizeof* (*slg), M_TEMP,
6806	M_WAITOK \| M_ZERO);
6807	MALLOC(slg->slg_slab, mcl_slab_t , sizeof(mcl_slab_t) NSLABSPMB,
6808	M_TEMP, M_WAITOK \| M_ZERO);
6809	VERIFY(slg != NULL && slg->slg_slab != NULL);
6810
6811	lck_mtx_lock(mbuf_mlock);
6812	/*
6813	* No other thread could have gone into m_clalloc() after
6814	* we dropped the lock above, so verify that it's true.
6815	*/
6816	VERIFY(mb_clalloc_busy);
6817
6818	slabstbl[ix] = slg;
6819
6820	/ Chain each slab in the group to its forward neighbor /
6821	for (k = `1`; k < NSLABSPMB; k++)
6822	slg->slg_slab[k - `1`].sl_next = &slg->slg_slab[k];
6823	VERIFY(slg->slg_slab[NSLABSPMB - `1`].sl_next == NULL);
6824
6825	/ And chain the last slab in the previous group to this /
6826	if (ix > `0`) {
6827	VERIFY(slabstbl[ix - `1`]->
6828	slg_slab[NSLABSPMB - `1`].sl_next == NULL);
6829	slabstbl[ix - `1`]->slg_slab[NSLABSPMB - `1`].sl_next =
6830	&slg->slg_slab[`0`];
6831	}
6832	}
6833
6834	ix = MTOPG(buf) % NSLABSPMB;
6835	VERIFY(ix < NSLABSPMB);
6836
6837	return (&slg->slg_slab[ix]);
6838	}
6839
6840	static void
6841	slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6842	void base, void* head, unsigned* int len, int refcnt, int chunks)
6843	{
6844	sp->sl_class = class;
6845	sp->sl_flags = flags;
6846	sp->sl_base = base;
6847	sp->sl_head = head;
6848	sp->sl_len = len;
6849	sp->sl_refcnt = refcnt;
6850	sp->sl_chunks = chunks;
6851	slab_detach(sp);
6852	}
6853
6854	static void
6855	slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6856	{
6857	VERIFY(slab_is_detached(sp));
6858	m_slab_cnt(class)++;
6859	TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6860	sp->sl_flags &= ~SLF_DETACHED;
6861
6862	/*
6863	* If a buffer spans multiple contiguous pages then mark them as
6864	* detached too
6865	*/
6866	if (class == MC_16KCL) {
6867	int k;
6868	for (k = `1`; k < NSLABSP16KB; k++) {
6869	sp = sp->sl_next;
6870	/ Next slab must already be present /
6871	VERIFY(sp != NULL && slab_is_detached(sp));
6872	sp->sl_flags &= ~SLF_DETACHED;
6873	}
6874	}
6875	}
6876
6877	static void
6878	slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6879	{
6880	int k;
6881	VERIFY(!slab_is_detached(sp));
6882	VERIFY(m_slab_cnt(class) > `0`);
6883	m_slab_cnt(class)--;
6884	TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6885	slab_detach(sp);
6886	if (class == MC_16KCL) {
6887	for (k = `1`; k < NSLABSP16KB; k++) {
6888	sp = sp->sl_next;
6889	/ Next slab must already be present /
6890	VERIFY(sp != NULL);
6891	VERIFY(!slab_is_detached(sp));
6892	slab_detach(sp);
6893	}
6894	}
6895	}
6896
6897	static boolean_t
6898	slab_inrange(mcl_slab_t sp, void* *buf)
6899	{
6900	return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6901	(uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6902	}
6903
6904	#undef panic
6905
6906	static void
6907	slab_nextptr_panic(mcl_slab_t sp, void* *addr)
6908	{
6909	int i;
6910	unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6911	uintptr_t buf = (uintptr_t)sp->sl_base;
6912
6913	for (i = `0`; i < sp->sl_chunks; i++, buf += chunk_len) {
6914	void next = ((mcache_obj_t )buf)->obj_next;
6915	if (next != addr)
6916	continue;
6917	if (!mclverify) {
6918	if (next != NULL && !MBUF_IN_MAP(next)) {
6919	mcache_t *cp = m_cache(sp->sl_class);
6920	panic("%s: %s buffer %p in slab %p modified "
6921	"after free at offset 0: %p out of range "
6922	"[%p-%p)\n", __func__, cp->mc_name,
6923	(void *)buf, sp, next, mbutl, embutl);
6924	/ NOTREACHED /
6925	}
6926	} else {
6927	mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6928	(mcache_obj_t *)buf);
6929	mcl_audit_verify_nextptr(next, mca);
6930	}
6931	}
6932	}
6933
6934	static void
6935	slab_detach(mcl_slab_t *sp)
6936	{
6937	sp->sl_link.tqe_next = (mcl_slab_t *)-`1`;
6938	sp->sl_link.tqe_prev = (mcl_slab_t **)-`1`;
6939	sp->sl_flags \|= SLF_DETACHED;
6940	}
6941
6942	static boolean_t
6943	slab_is_detached(mcl_slab_t *sp)
6944	{
6945	return ((intptr_t)sp->sl_link.tqe_next == -`1` &&
6946	(intptr_t)sp->sl_link.tqe_prev == -`1` &&
6947	(sp->sl_flags & SLF_DETACHED));
6948	}
6949
6950	static void
6951	mcl_audit_init(void buf, mcache_audit_t *mca_list,
6952	mcache_obj_t *con_list, size_t con_size, unsigned* int num)
6953	{
6954	mcache_audit_t mca, mca_tail;
6955	mcache_obj_t *con = NULL;
6956	boolean_t save_contents = (con_list != NULL);
6957	unsigned int i, ix;
6958
6959	ASSERT(num <= NMBPG);
6960	ASSERT(con_list == NULL \|\| con_size != `0`);
6961
6962	ix = MTOPG(buf);
6963	VERIFY(ix < maxclaudit);
6964
6965	/ Make sure we haven't been here before /
6966	for (i = `0`; i < num; i++)
6967	VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6968
6969	mca = mca_tail = *mca_list;
6970	if (save_contents)
6971	con = *con_list;
6972
6973	for (i = `0`; i < num; i++) {
6974	mcache_audit_t *next;
6975
6976	next = mca->mca_next;
6977	bzero(mca, sizeof (*mca));
6978	mca->mca_next = next;
6979	mclaudit[ix].cl_audit[i] = mca;
6980
6981	/ Attach the contents buffer if requested /
6982	if (save_contents) {
6983	mcl_saved_contents_t *msc =
6984	(mcl_saved_contents_t )(void* *)con;
6985
6986	VERIFY(msc != NULL);
6987	VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t)));
6988	VERIFY(con_size == sizeof (*msc));
6989	mca->mca_contents_size = con_size;
6990	mca->mca_contents = msc;
6991	con = con->obj_next;
6992	bzero(mca->mca_contents, mca->mca_contents_size);
6993	}
6994
6995	mca_tail = mca;
6996	mca = mca->mca_next;
6997	}
6998
6999	if (save_contents)
7000	*con_list = con;
7001
7002	*mca_list = mca_tail->mca_next;
7003	mca_tail->mca_next = NULL;
7004	}
7005
7006	static void
7007	mcl_audit_free(void buf, unsigned* int num)
7008	{
7009	unsigned int i, ix;
7010	mcache_audit_t mca, mca_list;
7011
7012	ix = MTOPG(buf);
7013	VERIFY(ix < maxclaudit);
7014
7015	if (mclaudit[ix].cl_audit[`0`] != NULL) {
7016	mca_list = mclaudit[ix].cl_audit[`0`];
7017	for (i = `0`; i < num; i++) {
7018	mca = mclaudit[ix].cl_audit[i];
7019	mclaudit[ix].cl_audit[i] = NULL;
7020	if (mca->mca_contents)
7021	mcache_free(mcl_audit_con_cache,
7022	mca->mca_contents);
7023	}
7024	mcache_free_ext(mcache_audit_cache,
7025	(mcache_obj_t *)mca_list);
7026	}
7027	}
7028
7029	/*
7030	* Given an address of a buffer (mbuf/2KB/4KB/16KB), return
7031	* the corresponding audit structure for that buffer.
7032	*/
7033	static mcache_audit_t *
7034	mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
7035	{
7036	mcache_audit_t *mca = NULL;
7037	int ix = MTOPG(mobj), m_idx = `0`;
7038	unsigned char *page_addr;
7039
7040	VERIFY(ix < maxclaudit);
7041	VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));
7042
7043	page_addr = PGTOM(ix);
7044
7045	switch (class) {
7046	case MC_MBUF:
7047	/*
7048	* For the mbuf case, find the index of the page
7049	* used by the mbuf and use that index to locate the
7050	* base address of the page. Then find out the
7051	* mbuf index relative to the page base and use
7052	* it to locate the audit structure.
7053	*/
7054	m_idx = MBPAGEIDX(page_addr, mobj);
7055	VERIFY(m_idx < (int)NMBPG);
7056	mca = mclaudit[ix].cl_audit[m_idx];
7057	break;
7058
7059	case MC_CL:
7060	/*
7061	* Same thing as above, but for 2KB clusters in a page.
7062	*/
7063	m_idx = CLPAGEIDX(page_addr, mobj);
7064	VERIFY(m_idx < (int)NCLPG);
7065	mca = mclaudit[ix].cl_audit[m_idx];
7066	break;
7067
7068	case MC_BIGCL:
7069	m_idx = BCLPAGEIDX(page_addr, mobj);
7070	VERIFY(m_idx < (int)NBCLPG);
7071	mca = mclaudit[ix].cl_audit[m_idx];
7072	break;
7073	case MC_16KCL:
7074	/*
7075	* Same as above, but only return the first element.
7076	*/
7077	mca = mclaudit[ix].cl_audit[`0`];
7078	break;
7079
7080	default:
7081	VERIFY(`0`);
7082	/ NOTREACHED /
7083	}
7084
7085	return (mca);
7086	}
7087
7088	static void
7089	mcl_audit_mbuf(mcache_audit_t mca, void* *addr, boolean_t composite,
7090	boolean_t alloc)
7091	{
7092	struct mbuf *m = addr;
7093	mcache_obj_t next = ((mcache_obj_t )m)->obj_next;
7094
7095	VERIFY(mca->mca_contents != NULL &&
7096	mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
7097
7098	if (mclverify)
7099	mcl_audit_verify_nextptr(next, mca);
7100
7101	if (!alloc) {
7102	/ Save constructed mbuf fields /
7103	mcl_audit_save_mbuf(m, mca);
7104	if (mclverify) {
7105	mcache_set_pattern(MCACHE_FREE_PATTERN, m,
7106	m_maxsize(MC_MBUF));
7107	}
7108	((mcache_obj_t *)m)->obj_next = next;
7109	return;
7110	}
7111
7112	/ Check if the buffer has been corrupted while in freelist /
7113	if (mclverify) {
7114	mcache_audit_free_verify_set(mca, addr, `0`, m_maxsize(MC_MBUF));
7115	}
7116	/ Restore constructed mbuf fields /
7117	mcl_audit_restore_mbuf(m, mca, composite);
7118	}
7119
7120	static void
7121	mcl_audit_restore_mbuf(struct mbuf m, mcache_audit_t mca, boolean_t composite)
7122	{
7123	struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
7124
7125	if (composite) {
7126	struct mbuf *next = m->m_next;
7127	VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL &&
7128	MBUF_IS_COMPOSITE(ms));
7129	VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
7130	/*
7131	* We could have hand-picked the mbuf fields and restore
7132	* them individually, but that will be a maintenance
7133	* headache. Instead, restore everything that was saved;
7134	* the mbuf layer will recheck and reinitialize anyway.
7135	*/
7136	bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
7137	m->m_next = next;
7138	} else {
7139	/*
7140	* For a regular mbuf (no cluster attached) there's nothing
7141	* to restore other than the type field, which is expected
7142	* to be MT_FREE.
7143	*/
7144	m->m_type = ms->m_type;
7145	}
7146	_MCHECK(m);
7147	}
7148
7149	static void
7150	mcl_audit_save_mbuf(struct mbuf m, mcache_audit_t mca)
7151	{
7152	VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
7153	_MCHECK(m);
7154	bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
7155	}
7156
7157	static void
7158	mcl_audit_cluster(mcache_audit_t mca, void* *addr, size_t size, boolean_t alloc,
7159	boolean_t save_next)
7160	{
7161	mcache_obj_t next = ((mcache_obj_t )addr)->obj_next;
7162
7163	if (!alloc) {
7164	if (mclverify) {
7165	mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
7166	}
7167	if (save_next) {
7168	mcl_audit_verify_nextptr(next, mca);
7169	((mcache_obj_t *)addr)->obj_next = next;
7170	}
7171	} else if (mclverify) {
7172	/ Check if the buffer has been corrupted while in freelist /
7173	mcl_audit_verify_nextptr(next, mca);
7174	mcache_audit_free_verify_set(mca, addr, `0`, size);
7175	}
7176	}
7177
7178	static void
7179	mcl_audit_scratch(mcache_audit_t *mca)
7180	{
7181	void *stack[MCACHE_STACK_DEPTH + `1`];
7182	mcl_scratch_audit_t *msa;
7183	struct timeval now;
7184
7185	VERIFY(mca->mca_contents != NULL);
7186	msa = MCA_SAVED_SCRATCH_PTR(mca);
7187
7188	msa->msa_pthread = msa->msa_thread;
7189	msa->msa_thread = current_thread();
7190	bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack));
7191	msa->msa_pdepth = msa->msa_depth;
7192	bzero(stack, sizeof (stack));
7193	msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + `1`) - `1`;
7194	bcopy(&stack[`1`], msa->msa_stack, sizeof (msa->msa_stack));
7195
7196	msa->msa_ptstamp = msa->msa_tstamp;
7197	microuptime(&now);
7198	/ tstamp is in ms relative to base_ts /
7199	msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / `1000`);
7200	if ((now.tv_sec - mb_start.tv_sec) > `0`)
7201	msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * `1000`);
7202	}
7203
7204	static void
7205	mcl_audit_mcheck_panic(struct mbuf *m)
7206	{
7207	mcache_audit_t *mca;
7208
7209	MRANGE(m);
7210	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7211
7212	panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
7213	m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
7214	/ NOTREACHED /
7215	}
7216
7217	static void
7218	mcl_audit_verify_nextptr(void next, mcache_audit_t mca)
7219	{
7220	if (next != NULL && !MBUF_IN_MAP(next) &&
7221	(next != (void *)MCACHE_FREE_PATTERN \|\| !mclverify)) {
7222	panic("mcl_audit: buffer %p modified after free at offset 0: "
7223	"%p out of range [%p-%p)\n%s\n",
7224	mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
7225	/ NOTREACHED /
7226	}
7227	}
7228
7229	/ This function turns on mbuf leak detection /
7230	static void
7231	mleak_activate(void)
7232	{
7233	mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
7234	PE_parse_boot_argn("mleak_sample_factor",
7235	&mleak_table.mleak_sample_factor,
7236	sizeof (mleak_table.mleak_sample_factor));
7237
7238	if (mleak_table.mleak_sample_factor == `0`)
7239	mclfindleak = `0`;
7240
7241	if (mclfindleak == `0`)
7242	return;
7243
7244	vm_size_t alloc_size =
7245	mleak_alloc_buckets * sizeof (struct mallocation);
7246	vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
7247
7248	MALLOC(mleak_allocations, struct mallocation *, alloc_size,
7249	M_TEMP, M_WAITOK \| M_ZERO);
7250	VERIFY(mleak_allocations != NULL);
7251
7252	MALLOC(mleak_traces, struct mtrace *, trace_size,
7253	M_TEMP, M_WAITOK \| M_ZERO);
7254	VERIFY(mleak_traces != NULL);
7255
7256	MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
7257	M_TEMP, M_WAITOK \| M_ZERO);
7258	VERIFY(mleak_stat != NULL);
7259	mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
7260	#ifdef __LP64__
7261	mleak_stat->ml_isaddr64 = `1`;
7262	#endif /* __LP64__ */
7263	}
7264
7265	static void
7266	mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
7267	{
7268	int temp;
7269
7270	if (mclfindleak == `0`)
7271	return;
7272
7273	if (!alloc)
7274	return (mleak_free(addr));
7275
7276	temp = atomic_add_32_ov(&mleak_table.mleak_capture, `1`);
7277
7278	if ((temp % mleak_table.mleak_sample_factor) == `0` && addr != NULL) {
7279	uintptr_t bt[MLEAK_STACK_DEPTH];
7280	int logged = backtrace(bt, MLEAK_STACK_DEPTH);
7281	mleak_log(bt, addr, logged, num);
7282	}
7283	}
7284
7285	/*
7286	* This function records the allocation in the mleak_allocations table
7287	* and the backtrace in the mleak_traces table; if allocation slot is in use,
7288	* replace old allocation with new one if the trace slot is in use, return
7289	* (or increment refcount if same trace).
7290	*/
7291	static boolean_t
7292	mleak_log(uintptr_t bt, mcache_obj_t addr, uint32_t depth, int num)
7293	{
7294	struct mallocation *allocation;
7295	struct mtrace *trace;
7296	uint32_t trace_index;
7297
7298	/ Quit if someone else modifying the tables /
7299	if (!lck_mtx_try_lock_spin(mleak_lock)) {
7300	mleak_table.total_conflicts++;
7301	return (FALSE);
7302	}
7303
7304	allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
7305	mleak_alloc_buckets)];
7306	trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
7307	trace = &mleak_traces[trace_index];
7308
7309	VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - `1`]);
7310	VERIFY(trace <= &mleak_traces[mleak_trace_buckets - `1`]);
7311
7312	allocation->hitcount++;
7313	trace->hitcount++;
7314
7315	/*
7316	* If the allocation bucket we want is occupied
7317	* and the occupier has the same trace, just bail.
7318	*/
7319	if (allocation->element != NULL &&
7320	trace_index == allocation->trace_index) {
7321	mleak_table.alloc_collisions++;
7322	lck_mtx_unlock(mleak_lock);
7323	return (TRUE);
7324	}
7325
7326	/*
7327	* Store the backtrace in the traces array;
7328	* Size of zero = trace bucket is free.
7329	*/
7330	if (trace->allocs > `0` &&
7331	bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != `0`) {
7332	/ Different, unique trace, but the same hash! Bail out. /
7333	trace->collisions++;
7334	mleak_table.trace_collisions++;
7335	lck_mtx_unlock(mleak_lock);
7336	return (TRUE);
7337	} else if (trace->allocs > `0`) {
7338	/ Same trace, already added, so increment refcount /
7339	trace->allocs++;
7340	} else {
7341	/ Found an unused trace bucket, so record the trace here /
7342	if (trace->depth != `0`) {
7343	/ this slot previously used but not currently in use /
7344	mleak_table.trace_overwrites++;
7345	}
7346	mleak_table.trace_recorded++;
7347	trace->allocs = `1`;
7348	memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
7349	trace->depth = depth;
7350	trace->collisions = `0`;
7351	}
7352
7353	/ Step 2: Store the allocation record in the allocations array /
7354	if (allocation->element != NULL) {
7355	/*
7356	* Replace an existing allocation. No need to preserve
7357	* because only a subset of the allocations are being
7358	* recorded anyway.
7359	*/
7360	mleak_table.alloc_collisions++;
7361	} else if (allocation->trace_index != `0`) {
7362	mleak_table.alloc_overwrites++;
7363	}
7364	allocation->element = addr;
7365	allocation->trace_index = trace_index;
7366	allocation->count = num;
7367	mleak_table.alloc_recorded++;
7368	mleak_table.outstanding_allocs++;
7369
7370	lck_mtx_unlock(mleak_lock);
7371	return (TRUE);
7372	}
7373
7374	static void
7375	mleak_free(mcache_obj_t *addr)
7376	{
7377	while (addr != NULL) {
7378	struct mallocation *allocation = &mleak_allocations
7379	[hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
7380
7381	if (allocation->element == addr &&
7382	allocation->trace_index < mleak_trace_buckets) {
7383	lck_mtx_lock_spin(mleak_lock);
7384	if (allocation->element == addr &&
7385	allocation->trace_index < mleak_trace_buckets) {
7386	struct mtrace *trace;
7387	trace = &mleak_traces[allocation->trace_index];
7388	/ allocs = 0 means trace bucket is unused /
7389	if (trace->allocs > `0`)
7390	trace->allocs--;
7391	if (trace->allocs == `0`)
7392	trace->depth = `0`;
7393	/ NULL element means alloc bucket is unused /
7394	allocation->element = NULL;
7395	mleak_table.outstanding_allocs--;
7396	}
7397	lck_mtx_unlock(mleak_lock);
7398	}
7399	addr = addr->obj_next;
7400	}
7401	}
7402
7403	static void
7404	mleak_sort_traces()
7405	{
7406	int i, j, k;
7407	struct mtrace *swap;
7408
7409	for(i = `0`; i < MLEAK_NUM_TRACES; i++)
7410	mleak_top_trace[i] = NULL;
7411
7412	for(i = `0`, j = `0`; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++)
7413	{
7414	if (mleak_traces[i].allocs <= `0`)
7415	continue;
7416
7417	mleak_top_trace[j] = &mleak_traces[i];
7418	for (k = j; k > `0`; k--) {
7419	if (mleak_top_trace[k]->allocs <=
7420	mleak_top_trace[k-`1`]->allocs)
7421	break;
7422
7423	swap = mleak_top_trace[k-`1`];
7424	mleak_top_trace[k-`1`] = mleak_top_trace[k];
7425	mleak_top_trace[k] = swap;
7426	}
7427	j++;
7428	}
7429
7430	j--;
7431	for(; i < mleak_trace_buckets; i++) {
7432	if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
7433	continue;
7434
7435	mleak_top_trace[j] = &mleak_traces[i];
7436
7437	for (k = j; k > `0`; k--) {
7438	if (mleak_top_trace[k]->allocs <=
7439	mleak_top_trace[k-`1`]->allocs)
7440	break;
7441
7442	swap = mleak_top_trace[k-`1`];
7443	mleak_top_trace[k-`1`] = mleak_top_trace[k];
7444	mleak_top_trace[k] = swap;
7445	}
7446	}
7447	}
7448
7449	static void
7450	mleak_update_stats()
7451	{
7452	mleak_trace_stat_t *mltr;
7453	int i;
7454
7455	VERIFY(mleak_stat != NULL);
7456	#ifdef __LP64__
7457	VERIFY(mleak_stat->ml_isaddr64);
7458	#else
7459	VERIFY(!mleak_stat->ml_isaddr64);
7460	#endif /* !__LP64__ */
7461	VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
7462
7463	mleak_sort_traces();
7464
7465	mltr = &mleak_stat->ml_trace[`0`];
7466	bzero(mltr, sizeof (mltr) MLEAK_NUM_TRACES);
7467	for (i = `0`; i < MLEAK_NUM_TRACES; i++) {
7468	int j;
7469
7470	if (mleak_top_trace[i] == NULL \|\|
7471	mleak_top_trace[i]->allocs == `0`)
7472	continue;
7473
7474	mltr->mltr_collisions = mleak_top_trace[i]->collisions;
7475	mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
7476	mltr->mltr_allocs = mleak_top_trace[i]->allocs;
7477	mltr->mltr_depth = mleak_top_trace[i]->depth;
7478
7479	VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
7480	for (j = `0`; j < mltr->mltr_depth; j++)
7481	mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
7482
7483	mltr++;
7484	}
7485	}
7486
7487	static struct mbtypes {
7488	int mt_type;
7489	const char *mt_name;
7490	} mbtypes[] = {
7491	{ MT_DATA, "data" },
7492	{ MT_OOBDATA, "oob data" },
7493	{ MT_CONTROL, "ancillary data" },
7494	{ MT_HEADER, "packet headers" },
7495	{ MT_SOCKET, "socket structures" },
7496	{ MT_PCB, "protocol control blocks" },
7497	{ MT_RTABLE, "routing table entries" },
7498	{ MT_HTABLE, "IMP host table entries" },
7499	{ MT_ATABLE, "address resolution tables" },
7500	{ MT_FTABLE, "fragment reassembly queue headers" },
7501	{ MT_SONAME, "socket names and addresses" },
7502	{ MT_SOOPTS, "socket options" },
7503	{ MT_RIGHTS, "access rights" },
7504	{ MT_IFADDR, "interface addresses" },
7505	{ MT_TAG, "packet tags" },
7506	{ `0`, NULL }
7507	};
7508
/*
 * Account for the k bytes the preceding snprintf() reported: shrink the
 * remaining space (clen) and, if nothing usable is left, jump to the
 * caller's "done" label; otherwise advance the write cursor (c).
 * Expects k, clen, c and a "done" label in the enclosing scope.
 * Wrapped in do/while(0) so it behaves as a single statement everywhere.
 */
#define	MBUF_DUMP_BUF_CHK() do {	\
	clen -= k;			\
	if (clen < 1)			\
		goto done;		\
	c += k;				\
} while (0)
7515
7516	static char *
7517	mbuf_dump(void)
7518	{
7519	unsigned long totmem = `0`, totfree = `0`, totmbufs, totused, totpct,
7520	totreturned = `0`;
7521	u_int32_t m_mbufs = `0`, m_clfree = `0`, m_bigclfree = `0`;
7522	u_int32_t m_mbufclfree = `0`, m_mbufbigclfree = `0`;
7523	u_int32_t m_16kclusters = `0`, m_16kclfree = `0`, m_mbuf16kclfree = `0`;
7524	int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
7525	uint8_t seen[`256`];
7526	struct mbtypes *mp;
7527	mb_class_stat_t *sp;
7528	mleak_trace_stat_t *mltr;
7529	char *c = mbuf_dump_buf;
7530	int i, j, k, clen = MBUF_DUMP_BUF_SIZE;
7531	bool printed_banner = false;
7532
7533	mbuf_dump_buf[`0`] = `'\0'`;
7534
7535	/ synchronize all statistics in the mbuf table /
7536	mbuf_stat_sync();
7537	mbuf_mtypes_sync(TRUE);
7538
7539	sp = &mb_stat->mbs_class[`0`];
7540	for (i = `0`; i < mb_stat->mbs_cnt; i++, sp++) {
7541	u_int32_t mem;
7542
7543	if (m_class(i) == MC_MBUF) {
7544	m_mbufs = sp->mbcl_active;
7545	} else if (m_class(i) == MC_CL) {
7546	m_clfree = sp->mbcl_total - sp->mbcl_active;
7547	} else if (m_class(i) == MC_BIGCL) {
7548	m_bigclfree = sp->mbcl_total - sp->mbcl_active;
7549	} else if (njcl > `0` && m_class(i) == MC_16KCL) {
7550	m_16kclfree = sp->mbcl_total - sp->mbcl_active;
7551	m_16kclusters = sp->mbcl_total;
7552	} else if (m_class(i) == MC_MBUF_CL) {
7553	m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
7554	} else if (m_class(i) == MC_MBUF_BIGCL) {
7555	m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
7556	} else if (njcl > `0` && m_class(i) == MC_MBUF_16KCL) {
7557	m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
7558	}
7559
7560	mem = sp->mbcl_ctotal * sp->mbcl_size;
7561	totmem += mem;
7562	totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
7563	sp->mbcl_size;
7564	totreturned += sp->mbcl_release_cnt;
7565
7566	}
7567
7568	/ adjust free counts to include composite caches /
7569	m_clfree += m_mbufclfree;
7570	m_bigclfree += m_mbufbigclfree;
7571	m_16kclfree += m_mbuf16kclfree;
7572
7573	totmbufs = `0`;
7574	for (mp = mbtypes; mp->mt_name != NULL; mp++)
7575	totmbufs += mbstat.m_mtypes[mp->mt_type];
7576	if (totmbufs > m_mbufs)
7577	totmbufs = m_mbufs;
7578	k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
7579	MBUF_DUMP_BUF_CHK();
7580
7581	bzero(&seen, sizeof (seen));
7582	for (mp = mbtypes; mp->mt_name != NULL; mp++) {
7583	if (mbstat.m_mtypes[mp->mt_type] != `0`) {
7584	seen[mp->mt_type] = `1`;
7585	k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
7586	mbstat.m_mtypes[mp->mt_type], mp->mt_name);
7587	MBUF_DUMP_BUF_CHK();
7588	}
7589	}
7590	seen[MT_FREE] = `1`;
7591	for (i = `0`; i < nmbtypes; i++)
7592	if (!seen[i] && mbstat.m_mtypes[i] != `0`) {
7593	k = snprintf(c, clen, "\t%u mbufs allocated to "
7594	"<mbuf type %d>\n", mbstat.m_mtypes[i], i);
7595	MBUF_DUMP_BUF_CHK();
7596	}
7597	if ((m_mbufs - totmbufs) > `0`) {
7598	k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
7599	m_mbufs - totmbufs);
7600	MBUF_DUMP_BUF_CHK();
7601	}
7602	k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
7603	"%u/%u mbuf 4KB clusters in use\n",
7604	(unsigned int)(mbstat.m_clusters - m_clfree),
7605	(unsigned int)mbstat.m_clusters,
7606	(unsigned int)(mbstat.m_bigclusters - m_bigclfree),
7607	(unsigned int)mbstat.m_bigclusters);
7608	MBUF_DUMP_BUF_CHK();
7609
7610	if (njcl > `0`) {
7611	k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
7612	m_16kclusters - m_16kclfree, m_16kclusters,
7613	njclbytes / `1024`);
7614	MBUF_DUMP_BUF_CHK();
7615	}
7616	totused = totmem - totfree;
7617	if (totmem == `0`) {
7618	totpct = `0`;
7619	} else if (totused < (ULONG_MAX / `100`)) {
7620	totpct = (totused * `100`) / totmem;
7621	} else {
7622	u_long totmem1 = totmem / `100`;
7623	u_long totused1 = totused / `100`;
7624	totpct = (totused1 * `100`) / totmem1;
7625	}
7626	k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
7627	"in use)\n", totmem / `1024`, totpct);
7628	MBUF_DUMP_BUF_CHK();
7629	k = snprintf(c, clen, "%lu KB returned to the system\n",
7630	totreturned / `1024`);
7631	MBUF_DUMP_BUF_CHK();
7632
7633	net_update_uptime();
7634	k = snprintf(c, clen,
7635	"VM allocation failures: contiguous %u, normal %u, one page %u\n",
7636	mb_kmem_contig_failed, mb_kmem_failed, mb_kmem_one_failed);
7637	MBUF_DUMP_BUF_CHK();
7638	if (mb_kmem_contig_failed_ts \|\| mb_kmem_failed_ts \|\|
7639	mb_kmem_one_failed_ts) {
7640	k = snprintf(c, clen,
7641	"VM allocation failure timestamps: contiguous %llu "
7642	"(size %llu), normal %llu (size %llu), one page %llu "
7643	"(now %llu)\n",
7644	mb_kmem_contig_failed_ts, mb_kmem_contig_failed_size,
7645	mb_kmem_failed_ts, mb_kmem_failed_size,
7646	mb_kmem_one_failed_ts, net_uptime());
7647	MBUF_DUMP_BUF_CHK();
7648	k = snprintf(c, clen,
7649	"VM return codes: ");
7650	MBUF_DUMP_BUF_CHK();
7651	for (i = `0`;
7652	i < sizeof(mb_kmem_stats) / sizeof(mb_kmem_stats[`0`]);
7653	i++) {
7654	k = snprintf(c, clen, "%s: %u ", mb_kmem_stats_labels[i],
7655	mb_kmem_stats[i]);
7656	MBUF_DUMP_BUF_CHK();
7657	}
7658	k = snprintf(c, clen, "\n");
7659	MBUF_DUMP_BUF_CHK();
7660	}
7661	k = snprintf(c, clen,
7662	"worker thread runs: %u, expansions: %llu, cl %llu/%llu, "
7663	"bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt,
7664	mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total,
7665	mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt,
7666	mb_expand_16kcl_total);
7667	MBUF_DUMP_BUF_CHK();
7668	if (mbuf_worker_last_runtime != `0`) {
7669	k = snprintf(c, clen, "worker thread last run time: "
7670	"%llu (%llu seconds ago)\n",
7671	mbuf_worker_last_runtime,
7672	net_uptime() - mbuf_worker_last_runtime);
7673	MBUF_DUMP_BUF_CHK();
7674	}
7675	if (mbuf_drain_last_runtime != `0`) {
7676	k = snprintf(c, clen, "drain routine last run time: "
7677	"%llu (%llu seconds ago)\n",
7678	mbuf_drain_last_runtime,
7679	net_uptime() - mbuf_drain_last_runtime);
7680	MBUF_DUMP_BUF_CHK();
7681	}
7682
7683	#if DEBUG \|\| DEVELOPMENT
7684	k = snprintf(c, clen, "\nworker thread log:\n%s\n", mbwdog_logging);
7685	MBUF_DUMP_BUF_CHK();
7686	#endif
7687
7688	for (j = `0`; j < MTRACELARGE_NUM_TRACES; j++) {
7689	struct mtracelarge *trace = &mtracelarge_table[j];
7690	if (trace->size == `0` \|\| trace->depth == `0`)
7691	continue;
7692	if (printed_banner == false) {
7693	k = snprintf(c, clen,
7694	"\nlargest allocation failure backtraces:\n");
7695	MBUF_DUMP_BUF_CHK();
7696	printed_banner = true;
7697	}
7698	k = snprintf(c, clen, "size %llu: < ", trace->size);
7699	MBUF_DUMP_BUF_CHK();
7700	for (i = `0`; i < trace->depth; i++) {
7701	if (mleak_stat->ml_isaddr64) {
7702	k = snprintf(c, clen, "0x%0llx ",
7703	(uint64_t)VM_KERNEL_UNSLIDE(
7704	trace->addr[i]));
7705	} else {
7706	k = snprintf(c, clen,
7707	"0x%08x ",
7708	(uint32_t)VM_KERNEL_UNSLIDE(
7709	trace->addr[i]));
7710	}
7711	MBUF_DUMP_BUF_CHK();
7712	}
7713	k = snprintf(c, clen, ">\n");
7714	MBUF_DUMP_BUF_CHK();
7715	}
7716
7717	/ mbuf leak detection statistics /
7718	mleak_update_stats();
7719
7720	k = snprintf(c, clen, "\nmbuf leak detection table:\n");
7721	MBUF_DUMP_BUF_CHK();
7722	k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
7723	mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
7724	mleak_table.mleak_sample_factor);
7725	MBUF_DUMP_BUF_CHK();
7726	k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
7727	mleak_table.outstanding_allocs);
7728	MBUF_DUMP_BUF_CHK();
7729	k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
7730	mleak_table.alloc_recorded, mleak_table.trace_recorded);
7731	MBUF_DUMP_BUF_CHK();
7732	k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
7733	mleak_table.alloc_collisions, mleak_table.trace_collisions);
7734	MBUF_DUMP_BUF_CHK();
7735	k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
7736	mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
7737	MBUF_DUMP_BUF_CHK();
7738	k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
7739	mleak_table.total_conflicts);
7740	MBUF_DUMP_BUF_CHK();
7741
7742	k = snprintf(c, clen, "top %d outstanding traces:\n",
7743	mleak_stat->ml_cnt);
7744	MBUF_DUMP_BUF_CHK();
7745	for (i = `0`; i < mleak_stat->ml_cnt; i++) {
7746	mltr = &mleak_stat->ml_trace[i];
7747	k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
7748	"%llu hit(s), %llu collision(s)\n", (i + `1`),
7749	mltr->mltr_allocs, mltr->mltr_hitcount,
7750	mltr->mltr_collisions);
7751	MBUF_DUMP_BUF_CHK();
7752	}
7753
7754	if (mleak_stat->ml_isaddr64)
7755	k = snprintf(c, clen, MB_LEAK_HDR_64);
7756	else
7757	k = snprintf(c, clen, MB_LEAK_HDR_32);
7758	MBUF_DUMP_BUF_CHK();
7759
7760	for (i = `0`; i < MLEAK_STACK_DEPTH; i++) {
7761	k = snprintf(c, clen, "%2d: ", (i + `1`));
7762	MBUF_DUMP_BUF_CHK();
7763	for (j = `0`; j < mleak_stat->ml_cnt; j++) {
7764	mltr = &mleak_stat->ml_trace[j];
7765	if (i < mltr->mltr_depth) {
7766	if (mleak_stat->ml_isaddr64) {
7767	k = snprintf(c, clen, "0x%0llx ",
7768	(uint64_t)VM_KERNEL_UNSLIDE(
7769	mltr->mltr_addr[i]));
7770	} else {
7771	k = snprintf(c, clen,
7772	"0x%08x ",
7773	(uint32_t)VM_KERNEL_UNSLIDE(
7774	mltr->mltr_addr[i]));
7775	}
7776	} else {
7777	if (mleak_stat->ml_isaddr64)
7778	k = snprintf(c, clen,
7779	MB_LEAK_SPACING_64);
7780	else
7781	k = snprintf(c, clen,
7782	MB_LEAK_SPACING_32);
7783	}
7784	MBUF_DUMP_BUF_CHK();
7785	}
7786	k = snprintf(c, clen, "\n");
7787	MBUF_DUMP_BUF_CHK();
7788	}
7789	done:
7790	return (mbuf_dump_buf);
7791	}
7792
7793	#undef MBUF_DUMP_BUF_CHK
7794
7795	/*
7796	* Convert between a regular and a packet header mbuf. Caller is responsible
7797	* for setting or clearing M_PKTHDR; this routine does the rest of the work.
7798	*/
7799	int
7800	m_reinit(struct mbuf m, int* hdr)
7801	{
7802	int ret = `0`;
7803
7804	if (hdr) {
7805	VERIFY(!(m->m_flags & M_PKTHDR));
7806	if (!(m->m_flags & M_EXT) &&
7807	(m->m_data != m->m_dat \|\| m->m_len > `0`)) {
7808	/*
7809	* If there's no external cluster attached and the
7810	* mbuf appears to contain user data, we cannot
7811	* safely convert this to a packet header mbuf,
7812	* as the packet header structure might overlap
7813	* with the data.
7814	*/
7815	printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
7816	"m_data %llx (expected %llx), "
7817	"m_len %d (expected 0)\n",
7818	__func__,
7819	(uint64_t)VM_KERNEL_ADDRPERM(m),
7820	(uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
7821	(uint64_t)VM_KERNEL_ADDRPERM(m->m_dat), m->m_len);
7822	ret = EBUSY;
7823	} else {
7824	VERIFY((m->m_flags & M_EXT) \|\| m->m_data == m->m_dat);
7825	m->m_flags \|= M_PKTHDR;
7826	MBUF_INIT_PKTHDR(m);
7827	}
7828	} else {
7829	/ Check for scratch area overflow /
7830	m_redzone_verify(m);
7831	/ Free the aux data and tags if there is any /
7832	m_tag_delete_chain(m, NULL);
7833	m->m_flags &= ~M_PKTHDR;
7834	}
7835
7836	return (ret);
7837	}
7838
7839	int
7840	m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
7841	{
7842	ASSERT(m->m_flags & M_EXT);
7843	return (atomic_test_set_32(&MEXT_PRIV(m), o, n));
7844	}
7845
7846	uint32_t
7847	m_ext_get_prop(struct mbuf *m)
7848	{
7849	ASSERT(m->m_flags & M_EXT);
7850	return (MEXT_PRIV(m));
7851	}
7852
/*
 * Report whether m is active.  An mbuf that is not paired is always
 * reported as active (1); a paired one is active only while its
 * MEXT_PREF count is above MEXT_MINREF.
 */
int
m_ext_paired_is_active(struct mbuf *m)
{
	if (!MBUF_IS_PAIRED(m))
		return (1);

	return (MEXT_PREF(m) > MEXT_MINREF(m));
}
7858
7859	void
7860	m_ext_paired_activate(struct mbuf *m)
7861	{
7862	struct ext_ref *rfa;
7863	int hdr, type;
7864	caddr_t extbuf;
7865	m_ext_free_func_t extfree;
7866	u_int extsize;
7867
7868	VERIFY(MBUF_IS_PAIRED(m));
7869	VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
7870	VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));
7871
7872	hdr = (m->m_flags & M_PKTHDR);
7873	type = m->m_type;
7874	extbuf = m->m_ext.ext_buf;
7875	extfree = m_get_ext_free(m);
7876	extsize = m->m_ext.ext_size;
7877	rfa = m_get_rfa(m);
7878
7879	VERIFY(extbuf != NULL && rfa != NULL);
7880
7881	/*
7882	* Safe to reinitialize packet header tags, since it's
7883	* already taken care of at m_free() time. Similar to
7884	* what's done in m_clattach() for the cluster. Bump
7885	* up MEXT_PREF to indicate activation.
7886	*/
7887	MBUF_INIT(m, hdr, type);
7888	MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
7889	`1`, `1`, `2`, EXTF_PAIRED, MEXT_PRIV(m), m);
7890	}
7891
7892	void
7893	m_scratch_init(struct mbuf *m)
7894	{
7895	struct pkthdr *pkt = &m->m_pkthdr;
7896
7897	VERIFY(m->m_flags & M_PKTHDR);
7898
7899	/ See comments in <rdar://problem/14040693> /
7900	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7901	panic_plain("Invalid attempt to modify guarded module-private "
7902	"area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7903	/ NOTREACHED /
7904	}
7905
7906	bzero(&pkt->pkt_mpriv, sizeof (pkt->pkt_mpriv));
7907	}
7908
7909	/*
7910	* This routine is reserved for mbuf_get_driver_scratch(); clients inside
7911	* xnu that intend on utilizing the module-private area should directly
7912	* refer to the pkt_mpriv structure in the pkthdr. They are also expected
7913	* to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
7914	* to handing it off to another module, respectively.
7915	*/
7916	u_int32_t
7917	m_scratch_get(struct mbuf m, u_int8_t *p)
7918	{
7919	struct pkthdr *pkt = &m->m_pkthdr;
7920
7921	VERIFY(m->m_flags & M_PKTHDR);
7922
7923	/ See comments in <rdar://problem/14040693> /
7924	if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7925	panic_plain("Invalid attempt to access guarded module-private "
7926	"area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7927	/ NOTREACHED /
7928	}
7929
7930	if (mcltrace) {
7931	mcache_audit_t *mca;
7932
7933	lck_mtx_lock(mbuf_mlock);
7934	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7935	if (mca->mca_uflags & MB_SCVALID)
7936	mcl_audit_scratch(mca);
7937	lck_mtx_unlock(mbuf_mlock);
7938	}
7939
7940	p = (u_int8_t )&pkt->pkt_mpriv;
7941	return (sizeof (pkt->pkt_mpriv));
7942	}
7943
7944	static void
7945	m_redzone_init(struct mbuf *m)
7946	{
7947	VERIFY(m->m_flags & M_PKTHDR);
7948	/*
7949	* Each mbuf has a unique red zone pattern, which is a XOR
7950	* of the red zone cookie and the address of the mbuf.
7951	*/
7952	m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7953	}
7954
7955	static void
7956	m_redzone_verify(struct mbuf *m)
7957	{
7958	u_int32_t mb_redzone;
7959
7960	VERIFY(m->m_flags & M_PKTHDR);
7961
7962	mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7963	if (m->m_pkthdr.redzone != mb_redzone) {
7964	panic("mbuf %p redzone violation with value 0x%x "
7965	"(instead of 0x%x, using cookie 0x%x)\n",
7966	m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
7967	/ NOTREACHED /
7968	}
7969	}
7970
7971	__private_extern__ inline void
7972	m_set_ext(struct mbuf m, struct* ext_ref *rfa, m_ext_free_func_t ext_free,
7973	caddr_t ext_arg)
7974	{
7975	VERIFY(m->m_flags & M_EXT);
7976	if (rfa != NULL) {
7977	m->m_ext.ext_refflags =
7978	(struct ext_ref *)(((uintptr_t)rfa) ^ mb_obscure_extref);
7979	if (ext_free != NULL) {
7980	rfa->ext_token = ((uintptr_t)&rfa->ext_token) ^
7981	mb_obscure_extfree;
7982	m->m_ext.ext_free = (m_ext_free_func_t)
7983	(((uintptr_t)ext_free) ^ rfa->ext_token);
7984	if (ext_arg != NULL) {
7985	m->m_ext.ext_arg =
7986	(caddr_t)(((uintptr_t)ext_arg) ^ rfa->ext_token);
7987	} else {
7988	m->m_ext.ext_arg = NULL;
7989	}
7990	} else {
7991	rfa->ext_token = `0`;
7992	m->m_ext.ext_free = NULL;
7993	m->m_ext.ext_arg = NULL;
7994	}
7995	} else {
7996	/*
7997	* If we are going to loose the cookie in ext_token by
7998	* resetting the rfa, we should use the global cookie
7999	* to obscure the ext_free and ext_arg pointers.
8000	*/
8001	if (ext_free != NULL) {
8002	m->m_ext.ext_free =
8003	(m_ext_free_func_t)((uintptr_t)ext_free ^
8004	mb_obscure_extfree);
8005	if (ext_arg != NULL) {
8006	m->m_ext.ext_arg =
8007	(caddr_t)((uintptr_t)ext_arg ^
8008	mb_obscure_extfree);
8009	} else {
8010	m->m_ext.ext_arg = NULL;
8011	}
8012	} else {
8013	m->m_ext.ext_free = NULL;
8014	m->m_ext.ext_arg = NULL;
8015	}
8016	m->m_ext.ext_refflags = NULL;
8017	}
8018	}
8019
8020	__private_extern__ inline struct ext_ref *
8021	m_get_rfa(struct mbuf *m)
8022	{
8023	if (m->m_ext.ext_refflags == NULL)
8024	return (NULL);
8025	else
8026	return ((struct ext_ref *)(((uintptr_t)m->m_ext.ext_refflags) ^ mb_obscure_extref));
8027	}
8028
8029	__private_extern__ inline m_ext_free_func_t
8030	m_get_ext_free(struct mbuf *m)
8031	{
8032	struct ext_ref *rfa;
8033	if (m->m_ext.ext_free == NULL)
8034	return (NULL);
8035
8036	rfa = m_get_rfa(m);
8037	if (rfa == NULL)
8038	return ((m_ext_free_func_t)((uintptr_t)m->m_ext.ext_free ^ mb_obscure_extfree));
8039	else
8040	return ((m_ext_free_func_t)(((uintptr_t)m->m_ext.ext_free)
8041	^ rfa->ext_token));
8042	}
8043
8044	__private_extern__ inline caddr_t
8045	m_get_ext_arg(struct mbuf *m)
8046	{
8047	struct ext_ref *rfa;
8048	if (m->m_ext.ext_arg == NULL)
8049	return (NULL);
8050
8051	rfa = m_get_rfa(m);
8052	if (rfa == NULL) {
8053	return ((caddr_t)((uintptr_t)m->m_ext.ext_arg ^ mb_obscure_extfree));
8054	} else {
8055	return ((caddr_t)(((uintptr_t)m->m_ext.ext_arg) ^
8056	rfa->ext_token));
8057	}
8058	}
8059
8060	/*
8061	* Send a report of mbuf usage if the usage is at least 6% of max limit
8062	* or if there has been at least 3% increase since the last report.
8063	*
8064	* The values 6% and 3% are chosen so that we can do simple arithmetic
8065	* with shift operations.
8066	*/
8067	static boolean_t
8068	mbuf_report_usage(mbuf_class_t cl)
8069	{
8070	/ if a report is already in progress, nothing to do /
8071	if (mb_peak_newreport)
8072	return (TRUE);
8073
8074	if (m_total(cl) > m_peak(cl) &&
8075	m_total(cl) >= (m_maxlimit(cl) >> `4`) &&
8076	(m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> `5`))
8077	return (TRUE);
8078	return (FALSE);
8079	}
8080
8081	__private_extern__ void
8082	mbuf_report_peak_usage(void)
8083	{
8084	int i = `0`;
8085	u_int64_t uptime;
8086	struct nstat_sysinfo_data ns_data;
8087	uint32_t memreleased = `0`;
8088	static uint32_t prevmemreleased;
8089
8090	uptime = net_uptime();
8091	lck_mtx_lock(mbuf_mlock);
8092
8093	/ Generate an initial report after 1 week of uptime /
8094	if (!mb_peak_firstreport &&
8095	uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
8096	mb_peak_newreport = TRUE;
8097	mb_peak_firstreport = TRUE;
8098	}
8099
8100	if (!mb_peak_newreport) {
8101	lck_mtx_unlock(mbuf_mlock);
8102	return;
8103	}
8104
8105	/*
8106	* Since a report is being generated before 1 week,
8107	* we do not need to force another one later
8108	*/
8109	if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD)
8110	mb_peak_firstreport = TRUE;
8111
8112	for (i = `0`; i < NELEM(mbuf_table); i++) {
8113	m_peak(m_class(i)) = m_total(m_class(i));
8114	memreleased += m_release_cnt(i);
8115	}
8116	memreleased = memreleased - prevmemreleased;
8117	prevmemreleased = memreleased;
8118	mb_peak_newreport = FALSE;
8119	lck_mtx_unlock(mbuf_mlock);
8120
8121	bzero(&ns_data, sizeof(ns_data));
8122	ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
8123	ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
8124	ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
8125	ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
8126	ns_data.u.mb_stats.total_16kb = m_peak(MC_16KCL);
8127	ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
8128	ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
8129	ns_data.u.mb_stats.draincnt = mbstat.m_drain;
8130	ns_data.u.mb_stats.memreleased = memreleased;
8131	ns_data.u.mb_stats.sbmb_floor = total_sbmb_cnt_floor;
8132
8133	nstat_sysinfo_send_data(&ns_data);
8134
8135	/*
8136	* Reset the floor whenever we report a new
8137	* peak to track the trend (increase peek usage
8138	* is not a leak if mbufs get released
8139	* between reports and the floor stays low)
8140	*/
8141	total_sbmb_cnt_floor = total_sbmb_cnt_peak;
8142	}
8143
8144	/*
8145	* Simple routine to avoid taking the lock when we can't run the
8146	* mbuf drain.
8147	*/
8148	static int
8149	mbuf_drain_checks(boolean_t ignore_waiters)
8150	{
8151
8152	if (mb_drain_maxint == `0`)
8153	return `0`;
8154	if (!ignore_waiters && mb_waiters != `0`)
8155	return `0`;
8156
8157	return `1`;
8158	}
8159
8160	/*
8161	* Called by the VM when there's memory pressure or when we exhausted
8162	* the 4k/16k reserved space.
8163	*/
8164	static void
8165	mbuf_drain_locked(boolean_t ignore_waiters)
8166	{
8167	mbuf_class_t mc;
8168	mcl_slab_t sp, sp_tmp, *nsp;
8169	unsigned int num, k, interval, released = `0`;
8170	unsigned long total_mem = `0`, use_mem = `0`;
8171	boolean_t ret, purge_caches = FALSE;
8172	ppnum_t offset;
8173	mcache_obj_t *obj;
8174	unsigned long per;
8175	static unsigned char scratch[`32`];
8176	static ppnum_t scratch_pa = `0`;
8177
8178	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
8179	if (!mbuf_drain_checks(ignore_waiters))
8180	return;
8181	if (scratch_pa == `0`) {
8182	bzero(scratch, sizeof(scratch));
8183	scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
8184	VERIFY(scratch_pa);
8185	} else if (mclverify) {
8186	/*
8187	* Panic if a driver wrote to our scratch memory.
8188	*/
8189	for (k = `0`; k < sizeof(scratch); k++)
8190	if (scratch[k])
8191	panic("suspect DMA to freed address");
8192	}
8193	/*
8194	* Don't free memory too often as that could cause excessive
8195	* waiting times for mbufs. Purge caches if we were asked to drain
8196	* in the last 5 minutes.
8197	*/
8198	if (mbuf_drain_last_runtime != `0`) {
8199	interval = net_uptime() - mbuf_drain_last_runtime;
8200	if (interval <= mb_drain_maxint) {
8201	return;
8202	}
8203	if (interval <= mb_drain_maxint * `5`)
8204	purge_caches = TRUE;
8205	}
8206	mbuf_drain_last_runtime = net_uptime();
8207	/*
8208	* Don't free any memory if we're using 60% or more.
8209	*/
8210	for (mc = `0`; mc < NELEM(mbuf_table); mc++) {
8211	total_mem += m_total(mc) * m_maxsize(mc);
8212	use_mem += m_active(mc) * m_maxsize(mc);
8213	}
8214	per = (use_mem * `100`) / total_mem;
8215	if (per >= `60`) {
8216	return;
8217	}
8218	/*
8219	* Purge all the caches. This effectively disables
8220	* caching for a few seconds, but the mbuf worker thread will
8221	* re-enable them again.
8222	*/
8223	if (purge_caches == TRUE)
8224	for (mc = `0`; mc < NELEM(mbuf_table); mc++) {
8225	if (m_total(mc) < m_avgtotal(mc))
8226	continue;
8227	lck_mtx_unlock(mbuf_mlock);
8228	ret = mcache_purge_cache(m_cache(mc), FALSE);
8229	lck_mtx_lock(mbuf_mlock);
8230	if (ret == TRUE)
8231	m_purge_cnt(mc)++;
8232	}
8233	/*
8234	* Move the objects from the composite class freelist to
8235	* the rudimentary slabs list, but keep at least 10% of the average
8236	* total in the freelist.
8237	*/
8238	for (mc = `0`; mc < NELEM(mbuf_table); mc++) {
8239	while (m_cobjlist(mc) &&
8240	m_total(mc) < m_avgtotal(mc) &&
8241	m_infree(mc) > `0.1` * m_avgtotal(mc) + m_minlimit(mc)) {
8242	obj = m_cobjlist(mc);
8243	m_cobjlist(mc) = obj->obj_next;
8244	obj->obj_next = NULL;
8245	num = cslab_free(mc, obj, `1`);
8246	VERIFY(num == `1`);
8247	m_free_cnt(mc)++;
8248	m_infree(mc)--;
8249	/ cslab_free() handles m_total /
8250	}
8251	}
8252	/*
8253	* Free the buffers present in the slab list up to 10% of the total
8254	* average per class.
8255	*
8256	* We walk the list backwards in an attempt to reduce fragmentation.
8257	*/
8258	for (mc = NELEM(mbuf_table) - `1`; (int)mc >= `0`; mc--) {
8259	TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
8260	/*
8261	* Process only unused slabs occupying memory.
8262	*/
8263	if (sp->sl_refcnt != `0` \|\| sp->sl_len == `0` \|\|
8264	sp->sl_base == NULL)
8265	continue;
8266	if (m_total(mc) < m_avgtotal(mc) \|\|
8267	m_infree(mc) < `0.1` * m_avgtotal(mc) + m_minlimit(mc))
8268	break;
8269	slab_remove(sp, mc);
8270	switch (mc) {
8271	case MC_MBUF:
8272	m_infree(mc) -= NMBPG;
8273	m_total(mc) -= NMBPG;
8274	if (mclaudit != NULL)
8275	mcl_audit_free(sp->sl_base, NMBPG);
8276	break;
8277	case MC_CL:
8278	m_infree(mc) -= NCLPG;
8279	m_total(mc) -= NCLPG;
8280	if (mclaudit != NULL)
8281	mcl_audit_free(sp->sl_base, NMBPG);
8282	break;
8283	case MC_BIGCL:
8284	{
8285	m_infree(mc) -= NBCLPG;
8286	m_total(mc) -= NBCLPG;
8287	if (mclaudit != NULL)
8288	mcl_audit_free(sp->sl_base, NMBPG);
8289	break;
8290	}
8291	case MC_16KCL:
8292	m_infree(mc)--;
8293	m_total(mc)--;
8294	for (nsp = sp, k = `1`; k < NSLABSP16KB; k++) {
8295	nsp = nsp->sl_next;
8296	VERIFY(nsp->sl_refcnt == `0` &&
8297	nsp->sl_base != NULL &&
8298	nsp->sl_len == `0`);
8299	slab_init(nsp, `0`, `0`, NULL, NULL, `0`, `0`,
8300	`0`);
8301	nsp->sl_flags = `0`;
8302	}
8303	if (mclaudit != NULL) {
8304	if (sp->sl_len == PAGE_SIZE) {
8305	mcl_audit_free(sp->sl_base,
8306	NMBPG);
8307	} else {
8308	mcl_audit_free(sp->sl_base, `1`);
8309	}
8310	}
8311	break;
8312	default:
8313	/*
8314	* The composite classes have their own
8315	* freelist (m_cobjlist), so we only
8316	* process rudimentary classes here.
8317	*/
8318	VERIFY(`0`);
8319	}
8320	m_release_cnt(mc) += m_size(mc);
8321	released += m_size(mc);
8322	VERIFY(sp->sl_base != NULL &&
8323	sp->sl_len >= PAGE_SIZE);
8324	offset = MTOPG(sp->sl_base);
8325	/*
8326	* Make sure the IOMapper points to a valid, but
8327	* bogus, address. This should prevent further DMA
8328	* accesses to freed memory.
8329	*/
8330	IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
8331	mcl_paddr[offset] = `0`;
8332	kmem_free(mb_map, (vm_offset_t)sp->sl_base,
8333	sp->sl_len);
8334	slab_init(sp, `0`, `0`, NULL, NULL, `0`, `0`, `0`);
8335	sp->sl_flags = `0`;
8336	}
8337	}
8338	mbstat.m_drain++;
8339	mbstat.m_bigclusters = m_total(MC_BIGCL);
8340	mbstat.m_clusters = m_total(MC_CL);
8341	mbstat.m_mbufs = m_total(MC_MBUF);
8342	mbuf_stat_sync();
8343	mbuf_mtypes_sync(TRUE);
8344	}
8345
8346	__private_extern__ void
8347	mbuf_drain(boolean_t ignore_waiters)
8348	{
8349	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_NOTOWNED);
8350	if (!mbuf_drain_checks(ignore_waiters))
8351	return;
8352	lck_mtx_lock(mbuf_mlock);
8353	mbuf_drain_locked(ignore_waiters);
8354	lck_mtx_unlock(mbuf_mlock);
8355	}
8356
8357
8358	static int
8359	m_drain_force_sysctl SYSCTL_HANDLER_ARGS
8360	{
8361	#pragma unused(arg1, arg2)
8362	int val = `0`, err;
8363
8364	err = sysctl_handle_int(oidp, &val, `0`, req);
8365	if (err != `0` \|\| req->newptr == USER_ADDR_NULL)
8366	return (err);
8367	if (val) {
8368	mbuf_drain(TRUE);
8369	}
8370
8371	return (err);
8372	}
8373
8374	#if DEBUG \|\| DEVELOPMENT
8375	static void
8376	_mbwdog_logger(const char func, const* int line, const char *fmt, ...)
8377	{
8378	va_list ap;
8379	struct timeval now;
8380	char str[`384`], p[`256`];
8381	int len;
8382
8383	LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
8384	if (mbwdog_logging == NULL) {
8385	mbwdog_logging = _MALLOC(mbwdog_logging_size,
8386	M_TEMP, M_ZERO\|M_NOWAIT);
8387	if (mbwdog_logging == NULL)
8388	return;
8389	}
8390	va_start(ap, fmt);
8391	vsnprintf(p, sizeof(p), fmt, ap);
8392	va_end(ap);
8393	microuptime(&now);
8394	len = snprintf(str, sizeof(str),
8395	"\n%ld.%d (%d/%llx) %s:%d %s",
8396	now.tv_sec, now.tv_usec,
8397	current_proc()->p_pid,
8398	(uint64_t)VM_KERNEL_ADDRPERM(current_thread()),
8399	func, line, p);
8400	if (len < `0`)
8401	return;
8402	if (mbwdog_logging_used + len > mbwdog_logging_size) {
8403	mbwdog_logging_used = mbwdog_logging_used / `2`;
8404	memmove(mbwdog_logging, mbwdog_logging + mbwdog_logging_used,
8405	mbwdog_logging_size - mbwdog_logging_used);
8406	mbwdog_logging[mbwdog_logging_used] = `0`;
8407	}
8408	strlcat(mbwdog_logging, str, mbwdog_logging_size);
8409	mbwdog_logging_used += len;
8410	}
8411
8412	static int
8413	sysctl_mbwdog_log SYSCTL_HANDLER_ARGS
8414	{
8415	#pragma unused(oidp, arg1, arg2)
8416	return SYSCTL_OUT(req, mbwdog_logging, mbwdog_logging_used);
8417	}
8418	SYSCTL_DECL(_kern_ipc);
8419	SYSCTL_PROC(_kern_ipc, OID_AUTO, mbwdog_log,
8420	CTLTYPE_STRING \| CTLFLAG_RD \| CTLFLAG_LOCKED,
8421	`0`, `0`, sysctl_mbwdog_log, "A", "");
8422
8423	static int mbtest_val;
8424	static int mbtest_running;
8425
8426	static void mbtest_thread(__unused void *arg)
8427	{
8428	int i;
8429	int scale_down = `1`;
8430	int iterations = `250`;
8431	int allocations = nmbclusters;
8432	iterations = iterations / scale_down;
8433	allocations = allocations / scale_down;
8434	printf("%s thread starting\n", __func__);
8435	for (i = `0`; i < iterations; i++) {
8436	unsigned int needed = allocations;
8437	struct mbuf m1, m2, *m3;
8438
8439	if (njcl > `0`) {
8440	needed = allocations;
8441	m3 = m_getpackets_internal(&needed, `0`, M_DONTWAIT, `0`, M16KCLBYTES);
8442	m_freem_list(m3);
8443	}
8444
8445	needed = allocations;
8446	m2 = m_getpackets_internal(&needed, `0`, M_DONTWAIT, `0`, MBIGCLBYTES);
8447	m_freem_list(m2);
8448
8449	m1 = m_getpackets_internal(&needed, `0`, M_DONTWAIT, `0`, MCLBYTES);
8450	m_freem_list(m1);
8451	}
8452
8453	printf("%s thread ending\n", __func__);
8454
8455	OSDecrementAtomic(&mbtest_running);
8456	wakeup_one((caddr_t)&mbtest_running);
8457	}
8458
8459	static void sysctl_mbtest(void)
8460	{
8461	/ We launch three threads - wait for all of them /
8462	OSIncrementAtomic(&mbtest_running);
8463	OSIncrementAtomic(&mbtest_running);
8464	OSIncrementAtomic(&mbtest_running);
8465
8466	thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, `10`);
8467	thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, `10`);
8468	thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, `10`);
8469
8470	while (mbtest_running) {
8471	msleep((caddr_t)&mbtest_running, NULL, PUSER, "mbtest_running", NULL);
8472	}
8473	}
8474
8475	static int
8476	mbtest SYSCTL_HANDLER_ARGS
8477	{
8478	#pragma unused(arg1, arg2)
8479	int error = `0`, val, oldval = mbtest_val;
8480
8481	val = oldval;
8482	error = sysctl_handle_int(oidp, &val, `0`, req);
8483	if (error \|\| !req->newptr)
8484	return (error);
8485
8486	if (val != oldval)
8487	sysctl_mbtest();
8488
8489	mbtest_val = val;
8490
8491	return (error);
8492	}
#endif // DEBUG || DEVELOPMENT
8494
8495	static void
8496	mtracelarge_register(size_t size)
8497	{
8498	int i;
8499	struct mtracelarge *trace;
8500	uintptr_t bt[MLEAK_STACK_DEPTH];
8501	unsigned int depth;
8502
8503	depth = backtrace(bt, MLEAK_STACK_DEPTH);
8504	/ Check if this entry is already on the list. /
8505	for (i = `0`; i < MTRACELARGE_NUM_TRACES; i++) {
8506	trace = &mtracelarge_table[i];
8507	if (trace->size == size && trace->depth == depth &&
8508	memcmp(bt, trace->addr, depth * sizeof(uintptr_t)) == `0`) {
8509	return;
8510	}
8511
8512	}
8513	for (i = `0`; i < MTRACELARGE_NUM_TRACES; i++) {
8514	trace = &mtracelarge_table[i];
8515	if (size > trace->size) {
8516	trace->depth = depth;
8517	memcpy(trace->addr, bt, depth * sizeof(uintptr_t));
8518	trace->size = size;
8519	break;
8520	}
8521	}
8522	}
8523
8524	SYSCTL_DECL(_kern_ipc);
8525	#if DEBUG \|\| DEVELOPMENT
8526	SYSCTL_PROC(_kern_ipc, OID_AUTO, mbtest,
8527	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED, &mbtest_val, `0`, &mbtest, "I",
8528	"Toggle to test mbufs");
8529	#endif
8530	SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
8531	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED,
8532	`0`, `0`, mbstat_sysctl, "S,mbstat", "");
8533	SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
8534	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED,
8535	`0`, `0`, mb_stat_sysctl, "S,mb_stat", "");
8536	SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
8537	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED,
8538	`0`, `0`, mleak_top_trace_sysctl, "S,mb_top_trace", "");
8539	SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
8540	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED,
8541	`0`, `0`, mleak_table_sysctl, "S,mleak_table", "");
8542	SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
8543	CTLFLAG_RW \| CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, `0`, "");
8544	SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
8545	CTLFLAG_RD \| CTLFLAG_LOCKED, &mb_normalized, `0`, "");
8546	SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
8547	CTLFLAG_RW \| CTLFLAG_LOCKED, &mb_watchdog, `0`, "");
8548	SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
8549	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED, NULL, `0`,
8550	m_drain_force_sysctl, "I",
8551	"Forces the mbuf garbage collection to run");
8552	SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
8553	CTLFLAG_RW \| CTLFLAG_LOCKED, &mb_drain_maxint, `0`,
8554	"Minimum time interval between garbage collection");
8555

Browse the source code of codebrowser/bsd/kern/uipc_mbuf.c