regcomp.c source code [glibc/posix/regcomp.c]

1	/ Extended regular expression matching and search library.*
2	Copyright (C) 2002-2022 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, see
18	<https://www.gnu.org/licenses/>. /*
19
20	#ifdef _LIBC
21	# include <locale/weight.h>
22	#endif
23
24	static reg_errcode_t re_compile_internal (regex_t preg, const* char * pattern,
25	size_t length, reg_syntax_t syntax);
26	static void re_compile_fastmap_iter (regex_t *bufp,
27	const re_dfastate_t *init_state,
28	char *fastmap);
29	static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30	#ifdef RE_ENABLE_I18N
31	static void free_charset (re_charset_t *cset);
32	#endif /* RE_ENABLE_I18N */
33	static void free_workarea_compile (regex_t *preg);
34	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35	#ifdef RE_ENABLE_I18N
36	static void optimize_utf8 (re_dfa_t *dfa);
37	#endif
38	static reg_errcode_t analyze (regex_t *preg);
39	static reg_errcode_t preorder (bin_tree_t *root,
40	reg_errcode_t (fn (void , bin_tree_t )),
41	void *extra);
42	static reg_errcode_t postorder (bin_tree_t *root,
43	reg_errcode_t (fn (void , bin_tree_t )),
44	void *extra);
45	static reg_errcode_t optimize_subexps (void extra, bin_tree_t node);
46	static reg_errcode_t lower_subexps (void extra, bin_tree_t node);
47	static bin_tree_t lower_subexp (reg_errcode_t err, regex_t *preg,
48	bin_tree_t *node);
49	static reg_errcode_t calc_first (void extra, bin_tree_t node);
50	static reg_errcode_t calc_next (void extra, bin_tree_t node);
51	static reg_errcode_t link_nfa_nodes (void extra, bin_tree_t node);
52	static Idx duplicate_node (re_dfa_t dfa, Idx org_idx, unsigned* int constraint);
53	static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54	unsigned int constraint);
55	static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56	static reg_errcode_t calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa,
57	Idx node, bool root);
58	static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59	static Idx fetch_number (re_string_t input, re_token_t token,
60	reg_syntax_t syntax);
61	static int peek_token (re_token_t token, re_string_t input,
62	reg_syntax_t syntax);
63	static bin_tree_t parse (re_string_t regexp, regex_t *preg,
64	reg_syntax_t syntax, reg_errcode_t *err);
65	static bin_tree_t parse_reg_exp (re_string_t regexp, regex_t *preg,
66	re_token_t *token, reg_syntax_t syntax,
67	Idx nest, reg_errcode_t *err);
68	static bin_tree_t parse_branch (re_string_t regexp, regex_t *preg,
69	re_token_t *token, reg_syntax_t syntax,
70	Idx nest, reg_errcode_t *err);
71	static bin_tree_t parse_expression (re_string_t regexp, regex_t *preg,
72	re_token_t *token, reg_syntax_t syntax,
73	Idx nest, reg_errcode_t *err);
74	static bin_tree_t parse_sub_exp (re_string_t regexp, regex_t *preg,
75	re_token_t *token, reg_syntax_t syntax,
76	Idx nest, reg_errcode_t *err);
77	static bin_tree_t parse_dup_op (bin_tree_t dup_elem, re_string_t *regexp,
78	re_dfa_t dfa, re_token_t token,
79	reg_syntax_t syntax, reg_errcode_t *err);
80	static bin_tree_t parse_bracket_exp (re_string_t regexp, re_dfa_t *dfa,
81	re_token_t *token, reg_syntax_t syntax,
82	reg_errcode_t *err);
83	static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84	re_string_t *regexp,
85	re_token_t token, int* token_len,
86	re_dfa_t *dfa,
87	reg_syntax_t syntax,
88	bool accept_hyphen);
89	static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90	re_string_t *regexp,
91	re_token_t *token);
92	#ifdef RE_ENABLE_I18N
93	static reg_errcode_t build_equiv_class (bitset_t sbcset,
94	re_charset_t *mbcset,
95	Idx *equiv_class_alloc,
96	const unsigned char *name);
97	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98	bitset_t sbcset,
99	re_charset_t *mbcset,
100	Idx *char_class_alloc,
101	const char *class_name,
102	reg_syntax_t syntax);
103	#else /* not RE_ENABLE_I18N */
104	static reg_errcode_t build_equiv_class (bitset_t sbcset,
105	const unsigned char *name);
106	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107	bitset_t sbcset,
108	const char *class_name,
109	reg_syntax_t syntax);
110	#endif /* not RE_ENABLE_I18N */
111	static bin_tree_t build_charclass_op (re_dfa_t dfa,
112	RE_TRANSLATE_TYPE trans,
113	const char *class_name,
114	const char *extra,
115	bool non_match, reg_errcode_t *err);
116	static bin_tree_t create_tree (re_dfa_t dfa,
117	bin_tree_t left, bin_tree_t right,
118	re_token_type_t type);
119	static bin_tree_t create_token_tree (re_dfa_t dfa,
120	bin_tree_t left, bin_tree_t right,
121	const re_token_t *token);
122	static bin_tree_t duplicate_tree (const* bin_tree_t src, re_dfa_t dfa);
123	static void free_token (re_token_t *node);
124	static reg_errcode_t free_tree (void extra, bin_tree_t node);
125	static reg_errcode_t mark_opt_subexp (void extra, bin_tree_t node);
126
127	/ This table gives an error message for each of the error codes listed*
128	in regex.h. Obviously the order here has to be same as there.
129	POSIX doesn't require that we do anything for REG_NOERROR,
130	but why not be nice? /*
131
132	static const char __re_error_msgid[] =
133	{
134	#define REG_NOERROR_IDX 0
135	gettext_noop ("Success") / REG_NOERROR /
136	"\0"
137	#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138	gettext_noop ("No match") / REG_NOMATCH /
139	"\0"
140	#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141	gettext_noop ("Invalid regular expression") / REG_BADPAT /
142	"\0"
143	#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144	gettext_noop ("Invalid collation character") / REG_ECOLLATE /
145	"\0"
146	#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147	gettext_noop ("Invalid character class name") / REG_ECTYPE /
148	"\0"
149	#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150	gettext_noop ("Trailing backslash") / REG_EESCAPE /
151	"\0"
152	#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153	gettext_noop ("Invalid back reference") / REG_ESUBREG /
154	"\0"
155	#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156	gettext_noop ("Unmatched [, [^, [:, [., or [=") / REG_EBRACK /
157	"\0"
158	#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159	gettext_noop ("Unmatched ( or \\(") / REG_EPAREN /
160	"\0"
161	#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162	gettext_noop ("Unmatched \\{") / REG_EBRACE /
163	"\0"
164	#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165	gettext_noop ("Invalid content of \\{\\}") / REG_BADBR /
166	"\0"
167	#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168	gettext_noop ("Invalid range end") / REG_ERANGE /
169	"\0"
170	#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171	gettext_noop ("Memory exhausted") / REG_ESPACE /
172	"\0"
173	#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174	gettext_noop ("Invalid preceding regular expression") / REG_BADRPT /
175	"\0"
176	#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177	gettext_noop ("Premature end of regular expression") / REG_EEND /
178	"\0"
179	#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180	gettext_noop ("Regular expression too big") / REG_ESIZE /
181	"\0"
182	#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183	gettext_noop ("Unmatched ) or \\)") / REG_ERPAREN /
184	};
185
186	static const size_t __re_error_msgid_idx[] =
187	{
188	REG_NOERROR_IDX,
189	REG_NOMATCH_IDX,
190	REG_BADPAT_IDX,
191	REG_ECOLLATE_IDX,
192	REG_ECTYPE_IDX,
193	REG_EESCAPE_IDX,
194	REG_ESUBREG_IDX,
195	REG_EBRACK_IDX,
196	REG_EPAREN_IDX,
197	REG_EBRACE_IDX,
198	REG_BADBR_IDX,
199	REG_ERANGE_IDX,
200	REG_ESPACE_IDX,
201	REG_BADRPT_IDX,
202	REG_EEND_IDX,
203	REG_ESIZE_IDX,
204	REG_ERPAREN_IDX
205	};
206
207	/ Entry points for GNU code. /
208
209	/ re_compile_pattern is the GNU regular expression compiler: it*
210	compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211	Returns 0 if the pattern was valid, otherwise an error string.
212
213	Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214	are set in BUFP on entry. /*
215
216	const char *
217	re_compile_pattern (const char *pattern, size_t length,
218	struct re_pattern_buffer *bufp)
219	{
220	reg_errcode_t ret;
221
222	/ And GNU code determines whether or not to get register information*
223	by passing null for the REGS argument to re_match, etc., not by
224	setting no_sub, unless RE_NO_SUB is set. /*
225	bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226
227	/ Match anchors at newline. /
228	bufp->newline_anchor = `1`;
229
230	ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231
232	if (!ret)
233	return NULL;
234	return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235	}
236	weak_alias (__re_compile_pattern, re_compile_pattern)
237
238	/ Set by 're_set_syntax' to the current regexp syntax to recognize. Can*
239	also be assigned to arbitrarily: each pattern buffer stores its own
240	syntax, so it can be changed between regex compilations. /*
241	/ This has no initializer because initialized variables in Emacs*
242	become read-only after dumping. /*
243	reg_syntax_t re_syntax_options;
244
245
246	/ Specify the precise syntax of regexps for compilation. This provides*
247	for compatibility for various utilities which historically have
248	different, incompatible syntaxes.
249
250	The argument SYNTAX is a bit mask comprised of the various bits
251	defined in regex.h. We return the old syntax. /*
252
253	reg_syntax_t
254	re_set_syntax (reg_syntax_t syntax)
255	{
256	reg_syntax_t ret = re_syntax_options;
257
258	re_syntax_options = syntax;
259	return ret;
260	}
261	weak_alias (__re_set_syntax, re_set_syntax)
262
263	int
264	re_compile_fastmap (struct re_pattern_buffer *bufp)
265	{
266	re_dfa_t *dfa = bufp->buffer;
267	char *fastmap = bufp->fastmap;
268
269	memset (fastmap, `'\0'`, sizeof (char) * SBC_MAX);
270	re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271	if (dfa->init_state != dfa->init_state_word)
272	re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273	if (dfa->init_state != dfa->init_state_nl)
274	re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275	if (dfa->init_state != dfa->init_state_begbuf)
276	re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277	bufp->fastmap_accurate = `1`;
278	return `0`;
279	}
280	weak_alias (__re_compile_fastmap, re_compile_fastmap)
281
282	static inline void
283	__attribute__ ((always_inline))
284	re_set_fastmap (char fastmap, bool icase, int* ch)
285	{
286	fastmap[ch] = `1`;
287	if (icase)
288	fastmap[tolower (ch)] = `1`;
289	}
290
291	/ Helper function for re_compile_fastmap.*
292	Compile fastmap for the initial_state INIT_STATE. /*
293
294	static void
295	re_compile_fastmap_iter (regex_t bufp, const* re_dfastate_t *init_state,
296	char *fastmap)
297	{
298	re_dfa_t *dfa = bufp->buffer;
299	Idx node_cnt;
300	bool icase = (dfa->mb_cur_max == `1` && (bufp->syntax & RE_ICASE));
301	for (node_cnt = `0`; node_cnt < init_state->nodes.nelem; ++node_cnt)
302	{
303	Idx node = init_state->nodes.elems[node_cnt];
304	re_token_type_t type = dfa->nodes[node].type;
305
306	if (type == CHARACTER)
307	{
308	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
309	#ifdef RE_ENABLE_I18N
310	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > `1`)
311	{
312	unsigned char buf[MB_LEN_MAX];
313	unsigned char *p;
314	wchar_t wc;
315	mbstate_t state;
316
317	p = buf;
318	*p++ = dfa->nodes[node].opr.c;
319	while (++node < dfa->nodes_len
320	&& dfa->nodes[node].type == CHARACTER
321	&& dfa->nodes[node].mb_partial)
322	*p++ = dfa->nodes[node].opr.c;
323	memset (&state, `'\0'`, sizeof (state));
324	if (__mbrtowc (&wc, (const char *) buf, p - buf,
325	&state) == p - buf
326	&& (__wcrtomb ((char *) buf, __towlower (wc), &state)
327	!= (size_t) -`1`))
328	re_set_fastmap (fastmap, false, buf[`0`]);
329	}
330	#endif
331	}
332	else if (type == SIMPLE_BRACKET)
333	{
334	int i, ch;
335	for (i = `0`, ch = `0`; i < BITSET_WORDS; ++i)
336	{
337	int j;
338	bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
339	for (j = `0`; j < BITSET_WORD_BITS; ++j, ++ch)
340	if (w & ((bitset_word_t) `1` << j))
341	re_set_fastmap (fastmap, icase, ch);
342	}
343	}
344	#ifdef RE_ENABLE_I18N
345	else if (type == COMPLEX_BRACKET)
346	{
347	re_charset_t *cset = dfa->nodes[node].opr.mbcset;
348	Idx i;
349
350	# ifdef _LIBC
351	/ See if we have to try all bytes which start multiple collation*
352	elements.
353	e.g. In da_DK, we want to catch 'a' since "aa" is a valid
354	collation element, and don't catch 'b' since 'b' is
355	the only collation element which starts from 'b' (and
356	it is caught by SIMPLE_BRACKET). /*
357	if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != `0`
358	&& (cset->ncoll_syms \|\| cset->nranges))
359	{
360	const int32_t table = (const* int32_t *)
361	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362	for (i = `0`; i < SBC_MAX; ++i)
363	if (table[i] < `0`)
364	re_set_fastmap (fastmap, icase, i);
365	}
366	# endif /* _LIBC */
367
368	/ See if we have to start the match at all multibyte characters,*
369	i.e. where we would not find an invalid sequence. This only
370	applies to multibyte character sets; for single byte character
371	sets, the SIMPLE_BRACKET again suffices. /*
372	if (dfa->mb_cur_max > `1`
373	&& (cset->nchar_classes \|\| cset->non_match \|\| cset->nranges
374	# ifdef _LIBC
375	\|\| cset->nequiv_classes
376	# endif /* _LIBC */
377	))
378	{
379	unsigned char c = `0`;
380	do
381	{
382	mbstate_t mbs;
383	memset (&mbs, `0`, sizeof (mbs));
384	if (__mbrtowc (NULL, (char *) &c, `1`, &mbs) == (size_t) -`2`)
385	re_set_fastmap (fastmap, false, (int) c);
386	}
387	while (++c != `0`);
388	}
389
390	else
391	{
392	/ ... Else catch all bytes which can start the mbchars. /
393	for (i = `0`; i < cset->nmbchars; ++i)
394	{
395	char buf[`256`];
396	mbstate_t state;
397	memset (&state, `'\0'`, sizeof (state));
398	if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -`1`)
399	re_set_fastmap (fastmap, icase, (unsigned* char *) buf);
400	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > `1`)
401	{
402	if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
403	!= (size_t) -`1`)
404	re_set_fastmap (fastmap, false, (unsigned* char *) buf);
405	}
406	}
407	}
408	}
409	#endif /* RE_ENABLE_I18N */
410	else if (type == OP_PERIOD
411	#ifdef RE_ENABLE_I18N
412	\|\| type == OP_UTF8_PERIOD
413	#endif /* RE_ENABLE_I18N */
414	\|\| type == END_OF_RE)
415	{
416	memset (fastmap, `'\1'`, sizeof (char) * SBC_MAX);
417	if (type == END_OF_RE)
418	bufp->can_be_null = `1`;
419	return;
420	}
421	}
422	}
423
424	/ Entry point for POSIX code. /
425	/ regcomp takes a regular expression as a string and compiles it.*
426
427	PREG is a regex_t . We do not expect any fields to be initialized,*
428	since POSIX says we shouldn't. Thus, we set
429
430	'buffer' to the compiled pattern;
431	'used' to the length of the compiled pattern;
432	'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
433	REG_EXTENDED bit in CFLAGS is set; otherwise, to
434	RE_SYNTAX_POSIX_BASIC;
435	'newline_anchor' to REG_NEWLINE being set in CFLAGS;
436	'fastmap' to an allocated space for the fastmap;
437	'fastmap_accurate' to zero;
438	're_nsub' to the number of subexpressions in PATTERN.
439
440	PATTERN is the address of the pattern string.
441
442	CFLAGS is a series of bits which affect compilation.
443
444	If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445	use POSIX basic syntax.
446
447	If REG_NEWLINE is set, then . and [^...] don't match newline.
448	Also, regexec will try a match beginning after every newline.
449
450	If REG_ICASE is set, then we considers upper- and lowercase
451	versions of letters to be equivalent when matching.
452
453	If REG_NOSUB is set, then when PREG is passed to regexec, that
454	routine will report only success or failure, and nothing about the
455	registers.
456
457	It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
458	the return codes and their meanings.) /*
459
460	int
461	regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
462	{
463	reg_errcode_t ret;
464	reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
465	: RE_SYNTAX_POSIX_BASIC);
466
467	preg->buffer = NULL;
468	preg->allocated = `0`;
469	preg->used = `0`;
470
471	/ Try to allocate space for the fastmap. /
472	preg->fastmap = re_malloc (char, SBC_MAX);
473	if (__glibc_unlikely (preg->fastmap == NULL))
474	return REG_ESPACE;
475
476	syntax \|= (cflags & REG_ICASE) ? RE_ICASE : `0`;
477
478	/ If REG_NEWLINE is set, newlines are treated differently. /
479	if (cflags & REG_NEWLINE)
480	{ / REG_NEWLINE implies neither . nor [^...] match newline. /
481	syntax &= ~RE_DOT_NEWLINE;
482	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
483	/ It also changes the matching behavior. /
484	preg->newline_anchor = `1`;
485	}
486	else
487	preg->newline_anchor = `0`;
488	preg->no_sub = !!(cflags & REG_NOSUB);
489	preg->translate = NULL;
490
491	ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
492
493	/ POSIX doesn't distinguish between an unmatched open-group and an*
494	unmatched close-group: both are REG_EPAREN. /*
495	if (ret == REG_ERPAREN)
496	ret = REG_EPAREN;
497
498	/ We have already checked preg->fastmap != NULL. /
499	if (__glibc_likely (ret == REG_NOERROR))
500	/ Compute the fastmap now, since regexec cannot modify the pattern*
501	buffer. This function never fails in this implementation. /*
502	(void) re_compile_fastmap (preg);
503	else
504	{
505	/ Some error occurred while compiling the expression. /
506	re_free (preg->fastmap);
507	preg->fastmap = NULL;
508	}
509
510	return (int) ret;
511	}
512	libc_hidden_def (__regcomp)
513	weak_alias (__regcomp, regcomp)
514
515	/ Returns a message corresponding to an error code, ERRCODE, returned*
516	from either regcomp or regexec. We don't use PREG here. /*
517
518	size_t
519	regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
520	size_t errbuf_size)
521	{
522	const char *msg;
523	size_t msg_size;
524	int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[`0`];
525
526	if (__glibc_unlikely (errcode < `0` \|\| errcode >= nerrcodes))
527	/ Only error codes returned by the rest of the code should be passed*
528	to this routine. If we are given anything else, or if other regex
529	code generates an invalid error code, then the program has a bug.
530	Dump core so we can fix it. /*
531	abort ();
532
533	msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
534
535	msg_size = strlen (msg) + `1`; / Includes the null. /
536
537	if (__glibc_likely (errbuf_size != `0`))
538	{
539	size_t cpy_size = msg_size;
540	if (__glibc_unlikely (msg_size > errbuf_size))
541	{
542	cpy_size = errbuf_size - `1`;
543	errbuf[cpy_size] = `'\0'`;
544	}
545	memcpy (errbuf, msg, cpy_size);
546	}
547
548	return msg_size;
549	}
550	weak_alias (__regerror, regerror)
551
552
553	#ifdef RE_ENABLE_I18N
554	/ This static array is used for the map to single-byte characters when*
555	UTF-8 is used. Otherwise we would allocate memory just to initialize
556	it the same all the time. UTF-8 is the preferred encoding so this is
557	a worthwhile optimization. /*
558	static const bitset_t utf8_sb_map =
559	{
560	/ Set the first 128 bits. /
561	# if (defined __GNUC__ \|\| __clang_major__ >= 4) && !defined __STRICT_ANSI__
562	[`0` ... `0x80` / BITSET_WORD_BITS - `1`] = BITSET_WORD_MAX
563	# else
564	# if 4 * BITSET_WORD_BITS < ASCII_CHARS
565	# error "bitset_word_t is narrower than 32 bits"
566	# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
567	BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
568	# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
569	BITSET_WORD_MAX, BITSET_WORD_MAX,
570	# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
571	BITSET_WORD_MAX,
572	# endif
573	(BITSET_WORD_MAX
574	>> (SBC_MAX % BITSET_WORD_BITS == `0`
575	? `0`
576	: BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
577	# endif
578	};
579	#endif
580
581
582	static void
583	free_dfa_content (re_dfa_t *dfa)
584	{
585	Idx i, j;
586
587	if (dfa->nodes)
588	for (i = `0`; i < dfa->nodes_len; ++i)
589	free_token (dfa->nodes + i);
590	re_free (dfa->nexts);
591	for (i = `0`; i < dfa->nodes_len; ++i)
592	{
593	if (dfa->eclosures != NULL)
594	re_node_set_free (dfa->eclosures + i);
595	if (dfa->inveclosures != NULL)
596	re_node_set_free (dfa->inveclosures + i);
597	if (dfa->edests != NULL)
598	re_node_set_free (dfa->edests + i);
599	}
600	re_free (dfa->edests);
601	re_free (dfa->eclosures);
602	re_free (dfa->inveclosures);
603	re_free (dfa->nodes);
604
605	if (dfa->state_table)
606	for (i = `0`; i <= dfa->state_hash_mask; ++i)
607	{
608	struct re_state_table_entry *entry = dfa->state_table + i;
609	for (j = `0`; j < entry->num; ++j)
610	{
611	re_dfastate_t *state = entry->array[j];
612	free_state (state);
613	}
614	re_free (entry->array);
615	}
616	re_free (dfa->state_table);
617	#ifdef RE_ENABLE_I18N
618	if (dfa->sb_char != utf8_sb_map)
619	re_free (dfa->sb_char);
620	#endif
621	re_free (dfa->subexp_map);
622	#ifdef DEBUG
623	re_free (dfa->re_str);
624	#endif
625
626	re_free (dfa);
627	}
628
629
630	/ Free dynamically allocated space used by PREG. /
631
632	void
633	regfree (regex_t *preg)
634	{
635	re_dfa_t *dfa = preg->buffer;
636	if (__glibc_likely (dfa != NULL))
637	{
638	lock_fini (dfa->lock);
639	free_dfa_content (dfa);
640	}
641	preg->buffer = NULL;
642	preg->allocated = `0`;
643
644	re_free (preg->fastmap);
645	preg->fastmap = NULL;
646
647	re_free (preg->translate);
648	preg->translate = NULL;
649	}
650	libc_hidden_def (__regfree)
651	weak_alias (__regfree, regfree)
652
653	/ Entry points compatible with 4.2 BSD regex library. We don't define*
654	them unless specifically requested. /*
655
656	#if defined _REGEX_RE_COMP \|\| defined _LIBC
657
658	/ BSD has one and only one pattern buffer. /
659	static struct re_pattern_buffer re_comp_buf;
660
661	char *
662	# ifdef _LIBC
663	/ Make these definitions weak in libc, so POSIX programs can redefine*
664	these names if they don't use our functions, and still use
665	regcomp/regexec above without link errors. /*
666	weak_function
667	# endif
668	re_comp (const char *s)
669	{
670	reg_errcode_t ret;
671	char *fastmap;
672
673	if (!s)
674	{
675	if (!re_comp_buf.buffer)
676	return gettext ("No previous regular expression");
677	return `0`;
678	}
679
680	if (re_comp_buf.buffer)
681	{
682	fastmap = re_comp_buf.fastmap;
683	re_comp_buf.fastmap = NULL;
684	__regfree (&re_comp_buf);
685	memset (&re_comp_buf, `'\0'`, sizeof (re_comp_buf));
686	re_comp_buf.fastmap = fastmap;
687	}
688
689	if (re_comp_buf.fastmap == NULL)
690	{
691	re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
692	if (re_comp_buf.fastmap == NULL)
693	return (char *) gettext (__re_error_msgid
694	+ __re_error_msgid_idx[(int) REG_ESPACE]);
695	}
696
697	/ Since 're_exec' always passes NULL for the 'regs' argument, we*
698	don't need to initialize the pattern buffer fields which affect it. /*
699
700	/ Match anchors at newlines. /
701	re_comp_buf.newline_anchor = `1`;
702
703	ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
704
705	if (!ret)
706	return NULL;
707
708	/ Yes, we're discarding 'const' here if !HAVE_LIBINTL. /
709	return (char ) gettext (__re_error_msgid + __re_error_msgid_idx[(int*) ret]);
710	}
711
712	#ifdef _LIBC
713	libc_freeres_fn (free_mem)
714	{
715	__regfree (&re_comp_buf);
716	}
717	#endif
718
719	#endif /* _REGEX_RE_COMP */
720
721	/ Internal entry point.*
722	Compile the regular expression PATTERN, whose length is LENGTH.
723	SYNTAX indicate regular expression's syntax. /*
724
725	static reg_errcode_t
726	re_compile_internal (regex_t preg, const* char * pattern, size_t length,
727	reg_syntax_t syntax)
728	{
729	reg_errcode_t err = REG_NOERROR;
730	re_dfa_t *dfa;
731	re_string_t regexp;
732
733	/ Initialize the pattern buffer. /
734	preg->fastmap_accurate = `0`;
735	preg->syntax = syntax;
736	preg->not_bol = preg->not_eol = `0`;
737	preg->used = `0`;
738	preg->re_nsub = `0`;
739	preg->can_be_null = `0`;
740	preg->regs_allocated = REGS_UNALLOCATED;
741
742	/ Initialize the dfa. /
743	dfa = preg->buffer;
744	if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
745	{
746	/ If zero allocated, but buffer is non-null, try to realloc*
747	enough space. This loses if buffer's address is bogus, but
748	that is the user's responsibility. If ->buffer is NULL this
749	is a simple allocation. /*
750	dfa = re_realloc (preg->buffer, re_dfa_t, `1`);
751	if (dfa == NULL)
752	return REG_ESPACE;
753	preg->allocated = sizeof (re_dfa_t);
754	preg->buffer = dfa;
755	}
756	preg->used = sizeof (re_dfa_t);
757
758	err = init_dfa (dfa, length);
759	if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != `0`))
760	err = REG_ESPACE;
761	if (__glibc_unlikely (err != REG_NOERROR))
762	{
763	free_dfa_content (dfa);
764	preg->buffer = NULL;
765	preg->allocated = `0`;
766	return err;
767	}
768	#ifdef DEBUG
769	/ Note: length+1 will not overflow since it is checked in init_dfa. /
770	dfa->re_str = re_malloc (char, length + `1`);
771	strncpy (dfa->re_str, pattern, length + `1`);
772	#endif
773
774	err = re_string_construct (&regexp, pattern, length, preg->translate,
775	(syntax & RE_ICASE) != `0`, dfa);
776	if (__glibc_unlikely (err != REG_NOERROR))
777	{
778	re_compile_internal_free_return:
779	free_workarea_compile (preg);
780	re_string_destruct (&regexp);
781	lock_fini (dfa->lock);
782	free_dfa_content (dfa);
783	preg->buffer = NULL;
784	preg->allocated = `0`;
785	return err;
786	}
787
788	/ Parse the regular expression, and build a structure tree. /
789	preg->re_nsub = `0`;
790	dfa->str_tree = parse (&regexp, preg, syntax, &err);
791	if (__glibc_unlikely (dfa->str_tree == NULL))
792	goto re_compile_internal_free_return;
793
794	/ Analyze the tree and create the nfa. /
795	err = analyze (preg);
796	if (__glibc_unlikely (err != REG_NOERROR))
797	goto re_compile_internal_free_return;
798
799	#ifdef RE_ENABLE_I18N
800	/ If possible, do searching in single byte encoding to speed things up. /
801	if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
802	optimize_utf8 (dfa);
803	#endif
804
805	/ Then create the initial state of the dfa. /
806	err = create_initial_state (dfa);
807
808	/ Release work areas. /
809	free_workarea_compile (preg);
810	re_string_destruct (&regexp);
811
812	if (__glibc_unlikely (err != REG_NOERROR))
813	{
814	lock_fini (dfa->lock);
815	free_dfa_content (dfa);
816	preg->buffer = NULL;
817	preg->allocated = `0`;
818	}
819
820	return err;
821	}
822
823	/ Initialize DFA. We use the length of the regular expression PAT_LEN*
824	as the initial length of some arrays. /*
825
826	static reg_errcode_t
827	init_dfa (re_dfa_t *dfa, size_t pat_len)
828	{
829	__re_size_t table_size;
830	#ifndef _LIBC
831	const char *codeset_name;
832	#endif
833	#ifdef RE_ENABLE_I18N
834	size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
835	#else
836	size_t max_i18n_object_size = `0`;
837	#endif
838	size_t max_object_size =
839	MAX (sizeof (struct re_state_table_entry),
840	MAX (sizeof (re_token_t),
841	MAX (sizeof (re_node_set),
842	MAX (sizeof (regmatch_t),
843	max_i18n_object_size))));
844
845	memset (dfa, `'\0'`, sizeof (re_dfa_t));
846
847	/ Force allocation of str_tree_storage the first time. /
848	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
849
850	/ Avoid overflows. The extra "/ 2" is for the table_size doubling*
851	calculation below, and for similar doubling calculations
852	elsewhere. And it's <= rather than <, because some of the
853	doubling calculations add 1 afterwards. /*
854	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / `2`
855	<= pat_len))
856	return REG_ESPACE;
857
858	dfa->nodes_alloc = pat_len + `1`;
859	dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
860
861	/ table_size = 2 ^ ceil(log pat_len) /
862	for (table_size = `1`; ; table_size <<= `1`)
863	if (table_size > pat_len)
864	break;
865
866	dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
867	dfa->state_hash_mask = table_size - `1`;
868
869	dfa->mb_cur_max = MB_CUR_MAX;
870	#ifdef _LIBC
871	if (dfa->mb_cur_max == `6`
872	&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == `0`)
873	dfa->is_utf8 = `1`;
874	dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
875	!= `0`);
876	#else
877	codeset_name = nl_langinfo (CODESET);
878	if ((codeset_name[`0`] == `'U'` \|\| codeset_name[`0`] == `'u'`)
879	&& (codeset_name[`1`] == `'T'` \|\| codeset_name[`1`] == `'t'`)
880	&& (codeset_name[`2`] == `'F'` \|\| codeset_name[`2`] == `'f'`)
881	&& strcmp (codeset_name + `3` + (codeset_name[`3`] == `'-'`), "8") == `0`)
882	dfa->is_utf8 = `1`;
883
884	/ We check exhaustively in the loop below if this charset is a*
885	superset of ASCII. /*
886	dfa->map_notascii = `0`;
887	#endif
888
889	#ifdef RE_ENABLE_I18N
890	if (dfa->mb_cur_max > `1`)
891	{
892	if (dfa->is_utf8)
893	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
894	else
895	{
896	int i, j, ch;
897
898	dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), `1`);
899	if (__glibc_unlikely (dfa->sb_char == NULL))
900	return REG_ESPACE;
901
902	/ Set the bits corresponding to single byte chars. /
903	for (i = `0`, ch = `0`; i < BITSET_WORDS; ++i)
904	for (j = `0`; j < BITSET_WORD_BITS; ++j, ++ch)
905	{
906	wint_t wch = __btowc (ch);
907	if (wch != WEOF)
908	dfa->sb_char[i] \|= (bitset_word_t) `1` << j;
909	# ifndef _LIBC
910	if (isascii (ch) && wch != ch)
911	dfa->map_notascii = `1`;
912	# endif
913	}
914	}
915	}
916	#endif
917
918	if (__glibc_unlikely (dfa->nodes == NULL \|\| dfa->state_table == NULL))
919	return REG_ESPACE;
920	return REG_NOERROR;
921	}
922
923	/ Initialize WORD_CHAR table, which indicate which character is*
924	"word". In this case "word" means that it is the word construction
925	character used by some operators like "\<", "\>", etc. /*
926
927	static void
928	init_word_char (re_dfa_t *dfa)
929	{
930	int i = `0`;
931	int j;
932	int ch = `0`;
933	dfa->word_ops_used = `1`;
934	if (__glibc_likely (dfa->map_notascii == `0`))
935	{
936	/ Avoid uint32_t and uint64_t as some non-GCC platforms lack*
937	them, an issue when this code is used in Gnulib. /*
938	bitset_word_t bits0 = `0x00000000`;
939	bitset_word_t bits1 = `0x03ff0000`;
940	bitset_word_t bits2 = `0x87fffffe`;
941	bitset_word_t bits3 = `0x07fffffe`;
942	if (BITSET_WORD_BITS == `64`)
943	{
944	/ Pacify gcc -Woverflow on 32-bit platformns. /
945	dfa->word_char[`0`] = bits1 << `31` << `1` \| bits0;
946	dfa->word_char[`1`] = bits3 << `31` << `1` \| bits2;
947	i = `2`;
948	}
949	else if (BITSET_WORD_BITS == `32`)
950	{
951	dfa->word_char[`0`] = bits0;
952	dfa->word_char[`1`] = bits1;
953	dfa->word_char[`2`] = bits2;
954	dfa->word_char[`3`] = bits3;
955	i = `4`;
956	}
957	else
958	goto general_case;
959	ch = `128`;
960
961	if (__glibc_likely (dfa->is_utf8))
962	{
963	memset (&dfa->word_char[i], `'\0'`, (SBC_MAX - ch) / `8`);
964	return;
965	}
966	}
967
968	general_case:
969	for (; i < BITSET_WORDS; ++i)
970	for (j = `0`; j < BITSET_WORD_BITS; ++j, ++ch)
971	if (isalnum (ch) \|\| ch == `'_'`)
972	dfa->word_char[i] \|= (bitset_word_t) `1` << j;
973	}
974
975	/ Free the work area which are only used while compiling. /
976
977	static void
978	free_workarea_compile (regex_t *preg)
979	{
980	re_dfa_t *dfa = preg->buffer;
981	bin_tree_storage_t storage, next;
982	for (storage = dfa->str_tree_storage; storage; storage = next)
983	{
984	next = storage->next;
985	re_free (storage);
986	}
987	dfa->str_tree_storage = NULL;
988	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
989	dfa->str_tree = NULL;
990	re_free (dfa->org_indices);
991	dfa->org_indices = NULL;
992	}
993
994	/ Create initial states for all contexts. /
995
996	static reg_errcode_t
997	create_initial_state (re_dfa_t *dfa)
998	{
999	Idx first, i;
1000	reg_errcode_t err;
1001	re_node_set init_nodes;
1002
1003	/ Initial states have the epsilon closure of the node which is*
1004	the first node of the regular expression. /*
1005	first = dfa->str_tree->first->node_idx;
1006	dfa->init_node = first;
1007	err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1008	if (__glibc_unlikely (err != REG_NOERROR))
1009	return err;
1010
1011	/ The back-references which are in initial states can epsilon transit,*
1012	since in this case all of the subexpressions can be null.
1013	Then we add epsilon closures of the nodes which are the next nodes of
1014	the back-references. /*
1015	if (dfa->nbackref > `0`)
1016	for (i = `0`; i < init_nodes.nelem; ++i)
1017	{
1018	Idx node_idx = init_nodes.elems[i];
1019	re_token_type_t type = dfa->nodes[node_idx].type;
1020
1021	Idx clexp_idx;
1022	if (type != OP_BACK_REF)
1023	continue;
1024	for (clexp_idx = `0`; clexp_idx < init_nodes.nelem; ++clexp_idx)
1025	{
1026	re_token_t *clexp_node;
1027	clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1028	if (clexp_node->type == OP_CLOSE_SUBEXP
1029	&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1030	break;
1031	}
1032	if (clexp_idx == init_nodes.nelem)
1033	continue;
1034
1035	if (type == OP_BACK_REF)
1036	{
1037	Idx dest_idx = dfa->edests[node_idx].elems[`0`];
1038	if (!re_node_set_contains (&init_nodes, dest_idx))
1039	{
1040	reg_errcode_t merge_err
1041	= re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1042	if (merge_err != REG_NOERROR)
1043	return merge_err;
1044	i = `0`;
1045	}
1046	}
1047	}
1048
1049	/ It must be the first time to invoke acquire_state. /
1050	dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, `0`);
1051	/ We don't check ERR here, since the initial state must not be NULL. /
1052	if (__glibc_unlikely (dfa->init_state == NULL))
1053	return err;
1054	if (dfa->init_state->has_constraint)
1055	{
1056	dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1057	CONTEXT_WORD);
1058	dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1059	CONTEXT_NEWLINE);
1060	dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1061	&init_nodes,
1062	CONTEXT_NEWLINE
1063	\| CONTEXT_BEGBUF);
1064	if (__glibc_unlikely (dfa->init_state_word == NULL
1065	\|\| dfa->init_state_nl == NULL
1066	\|\| dfa->init_state_begbuf == NULL))
1067	return err;
1068	}
1069	else
1070	dfa->init_state_word = dfa->init_state_nl
1071	= dfa->init_state_begbuf = dfa->init_state;
1072
1073	re_node_set_free (&init_nodes);
1074	return REG_NOERROR;
1075	}
1076
1077	#ifdef RE_ENABLE_I18N
1078	/ If it is possible to do searching in single byte encoding instead of UTF-8*
1079	to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1080	DFA nodes where needed. /*
1081
1082	static void
1083	optimize_utf8 (re_dfa_t *dfa)
1084	{
1085	Idx node;
1086	int i;
1087	bool mb_chars = false;
1088	bool has_period = false;
1089
1090	for (node = `0`; node < dfa->nodes_len; ++node)
1091	switch (dfa->nodes[node].type)
1092	{
1093	case CHARACTER:
1094	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1095	mb_chars = true;
1096	break;
1097	case ANCHOR:
1098	switch (dfa->nodes[node].opr.ctx_type)
1099	{
1100	case LINE_FIRST:
1101	case LINE_LAST:
1102	case BUF_FIRST:
1103	case BUF_LAST:
1104	break;
1105	default:
1106	/ Word anchors etc. cannot be handled. It's okay to test*
1107	opr.ctx_type since constraints (for all DFA nodes) are
1108	created by ORing one or more opr.ctx_type values. /*
1109	return;
1110	}
1111	break;
1112	case OP_PERIOD:
1113	has_period = true;
1114	break;
1115	case OP_BACK_REF:
1116	case OP_ALT:
1117	case END_OF_RE:
1118	case OP_DUP_ASTERISK:
1119	case OP_OPEN_SUBEXP:
1120	case OP_CLOSE_SUBEXP:
1121	break;
1122	case COMPLEX_BRACKET:
1123	return;
1124	case SIMPLE_BRACKET:
1125	/ Just double check. /
1126	{
1127	int rshift = (ASCII_CHARS % BITSET_WORD_BITS == `0`
1128	? `0`
1129	: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1130	for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1131	{
1132	if (dfa->nodes[node].opr.sbcset[i] >> rshift != `0`)
1133	return;
1134	rshift = `0`;
1135	}
1136	}
1137	break;
1138	default:
1139	abort ();
1140	}
1141
1142	if (mb_chars \|\| has_period)
1143	for (node = `0`; node < dfa->nodes_len; ++node)
1144	{
1145	if (dfa->nodes[node].type == CHARACTER
1146	&& dfa->nodes[node].opr.c >= ASCII_CHARS)
1147	dfa->nodes[node].mb_partial = `0`;
1148	else if (dfa->nodes[node].type == OP_PERIOD)
1149	dfa->nodes[node].type = OP_UTF8_PERIOD;
1150	}
1151
1152	/ The search can be in single byte locale. /
1153	dfa->mb_cur_max = `1`;
1154	dfa->is_utf8 = `0`;
1155	dfa->has_mb_node = dfa->nbackref > `0` \|\| has_period;
1156	}
1157	#endif
1158
1159	/ Analyze the structure tree, and calculate "first", "next", "edest",*
1160	"eclosure", and "inveclosure". /*
1161
1162	static reg_errcode_t
1163	analyze (regex_t *preg)
1164	{
1165	re_dfa_t *dfa = preg->buffer;
1166	reg_errcode_t ret;
1167
1168	/ Allocate arrays. /
1169	dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1170	dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1171	dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1172	dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1173	if (__glibc_unlikely (dfa->nexts == NULL \|\| dfa->org_indices == NULL
1174	\|\| dfa->edests == NULL \|\| dfa->eclosures == NULL))
1175	return REG_ESPACE;
1176
1177	dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1178	if (dfa->subexp_map != NULL)
1179	{
1180	Idx i;
1181	for (i = `0`; i < preg->re_nsub; i++)
1182	dfa->subexp_map[i] = i;
1183	preorder (dfa->str_tree, optimize_subexps, dfa);
1184	for (i = `0`; i < preg->re_nsub; i++)
1185	if (dfa->subexp_map[i] != i)
1186	break;
1187	if (i == preg->re_nsub)
1188	{
1189	re_free (dfa->subexp_map);
1190	dfa->subexp_map = NULL;
1191	}
1192	}
1193
1194	ret = postorder (dfa->str_tree, lower_subexps, preg);
1195	if (__glibc_unlikely (ret != REG_NOERROR))
1196	return ret;
1197	ret = postorder (dfa->str_tree, calc_first, dfa);
1198	if (__glibc_unlikely (ret != REG_NOERROR))
1199	return ret;
1200	preorder (dfa->str_tree, calc_next, dfa);
1201	ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1202	if (__glibc_unlikely (ret != REG_NOERROR))
1203	return ret;
1204	ret = calc_eclosure (dfa);
1205	if (__glibc_unlikely (ret != REG_NOERROR))
1206	return ret;
1207
1208	/ We only need this during the prune_impossible_nodes pass in regexec.c;*
1209	skip it if p_i_n will not run, as calc_inveclosure can be quadratic. /*
1210	if ((!preg->no_sub && preg->re_nsub > `0` && dfa->has_plural_match)
1211	\|\| dfa->nbackref)
1212	{
1213	dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1214	if (__glibc_unlikely (dfa->inveclosures == NULL))
1215	return REG_ESPACE;
1216	ret = calc_inveclosure (dfa);
1217	}
1218
1219	return ret;
1220	}
1221
1222	/ Our parse trees are very unbalanced, so we cannot use a stack to*
1223	implement parse tree visits. Instead, we use parent pointers and
1224	some hairy code in these two functions. /*
1225	static reg_errcode_t
1226	postorder (bin_tree_t root, reg_errcode_t (fn (void* , bin_tree_t )),
1227	void *extra)
1228	{
1229	bin_tree_t node, prev;
1230
1231	for (node = root; ; )
1232	{
1233	/ Descend down the tree, preferably to the left (or to the right*
1234	if that's the only child). /*
1235	while (node->left \|\| node->right)
1236	if (node->left)
1237	node = node->left;
1238	else
1239	node = node->right;
1240
1241	do
1242	{
1243	reg_errcode_t err = fn (extra, node);
1244	if (__glibc_unlikely (err != REG_NOERROR))
1245	return err;
1246	if (node->parent == NULL)
1247	return REG_NOERROR;
1248	prev = node;
1249	node = node->parent;
1250	}
1251	/ Go up while we have a node that is reached from the right. /
1252	while (node->right == prev \|\| node->right == NULL);
1253	node = node->right;
1254	}
1255	}
1256
1257	static reg_errcode_t
1258	preorder (bin_tree_t root, reg_errcode_t (fn (void* , bin_tree_t )),
1259	void *extra)
1260	{
1261	bin_tree_t *node;
1262
1263	for (node = root; ; )
1264	{
1265	reg_errcode_t err = fn (extra, node);
1266	if (__glibc_unlikely (err != REG_NOERROR))
1267	return err;
1268
1269	/ Go to the left node, or up and to the right. /
1270	if (node->left)
1271	node = node->left;
1272	else
1273	{
1274	bin_tree_t *prev = NULL;
1275	while (node->right == prev \|\| node->right == NULL)
1276	{
1277	prev = node;
1278	node = node->parent;
1279	if (!node)
1280	return REG_NOERROR;
1281	}
1282	node = node->right;
1283	}
1284	}
1285	}
1286
1287	/ Optimization pass: if a SUBEXP is entirely contained, strip it and tell*
1288	re_search_internal to map the inner one's opr.idx to this one's. Adjust
1289	backreferences as well. Requires a preorder visit. /*
1290	static reg_errcode_t
1291	optimize_subexps (void extra, bin_tree_t node)
1292	{
1293	re_dfa_t dfa = (re_dfa_t ) extra;
1294
1295	if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1296	{
1297	int idx = node->token.opr.idx;
1298	node->token.opr.idx = dfa->subexp_map[idx];
1299	dfa->used_bkref_map \|= `1` << node->token.opr.idx;
1300	}
1301
1302	else if (node->token.type == SUBEXP
1303	&& node->left && node->left->token.type == SUBEXP)
1304	{
1305	Idx other_idx = node->left->token.opr.idx;
1306
1307	node->left = node->left->left;
1308	if (node->left)
1309	node->left->parent = node;
1310
1311	dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1312	if (other_idx < BITSET_WORD_BITS)
1313	dfa->used_bkref_map &= ~((bitset_word_t) `1` << other_idx);
1314	}
1315
1316	return REG_NOERROR;
1317	}
1318
1319	/ Lowering pass: Turn each SUBEXP node into the appropriate concatenation*
1320	of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. /*
1321	static reg_errcode_t
1322	lower_subexps (void extra, bin_tree_t node)
1323	{
1324	regex_t preg = (regex_t ) extra;
1325	reg_errcode_t err = REG_NOERROR;
1326
1327	if (node->left && node->left->token.type == SUBEXP)
1328	{
1329	node->left = lower_subexp (&err, preg, node->left);
1330	if (node->left)
1331	node->left->parent = node;
1332	}
1333	if (node->right && node->right->token.type == SUBEXP)
1334	{
1335	node->right = lower_subexp (&err, preg, node->right);
1336	if (node->right)
1337	node->right->parent = node;
1338	}
1339
1340	return err;
1341	}
1342
1343	static bin_tree_t *
1344	lower_subexp (reg_errcode_t err, regex_t preg, bin_tree_t *node)
1345	{
1346	re_dfa_t *dfa = preg->buffer;
1347	bin_tree_t *body = node->left;
1348	bin_tree_t op, cls, tree1, tree;
1349
1350	if (preg->no_sub
1351	/ We do not optimize empty subexpressions, because otherwise we may*
1352	have bad CONCAT nodes with NULL children. This is obviously not
1353	very common, so we do not lose much. An example that triggers
1354	this case is the sed "script" //x. /*
1355	&& node->left != NULL
1356	&& (node->token.opr.idx >= BITSET_WORD_BITS
1357	\|\| !(dfa->used_bkref_map
1358	& ((bitset_word_t) `1` << node->token.opr.idx))))
1359	return node->left;
1360
1361	/ Convert the SUBEXP node to the concatenation of an*
1362	OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. /*
1363	op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1364	cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1365	tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1366	tree = create_tree (dfa, op, tree1, CONCAT);
1367	if (__glibc_unlikely (tree == NULL \|\| tree1 == NULL
1368	\|\| op == NULL \|\| cls == NULL))
1369	{
1370	*err = REG_ESPACE;
1371	return NULL;
1372	}
1373
1374	op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1375	op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1376	return tree;
1377	}
1378
1379	/ Pass 1 in building the NFA: compute FIRST and create unlinked automaton*
1380	nodes. Requires a postorder visit. /*
1381	static reg_errcode_t
1382	calc_first (void extra, bin_tree_t node)
1383	{
1384	re_dfa_t dfa = (re_dfa_t ) extra;
1385	if (node->token.type == CONCAT)
1386	{
1387	node->first = node->left->first;
1388	node->node_idx = node->left->node_idx;
1389	}
1390	else
1391	{
1392	node->first = node;
1393	node->node_idx = re_dfa_add_node (dfa, node->token);
1394	if (__glibc_unlikely (node->node_idx == -`1`))
1395	return REG_ESPACE;
1396	if (node->token.type == ANCHOR)
1397	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1398	}
1399	return REG_NOERROR;
1400	}
1401
1402	/ Pass 2: compute NEXT on the tree. Preorder visit. /
1403	static reg_errcode_t
1404	calc_next (void extra, bin_tree_t node)
1405	{
1406	switch (node->token.type)
1407	{
1408	case OP_DUP_ASTERISK:
1409	node->left->next = node;
1410	break;
1411	case CONCAT:
1412	node->left->next = node->right->first;
1413	node->right->next = node->next;
1414	break;
1415	default:
1416	if (node->left)
1417	node->left->next = node->next;
1418	if (node->right)
1419	node->right->next = node->next;
1420	break;
1421	}
1422	return REG_NOERROR;
1423	}
1424
1425	/ Pass 3: link all DFA nodes to their NEXT node (any order will do). /
1426	static reg_errcode_t
1427	link_nfa_nodes (void extra, bin_tree_t node)
1428	{
1429	re_dfa_t dfa = (re_dfa_t ) extra;
1430	Idx idx = node->node_idx;
1431	reg_errcode_t err = REG_NOERROR;
1432
1433	switch (node->token.type)
1434	{
1435	case CONCAT:
1436	break;
1437
1438	case END_OF_RE:
1439	DEBUG_ASSERT (node->next == NULL);
1440	break;
1441
1442	case OP_DUP_ASTERISK:
1443	case OP_ALT:
1444	{
1445	Idx left, right;
1446	dfa->has_plural_match = `1`;
1447	if (node->left != NULL)
1448	left = node->left->first->node_idx;
1449	else
1450	left = node->next->node_idx;
1451	if (node->right != NULL)
1452	right = node->right->first->node_idx;
1453	else
1454	right = node->next->node_idx;
1455	DEBUG_ASSERT (left > -`1`);
1456	DEBUG_ASSERT (right > -`1`);
1457	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1458	}
1459	break;
1460
1461	case ANCHOR:
1462	case OP_OPEN_SUBEXP:
1463	case OP_CLOSE_SUBEXP:
1464	err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1465	break;
1466
1467	case OP_BACK_REF:
1468	dfa->nexts[idx] = node->next->node_idx;
1469	if (node->token.type == OP_BACK_REF)
1470	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1471	break;
1472
1473	default:
1474	DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1475	dfa->nexts[idx] = node->next->node_idx;
1476	break;
1477	}
1478
1479	return err;
1480	}
1481
1482	/ Duplicate the epsilon closure of the node ROOT_NODE.*
1483	Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1484	to their own constraint. /*
1485
1486	static reg_errcode_t
1487	duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1488	Idx root_node, unsigned int init_constraint)
1489	{
1490	Idx org_node, clone_node;
1491	bool ok;
1492	unsigned int constraint = init_constraint;
1493	for (org_node = top_org_node, clone_node = top_clone_node;;)
1494	{
1495	Idx org_dest, clone_dest;
1496	if (dfa->nodes[org_node].type == OP_BACK_REF)
1497	{
1498	/ If the back reference epsilon-transit, its destination must*
1499	also have the constraint. Then duplicate the epsilon closure
1500	of the destination of the back reference, and store it in
1501	edests of the back reference. /*
1502	org_dest = dfa->nexts[org_node];
1503	re_node_set_empty (dfa->edests + clone_node);
1504	clone_dest = duplicate_node (dfa, org_dest, constraint);
1505	if (__glibc_unlikely (clone_dest == -`1`))
1506	return REG_ESPACE;
1507	dfa->nexts[clone_node] = dfa->nexts[org_node];
1508	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509	if (__glibc_unlikely (! ok))
1510	return REG_ESPACE;
1511	}
1512	else if (dfa->edests[org_node].nelem == `0`)
1513	{
1514	/ In case of the node can't epsilon-transit, don't duplicate the*
1515	destination and store the original destination as the
1516	destination of the node. /*
1517	dfa->nexts[clone_node] = dfa->nexts[org_node];
1518	break;
1519	}
1520	else if (dfa->edests[org_node].nelem == `1`)
1521	{
1522	/ In case of the node can epsilon-transit, and it has only one*
1523	destination. /*
1524	org_dest = dfa->edests[org_node].elems[`0`];
1525	re_node_set_empty (dfa->edests + clone_node);
1526	/ If the node is root_node itself, it means the epsilon closure*
1527	has a loop. Then tie it to the destination of the root_node. /*
1528	if (org_node == root_node && clone_node != org_node)
1529	{
1530	ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1531	if (__glibc_unlikely (! ok))
1532	return REG_ESPACE;
1533	break;
1534	}
1535	/ In case the node has another constraint, append it. /
1536	constraint \|= dfa->nodes[org_node].constraint;
1537	clone_dest = duplicate_node (dfa, org_dest, constraint);
1538	if (__glibc_unlikely (clone_dest == -`1`))
1539	return REG_ESPACE;
1540	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1541	if (__glibc_unlikely (! ok))
1542	return REG_ESPACE;
1543	}
1544	else / dfa->edests[org_node].nelem == 2 /
1545	{
1546	/ In case of the node can epsilon-transit, and it has two*
1547	destinations. In the bin_tree_t and DFA, that's '\|' and ''. /
1548	org_dest = dfa->edests[org_node].elems[`0`];
1549	re_node_set_empty (dfa->edests + clone_node);
1550	/ Search for a duplicated node which satisfies the constraint. /
1551	clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1552	if (clone_dest == -`1`)
1553	{
1554	/ There is no such duplicated node, create a new one. /
1555	reg_errcode_t err;
1556	clone_dest = duplicate_node (dfa, org_dest, constraint);
1557	if (__glibc_unlikely (clone_dest == -`1`))
1558	return REG_ESPACE;
1559	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560	if (__glibc_unlikely (! ok))
1561	return REG_ESPACE;
1562	err = duplicate_node_closure (dfa, org_dest, clone_dest,
1563	root_node, constraint);
1564	if (__glibc_unlikely (err != REG_NOERROR))
1565	return err;
1566	}
1567	else
1568	{
1569	/ There is a duplicated node which satisfies the constraint,*
1570	use it to avoid infinite loop. /*
1571	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572	if (__glibc_unlikely (! ok))
1573	return REG_ESPACE;
1574	}
1575
1576	org_dest = dfa->edests[org_node].elems[`1`];
1577	clone_dest = duplicate_node (dfa, org_dest, constraint);
1578	if (__glibc_unlikely (clone_dest == -`1`))
1579	return REG_ESPACE;
1580	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1581	if (__glibc_unlikely (! ok))
1582	return REG_ESPACE;
1583	}
1584	org_node = org_dest;
1585	clone_node = clone_dest;
1586	}
1587	return REG_NOERROR;
1588	}
1589
1590	/ Search for a node which is duplicated from the node ORG_NODE, and*
1591	satisfies the constraint CONSTRAINT. /*
1592
1593	static Idx
1594	search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1595	unsigned int constraint)
1596	{
1597	Idx idx;
1598	for (idx = dfa->nodes_len - `1`; dfa->nodes[idx].duplicated && idx > `0`; --idx)
1599	{
1600	if (org_node == dfa->org_indices[idx]
1601	&& constraint == dfa->nodes[idx].constraint)
1602	return idx; / Found. /
1603	}
1604	return -`1`; / Not found. /
1605	}
1606
1607	/ Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.*
1608	Return the index of the new node, or -1 if insufficient storage is
1609	available. /*
1610
1611	static Idx
1612	duplicate_node (re_dfa_t dfa, Idx org_idx, unsigned* int constraint)
1613	{
1614	Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1615	if (__glibc_likely (dup_idx != -`1`))
1616	{
1617	dfa->nodes[dup_idx].constraint = constraint;
1618	dfa->nodes[dup_idx].constraint \|= dfa->nodes[org_idx].constraint;
1619	dfa->nodes[dup_idx].duplicated = `1`;
1620
1621	/ Store the index of the original node. /
1622	dfa->org_indices[dup_idx] = org_idx;
1623	}
1624	return dup_idx;
1625	}
1626
1627	static reg_errcode_t
1628	calc_inveclosure (re_dfa_t *dfa)
1629	{
1630	Idx src, idx;
1631	bool ok;
1632	for (idx = `0`; idx < dfa->nodes_len; ++idx)
1633	re_node_set_init_empty (dfa->inveclosures + idx);
1634
1635	for (src = `0`; src < dfa->nodes_len; ++src)
1636	{
1637	Idx *elems = dfa->eclosures[src].elems;
1638	for (idx = `0`; idx < dfa->eclosures[src].nelem; ++idx)
1639	{
1640	ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1641	if (__glibc_unlikely (! ok))
1642	return REG_ESPACE;
1643	}
1644	}
1645
1646	return REG_NOERROR;
1647	}
1648
1649	/ Calculate "eclosure" for all the node in DFA. /
1650
1651	static reg_errcode_t
1652	calc_eclosure (re_dfa_t *dfa)
1653	{
1654	Idx node_idx;
1655	bool incomplete;
1656	DEBUG_ASSERT (dfa->nodes_len > `0`);
1657	incomplete = false;
1658	/ For each nodes, calculate epsilon closure. /
1659	for (node_idx = `0`; ; ++node_idx)
1660	{
1661	reg_errcode_t err;
1662	re_node_set eclosure_elem;
1663	if (node_idx == dfa->nodes_len)
1664	{
1665	if (!incomplete)
1666	break;
1667	incomplete = false;
1668	node_idx = `0`;
1669	}
1670
1671	DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -`1`);
1672
1673	/ If we have already calculated, skip it. /
1674	if (dfa->eclosures[node_idx].nelem != `0`)
1675	continue;
1676	/ Calculate epsilon closure of 'node_idx'. /
1677	err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1678	if (__glibc_unlikely (err != REG_NOERROR))
1679	return err;
1680
1681	if (dfa->eclosures[node_idx].nelem == `0`)
1682	{
1683	incomplete = true;
1684	re_node_set_free (&eclosure_elem);
1685	}
1686	}
1687	return REG_NOERROR;
1688	}
1689
1690	/ Calculate epsilon closure of NODE. /
1691
1692	static reg_errcode_t
1693	calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa, Idx node, bool root)
1694	{
1695	reg_errcode_t err;
1696	Idx i;
1697	re_node_set eclosure;
1698	bool incomplete = false;
1699	err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + `1`);
1700	if (__glibc_unlikely (err != REG_NOERROR))
1701	return err;
1702
1703	/ An epsilon closure includes itself. /
1704	eclosure.elems[eclosure.nelem++] = node;
1705
1706	/ This indicates that we are calculating this node now.*
1707	We reference this value to avoid infinite loop. /*
1708	dfa->eclosures[node].nelem = -`1`;
1709
1710	/ If the current node has constraints, duplicate all nodes*
1711	since they must inherit the constraints. /*
1712	if (dfa->nodes[node].constraint
1713	&& dfa->edests[node].nelem
1714	&& !dfa->nodes[dfa->edests[node].elems[`0`]].duplicated)
1715	{
1716	err = duplicate_node_closure (dfa, node, node, node,
1717	dfa->nodes[node].constraint);
1718	if (__glibc_unlikely (err != REG_NOERROR))
1719	return err;
1720	}
1721
1722	/ Expand each epsilon destination nodes. /
1723	if (IS_EPSILON_NODE(dfa->nodes[node].type))
1724	for (i = `0`; i < dfa->edests[node].nelem; ++i)
1725	{
1726	re_node_set eclosure_elem;
1727	Idx edest = dfa->edests[node].elems[i];
1728	/ If calculating the epsilon closure of 'edest' is in progress,*
1729	return intermediate result. /*
1730	if (dfa->eclosures[edest].nelem == -`1`)
1731	{
1732	incomplete = true;
1733	continue;
1734	}
1735	/ If we haven't calculated the epsilon closure of 'edest' yet,*
1736	calculate now. Otherwise use calculated epsilon closure. /*
1737	if (dfa->eclosures[edest].nelem == `0`)
1738	{
1739	err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1740	if (__glibc_unlikely (err != REG_NOERROR))
1741	return err;
1742	}
1743	else
1744	eclosure_elem = dfa->eclosures[edest];
1745	/ Merge the epsilon closure of 'edest'. /
1746	err = re_node_set_merge (&eclosure, &eclosure_elem);
1747	if (__glibc_unlikely (err != REG_NOERROR))
1748	return err;
1749	/ If the epsilon closure of 'edest' is incomplete,*
1750	the epsilon closure of this node is also incomplete. /*
1751	if (dfa->eclosures[edest].nelem == `0`)
1752	{
1753	incomplete = true;
1754	re_node_set_free (&eclosure_elem);
1755	}
1756	}
1757
1758	if (incomplete && !root)
1759	dfa->eclosures[node].nelem = `0`;
1760	else
1761	dfa->eclosures[node] = eclosure;
1762	*new_set = eclosure;
1763	return REG_NOERROR;
1764	}
1765
1766	/ Functions for token which are used in the parser. /
1767
1768	/ Fetch a token from INPUT.*
1769	We must not use this function inside bracket expressions. /*
1770
1771	static void
1772	fetch_token (re_token_t result, re_string_t input, reg_syntax_t syntax)
1773	{
1774	re_string_skip_bytes (input, peek_token (result, input, syntax));
1775	}
1776
1777	/ Peek a token from INPUT, and return the length of the token.*
1778	We must not use this function inside bracket expressions. /*
1779
1780	static int
1781	peek_token (re_token_t token, re_string_t input, reg_syntax_t syntax)
1782	{
1783	unsigned char c;
1784
1785	if (re_string_eoi (input))
1786	{
1787	token->type = END_OF_RE;
1788	return `0`;
1789	}
1790
1791	c = re_string_peek_byte (input, `0`);
1792	token->opr.c = c;
1793
1794	token->word_char = `0`;
1795	#ifdef RE_ENABLE_I18N
1796	token->mb_partial = `0`;
1797	if (input->mb_cur_max > `1`
1798	&& !re_string_first_byte (input, re_string_cur_idx (input)))
1799	{
1800	token->type = CHARACTER;
1801	token->mb_partial = `1`;
1802	return `1`;
1803	}
1804	#endif
1805	if (c == `'\\'`)
1806	{
1807	unsigned char c2;
1808	if (re_string_cur_idx (input) + `1` >= re_string_length (input))
1809	{
1810	token->type = BACK_SLASH;
1811	return `1`;
1812	}
1813
1814	c2 = re_string_peek_byte_case (input, `1`);
1815	token->opr.c = c2;
1816	token->type = CHARACTER;
1817	#ifdef RE_ENABLE_I18N
1818	if (input->mb_cur_max > `1`)
1819	{
1820	wint_t wc = re_string_wchar_at (input,
1821	re_string_cur_idx (input) + `1`);
1822	token->word_char = IS_WIDE_WORD_CHAR (wc) != `0`;
1823	}
1824	else
1825	#endif
1826	token->word_char = IS_WORD_CHAR (c2) != `0`;
1827
1828	switch (c2)
1829	{
1830	case `'\|'`:
1831	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1832	token->type = OP_ALT;
1833	break;
1834	case `'1'`: case `'2'`: case `'3'`: case `'4'`: case `'5'`:
1835	case `'6'`: case `'7'`: case `'8'`: case `'9'`:
1836	if (!(syntax & RE_NO_BK_REFS))
1837	{
1838	token->type = OP_BACK_REF;
1839	token->opr.idx = c2 - `'1'`;
1840	}
1841	break;
1842	case `'<'`:
1843	if (!(syntax & RE_NO_GNU_OPS))
1844	{
1845	token->type = ANCHOR;
1846	token->opr.ctx_type = WORD_FIRST;
1847	}
1848	break;
1849	case `'>'`:
1850	if (!(syntax & RE_NO_GNU_OPS))
1851	{
1852	token->type = ANCHOR;
1853	token->opr.ctx_type = WORD_LAST;
1854	}
1855	break;
1856	case `'b'`:
1857	if (!(syntax & RE_NO_GNU_OPS))
1858	{
1859	token->type = ANCHOR;
1860	token->opr.ctx_type = WORD_DELIM;
1861	}
1862	break;
1863	case `'B'`:
1864	if (!(syntax & RE_NO_GNU_OPS))
1865	{
1866	token->type = ANCHOR;
1867	token->opr.ctx_type = NOT_WORD_DELIM;
1868	}
1869	break;
1870	case `'w'`:
1871	if (!(syntax & RE_NO_GNU_OPS))
1872	token->type = OP_WORD;
1873	break;
1874	case `'W'`:
1875	if (!(syntax & RE_NO_GNU_OPS))
1876	token->type = OP_NOTWORD;
1877	break;
1878	case `'s'`:
1879	if (!(syntax & RE_NO_GNU_OPS))
1880	token->type = OP_SPACE;
1881	break;
1882	case `'S'`:
1883	if (!(syntax & RE_NO_GNU_OPS))
1884	token->type = OP_NOTSPACE;
1885	break;
1886	case '`':
1887	if (!(syntax & RE_NO_GNU_OPS))
1888	{
1889	token->type = ANCHOR;
1890	token->opr.ctx_type = BUF_FIRST;
1891	}
1892	break;
1893	case `'\''`:
1894	if (!(syntax & RE_NO_GNU_OPS))
1895	{
1896	token->type = ANCHOR;
1897	token->opr.ctx_type = BUF_LAST;
1898	}
1899	break;
1900	case `'('`:
1901	if (!(syntax & RE_NO_BK_PARENS))
1902	token->type = OP_OPEN_SUBEXP;
1903	break;
1904	case `')'`:
1905	if (!(syntax & RE_NO_BK_PARENS))
1906	token->type = OP_CLOSE_SUBEXP;
1907	break;
1908	case `'+'`:
1909	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1910	token->type = OP_DUP_PLUS;
1911	break;
1912	case `'?'`:
1913	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1914	token->type = OP_DUP_QUESTION;
1915	break;
1916	case `'{'`:
1917	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1918	token->type = OP_OPEN_DUP_NUM;
1919	break;
1920	case `'}'`:
1921	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1922	token->type = OP_CLOSE_DUP_NUM;
1923	break;
1924	default:
1925	break;
1926	}
1927	return `2`;
1928	}
1929
1930	token->type = CHARACTER;
1931	#ifdef RE_ENABLE_I18N
1932	if (input->mb_cur_max > `1`)
1933	{
1934	wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1935	token->word_char = IS_WIDE_WORD_CHAR (wc) != `0`;
1936	}
1937	else
1938	#endif
1939	token->word_char = IS_WORD_CHAR (token->opr.c);
1940
1941	switch (c)
1942	{
1943	case `'\n'`:
1944	if (syntax & RE_NEWLINE_ALT)
1945	token->type = OP_ALT;
1946	break;
1947	case `'\|'`:
1948	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1949	token->type = OP_ALT;
1950	break;
1951	case `'*'`:
1952	token->type = OP_DUP_ASTERISK;
1953	break;
1954	case `'+'`:
1955	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1956	token->type = OP_DUP_PLUS;
1957	break;
1958	case `'?'`:
1959	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1960	token->type = OP_DUP_QUESTION;
1961	break;
1962	case `'{'`:
1963	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1964	token->type = OP_OPEN_DUP_NUM;
1965	break;
1966	case `'}'`:
1967	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1968	token->type = OP_CLOSE_DUP_NUM;
1969	break;
1970	case `'('`:
1971	if (syntax & RE_NO_BK_PARENS)
1972	token->type = OP_OPEN_SUBEXP;
1973	break;
1974	case `')'`:
1975	if (syntax & RE_NO_BK_PARENS)
1976	token->type = OP_CLOSE_SUBEXP;
1977	break;
1978	case `'['`:
1979	token->type = OP_OPEN_BRACKET;
1980	break;
1981	case `'.'`:
1982	token->type = OP_PERIOD;
1983	break;
1984	case `'^'`:
1985	if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS \| RE_CARET_ANCHORS_HERE))
1986	&& re_string_cur_idx (input) != `0`)
1987	{
1988	char prev = re_string_peek_byte (input, -`1`);
1989	if (!(syntax & RE_NEWLINE_ALT) \|\| prev != `'\n'`)
1990	break;
1991	}
1992	token->type = ANCHOR;
1993	token->opr.ctx_type = LINE_FIRST;
1994	break;
1995	case `'$'`:
1996	if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1997	&& re_string_cur_idx (input) + `1` != re_string_length (input))
1998	{
1999	re_token_t next;
2000	re_string_skip_bytes (input, `1`);
2001	peek_token (&next, input, syntax);
2002	re_string_skip_bytes (input, -`1`);
2003	if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2004	break;
2005	}
2006	token->type = ANCHOR;
2007	token->opr.ctx_type = LINE_LAST;
2008	break;
2009	default:
2010	break;
2011	}
2012	return `1`;
2013	}
2014
2015	/ Peek a token from INPUT, and return the length of the token.*
2016	We must not use this function out of bracket expressions. /*
2017
2018	static int
2019	peek_token_bracket (re_token_t token, re_string_t input, reg_syntax_t syntax)
2020	{
2021	unsigned char c;
2022	if (re_string_eoi (input))
2023	{
2024	token->type = END_OF_RE;
2025	return `0`;
2026	}
2027	c = re_string_peek_byte (input, `0`);
2028	token->opr.c = c;
2029
2030	#ifdef RE_ENABLE_I18N
2031	if (input->mb_cur_max > `1`
2032	&& !re_string_first_byte (input, re_string_cur_idx (input)))
2033	{
2034	token->type = CHARACTER;
2035	return `1`;
2036	}
2037	#endif /* RE_ENABLE_I18N */
2038
2039	if (c == `'\\'` && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2040	&& re_string_cur_idx (input) + `1` < re_string_length (input))
2041	{
2042	/ In this case, '\' escape a character. /
2043	unsigned char c2;
2044	re_string_skip_bytes (input, `1`);
2045	c2 = re_string_peek_byte (input, `0`);
2046	token->opr.c = c2;
2047	token->type = CHARACTER;
2048	return `1`;
2049	}
2050	if (c == `'['`) / '[' is a special char in a bracket exps. /
2051	{
2052	unsigned char c2;
2053	int token_len;
2054	if (re_string_cur_idx (input) + `1` < re_string_length (input))
2055	c2 = re_string_peek_byte (input, `1`);
2056	else
2057	c2 = `0`;
2058	token->opr.c = c2;
2059	token_len = `2`;
2060	switch (c2)
2061	{
2062	case `'.'`:
2063	token->type = OP_OPEN_COLL_ELEM;
2064	break;
2065
2066	case `'='`:
2067	token->type = OP_OPEN_EQUIV_CLASS;
2068	break;
2069
2070	case `':'`:
2071	if (syntax & RE_CHAR_CLASSES)
2072	{
2073	token->type = OP_OPEN_CHAR_CLASS;
2074	break;
2075	}
2076	FALLTHROUGH;
2077	default:
2078	token->type = CHARACTER;
2079	token->opr.c = c;
2080	token_len = `1`;
2081	break;
2082	}
2083	return token_len;
2084	}
2085	switch (c)
2086	{
2087	case `'-'`:
2088	token->type = OP_CHARSET_RANGE;
2089	break;
2090	case `']'`:
2091	token->type = OP_CLOSE_BRACKET;
2092	break;
2093	case `'^'`:
2094	token->type = OP_NON_MATCH_LIST;
2095	break;
2096	default:
2097	token->type = CHARACTER;
2098	}
2099	return `1`;
2100	}
2101
2102	/ Functions for parser. /
2103
2104	/ Entry point of the parser.*
2105	Parse the regular expression REGEXP and return the structure tree.
2106	If an error occurs, ERR is set by error code, and return NULL.
2107	This function build the following tree, from regular expression <reg_exp>:
2108	CAT
2109	/ \
2110	/ \
2111	<reg_exp> EOR
2112
2113	CAT means concatenation.
2114	EOR means end of regular expression. /*
2115
2116	static bin_tree_t *
2117	parse (re_string_t regexp, regex_t preg, reg_syntax_t syntax,
2118	reg_errcode_t *err)
2119	{
2120	re_dfa_t *dfa = preg->buffer;
2121	bin_tree_t tree, eor, *root;
2122	re_token_t current_token;
2123	dfa->syntax = syntax;
2124	fetch_token (&current_token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2125	tree = parse_reg_exp (regexp, preg, &current_token, syntax, `0`, err);
2126	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2127	return NULL;
2128	eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2129	if (tree != NULL)
2130	root = create_tree (dfa, tree, eor, CONCAT);
2131	else
2132	root = eor;
2133	if (__glibc_unlikely (eor == NULL \|\| root == NULL))
2134	{
2135	*err = REG_ESPACE;
2136	return NULL;
2137	}
2138	return root;
2139	}
2140
2141	/ This function build the following tree, from regular expression*
2142	<branch1>\|<branch2>:
2143	ALT
2144	/ \
2145	/ \
2146	<branch1> <branch2>
2147
2148	ALT means alternative, which represents the operator '\|'. /*
2149
2150	static bin_tree_t *
2151	parse_reg_exp (re_string_t regexp, regex_t preg, re_token_t *token,
2152	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2153	{
2154	re_dfa_t *dfa = preg->buffer;
2155	bin_tree_t tree, branch = NULL;
2156	bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2157	tree = parse_branch (regexp, preg, token, syntax, nest, err);
2158	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2159	return NULL;
2160
2161	while (token->type == OP_ALT)
2162	{
2163	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2164	if (token->type != OP_ALT && token->type != END_OF_RE
2165	&& (nest == `0` \|\| token->type != OP_CLOSE_SUBEXP))
2166	{
2167	bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2168	dfa->completed_bkref_map = initial_bkref_map;
2169	branch = parse_branch (regexp, preg, token, syntax, nest, err);
2170	if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2171	{
2172	if (tree != NULL)
2173	postorder (tree, free_tree, NULL);
2174	return NULL;
2175	}
2176	dfa->completed_bkref_map \|= accumulated_bkref_map;
2177	}
2178	else
2179	branch = NULL;
2180	tree = create_tree (dfa, tree, branch, OP_ALT);
2181	if (__glibc_unlikely (tree == NULL))
2182	{
2183	*err = REG_ESPACE;
2184	return NULL;
2185	}
2186	}
2187	return tree;
2188	}
2189
2190	/ This function build the following tree, from regular expression*
2191	<exp1><exp2>:
2192	CAT
2193	/ \
2194	/ \
2195	<exp1> <exp2>
2196
2197	CAT means concatenation. /*
2198
2199	static bin_tree_t *
2200	parse_branch (re_string_t regexp, regex_t preg, re_token_t *token,
2201	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2202	{
2203	bin_tree_t tree, expr;
2204	re_dfa_t *dfa = preg->buffer;
2205	tree = parse_expression (regexp, preg, token, syntax, nest, err);
2206	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2207	return NULL;
2208
2209	while (token->type != OP_ALT && token->type != END_OF_RE
2210	&& (nest == `0` \|\| token->type != OP_CLOSE_SUBEXP))
2211	{
2212	expr = parse_expression (regexp, preg, token, syntax, nest, err);
2213	if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2214	{
2215	if (tree != NULL)
2216	postorder (tree, free_tree, NULL);
2217	return NULL;
2218	}
2219	if (tree != NULL && expr != NULL)
2220	{
2221	bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2222	if (newtree == NULL)
2223	{
2224	postorder (expr, free_tree, NULL);
2225	postorder (tree, free_tree, NULL);
2226	*err = REG_ESPACE;
2227	return NULL;
2228	}
2229	tree = newtree;
2230	}
2231	else if (tree == NULL)
2232	tree = expr;
2233	/ Otherwise expr == NULL, we don't need to create new tree. /
2234	}
2235	return tree;
2236	}
2237
2238	/ This function build the following tree, from regular expression a:
2239	*
2240	\|
2241	a
2242	*/
2243
2244	static bin_tree_t *
2245	parse_expression (re_string_t regexp, regex_t preg, re_token_t *token,
2246	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2247	{
2248	re_dfa_t *dfa = preg->buffer;
2249	bin_tree_t *tree;
2250	switch (token->type)
2251	{
2252	case CHARACTER:
2253	tree = create_token_tree (dfa, NULL, NULL, token);
2254	if (__glibc_unlikely (tree == NULL))
2255	{
2256	*err = REG_ESPACE;
2257	return NULL;
2258	}
2259	#ifdef RE_ENABLE_I18N
2260	if (dfa->mb_cur_max > `1`)
2261	{
2262	while (!re_string_eoi (regexp)
2263	&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2264	{
2265	bin_tree_t *mbc_remain;
2266	fetch_token (token, regexp, syntax);
2267	mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2268	tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2269	if (__glibc_unlikely (mbc_remain == NULL \|\| tree == NULL))
2270	{
2271	*err = REG_ESPACE;
2272	return NULL;
2273	}
2274	}
2275	}
2276	#endif
2277	break;
2278
2279	case OP_OPEN_SUBEXP:
2280	tree = parse_sub_exp (regexp, preg, token, syntax, nest + `1`, err);
2281	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2282	return NULL;
2283	break;
2284
2285	case OP_OPEN_BRACKET:
2286	tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2287	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2288	return NULL;
2289	break;
2290
2291	case OP_BACK_REF:
2292	if (!__glibc_likely (dfa->completed_bkref_map & (`1` << token->opr.idx)))
2293	{
2294	*err = REG_ESUBREG;
2295	return NULL;
2296	}
2297	dfa->used_bkref_map \|= `1` << token->opr.idx;
2298	tree = create_token_tree (dfa, NULL, NULL, token);
2299	if (__glibc_unlikely (tree == NULL))
2300	{
2301	*err = REG_ESPACE;
2302	return NULL;
2303	}
2304	++dfa->nbackref;
2305	dfa->has_mb_node = `1`;
2306	break;
2307
2308	case OP_OPEN_DUP_NUM:
2309	if (syntax & RE_CONTEXT_INVALID_DUP)
2310	{
2311	*err = REG_BADRPT;
2312	return NULL;
2313	}
2314	FALLTHROUGH;
2315	case OP_DUP_ASTERISK:
2316	case OP_DUP_PLUS:
2317	case OP_DUP_QUESTION:
2318	if (syntax & RE_CONTEXT_INVALID_OPS)
2319	{
2320	*err = REG_BADRPT;
2321	return NULL;
2322	}
2323	else if (syntax & RE_CONTEXT_INDEP_OPS)
2324	{
2325	fetch_token (token, regexp, syntax);
2326	return parse_expression (regexp, preg, token, syntax, nest, err);
2327	}
2328	FALLTHROUGH;
2329	case OP_CLOSE_SUBEXP:
2330	if ((token->type == OP_CLOSE_SUBEXP)
2331	&& !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2332	{
2333	*err = REG_ERPAREN;
2334	return NULL;
2335	}
2336	FALLTHROUGH;
2337	case OP_CLOSE_DUP_NUM:
2338	/ We treat it as a normal character. /
2339
2340	/ Then we can these characters as normal characters. /
2341	token->type = CHARACTER;
2342	/ mb_partial and word_char bits should be initialized already*
2343	by peek_token. /*
2344	tree = create_token_tree (dfa, NULL, NULL, token);
2345	if (__glibc_unlikely (tree == NULL))
2346	{
2347	*err = REG_ESPACE;
2348	return NULL;
2349	}
2350	break;
2351
2352	case ANCHOR:
2353	if ((token->opr.ctx_type
2354	& (WORD_DELIM \| NOT_WORD_DELIM \| WORD_FIRST \| WORD_LAST))
2355	&& dfa->word_ops_used == `0`)
2356	init_word_char (dfa);
2357	if (token->opr.ctx_type == WORD_DELIM
2358	\|\| token->opr.ctx_type == NOT_WORD_DELIM)
2359	{
2360	bin_tree_t tree_first, tree_last;
2361	if (token->opr.ctx_type == WORD_DELIM)
2362	{
2363	token->opr.ctx_type = WORD_FIRST;
2364	tree_first = create_token_tree (dfa, NULL, NULL, token);
2365	token->opr.ctx_type = WORD_LAST;
2366	}
2367	else
2368	{
2369	token->opr.ctx_type = INSIDE_WORD;
2370	tree_first = create_token_tree (dfa, NULL, NULL, token);
2371	token->opr.ctx_type = INSIDE_NOTWORD;
2372	}
2373	tree_last = create_token_tree (dfa, NULL, NULL, token);
2374	tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2375	if (__glibc_unlikely (tree_first == NULL \|\| tree_last == NULL
2376	\|\| tree == NULL))
2377	{
2378	*err = REG_ESPACE;
2379	return NULL;
2380	}
2381	}
2382	else
2383	{
2384	tree = create_token_tree (dfa, NULL, NULL, token);
2385	if (__glibc_unlikely (tree == NULL))
2386	{
2387	*err = REG_ESPACE;
2388	return NULL;
2389	}
2390	}
2391	/ We must return here, since ANCHORs can't be followed*
2392	by repetition operators.
2393	eg. RE"^" is invalid or "<ANCHOR(^)><CHAR()>",
2394	it must not be "<ANCHOR(^)><REPEAT()>". /
2395	fetch_token (token, regexp, syntax);
2396	return tree;
2397
2398	case OP_PERIOD:
2399	tree = create_token_tree (dfa, NULL, NULL, token);
2400	if (__glibc_unlikely (tree == NULL))
2401	{
2402	*err = REG_ESPACE;
2403	return NULL;
2404	}
2405	if (dfa->mb_cur_max > `1`)
2406	dfa->has_mb_node = `1`;
2407	break;
2408
2409	case OP_WORD:
2410	case OP_NOTWORD:
2411	tree = build_charclass_op (dfa, regexp->trans,
2412	"alnum",
2413	"_",
2414	token->type == OP_NOTWORD, err);
2415	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2416	return NULL;
2417	break;
2418
2419	case OP_SPACE:
2420	case OP_NOTSPACE:
2421	tree = build_charclass_op (dfa, regexp->trans,
2422	"space",
2423	"",
2424	token->type == OP_NOTSPACE, err);
2425	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2426	return NULL;
2427	break;
2428
2429	case OP_ALT:
2430	case END_OF_RE:
2431	return NULL;
2432
2433	case BACK_SLASH:
2434	*err = REG_EESCAPE;
2435	return NULL;
2436
2437	default:
2438	/ Must not happen? /
2439	DEBUG_ASSERT (false);
2440	return NULL;
2441	}
2442	fetch_token (token, regexp, syntax);
2443
2444	while (token->type == OP_DUP_ASTERISK \|\| token->type == OP_DUP_PLUS
2445	\|\| token->type == OP_DUP_QUESTION \|\| token->type == OP_OPEN_DUP_NUM)
2446	{
2447	bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2448	syntax, err);
2449	if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2450	{
2451	if (tree != NULL)
2452	postorder (tree, free_tree, NULL);
2453	return NULL;
2454	}
2455	tree = dup_tree;
2456	/ In BRE consecutive duplications are not allowed. /
2457	if ((syntax & RE_CONTEXT_INVALID_DUP)
2458	&& (token->type == OP_DUP_ASTERISK
2459	\|\| token->type == OP_OPEN_DUP_NUM))
2460	{
2461	if (tree != NULL)
2462	postorder (tree, free_tree, NULL);
2463	*err = REG_BADRPT;
2464	return NULL;
2465	}
2466	}
2467
2468	return tree;
2469	}
2470
2471	/ This function build the following tree, from regular expression*
2472	(<reg_exp>):
2473	SUBEXP
2474	\|
2475	<reg_exp>
2476	*/
2477
2478	static bin_tree_t *
2479	parse_sub_exp (re_string_t regexp, regex_t preg, re_token_t *token,
2480	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2481	{
2482	re_dfa_t *dfa = preg->buffer;
2483	bin_tree_t *tree;
2484	size_t cur_nsub;
2485	cur_nsub = preg->re_nsub++;
2486
2487	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2488
2489	/ The subexpression may be a null string. /
2490	if (token->type == OP_CLOSE_SUBEXP)
2491	tree = NULL;
2492	else
2493	{
2494	tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2495	if (__glibc_unlikely (*err == REG_NOERROR
2496	&& token->type != OP_CLOSE_SUBEXP))
2497	{
2498	if (tree != NULL)
2499	postorder (tree, free_tree, NULL);
2500	*err = REG_EPAREN;
2501	}
2502	if (__glibc_unlikely (*err != REG_NOERROR))
2503	return NULL;
2504	}
2505
2506	if (cur_nsub <= `'9'` - `'1'`)
2507	dfa->completed_bkref_map \|= `1` << cur_nsub;
2508
2509	tree = create_tree (dfa, tree, NULL, SUBEXP);
2510	if (__glibc_unlikely (tree == NULL))
2511	{
2512	*err = REG_ESPACE;
2513	return NULL;
2514	}
2515	tree->token.opr.idx = cur_nsub;
2516	return tree;
2517	}
2518
2519	/ This function parse repetition operators like "", "+", "{1,3}" etc. /*
2520
2521	static bin_tree_t *
2522	parse_dup_op (bin_tree_t elem, re_string_t regexp, re_dfa_t *dfa,
2523	re_token_t token, reg_syntax_t syntax, reg_errcode_t err)
2524	{
2525	bin_tree_t tree = NULL, old_tree = NULL;
2526	Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2527	re_token_t start_token = *token;
2528
2529	if (token->type == OP_OPEN_DUP_NUM)
2530	{
2531	end = `0`;
2532	start = fetch_number (regexp, token, syntax);
2533	if (start == -`1`)
2534	{
2535	if (token->type == CHARACTER && token->opr.c == `','`)
2536	start = `0`; / We treat "{,m}" as "{0,m}". /
2537	else
2538	{
2539	err = REG_BADBR; /* <re>{} is invalid. /
2540	return NULL;
2541	}
2542	}
2543	if (__glibc_likely (start != -`2`))
2544	{
2545	/ We treat "{n}" as "{n,n}". /
2546	end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2547	: ((token->type == CHARACTER && token->opr.c == `','`)
2548	? fetch_number (regexp, token, syntax) : -`2`));
2549	}
2550	if (__glibc_unlikely (start == -`2` \|\| end == -`2`))
2551	{
2552	/ Invalid sequence. /
2553	if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2554	{
2555	if (token->type == END_OF_RE)
2556	*err = REG_EBRACE;
2557	else
2558	*err = REG_BADBR;
2559
2560	return NULL;
2561	}
2562
2563	/ If the syntax bit is set, rollback. /
2564	re_string_set_index (regexp, start_idx);
2565	*token = start_token;
2566	token->type = CHARACTER;
2567	/ mb_partial and word_char bits should be already initialized by*
2568	peek_token. /*
2569	return elem;
2570	}
2571
2572	if (__glibc_unlikely ((end != -`1` && start > end)
2573	\|\| token->type != OP_CLOSE_DUP_NUM))
2574	{
2575	/ First number greater than second. /
2576	*err = REG_BADBR;
2577	return NULL;
2578	}
2579
2580	if (__glibc_unlikely (RE_DUP_MAX < (end == -`1` ? start : end)))
2581	{
2582	*err = REG_ESIZE;
2583	return NULL;
2584	}
2585	}
2586	else
2587	{
2588	start = (token->type == OP_DUP_PLUS) ? `1` : `0`;
2589	end = (token->type == OP_DUP_QUESTION) ? `1` : -`1`;
2590	}
2591
2592	fetch_token (token, regexp, syntax);
2593
2594	if (__glibc_unlikely (elem == NULL))
2595	return NULL;
2596	if (__glibc_unlikely (start == `0` && end == `0`))
2597	{
2598	postorder (elem, free_tree, NULL);
2599	return NULL;
2600	}
2601
2602	/ Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". /
2603	if (__glibc_unlikely (start > `0`))
2604	{
2605	tree = elem;
2606	for (i = `2`; i <= start; ++i)
2607	{
2608	elem = duplicate_tree (elem, dfa);
2609	tree = create_tree (dfa, tree, elem, CONCAT);
2610	if (__glibc_unlikely (elem == NULL \|\| tree == NULL))
2611	goto parse_dup_op_espace;
2612	}
2613
2614	if (start == end)
2615	return tree;
2616
2617	/ Duplicate ELEM before it is marked optional. /
2618	elem = duplicate_tree (elem, dfa);
2619	if (__glibc_unlikely (elem == NULL))
2620	goto parse_dup_op_espace;
2621	old_tree = tree;
2622	}
2623	else
2624	old_tree = NULL;
2625
2626	if (elem->token.type == SUBEXP)
2627	{
2628	uintptr_t subidx = elem->token.opr.idx;
2629	postorder (elem, mark_opt_subexp, (void *) subidx);
2630	}
2631
2632	tree = create_tree (dfa, elem, NULL,
2633	(end == -`1` ? OP_DUP_ASTERISK : OP_ALT));
2634	if (__glibc_unlikely (tree == NULL))
2635	goto parse_dup_op_espace;
2636
2637	/ This loop is actually executed only when end != -1,*
2638	to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2639	already created the start+1-th copy. /*
2640	if (TYPE_SIGNED (Idx) \|\| end != -`1`)
2641	for (i = start + `2`; i <= end; ++i)
2642	{
2643	elem = duplicate_tree (elem, dfa);
2644	tree = create_tree (dfa, tree, elem, CONCAT);
2645	if (__glibc_unlikely (elem == NULL \|\| tree == NULL))
2646	goto parse_dup_op_espace;
2647
2648	tree = create_tree (dfa, tree, NULL, OP_ALT);
2649	if (__glibc_unlikely (tree == NULL))
2650	goto parse_dup_op_espace;
2651	}
2652
2653	if (old_tree)
2654	tree = create_tree (dfa, old_tree, tree, CONCAT);
2655
2656	return tree;
2657
2658	parse_dup_op_espace:
2659	*err = REG_ESPACE;
2660	return NULL;
2661	}
2662
2663	/ Size of the names for collating symbol/equivalence_class/character_class.*
2664	I'm not sure, but maybe enough. /*
2665	#define BRACKET_NAME_BUF_SIZE 32
2666
2667	#ifndef _LIBC
2668
2669	# ifdef RE_ENABLE_I18N
2670	/ Convert the byte B to the corresponding wide character. In a*
2671	unibyte locale, treat B as itself. In a multibyte locale, return
2672	WEOF if B is an encoding error. /*
2673	static wint_t
2674	parse_byte (unsigned char b, re_charset_t *mbcset)
2675	{
2676	return mbcset == NULL ? b : __btowc (b);
2677	}
2678	# endif
2679
2680	/ Local function for parse_bracket_exp only used in case of NOT _LIBC.*
2681	Build the range expression which starts from START_ELEM, and ends
2682	at END_ELEM. The result are written to MBCSET and SBCSET.
2683	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2684	mbcset->range_ends, is a pointer argument since we may
2685	update it. /*
2686
2687	static reg_errcode_t
2688	# ifdef RE_ENABLE_I18N
2689	build_range_exp (const reg_syntax_t syntax,
2690	bitset_t sbcset,
2691	re_charset_t *mbcset,
2692	Idx *range_alloc,
2693	const bracket_elem_t *start_elem,
2694	const bracket_elem_t *end_elem)
2695	# else /* not RE_ENABLE_I18N */
2696	build_range_exp (const reg_syntax_t syntax,
2697	bitset_t sbcset,
2698	const bracket_elem_t *start_elem,
2699	const bracket_elem_t *end_elem)
2700	# endif /* not RE_ENABLE_I18N */
2701	{
2702	unsigned int start_ch, end_ch;
2703	/ Equivalence Classes and Character Classes can't be a range start/end. /
2704	if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2705	\|\| start_elem->type == CHAR_CLASS
2706	\|\| end_elem->type == EQUIV_CLASS
2707	\|\| end_elem->type == CHAR_CLASS))
2708	return REG_ERANGE;
2709
2710	/ We can handle no multi character collating elements without libc*
2711	support. /*
2712	if (__glibc_unlikely ((start_elem->type == COLL_SYM
2713	&& strlen ((char *) start_elem->opr.name) > `1`)
2714	\|\| (end_elem->type == COLL_SYM
2715	&& strlen ((char *) end_elem->opr.name) > `1`)))
2716	return REG_ECOLLATE;
2717
2718	# ifdef RE_ENABLE_I18N
2719	{
2720	wchar_t wc;
2721	wint_t start_wc;
2722	wint_t end_wc;
2723
2724	start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2725	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[`0`]
2726	: `0`));
2727	end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2728	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[`0`]
2729	: `0`));
2730	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
2731	? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2732	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
2733	? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2734	if (start_wc == WEOF \|\| end_wc == WEOF)
2735	return REG_ECOLLATE;
2736	else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2737	&& start_wc > end_wc))
2738	return REG_ERANGE;
2739
2740	/ Got valid collation sequence values, add them as a new entry.*
2741	However, for !_LIBC we have no collation elements: if the
2742	character set is single byte, the single byte character set
2743	that we build below suffices. parse_bracket_exp passes
2744	no MBCSET if dfa->mb_cur_max == 1. /*
2745	if (mbcset)
2746	{
2747	/ Check the space of the arrays. /
2748	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2749	{
2750	/ There is not enough space, need realloc. /
2751	wchar_t new_array_start, new_array_end;
2752	Idx new_nranges;
2753
2754	/ +1 in case of mbcset->nranges is 0. /
2755	new_nranges = `2` * mbcset->nranges + `1`;
2756	/ Use realloc since mbcset->range_starts and mbcset->range_ends*
2757	are NULL if range_alloc == 0. /
2758	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2759	new_nranges);
2760	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2761	new_nranges);
2762
2763	if (__glibc_unlikely (new_array_start == NULL
2764	\|\| new_array_end == NULL))
2765	{
2766	re_free (new_array_start);
2767	re_free (new_array_end);
2768	return REG_ESPACE;
2769	}
2770
2771	mbcset->range_starts = new_array_start;
2772	mbcset->range_ends = new_array_end;
2773	*range_alloc = new_nranges;
2774	}
2775
2776	mbcset->range_starts[mbcset->nranges] = start_wc;
2777	mbcset->range_ends[mbcset->nranges++] = end_wc;
2778	}
2779
2780	/ Build the table for single byte characters. /
2781	for (wc = `0`; wc < SBC_MAX; ++wc)
2782	{
2783	if (start_wc <= wc && wc <= end_wc)
2784	bitset_set (sbcset, wc);
2785	}
2786	}
2787	# else /* not RE_ENABLE_I18N */
2788	{
2789	unsigned int ch;
2790	start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2791	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[`0`]
2792	: `0`));
2793	end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2794	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[`0`]
2795	: `0`));
2796	if (start_ch > end_ch)
2797	return REG_ERANGE;
2798	/ Build the table for single byte characters. /
2799	for (ch = `0`; ch < SBC_MAX; ++ch)
2800	if (start_ch <= ch && ch <= end_ch)
2801	bitset_set (sbcset, ch);
2802	}
2803	# endif /* not RE_ENABLE_I18N */
2804	return REG_NOERROR;
2805	}
2806	#endif /* not _LIBC */
2807
2808	#ifndef _LIBC
2809	/ Helper function for parse_bracket_exp only used in case of NOT _LIBC..*
2810	Build the collating element which is represented by NAME.
2811	The result are written to MBCSET and SBCSET.
2812	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2813	pointer argument since we may update it. /*
2814
2815	static reg_errcode_t
2816	# ifdef RE_ENABLE_I18N
2817	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2818	Idx coll_sym_alloc, const* unsigned char *name)
2819	# else /* not RE_ENABLE_I18N */
2820	build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2821	# endif /* not RE_ENABLE_I18N */
2822	{
2823	size_t name_len = strlen ((const char *) name);
2824	if (__glibc_unlikely (name_len != `1`))
2825	return REG_ECOLLATE;
2826	else
2827	{
2828	bitset_set (sbcset, name[`0`]);
2829	return REG_NOERROR;
2830	}
2831	}
2832	#endif /* not _LIBC */
2833
2834	#ifdef _LIBC
2835	/ Local function for parse_bracket_exp used in _LIBC environment.*
2836	Seek the collating symbol entry corresponding to NAME.
2837	Return the index of the symbol in the SYMB_TABLE,
2838	or -1 if not found. /*
2839
2840	static inline int32_t
2841	__attribute__ ((always_inline))
2842	seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
2843	const int32_t *symb_table, int32_t table_size,
2844	const unsigned char *extra)
2845	{
2846	int32_t elem;
2847
2848	for (elem = `0`; elem < table_size; elem++)
2849	if (symb_table[`2` * elem] != `0`)
2850	{
2851	int32_t idx = symb_table[`2` * elem + `1`];
2852	/ Skip the name of collating element name. /
2853	idx += `1` + extra[idx];
2854	if (/ Compare the length of the name. /
2855	name_len == extra[idx]
2856	/ Compare the name. /
2857	&& memcmp (name, &extra[idx + `1`], name_len) == `0`)
2858	/ Yep, this is the entry. /
2859	return elem;
2860	}
2861	return -`1`;
2862	}
2863
2864	/ Local function for parse_bracket_exp used in _LIBC environment.*
2865	Look up the collation sequence value of BR_ELEM.
2866	Return the value if succeeded, UINT_MAX otherwise. /*
2867
2868	static inline unsigned int
2869	__attribute__ ((always_inline))
2870	lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
2871	const unsigned char *collseqmb,
2872	const char *collseqwc, int32_t table_size,
2873	const int32_t *symb_table,
2874	const unsigned char *extra)
2875	{
2876	if (br_elem->type == SB_CHAR)
2877	{
2878	/ if (MB_CUR_MAX == 1) /
2879	if (nrules == `0`)
2880	return collseqmb[br_elem->opr.ch];
2881	else
2882	{
2883	wint_t wc = __btowc (br_elem->opr.ch);
2884	return __collseq_table_lookup (collseqwc, wc);
2885	}
2886	}
2887	else if (br_elem->type == MB_CHAR)
2888	{
2889	if (nrules != `0`)
2890	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2891	}
2892	else if (br_elem->type == COLL_SYM)
2893	{
2894	size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2895	if (nrules != `0`)
2896	{
2897	int32_t elem, idx;
2898	elem = seek_collating_symbol_entry (br_elem->opr.name,
2899	sym_name_len,
2900	symb_table, table_size,
2901	extra);
2902	if (elem != -`1`)
2903	{
2904	/ We found the entry. /
2905	idx = symb_table[`2` * elem + `1`];
2906	/ Skip the name of collating element name. /
2907	idx += `1` + extra[idx];
2908	/ Skip the byte sequence of the collating element. /
2909	idx += `1` + extra[idx];
2910	/ Adjust for the alignment. /
2911	idx = (idx + `3`) & ~`3`;
2912	/ Skip the multibyte collation sequence value. /
2913	idx += sizeof (unsigned int);
2914	/ Skip the wide char sequence of the collating element. /
2915	idx += sizeof (unsigned int) *
2916	(`1` + (unsigned* int *) (extra + idx));
2917	/ Return the collation sequence value. /
2918	return (unsigned* int *) (extra + idx);
2919	}
2920	else if (sym_name_len == `1`)
2921	{
2922	/ No valid character. Match it as a single byte*
2923	character. /*
2924	return collseqmb[br_elem->opr.name[`0`]];
2925	}
2926	}
2927	else if (sym_name_len == `1`)
2928	return collseqmb[br_elem->opr.name[`0`]];
2929	}
2930	return UINT_MAX;
2931	}
2932
2933	/ Local function for parse_bracket_exp used in _LIBC environment.*
2934	Build the range expression which starts from START_ELEM, and ends
2935	at END_ELEM. The result are written to MBCSET and SBCSET.
2936	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2937	mbcset->range_ends, is a pointer argument since we may
2938	update it. /*
2939
2940	static inline reg_errcode_t
2941	__attribute__ ((always_inline))
2942	build_range_exp (bitset_t sbcset, re_charset_t mbcset, int* *range_alloc,
2943	bracket_elem_t start_elem, bracket_elem_t end_elem,
2944	re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
2945	const unsigned char collseqmb, const* char *collseqwc,
2946	int32_t table_size, const int32_t *symb_table,
2947	const unsigned char *extra)
2948	{
2949	unsigned int ch;
2950	uint32_t start_collseq;
2951	uint32_t end_collseq;
2952
2953	/ Equivalence Classes and Character Classes can't be a range*
2954	start/end. /*
2955	if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2956	\|\| start_elem->type == CHAR_CLASS
2957	\|\| end_elem->type == EQUIV_CLASS
2958	\|\| end_elem->type == CHAR_CLASS))
2959	return REG_ERANGE;
2960
2961	/ FIXME: Implement rational ranges here, too. /
2962	start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
2963	table_size, symb_table, extra);
2964	end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
2965	table_size, symb_table, extra);
2966	/ Check start/end collation sequence values. /
2967	if (__glibc_unlikely (start_collseq == UINT_MAX
2968	\|\| end_collseq == UINT_MAX))
2969	return REG_ECOLLATE;
2970	if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2971	&& start_collseq > end_collseq))
2972	return REG_ERANGE;
2973
2974	/ Got valid collation sequence values, add them as a new entry.*
2975	However, if we have no collation elements, and the character set
2976	is single byte, the single byte character set that we
2977	build below suffices. /*
2978	if (nrules > `0` \|\| dfa->mb_cur_max > `1`)
2979	{
2980	/ Check the space of the arrays. /
2981	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2982	{
2983	/ There is not enough space, need realloc. /
2984	uint32_t *new_array_start;
2985	uint32_t *new_array_end;
2986	int new_nranges;
2987
2988	/ +1 in case of mbcset->nranges is 0. /
2989	new_nranges = `2` * mbcset->nranges + `1`;
2990	new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2991	new_nranges);
2992	new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2993	new_nranges);
2994
2995	if (__glibc_unlikely (new_array_start == NULL
2996	\|\| new_array_end == NULL))
2997	return REG_ESPACE;
2998
2999	mbcset->range_starts = new_array_start;
3000	mbcset->range_ends = new_array_end;
3001	*range_alloc = new_nranges;
3002	}
3003
3004	mbcset->range_starts[mbcset->nranges] = start_collseq;
3005	mbcset->range_ends[mbcset->nranges++] = end_collseq;
3006	}
3007
3008	/ Build the table for single byte characters. /
3009	for (ch = `0`; ch < SBC_MAX; ch++)
3010	{
3011	uint32_t ch_collseq;
3012	/ if (MB_CUR_MAX == 1) /
3013	if (nrules == `0`)
3014	ch_collseq = collseqmb[ch];
3015	else
3016	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3017	if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3018	bitset_set (sbcset, ch);
3019	}
3020	return REG_NOERROR;
3021	}
3022
3023	/ Local function for parse_bracket_exp used in _LIBC environment.*
3024	Build the collating element which is represented by NAME.
3025	The result are written to MBCSET and SBCSET.
3026	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3027	pointer argument since we may update it. /*
3028
3029	static inline reg_errcode_t
3030	__attribute__ ((always_inline))
3031	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3032	int coll_sym_alloc, const* unsigned char *name,
3033	uint32_t nrules, int32_t table_size,
3034	const int32_t symb_table, const* unsigned char *extra)
3035	{
3036	int32_t elem, idx;
3037	size_t name_len = strlen ((const char *) name);
3038	if (nrules != `0`)
3039	{
3040	elem = seek_collating_symbol_entry (name, name_len, symb_table,
3041	table_size, extra);
3042	if (elem != -`1`)
3043	{
3044	/ We found the entry. /
3045	idx = symb_table[`2` * elem + `1`];
3046	/ Skip the name of collating element name. /
3047	idx += `1` + extra[idx];
3048	}
3049	else if (name_len == `1`)
3050	{
3051	/ No valid character, treat it as a normal*
3052	character. /*
3053	bitset_set (sbcset, name[`0`]);
3054	return REG_NOERROR;
3055	}
3056	else
3057	return REG_ECOLLATE;
3058
3059	/ Got valid collation sequence, add it as a new entry. /
3060	/ Check the space of the arrays. /
3061	if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3062	{
3063	/ Not enough, realloc it. /
3064	/ +1 in case of mbcset->ncoll_syms is 0. /
3065	int new_coll_sym_alloc = `2` * mbcset->ncoll_syms + `1`;
3066	/ Use realloc since mbcset->coll_syms is NULL*
3067	if alloc == 0. /
3068	int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3069	new_coll_sym_alloc);
3070	if (__glibc_unlikely (new_coll_syms == NULL))
3071	return REG_ESPACE;
3072	mbcset->coll_syms = new_coll_syms;
3073	*coll_sym_alloc = new_coll_sym_alloc;
3074	}
3075	mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3076	return REG_NOERROR;
3077	}
3078	else
3079	{
3080	if (__glibc_unlikely (name_len != `1`))
3081	return REG_ECOLLATE;
3082	else
3083	{
3084	bitset_set (sbcset, name[`0`]);
3085	return REG_NOERROR;
3086	}
3087	}
3088	}
3089	#endif /* _LIBC */
3090
3091	/ This function parse bracket expression like "[abc]", "[a-c]",*
3092	"[[.a-a.]]" etc. /*
3093
3094	static bin_tree_t *
3095	parse_bracket_exp (re_string_t regexp, re_dfa_t dfa, re_token_t *token,
3096	reg_syntax_t syntax, reg_errcode_t *err)
3097	{
3098	#ifdef _LIBC
3099	const unsigned char *collseqmb;
3100	const char *collseqwc = NULL;
3101	uint32_t nrules;
3102	int32_t table_size = `0`;
3103	const int32_t *symb_table = NULL;
3104	const unsigned char *extra = NULL;
3105	#endif
3106
3107	re_token_t br_token;
3108	re_bitset_ptr_t sbcset;
3109	#ifdef RE_ENABLE_I18N
3110	re_charset_t *mbcset;
3111	Idx coll_sym_alloc = `0`, range_alloc = `0`, mbchar_alloc = `0`;
3112	Idx equiv_class_alloc = `0`, char_class_alloc = `0`;
3113	#endif /* not RE_ENABLE_I18N */
3114	bool non_match = false;
3115	bin_tree_t *work_tree;
3116	int token_len;
3117	bool first_round = true;
3118	#ifdef _LIBC
3119	collseqmb = (const unsigned char *)
3120	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3121	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3122	if (nrules)
3123	{
3124	/*
3125	if (MB_CUR_MAX > 1)
3126	*/
3127	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3128	table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3129	symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3130	_NL_COLLATE_SYMB_TABLEMB);
3131	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3132	_NL_COLLATE_SYMB_EXTRAMB);
3133	}
3134	#endif
3135	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), `1`);
3136	#ifdef RE_ENABLE_I18N
3137	mbcset = (re_charset_t ) calloc (sizeof* (re_charset_t), `1`);
3138	#endif /* RE_ENABLE_I18N */
3139	#ifdef RE_ENABLE_I18N
3140	if (__glibc_unlikely (sbcset == NULL \|\| mbcset == NULL))
3141	#else
3142	if (__glibc_unlikely (sbcset == NULL))
3143	#endif /* RE_ENABLE_I18N */
3144	{
3145	re_free (sbcset);
3146	#ifdef RE_ENABLE_I18N
3147	re_free (mbcset);
3148	#endif
3149	*err = REG_ESPACE;
3150	return NULL;
3151	}
3152
3153	token_len = peek_token_bracket (token, regexp, syntax);
3154	if (__glibc_unlikely (token->type == END_OF_RE))
3155	{
3156	*err = REG_BADPAT;
3157	goto parse_bracket_exp_free_return;
3158	}
3159	if (token->type == OP_NON_MATCH_LIST)
3160	{
3161	#ifdef RE_ENABLE_I18N
3162	mbcset->non_match = `1`;
3163	#endif /* not RE_ENABLE_I18N */
3164	non_match = true;
3165	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3166	bitset_set (sbcset, `'\n'`);
3167	re_string_skip_bytes (regexp, token_len); / Skip a token. /
3168	token_len = peek_token_bracket (token, regexp, syntax);
3169	if (__glibc_unlikely (token->type == END_OF_RE))
3170	{
3171	*err = REG_BADPAT;
3172	goto parse_bracket_exp_free_return;
3173	}
3174	}
3175
3176	/ We treat the first ']' as a normal character. /
3177	if (token->type == OP_CLOSE_BRACKET)
3178	token->type = CHARACTER;
3179
3180	while (`1`)
3181	{
3182	bracket_elem_t start_elem, end_elem;
3183	unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3184	unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3185	reg_errcode_t ret;
3186	int token_len2 = `0`;
3187	bool is_range_exp = false;
3188	re_token_t token2;
3189
3190	start_elem.opr.name = start_name_buf;
3191	start_elem.type = COLL_SYM;
3192	ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3193	syntax, first_round);
3194	if (__glibc_unlikely (ret != REG_NOERROR))
3195	{
3196	*err = ret;
3197	goto parse_bracket_exp_free_return;
3198	}
3199	first_round = false;
3200
3201	/ Get information about the next token. We need it in any case. /
3202	token_len = peek_token_bracket (token, regexp, syntax);
3203
3204	/ Do not check for ranges if we know they are not allowed. /
3205	if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3206	{
3207	if (__glibc_unlikely (token->type == END_OF_RE))
3208	{
3209	*err = REG_EBRACK;
3210	goto parse_bracket_exp_free_return;
3211	}
3212	if (token->type == OP_CHARSET_RANGE)
3213	{
3214	re_string_skip_bytes (regexp, token_len); / Skip '-'. /
3215	token_len2 = peek_token_bracket (&token2, regexp, syntax);
3216	if (__glibc_unlikely (token2.type == END_OF_RE))
3217	{
3218	*err = REG_EBRACK;
3219	goto parse_bracket_exp_free_return;
3220	}
3221	if (token2.type == OP_CLOSE_BRACKET)
3222	{
3223	/ We treat the last '-' as a normal character. /
3224	re_string_skip_bytes (regexp, -token_len);
3225	token->type = CHARACTER;
3226	}
3227	else
3228	is_range_exp = true;
3229	}
3230	}
3231
3232	if (is_range_exp == true)
3233	{
3234	end_elem.opr.name = end_name_buf;
3235	end_elem.type = COLL_SYM;
3236	ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3237	dfa, syntax, true);
3238	if (__glibc_unlikely (ret != REG_NOERROR))
3239	{
3240	*err = ret;
3241	goto parse_bracket_exp_free_return;
3242	}
3243
3244	token_len = peek_token_bracket (token, regexp, syntax);
3245
3246	#ifdef _LIBC
3247	*err = build_range_exp (sbcset, mbcset, &range_alloc,
3248	&start_elem, &end_elem,
3249	dfa, syntax, nrules, collseqmb, collseqwc,
3250	table_size, symb_table, extra);
3251	#else
3252	# ifdef RE_ENABLE_I18N
3253	*err = build_range_exp (syntax, sbcset,
3254	dfa->mb_cur_max > `1` ? mbcset : NULL,
3255	&range_alloc, &start_elem, &end_elem);
3256	# else
3257	*err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3258	# endif
3259	#endif /* RE_ENABLE_I18N */
3260	if (__glibc_unlikely (*err != REG_NOERROR))
3261	goto parse_bracket_exp_free_return;
3262	}
3263	else
3264	{
3265	switch (start_elem.type)
3266	{
3267	case SB_CHAR:
3268	bitset_set (sbcset, start_elem.opr.ch);
3269	break;
3270	#ifdef RE_ENABLE_I18N
3271	case MB_CHAR:
3272	/ Check whether the array has enough space. /
3273	if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3274	{
3275	wchar_t *new_mbchars;
3276	/ Not enough, realloc it. /
3277	/ +1 in case of mbcset->nmbchars is 0. /
3278	mbchar_alloc = `2` * mbcset->nmbchars + `1`;
3279	/ Use realloc since array is NULL if alloc == 0. /*
3280	new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3281	mbchar_alloc);
3282	if (__glibc_unlikely (new_mbchars == NULL))
3283	goto parse_bracket_exp_espace;
3284	mbcset->mbchars = new_mbchars;
3285	}
3286	mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3287	break;
3288	#endif /* RE_ENABLE_I18N */
3289	case EQUIV_CLASS:
3290	*err = build_equiv_class (sbcset,
3291	#ifdef RE_ENABLE_I18N
3292	mbcset, &equiv_class_alloc,
3293	#endif /* RE_ENABLE_I18N */
3294	start_elem.opr.name);
3295	if (__glibc_unlikely (*err != REG_NOERROR))
3296	goto parse_bracket_exp_free_return;
3297	break;
3298	case COLL_SYM:
3299	*err = build_collating_symbol (sbcset,
3300	#ifdef RE_ENABLE_I18N
3301	mbcset, &coll_sym_alloc,
3302	#endif /* RE_ENABLE_I18N */
3303	start_elem.opr.name,
3304	nrules, table_size, symb_table, extra);
3305	if (__glibc_unlikely (*err != REG_NOERROR))
3306	goto parse_bracket_exp_free_return;
3307	break;
3308	case CHAR_CLASS:
3309	*err = build_charclass (regexp->trans, sbcset,
3310	#ifdef RE_ENABLE_I18N
3311	mbcset, &char_class_alloc,
3312	#endif /* RE_ENABLE_I18N */
3313	(const char *) start_elem.opr.name,
3314	syntax);
3315	if (__glibc_unlikely (*err != REG_NOERROR))
3316	goto parse_bracket_exp_free_return;
3317	break;
3318	default:
3319	DEBUG_ASSERT (false);
3320	break;
3321	}
3322	}
3323	if (__glibc_unlikely (token->type == END_OF_RE))
3324	{
3325	*err = REG_EBRACK;
3326	goto parse_bracket_exp_free_return;
3327	}
3328	if (token->type == OP_CLOSE_BRACKET)
3329	break;
3330	}
3331
3332	re_string_skip_bytes (regexp, token_len); / Skip a token. /
3333
3334	/ If it is non-matching list. /
3335	if (non_match)
3336	bitset_not (sbcset);
3337
3338	#ifdef RE_ENABLE_I18N
3339	/ Ensure only single byte characters are set. /
3340	if (dfa->mb_cur_max > `1`)
3341	bitset_mask (sbcset, dfa->sb_char);
3342
3343	if (mbcset->nmbchars \|\| mbcset->ncoll_syms \|\| mbcset->nequiv_classes
3344	\|\| mbcset->nranges \|\| (dfa->mb_cur_max > `1` && (mbcset->nchar_classes
3345	\|\| mbcset->non_match)))
3346	{
3347	bin_tree_t *mbc_tree;
3348	int sbc_idx;
3349	/ Build a tree for complex bracket. /
3350	dfa->has_mb_node = `1`;
3351	br_token.type = COMPLEX_BRACKET;
3352	br_token.opr.mbcset = mbcset;
3353	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3354	if (__glibc_unlikely (mbc_tree == NULL))
3355	goto parse_bracket_exp_espace;
3356	for (sbc_idx = `0`; sbc_idx < BITSET_WORDS; ++sbc_idx)
3357	if (sbcset[sbc_idx])
3358	break;
3359	/ If there are no bits set in sbcset, there is no point*
3360	of having both SIMPLE_BRACKET and COMPLEX_BRACKET. /*
3361	if (sbc_idx < BITSET_WORDS)
3362	{
3363	/ Build a tree for simple bracket. /
3364	br_token.type = SIMPLE_BRACKET;
3365	br_token.opr.sbcset = sbcset;
3366	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3367	if (__glibc_unlikely (work_tree == NULL))
3368	goto parse_bracket_exp_espace;
3369
3370	/ Then join them by ALT node. /
3371	work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3372	if (__glibc_unlikely (work_tree == NULL))
3373	goto parse_bracket_exp_espace;
3374	}
3375	else
3376	{
3377	re_free (sbcset);
3378	work_tree = mbc_tree;
3379	}
3380	}
3381	else
3382	#endif /* not RE_ENABLE_I18N */
3383	{
3384	#ifdef RE_ENABLE_I18N
3385	free_charset (mbcset);
3386	#endif
3387	/ Build a tree for simple bracket. /
3388	br_token.type = SIMPLE_BRACKET;
3389	br_token.opr.sbcset = sbcset;
3390	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3391	if (__glibc_unlikely (work_tree == NULL))
3392	goto parse_bracket_exp_espace;
3393	}
3394	return work_tree;
3395
3396	parse_bracket_exp_espace:
3397	*err = REG_ESPACE;
3398	parse_bracket_exp_free_return:
3399	re_free (sbcset);
3400	#ifdef RE_ENABLE_I18N
3401	free_charset (mbcset);
3402	#endif /* RE_ENABLE_I18N */
3403	return NULL;
3404	}
3405
3406	/ Parse an element in the bracket expression. /
3407
3408	static reg_errcode_t
3409	parse_bracket_element (bracket_elem_t elem, re_string_t regexp,
3410	re_token_t token, int* token_len, re_dfa_t *dfa,
3411	reg_syntax_t syntax, bool accept_hyphen)
3412	{
3413	#ifdef RE_ENABLE_I18N
3414	int cur_char_size;
3415	cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3416	if (cur_char_size > `1`)
3417	{
3418	elem->type = MB_CHAR;
3419	elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3420	re_string_skip_bytes (regexp, cur_char_size);
3421	return REG_NOERROR;
3422	}
3423	#endif /* RE_ENABLE_I18N */
3424	re_string_skip_bytes (regexp, token_len); / Skip a token. /
3425	if (token->type == OP_OPEN_COLL_ELEM \|\| token->type == OP_OPEN_CHAR_CLASS
3426	\|\| token->type == OP_OPEN_EQUIV_CLASS)
3427	return parse_bracket_symbol (elem, regexp, token);
3428	if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3429	{
3430	/ A '-' must only appear as anything but a range indicator before*
3431	the closing bracket. Everything else is an error. /*
3432	re_token_t token2;
3433	(void) peek_token_bracket (&token2, regexp, syntax);
3434	if (token2.type != OP_CLOSE_BRACKET)
3435	/ The actual error value is not standardized since this whole*
3436	case is undefined. But ERANGE makes good sense. /*
3437	return REG_ERANGE;
3438	}
3439	elem->type = SB_CHAR;
3440	elem->opr.ch = token->opr.c;
3441	return REG_NOERROR;
3442	}
3443
3444	/ Parse a bracket symbol in the bracket expression. Bracket symbols are*
3445	such as [:<character_class>:], [.<collating_element>.], and
3446	[=<equivalent_class>=]. /*
3447
3448	static reg_errcode_t
3449	parse_bracket_symbol (bracket_elem_t elem, re_string_t regexp,
3450	re_token_t *token)
3451	{
3452	unsigned char ch, delim = token->opr.c;
3453	int i = `0`;
3454	if (re_string_eoi(regexp))
3455	return REG_EBRACK;
3456	for (;; ++i)
3457	{
3458	if (i >= BRACKET_NAME_BUF_SIZE)
3459	return REG_EBRACK;
3460	if (token->type == OP_OPEN_CHAR_CLASS)
3461	ch = re_string_fetch_byte_case (regexp);
3462	else
3463	ch = re_string_fetch_byte (regexp);
3464	if (re_string_eoi(regexp))
3465	return REG_EBRACK;
3466	if (ch == delim && re_string_peek_byte (regexp, `0`) == `']'`)
3467	break;
3468	elem->opr.name[i] = ch;
3469	}
3470	re_string_skip_bytes (regexp, `1`);
3471	elem->opr.name[i] = `'\0'`;
3472	switch (token->type)
3473	{
3474	case OP_OPEN_COLL_ELEM:
3475	elem->type = COLL_SYM;
3476	break;
3477	case OP_OPEN_EQUIV_CLASS:
3478	elem->type = EQUIV_CLASS;
3479	break;
3480	case OP_OPEN_CHAR_CLASS:
3481	elem->type = CHAR_CLASS;
3482	break;
3483	default:
3484	break;
3485	}
3486	return REG_NOERROR;
3487	}
3488
3489	/ Helper function for parse_bracket_exp.*
3490	Build the equivalence class which is represented by NAME.
3491	The result are written to MBCSET and SBCSET.
3492	EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3493	is a pointer argument since we may update it. /*
3494
3495	static reg_errcode_t
3496	#ifdef RE_ENABLE_I18N
3497	build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3498	Idx equiv_class_alloc, const* unsigned char *name)
3499	#else /* not RE_ENABLE_I18N */
3500	build_equiv_class (bitset_t sbcset, const unsigned char *name)
3501	#endif /* not RE_ENABLE_I18N */
3502	{
3503	#ifdef _LIBC
3504	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3505	if (nrules != `0`)
3506	{
3507	const int32_t table, indirect;
3508	const unsigned char weights, extra, *cp;
3509	unsigned char char_buf[`2`];
3510	int32_t idx1, idx2;
3511	unsigned int ch;
3512	size_t len;
3513	/ Calculate the index for equivalence class. /
3514	cp = name;
3515	table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3516	weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3517	_NL_COLLATE_WEIGHTMB);
3518	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3519	_NL_COLLATE_EXTRAMB);
3520	indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3521	_NL_COLLATE_INDIRECTMB);
3522	idx1 = findidx (table, indirect, extra, &cp, -`1`);
3523	if (__glibc_unlikely (idx1 == `0` \|\| *cp != `'\0'`))
3524	/ This isn't a valid character. /
3525	return REG_ECOLLATE;
3526
3527	/ Build single byte matching table for this equivalence class. /
3528	len = weights[idx1 & `0xffffff`];
3529	for (ch = `0`; ch < SBC_MAX; ++ch)
3530	{
3531	char_buf[`0`] = ch;
3532	cp = char_buf;
3533	idx2 = findidx (table, indirect, extra, &cp, `1`);
3534	/*
3535	idx2 = table[ch];
3536	*/
3537	if (idx2 == `0`)
3538	/ This isn't a valid character. /
3539	continue;
3540	/ Compare only if the length matches and the collation rule*
3541	index is the same. /*
3542	if (len == weights[idx2 & `0xffffff`] && (idx1 >> `24`) == (idx2 >> `24`)
3543	&& memcmp (weights + (idx1 & `0xffffff`) + `1`,
3544	weights + (idx2 & `0xffffff`) + `1`, len) == `0`)
3545	bitset_set (sbcset, ch);
3546	}
3547	/ Check whether the array has enough space. /
3548	if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3549	{
3550	/ Not enough, realloc it. /
3551	/ +1 in case of mbcset->nequiv_classes is 0. /
3552	Idx new_equiv_class_alloc = `2` * mbcset->nequiv_classes + `1`;
3553	/ Use realloc since the array is NULL if alloc == 0. /*
3554	int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3555	int32_t,
3556	new_equiv_class_alloc);
3557	if (__glibc_unlikely (new_equiv_classes == NULL))
3558	return REG_ESPACE;
3559	mbcset->equiv_classes = new_equiv_classes;
3560	*equiv_class_alloc = new_equiv_class_alloc;
3561	}
3562	mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3563	}
3564	else
3565	#endif /* _LIBC */
3566	{
3567	if (__glibc_unlikely (strlen ((const char *) name) != `1`))
3568	return REG_ECOLLATE;
3569	bitset_set (sbcset, *name);
3570	}
3571	return REG_NOERROR;
3572	}
3573
3574	/ Helper function for parse_bracket_exp.*
3575	Build the character class which is represented by NAME.
3576	The result are written to MBCSET and SBCSET.
3577	CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3578	is a pointer argument since we may update it. /*
3579
3580	static reg_errcode_t
3581	#ifdef RE_ENABLE_I18N
3582	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3583	re_charset_t mbcset, Idx char_class_alloc,
3584	const char *class_name, reg_syntax_t syntax)
3585	#else /* not RE_ENABLE_I18N */
3586	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3587	const char *class_name, reg_syntax_t syntax)
3588	#endif /* not RE_ENABLE_I18N */
3589	{
3590	int i;
3591	const char *name = class_name;
3592
3593	/ In case of REG_ICASE "upper" and "lower" match the both of*
3594	upper and lower cases. /*
3595	if ((syntax & RE_ICASE)
3596	&& (strcmp (name, "upper") == `0` \|\| strcmp (name, "lower") == `0`))
3597	name = "alpha";
3598
3599	#ifdef RE_ENABLE_I18N
3600	/ Check the space of the arrays. /
3601	if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3602	{
3603	/ Not enough, realloc it. /
3604	/ +1 in case of mbcset->nchar_classes is 0. /
3605	Idx new_char_class_alloc = `2` * mbcset->nchar_classes + `1`;
3606	/ Use realloc since array is NULL if alloc == 0. /*
3607	wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3608	new_char_class_alloc);
3609	if (__glibc_unlikely (new_char_classes == NULL))
3610	return REG_ESPACE;
3611	mbcset->char_classes = new_char_classes;
3612	*char_class_alloc = new_char_class_alloc;
3613	}
3614	mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3615	#endif /* RE_ENABLE_I18N */
3616
3617	#define BUILD_CHARCLASS_LOOP(ctype_func) \
3618	do { \
3619	if (__glibc_unlikely (trans != NULL)) \
3620	{ \
3621	for (i = 0; i < SBC_MAX; ++i) \
3622	if (ctype_func (i)) \
3623	bitset_set (sbcset, trans[i]); \
3624	} \
3625	else \
3626	{ \
3627	for (i = 0; i < SBC_MAX; ++i) \
3628	if (ctype_func (i)) \
3629	bitset_set (sbcset, i); \
3630	} \
3631	} while (0)
3632
3633	if (strcmp (name, "alnum") == `0`)
3634	BUILD_CHARCLASS_LOOP (isalnum);
3635	else if (strcmp (name, "cntrl") == `0`)
3636	BUILD_CHARCLASS_LOOP (iscntrl);
3637	else if (strcmp (name, "lower") == `0`)
3638	BUILD_CHARCLASS_LOOP (islower);
3639	else if (strcmp (name, "space") == `0`)
3640	BUILD_CHARCLASS_LOOP (isspace);
3641	else if (strcmp (name, "alpha") == `0`)
3642	BUILD_CHARCLASS_LOOP (isalpha);
3643	else if (strcmp (name, "digit") == `0`)
3644	BUILD_CHARCLASS_LOOP (isdigit);
3645	else if (strcmp (name, "print") == `0`)
3646	BUILD_CHARCLASS_LOOP (isprint);
3647	else if (strcmp (name, "upper") == `0`)
3648	BUILD_CHARCLASS_LOOP (isupper);
3649	else if (strcmp (name, "blank") == `0`)
3650	BUILD_CHARCLASS_LOOP (isblank);
3651	else if (strcmp (name, "graph") == `0`)
3652	BUILD_CHARCLASS_LOOP (isgraph);
3653	else if (strcmp (name, "punct") == `0`)
3654	BUILD_CHARCLASS_LOOP (ispunct);
3655	else if (strcmp (name, "xdigit") == `0`)
3656	BUILD_CHARCLASS_LOOP (isxdigit);
3657	else
3658	return REG_ECTYPE;
3659
3660	return REG_NOERROR;
3661	}
3662
3663	static bin_tree_t *
3664	build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3665	const char *class_name,
3666	const char *extra, bool non_match,
3667	reg_errcode_t *err)
3668	{
3669	re_bitset_ptr_t sbcset;
3670	#ifdef RE_ENABLE_I18N
3671	re_charset_t *mbcset;
3672	Idx alloc = `0`;
3673	#endif /* not RE_ENABLE_I18N */
3674	reg_errcode_t ret;
3675	bin_tree_t *tree;
3676
3677	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), `1`);
3678	if (__glibc_unlikely (sbcset == NULL))
3679	{
3680	*err = REG_ESPACE;
3681	return NULL;
3682	}
3683	#ifdef RE_ENABLE_I18N
3684	mbcset = (re_charset_t ) calloc (sizeof* (re_charset_t), `1`);
3685	if (__glibc_unlikely (mbcset == NULL))
3686	{
3687	re_free (sbcset);
3688	*err = REG_ESPACE;
3689	return NULL;
3690	}
3691	mbcset->non_match = non_match;
3692	#endif /* RE_ENABLE_I18N */
3693
3694	/ We don't care the syntax in this case. /
3695	ret = build_charclass (trans, sbcset,
3696	#ifdef RE_ENABLE_I18N
3697	mbcset, &alloc,
3698	#endif /* RE_ENABLE_I18N */
3699	class_name, `0`);
3700
3701	if (__glibc_unlikely (ret != REG_NOERROR))
3702	{
3703	re_free (sbcset);
3704	#ifdef RE_ENABLE_I18N
3705	free_charset (mbcset);
3706	#endif /* RE_ENABLE_I18N */
3707	*err = ret;
3708	return NULL;
3709	}
3710	/ \w match '_' also. /
3711	for (; *extra; extra++)
3712	bitset_set (sbcset, *extra);
3713
3714	/ If it is non-matching list. /
3715	if (non_match)
3716	bitset_not (sbcset);
3717
3718	#ifdef RE_ENABLE_I18N
3719	/ Ensure only single byte characters are set. /
3720	if (dfa->mb_cur_max > `1`)
3721	bitset_mask (sbcset, dfa->sb_char);
3722	#endif
3723
3724	/ Build a tree for simple bracket. /
3725	re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3726	tree = create_token_tree (dfa, NULL, NULL, &br_token);
3727	if (__glibc_unlikely (tree == NULL))
3728	goto build_word_op_espace;
3729
3730	#ifdef RE_ENABLE_I18N
3731	if (dfa->mb_cur_max > `1`)
3732	{
3733	bin_tree_t *mbc_tree;
3734	/ Build a tree for complex bracket. /
3735	br_token.type = COMPLEX_BRACKET;
3736	br_token.opr.mbcset = mbcset;
3737	dfa->has_mb_node = `1`;
3738	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3739	if (__glibc_unlikely (mbc_tree == NULL))
3740	goto build_word_op_espace;
3741	/ Then join them by ALT node. /
3742	tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3743	if (__glibc_likely (mbc_tree != NULL))
3744	return tree;
3745	}
3746	else
3747	{
3748	free_charset (mbcset);
3749	return tree;
3750	}
3751	#else /* not RE_ENABLE_I18N */
3752	return tree;
3753	#endif /* not RE_ENABLE_I18N */
3754
3755	build_word_op_espace:
3756	re_free (sbcset);
3757	#ifdef RE_ENABLE_I18N
3758	free_charset (mbcset);
3759	#endif /* RE_ENABLE_I18N */
3760	*err = REG_ESPACE;
3761	return NULL;
3762	}
3763
3764	/ This is intended for the expressions like "a{1,3}".*
3765	Fetch a number from 'input', and return the number.
3766	Return -1 if the number field is empty like "{,1}".
3767	Return RE_DUP_MAX + 1 if the number field is too large.
3768	Return -2 if an error occurred. /*
3769
3770	static Idx
3771	fetch_number (re_string_t input, re_token_t token, reg_syntax_t syntax)
3772	{
3773	Idx num = -`1`;
3774	unsigned char c;
3775	while (`1`)
3776	{
3777	fetch_token (token, input, syntax);
3778	c = token->opr.c;
3779	if (__glibc_unlikely (token->type == END_OF_RE))
3780	return -`2`;
3781	if (token->type == OP_CLOSE_DUP_NUM \|\| c == `','`)
3782	break;
3783	num = ((token->type != CHARACTER \|\| c < `'0'` \|\| `'9'` < c \|\| num == -`2`)
3784	? -`2`
3785	: num == -`1`
3786	? c - `'0'`
3787	: MIN (RE_DUP_MAX + `1`, num * `10` + c - `'0'`));
3788	}
3789	return num;
3790	}
3791
3792	#ifdef RE_ENABLE_I18N
3793	static void
3794	free_charset (re_charset_t *cset)
3795	{
3796	re_free (cset->mbchars);
3797	# ifdef _LIBC
3798	re_free (cset->coll_syms);
3799	re_free (cset->equiv_classes);
3800	# endif
3801	re_free (cset->range_starts);
3802	re_free (cset->range_ends);
3803	re_free (cset->char_classes);
3804	re_free (cset);
3805	}
3806	#endif /* RE_ENABLE_I18N */
3807
3808	/ Functions for binary tree operation. /
3809
3810	/ Create a tree node. /
3811
3812	static bin_tree_t *
3813	create_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
3814	re_token_type_t type)
3815	{
3816	re_token_t t = { .type = type };
3817	return create_token_tree (dfa, left, right, &t);
3818	}
3819
3820	static bin_tree_t *
3821	create_token_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
3822	const re_token_t *token)
3823	{
3824	bin_tree_t *tree;
3825	if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3826	{
3827	bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, `1`);
3828
3829	if (storage == NULL)
3830	return NULL;
3831	storage->next = dfa->str_tree_storage;
3832	dfa->str_tree_storage = storage;
3833	dfa->str_tree_storage_idx = `0`;
3834	}
3835	tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3836
3837	tree->parent = NULL;
3838	tree->left = left;
3839	tree->right = right;
3840	tree->token = *token;
3841	tree->token.duplicated = `0`;
3842	tree->token.opt_subexp = `0`;
3843	tree->first = NULL;
3844	tree->next = NULL;
3845	tree->node_idx = -`1`;
3846
3847	if (left != NULL)
3848	left->parent = tree;
3849	if (right != NULL)
3850	right->parent = tree;
3851	return tree;
3852	}
3853
3854	/ Mark the tree SRC as an optional subexpression.*
3855	To be called from preorder or postorder. /*
3856
3857	static reg_errcode_t
3858	mark_opt_subexp (void extra, bin_tree_t node)
3859	{
3860	Idx idx = (uintptr_t) extra;
3861	if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3862	node->token.opt_subexp = `1`;
3863
3864	return REG_NOERROR;
3865	}
3866
3867	/ Free the allocated memory inside NODE. /
3868
3869	static void
3870	free_token (re_token_t *node)
3871	{
3872	#ifdef RE_ENABLE_I18N
3873	if (node->type == COMPLEX_BRACKET && node->duplicated == `0`)
3874	free_charset (node->opr.mbcset);
3875	else
3876	#endif /* RE_ENABLE_I18N */
3877	if (node->type == SIMPLE_BRACKET && node->duplicated == `0`)
3878	re_free (node->opr.sbcset);
3879	}
3880
3881	/ Worker function for tree walking. Free the allocated memory inside NODE*
3882	and its children. /*
3883
3884	static reg_errcode_t
3885	free_tree (void extra, bin_tree_t node)
3886	{
3887	free_token (&node->token);
3888	return REG_NOERROR;
3889	}
3890
3891
3892	/ Duplicate the node SRC, and return new node. This is a preorder*
3893	visit similar to the one implemented by the generic visitor, but
3894	we need more infrastructure to maintain two parallel trees --- so,
3895	it's easier to duplicate. /*
3896
3897	static bin_tree_t *
3898	duplicate_tree (const bin_tree_t root, re_dfa_t dfa)
3899	{
3900	const bin_tree_t *node;
3901	bin_tree_t *dup_root;
3902	bin_tree_t *p_new = &dup_root, dup_node = root->parent;
3903
3904	for (node = root; ; )
3905	{
3906	/ Create a new tree and link it back to the current parent. /
3907	*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3908	if (*p_new == NULL)
3909	return NULL;
3910	(*p_new)->parent = dup_node;
3911	(*p_new)->token.duplicated = `1`;
3912	dup_node = *p_new;
3913
3914	/ Go to the left node, or up and to the right. /
3915	if (node->left)
3916	{
3917	node = node->left;
3918	p_new = &dup_node->left;
3919	}
3920	else
3921	{
3922	const bin_tree_t *prev = NULL;
3923	while (node->right == prev \|\| node->right == NULL)
3924	{
3925	prev = node;
3926	node = node->parent;
3927	dup_node = dup_node->parent;
3928	if (!node)
3929	return dup_root;
3930	}
3931	node = node->right;
3932	p_new = &dup_node->right;
3933	}
3934	}
3935	}
3936

Browse the source code of glibc/posix/regcomp.c