regcomp.c source code [glibc/posix/regcomp.c]

/* Extended regular expression matching and search library.
   Copyright (C) 2002-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
19
20	#ifdef _LIBC
21	# include <locale/weight.h>
22	#endif
23
24	static reg_errcode_t re_compile_internal (regex_t preg, const* char * pattern,
25	size_t length, reg_syntax_t syntax);
26	static void re_compile_fastmap_iter (regex_t *bufp,
27	const re_dfastate_t *init_state,
28	char *fastmap);
29	static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30	#ifdef RE_ENABLE_I18N
31	static void free_charset (re_charset_t *cset);
32	#endif /* RE_ENABLE_I18N */
33	static void free_workarea_compile (regex_t *preg);
34	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35	#ifdef RE_ENABLE_I18N
36	static void optimize_utf8 (re_dfa_t *dfa);
37	#endif
38	static reg_errcode_t analyze (regex_t *preg);
39	static reg_errcode_t preorder (bin_tree_t *root,
40	reg_errcode_t (fn (void , bin_tree_t )),
41	void *extra);
42	static reg_errcode_t postorder (bin_tree_t *root,
43	reg_errcode_t (fn (void , bin_tree_t )),
44	void *extra);
45	static reg_errcode_t optimize_subexps (void extra, bin_tree_t node);
46	static reg_errcode_t lower_subexps (void extra, bin_tree_t node);
47	static bin_tree_t lower_subexp (reg_errcode_t err, regex_t *preg,
48	bin_tree_t *node);
49	static reg_errcode_t calc_first (void extra, bin_tree_t node);
50	static reg_errcode_t calc_next (void extra, bin_tree_t node);
51	static reg_errcode_t link_nfa_nodes (void extra, bin_tree_t node);
52	static Idx duplicate_node (re_dfa_t dfa, Idx org_idx, unsigned* int constraint);
53	static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54	unsigned int constraint);
55	static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56	static reg_errcode_t calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa,
57	Idx node, bool root);
58	static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59	static Idx fetch_number (re_string_t input, re_token_t token,
60	reg_syntax_t syntax);
61	static int peek_token (re_token_t token, re_string_t input,
62	reg_syntax_t syntax);
63	static bin_tree_t parse (re_string_t regexp, regex_t *preg,
64	reg_syntax_t syntax, reg_errcode_t *err);
65	static bin_tree_t parse_reg_exp (re_string_t regexp, regex_t *preg,
66	re_token_t *token, reg_syntax_t syntax,
67	Idx nest, reg_errcode_t *err);
68	static bin_tree_t parse_branch (re_string_t regexp, regex_t *preg,
69	re_token_t *token, reg_syntax_t syntax,
70	Idx nest, reg_errcode_t *err);
71	static bin_tree_t parse_expression (re_string_t regexp, regex_t *preg,
72	re_token_t *token, reg_syntax_t syntax,
73	Idx nest, reg_errcode_t *err);
74	static bin_tree_t parse_sub_exp (re_string_t regexp, regex_t *preg,
75	re_token_t *token, reg_syntax_t syntax,
76	Idx nest, reg_errcode_t *err);
77	static bin_tree_t parse_dup_op (bin_tree_t dup_elem, re_string_t *regexp,
78	re_dfa_t dfa, re_token_t token,
79	reg_syntax_t syntax, reg_errcode_t *err);
80	static bin_tree_t parse_bracket_exp (re_string_t regexp, re_dfa_t *dfa,
81	re_token_t *token, reg_syntax_t syntax,
82	reg_errcode_t *err);
83	static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84	re_string_t *regexp,
85	re_token_t token, int* token_len,
86	re_dfa_t *dfa,
87	reg_syntax_t syntax,
88	bool accept_hyphen);
89	static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90	re_string_t *regexp,
91	re_token_t *token);
92	#ifdef RE_ENABLE_I18N
93	static reg_errcode_t build_equiv_class (bitset_t sbcset,
94	re_charset_t *mbcset,
95	Idx *equiv_class_alloc,
96	const unsigned char *name);
97	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98	bitset_t sbcset,
99	re_charset_t *mbcset,
100	Idx *char_class_alloc,
101	const char *class_name,
102	reg_syntax_t syntax);
103	#else /* not RE_ENABLE_I18N */
104	static reg_errcode_t build_equiv_class (bitset_t sbcset,
105	const unsigned char *name);
106	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107	bitset_t sbcset,
108	const char *class_name,
109	reg_syntax_t syntax);
110	#endif /* not RE_ENABLE_I18N */
111	static bin_tree_t build_charclass_op (re_dfa_t dfa,
112	RE_TRANSLATE_TYPE trans,
113	const char *class_name,
114	const char *extra,
115	bool non_match, reg_errcode_t *err);
116	static bin_tree_t create_tree (re_dfa_t dfa,
117	bin_tree_t left, bin_tree_t right,
118	re_token_type_t type);
119	static bin_tree_t create_token_tree (re_dfa_t dfa,
120	bin_tree_t left, bin_tree_t right,
121	const re_token_t *token);
122	static bin_tree_t duplicate_tree (const* bin_tree_t src, re_dfa_t dfa);
123	static void free_token (re_token_t *node);
124	static reg_errcode_t free_tree (void extra, bin_tree_t node);
125	static reg_errcode_t mark_opt_subexp (void extra, bin_tree_t node);
126
127	/ This table gives an error message for each of the error codes listed*
128	in regex.h. Obviously the order here has to be same as there.
129	POSIX doesn't require that we do anything for REG_NOERROR,
130	but why not be nice? /*
131
132	static const char __re_error_msgid[] =
133	{
134	#define REG_NOERROR_IDX 0
135	gettext_noop ("Success") / REG_NOERROR /
136	"\0"
137	#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138	gettext_noop ("No match") / REG_NOMATCH /
139	"\0"
140	#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141	gettext_noop ("Invalid regular expression") / REG_BADPAT /
142	"\0"
143	#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144	gettext_noop ("Invalid collation character") / REG_ECOLLATE /
145	"\0"
146	#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147	gettext_noop ("Invalid character class name") / REG_ECTYPE /
148	"\0"
149	#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150	gettext_noop ("Trailing backslash") / REG_EESCAPE /
151	"\0"
152	#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153	gettext_noop ("Invalid back reference") / REG_ESUBREG /
154	"\0"
155	#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156	gettext_noop ("Unmatched [, [^, [:, [., or [=") / REG_EBRACK /
157	"\0"
158	#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159	gettext_noop ("Unmatched ( or \\(") / REG_EPAREN /
160	"\0"
161	#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162	gettext_noop ("Unmatched \\{") / REG_EBRACE /
163	"\0"
164	#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165	gettext_noop ("Invalid content of \\{\\}") / REG_BADBR /
166	"\0"
167	#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168	gettext_noop ("Invalid range end") / REG_ERANGE /
169	"\0"
170	#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171	gettext_noop ("Memory exhausted") / REG_ESPACE /
172	"\0"
173	#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174	gettext_noop ("Invalid preceding regular expression") / REG_BADRPT /
175	"\0"
176	#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177	gettext_noop ("Premature end of regular expression") / REG_EEND /
178	"\0"
179	#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180	gettext_noop ("Regular expression too big") / REG_ESIZE /
181	"\0"
182	#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183	gettext_noop ("Unmatched ) or \\)") / REG_ERPAREN /
184	};
185
186	static const size_t __re_error_msgid_idx[] =
187	{
188	REG_NOERROR_IDX,
189	REG_NOMATCH_IDX,
190	REG_BADPAT_IDX,
191	REG_ECOLLATE_IDX,
192	REG_ECTYPE_IDX,
193	REG_EESCAPE_IDX,
194	REG_ESUBREG_IDX,
195	REG_EBRACK_IDX,
196	REG_EPAREN_IDX,
197	REG_EBRACE_IDX,
198	REG_BADBR_IDX,
199	REG_ERANGE_IDX,
200	REG_ESPACE_IDX,
201	REG_BADRPT_IDX,
202	REG_EEND_IDX,
203	REG_ESIZE_IDX,
204	REG_ERPAREN_IDX
205	};
206
207	/ Entry points for GNU code. /
208
209	/ re_compile_pattern is the GNU regular expression compiler: it*
210	compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211	Returns 0 if the pattern was valid, otherwise an error string.
212
213	Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214	are set in BUFP on entry. /*
215
216	const char *
217	re_compile_pattern (const char *pattern, size_t length,
218	struct re_pattern_buffer *bufp)
219	{
220	reg_errcode_t ret;
221
222	/ And GNU code determines whether or not to get register information*
223	by passing null for the REGS argument to re_match, etc., not by
224	setting no_sub, unless RE_NO_SUB is set. /*
225	bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226
227	/ Match anchors at newline. /
228	bufp->newline_anchor = `1`;
229
230	ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231
232	if (!ret)
233	return NULL;
234	return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235	}
236	weak_alias (__re_compile_pattern, re_compile_pattern)
237
238	/ Set by 're_set_syntax' to the current regexp syntax to recognize. Can*
239	also be assigned to arbitrarily: each pattern buffer stores its own
240	syntax, so it can be changed between regex compilations. /*
241	/ This has no initializer because initialized variables in Emacs*
242	become read-only after dumping. /*
243	reg_syntax_t re_syntax_options;
244
245
246	/ Specify the precise syntax of regexps for compilation. This provides*
247	for compatibility for various utilities which historically have
248	different, incompatible syntaxes.
249
250	The argument SYNTAX is a bit mask comprised of the various bits
251	defined in regex.h. We return the old syntax. /*
252
253	reg_syntax_t
254	re_set_syntax (reg_syntax_t syntax)
255	{
256	reg_syntax_t ret = re_syntax_options;
257
258	re_syntax_options = syntax;
259	return ret;
260	}
261	weak_alias (__re_set_syntax, re_set_syntax)
262
263	int
264	re_compile_fastmap (struct re_pattern_buffer *bufp)
265	{
266	re_dfa_t *dfa = bufp->buffer;
267	char *fastmap = bufp->fastmap;
268
269	memset (fastmap, `'\0'`, sizeof (char) * SBC_MAX);
270	re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271	if (dfa->init_state != dfa->init_state_word)
272	re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273	if (dfa->init_state != dfa->init_state_nl)
274	re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275	if (dfa->init_state != dfa->init_state_begbuf)
276	re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277	bufp->fastmap_accurate = `1`;
278	return `0`;
279	}
280	weak_alias (__re_compile_fastmap, re_compile_fastmap)
281
/* Mark byte CH as a possible first byte of a match in FASTMAP.  If ICASE
   is true, also mark the value tolower (CH) returns.  CH is used
   directly as an array index, so callers must pass a value in the range
   of the fastmap.  */

static inline void
__attribute__ ((always_inline))
re_set_fastmap (char *fastmap, bool icase, int ch)
{
  fastmap[ch] = 1;
  if (icase)
    fastmap[tolower (ch)] = 1;
}
290
291	/ Helper function for re_compile_fastmap.*
292	Compile fastmap for the initial_state INIT_STATE. /*
293
294	static void
295	re_compile_fastmap_iter (regex_t bufp, const* re_dfastate_t *init_state,
296	char *fastmap)
297	{
298	re_dfa_t *dfa = bufp->buffer;
299	Idx node_cnt;
300	bool icase = (dfa->mb_cur_max == `1` && (bufp->syntax & RE_ICASE));
301	for (node_cnt = `0`; node_cnt < init_state->nodes.nelem; ++node_cnt)
302	{
303	Idx node = init_state->nodes.elems[node_cnt];
304	re_token_type_t type = dfa->nodes[node].type;
305
306	if (type == CHARACTER)
307	{
308	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
309	#ifdef RE_ENABLE_I18N
310	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > `1`)
311	{
312	unsigned char buf[MB_LEN_MAX];
313	unsigned char *p;
314	wchar_t wc;
315	mbstate_t state;
316
317	p = buf;
318	*p++ = dfa->nodes[node].opr.c;
319	while (++node < dfa->nodes_len
320	&& dfa->nodes[node].type == CHARACTER
321	&& dfa->nodes[node].mb_partial)
322	*p++ = dfa->nodes[node].opr.c;
323	memset (&state, `'\0'`, sizeof (state));
324	if (__mbrtowc (&wc, (const char *) buf, p - buf,
325	&state) == p - buf
326	&& (__wcrtomb ((char *) buf, __towlower (wc), &state)
327	!= (size_t) -`1`))
328	re_set_fastmap (fastmap, false, buf[`0`]);
329	}
330	#endif
331	}
332	else if (type == SIMPLE_BRACKET)
333	{
334	int i, ch;
335	for (i = `0`, ch = `0`; i < BITSET_WORDS; ++i)
336	{
337	int j;
338	bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
339	for (j = `0`; j < BITSET_WORD_BITS; ++j, ++ch)
340	if (w & ((bitset_word_t) `1` << j))
341	re_set_fastmap (fastmap, icase, ch);
342	}
343	}
344	#ifdef RE_ENABLE_I18N
345	else if (type == COMPLEX_BRACKET)
346	{
347	re_charset_t *cset = dfa->nodes[node].opr.mbcset;
348	Idx i;
349
350	# ifdef _LIBC
351	/ See if we have to try all bytes which start multiple collation*
352	elements.
353	e.g. In da_DK, we want to catch 'a' since "aa" is a valid
354	collation element, and don't catch 'b' since 'b' is
355	the only collation element which starts from 'b' (and
356	it is caught by SIMPLE_BRACKET). /*
357	if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != `0`
358	&& (cset->ncoll_syms \|\| cset->nranges))
359	{
360	const int32_t table = (const* int32_t *)
361	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362	for (i = `0`; i < SBC_MAX; ++i)
363	if (table[i] < `0`)
364	re_set_fastmap (fastmap, icase, i);
365	}
366	# endif /* _LIBC */
367
368	/ See if we have to start the match at all multibyte characters,*
369	i.e. where we would not find an invalid sequence. This only
370	applies to multibyte character sets; for single byte character
371	sets, the SIMPLE_BRACKET again suffices. /*
372	if (dfa->mb_cur_max > `1`
373	&& (cset->nchar_classes \|\| cset->non_match \|\| cset->nranges
374	# ifdef _LIBC
375	\|\| cset->nequiv_classes
376	# endif /* _LIBC */
377	))
378	{
379	unsigned char c = `0`;
380	do
381	{
382	mbstate_t mbs;
383	memset (&mbs, `0`, sizeof (mbs));
384	if (__mbrtowc (NULL, (char *) &c, `1`, &mbs) == (size_t) -`2`)
385	re_set_fastmap (fastmap, false, (int) c);
386	}
387	while (++c != `0`);
388	}
389
390	else
391	{
392	/ ... Else catch all bytes which can start the mbchars. /
393	for (i = `0`; i < cset->nmbchars; ++i)
394	{
395	char buf[`256`];
396	mbstate_t state;
397	memset (&state, `'\0'`, sizeof (state));
398	if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -`1`)
399	re_set_fastmap (fastmap, icase, (unsigned* char *) buf);
400	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > `1`)
401	{
402	if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
403	!= (size_t) -`1`)
404	re_set_fastmap (fastmap, false, (unsigned* char *) buf);
405	}
406	}
407	}
408	}
409	#endif /* RE_ENABLE_I18N */
410	else if (type == OP_PERIOD
411	#ifdef RE_ENABLE_I18N
412	\|\| type == OP_UTF8_PERIOD
413	#endif /* RE_ENABLE_I18N */
414	\|\| type == END_OF_RE)
415	{
416	memset (fastmap, `'\1'`, sizeof (char) * SBC_MAX);
417	if (type == END_OF_RE)
418	bufp->can_be_null = `1`;
419	return;
420	}
421	}
422	}
423
424	/ Entry point for POSIX code. /
425	/ regcomp takes a regular expression as a string and compiles it.*
426
427	PREG is a regex_t . We do not expect any fields to be initialized,*
428	since POSIX says we shouldn't. Thus, we set
429
430	'buffer' to the compiled pattern;
431	'used' to the length of the compiled pattern;
432	'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
433	REG_EXTENDED bit in CFLAGS is set; otherwise, to
434	RE_SYNTAX_POSIX_BASIC;
435	'newline_anchor' to REG_NEWLINE being set in CFLAGS;
436	'fastmap' to an allocated space for the fastmap;
437	'fastmap_accurate' to zero;
438	're_nsub' to the number of subexpressions in PATTERN.
439
440	PATTERN is the address of the pattern string.
441
442	CFLAGS is a series of bits which affect compilation.
443
444	If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445	use POSIX basic syntax.
446
447	If REG_NEWLINE is set, then . and [^...] don't match newline.
448	Also, regexec will try a match beginning after every newline.
449
450	If REG_ICASE is set, then we considers upper- and lowercase
451	versions of letters to be equivalent when matching.
452
453	If REG_NOSUB is set, then when PREG is passed to regexec, that
454	routine will report only success or failure, and nothing about the
455	registers.
456
457	It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
458	the return codes and their meanings.) /*
459
460	int
461	regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
462	{
463	reg_errcode_t ret;
464	reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
465	: RE_SYNTAX_POSIX_BASIC);
466
467	preg->buffer = NULL;
468	preg->allocated = `0`;
469	preg->used = `0`;
470
471	/ Try to allocate space for the fastmap. /
472	preg->fastmap = re_malloc (char, SBC_MAX);
473	if (__glibc_unlikely (preg->fastmap == NULL))
474	return REG_ESPACE;
475
476	syntax \|= (cflags & REG_ICASE) ? RE_ICASE : `0`;
477
478	/ If REG_NEWLINE is set, newlines are treated differently. /
479	if (cflags & REG_NEWLINE)
480	{ / REG_NEWLINE implies neither . nor [^...] match newline. /
481	syntax &= ~RE_DOT_NEWLINE;
482	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
483	/ It also changes the matching behavior. /
484	preg->newline_anchor = `1`;
485	}
486	else
487	preg->newline_anchor = `0`;
488	preg->no_sub = !!(cflags & REG_NOSUB);
489	preg->translate = NULL;
490
491	ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
492
493	/ POSIX doesn't distinguish between an unmatched open-group and an*
494	unmatched close-group: both are REG_EPAREN. /*
495	if (ret == REG_ERPAREN)
496	ret = REG_EPAREN;
497
498	/ We have already checked preg->fastmap != NULL. /
499	if (__glibc_likely (ret == REG_NOERROR))
500	/ Compute the fastmap now, since regexec cannot modify the pattern*
501	buffer. This function never fails in this implementation. /*
502	(void) re_compile_fastmap (preg);
503	else
504	{
505	/ Some error occurred while compiling the expression. /
506	re_free (preg->fastmap);
507	preg->fastmap = NULL;
508	}
509
510	return (int) ret;
511	}
512	libc_hidden_def (__regcomp)
513	weak_alias (__regcomp, regcomp)
514
515	/ Returns a message corresponding to an error code, ERRCODE, returned*
516	from either regcomp or regexec. We don't use PREG here. /*
517
518	size_t
519	regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
520	size_t errbuf_size)
521	{
522	const char *msg;
523	size_t msg_size;
524	int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[`0`];
525
526	if (__glibc_unlikely (errcode < `0` \|\| errcode >= nerrcodes))
527	/ Only error codes returned by the rest of the code should be passed*
528	to this routine. If we are given anything else, or if other regex
529	code generates an invalid error code, then the program has a bug.
530	Dump core so we can fix it. /*
531	abort ();
532
533	msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
534
535	msg_size = strlen (msg) + `1`; / Includes the null. /
536
537	if (__glibc_likely (errbuf_size != `0`))
538	{
539	size_t cpy_size = msg_size;
540	if (__glibc_unlikely (msg_size > errbuf_size))
541	{
542	cpy_size = errbuf_size - `1`;
543	errbuf[cpy_size] = `'\0'`;
544	}
545	memcpy (errbuf, msg, cpy_size);
546	}
547
548	return msg_size;
549	}
550	weak_alias (__regerror, regerror)
551
552
#ifdef RE_ENABLE_I18N
/* This static array is used for the map to single-byte characters when
   UTF-8 is used.  Otherwise we would allocate memory just to initialize
   it the same all the time.  UTF-8 is the preferred encoding so this is
   a worthwhile optimization.  */
static const bitset_t utf8_sb_map =
{
  /* Set the first 128 bits.  */
# if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
# else
#  if 4 * BITSET_WORD_BITS < ASCII_CHARS
#   error "bitset_word_t is narrower than 32 bits"
#  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
  BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
#  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
  BITSET_WORD_MAX, BITSET_WORD_MAX,
#  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
  BITSET_WORD_MAX,
#  endif
  (BITSET_WORD_MAX
   >> (SBC_MAX % BITSET_WORD_BITS == 0
       ? 0
       : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
# endif
};
#endif
580
581
582	static void
583	free_dfa_content (re_dfa_t *dfa)
584	{
585	Idx i, j;
586
587	if (dfa->nodes)
588	for (i = `0`; i < dfa->nodes_len; ++i)
589	free_token (dfa->nodes + i);
590	re_free (dfa->nexts);
591	for (i = `0`; i < dfa->nodes_len; ++i)
592	{
593	if (dfa->eclosures != NULL)
594	re_node_set_free (dfa->eclosures + i);
595	if (dfa->inveclosures != NULL)
596	re_node_set_free (dfa->inveclosures + i);
597	if (dfa->edests != NULL)
598	re_node_set_free (dfa->edests + i);
599	}
600	re_free (dfa->edests);
601	re_free (dfa->eclosures);
602	re_free (dfa->inveclosures);
603	re_free (dfa->nodes);
604
605	if (dfa->state_table)
606	for (i = `0`; i <= dfa->state_hash_mask; ++i)
607	{
608	struct re_state_table_entry *entry = dfa->state_table + i;
609	for (j = `0`; j < entry->num; ++j)
610	{
611	re_dfastate_t *state = entry->array[j];
612	free_state (state);
613	}
614	re_free (entry->array);
615	}
616	re_free (dfa->state_table);
617	#ifdef RE_ENABLE_I18N
618	if (dfa->sb_char != utf8_sb_map)
619	re_free (dfa->sb_char);
620	#endif
621	re_free (dfa->subexp_map);
622	#ifdef DEBUG
623	re_free (dfa->re_str);
624	#endif
625
626	re_free (dfa);
627	}
628
629
630	/ Free dynamically allocated space used by PREG. /
631
632	void
633	regfree (regex_t *preg)
634	{
635	re_dfa_t *dfa = preg->buffer;
636	if (__glibc_likely (dfa != NULL))
637	{
638	lock_fini (dfa->lock);
639	free_dfa_content (dfa);
640	}
641	preg->buffer = NULL;
642	preg->allocated = `0`;
643
644	re_free (preg->fastmap);
645	preg->fastmap = NULL;
646
647	re_free (preg->translate);
648	preg->translate = NULL;
649	}
650	libc_hidden_def (__regfree)
651	weak_alias (__regfree, regfree)
652
/* Entry points compatible with 4.2 BSD regex library.  We don't define
   them unless specifically requested.  */

#if defined _REGEX_RE_COMP || defined _LIBC

/* BSD has one and only one pattern buffer.  */
static struct re_pattern_buffer re_comp_buf;

/* Compile S into the single static pattern buffer, replacing any
   previously compiled pattern.  A null S leaves the existing pattern in
   place and is an error only if there is none.  Returns NULL on
   success, otherwise an error message.  */
char *
# ifdef _LIBC
/* Make these definitions weak in libc, so POSIX programs can redefine
   these names if they don't use our functions, and still use
   regcomp/regexec above without link errors.  */
weak_function
# endif
re_comp (const char *s)
{
  reg_errcode_t ret;
  char *fastmap;

  if (!s)
    {
      if (!re_comp_buf.buffer)
        return gettext ("No previous regular expression");
      return NULL;
    }

  if (re_comp_buf.buffer)
    {
      /* Detach the fastmap so that __regfree does not release it, then
         reattach it to the cleared buffer for reuse.  */
      fastmap = re_comp_buf.fastmap;
      re_comp_buf.fastmap = NULL;
      __regfree (&re_comp_buf);
      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
      re_comp_buf.fastmap = fastmap;
    }

  if (re_comp_buf.fastmap == NULL)
    {
      re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
      if (re_comp_buf.fastmap == NULL)
        return (char *) gettext (__re_error_msgid
                                 + __re_error_msgid_idx[(int) REG_ESPACE]);
    }

  /* Since 're_exec' always passes NULL for the 'regs' argument, we
     don't need to initialize the pattern buffer fields which affect it.  */

  /* Match anchors at newlines.  */
  re_comp_buf.newline_anchor = 1;

  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);

  if (!ret)
    return NULL;

  /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
}

#ifdef _LIBC
/* Release the storage held by the static BSD pattern buffer.  */
void
__libc_regcomp_freemem (void)
{
  __regfree (&re_comp_buf);
}
#endif

#endif /* _REGEX_RE_COMP */
721
722	/ Internal entry point.*
723	Compile the regular expression PATTERN, whose length is LENGTH.
724	SYNTAX indicate regular expression's syntax. /*
725
726	static reg_errcode_t
727	re_compile_internal (regex_t preg, const* char * pattern, size_t length,
728	reg_syntax_t syntax)
729	{
730	reg_errcode_t err = REG_NOERROR;
731	re_dfa_t *dfa;
732	re_string_t regexp;
733
734	/ Initialize the pattern buffer. /
735	preg->fastmap_accurate = `0`;
736	preg->syntax = syntax;
737	preg->not_bol = preg->not_eol = `0`;
738	preg->used = `0`;
739	preg->re_nsub = `0`;
740	preg->can_be_null = `0`;
741	preg->regs_allocated = REGS_UNALLOCATED;
742
743	/ Initialize the dfa. /
744	dfa = preg->buffer;
745	if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
746	{
747	/ If zero allocated, but buffer is non-null, try to realloc*
748	enough space. This loses if buffer's address is bogus, but
749	that is the user's responsibility. If ->buffer is NULL this
750	is a simple allocation. /*
751	dfa = re_realloc (preg->buffer, re_dfa_t, `1`);
752	if (dfa == NULL)
753	return REG_ESPACE;
754	preg->allocated = sizeof (re_dfa_t);
755	preg->buffer = dfa;
756	}
757	preg->used = sizeof (re_dfa_t);
758
759	err = init_dfa (dfa, length);
760	if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != `0`))
761	err = REG_ESPACE;
762	if (__glibc_unlikely (err != REG_NOERROR))
763	{
764	free_dfa_content (dfa);
765	preg->buffer = NULL;
766	preg->allocated = `0`;
767	return err;
768	}
769	#ifdef DEBUG
770	/ Note: length+1 will not overflow since it is checked in init_dfa. /
771	dfa->re_str = re_malloc (char, length + `1`);
772	strncpy (dfa->re_str, pattern, length + `1`);
773	#endif
774
775	err = re_string_construct (&regexp, pattern, length, preg->translate,
776	(syntax & RE_ICASE) != `0`, dfa);
777	if (__glibc_unlikely (err != REG_NOERROR))
778	{
779	re_compile_internal_free_return:
780	free_workarea_compile (preg);
781	re_string_destruct (&regexp);
782	lock_fini (dfa->lock);
783	free_dfa_content (dfa);
784	preg->buffer = NULL;
785	preg->allocated = `0`;
786	return err;
787	}
788
789	/ Parse the regular expression, and build a structure tree. /
790	preg->re_nsub = `0`;
791	dfa->str_tree = parse (&regexp, preg, syntax, &err);
792	if (__glibc_unlikely (dfa->str_tree == NULL))
793	goto re_compile_internal_free_return;
794
795	/ Analyze the tree and create the nfa. /
796	err = analyze (preg);
797	if (__glibc_unlikely (err != REG_NOERROR))
798	goto re_compile_internal_free_return;
799
800	#ifdef RE_ENABLE_I18N
801	/ If possible, do searching in single byte encoding to speed things up. /
802	if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
803	optimize_utf8 (dfa);
804	#endif
805
806	/ Then create the initial state of the dfa. /
807	err = create_initial_state (dfa);
808
809	/ Release work areas. /
810	free_workarea_compile (preg);
811	re_string_destruct (&regexp);
812
813	if (__glibc_unlikely (err != REG_NOERROR))
814	{
815	lock_fini (dfa->lock);
816	free_dfa_content (dfa);
817	preg->buffer = NULL;
818	preg->allocated = `0`;
819	}
820
821	return err;
822	}
823
824	/ Initialize DFA. We use the length of the regular expression PAT_LEN*
825	as the initial length of some arrays. /*
826
827	static reg_errcode_t
828	init_dfa (re_dfa_t *dfa, size_t pat_len)
829	{
830	__re_size_t table_size;
831	#ifndef _LIBC
832	const char *codeset_name;
833	#endif
834	#ifdef RE_ENABLE_I18N
835	size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
836	#else
837	size_t max_i18n_object_size = `0`;
838	#endif
839	size_t max_object_size =
840	MAX (sizeof (struct re_state_table_entry),
841	MAX (sizeof (re_token_t),
842	MAX (sizeof (re_node_set),
843	MAX (sizeof (regmatch_t),
844	max_i18n_object_size))));
845
846	memset (dfa, `'\0'`, sizeof (re_dfa_t));
847
848	/ Force allocation of str_tree_storage the first time. /
849	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
850
851	/ Avoid overflows. The extra "/ 2" is for the table_size doubling*
852	calculation below, and for similar doubling calculations
853	elsewhere. And it's <= rather than <, because some of the
854	doubling calculations add 1 afterwards. /*
855	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / `2`
856	<= pat_len))
857	return REG_ESPACE;
858
859	dfa->nodes_alloc = pat_len + `1`;
860	dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
861
862	/ table_size = 2 ^ ceil(log pat_len) /
863	for (table_size = `1`; ; table_size <<= `1`)
864	if (table_size > pat_len)
865	break;
866
867	dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
868	dfa->state_hash_mask = table_size - `1`;
869
870	dfa->mb_cur_max = MB_CUR_MAX;
871	#ifdef _LIBC
872	if (dfa->mb_cur_max == `6`
873	&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == `0`)
874	dfa->is_utf8 = `1`;
875	dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
876	!= `0`);
877	#else
878	codeset_name = nl_langinfo (CODESET);
879	if ((codeset_name[`0`] == `'U'` \|\| codeset_name[`0`] == `'u'`)
880	&& (codeset_name[`1`] == `'T'` \|\| codeset_name[`1`] == `'t'`)
881	&& (codeset_name[`2`] == `'F'` \|\| codeset_name[`2`] == `'f'`)
882	&& strcmp (codeset_name + `3` + (codeset_name[`3`] == `'-'`), "8") == `0`)
883	dfa->is_utf8 = `1`;
884
885	/ We check exhaustively in the loop below if this charset is a*
886	superset of ASCII. /*
887	dfa->map_notascii = `0`;
888	#endif
889
890	#ifdef RE_ENABLE_I18N
891	if (dfa->mb_cur_max > `1`)
892	{
893	if (dfa->is_utf8)
894	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
895	else
896	{
897	int i, j, ch;
898
899	dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), `1`);
900	if (__glibc_unlikely (dfa->sb_char == NULL))
901	return REG_ESPACE;
902
903	/ Set the bits corresponding to single byte chars. /
904	for (i = `0`, ch = `0`; i < BITSET_WORDS; ++i)
905	for (j = `0`; j < BITSET_WORD_BITS; ++j, ++ch)
906	{
907	wint_t wch = __btowc (ch);
908	if (wch != WEOF)
909	dfa->sb_char[i] \|= (bitset_word_t) `1` << j;
910	# ifndef _LIBC
911	if (isascii (ch) && wch != ch)
912	dfa->map_notascii = `1`;
913	# endif
914	}
915	}
916	}
917	#endif
918
919	if (__glibc_unlikely (dfa->nodes == NULL \|\| dfa->state_table == NULL))
920	return REG_ESPACE;
921	return REG_NOERROR;
922	}
923
924	/ Initialize WORD_CHAR table, which indicate which character is*
925	"word". In this case "word" means that it is the word construction
926	character used by some operators like "\<", "\>", etc. /*
927
928	static void
929	init_word_char (re_dfa_t *dfa)
930	{
931	int i = `0`;
932	int j;
933	int ch = `0`;
934	dfa->word_ops_used = `1`;
935	if (__glibc_likely (dfa->map_notascii == `0`))
936	{
937	/ Avoid uint32_t and uint64_t as some non-GCC platforms lack*
938	them, an issue when this code is used in Gnulib. /*
939	bitset_word_t bits0 = `0x00000000`;
940	bitset_word_t bits1 = `0x03ff0000`;
941	bitset_word_t bits2 = `0x87fffffe`;
942	bitset_word_t bits3 = `0x07fffffe`;
943	if (BITSET_WORD_BITS == `64`)
944	{
945	/ Pacify gcc -Woverflow on 32-bit platformns. /
946	dfa->word_char[`0`] = bits1 << `31` << `1` \| bits0;
947	dfa->word_char[`1`] = bits3 << `31` << `1` \| bits2;
948	i = `2`;
949	}
950	else if (BITSET_WORD_BITS == `32`)
951	{
952	dfa->word_char[`0`] = bits0;
953	dfa->word_char[`1`] = bits1;
954	dfa->word_char[`2`] = bits2;
955	dfa->word_char[`3`] = bits3;
956	i = `4`;
957	}
958	else
959	goto general_case;
960	ch = `128`;
961
962	if (__glibc_likely (dfa->is_utf8))
963	{
964	memset (&dfa->word_char[i], `'\0'`, (SBC_MAX - ch) / `8`);
965	return;
966	}
967	}
968
969	general_case:
970	for (; i < BITSET_WORDS; ++i)
971	for (j = `0`; j < BITSET_WORD_BITS; ++j, ++ch)
972	if (isalnum (ch) \|\| ch == `'_'`)
973	dfa->word_char[i] \|= (bitset_word_t) `1` << j;
974	}
975
976	/ Free the work area which are only used while compiling. /
977
978	static void
979	free_workarea_compile (regex_t *preg)
980	{
981	re_dfa_t *dfa = preg->buffer;
982	bin_tree_storage_t storage, next;
983	for (storage = dfa->str_tree_storage; storage; storage = next)
984	{
985	next = storage->next;
986	re_free (storage);
987	}
988	dfa->str_tree_storage = NULL;
989	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
990	dfa->str_tree = NULL;
991	re_free (dfa->org_indices);
992	dfa->org_indices = NULL;
993	}
994
995	/ Create initial states for all contexts. /
996
997	static reg_errcode_t
998	create_initial_state (re_dfa_t *dfa)
999	{
1000	Idx first, i;
1001	reg_errcode_t err;
1002	re_node_set init_nodes;
1003
1004	/ Initial states have the epsilon closure of the node which is*
1005	the first node of the regular expression. /*
1006	first = dfa->str_tree->first->node_idx;
1007	dfa->init_node = first;
1008	err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1009	if (__glibc_unlikely (err != REG_NOERROR))
1010	return err;
1011
1012	/ The back-references which are in initial states can epsilon transit,*
1013	since in this case all of the subexpressions can be null.
1014	Then we add epsilon closures of the nodes which are the next nodes of
1015	the back-references. /*
1016	if (dfa->nbackref > `0`)
1017	for (i = `0`; i < init_nodes.nelem; ++i)
1018	{
1019	Idx node_idx = init_nodes.elems[i];
1020	re_token_type_t type = dfa->nodes[node_idx].type;
1021
1022	Idx clexp_idx;
1023	if (type != OP_BACK_REF)
1024	continue;
1025	for (clexp_idx = `0`; clexp_idx < init_nodes.nelem; ++clexp_idx)
1026	{
1027	re_token_t *clexp_node;
1028	clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1029	if (clexp_node->type == OP_CLOSE_SUBEXP
1030	&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1031	break;
1032	}
1033	if (clexp_idx == init_nodes.nelem)
1034	continue;
1035
1036	if (type == OP_BACK_REF)
1037	{
1038	Idx dest_idx = dfa->edests[node_idx].elems[`0`];
1039	if (!re_node_set_contains (&init_nodes, dest_idx))
1040	{
1041	reg_errcode_t merge_err
1042	= re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1043	if (merge_err != REG_NOERROR)
1044	return merge_err;
1045	i = `0`;
1046	}
1047	}
1048	}
1049
1050	/ It must be the first time to invoke acquire_state. /
1051	dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, `0`);
1052	/ We don't check ERR here, since the initial state must not be NULL. /
1053	if (__glibc_unlikely (dfa->init_state == NULL))
1054	return err;
1055	if (dfa->init_state->has_constraint)
1056	{
1057	dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1058	CONTEXT_WORD);
1059	dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1060	CONTEXT_NEWLINE);
1061	dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1062	&init_nodes,
1063	CONTEXT_NEWLINE
1064	\| CONTEXT_BEGBUF);
1065	if (__glibc_unlikely (dfa->init_state_word == NULL
1066	\|\| dfa->init_state_nl == NULL
1067	\|\| dfa->init_state_begbuf == NULL))
1068	return err;
1069	}
1070	else
1071	dfa->init_state_word = dfa->init_state_nl
1072	= dfa->init_state_begbuf = dfa->init_state;
1073
1074	re_node_set_free (&init_nodes);
1075	return REG_NOERROR;
1076	}
1077
1078	#ifdef RE_ENABLE_I18N
1079	/ If it is possible to do searching in single byte encoding instead of UTF-8*
1080	to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1081	DFA nodes where needed. /*
1082
1083	static void
1084	optimize_utf8 (re_dfa_t *dfa)
1085	{
1086	Idx node;
1087	int i;
1088	bool mb_chars = false;
1089	bool has_period = false;
1090
1091	for (node = `0`; node < dfa->nodes_len; ++node)
1092	switch (dfa->nodes[node].type)
1093	{
1094	case CHARACTER:
1095	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1096	mb_chars = true;
1097	break;
1098	case ANCHOR:
1099	switch (dfa->nodes[node].opr.ctx_type)
1100	{
1101	case LINE_FIRST:
1102	case LINE_LAST:
1103	case BUF_FIRST:
1104	case BUF_LAST:
1105	break;
1106	default:
1107	/ Word anchors etc. cannot be handled. It's okay to test*
1108	opr.ctx_type since constraints (for all DFA nodes) are
1109	created by ORing one or more opr.ctx_type values. /*
1110	return;
1111	}
1112	break;
1113	case OP_PERIOD:
1114	has_period = true;
1115	break;
1116	case OP_BACK_REF:
1117	case OP_ALT:
1118	case END_OF_RE:
1119	case OP_DUP_ASTERISK:
1120	case OP_OPEN_SUBEXP:
1121	case OP_CLOSE_SUBEXP:
1122	break;
1123	case COMPLEX_BRACKET:
1124	return;
1125	case SIMPLE_BRACKET:
1126	/ Just double check. /
1127	{
1128	int rshift = (ASCII_CHARS % BITSET_WORD_BITS == `0`
1129	? `0`
1130	: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1131	for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1132	{
1133	if (dfa->nodes[node].opr.sbcset[i] >> rshift != `0`)
1134	return;
1135	rshift = `0`;
1136	}
1137	}
1138	break;
1139	default:
1140	abort ();
1141	}
1142
1143	if (mb_chars \|\| has_period)
1144	for (node = `0`; node < dfa->nodes_len; ++node)
1145	{
1146	if (dfa->nodes[node].type == CHARACTER
1147	&& dfa->nodes[node].opr.c >= ASCII_CHARS)
1148	dfa->nodes[node].mb_partial = `0`;
1149	else if (dfa->nodes[node].type == OP_PERIOD)
1150	dfa->nodes[node].type = OP_UTF8_PERIOD;
1151	}
1152
1153	/ The search can be in single byte locale. /
1154	dfa->mb_cur_max = `1`;
1155	dfa->is_utf8 = `0`;
1156	dfa->has_mb_node = dfa->nbackref > `0` \|\| has_period;
1157	}
1158	#endif
1159
1160	/ Analyze the structure tree, and calculate "first", "next", "edest",*
1161	"eclosure", and "inveclosure". /*
1162
1163	static reg_errcode_t
1164	analyze (regex_t *preg)
1165	{
1166	re_dfa_t *dfa = preg->buffer;
1167	reg_errcode_t ret;
1168
1169	/ Allocate arrays. /
1170	dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1171	dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1172	dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1173	dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1174	if (__glibc_unlikely (dfa->nexts == NULL \|\| dfa->org_indices == NULL
1175	\|\| dfa->edests == NULL \|\| dfa->eclosures == NULL))
1176	return REG_ESPACE;
1177
1178	dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1179	if (dfa->subexp_map != NULL)
1180	{
1181	Idx i;
1182	for (i = `0`; i < preg->re_nsub; i++)
1183	dfa->subexp_map[i] = i;
1184	preorder (dfa->str_tree, optimize_subexps, dfa);
1185	for (i = `0`; i < preg->re_nsub; i++)
1186	if (dfa->subexp_map[i] != i)
1187	break;
1188	if (i == preg->re_nsub)
1189	{
1190	re_free (dfa->subexp_map);
1191	dfa->subexp_map = NULL;
1192	}
1193	}
1194
1195	ret = postorder (dfa->str_tree, lower_subexps, preg);
1196	if (__glibc_unlikely (ret != REG_NOERROR))
1197	return ret;
1198	ret = postorder (dfa->str_tree, calc_first, dfa);
1199	if (__glibc_unlikely (ret != REG_NOERROR))
1200	return ret;
1201	preorder (dfa->str_tree, calc_next, dfa);
1202	ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1203	if (__glibc_unlikely (ret != REG_NOERROR))
1204	return ret;
1205	ret = calc_eclosure (dfa);
1206	if (__glibc_unlikely (ret != REG_NOERROR))
1207	return ret;
1208
1209	/ We only need this during the prune_impossible_nodes pass in regexec.c;*
1210	skip it if p_i_n will not run, as calc_inveclosure can be quadratic. /*
1211	if ((!preg->no_sub && preg->re_nsub > `0` && dfa->has_plural_match)
1212	\|\| dfa->nbackref)
1213	{
1214	dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1215	if (__glibc_unlikely (dfa->inveclosures == NULL))
1216	return REG_ESPACE;
1217	ret = calc_inveclosure (dfa);
1218	}
1219
1220	return ret;
1221	}
1222
1223	/ Our parse trees are very unbalanced, so we cannot use a stack to*
1224	implement parse tree visits. Instead, we use parent pointers and
1225	some hairy code in these two functions. /*
1226	static reg_errcode_t
1227	postorder (bin_tree_t root, reg_errcode_t (fn (void* , bin_tree_t )),
1228	void *extra)
1229	{
1230	bin_tree_t node, prev;
1231
1232	for (node = root; ; )
1233	{
1234	/ Descend down the tree, preferably to the left (or to the right*
1235	if that's the only child). /*
1236	while (node->left \|\| node->right)
1237	if (node->left)
1238	node = node->left;
1239	else
1240	node = node->right;
1241
1242	do
1243	{
1244	reg_errcode_t err = fn (extra, node);
1245	if (__glibc_unlikely (err != REG_NOERROR))
1246	return err;
1247	if (node->parent == NULL)
1248	return REG_NOERROR;
1249	prev = node;
1250	node = node->parent;
1251	}
1252	/ Go up while we have a node that is reached from the right. /
1253	while (node->right == prev \|\| node->right == NULL);
1254	node = node->right;
1255	}
1256	}
1257
1258	static reg_errcode_t
1259	preorder (bin_tree_t root, reg_errcode_t (fn (void* , bin_tree_t )),
1260	void *extra)
1261	{
1262	bin_tree_t *node;
1263
1264	for (node = root; ; )
1265	{
1266	reg_errcode_t err = fn (extra, node);
1267	if (__glibc_unlikely (err != REG_NOERROR))
1268	return err;
1269
1270	/ Go to the left node, or up and to the right. /
1271	if (node->left)
1272	node = node->left;
1273	else
1274	{
1275	bin_tree_t *prev = NULL;
1276	while (node->right == prev \|\| node->right == NULL)
1277	{
1278	prev = node;
1279	node = node->parent;
1280	if (!node)
1281	return REG_NOERROR;
1282	}
1283	node = node->right;
1284	}
1285	}
1286	}
1287
1288	/ Optimization pass: if a SUBEXP is entirely contained, strip it and tell*
1289	re_search_internal to map the inner one's opr.idx to this one's. Adjust
1290	backreferences as well. Requires a preorder visit. /*
1291	static reg_errcode_t
1292	optimize_subexps (void extra, bin_tree_t node)
1293	{
1294	re_dfa_t dfa = (re_dfa_t ) extra;
1295
1296	if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1297	{
1298	int idx = node->token.opr.idx;
1299	node->token.opr.idx = dfa->subexp_map[idx];
1300	dfa->used_bkref_map \|= `1` << node->token.opr.idx;
1301	}
1302
1303	else if (node->token.type == SUBEXP
1304	&& node->left && node->left->token.type == SUBEXP)
1305	{
1306	Idx other_idx = node->left->token.opr.idx;
1307
1308	node->left = node->left->left;
1309	if (node->left)
1310	node->left->parent = node;
1311
1312	dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1313	if (other_idx < BITSET_WORD_BITS)
1314	dfa->used_bkref_map &= ~((bitset_word_t) `1` << other_idx);
1315	}
1316
1317	return REG_NOERROR;
1318	}
1319
1320	/ Lowering pass: Turn each SUBEXP node into the appropriate concatenation*
1321	of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. /*
1322	static reg_errcode_t
1323	lower_subexps (void extra, bin_tree_t node)
1324	{
1325	regex_t preg = (regex_t ) extra;
1326	reg_errcode_t err = REG_NOERROR;
1327
1328	if (node->left && node->left->token.type == SUBEXP)
1329	{
1330	node->left = lower_subexp (&err, preg, node->left);
1331	if (node->left)
1332	node->left->parent = node;
1333	}
1334	if (node->right && node->right->token.type == SUBEXP)
1335	{
1336	node->right = lower_subexp (&err, preg, node->right);
1337	if (node->right)
1338	node->right->parent = node;
1339	}
1340
1341	return err;
1342	}
1343
1344	static bin_tree_t *
1345	lower_subexp (reg_errcode_t err, regex_t preg, bin_tree_t *node)
1346	{
1347	re_dfa_t *dfa = preg->buffer;
1348	bin_tree_t *body = node->left;
1349	bin_tree_t op, cls, tree1, tree;
1350
1351	if (preg->no_sub
1352	/ We do not optimize empty subexpressions, because otherwise we may*
1353	have bad CONCAT nodes with NULL children. This is obviously not
1354	very common, so we do not lose much. An example that triggers
1355	this case is the sed "script" //x. /*
1356	&& node->left != NULL
1357	&& (node->token.opr.idx >= BITSET_WORD_BITS
1358	\|\| !(dfa->used_bkref_map
1359	& ((bitset_word_t) `1` << node->token.opr.idx))))
1360	return node->left;
1361
1362	/ Convert the SUBEXP node to the concatenation of an*
1363	OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. /*
1364	op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1365	cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1366	tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1367	tree = create_tree (dfa, op, tree1, CONCAT);
1368	if (__glibc_unlikely (tree == NULL \|\| tree1 == NULL
1369	\|\| op == NULL \|\| cls == NULL))
1370	{
1371	*err = REG_ESPACE;
1372	return NULL;
1373	}
1374
1375	op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1376	op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1377	return tree;
1378	}
1379
1380	/ Pass 1 in building the NFA: compute FIRST and create unlinked automaton*
1381	nodes. Requires a postorder visit. /*
1382	static reg_errcode_t
1383	calc_first (void extra, bin_tree_t node)
1384	{
1385	re_dfa_t dfa = (re_dfa_t ) extra;
1386	if (node->token.type == CONCAT)
1387	{
1388	node->first = node->left->first;
1389	node->node_idx = node->left->node_idx;
1390	}
1391	else
1392	{
1393	node->first = node;
1394	node->node_idx = re_dfa_add_node (dfa, node->token);
1395	if (__glibc_unlikely (node->node_idx == -`1`))
1396	return REG_ESPACE;
1397	if (node->token.type == ANCHOR)
1398	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1399	}
1400	return REG_NOERROR;
1401	}
1402
1403	/ Pass 2: compute NEXT on the tree. Preorder visit. /
1404	static reg_errcode_t
1405	calc_next (void extra, bin_tree_t node)
1406	{
1407	switch (node->token.type)
1408	{
1409	case OP_DUP_ASTERISK:
1410	node->left->next = node;
1411	break;
1412	case CONCAT:
1413	node->left->next = node->right->first;
1414	node->right->next = node->next;
1415	break;
1416	default:
1417	if (node->left)
1418	node->left->next = node->next;
1419	if (node->right)
1420	node->right->next = node->next;
1421	break;
1422	}
1423	return REG_NOERROR;
1424	}
1425
1426	/ Pass 3: link all DFA nodes to their NEXT node (any order will do). /
1427	static reg_errcode_t
1428	link_nfa_nodes (void extra, bin_tree_t node)
1429	{
1430	re_dfa_t dfa = (re_dfa_t ) extra;
1431	Idx idx = node->node_idx;
1432	reg_errcode_t err = REG_NOERROR;
1433
1434	switch (node->token.type)
1435	{
1436	case CONCAT:
1437	break;
1438
1439	case END_OF_RE:
1440	DEBUG_ASSERT (node->next == NULL);
1441	break;
1442
1443	case OP_DUP_ASTERISK:
1444	case OP_ALT:
1445	{
1446	Idx left, right;
1447	dfa->has_plural_match = `1`;
1448	if (node->left != NULL)
1449	left = node->left->first->node_idx;
1450	else
1451	left = node->next->node_idx;
1452	if (node->right != NULL)
1453	right = node->right->first->node_idx;
1454	else
1455	right = node->next->node_idx;
1456	DEBUG_ASSERT (left > -`1`);
1457	DEBUG_ASSERT (right > -`1`);
1458	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1459	}
1460	break;
1461
1462	case ANCHOR:
1463	case OP_OPEN_SUBEXP:
1464	case OP_CLOSE_SUBEXP:
1465	err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1466	break;
1467
1468	case OP_BACK_REF:
1469	dfa->nexts[idx] = node->next->node_idx;
1470	if (node->token.type == OP_BACK_REF)
1471	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1472	break;
1473
1474	default:
1475	DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1476	dfa->nexts[idx] = node->next->node_idx;
1477	break;
1478	}
1479
1480	return err;
1481	}
1482
1483	/ Duplicate the epsilon closure of the node ROOT_NODE.*
1484	Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1485	to their own constraint. /*
1486
1487	static reg_errcode_t
1488	duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1489	Idx root_node, unsigned int init_constraint)
1490	{
1491	Idx org_node, clone_node;
1492	bool ok;
1493	unsigned int constraint = init_constraint;
1494	for (org_node = top_org_node, clone_node = top_clone_node;;)
1495	{
1496	Idx org_dest, clone_dest;
1497	if (dfa->nodes[org_node].type == OP_BACK_REF)
1498	{
1499	/ If the back reference epsilon-transit, its destination must*
1500	also have the constraint. Then duplicate the epsilon closure
1501	of the destination of the back reference, and store it in
1502	edests of the back reference. /*
1503	org_dest = dfa->nexts[org_node];
1504	re_node_set_empty (dfa->edests + clone_node);
1505	clone_dest = duplicate_node (dfa, org_dest, constraint);
1506	if (__glibc_unlikely (clone_dest == -`1`))
1507	return REG_ESPACE;
1508	dfa->nexts[clone_node] = dfa->nexts[org_node];
1509	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1510	if (__glibc_unlikely (! ok))
1511	return REG_ESPACE;
1512	}
1513	else if (dfa->edests[org_node].nelem == `0`)
1514	{
1515	/ In case of the node can't epsilon-transit, don't duplicate the*
1516	destination and store the original destination as the
1517	destination of the node. /*
1518	dfa->nexts[clone_node] = dfa->nexts[org_node];
1519	break;
1520	}
1521	else if (dfa->edests[org_node].nelem == `1`)
1522	{
1523	/ In case of the node can epsilon-transit, and it has only one*
1524	destination. /*
1525	org_dest = dfa->edests[org_node].elems[`0`];
1526	re_node_set_empty (dfa->edests + clone_node);
1527	/ If the node is root_node itself, it means the epsilon closure*
1528	has a loop. Then tie it to the destination of the root_node. /*
1529	if (org_node == root_node && clone_node != org_node)
1530	{
1531	ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1532	if (__glibc_unlikely (! ok))
1533	return REG_ESPACE;
1534	break;
1535	}
1536	/ In case the node has another constraint, append it. /
1537	constraint \|= dfa->nodes[org_node].constraint;
1538	clone_dest = duplicate_node (dfa, org_dest, constraint);
1539	if (__glibc_unlikely (clone_dest == -`1`))
1540	return REG_ESPACE;
1541	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1542	if (__glibc_unlikely (! ok))
1543	return REG_ESPACE;
1544	}
1545	else / dfa->edests[org_node].nelem == 2 /
1546	{
1547	/ In case of the node can epsilon-transit, and it has two*
1548	destinations. In the bin_tree_t and DFA, that's '\|' and ''. /
1549	org_dest = dfa->edests[org_node].elems[`0`];
1550	re_node_set_empty (dfa->edests + clone_node);
1551	/ Search for a duplicated node which satisfies the constraint. /
1552	clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1553	if (clone_dest == -`1`)
1554	{
1555	/ There is no such duplicated node, create a new one. /
1556	reg_errcode_t err;
1557	clone_dest = duplicate_node (dfa, org_dest, constraint);
1558	if (__glibc_unlikely (clone_dest == -`1`))
1559	return REG_ESPACE;
1560	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1561	if (__glibc_unlikely (! ok))
1562	return REG_ESPACE;
1563	err = duplicate_node_closure (dfa, org_dest, clone_dest,
1564	root_node, constraint);
1565	if (__glibc_unlikely (err != REG_NOERROR))
1566	return err;
1567	}
1568	else
1569	{
1570	/ There is a duplicated node which satisfies the constraint,*
1571	use it to avoid infinite loop. /*
1572	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1573	if (__glibc_unlikely (! ok))
1574	return REG_ESPACE;
1575	}
1576
1577	org_dest = dfa->edests[org_node].elems[`1`];
1578	clone_dest = duplicate_node (dfa, org_dest, constraint);
1579	if (__glibc_unlikely (clone_dest == -`1`))
1580	return REG_ESPACE;
1581	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1582	if (__glibc_unlikely (! ok))
1583	return REG_ESPACE;
1584	}
1585	org_node = org_dest;
1586	clone_node = clone_dest;
1587	}
1588	return REG_NOERROR;
1589	}
1590
1591	/ Search for a node which is duplicated from the node ORG_NODE, and*
1592	satisfies the constraint CONSTRAINT. /*
1593
1594	static Idx
1595	search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1596	unsigned int constraint)
1597	{
1598	Idx idx;
1599	for (idx = dfa->nodes_len - `1`; dfa->nodes[idx].duplicated && idx > `0`; --idx)
1600	{
1601	if (org_node == dfa->org_indices[idx]
1602	&& constraint == dfa->nodes[idx].constraint)
1603	return idx; / Found. /
1604	}
1605	return -`1`; / Not found. /
1606	}
1607
1608	/ Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.*
1609	Return the index of the new node, or -1 if insufficient storage is
1610	available. /*
1611
1612	static Idx
1613	duplicate_node (re_dfa_t dfa, Idx org_idx, unsigned* int constraint)
1614	{
1615	Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1616	if (__glibc_likely (dup_idx != -`1`))
1617	{
1618	dfa->nodes[dup_idx].constraint = constraint;
1619	dfa->nodes[dup_idx].constraint \|= dfa->nodes[org_idx].constraint;
1620	dfa->nodes[dup_idx].duplicated = `1`;
1621
1622	/ Store the index of the original node. /
1623	dfa->org_indices[dup_idx] = org_idx;
1624	}
1625	return dup_idx;
1626	}
1627
1628	static reg_errcode_t
1629	calc_inveclosure (re_dfa_t *dfa)
1630	{
1631	Idx src, idx;
1632	bool ok;
1633	for (idx = `0`; idx < dfa->nodes_len; ++idx)
1634	re_node_set_init_empty (dfa->inveclosures + idx);
1635
1636	for (src = `0`; src < dfa->nodes_len; ++src)
1637	{
1638	Idx *elems = dfa->eclosures[src].elems;
1639	for (idx = `0`; idx < dfa->eclosures[src].nelem; ++idx)
1640	{
1641	ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1642	if (__glibc_unlikely (! ok))
1643	return REG_ESPACE;
1644	}
1645	}
1646
1647	return REG_NOERROR;
1648	}
1649
1650	/ Calculate "eclosure" for all the node in DFA. /
1651
1652	static reg_errcode_t
1653	calc_eclosure (re_dfa_t *dfa)
1654	{
1655	Idx node_idx;
1656	bool incomplete;
1657	DEBUG_ASSERT (dfa->nodes_len > `0`);
1658	incomplete = false;
1659	/ For each nodes, calculate epsilon closure. /
1660	for (node_idx = `0`; ; ++node_idx)
1661	{
1662	reg_errcode_t err;
1663	re_node_set eclosure_elem;
1664	if (node_idx == dfa->nodes_len)
1665	{
1666	if (!incomplete)
1667	break;
1668	incomplete = false;
1669	node_idx = `0`;
1670	}
1671
1672	DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -`1`);
1673
1674	/ If we have already calculated, skip it. /
1675	if (dfa->eclosures[node_idx].nelem != `0`)
1676	continue;
1677	/ Calculate epsilon closure of 'node_idx'. /
1678	err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1679	if (__glibc_unlikely (err != REG_NOERROR))
1680	return err;
1681
1682	if (dfa->eclosures[node_idx].nelem == `0`)
1683	{
1684	incomplete = true;
1685	re_node_set_free (&eclosure_elem);
1686	}
1687	}
1688	return REG_NOERROR;
1689	}
1690
1691	/ Calculate epsilon closure of NODE. /
1692
1693	static reg_errcode_t
1694	calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa, Idx node, bool root)
1695	{
1696	reg_errcode_t err;
1697	Idx i;
1698	re_node_set eclosure;
1699	bool incomplete = false;
1700	err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + `1`);
1701	if (__glibc_unlikely (err != REG_NOERROR))
1702	return err;
1703
1704	/ An epsilon closure includes itself. /
1705	eclosure.elems[eclosure.nelem++] = node;
1706
1707	/ This indicates that we are calculating this node now.*
1708	We reference this value to avoid infinite loop. /*
1709	dfa->eclosures[node].nelem = -`1`;
1710
1711	/ If the current node has constraints, duplicate all nodes*
1712	since they must inherit the constraints. /*
1713	if (dfa->nodes[node].constraint
1714	&& dfa->edests[node].nelem
1715	&& !dfa->nodes[dfa->edests[node].elems[`0`]].duplicated)
1716	{
1717	err = duplicate_node_closure (dfa, node, node, node,
1718	dfa->nodes[node].constraint);
1719	if (__glibc_unlikely (err != REG_NOERROR))
1720	return err;
1721	}
1722
1723	/ Expand each epsilon destination nodes. /
1724	if (IS_EPSILON_NODE(dfa->nodes[node].type))
1725	for (i = `0`; i < dfa->edests[node].nelem; ++i)
1726	{
1727	re_node_set eclosure_elem;
1728	Idx edest = dfa->edests[node].elems[i];
1729	/ If calculating the epsilon closure of 'edest' is in progress,*
1730	return intermediate result. /*
1731	if (dfa->eclosures[edest].nelem == -`1`)
1732	{
1733	incomplete = true;
1734	continue;
1735	}
1736	/ If we haven't calculated the epsilon closure of 'edest' yet,*
1737	calculate now. Otherwise use calculated epsilon closure. /*
1738	if (dfa->eclosures[edest].nelem == `0`)
1739	{
1740	err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1741	if (__glibc_unlikely (err != REG_NOERROR))
1742	return err;
1743	}
1744	else
1745	eclosure_elem = dfa->eclosures[edest];
1746	/ Merge the epsilon closure of 'edest'. /
1747	err = re_node_set_merge (&eclosure, &eclosure_elem);
1748	if (__glibc_unlikely (err != REG_NOERROR))
1749	return err;
1750	/ If the epsilon closure of 'edest' is incomplete,*
1751	the epsilon closure of this node is also incomplete. /*
1752	if (dfa->eclosures[edest].nelem == `0`)
1753	{
1754	incomplete = true;
1755	re_node_set_free (&eclosure_elem);
1756	}
1757	}
1758
1759	if (incomplete && !root)
1760	dfa->eclosures[node].nelem = `0`;
1761	else
1762	dfa->eclosures[node] = eclosure;
1763	*new_set = eclosure;
1764	return REG_NOERROR;
1765	}
1766
/* Functions for tokens, which are used in the parser.  */
1768
1769	/ Fetch a token from INPUT.*
1770	We must not use this function inside bracket expressions. /*
1771
1772	static void
1773	fetch_token (re_token_t result, re_string_t input, reg_syntax_t syntax)
1774	{
1775	re_string_skip_bytes (input, peek_token (result, input, syntax));
1776	}
1777
1778	/ Peek a token from INPUT, and return the length of the token.*
1779	We must not use this function inside bracket expressions. /*
1780
1781	static int
1782	peek_token (re_token_t token, re_string_t input, reg_syntax_t syntax)
1783	{
1784	unsigned char c;
1785
1786	if (re_string_eoi (input))
1787	{
1788	token->type = END_OF_RE;
1789	return `0`;
1790	}
1791
1792	c = re_string_peek_byte (input, `0`);
1793	token->opr.c = c;
1794
1795	token->word_char = `0`;
1796	#ifdef RE_ENABLE_I18N
1797	token->mb_partial = `0`;
1798	if (input->mb_cur_max > `1`
1799	&& !re_string_first_byte (input, re_string_cur_idx (input)))
1800	{
1801	token->type = CHARACTER;
1802	token->mb_partial = `1`;
1803	return `1`;
1804	}
1805	#endif
1806	if (c == `'\\'`)
1807	{
1808	unsigned char c2;
1809	if (re_string_cur_idx (input) + `1` >= re_string_length (input))
1810	{
1811	token->type = BACK_SLASH;
1812	return `1`;
1813	}
1814
1815	c2 = re_string_peek_byte_case (input, `1`);
1816	token->opr.c = c2;
1817	token->type = CHARACTER;
1818	#ifdef RE_ENABLE_I18N
1819	if (input->mb_cur_max > `1`)
1820	{
1821	wint_t wc = re_string_wchar_at (input,
1822	re_string_cur_idx (input) + `1`);
1823	token->word_char = IS_WIDE_WORD_CHAR (wc) != `0`;
1824	}
1825	else
1826	#endif
1827	token->word_char = IS_WORD_CHAR (c2) != `0`;
1828
1829	switch (c2)
1830	{
1831	case `'\|'`:
1832	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1833	token->type = OP_ALT;
1834	break;
1835	case `'1'`: case `'2'`: case `'3'`: case `'4'`: case `'5'`:
1836	case `'6'`: case `'7'`: case `'8'`: case `'9'`:
1837	if (!(syntax & RE_NO_BK_REFS))
1838	{
1839	token->type = OP_BACK_REF;
1840	token->opr.idx = c2 - `'1'`;
1841	}
1842	break;
1843	case `'<'`:
1844	if (!(syntax & RE_NO_GNU_OPS))
1845	{
1846	token->type = ANCHOR;
1847	token->opr.ctx_type = WORD_FIRST;
1848	}
1849	break;
1850	case `'>'`:
1851	if (!(syntax & RE_NO_GNU_OPS))
1852	{
1853	token->type = ANCHOR;
1854	token->opr.ctx_type = WORD_LAST;
1855	}
1856	break;
1857	case `'b'`:
1858	if (!(syntax & RE_NO_GNU_OPS))
1859	{
1860	token->type = ANCHOR;
1861	token->opr.ctx_type = WORD_DELIM;
1862	}
1863	break;
1864	case `'B'`:
1865	if (!(syntax & RE_NO_GNU_OPS))
1866	{
1867	token->type = ANCHOR;
1868	token->opr.ctx_type = NOT_WORD_DELIM;
1869	}
1870	break;
1871	case `'w'`:
1872	if (!(syntax & RE_NO_GNU_OPS))
1873	token->type = OP_WORD;
1874	break;
1875	case `'W'`:
1876	if (!(syntax & RE_NO_GNU_OPS))
1877	token->type = OP_NOTWORD;
1878	break;
1879	case `'s'`:
1880	if (!(syntax & RE_NO_GNU_OPS))
1881	token->type = OP_SPACE;
1882	break;
1883	case `'S'`:
1884	if (!(syntax & RE_NO_GNU_OPS))
1885	token->type = OP_NOTSPACE;
1886	break;
1887	case '`':
1888	if (!(syntax & RE_NO_GNU_OPS))
1889	{
1890	token->type = ANCHOR;
1891	token->opr.ctx_type = BUF_FIRST;
1892	}
1893	break;
1894	case `'\''`:
1895	if (!(syntax & RE_NO_GNU_OPS))
1896	{
1897	token->type = ANCHOR;
1898	token->opr.ctx_type = BUF_LAST;
1899	}
1900	break;
1901	case `'('`:
1902	if (!(syntax & RE_NO_BK_PARENS))
1903	token->type = OP_OPEN_SUBEXP;
1904	break;
1905	case `')'`:
1906	if (!(syntax & RE_NO_BK_PARENS))
1907	token->type = OP_CLOSE_SUBEXP;
1908	break;
1909	case `'+'`:
1910	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1911	token->type = OP_DUP_PLUS;
1912	break;
1913	case `'?'`:
1914	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1915	token->type = OP_DUP_QUESTION;
1916	break;
1917	case `'{'`:
1918	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1919	token->type = OP_OPEN_DUP_NUM;
1920	break;
1921	case `'}'`:
1922	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1923	token->type = OP_CLOSE_DUP_NUM;
1924	break;
1925	default:
1926	break;
1927	}
1928	return `2`;
1929	}
1930
1931	token->type = CHARACTER;
1932	#ifdef RE_ENABLE_I18N
1933	if (input->mb_cur_max > `1`)
1934	{
1935	wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1936	token->word_char = IS_WIDE_WORD_CHAR (wc) != `0`;
1937	}
1938	else
1939	#endif
1940	token->word_char = IS_WORD_CHAR (token->opr.c);
1941
1942	switch (c)
1943	{
1944	case `'\n'`:
1945	if (syntax & RE_NEWLINE_ALT)
1946	token->type = OP_ALT;
1947	break;
1948	case `'\|'`:
1949	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1950	token->type = OP_ALT;
1951	break;
1952	case `'*'`:
1953	token->type = OP_DUP_ASTERISK;
1954	break;
1955	case `'+'`:
1956	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1957	token->type = OP_DUP_PLUS;
1958	break;
1959	case `'?'`:
1960	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1961	token->type = OP_DUP_QUESTION;
1962	break;
1963	case `'{'`:
1964	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1965	token->type = OP_OPEN_DUP_NUM;
1966	break;
1967	case `'}'`:
1968	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1969	token->type = OP_CLOSE_DUP_NUM;
1970	break;
1971	case `'('`:
1972	if (syntax & RE_NO_BK_PARENS)
1973	token->type = OP_OPEN_SUBEXP;
1974	break;
1975	case `')'`:
1976	if (syntax & RE_NO_BK_PARENS)
1977	token->type = OP_CLOSE_SUBEXP;
1978	break;
1979	case `'['`:
1980	token->type = OP_OPEN_BRACKET;
1981	break;
1982	case `'.'`:
1983	token->type = OP_PERIOD;
1984	break;
1985	case `'^'`:
1986	if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS \| RE_CARET_ANCHORS_HERE))
1987	&& re_string_cur_idx (input) != `0`)
1988	{
1989	char prev = re_string_peek_byte (input, -`1`);
1990	if (!(syntax & RE_NEWLINE_ALT) \|\| prev != `'\n'`)
1991	break;
1992	}
1993	token->type = ANCHOR;
1994	token->opr.ctx_type = LINE_FIRST;
1995	break;
1996	case `'$'`:
1997	if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1998	&& re_string_cur_idx (input) + `1` != re_string_length (input))
1999	{
2000	re_token_t next;
2001	re_string_skip_bytes (input, `1`);
2002	peek_token (&next, input, syntax);
2003	re_string_skip_bytes (input, -`1`);
2004	if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2005	break;
2006	}
2007	token->type = ANCHOR;
2008	token->opr.ctx_type = LINE_LAST;
2009	break;
2010	default:
2011	break;
2012	}
2013	return `1`;
2014	}
2015
2016	/ Peek a token from INPUT, and return the length of the token.*
2017	We must not use this function out of bracket expressions. /*
2018
2019	static int
2020	peek_token_bracket (re_token_t token, re_string_t input, reg_syntax_t syntax)
2021	{
2022	unsigned char c;
2023	if (re_string_eoi (input))
2024	{
2025	token->type = END_OF_RE;
2026	return `0`;
2027	}
2028	c = re_string_peek_byte (input, `0`);
2029	token->opr.c = c;
2030
2031	#ifdef RE_ENABLE_I18N
2032	if (input->mb_cur_max > `1`
2033	&& !re_string_first_byte (input, re_string_cur_idx (input)))
2034	{
2035	token->type = CHARACTER;
2036	return `1`;
2037	}
2038	#endif /* RE_ENABLE_I18N */
2039
2040	if (c == `'\\'` && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2041	&& re_string_cur_idx (input) + `1` < re_string_length (input))
2042	{
2043	/ In this case, '\' escape a character. /
2044	unsigned char c2;
2045	re_string_skip_bytes (input, `1`);
2046	c2 = re_string_peek_byte (input, `0`);
2047	token->opr.c = c2;
2048	token->type = CHARACTER;
2049	return `1`;
2050	}
2051	if (c == `'['`) / '[' is a special char in a bracket exps. /
2052	{
2053	unsigned char c2;
2054	int token_len;
2055	if (re_string_cur_idx (input) + `1` < re_string_length (input))
2056	c2 = re_string_peek_byte (input, `1`);
2057	else
2058	c2 = `0`;
2059	token->opr.c = c2;
2060	token_len = `2`;
2061	switch (c2)
2062	{
2063	case `'.'`:
2064	token->type = OP_OPEN_COLL_ELEM;
2065	break;
2066
2067	case `'='`:
2068	token->type = OP_OPEN_EQUIV_CLASS;
2069	break;
2070
2071	case `':'`:
2072	if (syntax & RE_CHAR_CLASSES)
2073	{
2074	token->type = OP_OPEN_CHAR_CLASS;
2075	break;
2076	}
2077	FALLTHROUGH;
2078	default:
2079	token->type = CHARACTER;
2080	token->opr.c = c;
2081	token_len = `1`;
2082	break;
2083	}
2084	return token_len;
2085	}
2086	switch (c)
2087	{
2088	case `'-'`:
2089	token->type = OP_CHARSET_RANGE;
2090	break;
2091	case `']'`:
2092	token->type = OP_CLOSE_BRACKET;
2093	break;
2094	case `'^'`:
2095	token->type = OP_NON_MATCH_LIST;
2096	break;
2097	default:
2098	token->type = CHARACTER;
2099	}
2100	return `1`;
2101	}
2102
2103	/ Functions for parser. /
2104
2105	/ Entry point of the parser.*
2106	Parse the regular expression REGEXP and return the structure tree.
2107	If an error occurs, ERR is set by error code, and return NULL.
2108	This function build the following tree, from regular expression <reg_exp>:
2109	CAT
2110	/ \
2111	/ \
2112	<reg_exp> EOR
2113
2114	CAT means concatenation.
2115	EOR means end of regular expression. /*
2116
2117	static bin_tree_t *
2118	parse (re_string_t regexp, regex_t preg, reg_syntax_t syntax,
2119	reg_errcode_t *err)
2120	{
2121	re_dfa_t *dfa = preg->buffer;
2122	bin_tree_t tree, eor, *root;
2123	re_token_t current_token;
2124	dfa->syntax = syntax;
2125	fetch_token (&current_token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2126	tree = parse_reg_exp (regexp, preg, &current_token, syntax, `0`, err);
2127	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2128	return NULL;
2129	eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2130	if (tree != NULL)
2131	root = create_tree (dfa, tree, eor, CONCAT);
2132	else
2133	root = eor;
2134	if (__glibc_unlikely (eor == NULL \|\| root == NULL))
2135	{
2136	*err = REG_ESPACE;
2137	return NULL;
2138	}
2139	return root;
2140	}
2141
2142	/ This function build the following tree, from regular expression*
2143	<branch1>\|<branch2>:
2144	ALT
2145	/ \
2146	/ \
2147	<branch1> <branch2>
2148
2149	ALT means alternative, which represents the operator '\|'. /*
2150
2151	static bin_tree_t *
2152	parse_reg_exp (re_string_t regexp, regex_t preg, re_token_t *token,
2153	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2154	{
2155	re_dfa_t *dfa = preg->buffer;
2156	bin_tree_t tree, branch = NULL;
2157	bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2158	tree = parse_branch (regexp, preg, token, syntax, nest, err);
2159	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2160	return NULL;
2161
2162	while (token->type == OP_ALT)
2163	{
2164	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2165	if (token->type != OP_ALT && token->type != END_OF_RE
2166	&& (nest == `0` \|\| token->type != OP_CLOSE_SUBEXP))
2167	{
2168	bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2169	dfa->completed_bkref_map = initial_bkref_map;
2170	branch = parse_branch (regexp, preg, token, syntax, nest, err);
2171	if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2172	{
2173	if (tree != NULL)
2174	postorder (tree, free_tree, NULL);
2175	return NULL;
2176	}
2177	dfa->completed_bkref_map \|= accumulated_bkref_map;
2178	}
2179	else
2180	branch = NULL;
2181	tree = create_tree (dfa, tree, branch, OP_ALT);
2182	if (__glibc_unlikely (tree == NULL))
2183	{
2184	*err = REG_ESPACE;
2185	return NULL;
2186	}
2187	}
2188	return tree;
2189	}
2190
2191	/ This function build the following tree, from regular expression*
2192	<exp1><exp2>:
2193	CAT
2194	/ \
2195	/ \
2196	<exp1> <exp2>
2197
2198	CAT means concatenation. /*
2199
2200	static bin_tree_t *
2201	parse_branch (re_string_t regexp, regex_t preg, re_token_t *token,
2202	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2203	{
2204	bin_tree_t tree, expr;
2205	re_dfa_t *dfa = preg->buffer;
2206	tree = parse_expression (regexp, preg, token, syntax, nest, err);
2207	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2208	return NULL;
2209
2210	while (token->type != OP_ALT && token->type != END_OF_RE
2211	&& (nest == `0` \|\| token->type != OP_CLOSE_SUBEXP))
2212	{
2213	expr = parse_expression (regexp, preg, token, syntax, nest, err);
2214	if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2215	{
2216	if (tree != NULL)
2217	postorder (tree, free_tree, NULL);
2218	return NULL;
2219	}
2220	if (tree != NULL && expr != NULL)
2221	{
2222	bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2223	if (newtree == NULL)
2224	{
2225	postorder (expr, free_tree, NULL);
2226	postorder (tree, free_tree, NULL);
2227	*err = REG_ESPACE;
2228	return NULL;
2229	}
2230	tree = newtree;
2231	}
2232	else if (tree == NULL)
2233	tree = expr;
2234	/ Otherwise expr == NULL, we don't need to create new tree. /
2235	}
2236	return tree;
2237	}
2238
2239	/ This function build the following tree, from regular expression a:
2240	*
2241	\|
2242	a
2243	*/
2244
2245	static bin_tree_t *
2246	parse_expression (re_string_t regexp, regex_t preg, re_token_t *token,
2247	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2248	{
2249	re_dfa_t *dfa = preg->buffer;
2250	bin_tree_t *tree;
2251	switch (token->type)
2252	{
2253	case CHARACTER:
2254	tree = create_token_tree (dfa, NULL, NULL, token);
2255	if (__glibc_unlikely (tree == NULL))
2256	{
2257	*err = REG_ESPACE;
2258	return NULL;
2259	}
2260	#ifdef RE_ENABLE_I18N
2261	if (dfa->mb_cur_max > `1`)
2262	{
2263	while (!re_string_eoi (regexp)
2264	&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2265	{
2266	bin_tree_t *mbc_remain;
2267	fetch_token (token, regexp, syntax);
2268	mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2269	tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2270	if (__glibc_unlikely (mbc_remain == NULL \|\| tree == NULL))
2271	{
2272	*err = REG_ESPACE;
2273	return NULL;
2274	}
2275	}
2276	}
2277	#endif
2278	break;
2279
2280	case OP_OPEN_SUBEXP:
2281	tree = parse_sub_exp (regexp, preg, token, syntax, nest + `1`, err);
2282	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2283	return NULL;
2284	break;
2285
2286	case OP_OPEN_BRACKET:
2287	tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2288	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2289	return NULL;
2290	break;
2291
2292	case OP_BACK_REF:
2293	if (!__glibc_likely (dfa->completed_bkref_map & (`1` << token->opr.idx)))
2294	{
2295	*err = REG_ESUBREG;
2296	return NULL;
2297	}
2298	dfa->used_bkref_map \|= `1` << token->opr.idx;
2299	tree = create_token_tree (dfa, NULL, NULL, token);
2300	if (__glibc_unlikely (tree == NULL))
2301	{
2302	*err = REG_ESPACE;
2303	return NULL;
2304	}
2305	++dfa->nbackref;
2306	dfa->has_mb_node = `1`;
2307	break;
2308
2309	case OP_OPEN_DUP_NUM:
2310	if (syntax & RE_CONTEXT_INVALID_DUP)
2311	{
2312	*err = REG_BADRPT;
2313	return NULL;
2314	}
2315	FALLTHROUGH;
2316	case OP_DUP_ASTERISK:
2317	case OP_DUP_PLUS:
2318	case OP_DUP_QUESTION:
2319	if (syntax & RE_CONTEXT_INVALID_OPS)
2320	{
2321	*err = REG_BADRPT;
2322	return NULL;
2323	}
2324	else if (syntax & RE_CONTEXT_INDEP_OPS)
2325	{
2326	fetch_token (token, regexp, syntax);
2327	return parse_expression (regexp, preg, token, syntax, nest, err);
2328	}
2329	FALLTHROUGH;
2330	case OP_CLOSE_SUBEXP:
2331	if ((token->type == OP_CLOSE_SUBEXP)
2332	&& !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2333	{
2334	*err = REG_ERPAREN;
2335	return NULL;
2336	}
2337	FALLTHROUGH;
2338	case OP_CLOSE_DUP_NUM:
2339	/ We treat it as a normal character. /
2340
2341	/ Then we can these characters as normal characters. /
2342	token->type = CHARACTER;
2343	/ mb_partial and word_char bits should be initialized already*
2344	by peek_token. /*
2345	tree = create_token_tree (dfa, NULL, NULL, token);
2346	if (__glibc_unlikely (tree == NULL))
2347	{
2348	*err = REG_ESPACE;
2349	return NULL;
2350	}
2351	break;
2352
2353	case ANCHOR:
2354	if ((token->opr.ctx_type
2355	& (WORD_DELIM \| NOT_WORD_DELIM \| WORD_FIRST \| WORD_LAST))
2356	&& dfa->word_ops_used == `0`)
2357	init_word_char (dfa);
2358	if (token->opr.ctx_type == WORD_DELIM
2359	\|\| token->opr.ctx_type == NOT_WORD_DELIM)
2360	{
2361	bin_tree_t tree_first, tree_last;
2362	if (token->opr.ctx_type == WORD_DELIM)
2363	{
2364	token->opr.ctx_type = WORD_FIRST;
2365	tree_first = create_token_tree (dfa, NULL, NULL, token);
2366	token->opr.ctx_type = WORD_LAST;
2367	}
2368	else
2369	{
2370	token->opr.ctx_type = INSIDE_WORD;
2371	tree_first = create_token_tree (dfa, NULL, NULL, token);
2372	token->opr.ctx_type = INSIDE_NOTWORD;
2373	}
2374	tree_last = create_token_tree (dfa, NULL, NULL, token);
2375	tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2376	if (__glibc_unlikely (tree_first == NULL \|\| tree_last == NULL
2377	\|\| tree == NULL))
2378	{
2379	*err = REG_ESPACE;
2380	return NULL;
2381	}
2382	}
2383	else
2384	{
2385	tree = create_token_tree (dfa, NULL, NULL, token);
2386	if (__glibc_unlikely (tree == NULL))
2387	{
2388	*err = REG_ESPACE;
2389	return NULL;
2390	}
2391	}
2392	/ We must return here, since ANCHORs can't be followed*
2393	by repetition operators.
2394	eg. RE"^" is invalid or "<ANCHOR(^)><CHAR()>",
2395	it must not be "<ANCHOR(^)><REPEAT()>". /
2396	fetch_token (token, regexp, syntax);
2397	return tree;
2398
2399	case OP_PERIOD:
2400	tree = create_token_tree (dfa, NULL, NULL, token);
2401	if (__glibc_unlikely (tree == NULL))
2402	{
2403	*err = REG_ESPACE;
2404	return NULL;
2405	}
2406	if (dfa->mb_cur_max > `1`)
2407	dfa->has_mb_node = `1`;
2408	break;
2409
2410	case OP_WORD:
2411	case OP_NOTWORD:
2412	tree = build_charclass_op (dfa, regexp->trans,
2413	"alnum",
2414	"_",
2415	token->type == OP_NOTWORD, err);
2416	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2417	return NULL;
2418	break;
2419
2420	case OP_SPACE:
2421	case OP_NOTSPACE:
2422	tree = build_charclass_op (dfa, regexp->trans,
2423	"space",
2424	"",
2425	token->type == OP_NOTSPACE, err);
2426	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2427	return NULL;
2428	break;
2429
2430	case OP_ALT:
2431	case END_OF_RE:
2432	return NULL;
2433
2434	case BACK_SLASH:
2435	*err = REG_EESCAPE;
2436	return NULL;
2437
2438	default:
2439	/ Must not happen? /
2440	DEBUG_ASSERT (false);
2441	return NULL;
2442	}
2443	fetch_token (token, regexp, syntax);
2444
2445	while (token->type == OP_DUP_ASTERISK \|\| token->type == OP_DUP_PLUS
2446	\|\| token->type == OP_DUP_QUESTION \|\| token->type == OP_OPEN_DUP_NUM)
2447	{
2448	bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2449	syntax, err);
2450	if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2451	{
2452	if (tree != NULL)
2453	postorder (tree, free_tree, NULL);
2454	return NULL;
2455	}
2456	tree = dup_tree;
2457	/ In BRE consecutive duplications are not allowed. /
2458	if ((syntax & RE_CONTEXT_INVALID_DUP)
2459	&& (token->type == OP_DUP_ASTERISK
2460	\|\| token->type == OP_OPEN_DUP_NUM))
2461	{
2462	if (tree != NULL)
2463	postorder (tree, free_tree, NULL);
2464	*err = REG_BADRPT;
2465	return NULL;
2466	}
2467	}
2468
2469	return tree;
2470	}
2471
2472	/ This function build the following tree, from regular expression*
2473	(<reg_exp>):
2474	SUBEXP
2475	\|
2476	<reg_exp>
2477	*/
2478
2479	static bin_tree_t *
2480	parse_sub_exp (re_string_t regexp, regex_t preg, re_token_t *token,
2481	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2482	{
2483	re_dfa_t *dfa = preg->buffer;
2484	bin_tree_t *tree;
2485	size_t cur_nsub;
2486	cur_nsub = preg->re_nsub++;
2487
2488	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2489
2490	/ The subexpression may be a null string. /
2491	if (token->type == OP_CLOSE_SUBEXP)
2492	tree = NULL;
2493	else
2494	{
2495	tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2496	if (__glibc_unlikely (*err == REG_NOERROR
2497	&& token->type != OP_CLOSE_SUBEXP))
2498	{
2499	if (tree != NULL)
2500	postorder (tree, free_tree, NULL);
2501	*err = REG_EPAREN;
2502	}
2503	if (__glibc_unlikely (*err != REG_NOERROR))
2504	return NULL;
2505	}
2506
2507	if (cur_nsub <= `'9'` - `'1'`)
2508	dfa->completed_bkref_map \|= `1` << cur_nsub;
2509
2510	tree = create_tree (dfa, tree, NULL, SUBEXP);
2511	if (__glibc_unlikely (tree == NULL))
2512	{
2513	*err = REG_ESPACE;
2514	return NULL;
2515	}
2516	tree->token.opr.idx = cur_nsub;
2517	return tree;
2518	}
2519
2520	/ This function parse repetition operators like "", "+", "{1,3}" etc. /*
2521
2522	static bin_tree_t *
2523	parse_dup_op (bin_tree_t elem, re_string_t regexp, re_dfa_t *dfa,
2524	re_token_t token, reg_syntax_t syntax, reg_errcode_t err)
2525	{
2526	bin_tree_t tree = NULL, old_tree = NULL;
2527	Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2528	re_token_t start_token = *token;
2529
2530	if (token->type == OP_OPEN_DUP_NUM)
2531	{
2532	end = `0`;
2533	start = fetch_number (regexp, token, syntax);
2534	if (start == -`1`)
2535	{
2536	if (token->type == CHARACTER && token->opr.c == `','`)
2537	start = `0`; / We treat "{,m}" as "{0,m}". /
2538	else
2539	{
2540	err = REG_BADBR; /* <re>{} is invalid. /
2541	return NULL;
2542	}
2543	}
2544	if (__glibc_likely (start != -`2`))
2545	{
2546	/ We treat "{n}" as "{n,n}". /
2547	end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2548	: ((token->type == CHARACTER && token->opr.c == `','`)
2549	? fetch_number (regexp, token, syntax) : -`2`));
2550	}
2551	if (__glibc_unlikely (start == -`2` \|\| end == -`2`))
2552	{
2553	/ Invalid sequence. /
2554	if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2555	{
2556	if (token->type == END_OF_RE)
2557	*err = REG_EBRACE;
2558	else
2559	*err = REG_BADBR;
2560
2561	return NULL;
2562	}
2563
2564	/ If the syntax bit is set, rollback. /
2565	re_string_set_index (regexp, start_idx);
2566	*token = start_token;
2567	token->type = CHARACTER;
2568	/ mb_partial and word_char bits should be already initialized by*
2569	peek_token. /*
2570	return elem;
2571	}
2572
2573	if (__glibc_unlikely ((end != -`1` && start > end)
2574	\|\| token->type != OP_CLOSE_DUP_NUM))
2575	{
2576	/ First number greater than second. /
2577	*err = REG_BADBR;
2578	return NULL;
2579	}
2580
2581	if (__glibc_unlikely (RE_DUP_MAX < (end == -`1` ? start : end)))
2582	{
2583	*err = REG_ESIZE;
2584	return NULL;
2585	}
2586	}
2587	else
2588	{
2589	start = (token->type == OP_DUP_PLUS) ? `1` : `0`;
2590	end = (token->type == OP_DUP_QUESTION) ? `1` : -`1`;
2591	}
2592
2593	fetch_token (token, regexp, syntax);
2594
2595	if (__glibc_unlikely (elem == NULL))
2596	return NULL;
2597	if (__glibc_unlikely (start == `0` && end == `0`))
2598	{
2599	postorder (elem, free_tree, NULL);
2600	return NULL;
2601	}
2602
2603	/ Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". /
2604	if (__glibc_unlikely (start > `0`))
2605	{
2606	tree = elem;
2607	for (i = `2`; i <= start; ++i)
2608	{
2609	elem = duplicate_tree (elem, dfa);
2610	tree = create_tree (dfa, tree, elem, CONCAT);
2611	if (__glibc_unlikely (elem == NULL \|\| tree == NULL))
2612	goto parse_dup_op_espace;
2613	}
2614
2615	if (start == end)
2616	return tree;
2617
2618	/ Duplicate ELEM before it is marked optional. /
2619	elem = duplicate_tree (elem, dfa);
2620	if (__glibc_unlikely (elem == NULL))
2621	goto parse_dup_op_espace;
2622	old_tree = tree;
2623	}
2624	else
2625	old_tree = NULL;
2626
2627	if (elem->token.type == SUBEXP)
2628	{
2629	uintptr_t subidx = elem->token.opr.idx;
2630	postorder (elem, mark_opt_subexp, (void *) subidx);
2631	}
2632
2633	tree = create_tree (dfa, elem, NULL,
2634	(end == -`1` ? OP_DUP_ASTERISK : OP_ALT));
2635	if (__glibc_unlikely (tree == NULL))
2636	goto parse_dup_op_espace;
2637
2638	/ This loop is actually executed only when end != -1,*
2639	to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2640	already created the start+1-th copy. /*
2641	if (TYPE_SIGNED (Idx) \|\| end != -`1`)
2642	for (i = start + `2`; i <= end; ++i)
2643	{
2644	elem = duplicate_tree (elem, dfa);
2645	tree = create_tree (dfa, tree, elem, CONCAT);
2646	if (__glibc_unlikely (elem == NULL \|\| tree == NULL))
2647	goto parse_dup_op_espace;
2648
2649	tree = create_tree (dfa, tree, NULL, OP_ALT);
2650	if (__glibc_unlikely (tree == NULL))
2651	goto parse_dup_op_espace;
2652	}
2653
2654	if (old_tree)
2655	tree = create_tree (dfa, old_tree, tree, CONCAT);
2656
2657	return tree;
2658
2659	parse_dup_op_espace:
2660	*err = REG_ESPACE;
2661	return NULL;
2662	}
2663
/* Size of the names for collating symbol/equivalence_class/character_class.
   I'm not sure, but maybe enough.  */
#define BRACKET_NAME_BUF_SIZE 32
2667
2668	#ifndef _LIBC
2669
2670	# ifdef RE_ENABLE_I18N
2671	/ Convert the byte B to the corresponding wide character. In a*
2672	unibyte locale, treat B as itself. In a multibyte locale, return
2673	WEOF if B is an encoding error. /*
2674	static wint_t
2675	parse_byte (unsigned char b, re_charset_t *mbcset)
2676	{
2677	return mbcset == NULL ? b : __btowc (b);
2678	}
2679	# endif
2680
2681	/ Local function for parse_bracket_exp only used in case of NOT _LIBC.*
2682	Build the range expression which starts from START_ELEM, and ends
2683	at END_ELEM. The result are written to MBCSET and SBCSET.
2684	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2685	mbcset->range_ends, is a pointer argument since we may
2686	update it. /*
2687
2688	static reg_errcode_t
2689	# ifdef RE_ENABLE_I18N
2690	build_range_exp (const reg_syntax_t syntax,
2691	bitset_t sbcset,
2692	re_charset_t *mbcset,
2693	Idx *range_alloc,
2694	const bracket_elem_t *start_elem,
2695	const bracket_elem_t *end_elem)
2696	# else /* not RE_ENABLE_I18N */
2697	build_range_exp (const reg_syntax_t syntax,
2698	bitset_t sbcset,
2699	const bracket_elem_t *start_elem,
2700	const bracket_elem_t *end_elem)
2701	# endif /* not RE_ENABLE_I18N */
2702	{
2703	unsigned int start_ch, end_ch;
2704	/ Equivalence Classes and Character Classes can't be a range start/end. /
2705	if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2706	\|\| start_elem->type == CHAR_CLASS
2707	\|\| end_elem->type == EQUIV_CLASS
2708	\|\| end_elem->type == CHAR_CLASS))
2709	return REG_ERANGE;
2710
2711	/ We can handle no multi character collating elements without libc*
2712	support. /*
2713	if (__glibc_unlikely ((start_elem->type == COLL_SYM
2714	&& strlen ((char *) start_elem->opr.name) > `1`)
2715	\|\| (end_elem->type == COLL_SYM
2716	&& strlen ((char *) end_elem->opr.name) > `1`)))
2717	return REG_ECOLLATE;
2718
2719	# ifdef RE_ENABLE_I18N
2720	{
2721	wchar_t wc;
2722	wint_t start_wc;
2723	wint_t end_wc;
2724
2725	start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2726	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[`0`]
2727	: `0`));
2728	end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2729	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[`0`]
2730	: `0`));
2731	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
2732	? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2733	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
2734	? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2735	if (start_wc == WEOF \|\| end_wc == WEOF)
2736	return REG_ECOLLATE;
2737	else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2738	&& start_wc > end_wc))
2739	return REG_ERANGE;
2740
2741	/ Got valid collation sequence values, add them as a new entry.*
2742	However, for !_LIBC we have no collation elements: if the
2743	character set is single byte, the single byte character set
2744	that we build below suffices. parse_bracket_exp passes
2745	no MBCSET if dfa->mb_cur_max == 1. /*
2746	if (mbcset)
2747	{
2748	/ Check the space of the arrays. /
2749	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2750	{
2751	/ There is not enough space, need realloc. /
2752	wchar_t new_array_start, new_array_end;
2753	Idx new_nranges;
2754
2755	/ +1 in case of mbcset->nranges is 0. /
2756	new_nranges = `2` * mbcset->nranges + `1`;
2757	/ Use realloc since mbcset->range_starts and mbcset->range_ends*
2758	are NULL if range_alloc == 0. /
2759	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2760	new_nranges);
2761	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2762	new_nranges);
2763
2764	if (__glibc_unlikely (new_array_start == NULL
2765	\|\| new_array_end == NULL))
2766	{
2767	re_free (new_array_start);
2768	re_free (new_array_end);
2769	return REG_ESPACE;
2770	}
2771
2772	mbcset->range_starts = new_array_start;
2773	mbcset->range_ends = new_array_end;
2774	*range_alloc = new_nranges;
2775	}
2776
2777	mbcset->range_starts[mbcset->nranges] = start_wc;
2778	mbcset->range_ends[mbcset->nranges++] = end_wc;
2779	}
2780
2781	/ Build the table for single byte characters. /
2782	for (wc = `0`; wc < SBC_MAX; ++wc)
2783	{
2784	if (start_wc <= wc && wc <= end_wc)
2785	bitset_set (sbcset, wc);
2786	}
2787	}
2788	# else /* not RE_ENABLE_I18N */
2789	{
2790	unsigned int ch;
2791	start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2792	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[`0`]
2793	: `0`));
2794	end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2795	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[`0`]
2796	: `0`));
2797	if (start_ch > end_ch)
2798	return REG_ERANGE;
2799	/ Build the table for single byte characters. /
2800	for (ch = `0`; ch < SBC_MAX; ++ch)
2801	if (start_ch <= ch && ch <= end_ch)
2802	bitset_set (sbcset, ch);
2803	}
2804	# endif /* not RE_ENABLE_I18N */
2805	return REG_NOERROR;
2806	}
2807	#endif /* not _LIBC */
2808
2809	#ifndef _LIBC
2810	/ Helper function for parse_bracket_exp only used in case of NOT _LIBC..*
2811	Build the collating element which is represented by NAME.
2812	The result are written to MBCSET and SBCSET.
2813	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2814	pointer argument since we may update it. /*
2815
2816	static reg_errcode_t
2817	# ifdef RE_ENABLE_I18N
2818	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2819	Idx coll_sym_alloc, const* unsigned char *name)
2820	# else /* not RE_ENABLE_I18N */
2821	build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2822	# endif /* not RE_ENABLE_I18N */
2823	{
2824	size_t name_len = strlen ((const char *) name);
2825	if (__glibc_unlikely (name_len != `1`))
2826	return REG_ECOLLATE;
2827	else
2828	{
2829	bitset_set (sbcset, name[`0`]);
2830	return REG_NOERROR;
2831	}
2832	}
2833	#endif /* not _LIBC */
2834
2835	#ifdef _LIBC
/* Local function for parse_bracket_exp used in _LIBC environment.
   Seek the collating symbol entry corresponding to NAME.
   Return the index of the symbol in the SYMB_TABLE,
   or -1 if not found.

   SYMB_TABLE holds TABLE_SIZE pairs of words; a pair whose first word
   is 0 is skipped, otherwise its second word is an offset into EXTRA.
   At that offset EXTRA holds a length-prefixed string that is skipped,
   followed by a length-prefixed byte sequence that is compared against
   the NAME_LEN bytes of NAME.  */

static inline int32_t
__attribute__ ((always_inline))
seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
                             const int32_t *symb_table, int32_t table_size,
                             const unsigned char *extra)
{
  int32_t elem;

  for (elem = 0; elem < table_size; elem++)
    if (symb_table[2 * elem] != 0)
      {
        int32_t idx = symb_table[2 * elem + 1];
        /* Skip the name of collating element name.  */
        idx += 1 + extra[idx];
        if (/* Compare the length of the name.  */
            name_len == extra[idx]
            /* Compare the name.  */
            && memcmp (name, &extra[idx + 1], name_len) == 0)
          /* Yep, this is the entry.  */
          return elem;
      }
  return -1;
}
2864
2865	/ Local function for parse_bracket_exp used in _LIBC environment.*
2866	Look up the collation sequence value of BR_ELEM.
2867	Return the value if succeeded, UINT_MAX otherwise. /*
2868
2869	static inline unsigned int
2870	__attribute__ ((always_inline))
2871	lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
2872	const unsigned char *collseqmb,
2873	const char *collseqwc, int32_t table_size,
2874	const int32_t *symb_table,
2875	const unsigned char *extra)
2876	{
2877	if (br_elem->type == SB_CHAR)
2878	{
2879	/ if (MB_CUR_MAX == 1) /
2880	if (nrules == `0`)
2881	return collseqmb[br_elem->opr.ch];
2882	else
2883	{
2884	wint_t wc = __btowc (br_elem->opr.ch);
2885	return __collseq_table_lookup (collseqwc, wc);
2886	}
2887	}
2888	else if (br_elem->type == MB_CHAR)
2889	{
2890	if (nrules != `0`)
2891	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2892	}
2893	else if (br_elem->type == COLL_SYM)
2894	{
2895	size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2896	if (nrules != `0`)
2897	{
2898	int32_t elem, idx;
2899	elem = seek_collating_symbol_entry (br_elem->opr.name,
2900	sym_name_len,
2901	symb_table, table_size,
2902	extra);
2903	if (elem != -`1`)
2904	{
2905	/ We found the entry. /
2906	idx = symb_table[`2` * elem + `1`];
2907	/ Skip the name of collating element name. /
2908	idx += `1` + extra[idx];
2909	/ Skip the byte sequence of the collating element. /
2910	idx += `1` + extra[idx];
2911	/ Adjust for the alignment. /
2912	idx = (idx + `3`) & ~`3`;
2913	/ Skip the multibyte collation sequence value. /
2914	idx += sizeof (unsigned int);
2915	/ Skip the wide char sequence of the collating element. /
2916	idx += sizeof (unsigned int) *
2917	(`1` + (unsigned* int *) (extra + idx));
2918	/ Return the collation sequence value. /
2919	return (unsigned* int *) (extra + idx);
2920	}
2921	else if (sym_name_len == `1`)
2922	{
2923	/ No valid character. Match it as a single byte*
2924	character. /*
2925	return collseqmb[br_elem->opr.name[`0`]];
2926	}
2927	}
2928	else if (sym_name_len == `1`)
2929	return collseqmb[br_elem->opr.name[`0`]];
2930	}
2931	return UINT_MAX;
2932	}
2933
2934	/ Local function for parse_bracket_exp used in _LIBC environment.*
2935	Build the range expression which starts from START_ELEM, and ends
2936	at END_ELEM. The result are written to MBCSET and SBCSET.
2937	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2938	mbcset->range_ends, is a pointer argument since we may
2939	update it. /*
2940
2941	static inline reg_errcode_t
2942	__attribute__ ((always_inline))
2943	build_range_exp (bitset_t sbcset, re_charset_t mbcset, Idx range_alloc,
2944	bracket_elem_t start_elem, bracket_elem_t end_elem,
2945	re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
2946	const unsigned char collseqmb, const* char *collseqwc,
2947	int32_t table_size, const int32_t *symb_table,
2948	const unsigned char *extra)
2949	{
2950	unsigned int ch;
2951	uint32_t start_collseq;
2952	uint32_t end_collseq;
2953
2954	/ Equivalence Classes and Character Classes can't be a range*
2955	start/end. /*
2956	if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2957	\|\| start_elem->type == CHAR_CLASS
2958	\|\| end_elem->type == EQUIV_CLASS
2959	\|\| end_elem->type == CHAR_CLASS))
2960	return REG_ERANGE;
2961
2962	/ FIXME: Implement rational ranges here, too. /
2963	start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
2964	table_size, symb_table, extra);
2965	end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
2966	table_size, symb_table, extra);
2967	/ Check start/end collation sequence values. /
2968	if (__glibc_unlikely (start_collseq == UINT_MAX
2969	\|\| end_collseq == UINT_MAX))
2970	return REG_ECOLLATE;
2971	if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2972	&& start_collseq > end_collseq))
2973	return REG_ERANGE;
2974
2975	/ Got valid collation sequence values, add them as a new entry.*
2976	However, if we have no collation elements, and the character set
2977	is single byte, the single byte character set that we
2978	build below suffices. /*
2979	if (nrules > `0` \|\| dfa->mb_cur_max > `1`)
2980	{
2981	/ Check the space of the arrays. /
2982	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2983	{
2984	/ There is not enough space, need realloc. /
2985	uint32_t *new_array_start;
2986	uint32_t *new_array_end;
2987	Idx new_nranges;
2988
2989	/ +1 in case of mbcset->nranges is 0. /
2990	new_nranges = `2` * mbcset->nranges + `1`;
2991	new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2992	new_nranges);
2993	new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2994	new_nranges);
2995
2996	if (__glibc_unlikely (new_array_start == NULL
2997	\|\| new_array_end == NULL))
2998	return REG_ESPACE;
2999
3000	mbcset->range_starts = new_array_start;
3001	mbcset->range_ends = new_array_end;
3002	*range_alloc = new_nranges;
3003	}
3004
3005	mbcset->range_starts[mbcset->nranges] = start_collseq;
3006	mbcset->range_ends[mbcset->nranges++] = end_collseq;
3007	}
3008
3009	/ Build the table for single byte characters. /
3010	for (ch = `0`; ch < SBC_MAX; ch++)
3011	{
3012	uint32_t ch_collseq;
3013	/ if (MB_CUR_MAX == 1) /
3014	if (nrules == `0`)
3015	ch_collseq = collseqmb[ch];
3016	else
3017	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3018	if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3019	bitset_set (sbcset, ch);
3020	}
3021	return REG_NOERROR;
3022	}
3023
3024	/ Local function for parse_bracket_exp used in _LIBC environment.*
3025	Build the collating element which is represented by NAME.
3026	The result are written to MBCSET and SBCSET.
3027	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3028	pointer argument since we may update it. /*
3029
3030	static inline reg_errcode_t
3031	__attribute__ ((always_inline))
3032	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3033	Idx coll_sym_alloc, const* unsigned char *name,
3034	uint32_t nrules, int32_t table_size,
3035	const int32_t symb_table, const* unsigned char *extra)
3036	{
3037	int32_t elem, idx;
3038	size_t name_len = strlen ((const char *) name);
3039	if (nrules != `0`)
3040	{
3041	elem = seek_collating_symbol_entry (name, name_len, symb_table,
3042	table_size, extra);
3043	if (elem != -`1`)
3044	{
3045	/ We found the entry. /
3046	idx = symb_table[`2` * elem + `1`];
3047	/ Skip the name of collating element name. /
3048	idx += `1` + extra[idx];
3049	}
3050	else if (name_len == `1`)
3051	{
3052	/ No valid character, treat it as a normal*
3053	character. /*
3054	bitset_set (sbcset, name[`0`]);
3055	return REG_NOERROR;
3056	}
3057	else
3058	return REG_ECOLLATE;
3059
3060	/ Got valid collation sequence, add it as a new entry. /
3061	/ Check the space of the arrays. /
3062	if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3063	{
3064	/ Not enough, realloc it. /
3065	/ +1 in case of mbcset->ncoll_syms is 0. /
3066	Idx new_coll_sym_alloc = `2` * mbcset->ncoll_syms + `1`;
3067	/ Use realloc since mbcset->coll_syms is NULL*
3068	if alloc == 0. /
3069	int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3070	new_coll_sym_alloc);
3071	if (__glibc_unlikely (new_coll_syms == NULL))
3072	return REG_ESPACE;
3073	mbcset->coll_syms = new_coll_syms;
3074	*coll_sym_alloc = new_coll_sym_alloc;
3075	}
3076	mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3077	return REG_NOERROR;
3078	}
3079	else
3080	{
3081	if (__glibc_unlikely (name_len != `1`))
3082	return REG_ECOLLATE;
3083	else
3084	{
3085	bitset_set (sbcset, name[`0`]);
3086	return REG_NOERROR;
3087	}
3088	}
3089	}
3090	#endif /* _LIBC */
3091
3092	/ This function parse bracket expression like "[abc]", "[a-c]",*
3093	"[[.a-a.]]" etc. /*
3094
3095	static bin_tree_t *
3096	parse_bracket_exp (re_string_t regexp, re_dfa_t dfa, re_token_t *token,
3097	reg_syntax_t syntax, reg_errcode_t *err)
3098	{
3099	#ifdef _LIBC
3100	const unsigned char *collseqmb;
3101	const char *collseqwc = NULL;
3102	uint32_t nrules;
3103	int32_t table_size = `0`;
3104	const int32_t *symb_table = NULL;
3105	const unsigned char *extra = NULL;
3106	#endif
3107
3108	re_token_t br_token;
3109	re_bitset_ptr_t sbcset;
3110	#ifdef RE_ENABLE_I18N
3111	re_charset_t *mbcset;
3112	Idx coll_sym_alloc = `0`, range_alloc = `0`, mbchar_alloc = `0`;
3113	Idx equiv_class_alloc = `0`, char_class_alloc = `0`;
3114	#endif /* not RE_ENABLE_I18N */
3115	bool non_match = false;
3116	bin_tree_t *work_tree;
3117	int token_len;
3118	bool first_round = true;
3119	#ifdef _LIBC
3120	collseqmb = (const unsigned char *)
3121	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3122	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3123	if (nrules)
3124	{
3125	/*
3126	if (MB_CUR_MAX > 1)
3127	*/
3128	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3129	table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3130	symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3131	_NL_COLLATE_SYMB_TABLEMB);
3132	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3133	_NL_COLLATE_SYMB_EXTRAMB);
3134	}
3135	#endif
3136	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), `1`);
3137	#ifdef RE_ENABLE_I18N
3138	mbcset = (re_charset_t ) calloc (sizeof* (re_charset_t), `1`);
3139	#endif /* RE_ENABLE_I18N */
3140	#ifdef RE_ENABLE_I18N
3141	if (__glibc_unlikely (sbcset == NULL \|\| mbcset == NULL))
3142	#else
3143	if (__glibc_unlikely (sbcset == NULL))
3144	#endif /* RE_ENABLE_I18N */
3145	{
3146	re_free (sbcset);
3147	#ifdef RE_ENABLE_I18N
3148	re_free (mbcset);
3149	#endif
3150	*err = REG_ESPACE;
3151	return NULL;
3152	}
3153
3154	token_len = peek_token_bracket (token, regexp, syntax);
3155	if (__glibc_unlikely (token->type == END_OF_RE))
3156	{
3157	*err = REG_BADPAT;
3158	goto parse_bracket_exp_free_return;
3159	}
3160	if (token->type == OP_NON_MATCH_LIST)
3161	{
3162	#ifdef RE_ENABLE_I18N
3163	mbcset->non_match = `1`;
3164	#endif /* not RE_ENABLE_I18N */
3165	non_match = true;
3166	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3167	bitset_set (sbcset, `'\n'`);
3168	re_string_skip_bytes (regexp, token_len); / Skip a token. /
3169	token_len = peek_token_bracket (token, regexp, syntax);
3170	if (__glibc_unlikely (token->type == END_OF_RE))
3171	{
3172	*err = REG_BADPAT;
3173	goto parse_bracket_exp_free_return;
3174	}
3175	}
3176
3177	/ We treat the first ']' as a normal character. /
3178	if (token->type == OP_CLOSE_BRACKET)
3179	token->type = CHARACTER;
3180
3181	while (`1`)
3182	{
3183	bracket_elem_t start_elem, end_elem;
3184	unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3185	unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3186	reg_errcode_t ret;
3187	int token_len2 = `0`;
3188	bool is_range_exp = false;
3189	re_token_t token2;
3190
3191	start_elem.opr.name = start_name_buf;
3192	start_elem.type = COLL_SYM;
3193	ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3194	syntax, first_round);
3195	if (__glibc_unlikely (ret != REG_NOERROR))
3196	{
3197	*err = ret;
3198	goto parse_bracket_exp_free_return;
3199	}
3200	first_round = false;
3201
3202	/ Get information about the next token. We need it in any case. /
3203	token_len = peek_token_bracket (token, regexp, syntax);
3204
3205	/ Do not check for ranges if we know they are not allowed. /
3206	if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3207	{
3208	if (__glibc_unlikely (token->type == END_OF_RE))
3209	{
3210	*err = REG_EBRACK;
3211	goto parse_bracket_exp_free_return;
3212	}
3213	if (token->type == OP_CHARSET_RANGE)
3214	{
3215	re_string_skip_bytes (regexp, token_len); / Skip '-'. /
3216	token_len2 = peek_token_bracket (&token2, regexp, syntax);
3217	if (__glibc_unlikely (token2.type == END_OF_RE))
3218	{
3219	*err = REG_EBRACK;
3220	goto parse_bracket_exp_free_return;
3221	}
3222	if (token2.type == OP_CLOSE_BRACKET)
3223	{
3224	/ We treat the last '-' as a normal character. /
3225	re_string_skip_bytes (regexp, -token_len);
3226	token->type = CHARACTER;
3227	}
3228	else
3229	is_range_exp = true;
3230	}
3231	}
3232
3233	if (is_range_exp == true)
3234	{
3235	end_elem.opr.name = end_name_buf;
3236	end_elem.type = COLL_SYM;
3237	ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3238	dfa, syntax, true);
3239	if (__glibc_unlikely (ret != REG_NOERROR))
3240	{
3241	*err = ret;
3242	goto parse_bracket_exp_free_return;
3243	}
3244
3245	token_len = peek_token_bracket (token, regexp, syntax);
3246
3247	#ifdef _LIBC
3248	*err = build_range_exp (sbcset, mbcset, &range_alloc,
3249	&start_elem, &end_elem,
3250	dfa, syntax, nrules, collseqmb, collseqwc,
3251	table_size, symb_table, extra);
3252	#else
3253	# ifdef RE_ENABLE_I18N
3254	*err = build_range_exp (syntax, sbcset,
3255	dfa->mb_cur_max > `1` ? mbcset : NULL,
3256	&range_alloc, &start_elem, &end_elem);
3257	# else
3258	*err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3259	# endif
3260	#endif /* RE_ENABLE_I18N */
3261	if (__glibc_unlikely (*err != REG_NOERROR))
3262	goto parse_bracket_exp_free_return;
3263	}
3264	else
3265	{
3266	switch (start_elem.type)
3267	{
3268	case SB_CHAR:
3269	bitset_set (sbcset, start_elem.opr.ch);
3270	break;
3271	#ifdef RE_ENABLE_I18N
3272	case MB_CHAR:
3273	/ Check whether the array has enough space. /
3274	if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3275	{
3276	wchar_t *new_mbchars;
3277	/ Not enough, realloc it. /
3278	/ +1 in case of mbcset->nmbchars is 0. /
3279	mbchar_alloc = `2` * mbcset->nmbchars + `1`;
3280	/ Use realloc since array is NULL if alloc == 0. /*
3281	new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3282	mbchar_alloc);
3283	if (__glibc_unlikely (new_mbchars == NULL))
3284	goto parse_bracket_exp_espace;
3285	mbcset->mbchars = new_mbchars;
3286	}
3287	mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3288	break;
3289	#endif /* RE_ENABLE_I18N */
3290	case EQUIV_CLASS:
3291	*err = build_equiv_class (sbcset,
3292	#ifdef RE_ENABLE_I18N
3293	mbcset, &equiv_class_alloc,
3294	#endif /* RE_ENABLE_I18N */
3295	start_elem.opr.name);
3296	if (__glibc_unlikely (*err != REG_NOERROR))
3297	goto parse_bracket_exp_free_return;
3298	break;
3299	case COLL_SYM:
3300	*err = build_collating_symbol (sbcset,
3301	#ifdef RE_ENABLE_I18N
3302	mbcset, &coll_sym_alloc,
3303	#endif /* RE_ENABLE_I18N */
3304	start_elem.opr.name,
3305	nrules, table_size, symb_table, extra);
3306	if (__glibc_unlikely (*err != REG_NOERROR))
3307	goto parse_bracket_exp_free_return;
3308	break;
3309	case CHAR_CLASS:
3310	*err = build_charclass (regexp->trans, sbcset,
3311	#ifdef RE_ENABLE_I18N
3312	mbcset, &char_class_alloc,
3313	#endif /* RE_ENABLE_I18N */
3314	(const char *) start_elem.opr.name,
3315	syntax);
3316	if (__glibc_unlikely (*err != REG_NOERROR))
3317	goto parse_bracket_exp_free_return;
3318	break;
3319	default:
3320	DEBUG_ASSERT (false);
3321	break;
3322	}
3323	}
3324	if (__glibc_unlikely (token->type == END_OF_RE))
3325	{
3326	*err = REG_EBRACK;
3327	goto parse_bracket_exp_free_return;
3328	}
3329	if (token->type == OP_CLOSE_BRACKET)
3330	break;
3331	}
3332
3333	re_string_skip_bytes (regexp, token_len); / Skip a token. /
3334
3335	/ If it is non-matching list. /
3336	if (non_match)
3337	bitset_not (sbcset);
3338
3339	#ifdef RE_ENABLE_I18N
3340	/ Ensure only single byte characters are set. /
3341	if (dfa->mb_cur_max > `1`)
3342	bitset_mask (sbcset, dfa->sb_char);
3343
3344	if (mbcset->nmbchars \|\| mbcset->ncoll_syms \|\| mbcset->nequiv_classes
3345	\|\| mbcset->nranges \|\| (dfa->mb_cur_max > `1` && (mbcset->nchar_classes
3346	\|\| mbcset->non_match)))
3347	{
3348	bin_tree_t *mbc_tree;
3349	int sbc_idx;
3350	/ Build a tree for complex bracket. /
3351	dfa->has_mb_node = `1`;
3352	br_token.type = COMPLEX_BRACKET;
3353	br_token.opr.mbcset = mbcset;
3354	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3355	if (__glibc_unlikely (mbc_tree == NULL))
3356	goto parse_bracket_exp_espace;
3357	for (sbc_idx = `0`; sbc_idx < BITSET_WORDS; ++sbc_idx)
3358	if (sbcset[sbc_idx])
3359	break;
3360	/ If there are no bits set in sbcset, there is no point*
3361	of having both SIMPLE_BRACKET and COMPLEX_BRACKET. /*
3362	if (sbc_idx < BITSET_WORDS)
3363	{
3364	/ Build a tree for simple bracket. /
3365	br_token.type = SIMPLE_BRACKET;
3366	br_token.opr.sbcset = sbcset;
3367	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3368	if (__glibc_unlikely (work_tree == NULL))
3369	goto parse_bracket_exp_espace;
3370
3371	/ Then join them by ALT node. /
3372	work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3373	if (__glibc_unlikely (work_tree == NULL))
3374	goto parse_bracket_exp_espace;
3375	}
3376	else
3377	{
3378	re_free (sbcset);
3379	work_tree = mbc_tree;
3380	}
3381	}
3382	else
3383	#endif /* not RE_ENABLE_I18N */
3384	{
3385	#ifdef RE_ENABLE_I18N
3386	free_charset (mbcset);
3387	#endif
3388	/ Build a tree for simple bracket. /
3389	br_token.type = SIMPLE_BRACKET;
3390	br_token.opr.sbcset = sbcset;
3391	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3392	if (__glibc_unlikely (work_tree == NULL))
3393	goto parse_bracket_exp_espace;
3394	}
3395	return work_tree;
3396
3397	parse_bracket_exp_espace:
3398	*err = REG_ESPACE;
3399	parse_bracket_exp_free_return:
3400	re_free (sbcset);
3401	#ifdef RE_ENABLE_I18N
3402	free_charset (mbcset);
3403	#endif /* RE_ENABLE_I18N */
3404	return NULL;
3405	}
3406
3407	/ Parse an element in the bracket expression. /
3408
3409	static reg_errcode_t
3410	parse_bracket_element (bracket_elem_t elem, re_string_t regexp,
3411	re_token_t token, int* token_len, re_dfa_t *dfa,
3412	reg_syntax_t syntax, bool accept_hyphen)
3413	{
3414	#ifdef RE_ENABLE_I18N
3415	int cur_char_size;
3416	cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3417	if (cur_char_size > `1`)
3418	{
3419	elem->type = MB_CHAR;
3420	elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3421	re_string_skip_bytes (regexp, cur_char_size);
3422	return REG_NOERROR;
3423	}
3424	#endif /* RE_ENABLE_I18N */
3425	re_string_skip_bytes (regexp, token_len); / Skip a token. /
3426	if (token->type == OP_OPEN_COLL_ELEM \|\| token->type == OP_OPEN_CHAR_CLASS
3427	\|\| token->type == OP_OPEN_EQUIV_CLASS)
3428	return parse_bracket_symbol (elem, regexp, token);
3429	if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3430	{
3431	/ A '-' must only appear as anything but a range indicator before*
3432	the closing bracket. Everything else is an error. /*
3433	re_token_t token2;
3434	(void) peek_token_bracket (&token2, regexp, syntax);
3435	if (token2.type != OP_CLOSE_BRACKET)
3436	/ The actual error value is not standardized since this whole*
3437	case is undefined. But ERANGE makes good sense. /*
3438	return REG_ERANGE;
3439	}
3440	elem->type = SB_CHAR;
3441	elem->opr.ch = token->opr.c;
3442	return REG_NOERROR;
3443	}
3444
3445	/ Parse a bracket symbol in the bracket expression. Bracket symbols are*
3446	such as [:<character_class>:], [.<collating_element>.], and
3447	[=<equivalent_class>=]. /*
3448
3449	static reg_errcode_t
3450	parse_bracket_symbol (bracket_elem_t elem, re_string_t regexp,
3451	re_token_t *token)
3452	{
3453	unsigned char ch, delim = token->opr.c;
3454	int i = `0`;
3455	if (re_string_eoi(regexp))
3456	return REG_EBRACK;
3457	for (;; ++i)
3458	{
3459	if (i >= BRACKET_NAME_BUF_SIZE)
3460	return REG_EBRACK;
3461	if (token->type == OP_OPEN_CHAR_CLASS)
3462	ch = re_string_fetch_byte_case (regexp);
3463	else
3464	ch = re_string_fetch_byte (regexp);
3465	if (re_string_eoi(regexp))
3466	return REG_EBRACK;
3467	if (ch == delim && re_string_peek_byte (regexp, `0`) == `']'`)
3468	break;
3469	elem->opr.name[i] = ch;
3470	}
3471	re_string_skip_bytes (regexp, `1`);
3472	elem->opr.name[i] = `'\0'`;
3473	switch (token->type)
3474	{
3475	case OP_OPEN_COLL_ELEM:
3476	elem->type = COLL_SYM;
3477	break;
3478	case OP_OPEN_EQUIV_CLASS:
3479	elem->type = EQUIV_CLASS;
3480	break;
3481	case OP_OPEN_CHAR_CLASS:
3482	elem->type = CHAR_CLASS;
3483	break;
3484	default:
3485	break;
3486	}
3487	return REG_NOERROR;
3488	}
3489
3490	/ Helper function for parse_bracket_exp.*
3491	Build the equivalence class which is represented by NAME.
3492	The result are written to MBCSET and SBCSET.
3493	EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3494	is a pointer argument since we may update it. /*
3495
3496	static reg_errcode_t
3497	#ifdef RE_ENABLE_I18N
3498	build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3499	Idx equiv_class_alloc, const* unsigned char *name)
3500	#else /* not RE_ENABLE_I18N */
3501	build_equiv_class (bitset_t sbcset, const unsigned char *name)
3502	#endif /* not RE_ENABLE_I18N */
3503	{
3504	#ifdef _LIBC
3505	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3506	if (nrules != `0`)
3507	{
3508	const int32_t table, indirect;
3509	const unsigned char weights, extra, *cp;
3510	unsigned char char_buf[`2`];
3511	int32_t idx1, idx2;
3512	unsigned int ch;
3513	size_t len;
3514	/ Calculate the index for equivalence class. /
3515	cp = name;
3516	table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3517	weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3518	_NL_COLLATE_WEIGHTMB);
3519	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3520	_NL_COLLATE_EXTRAMB);
3521	indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3522	_NL_COLLATE_INDIRECTMB);
3523	idx1 = findidx (table, indirect, extra, &cp, -`1`);
3524	if (__glibc_unlikely (idx1 == `0` \|\| *cp != `'\0'`))
3525	/ This isn't a valid character. /
3526	return REG_ECOLLATE;
3527
3528	/ Build single byte matching table for this equivalence class. /
3529	len = weights[idx1 & `0xffffff`];
3530	for (ch = `0`; ch < SBC_MAX; ++ch)
3531	{
3532	char_buf[`0`] = ch;
3533	cp = char_buf;
3534	idx2 = findidx (table, indirect, extra, &cp, `1`);
3535	/*
3536	idx2 = table[ch];
3537	*/
3538	if (idx2 == `0`)
3539	/ This isn't a valid character. /
3540	continue;
3541	/ Compare only if the length matches and the collation rule*
3542	index is the same. /*
3543	if (len == weights[idx2 & `0xffffff`] && (idx1 >> `24`) == (idx2 >> `24`)
3544	&& memcmp (weights + (idx1 & `0xffffff`) + `1`,
3545	weights + (idx2 & `0xffffff`) + `1`, len) == `0`)
3546	bitset_set (sbcset, ch);
3547	}
3548	/ Check whether the array has enough space. /
3549	if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3550	{
3551	/ Not enough, realloc it. /
3552	/ +1 in case of mbcset->nequiv_classes is 0. /
3553	Idx new_equiv_class_alloc = `2` * mbcset->nequiv_classes + `1`;
3554	/ Use realloc since the array is NULL if alloc == 0. /*
3555	int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3556	int32_t,
3557	new_equiv_class_alloc);
3558	if (__glibc_unlikely (new_equiv_classes == NULL))
3559	return REG_ESPACE;
3560	mbcset->equiv_classes = new_equiv_classes;
3561	*equiv_class_alloc = new_equiv_class_alloc;
3562	}
3563	mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3564	}
3565	else
3566	#endif /* _LIBC */
3567	{
3568	if (__glibc_unlikely (strlen ((const char *) name) != `1`))
3569	return REG_ECOLLATE;
3570	bitset_set (sbcset, *name);
3571	}
3572	return REG_NOERROR;
3573	}
3574
3575	/ Helper function for parse_bracket_exp.*
3576	Build the character class which is represented by NAME.
3577	The result are written to MBCSET and SBCSET.
3578	CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3579	is a pointer argument since we may update it. /*
3580
3581	static reg_errcode_t
3582	#ifdef RE_ENABLE_I18N
3583	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3584	re_charset_t mbcset, Idx char_class_alloc,
3585	const char *class_name, reg_syntax_t syntax)
3586	#else /* not RE_ENABLE_I18N */
3587	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3588	const char *class_name, reg_syntax_t syntax)
3589	#endif /* not RE_ENABLE_I18N */
3590	{
3591	int i;
3592	const char *name = class_name;
3593
3594	/ In case of REG_ICASE "upper" and "lower" match the both of*
3595	upper and lower cases. /*
3596	if ((syntax & RE_ICASE)
3597	&& (strcmp (name, "upper") == `0` \|\| strcmp (name, "lower") == `0`))
3598	name = "alpha";
3599
3600	#ifdef RE_ENABLE_I18N
3601	/ Check the space of the arrays. /
3602	if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3603	{
3604	/ Not enough, realloc it. /
3605	/ +1 in case of mbcset->nchar_classes is 0. /
3606	Idx new_char_class_alloc = `2` * mbcset->nchar_classes + `1`;
3607	/ Use realloc since array is NULL if alloc == 0. /*
3608	wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3609	new_char_class_alloc);
3610	if (__glibc_unlikely (new_char_classes == NULL))
3611	return REG_ESPACE;
3612	mbcset->char_classes = new_char_classes;
3613	*char_class_alloc = new_char_class_alloc;
3614	}
3615	mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3616	#endif /* RE_ENABLE_I18N */
3617
3618	#define BUILD_CHARCLASS_LOOP(ctype_func) \
3619	do { \
3620	if (__glibc_unlikely (trans != NULL)) \
3621	{ \
3622	for (i = 0; i < SBC_MAX; ++i) \
3623	if (ctype_func (i)) \
3624	bitset_set (sbcset, trans[i]); \
3625	} \
3626	else \
3627	{ \
3628	for (i = 0; i < SBC_MAX; ++i) \
3629	if (ctype_func (i)) \
3630	bitset_set (sbcset, i); \
3631	} \
3632	} while (0)
3633
3634	if (strcmp (name, "alnum") == `0`)
3635	BUILD_CHARCLASS_LOOP (isalnum);
3636	else if (strcmp (name, "cntrl") == `0`)
3637	BUILD_CHARCLASS_LOOP (iscntrl);
3638	else if (strcmp (name, "lower") == `0`)
3639	BUILD_CHARCLASS_LOOP (islower);
3640	else if (strcmp (name, "space") == `0`)
3641	BUILD_CHARCLASS_LOOP (isspace);
3642	else if (strcmp (name, "alpha") == `0`)
3643	BUILD_CHARCLASS_LOOP (isalpha);
3644	else if (strcmp (name, "digit") == `0`)
3645	BUILD_CHARCLASS_LOOP (isdigit);
3646	else if (strcmp (name, "print") == `0`)
3647	BUILD_CHARCLASS_LOOP (isprint);
3648	else if (strcmp (name, "upper") == `0`)
3649	BUILD_CHARCLASS_LOOP (isupper);
3650	else if (strcmp (name, "blank") == `0`)
3651	BUILD_CHARCLASS_LOOP (isblank);
3652	else if (strcmp (name, "graph") == `0`)
3653	BUILD_CHARCLASS_LOOP (isgraph);
3654	else if (strcmp (name, "punct") == `0`)
3655	BUILD_CHARCLASS_LOOP (ispunct);
3656	else if (strcmp (name, "xdigit") == `0`)
3657	BUILD_CHARCLASS_LOOP (isxdigit);
3658	else
3659	return REG_ECTYPE;
3660
3661	return REG_NOERROR;
3662	}
3663
3664	static bin_tree_t *
3665	build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3666	const char *class_name,
3667	const char *extra, bool non_match,
3668	reg_errcode_t *err)
3669	{
3670	re_bitset_ptr_t sbcset;
3671	#ifdef RE_ENABLE_I18N
3672	re_charset_t *mbcset;
3673	Idx alloc = `0`;
3674	#endif /* not RE_ENABLE_I18N */
3675	reg_errcode_t ret;
3676	bin_tree_t *tree;
3677
3678	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), `1`);
3679	if (__glibc_unlikely (sbcset == NULL))
3680	{
3681	*err = REG_ESPACE;
3682	return NULL;
3683	}
3684	#ifdef RE_ENABLE_I18N
3685	mbcset = (re_charset_t ) calloc (sizeof* (re_charset_t), `1`);
3686	if (__glibc_unlikely (mbcset == NULL))
3687	{
3688	re_free (sbcset);
3689	*err = REG_ESPACE;
3690	return NULL;
3691	}
3692	mbcset->non_match = non_match;
3693	#endif /* RE_ENABLE_I18N */
3694
3695	/ We don't care the syntax in this case. /
3696	ret = build_charclass (trans, sbcset,
3697	#ifdef RE_ENABLE_I18N
3698	mbcset, &alloc,
3699	#endif /* RE_ENABLE_I18N */
3700	class_name, `0`);
3701
3702	if (__glibc_unlikely (ret != REG_NOERROR))
3703	{
3704	re_free (sbcset);
3705	#ifdef RE_ENABLE_I18N
3706	free_charset (mbcset);
3707	#endif /* RE_ENABLE_I18N */
3708	*err = ret;
3709	return NULL;
3710	}
3711	/ \w match '_' also. /
3712	for (; *extra; extra++)
3713	bitset_set (sbcset, *extra);
3714
3715	/ If it is non-matching list. /
3716	if (non_match)
3717	bitset_not (sbcset);
3718
3719	#ifdef RE_ENABLE_I18N
3720	/ Ensure only single byte characters are set. /
3721	if (dfa->mb_cur_max > `1`)
3722	bitset_mask (sbcset, dfa->sb_char);
3723	#endif
3724
3725	/ Build a tree for simple bracket. /
3726	re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3727	tree = create_token_tree (dfa, NULL, NULL, &br_token);
3728	if (__glibc_unlikely (tree == NULL))
3729	goto build_word_op_espace;
3730
3731	#ifdef RE_ENABLE_I18N
3732	if (dfa->mb_cur_max > `1`)
3733	{
3734	bin_tree_t *mbc_tree;
3735	/ Build a tree for complex bracket. /
3736	br_token.type = COMPLEX_BRACKET;
3737	br_token.opr.mbcset = mbcset;
3738	dfa->has_mb_node = `1`;
3739	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3740	if (__glibc_unlikely (mbc_tree == NULL))
3741	goto build_word_op_espace;
3742	/ Then join them by ALT node. /
3743	tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3744	if (__glibc_likely (mbc_tree != NULL))
3745	return tree;
3746	}
3747	else
3748	{
3749	free_charset (mbcset);
3750	return tree;
3751	}
3752	#else /* not RE_ENABLE_I18N */
3753	return tree;
3754	#endif /* not RE_ENABLE_I18N */
3755
3756	build_word_op_espace:
3757	re_free (sbcset);
3758	#ifdef RE_ENABLE_I18N
3759	free_charset (mbcset);
3760	#endif /* RE_ENABLE_I18N */
3761	*err = REG_ESPACE;
3762	return NULL;
3763	}
3764
3765	/ This is intended for the expressions like "a{1,3}".*
3766	Fetch a number from 'input', and return the number.
3767	Return -1 if the number field is empty like "{,1}".
3768	Return RE_DUP_MAX + 1 if the number field is too large.
3769	Return -2 if an error occurred. /*
3770
3771	static Idx
3772	fetch_number (re_string_t input, re_token_t token, reg_syntax_t syntax)
3773	{
3774	Idx num = -`1`;
3775	unsigned char c;
3776	while (`1`)
3777	{
3778	fetch_token (token, input, syntax);
3779	c = token->opr.c;
3780	if (__glibc_unlikely (token->type == END_OF_RE))
3781	return -`2`;
3782	if (token->type == OP_CLOSE_DUP_NUM \|\| c == `','`)
3783	break;
3784	num = ((token->type != CHARACTER \|\| c < `'0'` \|\| `'9'` < c \|\| num == -`2`)
3785	? -`2`
3786	: num == -`1`
3787	? c - `'0'`
3788	: MIN (RE_DUP_MAX + `1`, num * `10` + c - `'0'`));
3789	}
3790	return num;
3791	}
3792
#ifdef RE_ENABLE_I18N
/* Free the multibyte character set CSET together with every array it
   owns.  CSET must not be NULL, since its members are read
   unconditionally.  */
static void
free_charset (re_charset_t *cset)
{
  re_free (cset->mbchars);
# ifdef _LIBC
  re_free (cset->coll_syms);
  re_free (cset->equiv_classes);
# endif
  re_free (cset->range_starts);
  re_free (cset->range_ends);
  re_free (cset->char_classes);
  re_free (cset);
}
#endif /* RE_ENABLE_I18N */
3808
3809	/ Functions for binary tree operation. /
3810
3811	/ Create a tree node. /
3812
3813	static bin_tree_t *
3814	create_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
3815	re_token_type_t type)
3816	{
3817	re_token_t t = { .type = type };
3818	return create_token_tree (dfa, left, right, &t);
3819	}
3820
3821	static bin_tree_t *
3822	create_token_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
3823	const re_token_t *token)
3824	{
3825	bin_tree_t *tree;
3826	if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3827	{
3828	bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, `1`);
3829
3830	if (storage == NULL)
3831	return NULL;
3832	storage->next = dfa->str_tree_storage;
3833	dfa->str_tree_storage = storage;
3834	dfa->str_tree_storage_idx = `0`;
3835	}
3836	tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3837
3838	tree->parent = NULL;
3839	tree->left = left;
3840	tree->right = right;
3841	tree->token = *token;
3842	tree->token.duplicated = `0`;
3843	tree->token.opt_subexp = `0`;
3844	tree->first = NULL;
3845	tree->next = NULL;
3846	tree->node_idx = -`1`;
3847
3848	if (left != NULL)
3849	left->parent = tree;
3850	if (right != NULL)
3851	right->parent = tree;
3852	return tree;
3853	}
3854
3855	/ Mark the tree SRC as an optional subexpression.*
3856	To be called from preorder or postorder. /*
3857
3858	static reg_errcode_t
3859	mark_opt_subexp (void extra, bin_tree_t node)
3860	{
3861	Idx idx = (uintptr_t) extra;
3862	if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3863	node->token.opt_subexp = `1`;
3864
3865	return REG_NOERROR;
3866	}
3867
3868	/ Free the allocated memory inside NODE. /
3869
3870	static void
3871	free_token (re_token_t *node)
3872	{
3873	#ifdef RE_ENABLE_I18N
3874	if (node->type == COMPLEX_BRACKET && node->duplicated == `0`)
3875	free_charset (node->opr.mbcset);
3876	else
3877	#endif /* RE_ENABLE_I18N */
3878	if (node->type == SIMPLE_BRACKET && node->duplicated == `0`)
3879	re_free (node->opr.sbcset);
3880	}
3881
3882	/ Worker function for tree walking. Free the allocated memory inside NODE*
3883	and its children. /*
3884
3885	static reg_errcode_t
3886	free_tree (void extra, bin_tree_t node)
3887	{
3888	free_token (&node->token);
3889	return REG_NOERROR;
3890	}
3891
3892
3893	/ Duplicate the node SRC, and return new node. This is a preorder*
3894	visit similar to the one implemented by the generic visitor, but
3895	we need more infrastructure to maintain two parallel trees --- so,
3896	it's easier to duplicate. /*
3897
3898	static bin_tree_t *
3899	duplicate_tree (const bin_tree_t root, re_dfa_t dfa)
3900	{
3901	const bin_tree_t *node;
3902	bin_tree_t *dup_root;
3903	bin_tree_t *p_new = &dup_root, dup_node = root->parent;
3904
3905	for (node = root; ; )
3906	{
3907	/ Create a new tree and link it back to the current parent. /
3908	*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3909	if (*p_new == NULL)
3910	return NULL;
3911	(*p_new)->parent = dup_node;
3912	(*p_new)->token.duplicated = `1`;
3913	dup_node = *p_new;
3914
3915	/ Go to the left node, or up and to the right. /
3916	if (node->left)
3917	{
3918	node = node->left;
3919	p_new = &dup_node->left;
3920	}
3921	else
3922	{
3923	const bin_tree_t *prev = NULL;
3924	while (node->right == prev \|\| node->right == NULL)
3925	{
3926	prev = node;
3927	node = node->parent;
3928	dup_node = dup_node->parent;
3929	if (!node)
3930	return dup_root;
3931	}
3932	node = node->right;
3933	p_new = &dup_node->right;
3934	}
3935	}
3936	}
3937

Browse the source code of glibc/posix/regcomp.c