regex_internal.c source code [glibc/posix/regex_internal.c]

/* Extended regular expression matching and search library.
   Copyright (C) 2002-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
19
20	static void re_string_construct_common (const char *str, Idx len,
21	re_string_t *pstr,
22	RE_TRANSLATE_TYPE trans, bool icase,
23	const re_dfa_t *dfa);
24	static re_dfastate_t create_ci_newstate (const* re_dfa_t *dfa,
25	const re_node_set *nodes,
26	re_hashval_t hash);
27	static re_dfastate_t create_cd_newstate (const* re_dfa_t *dfa,
28	const re_node_set *nodes,
29	unsigned int context,
30	re_hashval_t hash);
31	static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32	Idx new_buf_len);
33	#ifdef RE_ENABLE_I18N
34	static void build_wcs_buffer (re_string_t *pstr);
35	static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36	#endif /* RE_ENABLE_I18N */
37	static void build_upper_buffer (re_string_t *pstr);
38	static void re_string_translate_buffer (re_string_t *pstr);
39	static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40	int eflags) __attribute__ ((pure));
41
42	/ Functions for string operation. /
43
44	/ This function allocate the buffers. It is necessary to call*
45	re_string_reconstruct before using the object. /*
46
47	static reg_errcode_t
48	__attribute_warn_unused_result__
49	re_string_allocate (re_string_t pstr, const* char *str, Idx len, Idx init_len,
50	RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
51	{
52	reg_errcode_t ret;
53	Idx init_buf_len;
54
55	/ Ensure at least one character fits into the buffers. /
56	if (init_len < dfa->mb_cur_max)
57	init_len = dfa->mb_cur_max;
58	init_buf_len = (len + `1` < init_len) ? len + `1`: init_len;
59	re_string_construct_common (str, len, pstr, trans, icase, dfa);
60
61	ret = re_string_realloc_buffers (pstr, init_buf_len);
62	if (__glibc_unlikely (ret != REG_NOERROR))
63	return ret;
64
65	pstr->word_char = dfa->word_char;
66	pstr->word_ops_used = dfa->word_ops_used;
67	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
68	pstr->valid_len = (pstr->mbs_allocated \|\| dfa->mb_cur_max > `1`) ? `0` : len;
69	pstr->valid_raw_len = pstr->valid_len;
70	return REG_NOERROR;
71	}
72
73	/ This function allocate the buffers, and initialize them. /
74
75	static reg_errcode_t
76	__attribute_warn_unused_result__
77	re_string_construct (re_string_t pstr, const* char *str, Idx len,
78	RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
79	{
80	reg_errcode_t ret;
81	memset (pstr, `'\0'`, sizeof (re_string_t));
82	re_string_construct_common (str, len, pstr, trans, icase, dfa);
83
84	if (len > `0`)
85	{
86	ret = re_string_realloc_buffers (pstr, len + `1`);
87	if (__glibc_unlikely (ret != REG_NOERROR))
88	return ret;
89	}
90	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
91
92	if (icase)
93	{
94	#ifdef RE_ENABLE_I18N
95	if (dfa->mb_cur_max > `1`)
96	{
97	while (`1`)
98	{
99	ret = build_wcs_upper_buffer (pstr);
100	if (__glibc_unlikely (ret != REG_NOERROR))
101	return ret;
102	if (pstr->valid_raw_len >= len)
103	break;
104	if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
105	break;
106	ret = re_string_realloc_buffers (pstr, pstr->bufs_len * `2`);
107	if (__glibc_unlikely (ret != REG_NOERROR))
108	return ret;
109	}
110	}
111	else
112	#endif /* RE_ENABLE_I18N */
113	build_upper_buffer (pstr);
114	}
115	else
116	{
117	#ifdef RE_ENABLE_I18N
118	if (dfa->mb_cur_max > `1`)
119	build_wcs_buffer (pstr);
120	else
121	#endif /* RE_ENABLE_I18N */
122	{
123	if (trans != NULL)
124	re_string_translate_buffer (pstr);
125	else
126	{
127	pstr->valid_len = pstr->bufs_len;
128	pstr->valid_raw_len = pstr->bufs_len;
129	}
130	}
131	}
132
133	return REG_NOERROR;
134	}
135
136	/ Helper functions for re_string_allocate, and re_string_construct. /
137
138	static reg_errcode_t
139	__attribute_warn_unused_result__
140	re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
141	{
142	#ifdef RE_ENABLE_I18N
143	if (pstr->mb_cur_max > `1`)
144	{
145	wint_t *new_wcs;
146
147	/ Avoid overflow in realloc. /
148	const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
149	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
150	< new_buf_len))
151	return REG_ESPACE;
152
153	new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
154	if (__glibc_unlikely (new_wcs == NULL))
155	return REG_ESPACE;
156	pstr->wcs = new_wcs;
157	if (pstr->offsets != NULL)
158	{
159	Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
160	if (__glibc_unlikely (new_offsets == NULL))
161	return REG_ESPACE;
162	pstr->offsets = new_offsets;
163	}
164	}
165	#endif /* RE_ENABLE_I18N */
166	if (pstr->mbs_allocated)
167	{
168	unsigned char new_mbs = re_realloc (pstr->mbs, unsigned* char,
169	new_buf_len);
170	if (__glibc_unlikely (new_mbs == NULL))
171	return REG_ESPACE;
172	pstr->mbs = new_mbs;
173	}
174	pstr->bufs_len = new_buf_len;
175	return REG_NOERROR;
176	}
177
178
179	static void
180	re_string_construct_common (const char str, Idx len, re_string_t pstr,
181	RE_TRANSLATE_TYPE trans, bool icase,
182	const re_dfa_t *dfa)
183	{
184	pstr->raw_mbs = (const unsigned char *) str;
185	pstr->len = len;
186	pstr->raw_len = len;
187	pstr->trans = trans;
188	pstr->icase = icase;
189	pstr->mbs_allocated = (trans != NULL \|\| icase);
190	pstr->mb_cur_max = dfa->mb_cur_max;
191	pstr->is_utf8 = dfa->is_utf8;
192	pstr->map_notascii = dfa->map_notascii;
193	pstr->stop = pstr->len;
194	pstr->raw_stop = pstr->stop;
195	}
196
197	#ifdef RE_ENABLE_I18N
198
199	/ Build wide character buffer PSTR->WCS.*
200	If the byte sequence of the string are:
201	<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
202	Then wide character buffer will be:
203	<wc1> , WEOF , <wc2> , WEOF , <wc3>
204	We use WEOF for padding, they indicate that the position isn't
205	a first byte of a multibyte character.
206
207	Note that this function assumes PSTR->VALID_LEN elements are already
208	built and starts from PSTR->VALID_LEN. /*
209
210	static void
211	build_wcs_buffer (re_string_t *pstr)
212	{
213	#ifdef _LIBC
214	unsigned char buf[MB_LEN_MAX];
215	DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
216	#else
217	unsigned char buf[`64`];
218	#endif
219	mbstate_t prev_st;
220	Idx byte_idx, end_idx, remain_len;
221	size_t mbclen;
222
223	/ Build the buffers from pstr->valid_len to either pstr->len or*
224	pstr->bufs_len. /*
225	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
226	for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
227	{
228	wchar_t wc;
229	const char *p;
230
231	remain_len = end_idx - byte_idx;
232	prev_st = pstr->cur_state;
233	/ Apply the translation if we need. /
234	if (__glibc_unlikely (pstr->trans != NULL))
235	{
236	int i, ch;
237
238	for (i = `0`; i < pstr->mb_cur_max && i < remain_len; ++i)
239	{
240	ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
241	buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
242	}
243	p = (const char *) buf;
244	}
245	else
246	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
247	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
248	if (__glibc_unlikely (mbclen == (size_t) -`1` \|\| mbclen == `0`
249	\|\| (mbclen == (size_t) -`2`
250	&& pstr->bufs_len >= pstr->len)))
251	{
252	/ We treat these cases as a singlebyte character. /
253	mbclen = `1`;
254	wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
255	if (__glibc_unlikely (pstr->trans != NULL))
256	wc = pstr->trans[wc];
257	pstr->cur_state = prev_st;
258	}
259	else if (__glibc_unlikely (mbclen == (size_t) -`2`))
260	{
261	/ The buffer doesn't have enough space, finish to build. /
262	pstr->cur_state = prev_st;
263	break;
264	}
265
266	/ Write wide character and padding. /
267	pstr->wcs[byte_idx++] = wc;
268	/ Write paddings. /
269	for (remain_len = byte_idx + mbclen - `1`; byte_idx < remain_len ;)
270	pstr->wcs[byte_idx++] = WEOF;
271	}
272	pstr->valid_len = byte_idx;
273	pstr->valid_raw_len = byte_idx;
274	}
275
276	/ Build wide character buffer PSTR->WCS like build_wcs_buffer,*
277	but for REG_ICASE. /*
278
279	static reg_errcode_t
280	__attribute_warn_unused_result__
281	build_wcs_upper_buffer (re_string_t *pstr)
282	{
283	mbstate_t prev_st;
284	Idx src_idx, byte_idx, end_idx, remain_len;
285	size_t mbclen;
286	#ifdef _LIBC
287	char buf[MB_LEN_MAX];
288	DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
289	#else
290	char buf[`64`];
291	#endif
292
293	byte_idx = pstr->valid_len;
294	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
295
296	/ The following optimization assumes that ASCII characters can be*
297	mapped to wide characters with a simple cast. /*
298	if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
299	{
300	while (byte_idx < end_idx)
301	{
302	wchar_t wc;
303	unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
304
305	if (isascii (ch) && mbsinit (&pstr->cur_state))
306	{
307	/ The next step uses the assumption that wchar_t is encoded*
308	ASCII-safe: all ASCII values can be converted like this. /*
309	wchar_t wcu = __towupper (ch);
310	if (isascii (wcu))
311	{
312	pstr->mbs[byte_idx] = wcu;
313	pstr->wcs[byte_idx] = wcu;
314	byte_idx++;
315	continue;
316	}
317	}
318
319	remain_len = end_idx - byte_idx;
320	prev_st = pstr->cur_state;
321	mbclen = __mbrtowc (&wc,
322	((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
323	+ byte_idx), remain_len, &pstr->cur_state);
324	if (__glibc_likely (`0` < mbclen && mbclen < (size_t) -`2`))
325	{
326	wchar_t wcu = __towupper (wc);
327	if (wcu != wc)
328	{
329	size_t mbcdlen;
330
331	mbcdlen = __wcrtomb (buf, wcu, &prev_st);
332	if (__glibc_likely (mbclen == mbcdlen))
333	memcpy (pstr->mbs + byte_idx, buf, mbclen);
334	else
335	{
336	src_idx = byte_idx;
337	goto offsets_needed;
338	}
339	}
340	else
341	memcpy (pstr->mbs + byte_idx,
342	pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
343	pstr->wcs[byte_idx++] = wcu;
344	/ Write paddings. /
345	for (remain_len = byte_idx + mbclen - `1`; byte_idx < remain_len ;)
346	pstr->wcs[byte_idx++] = WEOF;
347	}
348	else if (mbclen == (size_t) -`1` \|\| mbclen == `0`
349	\|\| (mbclen == (size_t) -`2` && pstr->bufs_len >= pstr->len))
350	{
351	/ It is an invalid character, an incomplete character*
352	at the end of the string, or '\0'. Just use the byte. /*
353	pstr->mbs[byte_idx] = ch;
354	/ And also cast it to wide char. /
355	pstr->wcs[byte_idx++] = (wchar_t) ch;
356	if (__glibc_unlikely (mbclen == (size_t) -`1`))
357	pstr->cur_state = prev_st;
358	}
359	else
360	{
361	/ The buffer doesn't have enough space, finish to build. /
362	pstr->cur_state = prev_st;
363	break;
364	}
365	}
366	pstr->valid_len = byte_idx;
367	pstr->valid_raw_len = byte_idx;
368	return REG_NOERROR;
369	}
370	else
371	for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
372	{
373	wchar_t wc;
374	const char *p;
375	offsets_needed:
376	remain_len = end_idx - byte_idx;
377	prev_st = pstr->cur_state;
378	if (__glibc_unlikely (pstr->trans != NULL))
379	{
380	int i, ch;
381
382	for (i = `0`; i < pstr->mb_cur_max && i < remain_len; ++i)
383	{
384	ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
385	buf[i] = pstr->trans[ch];
386	}
387	p = (const char *) buf;
388	}
389	else
390	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
391	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
392	if (__glibc_likely (`0` < mbclen && mbclen < (size_t) -`2`))
393	{
394	wchar_t wcu = __towupper (wc);
395	if (wcu != wc)
396	{
397	size_t mbcdlen;
398
399	mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
400	if (__glibc_likely (mbclen == mbcdlen))
401	memcpy (pstr->mbs + byte_idx, buf, mbclen);
402	else if (mbcdlen != (size_t) -`1`)
403	{
404	size_t i;
405
406	if (byte_idx + mbcdlen > pstr->bufs_len)
407	{
408	pstr->cur_state = prev_st;
409	break;
410	}
411
412	if (pstr->offsets == NULL)
413	{
414	pstr->offsets = re_malloc (Idx, pstr->bufs_len);
415
416	if (pstr->offsets == NULL)
417	return REG_ESPACE;
418	}
419	if (!pstr->offsets_needed)
420	{
421	for (i = `0`; i < (size_t) byte_idx; ++i)
422	pstr->offsets[i] = i;
423	pstr->offsets_needed = `1`;
424	}
425
426	memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
427	pstr->wcs[byte_idx] = wcu;
428	pstr->offsets[byte_idx] = src_idx;
429	for (i = `1`; i < mbcdlen; ++i)
430	{
431	pstr->offsets[byte_idx + i]
432	= src_idx + (i < mbclen ? i : mbclen - `1`);
433	pstr->wcs[byte_idx + i] = WEOF;
434	}
435	pstr->len += mbcdlen - mbclen;
436	if (pstr->raw_stop > src_idx)
437	pstr->stop += mbcdlen - mbclen;
438	end_idx = (pstr->bufs_len > pstr->len)
439	? pstr->len : pstr->bufs_len;
440	byte_idx += mbcdlen;
441	src_idx += mbclen;
442	continue;
443	}
444	else
445	memcpy (pstr->mbs + byte_idx, p, mbclen);
446	}
447	else
448	memcpy (pstr->mbs + byte_idx, p, mbclen);
449
450	if (__glibc_unlikely (pstr->offsets_needed != `0`))
451	{
452	size_t i;
453	for (i = `0`; i < mbclen; ++i)
454	pstr->offsets[byte_idx + i] = src_idx + i;
455	}
456	src_idx += mbclen;
457
458	pstr->wcs[byte_idx++] = wcu;
459	/ Write paddings. /
460	for (remain_len = byte_idx + mbclen - `1`; byte_idx < remain_len ;)
461	pstr->wcs[byte_idx++] = WEOF;
462	}
463	else if (mbclen == (size_t) -`1` \|\| mbclen == `0`
464	\|\| (mbclen == (size_t) -`2` && pstr->bufs_len >= pstr->len))
465	{
466	/ It is an invalid character or '\0'. Just use the byte. /
467	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
468
469	if (__glibc_unlikely (pstr->trans != NULL))
470	ch = pstr->trans [ch];
471	pstr->mbs[byte_idx] = ch;
472
473	if (__glibc_unlikely (pstr->offsets_needed != `0`))
474	pstr->offsets[byte_idx] = src_idx;
475	++src_idx;
476
477	/ And also cast it to wide char. /
478	pstr->wcs[byte_idx++] = (wchar_t) ch;
479	if (__glibc_unlikely (mbclen == (size_t) -`1`))
480	pstr->cur_state = prev_st;
481	}
482	else
483	{
484	/ The buffer doesn't have enough space, finish to build. /
485	pstr->cur_state = prev_st;
486	break;
487	}
488	}
489	pstr->valid_len = byte_idx;
490	pstr->valid_raw_len = src_idx;
491	return REG_NOERROR;
492	}
493
494	/ Skip characters until the index becomes greater than NEW_RAW_IDX.*
495	Return the index. /*
496
497	static Idx
498	re_string_skip_chars (re_string_t pstr, Idx new_raw_idx, wint_t last_wc)
499	{
500	mbstate_t prev_st;
501	Idx rawbuf_idx;
502	size_t mbclen;
503	wint_t wc = WEOF;
504
505	/ Skip the characters which are not necessary to check. /
506	for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
507	rawbuf_idx < new_raw_idx;)
508	{
509	wchar_t wc2;
510	Idx remain_len = pstr->raw_len - rawbuf_idx;
511	prev_st = pstr->cur_state;
512	mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
513	remain_len, &pstr->cur_state);
514	if (__glibc_unlikely (mbclen == (size_t) -`2` \|\| mbclen == (size_t) -`1`
515	\|\| mbclen == `0`))
516	{
517	/ We treat these cases as a single byte character. /
518	if (mbclen == `0` \|\| remain_len == `0`)
519	wc = L`'\0'`;
520	else
521	wc = (unsigned* char *) (pstr->raw_mbs + rawbuf_idx);
522	mbclen = `1`;
523	pstr->cur_state = prev_st;
524	}
525	else
526	wc = wc2;
527	/ Then proceed the next character. /
528	rawbuf_idx += mbclen;
529	}
530	*last_wc = wc;
531	return rawbuf_idx;
532	}
533	#endif /* RE_ENABLE_I18N */
534
535	/ Build the buffer PSTR->MBS, and apply the translation if we need.*
536	This function is used in case of REG_ICASE. /*
537
538	static void
539	build_upper_buffer (re_string_t *pstr)
540	{
541	Idx char_idx, end_idx;
542	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
543
544	for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
545	{
546	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
547	if (__glibc_unlikely (pstr->trans != NULL))
548	ch = pstr->trans[ch];
549	pstr->mbs[char_idx] = toupper (ch);
550	}
551	pstr->valid_len = char_idx;
552	pstr->valid_raw_len = char_idx;
553	}
554
555	/ Apply TRANS to the buffer in PSTR. /
556
557	static void
558	re_string_translate_buffer (re_string_t *pstr)
559	{
560	Idx buf_idx, end_idx;
561	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
562
563	for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
564	{
565	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
566	pstr->mbs[buf_idx] = pstr->trans[ch];
567	}
568
569	pstr->valid_len = buf_idx;
570	pstr->valid_raw_len = buf_idx;
571	}
572
573	/ This function re-construct the buffers.*
574	Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
575	convert to upper case in case of REG_ICASE, apply translation. /*
576
577	static reg_errcode_t
578	__attribute_warn_unused_result__
579	re_string_reconstruct (re_string_t pstr, Idx idx, int* eflags)
580	{
581	Idx offset;
582
583	if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
584	offset = idx - pstr->raw_mbs_idx;
585	else
586	{
587	/ Reset buffer. /
588	#ifdef RE_ENABLE_I18N
589	if (pstr->mb_cur_max > `1`)
590	memset (&pstr->cur_state, `'\0'`, sizeof (mbstate_t));
591	#endif /* RE_ENABLE_I18N */
592	pstr->len = pstr->raw_len;
593	pstr->stop = pstr->raw_stop;
594	pstr->valid_len = `0`;
595	pstr->raw_mbs_idx = `0`;
596	pstr->valid_raw_len = `0`;
597	pstr->offsets_needed = `0`;
598	pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
599	: CONTEXT_NEWLINE \| CONTEXT_BEGBUF);
600	if (!pstr->mbs_allocated)
601	pstr->mbs = (unsigned char *) pstr->raw_mbs;
602	offset = idx;
603	}
604
605	if (__glibc_likely (offset != `0`))
606	{
607	/ Should the already checked characters be kept? /
608	if (__glibc_likely (offset < pstr->valid_raw_len))
609	{
610	/ Yes, move them to the front of the buffer. /
611	#ifdef RE_ENABLE_I18N
612	if (__glibc_unlikely (pstr->offsets_needed))
613	{
614	Idx low = `0`, high = pstr->valid_len, mid;
615	do
616	{
617	mid = (high + low) / `2`;
618	if (pstr->offsets[mid] > offset)
619	high = mid;
620	else if (pstr->offsets[mid] < offset)
621	low = mid + `1`;
622	else
623	break;
624	}
625	while (low < high);
626	if (pstr->offsets[mid] < offset)
627	++mid;
628	pstr->tip_context = re_string_context_at (pstr, mid - `1`,
629	eflags);
630	/ This can be quite complicated, so handle specially*
631	only the common and easy case where the character with
632	different length representation of lower and upper
633	case is present at or after offset. /*
634	if (pstr->valid_len > offset
635	&& mid == offset && pstr->offsets[mid] == offset)
636	{
637	memmove (pstr->wcs, pstr->wcs + offset,
638	(pstr->valid_len - offset) * sizeof (wint_t));
639	memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
640	pstr->valid_len -= offset;
641	pstr->valid_raw_len -= offset;
642	for (low = `0`; low < pstr->valid_len; low++)
643	pstr->offsets[low] = pstr->offsets[low + offset] - offset;
644	}
645	else
646	{
647	/ Otherwise, just find out how long the partial multibyte*
648	character at offset is and fill it with WEOF/255. /*
649	pstr->len = pstr->raw_len - idx + offset;
650	pstr->stop = pstr->raw_stop - idx + offset;
651	pstr->offsets_needed = `0`;
652	while (mid > `0` && pstr->offsets[mid - `1`] == offset)
653	--mid;
654	while (mid < pstr->valid_len)
655	if (pstr->wcs[mid] != WEOF)
656	break;
657	else
658	++mid;
659	if (mid == pstr->valid_len)
660	pstr->valid_len = `0`;
661	else
662	{
663	pstr->valid_len = pstr->offsets[mid] - offset;
664	if (pstr->valid_len)
665	{
666	for (low = `0`; low < pstr->valid_len; ++low)
667	pstr->wcs[low] = WEOF;
668	memset (pstr->mbs, `255`, pstr->valid_len);
669	}
670	}
671	pstr->valid_raw_len = pstr->valid_len;
672	}
673	}
674	else
675	#endif
676	{
677	pstr->tip_context = re_string_context_at (pstr, offset - `1`,
678	eflags);
679	#ifdef RE_ENABLE_I18N
680	if (pstr->mb_cur_max > `1`)
681	memmove (pstr->wcs, pstr->wcs + offset,
682	(pstr->valid_len - offset) * sizeof (wint_t));
683	#endif /* RE_ENABLE_I18N */
684	if (__glibc_unlikely (pstr->mbs_allocated))
685	memmove (pstr->mbs, pstr->mbs + offset,
686	pstr->valid_len - offset);
687	pstr->valid_len -= offset;
688	pstr->valid_raw_len -= offset;
689	DEBUG_ASSERT (pstr->valid_len > `0`);
690	}
691	}
692	else
693	{
694	#ifdef RE_ENABLE_I18N
695	/ No, skip all characters until IDX. /
696	Idx prev_valid_len = pstr->valid_len;
697
698	if (__glibc_unlikely (pstr->offsets_needed))
699	{
700	pstr->len = pstr->raw_len - idx + offset;
701	pstr->stop = pstr->raw_stop - idx + offset;
702	pstr->offsets_needed = `0`;
703	}
704	#endif
705	pstr->valid_len = `0`;
706	#ifdef RE_ENABLE_I18N
707	if (pstr->mb_cur_max > `1`)
708	{
709	Idx wcs_idx;
710	wint_t wc = WEOF;
711
712	if (pstr->is_utf8)
713	{
714	const unsigned char raw, p, *end;
715
716	/ Special case UTF-8. Multi-byte chars start with any*
717	byte other than 0x80 - 0xbf. /*
718	raw = pstr->raw_mbs + pstr->raw_mbs_idx;
719	end = raw + (offset - pstr->mb_cur_max);
720	if (end < pstr->raw_mbs)
721	end = pstr->raw_mbs;
722	p = raw + offset - `1`;
723	#ifdef _LIBC
724	/ We know the wchar_t encoding is UCS4, so for the simple*
725	case, ASCII characters, skip the conversion step. /*
726	if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
727	{
728	memset (&pstr->cur_state, `'\0'`, sizeof (mbstate_t));
729	/ pstr->valid_len = 0; /
730	wc = (wchar_t) *p;
731	}
732	else
733	#endif
734	for (; p >= end; --p)
735	if ((*p & `0xc0`) != `0x80`)
736	{
737	mbstate_t cur_state;
738	wchar_t wc2;
739	Idx mlen = raw + pstr->len - p;
740	unsigned char buf[`6`];
741	size_t mbclen;
742
743	const unsigned char *pp = p;
744	if (__glibc_unlikely (pstr->trans != NULL))
745	{
746	int i = mlen < `6` ? mlen : `6`;
747	while (--i >= `0`)
748	buf[i] = pstr->trans[p[i]];
749	pp = buf;
750	}
751	/ XXX Don't use mbrtowc, we know which conversion*
752	to use (UTF-8 -> UCS4). /*
753	memset (&cur_state, `0`, sizeof (cur_state));
754	mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
755	&cur_state);
756	if (raw + offset - p <= mbclen
757	&& mbclen < (size_t) -`2`)
758	{
759	memset (&pstr->cur_state, `'\0'`,
760	sizeof (mbstate_t));
761	pstr->valid_len = mbclen - (raw + offset - p);
762	wc = wc2;
763	}
764	break;
765	}
766	}
767
768	if (wc == WEOF)
769	pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
770	if (wc == WEOF)
771	pstr->tip_context
772	= re_string_context_at (pstr, prev_valid_len - `1`, eflags);
773	else
774	pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != `0`)
775	&& IS_WIDE_WORD_CHAR (wc))
776	? CONTEXT_WORD
777	: ((IS_WIDE_NEWLINE (wc)
778	&& pstr->newline_anchor)
779	? CONTEXT_NEWLINE : `0`));
780	if (__glibc_unlikely (pstr->valid_len))
781	{
782	for (wcs_idx = `0`; wcs_idx < pstr->valid_len; ++wcs_idx)
783	pstr->wcs[wcs_idx] = WEOF;
784	if (pstr->mbs_allocated)
785	memset (pstr->mbs, `255`, pstr->valid_len);
786	}
787	pstr->valid_raw_len = pstr->valid_len;
788	}
789	else
790	#endif /* RE_ENABLE_I18N */
791	{
792	int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - `1`];
793	pstr->valid_raw_len = `0`;
794	if (pstr->trans)
795	c = pstr->trans[c];
796	pstr->tip_context = (bitset_contain (pstr->word_char, c)
797	? CONTEXT_WORD
798	: ((IS_NEWLINE (c) && pstr->newline_anchor)
799	? CONTEXT_NEWLINE : `0`));
800	}
801	}
802	if (!__glibc_unlikely (pstr->mbs_allocated))
803	pstr->mbs += offset;
804	}
805	pstr->raw_mbs_idx = idx;
806	pstr->len -= offset;
807	pstr->stop -= offset;
808
809	/ Then build the buffers. /
810	#ifdef RE_ENABLE_I18N
811	if (pstr->mb_cur_max > `1`)
812	{
813	if (pstr->icase)
814	{
815	reg_errcode_t ret = build_wcs_upper_buffer (pstr);
816	if (__glibc_unlikely (ret != REG_NOERROR))
817	return ret;
818	}
819	else
820	build_wcs_buffer (pstr);
821	}
822	else
823	#endif /* RE_ENABLE_I18N */
824	if (__glibc_unlikely (pstr->mbs_allocated))
825	{
826	if (pstr->icase)
827	build_upper_buffer (pstr);
828	else if (pstr->trans != NULL)
829	re_string_translate_buffer (pstr);
830	}
831	else
832	pstr->valid_len = pstr->len;
833
834	pstr->cur_idx = `0`;
835	return REG_NOERROR;
836	}
837
838	static unsigned char
839	__attribute__ ((pure))
840	re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
841	{
842	int ch;
843	Idx off;
844
845	/ Handle the common (easiest) cases first. /
846	if (__glibc_likely (!pstr->mbs_allocated))
847	return re_string_peek_byte (pstr, idx);
848
849	#ifdef RE_ENABLE_I18N
850	if (pstr->mb_cur_max > `1`
851	&& ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
852	return re_string_peek_byte (pstr, idx);
853	#endif
854
855	off = pstr->cur_idx + idx;
856	#ifdef RE_ENABLE_I18N
857	if (pstr->offsets_needed)
858	off = pstr->offsets[off];
859	#endif
860
861	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
862
863	#ifdef RE_ENABLE_I18N
864	/ Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I*
865	this function returns CAPITAL LETTER I instead of first byte of
866	DOTLESS SMALL LETTER I. The latter would confuse the parser,
867	since peek_byte_case doesn't advance cur_idx in any way. /*
868	if (pstr->offsets_needed && !isascii (ch))
869	return re_string_peek_byte (pstr, idx);
870	#endif
871
872	return ch;
873	}
874
875	static unsigned char
876	re_string_fetch_byte_case (re_string_t *pstr)
877	{
878	if (__glibc_likely (!pstr->mbs_allocated))
879	return re_string_fetch_byte (pstr);
880
881	#ifdef RE_ENABLE_I18N
882	if (pstr->offsets_needed)
883	{
884	Idx off;
885	int ch;
886
887	/ For tr_TR.UTF-8 [[:islower:]] there is*
888	[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
889	in that case the whole multi-byte character and return
890	the original letter. On the other side, with
891	[[: DOTLESS SMALL LETTER I return [[:I, as doing
892	anything else would complicate things too much. /*
893
894	if (!re_string_first_byte (pstr, pstr->cur_idx))
895	return re_string_fetch_byte (pstr);
896
897	off = pstr->offsets[pstr->cur_idx];
898	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
899
900	if (! isascii (ch))
901	return re_string_fetch_byte (pstr);
902
903	re_string_skip_bytes (pstr,
904	re_string_char_size_at (pstr, pstr->cur_idx));
905	return ch;
906	}
907	#endif
908
909	return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
910	}
911
912	static void
913	re_string_destruct (re_string_t *pstr)
914	{
915	#ifdef RE_ENABLE_I18N
916	re_free (pstr->wcs);
917	re_free (pstr->offsets);
918	#endif /* RE_ENABLE_I18N */
919	if (pstr->mbs_allocated)
920	re_free (pstr->mbs);
921	}
922
923	/ Return the context at IDX in INPUT. /
924
925	static unsigned int
926	re_string_context_at (const re_string_t input, Idx idx, int* eflags)
927	{
928	int c;
929	if (__glibc_unlikely (idx < `0`))
930	/ In this case, we use the value stored in input->tip_context,*
931	since we can't know the character in input->mbs[-1] here. /*
932	return input->tip_context;
933	if (__glibc_unlikely (idx == input->len))
934	return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
935	: CONTEXT_NEWLINE \| CONTEXT_ENDBUF);
936	#ifdef RE_ENABLE_I18N
937	if (input->mb_cur_max > `1`)
938	{
939	wint_t wc;
940	Idx wc_idx = idx;
941	while(input->wcs[wc_idx] == WEOF)
942	{
943	DEBUG_ASSERT (wc_idx >= `0`);
944	--wc_idx;
945	if (wc_idx < `0`)
946	return input->tip_context;
947	}
948	wc = input->wcs[wc_idx];
949	if (__glibc_unlikely (input->word_ops_used != `0`)
950	&& IS_WIDE_WORD_CHAR (wc))
951	return CONTEXT_WORD;
952	return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
953	? CONTEXT_NEWLINE : `0`);
954	}
955	else
956	#endif
957	{
958	c = re_string_byte_at (input, idx);
959	if (bitset_contain (input->word_char, c))
960	return CONTEXT_WORD;
961	return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : `0`;
962	}
963	}
964
965	/ Functions for set operation. /
966
967	static reg_errcode_t
968	__attribute_warn_unused_result__
969	re_node_set_alloc (re_node_set *set, Idx size)
970	{
971	set->alloc = size;
972	set->nelem = `0`;
973	set->elems = re_malloc (Idx, size);
974	if (__glibc_unlikely (set->elems == NULL)
975	&& (MALLOC_0_IS_NONNULL \|\| size != `0`))
976	return REG_ESPACE;
977	return REG_NOERROR;
978	}
979
980	static reg_errcode_t
981	__attribute_warn_unused_result__
982	re_node_set_init_1 (re_node_set *set, Idx elem)
983	{
984	set->alloc = `1`;
985	set->nelem = `1`;
986	set->elems = re_malloc (Idx, `1`);
987	if (__glibc_unlikely (set->elems == NULL))
988	{
989	set->alloc = set->nelem = `0`;
990	return REG_ESPACE;
991	}
992	set->elems[`0`] = elem;
993	return REG_NOERROR;
994	}
995
996	static reg_errcode_t
997	__attribute_warn_unused_result__
998	re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
999	{
1000	set->alloc = `2`;
1001	set->elems = re_malloc (Idx, `2`);
1002	if (__glibc_unlikely (set->elems == NULL))
1003	return REG_ESPACE;
1004	if (elem1 == elem2)
1005	{
1006	set->nelem = `1`;
1007	set->elems[`0`] = elem1;
1008	}
1009	else
1010	{
1011	set->nelem = `2`;
1012	if (elem1 < elem2)
1013	{
1014	set->elems[`0`] = elem1;
1015	set->elems[`1`] = elem2;
1016	}
1017	else
1018	{
1019	set->elems[`0`] = elem2;
1020	set->elems[`1`] = elem1;
1021	}
1022	}
1023	return REG_NOERROR;
1024	}
1025
1026	static reg_errcode_t
1027	__attribute_warn_unused_result__
1028	re_node_set_init_copy (re_node_set dest, const* re_node_set *src)
1029	{
1030	dest->nelem = src->nelem;
1031	if (src->nelem > `0`)
1032	{
1033	dest->alloc = dest->nelem;
1034	dest->elems = re_malloc (Idx, dest->alloc);
1035	if (__glibc_unlikely (dest->elems == NULL))
1036	{
1037	dest->alloc = dest->nelem = `0`;
1038	return REG_ESPACE;
1039	}
1040	memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1041	}
1042	else
1043	re_node_set_init_empty (dest);
1044	return REG_NOERROR;
1045	}
1046
1047	/ Calculate the intersection of the sets SRC1 and SRC2. And merge it to*
1048	DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1049	Note: We assume dest->elems is NULL, when dest->alloc is 0. /*
1050
1051	static reg_errcode_t
1052	__attribute_warn_unused_result__
1053	re_node_set_add_intersect (re_node_set dest, const* re_node_set *src1,
1054	const re_node_set *src2)
1055	{
1056	Idx i1, i2, is, id, delta, sbase;
1057	if (src1->nelem == `0` \|\| src2->nelem == `0`)
1058	return REG_NOERROR;
1059
1060	/ We need dest->nelem + 2 * elems_in_intersection; this is a*
1061	conservative estimate. /*
1062	if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1063	{
1064	Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1065	Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1066	if (__glibc_unlikely (new_elems == NULL))
1067	return REG_ESPACE;
1068	dest->elems = new_elems;
1069	dest->alloc = new_alloc;
1070	}
1071
1072	/ Find the items in the intersection of SRC1 and SRC2, and copy*
1073	into the top of DEST those that are not already in DEST itself. /*
1074	sbase = dest->nelem + src1->nelem + src2->nelem;
1075	i1 = src1->nelem - `1`;
1076	i2 = src2->nelem - `1`;
1077	id = dest->nelem - `1`;
1078	for (;;)
1079	{
1080	if (src1->elems[i1] == src2->elems[i2])
1081	{
1082	/ Try to find the item in DEST. Maybe we could binary search? /
1083	while (id >= `0` && dest->elems[id] > src1->elems[i1])
1084	--id;
1085
1086	if (id < `0` \|\| dest->elems[id] != src1->elems[i1])
1087	dest->elems[--sbase] = src1->elems[i1];
1088
1089	if (--i1 < `0` \|\| --i2 < `0`)
1090	break;
1091	}
1092
1093	/ Lower the highest of the two items. /
1094	else if (src1->elems[i1] < src2->elems[i2])
1095	{
1096	if (--i2 < `0`)
1097	break;
1098	}
1099	else
1100	{
1101	if (--i1 < `0`)
1102	break;
1103	}
1104	}
1105
1106	id = dest->nelem - `1`;
1107	is = dest->nelem + src1->nelem + src2->nelem - `1`;
1108	delta = is - sbase + `1`;
1109
1110	/ Now copy. When DELTA becomes zero, the remaining*
1111	DEST elements are already in place; this is more or
1112	less the same loop that is in re_node_set_merge. /*
1113	dest->nelem += delta;
1114	if (delta > `0` && id >= `0`)
1115	for (;;)
1116	{
1117	if (dest->elems[is] > dest->elems[id])
1118	{
1119	/ Copy from the top. /
1120	dest->elems[id + delta--] = dest->elems[is--];
1121	if (delta == `0`)
1122	break;
1123	}
1124	else
1125	{
1126	/ Slide from the bottom. /
1127	dest->elems[id + delta] = dest->elems[id];
1128	if (--id < `0`)
1129	break;
1130	}
1131	}
1132
1133	/ Copy remaining SRC elements. /
1134	memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1135
1136	return REG_NOERROR;
1137	}
1138
1139	/ Calculate the union set of the sets SRC1 and SRC2. And store it to*
1140	DEST. Return value indicate the error code or REG_NOERROR if succeeded. /*
1141
1142	static reg_errcode_t
1143	__attribute_warn_unused_result__
1144	re_node_set_init_union (re_node_set dest, const* re_node_set *src1,
1145	const re_node_set *src2)
1146	{
1147	Idx i1, i2, id;
1148	if (src1 != NULL && src1->nelem > `0` && src2 != NULL && src2->nelem > `0`)
1149	{
1150	dest->alloc = src1->nelem + src2->nelem;
1151	dest->elems = re_malloc (Idx, dest->alloc);
1152	if (__glibc_unlikely (dest->elems == NULL))
1153	return REG_ESPACE;
1154	}
1155	else
1156	{
1157	if (src1 != NULL && src1->nelem > `0`)
1158	return re_node_set_init_copy (dest, src1);
1159	else if (src2 != NULL && src2->nelem > `0`)
1160	return re_node_set_init_copy (dest, src2);
1161	else
1162	re_node_set_init_empty (dest);
1163	return REG_NOERROR;
1164	}
1165	for (i1 = i2 = id = `0` ; i1 < src1->nelem && i2 < src2->nelem ;)
1166	{
1167	if (src1->elems[i1] > src2->elems[i2])
1168	{
1169	dest->elems[id++] = src2->elems[i2++];
1170	continue;
1171	}
1172	if (src1->elems[i1] == src2->elems[i2])
1173	++i2;
1174	dest->elems[id++] = src1->elems[i1++];
1175	}
1176	if (i1 < src1->nelem)
1177	{
1178	memcpy (dest->elems + id, src1->elems + i1,
1179	(src1->nelem - i1) * sizeof (Idx));
1180	id += src1->nelem - i1;
1181	}
1182	else if (i2 < src2->nelem)
1183	{
1184	memcpy (dest->elems + id, src2->elems + i2,
1185	(src2->nelem - i2) * sizeof (Idx));
1186	id += src2->nelem - i2;
1187	}
1188	dest->nelem = id;
1189	return REG_NOERROR;
1190	}
1191
1192	/ Calculate the union set of the sets DEST and SRC. And store it to*
1193	DEST. Return value indicate the error code or REG_NOERROR if succeeded. /*
1194
1195	static reg_errcode_t
1196	__attribute_warn_unused_result__
1197	re_node_set_merge (re_node_set dest, const* re_node_set *src)
1198	{
1199	Idx is, id, sbase, delta;
1200	if (src == NULL \|\| src->nelem == `0`)
1201	return REG_NOERROR;
1202	if (dest->alloc < `2` * src->nelem + dest->nelem)
1203	{
1204	Idx new_alloc = `2` * (src->nelem + dest->alloc);
1205	Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1206	if (__glibc_unlikely (new_buffer == NULL))
1207	return REG_ESPACE;
1208	dest->elems = new_buffer;
1209	dest->alloc = new_alloc;
1210	}
1211
1212	if (__glibc_unlikely (dest->nelem == `0`))
1213	{
1214	dest->nelem = src->nelem;
1215	memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1216	return REG_NOERROR;
1217	}
1218
1219	/ Copy into the top of DEST the items of SRC that are not*
1220	found in DEST. Maybe we could binary search in DEST? /*
1221	for (sbase = dest->nelem + `2` * src->nelem,
1222	is = src->nelem - `1`, id = dest->nelem - `1`; is >= `0` && id >= `0`; )
1223	{
1224	if (dest->elems[id] == src->elems[is])
1225	is--, id--;
1226	else if (dest->elems[id] < src->elems[is])
1227	dest->elems[--sbase] = src->elems[is--];
1228	else / if (dest->elems[id] > src->elems[is]) /
1229	--id;
1230	}
1231
1232	if (is >= `0`)
1233	{
1234	/ If DEST is exhausted, the remaining items of SRC must be unique. /
1235	sbase -= is + `1`;
1236	memcpy (dest->elems + sbase, src->elems, (is + `1`) * sizeof (Idx));
1237	}
1238
1239	id = dest->nelem - `1`;
1240	is = dest->nelem + `2` * src->nelem - `1`;
1241	delta = is - sbase + `1`;
1242	if (delta == `0`)
1243	return REG_NOERROR;
1244
1245	/ Now copy. When DELTA becomes zero, the remaining*
1246	DEST elements are already in place. /*
1247	dest->nelem += delta;
1248	for (;;)
1249	{
1250	if (dest->elems[is] > dest->elems[id])
1251	{
1252	/ Copy from the top. /
1253	dest->elems[id + delta--] = dest->elems[is--];
1254	if (delta == `0`)
1255	break;
1256	}
1257	else
1258	{
1259	/ Slide from the bottom. /
1260	dest->elems[id + delta] = dest->elems[id];
1261	if (--id < `0`)
1262	{
1263	/ Copy remaining SRC elements. /
1264	memcpy (dest->elems, dest->elems + sbase,
1265	delta * sizeof (Idx));
1266	break;
1267	}
1268	}
1269	}
1270
1271	return REG_NOERROR;
1272	}
1273
1274	/ Insert the new element ELEM to the re_node_set* SET.*
1275	SET should not already have ELEM.
1276	Return true if successful. /*
1277
1278	static bool
1279	__attribute_warn_unused_result__
1280	re_node_set_insert (re_node_set *set, Idx elem)
1281	{
1282	Idx idx;
1283	/ In case the set is empty. /
1284	if (set->alloc == `0`)
1285	return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1286
1287	if (__glibc_unlikely (set->nelem) == `0`)
1288	{
1289	/ We already guaranteed above that set->alloc != 0. /
1290	set->elems[`0`] = elem;
1291	++set->nelem;
1292	return true;
1293	}
1294
1295	/ Realloc if we need. /
1296	if (set->alloc == set->nelem)
1297	{
1298	Idx *new_elems;
1299	set->alloc = set->alloc * `2`;
1300	new_elems = re_realloc (set->elems, Idx, set->alloc);
1301	if (__glibc_unlikely (new_elems == NULL))
1302	return false;
1303	set->elems = new_elems;
1304	}
1305
1306	/ Move the elements which follows the new element. Test the*
1307	first element separately to skip a check in the inner loop. /*
1308	if (elem < set->elems[`0`])
1309	{
1310	for (idx = set->nelem; idx > `0`; idx--)
1311	set->elems[idx] = set->elems[idx - `1`];
1312	}
1313	else
1314	{
1315	for (idx = set->nelem; set->elems[idx - `1`] > elem; idx--)
1316	set->elems[idx] = set->elems[idx - `1`];
1317	}
1318
1319	/ Insert the new element. /
1320	set->elems[idx] = elem;
1321	++set->nelem;
1322	return true;
1323	}
1324
1325	/ Insert the new element ELEM to the re_node_set* SET.*
1326	SET should not already have any element greater than or equal to ELEM.
1327	Return true if successful. /*
1328
1329	static bool
1330	__attribute_warn_unused_result__
1331	re_node_set_insert_last (re_node_set *set, Idx elem)
1332	{
1333	/ Realloc if we need. /
1334	if (set->alloc == set->nelem)
1335	{
1336	Idx *new_elems;
1337	set->alloc = (set->alloc + `1`) * `2`;
1338	new_elems = re_realloc (set->elems, Idx, set->alloc);
1339	if (__glibc_unlikely (new_elems == NULL))
1340	return false;
1341	set->elems = new_elems;
1342	}
1343
1344	/ Insert the new element. /
1345	set->elems[set->nelem++] = elem;
1346	return true;
1347	}
1348
1349	/ Compare two node sets SET1 and SET2.*
1350	Return true if SET1 and SET2 are equivalent. /*
1351
1352	static bool
1353	__attribute__ ((pure))
1354	re_node_set_compare (const re_node_set set1, const* re_node_set *set2)
1355	{
1356	Idx i;
1357	if (set1 == NULL \|\| set2 == NULL \|\| set1->nelem != set2->nelem)
1358	return false;
1359	for (i = set1->nelem ; --i >= `0` ; )
1360	if (set1->elems[i] != set2->elems[i])
1361	return false;
1362	return true;
1363	}
1364
1365	/ Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. /
1366
1367	static Idx
1368	__attribute__ ((pure))
1369	re_node_set_contains (const re_node_set *set, Idx elem)
1370	{
1371	__re_size_t idx, right, mid;
1372	if (set->nelem <= `0`)
1373	return `0`;
1374
1375	/ Binary search the element. /
1376	idx = `0`;
1377	right = set->nelem - `1`;
1378	while (idx < right)
1379	{
1380	mid = (idx + right) / `2`;
1381	if (set->elems[mid] < elem)
1382	idx = mid + `1`;
1383	else
1384	right = mid;
1385	}
1386	return set->elems[idx] == elem ? idx + `1` : `0`;
1387	}
1388
1389	static void
1390	re_node_set_remove_at (re_node_set *set, Idx idx)
1391	{
1392	if (idx < `0` \|\| idx >= set->nelem)
1393	return;
1394	--set->nelem;
1395	for (; idx < set->nelem; idx++)
1396	set->elems[idx] = set->elems[idx + `1`];
1397	}
1398
1399
1400	/ Add the token TOKEN to dfa->nodes, and return the index of the token.*
1401	Or return -1 if an error occurred. /*
1402
1403	static Idx
1404	re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1405	{
1406	if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1407	{
1408	size_t new_nodes_alloc = dfa->nodes_alloc * `2`;
1409	Idx new_nexts, new_indices;
1410	re_node_set new_edests, new_eclosures;
1411	re_token_t *new_nodes;
1412
1413	/ Avoid overflows in realloc. /
1414	const size_t max_object_size = MAX (sizeof (re_token_t),
1415	MAX (sizeof (re_node_set),
1416	sizeof (Idx)));
1417	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1418	< new_nodes_alloc))
1419	return -`1`;
1420
1421	new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1422	if (__glibc_unlikely (new_nodes == NULL))
1423	return -`1`;
1424	dfa->nodes = new_nodes;
1425	new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1426	new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1427	new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1428	new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1429	if (__glibc_unlikely (new_nexts == NULL \|\| new_indices == NULL
1430	\|\| new_edests == NULL \|\| new_eclosures == NULL))
1431	{
1432	re_free (new_nexts);
1433	re_free (new_indices);
1434	re_free (new_edests);
1435	re_free (new_eclosures);
1436	return -`1`;
1437	}
1438	dfa->nexts = new_nexts;
1439	dfa->org_indices = new_indices;
1440	dfa->edests = new_edests;
1441	dfa->eclosures = new_eclosures;
1442	dfa->nodes_alloc = new_nodes_alloc;
1443	}
1444	dfa->nodes[dfa->nodes_len] = token;
1445	dfa->nodes[dfa->nodes_len].constraint = `0`;
1446	#ifdef RE_ENABLE_I18N
1447	dfa->nodes[dfa->nodes_len].accept_mb =
1448	((token.type == OP_PERIOD && dfa->mb_cur_max > `1`)
1449	\|\| token.type == COMPLEX_BRACKET);
1450	#endif
1451	dfa->nexts[dfa->nodes_len] = -`1`;
1452	re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1453	re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1454	return dfa->nodes_len++;
1455	}
1456
1457	static re_hashval_t
1458	calc_state_hash (const re_node_set nodes, unsigned* int context)
1459	{
1460	re_hashval_t hash = nodes->nelem + context;
1461	Idx i;
1462	for (i = `0` ; i < nodes->nelem ; i++)
1463	hash += nodes->elems[i];
1464	return hash;
1465	}
1466
1467	/ Search for the state whose node_set is equivalent to NODES.*
1468	Return the pointer to the state, if we found it in the DFA.
1469	Otherwise create the new one and return it. In case of an error
1470	return NULL and set the error code in ERR.
1471	Note: - We assume NULL as the invalid state, then it is possible that
1472	return value is NULL and ERR is REG_NOERROR.
1473	- We never return non-NULL value in case of any errors, it is for
1474	optimization. /*
1475
1476	static re_dfastate_t *
1477	__attribute_warn_unused_result__
1478	re_acquire_state (reg_errcode_t err, const* re_dfa_t *dfa,
1479	const re_node_set *nodes)
1480	{
1481	re_hashval_t hash;
1482	re_dfastate_t *new_state;
1483	struct re_state_table_entry *spot;
1484	Idx i;
1485	#if defined GCC_LINT \|\| defined lint
1486	/ Suppress bogus uninitialized-variable warnings. /
1487	*err = REG_NOERROR;
1488	#endif
1489	if (__glibc_unlikely (nodes->nelem == `0`))
1490	{
1491	*err = REG_NOERROR;
1492	return NULL;
1493	}
1494	hash = calc_state_hash (nodes, `0`);
1495	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1496
1497	for (i = `0` ; i < spot->num ; i++)
1498	{
1499	re_dfastate_t *state = spot->array[i];
1500	if (hash != state->hash)
1501	continue;
1502	if (re_node_set_compare (&state->nodes, nodes))
1503	return state;
1504	}
1505
1506	/ There are no appropriate state in the dfa, create the new one. /
1507	new_state = create_ci_newstate (dfa, nodes, hash);
1508	if (__glibc_unlikely (new_state == NULL))
1509	*err = REG_ESPACE;
1510
1511	return new_state;
1512	}
1513
1514	/ Search for the state whose node_set is equivalent to NODES and*
1515	whose context is equivalent to CONTEXT.
1516	Return the pointer to the state, if we found it in the DFA.
1517	Otherwise create the new one and return it. In case of an error
1518	return NULL and set the error code in ERR.
1519	Note: - We assume NULL as the invalid state, then it is possible that
1520	return value is NULL and ERR is REG_NOERROR.
1521	- We never return non-NULL value in case of any errors, it is for
1522	optimization. /*
1523
1524	static re_dfastate_t *
1525	__attribute_warn_unused_result__
1526	re_acquire_state_context (reg_errcode_t err, const* re_dfa_t *dfa,
1527	const re_node_set nodes, unsigned* int context)
1528	{
1529	re_hashval_t hash;
1530	re_dfastate_t *new_state;
1531	struct re_state_table_entry *spot;
1532	Idx i;
1533	#if defined GCC_LINT \|\| defined lint
1534	/ Suppress bogus uninitialized-variable warnings. /
1535	*err = REG_NOERROR;
1536	#endif
1537	if (nodes->nelem == `0`)
1538	{
1539	*err = REG_NOERROR;
1540	return NULL;
1541	}
1542	hash = calc_state_hash (nodes, context);
1543	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1544
1545	for (i = `0` ; i < spot->num ; i++)
1546	{
1547	re_dfastate_t *state = spot->array[i];
1548	if (state->hash == hash
1549	&& state->context == context
1550	&& re_node_set_compare (state->entrance_nodes, nodes))
1551	return state;
1552	}
1553	/ There are no appropriate state in 'dfa', create the new one. /
1554	new_state = create_cd_newstate (dfa, nodes, context, hash);
1555	if (__glibc_unlikely (new_state == NULL))
1556	*err = REG_ESPACE;
1557
1558	return new_state;
1559	}
1560
1561	/ Finish initialization of the new state NEWSTATE, and using its hash value*
1562	HASH put in the appropriate bucket of DFA's state table. Return value
1563	indicates the error code if failed. /*
1564
1565	static reg_errcode_t
1566	__attribute_warn_unused_result__
1567	register_state (const re_dfa_t dfa, re_dfastate_t newstate,
1568	re_hashval_t hash)
1569	{
1570	struct re_state_table_entry *spot;
1571	reg_errcode_t err;
1572	Idx i;
1573
1574	newstate->hash = hash;
1575	err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1576	if (__glibc_unlikely (err != REG_NOERROR))
1577	return REG_ESPACE;
1578	for (i = `0`; i < newstate->nodes.nelem; i++)
1579	{
1580	Idx elem = newstate->nodes.elems[i];
1581	if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1582	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1583	return REG_ESPACE;
1584	}
1585
1586	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1587	if (__glibc_unlikely (spot->alloc <= spot->num))
1588	{
1589	Idx new_alloc = `2` * spot->num + `2`;
1590	re_dfastate_t *new_array = re_realloc (spot->array, re_dfastate_t ,
1591	new_alloc);
1592	if (__glibc_unlikely (new_array == NULL))
1593	return REG_ESPACE;
1594	spot->array = new_array;
1595	spot->alloc = new_alloc;
1596	}
1597	spot->array[spot->num++] = newstate;
1598	return REG_NOERROR;
1599	}
1600
1601	static void
1602	free_state (re_dfastate_t *state)
1603	{
1604	re_node_set_free (&state->non_eps_nodes);
1605	re_node_set_free (&state->inveclosure);
1606	if (state->entrance_nodes != &state->nodes)
1607	{
1608	re_node_set_free (state->entrance_nodes);
1609	re_free (state->entrance_nodes);
1610	}
1611	re_node_set_free (&state->nodes);
1612	re_free (state->word_trtable);
1613	re_free (state->trtable);
1614	re_free (state);
1615	}
1616
1617	/ Create the new state which is independent of contexts.*
1618	Return the new state if succeeded, otherwise return NULL. /*
1619
1620	static re_dfastate_t *
1621	__attribute_warn_unused_result__
1622	create_ci_newstate (const re_dfa_t dfa, const* re_node_set *nodes,
1623	re_hashval_t hash)
1624	{
1625	Idx i;
1626	reg_errcode_t err;
1627	re_dfastate_t *newstate;
1628
1629	newstate = (re_dfastate_t ) calloc (sizeof* (re_dfastate_t), `1`);
1630	if (__glibc_unlikely (newstate == NULL))
1631	return NULL;
1632	err = re_node_set_init_copy (&newstate->nodes, nodes);
1633	if (__glibc_unlikely (err != REG_NOERROR))
1634	{
1635	re_free (newstate);
1636	return NULL;
1637	}
1638
1639	newstate->entrance_nodes = &newstate->nodes;
1640	for (i = `0` ; i < nodes->nelem ; i++)
1641	{
1642	re_token_t *node = dfa->nodes + nodes->elems[i];
1643	re_token_type_t type = node->type;
1644	if (type == CHARACTER && !node->constraint)
1645	continue;
1646	#ifdef RE_ENABLE_I18N
1647	newstate->accept_mb \|= node->accept_mb;
1648	#endif /* RE_ENABLE_I18N */
1649
1650	/ If the state has the halt node, the state is a halt state. /
1651	if (type == END_OF_RE)
1652	newstate->halt = `1`;
1653	else if (type == OP_BACK_REF)
1654	newstate->has_backref = `1`;
1655	else if (type == ANCHOR \|\| node->constraint)
1656	newstate->has_constraint = `1`;
1657	}
1658	err = register_state (dfa, newstate, hash);
1659	if (__glibc_unlikely (err != REG_NOERROR))
1660	{
1661	free_state (newstate);
1662	newstate = NULL;
1663	}
1664	return newstate;
1665	}
1666
1667	/ Create the new state which is depend on the context CONTEXT.*
1668	Return the new state if succeeded, otherwise return NULL. /*
1669
1670	static re_dfastate_t *
1671	__attribute_warn_unused_result__
1672	create_cd_newstate (const re_dfa_t dfa, const* re_node_set *nodes,
1673	unsigned int context, re_hashval_t hash)
1674	{
1675	Idx i, nctx_nodes = `0`;
1676	reg_errcode_t err;
1677	re_dfastate_t *newstate;
1678
1679	newstate = (re_dfastate_t ) calloc (sizeof* (re_dfastate_t), `1`);
1680	if (__glibc_unlikely (newstate == NULL))
1681	return NULL;
1682	err = re_node_set_init_copy (&newstate->nodes, nodes);
1683	if (__glibc_unlikely (err != REG_NOERROR))
1684	{
1685	re_free (newstate);
1686	return NULL;
1687	}
1688
1689	newstate->context = context;
1690	newstate->entrance_nodes = &newstate->nodes;
1691
1692	for (i = `0` ; i < nodes->nelem ; i++)
1693	{
1694	re_token_t *node = dfa->nodes + nodes->elems[i];
1695	re_token_type_t type = node->type;
1696	unsigned int constraint = node->constraint;
1697
1698	if (type == CHARACTER && !constraint)
1699	continue;
1700	#ifdef RE_ENABLE_I18N
1701	newstate->accept_mb \|= node->accept_mb;
1702	#endif /* RE_ENABLE_I18N */
1703
1704	/ If the state has the halt node, the state is a halt state. /
1705	if (type == END_OF_RE)
1706	newstate->halt = `1`;
1707	else if (type == OP_BACK_REF)
1708	newstate->has_backref = `1`;
1709
1710	if (constraint)
1711	{
1712	if (newstate->entrance_nodes == &newstate->nodes)
1713	{
1714	re_node_set *entrance_nodes = re_malloc (re_node_set, `1`);
1715	if (__glibc_unlikely (entrance_nodes == NULL))
1716	{
1717	free_state (newstate);
1718	return NULL;
1719	}
1720	newstate->entrance_nodes = entrance_nodes;
1721	if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1722	!= REG_NOERROR)
1723	{
1724	free_state (newstate);
1725	return NULL;
1726	}
1727	nctx_nodes = `0`;
1728	newstate->has_constraint = `1`;
1729	}
1730
1731	if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1732	{
1733	re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1734	++nctx_nodes;
1735	}
1736	}
1737	}
1738	err = register_state (dfa, newstate, hash);
1739	if (__glibc_unlikely (err != REG_NOERROR))
1740	{
1741	free_state (newstate);
1742	newstate = NULL;
1743	}
1744	return newstate;
1745	}
1746

Browse the source code of glibc/posix/regex_internal.c