regex_internal.c source code [glibc/posix/regex_internal.c]

1	/ Extended regular expression matching and search library.*
2	Copyright (C) 2002-2018 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, see
18	<https://www.gnu.org/licenses/>. /*
19
20	static void re_string_construct_common (const char *str, Idx len,
21	re_string_t *pstr,
22	RE_TRANSLATE_TYPE trans, bool icase,
23	const re_dfa_t *dfa);
24	static re_dfastate_t create_ci_newstate (const* re_dfa_t *dfa,
25	const re_node_set *nodes,
26	re_hashval_t hash);
27	static re_dfastate_t create_cd_newstate (const* re_dfa_t *dfa,
28	const re_node_set *nodes,
29	unsigned int context,
30	re_hashval_t hash);
31	static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32	Idx new_buf_len);
33	#ifdef RE_ENABLE_I18N
34	static void build_wcs_buffer (re_string_t *pstr);
35	static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36	#endif /* RE_ENABLE_I18N */
37	static void build_upper_buffer (re_string_t *pstr);
38	static void re_string_translate_buffer (re_string_t *pstr);
39	static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40	int eflags) __attribute__ ((pure));
41
42	/ Functions for string operation. /
43
44	/ This function allocate the buffers. It is necessary to call*
45	re_string_reconstruct before using the object. /*
46
47	static reg_errcode_t
48	__attribute_warn_unused_result__
49	re_string_allocate (re_string_t pstr, const* char *str, Idx len, Idx init_len,
50	RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
51	{
52	reg_errcode_t ret;
53	Idx init_buf_len;
54
55	/ Ensure at least one character fits into the buffers. /
56	if (init_len < dfa->mb_cur_max)
57	init_len = dfa->mb_cur_max;
58	init_buf_len = (len + `1` < init_len) ? len + `1`: init_len;
59	re_string_construct_common (str, len, pstr, trans, icase, dfa);
60
61	ret = re_string_realloc_buffers (pstr, init_buf_len);
62	if (BE (ret != REG_NOERROR, `0`))
63	return ret;
64
65	pstr->word_char = dfa->word_char;
66	pstr->word_ops_used = dfa->word_ops_used;
67	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
68	pstr->valid_len = (pstr->mbs_allocated \|\| dfa->mb_cur_max > `1`) ? `0` : len;
69	pstr->valid_raw_len = pstr->valid_len;
70	return REG_NOERROR;
71	}
72
73	/ This function allocate the buffers, and initialize them. /
74
75	static reg_errcode_t
76	__attribute_warn_unused_result__
77	re_string_construct (re_string_t pstr, const* char *str, Idx len,
78	RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
79	{
80	reg_errcode_t ret;
81	memset (pstr, `'\0'`, sizeof (re_string_t));
82	re_string_construct_common (str, len, pstr, trans, icase, dfa);
83
84	if (len > `0`)
85	{
86	ret = re_string_realloc_buffers (pstr, len + `1`);
87	if (BE (ret != REG_NOERROR, `0`))
88	return ret;
89	}
90	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
91
92	if (icase)
93	{
94	#ifdef RE_ENABLE_I18N
95	if (dfa->mb_cur_max > `1`)
96	{
97	while (`1`)
98	{
99	ret = build_wcs_upper_buffer (pstr);
100	if (BE (ret != REG_NOERROR, `0`))
101	return ret;
102	if (pstr->valid_raw_len >= len)
103	break;
104	if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
105	break;
106	ret = re_string_realloc_buffers (pstr, pstr->bufs_len * `2`);
107	if (BE (ret != REG_NOERROR, `0`))
108	return ret;
109	}
110	}
111	else
112	#endif /* RE_ENABLE_I18N */
113	build_upper_buffer (pstr);
114	}
115	else
116	{
117	#ifdef RE_ENABLE_I18N
118	if (dfa->mb_cur_max > `1`)
119	build_wcs_buffer (pstr);
120	else
121	#endif /* RE_ENABLE_I18N */
122	{
123	if (trans != NULL)
124	re_string_translate_buffer (pstr);
125	else
126	{
127	pstr->valid_len = pstr->bufs_len;
128	pstr->valid_raw_len = pstr->bufs_len;
129	}
130	}
131	}
132
133	return REG_NOERROR;
134	}
135
136	/ Helper functions for re_string_allocate, and re_string_construct. /
137
138	static reg_errcode_t
139	__attribute_warn_unused_result__
140	re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
141	{
142	#ifdef RE_ENABLE_I18N
143	if (pstr->mb_cur_max > `1`)
144	{
145	wint_t *new_wcs;
146
147	/ Avoid overflow in realloc. /
148	const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
149	if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, `0`))
150	return REG_ESPACE;
151
152	new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
153	if (BE (new_wcs == NULL, `0`))
154	return REG_ESPACE;
155	pstr->wcs = new_wcs;
156	if (pstr->offsets != NULL)
157	{
158	Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
159	if (BE (new_offsets == NULL, `0`))
160	return REG_ESPACE;
161	pstr->offsets = new_offsets;
162	}
163	}
164	#endif /* RE_ENABLE_I18N */
165	if (pstr->mbs_allocated)
166	{
167	unsigned char new_mbs = re_realloc (pstr->mbs, unsigned* char,
168	new_buf_len);
169	if (BE (new_mbs == NULL, `0`))
170	return REG_ESPACE;
171	pstr->mbs = new_mbs;
172	}
173	pstr->bufs_len = new_buf_len;
174	return REG_NOERROR;
175	}
176
177
178	static void
179	re_string_construct_common (const char str, Idx len, re_string_t pstr,
180	RE_TRANSLATE_TYPE trans, bool icase,
181	const re_dfa_t *dfa)
182	{
183	pstr->raw_mbs = (const unsigned char *) str;
184	pstr->len = len;
185	pstr->raw_len = len;
186	pstr->trans = trans;
187	pstr->icase = icase;
188	pstr->mbs_allocated = (trans != NULL \|\| icase);
189	pstr->mb_cur_max = dfa->mb_cur_max;
190	pstr->is_utf8 = dfa->is_utf8;
191	pstr->map_notascii = dfa->map_notascii;
192	pstr->stop = pstr->len;
193	pstr->raw_stop = pstr->stop;
194	}
195
196	#ifdef RE_ENABLE_I18N
197
198	/ Build wide character buffer PSTR->WCS.*
199	If the byte sequence of the string are:
200	<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
201	Then wide character buffer will be:
202	<wc1> , WEOF , <wc2> , WEOF , <wc3>
203	We use WEOF for padding, they indicate that the position isn't
204	a first byte of a multibyte character.
205
206	Note that this function assumes PSTR->VALID_LEN elements are already
207	built and starts from PSTR->VALID_LEN. /*
208
209	static void
210	build_wcs_buffer (re_string_t *pstr)
211	{
212	#ifdef _LIBC
213	unsigned char buf[MB_LEN_MAX];
214	assert (MB_LEN_MAX >= pstr->mb_cur_max);
215	#else
216	unsigned char buf[`64`];
217	#endif
218	mbstate_t prev_st;
219	Idx byte_idx, end_idx, remain_len;
220	size_t mbclen;
221
222	/ Build the buffers from pstr->valid_len to either pstr->len or*
223	pstr->bufs_len. /*
224	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
225	for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
226	{
227	wchar_t wc;
228	const char *p;
229
230	remain_len = end_idx - byte_idx;
231	prev_st = pstr->cur_state;
232	/ Apply the translation if we need. /
233	if (BE (pstr->trans != NULL, `0`))
234	{
235	int i, ch;
236
237	for (i = `0`; i < pstr->mb_cur_max && i < remain_len; ++i)
238	{
239	ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
240	buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
241	}
242	p = (const char *) buf;
243	}
244	else
245	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
246	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
247	if (BE (mbclen == (size_t) -`1` \|\| mbclen == `0`
248	\|\| (mbclen == (size_t) -`2` && pstr->bufs_len >= pstr->len), `0`))
249	{
250	/ We treat these cases as a singlebyte character. /
251	mbclen = `1`;
252	wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
253	if (BE (pstr->trans != NULL, `0`))
254	wc = pstr->trans[wc];
255	pstr->cur_state = prev_st;
256	}
257	else if (BE (mbclen == (size_t) -`2`, `0`))
258	{
259	/ The buffer doesn't have enough space, finish to build. /
260	pstr->cur_state = prev_st;
261	break;
262	}
263
264	/ Write wide character and padding. /
265	pstr->wcs[byte_idx++] = wc;
266	/ Write paddings. /
267	for (remain_len = byte_idx + mbclen - `1`; byte_idx < remain_len ;)
268	pstr->wcs[byte_idx++] = WEOF;
269	}
270	pstr->valid_len = byte_idx;
271	pstr->valid_raw_len = byte_idx;
272	}
273
274	/ Build wide character buffer PSTR->WCS like build_wcs_buffer,*
275	but for REG_ICASE. /*
276
277	static reg_errcode_t
278	__attribute_warn_unused_result__
279	build_wcs_upper_buffer (re_string_t *pstr)
280	{
281	mbstate_t prev_st;
282	Idx src_idx, byte_idx, end_idx, remain_len;
283	size_t mbclen;
284	#ifdef _LIBC
285	char buf[MB_LEN_MAX];
286	assert (MB_LEN_MAX >= pstr->mb_cur_max);
287	#else
288	char buf[`64`];
289	#endif
290
291	byte_idx = pstr->valid_len;
292	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
293
294	/ The following optimization assumes that ASCII characters can be*
295	mapped to wide characters with a simple cast. /*
296	if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
297	{
298	while (byte_idx < end_idx)
299	{
300	wchar_t wc;
301
302	if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
303	&& mbsinit (&pstr->cur_state))
304	{
305	/ In case of a singlebyte character. /
306	pstr->mbs[byte_idx]
307	= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
308	/ The next step uses the assumption that wchar_t is encoded*
309	ASCII-safe: all ASCII values can be converted like this. /*
310	pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
311	++byte_idx;
312	continue;
313	}
314
315	remain_len = end_idx - byte_idx;
316	prev_st = pstr->cur_state;
317	mbclen = __mbrtowc (&wc,
318	((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
319	+ byte_idx), remain_len, &pstr->cur_state);
320	if (BE (mbclen < (size_t) -`2`, `1`))
321	{
322	wchar_t wcu = __towupper (wc);
323	if (wcu != wc)
324	{
325	size_t mbcdlen;
326
327	mbcdlen = __wcrtomb (buf, wcu, &prev_st);
328	if (BE (mbclen == mbcdlen, `1`))
329	memcpy (pstr->mbs + byte_idx, buf, mbclen);
330	else
331	{
332	src_idx = byte_idx;
333	goto offsets_needed;
334	}
335	}
336	else
337	memcpy (pstr->mbs + byte_idx,
338	pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
339	pstr->wcs[byte_idx++] = wcu;
340	/ Write paddings. /
341	for (remain_len = byte_idx + mbclen - `1`; byte_idx < remain_len ;)
342	pstr->wcs[byte_idx++] = WEOF;
343	}
344	else if (mbclen == (size_t) -`1` \|\| mbclen == `0`
345	\|\| (mbclen == (size_t) -`2` && pstr->bufs_len >= pstr->len))
346	{
347	/ It is an invalid character, an incomplete character*
348	at the end of the string, or '\0'. Just use the byte. /*
349	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
350	pstr->mbs[byte_idx] = ch;
351	/ And also cast it to wide char. /
352	pstr->wcs[byte_idx++] = (wchar_t) ch;
353	if (BE (mbclen == (size_t) -`1`, `0`))
354	pstr->cur_state = prev_st;
355	}
356	else
357	{
358	/ The buffer doesn't have enough space, finish to build. /
359	pstr->cur_state = prev_st;
360	break;
361	}
362	}
363	pstr->valid_len = byte_idx;
364	pstr->valid_raw_len = byte_idx;
365	return REG_NOERROR;
366	}
367	else
368	for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
369	{
370	wchar_t wc;
371	const char *p;
372	offsets_needed:
373	remain_len = end_idx - byte_idx;
374	prev_st = pstr->cur_state;
375	if (BE (pstr->trans != NULL, `0`))
376	{
377	int i, ch;
378
379	for (i = `0`; i < pstr->mb_cur_max && i < remain_len; ++i)
380	{
381	ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
382	buf[i] = pstr->trans[ch];
383	}
384	p = (const char *) buf;
385	}
386	else
387	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
388	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
389	if (BE (mbclen < (size_t) -`2`, `1`))
390	{
391	wchar_t wcu = __towupper (wc);
392	if (wcu != wc)
393	{
394	size_t mbcdlen;
395
396	mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
397	if (BE (mbclen == mbcdlen, `1`))
398	memcpy (pstr->mbs + byte_idx, buf, mbclen);
399	else if (mbcdlen != (size_t) -`1`)
400	{
401	size_t i;
402
403	if (byte_idx + mbcdlen > pstr->bufs_len)
404	{
405	pstr->cur_state = prev_st;
406	break;
407	}
408
409	if (pstr->offsets == NULL)
410	{
411	pstr->offsets = re_malloc (Idx, pstr->bufs_len);
412
413	if (pstr->offsets == NULL)
414	return REG_ESPACE;
415	}
416	if (!pstr->offsets_needed)
417	{
418	for (i = `0`; i < (size_t) byte_idx; ++i)
419	pstr->offsets[i] = i;
420	pstr->offsets_needed = `1`;
421	}
422
423	memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
424	pstr->wcs[byte_idx] = wcu;
425	pstr->offsets[byte_idx] = src_idx;
426	for (i = `1`; i < mbcdlen; ++i)
427	{
428	pstr->offsets[byte_idx + i]
429	= src_idx + (i < mbclen ? i : mbclen - `1`);
430	pstr->wcs[byte_idx + i] = WEOF;
431	}
432	pstr->len += mbcdlen - mbclen;
433	if (pstr->raw_stop > src_idx)
434	pstr->stop += mbcdlen - mbclen;
435	end_idx = (pstr->bufs_len > pstr->len)
436	? pstr->len : pstr->bufs_len;
437	byte_idx += mbcdlen;
438	src_idx += mbclen;
439	continue;
440	}
441	else
442	memcpy (pstr->mbs + byte_idx, p, mbclen);
443	}
444	else
445	memcpy (pstr->mbs + byte_idx, p, mbclen);
446
447	if (BE (pstr->offsets_needed != `0`, `0`))
448	{
449	size_t i;
450	for (i = `0`; i < mbclen; ++i)
451	pstr->offsets[byte_idx + i] = src_idx + i;
452	}
453	src_idx += mbclen;
454
455	pstr->wcs[byte_idx++] = wcu;
456	/ Write paddings. /
457	for (remain_len = byte_idx + mbclen - `1`; byte_idx < remain_len ;)
458	pstr->wcs[byte_idx++] = WEOF;
459	}
460	else if (mbclen == (size_t) -`1` \|\| mbclen == `0`
461	\|\| (mbclen == (size_t) -`2` && pstr->bufs_len >= pstr->len))
462	{
463	/ It is an invalid character or '\0'. Just use the byte. /
464	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
465
466	if (BE (pstr->trans != NULL, `0`))
467	ch = pstr->trans [ch];
468	pstr->mbs[byte_idx] = ch;
469
470	if (BE (pstr->offsets_needed != `0`, `0`))
471	pstr->offsets[byte_idx] = src_idx;
472	++src_idx;
473
474	/ And also cast it to wide char. /
475	pstr->wcs[byte_idx++] = (wchar_t) ch;
476	if (BE (mbclen == (size_t) -`1`, `0`))
477	pstr->cur_state = prev_st;
478	}
479	else
480	{
481	/ The buffer doesn't have enough space, finish to build. /
482	pstr->cur_state = prev_st;
483	break;
484	}
485	}
486	pstr->valid_len = byte_idx;
487	pstr->valid_raw_len = src_idx;
488	return REG_NOERROR;
489	}
490
491	/ Skip characters until the index becomes greater than NEW_RAW_IDX.*
492	Return the index. /*
493
494	static Idx
495	re_string_skip_chars (re_string_t pstr, Idx new_raw_idx, wint_t last_wc)
496	{
497	mbstate_t prev_st;
498	Idx rawbuf_idx;
499	size_t mbclen;
500	wint_t wc = WEOF;
501
502	/ Skip the characters which are not necessary to check. /
503	for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
504	rawbuf_idx < new_raw_idx;)
505	{
506	wchar_t wc2;
507	Idx remain_len = pstr->raw_len - rawbuf_idx;
508	prev_st = pstr->cur_state;
509	mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
510	remain_len, &pstr->cur_state);
511	if (BE (mbclen == (size_t) -`2` \|\| mbclen == (size_t) -`1` \|\| mbclen == `0`, `0`))
512	{
513	/ We treat these cases as a single byte character. /
514	if (mbclen == `0` \|\| remain_len == `0`)
515	wc = L`'\0'`;
516	else
517	wc = (unsigned* char *) (pstr->raw_mbs + rawbuf_idx);
518	mbclen = `1`;
519	pstr->cur_state = prev_st;
520	}
521	else
522	wc = wc2;
523	/ Then proceed the next character. /
524	rawbuf_idx += mbclen;
525	}
526	*last_wc = wc;
527	return rawbuf_idx;
528	}
529	#endif /* RE_ENABLE_I18N */
530
531	/ Build the buffer PSTR->MBS, and apply the translation if we need.*
532	This function is used in case of REG_ICASE. /*
533
534	static void
535	build_upper_buffer (re_string_t *pstr)
536	{
537	Idx char_idx, end_idx;
538	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
539
540	for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
541	{
542	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
543	if (BE (pstr->trans != NULL, `0`))
544	ch = pstr->trans[ch];
545	pstr->mbs[char_idx] = toupper (ch);
546	}
547	pstr->valid_len = char_idx;
548	pstr->valid_raw_len = char_idx;
549	}
550
551	/ Apply TRANS to the buffer in PSTR. /
552
553	static void
554	re_string_translate_buffer (re_string_t *pstr)
555	{
556	Idx buf_idx, end_idx;
557	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
558
559	for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
560	{
561	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
562	pstr->mbs[buf_idx] = pstr->trans[ch];
563	}
564
565	pstr->valid_len = buf_idx;
566	pstr->valid_raw_len = buf_idx;
567	}
568
569	/ This function re-construct the buffers.*
570	Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
571	convert to upper case in case of REG_ICASE, apply translation. /*
572
573	static reg_errcode_t
574	__attribute_warn_unused_result__
575	re_string_reconstruct (re_string_t pstr, Idx idx, int* eflags)
576	{
577	Idx offset;
578
579	if (BE (pstr->raw_mbs_idx <= idx, `0`))
580	offset = idx - pstr->raw_mbs_idx;
581	else
582	{
583	/ Reset buffer. /
584	#ifdef RE_ENABLE_I18N
585	if (pstr->mb_cur_max > `1`)
586	memset (&pstr->cur_state, `'\0'`, sizeof (mbstate_t));
587	#endif /* RE_ENABLE_I18N */
588	pstr->len = pstr->raw_len;
589	pstr->stop = pstr->raw_stop;
590	pstr->valid_len = `0`;
591	pstr->raw_mbs_idx = `0`;
592	pstr->valid_raw_len = `0`;
593	pstr->offsets_needed = `0`;
594	pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
595	: CONTEXT_NEWLINE \| CONTEXT_BEGBUF);
596	if (!pstr->mbs_allocated)
597	pstr->mbs = (unsigned char *) pstr->raw_mbs;
598	offset = idx;
599	}
600
601	if (BE (offset != `0`, `1`))
602	{
603	/ Should the already checked characters be kept? /
604	if (BE (offset < pstr->valid_raw_len, `1`))
605	{
606	/ Yes, move them to the front of the buffer. /
607	#ifdef RE_ENABLE_I18N
608	if (BE (pstr->offsets_needed, `0`))
609	{
610	Idx low = `0`, high = pstr->valid_len, mid;
611	do
612	{
613	mid = (high + low) / `2`;
614	if (pstr->offsets[mid] > offset)
615	high = mid;
616	else if (pstr->offsets[mid] < offset)
617	low = mid + `1`;
618	else
619	break;
620	}
621	while (low < high);
622	if (pstr->offsets[mid] < offset)
623	++mid;
624	pstr->tip_context = re_string_context_at (pstr, mid - `1`,
625	eflags);
626	/ This can be quite complicated, so handle specially*
627	only the common and easy case where the character with
628	different length representation of lower and upper
629	case is present at or after offset. /*
630	if (pstr->valid_len > offset
631	&& mid == offset && pstr->offsets[mid] == offset)
632	{
633	memmove (pstr->wcs, pstr->wcs + offset,
634	(pstr->valid_len - offset) * sizeof (wint_t));
635	memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
636	pstr->valid_len -= offset;
637	pstr->valid_raw_len -= offset;
638	for (low = `0`; low < pstr->valid_len; low++)
639	pstr->offsets[low] = pstr->offsets[low + offset] - offset;
640	}
641	else
642	{
643	/ Otherwise, just find out how long the partial multibyte*
644	character at offset is and fill it with WEOF/255. /*
645	pstr->len = pstr->raw_len - idx + offset;
646	pstr->stop = pstr->raw_stop - idx + offset;
647	pstr->offsets_needed = `0`;
648	while (mid > `0` && pstr->offsets[mid - `1`] == offset)
649	--mid;
650	while (mid < pstr->valid_len)
651	if (pstr->wcs[mid] != WEOF)
652	break;
653	else
654	++mid;
655	if (mid == pstr->valid_len)
656	pstr->valid_len = `0`;
657	else
658	{
659	pstr->valid_len = pstr->offsets[mid] - offset;
660	if (pstr->valid_len)
661	{
662	for (low = `0`; low < pstr->valid_len; ++low)
663	pstr->wcs[low] = WEOF;
664	memset (pstr->mbs, `255`, pstr->valid_len);
665	}
666	}
667	pstr->valid_raw_len = pstr->valid_len;
668	}
669	}
670	else
671	#endif
672	{
673	pstr->tip_context = re_string_context_at (pstr, offset - `1`,
674	eflags);
675	#ifdef RE_ENABLE_I18N
676	if (pstr->mb_cur_max > `1`)
677	memmove (pstr->wcs, pstr->wcs + offset,
678	(pstr->valid_len - offset) * sizeof (wint_t));
679	#endif /* RE_ENABLE_I18N */
680	if (BE (pstr->mbs_allocated, `0`))
681	memmove (pstr->mbs, pstr->mbs + offset,
682	pstr->valid_len - offset);
683	pstr->valid_len -= offset;
684	pstr->valid_raw_len -= offset;
685	#if defined DEBUG && DEBUG
686	assert (pstr->valid_len > `0`);
687	#endif
688	}
689	}
690	else
691	{
692	#ifdef RE_ENABLE_I18N
693	/ No, skip all characters until IDX. /
694	Idx prev_valid_len = pstr->valid_len;
695
696	if (BE (pstr->offsets_needed, `0`))
697	{
698	pstr->len = pstr->raw_len - idx + offset;
699	pstr->stop = pstr->raw_stop - idx + offset;
700	pstr->offsets_needed = `0`;
701	}
702	#endif
703	pstr->valid_len = `0`;
704	#ifdef RE_ENABLE_I18N
705	if (pstr->mb_cur_max > `1`)
706	{
707	Idx wcs_idx;
708	wint_t wc = WEOF;
709
710	if (pstr->is_utf8)
711	{
712	const unsigned char raw, p, *end;
713
714	/ Special case UTF-8. Multi-byte chars start with any*
715	byte other than 0x80 - 0xbf. /*
716	raw = pstr->raw_mbs + pstr->raw_mbs_idx;
717	end = raw + (offset - pstr->mb_cur_max);
718	if (end < pstr->raw_mbs)
719	end = pstr->raw_mbs;
720	p = raw + offset - `1`;
721	#ifdef _LIBC
722	/ We know the wchar_t encoding is UCS4, so for the simple*
723	case, ASCII characters, skip the conversion step. /*
724	if (isascii (*p) && BE (pstr->trans == NULL, `1`))
725	{
726	memset (&pstr->cur_state, `'\0'`, sizeof (mbstate_t));
727	/ pstr->valid_len = 0; /
728	wc = (wchar_t) *p;
729	}
730	else
731	#endif
732	for (; p >= end; --p)
733	if ((*p & `0xc0`) != `0x80`)
734	{
735	mbstate_t cur_state;
736	wchar_t wc2;
737	Idx mlen = raw + pstr->len - p;
738	unsigned char buf[`6`];
739	size_t mbclen;
740
741	const unsigned char *pp = p;
742	if (BE (pstr->trans != NULL, `0`))
743	{
744	int i = mlen < `6` ? mlen : `6`;
745	while (--i >= `0`)
746	buf[i] = pstr->trans[p[i]];
747	pp = buf;
748	}
749	/ XXX Don't use mbrtowc, we know which conversion*
750	to use (UTF-8 -> UCS4). /*
751	memset (&cur_state, `0`, sizeof (cur_state));
752	mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
753	&cur_state);
754	if (raw + offset - p <= mbclen
755	&& mbclen < (size_t) -`2`)
756	{
757	memset (&pstr->cur_state, `'\0'`,
758	sizeof (mbstate_t));
759	pstr->valid_len = mbclen - (raw + offset - p);
760	wc = wc2;
761	}
762	break;
763	}
764	}
765
766	if (wc == WEOF)
767	pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
768	if (wc == WEOF)
769	pstr->tip_context
770	= re_string_context_at (pstr, prev_valid_len - `1`, eflags);
771	else
772	pstr->tip_context = ((BE (pstr->word_ops_used != `0`, `0`)
773	&& IS_WIDE_WORD_CHAR (wc))
774	? CONTEXT_WORD
775	: ((IS_WIDE_NEWLINE (wc)
776	&& pstr->newline_anchor)
777	? CONTEXT_NEWLINE : `0`));
778	if (BE (pstr->valid_len, `0`))
779	{
780	for (wcs_idx = `0`; wcs_idx < pstr->valid_len; ++wcs_idx)
781	pstr->wcs[wcs_idx] = WEOF;
782	if (pstr->mbs_allocated)
783	memset (pstr->mbs, `255`, pstr->valid_len);
784	}
785	pstr->valid_raw_len = pstr->valid_len;
786	}
787	else
788	#endif /* RE_ENABLE_I18N */
789	{
790	int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - `1`];
791	pstr->valid_raw_len = `0`;
792	if (pstr->trans)
793	c = pstr->trans[c];
794	pstr->tip_context = (bitset_contain (pstr->word_char, c)
795	? CONTEXT_WORD
796	: ((IS_NEWLINE (c) && pstr->newline_anchor)
797	? CONTEXT_NEWLINE : `0`));
798	}
799	}
800	if (!BE (pstr->mbs_allocated, `0`))
801	pstr->mbs += offset;
802	}
803	pstr->raw_mbs_idx = idx;
804	pstr->len -= offset;
805	pstr->stop -= offset;
806
807	/ Then build the buffers. /
808	#ifdef RE_ENABLE_I18N
809	if (pstr->mb_cur_max > `1`)
810	{
811	if (pstr->icase)
812	{
813	reg_errcode_t ret = build_wcs_upper_buffer (pstr);
814	if (BE (ret != REG_NOERROR, `0`))
815	return ret;
816	}
817	else
818	build_wcs_buffer (pstr);
819	}
820	else
821	#endif /* RE_ENABLE_I18N */
822	if (BE (pstr->mbs_allocated, `0`))
823	{
824	if (pstr->icase)
825	build_upper_buffer (pstr);
826	else if (pstr->trans != NULL)
827	re_string_translate_buffer (pstr);
828	}
829	else
830	pstr->valid_len = pstr->len;
831
832	pstr->cur_idx = `0`;
833	return REG_NOERROR;
834	}
835
836	static unsigned char
837	__attribute__ ((pure))
838	re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
839	{
840	int ch;
841	Idx off;
842
843	/ Handle the common (easiest) cases first. /
844	if (BE (!pstr->mbs_allocated, `1`))
845	return re_string_peek_byte (pstr, idx);
846
847	#ifdef RE_ENABLE_I18N
848	if (pstr->mb_cur_max > `1`
849	&& ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
850	return re_string_peek_byte (pstr, idx);
851	#endif
852
853	off = pstr->cur_idx + idx;
854	#ifdef RE_ENABLE_I18N
855	if (pstr->offsets_needed)
856	off = pstr->offsets[off];
857	#endif
858
859	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
860
861	#ifdef RE_ENABLE_I18N
862	/ Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I*
863	this function returns CAPITAL LETTER I instead of first byte of
864	DOTLESS SMALL LETTER I. The latter would confuse the parser,
865	since peek_byte_case doesn't advance cur_idx in any way. /*
866	if (pstr->offsets_needed && !isascii (ch))
867	return re_string_peek_byte (pstr, idx);
868	#endif
869
870	return ch;
871	}
872
873	static unsigned char
874	re_string_fetch_byte_case (re_string_t *pstr)
875	{
876	if (BE (!pstr->mbs_allocated, `1`))
877	return re_string_fetch_byte (pstr);
878
879	#ifdef RE_ENABLE_I18N
880	if (pstr->offsets_needed)
881	{
882	Idx off;
883	int ch;
884
885	/ For tr_TR.UTF-8 [[:islower:]] there is*
886	[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
887	in that case the whole multi-byte character and return
888	the original letter. On the other side, with
889	[[: DOTLESS SMALL LETTER I return [[:I, as doing
890	anything else would complicate things too much. /*
891
892	if (!re_string_first_byte (pstr, pstr->cur_idx))
893	return re_string_fetch_byte (pstr);
894
895	off = pstr->offsets[pstr->cur_idx];
896	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
897
898	if (! isascii (ch))
899	return re_string_fetch_byte (pstr);
900
901	re_string_skip_bytes (pstr,
902	re_string_char_size_at (pstr, pstr->cur_idx));
903	return ch;
904	}
905	#endif
906
907	return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
908	}
909
910	static void
911	re_string_destruct (re_string_t *pstr)
912	{
913	#ifdef RE_ENABLE_I18N
914	re_free (pstr->wcs);
915	re_free (pstr->offsets);
916	#endif /* RE_ENABLE_I18N */
917	if (pstr->mbs_allocated)
918	re_free (pstr->mbs);
919	}
920
921	/ Return the context at IDX in INPUT. /
922
923	static unsigned int
924	re_string_context_at (const re_string_t input, Idx idx, int* eflags)
925	{
926	int c;
927	if (BE (idx < `0`, `0`))
928	/ In this case, we use the value stored in input->tip_context,*
929	since we can't know the character in input->mbs[-1] here. /*
930	return input->tip_context;
931	if (BE (idx == input->len, `0`))
932	return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
933	: CONTEXT_NEWLINE \| CONTEXT_ENDBUF);
934	#ifdef RE_ENABLE_I18N
935	if (input->mb_cur_max > `1`)
936	{
937	wint_t wc;
938	Idx wc_idx = idx;
939	while(input->wcs[wc_idx] == WEOF)
940	{
941	#if defined DEBUG && DEBUG
942	/ It must not happen. /
943	assert (wc_idx >= `0`);
944	#endif
945	--wc_idx;
946	if (wc_idx < `0`)
947	return input->tip_context;
948	}
949	wc = input->wcs[wc_idx];
950	if (BE (input->word_ops_used != `0`, `0`) && IS_WIDE_WORD_CHAR (wc))
951	return CONTEXT_WORD;
952	return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
953	? CONTEXT_NEWLINE : `0`);
954	}
955	else
956	#endif
957	{
958	c = re_string_byte_at (input, idx);
959	if (bitset_contain (input->word_char, c))
960	return CONTEXT_WORD;
961	return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : `0`;
962	}
963	}
964
965	/ Functions for set operation. /
966
967	static reg_errcode_t
968	__attribute_warn_unused_result__
969	re_node_set_alloc (re_node_set *set, Idx size)
970	{
971	set->alloc = size;
972	set->nelem = `0`;
973	set->elems = re_malloc (Idx, size);
974	if (BE (set->elems == NULL, `0`) && (MALLOC_0_IS_NONNULL \|\| size != `0`))
975	return REG_ESPACE;
976	return REG_NOERROR;
977	}
978
979	static reg_errcode_t
980	__attribute_warn_unused_result__
981	re_node_set_init_1 (re_node_set *set, Idx elem)
982	{
983	set->alloc = `1`;
984	set->nelem = `1`;
985	set->elems = re_malloc (Idx, `1`);
986	if (BE (set->elems == NULL, `0`))
987	{
988	set->alloc = set->nelem = `0`;
989	return REG_ESPACE;
990	}
991	set->elems[`0`] = elem;
992	return REG_NOERROR;
993	}
994
995	static reg_errcode_t
996	__attribute_warn_unused_result__
997	re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
998	{
999	set->alloc = `2`;
1000	set->elems = re_malloc (Idx, `2`);
1001	if (BE (set->elems == NULL, `0`))
1002	return REG_ESPACE;
1003	if (elem1 == elem2)
1004	{
1005	set->nelem = `1`;
1006	set->elems[`0`] = elem1;
1007	}
1008	else
1009	{
1010	set->nelem = `2`;
1011	if (elem1 < elem2)
1012	{
1013	set->elems[`0`] = elem1;
1014	set->elems[`1`] = elem2;
1015	}
1016	else
1017	{
1018	set->elems[`0`] = elem2;
1019	set->elems[`1`] = elem1;
1020	}
1021	}
1022	return REG_NOERROR;
1023	}
1024
1025	static reg_errcode_t
1026	__attribute_warn_unused_result__
1027	re_node_set_init_copy (re_node_set dest, const* re_node_set *src)
1028	{
1029	dest->nelem = src->nelem;
1030	if (src->nelem > `0`)
1031	{
1032	dest->alloc = dest->nelem;
1033	dest->elems = re_malloc (Idx, dest->alloc);
1034	if (BE (dest->elems == NULL, `0`))
1035	{
1036	dest->alloc = dest->nelem = `0`;
1037	return REG_ESPACE;
1038	}
1039	memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1040	}
1041	else
1042	re_node_set_init_empty (dest);
1043	return REG_NOERROR;
1044	}
1045
1046	/ Calculate the intersection of the sets SRC1 and SRC2. And merge it to*
1047	DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1048	Note: We assume dest->elems is NULL, when dest->alloc is 0. /*
1049
1050	static reg_errcode_t
1051	__attribute_warn_unused_result__
1052	re_node_set_add_intersect (re_node_set dest, const* re_node_set *src1,
1053	const re_node_set *src2)
1054	{
1055	Idx i1, i2, is, id, delta, sbase;
1056	if (src1->nelem == `0` \|\| src2->nelem == `0`)
1057	return REG_NOERROR;
1058
1059	/ We need dest->nelem + 2 * elems_in_intersection; this is a*
1060	conservative estimate. /*
1061	if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1062	{
1063	Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1064	Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1065	if (BE (new_elems == NULL, `0`))
1066	return REG_ESPACE;
1067	dest->elems = new_elems;
1068	dest->alloc = new_alloc;
1069	}
1070
1071	/ Find the items in the intersection of SRC1 and SRC2, and copy*
1072	into the top of DEST those that are not already in DEST itself. /*
1073	sbase = dest->nelem + src1->nelem + src2->nelem;
1074	i1 = src1->nelem - `1`;
1075	i2 = src2->nelem - `1`;
1076	id = dest->nelem - `1`;
1077	for (;;)
1078	{
1079	if (src1->elems[i1] == src2->elems[i2])
1080	{
1081	/ Try to find the item in DEST. Maybe we could binary search? /
1082	while (id >= `0` && dest->elems[id] > src1->elems[i1])
1083	--id;
1084
1085	if (id < `0` \|\| dest->elems[id] != src1->elems[i1])
1086	dest->elems[--sbase] = src1->elems[i1];
1087
1088	if (--i1 < `0` \|\| --i2 < `0`)
1089	break;
1090	}
1091
1092	/ Lower the highest of the two items. /
1093	else if (src1->elems[i1] < src2->elems[i2])
1094	{
1095	if (--i2 < `0`)
1096	break;
1097	}
1098	else
1099	{
1100	if (--i1 < `0`)
1101	break;
1102	}
1103	}
1104
1105	id = dest->nelem - `1`;
1106	is = dest->nelem + src1->nelem + src2->nelem - `1`;
1107	delta = is - sbase + `1`;
1108
1109	/ Now copy. When DELTA becomes zero, the remaining*
1110	DEST elements are already in place; this is more or
1111	less the same loop that is in re_node_set_merge. /*
1112	dest->nelem += delta;
1113	if (delta > `0` && id >= `0`)
1114	for (;;)
1115	{
1116	if (dest->elems[is] > dest->elems[id])
1117	{
1118	/ Copy from the top. /
1119	dest->elems[id + delta--] = dest->elems[is--];
1120	if (delta == `0`)
1121	break;
1122	}
1123	else
1124	{
1125	/ Slide from the bottom. /
1126	dest->elems[id + delta] = dest->elems[id];
1127	if (--id < `0`)
1128	break;
1129	}
1130	}
1131
1132	/ Copy remaining SRC elements. /
1133	memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1134
1135	return REG_NOERROR;
1136	}
1137
1138	/ Calculate the union set of the sets SRC1 and SRC2. And store it to*
1139	DEST. Return value indicate the error code or REG_NOERROR if succeeded. /*
1140
1141	static reg_errcode_t
1142	__attribute_warn_unused_result__
1143	re_node_set_init_union (re_node_set dest, const* re_node_set *src1,
1144	const re_node_set *src2)
1145	{
1146	Idx i1, i2, id;
1147	if (src1 != NULL && src1->nelem > `0` && src2 != NULL && src2->nelem > `0`)
1148	{
1149	dest->alloc = src1->nelem + src2->nelem;
1150	dest->elems = re_malloc (Idx, dest->alloc);
1151	if (BE (dest->elems == NULL, `0`))
1152	return REG_ESPACE;
1153	}
1154	else
1155	{
1156	if (src1 != NULL && src1->nelem > `0`)
1157	return re_node_set_init_copy (dest, src1);
1158	else if (src2 != NULL && src2->nelem > `0`)
1159	return re_node_set_init_copy (dest, src2);
1160	else
1161	re_node_set_init_empty (dest);
1162	return REG_NOERROR;
1163	}
1164	for (i1 = i2 = id = `0` ; i1 < src1->nelem && i2 < src2->nelem ;)
1165	{
1166	if (src1->elems[i1] > src2->elems[i2])
1167	{
1168	dest->elems[id++] = src2->elems[i2++];
1169	continue;
1170	}
1171	if (src1->elems[i1] == src2->elems[i2])
1172	++i2;
1173	dest->elems[id++] = src1->elems[i1++];
1174	}
1175	if (i1 < src1->nelem)
1176	{
1177	memcpy (dest->elems + id, src1->elems + i1,
1178	(src1->nelem - i1) * sizeof (Idx));
1179	id += src1->nelem - i1;
1180	}
1181	else if (i2 < src2->nelem)
1182	{
1183	memcpy (dest->elems + id, src2->elems + i2,
1184	(src2->nelem - i2) * sizeof (Idx));
1185	id += src2->nelem - i2;
1186	}
1187	dest->nelem = id;
1188	return REG_NOERROR;
1189	}
1190
1191	/ Calculate the union set of the sets DEST and SRC. And store it to*
1192	DEST. Return value indicate the error code or REG_NOERROR if succeeded. /*
1193
1194	static reg_errcode_t
1195	__attribute_warn_unused_result__
1196	re_node_set_merge (re_node_set dest, const* re_node_set *src)
1197	{
1198	Idx is, id, sbase, delta;
1199	if (src == NULL \|\| src->nelem == `0`)
1200	return REG_NOERROR;
1201	if (dest->alloc < `2` * src->nelem + dest->nelem)
1202	{
1203	Idx new_alloc = `2` * (src->nelem + dest->alloc);
1204	Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1205	if (BE (new_buffer == NULL, `0`))
1206	return REG_ESPACE;
1207	dest->elems = new_buffer;
1208	dest->alloc = new_alloc;
1209	}
1210
1211	if (BE (dest->nelem == `0`, `0`))
1212	{
1213	dest->nelem = src->nelem;
1214	memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1215	return REG_NOERROR;
1216	}
1217
1218	/ Copy into the top of DEST the items of SRC that are not*
1219	found in DEST. Maybe we could binary search in DEST? /*
1220	for (sbase = dest->nelem + `2` * src->nelem,
1221	is = src->nelem - `1`, id = dest->nelem - `1`; is >= `0` && id >= `0`; )
1222	{
1223	if (dest->elems[id] == src->elems[is])
1224	is--, id--;
1225	else if (dest->elems[id] < src->elems[is])
1226	dest->elems[--sbase] = src->elems[is--];
1227	else / if (dest->elems[id] > src->elems[is]) /
1228	--id;
1229	}
1230
1231	if (is >= `0`)
1232	{
1233	/ If DEST is exhausted, the remaining items of SRC must be unique. /
1234	sbase -= is + `1`;
1235	memcpy (dest->elems + sbase, src->elems, (is + `1`) * sizeof (Idx));
1236	}
1237
1238	id = dest->nelem - `1`;
1239	is = dest->nelem + `2` * src->nelem - `1`;
1240	delta = is - sbase + `1`;
1241	if (delta == `0`)
1242	return REG_NOERROR;
1243
1244	/ Now copy. When DELTA becomes zero, the remaining*
1245	DEST elements are already in place. /*
1246	dest->nelem += delta;
1247	for (;;)
1248	{
1249	if (dest->elems[is] > dest->elems[id])
1250	{
1251	/ Copy from the top. /
1252	dest->elems[id + delta--] = dest->elems[is--];
1253	if (delta == `0`)
1254	break;
1255	}
1256	else
1257	{
1258	/ Slide from the bottom. /
1259	dest->elems[id + delta] = dest->elems[id];
1260	if (--id < `0`)
1261	{
1262	/ Copy remaining SRC elements. /
1263	memcpy (dest->elems, dest->elems + sbase,
1264	delta * sizeof (Idx));
1265	break;
1266	}
1267	}
1268	}
1269
1270	return REG_NOERROR;
1271	}
1272
1273	/ Insert the new element ELEM to the re_node_set* SET.*
1274	SET should not already have ELEM.
1275	Return true if successful. /*
1276
1277	static bool
1278	__attribute_warn_unused_result__
1279	re_node_set_insert (re_node_set *set, Idx elem)
1280	{
1281	Idx idx;
1282	/ In case the set is empty. /
1283	if (set->alloc == `0`)
1284	return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, `1`);
1285
1286	if (BE (set->nelem, `0`) == `0`)
1287	{
1288	/ We already guaranteed above that set->alloc != 0. /
1289	set->elems[`0`] = elem;
1290	++set->nelem;
1291	return true;
1292	}
1293
1294	/ Realloc if we need. /
1295	if (set->alloc == set->nelem)
1296	{
1297	Idx *new_elems;
1298	set->alloc = set->alloc * `2`;
1299	new_elems = re_realloc (set->elems, Idx, set->alloc);
1300	if (BE (new_elems == NULL, `0`))
1301	return false;
1302	set->elems = new_elems;
1303	}
1304
1305	/ Move the elements which follows the new element. Test the*
1306	first element separately to skip a check in the inner loop. /*
1307	if (elem < set->elems[`0`])
1308	{
1309	idx = `0`;
1310	for (idx = set->nelem; idx > `0`; idx--)
1311	set->elems[idx] = set->elems[idx - `1`];
1312	}
1313	else
1314	{
1315	for (idx = set->nelem; set->elems[idx - `1`] > elem; idx--)
1316	set->elems[idx] = set->elems[idx - `1`];
1317	}
1318
1319	/ Insert the new element. /
1320	set->elems[idx] = elem;
1321	++set->nelem;
1322	return true;
1323	}
1324
1325	/ Insert the new element ELEM to the re_node_set* SET.*
1326	SET should not already have any element greater than or equal to ELEM.
1327	Return true if successful. /*
1328
1329	static bool
1330	__attribute_warn_unused_result__
1331	re_node_set_insert_last (re_node_set *set, Idx elem)
1332	{
1333	/ Realloc if we need. /
1334	if (set->alloc == set->nelem)
1335	{
1336	Idx *new_elems;
1337	set->alloc = (set->alloc + `1`) * `2`;
1338	new_elems = re_realloc (set->elems, Idx, set->alloc);
1339	if (BE (new_elems == NULL, `0`))
1340	return false;
1341	set->elems = new_elems;
1342	}
1343
1344	/ Insert the new element. /
1345	set->elems[set->nelem++] = elem;
1346	return true;
1347	}
1348
1349	/ Compare two node sets SET1 and SET2.*
1350	Return true if SET1 and SET2 are equivalent. /*
1351
1352	static bool
1353	__attribute__ ((pure))
1354	re_node_set_compare (const re_node_set set1, const* re_node_set *set2)
1355	{
1356	Idx i;
1357	if (set1 == NULL \|\| set2 == NULL \|\| set1->nelem != set2->nelem)
1358	return false;
1359	for (i = set1->nelem ; --i >= `0` ; )
1360	if (set1->elems[i] != set2->elems[i])
1361	return false;
1362	return true;
1363	}
1364
1365	/ Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. /
1366
1367	static Idx
1368	__attribute__ ((pure))
1369	re_node_set_contains (const re_node_set *set, Idx elem)
1370	{
1371	__re_size_t idx, right, mid;
1372	if (set->nelem <= `0`)
1373	return `0`;
1374
1375	/ Binary search the element. /
1376	idx = `0`;
1377	right = set->nelem - `1`;
1378	while (idx < right)
1379	{
1380	mid = (idx + right) / `2`;
1381	if (set->elems[mid] < elem)
1382	idx = mid + `1`;
1383	else
1384	right = mid;
1385	}
1386	return set->elems[idx] == elem ? idx + `1` : `0`;
1387	}
1388
1389	static void
1390	re_node_set_remove_at (re_node_set *set, Idx idx)
1391	{
1392	if (idx < `0` \|\| idx >= set->nelem)
1393	return;
1394	--set->nelem;
1395	for (; idx < set->nelem; idx++)
1396	set->elems[idx] = set->elems[idx + `1`];
1397	}
1398
1399
1400	/ Add the token TOKEN to dfa->nodes, and return the index of the token.*
1401	Or return -1 if an error occurred. /*
1402
1403	static Idx
1404	re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1405	{
1406	if (BE (dfa->nodes_len >= dfa->nodes_alloc, `0`))
1407	{
1408	size_t new_nodes_alloc = dfa->nodes_alloc * `2`;
1409	Idx new_nexts, new_indices;
1410	re_node_set new_edests, new_eclosures;
1411	re_token_t *new_nodes;
1412
1413	/ Avoid overflows in realloc. /
1414	const size_t max_object_size = MAX (sizeof (re_token_t),
1415	MAX (sizeof (re_node_set),
1416	sizeof (Idx)));
1417	if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, `0`))
1418	return -`1`;
1419
1420	new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1421	if (BE (new_nodes == NULL, `0`))
1422	return -`1`;
1423	dfa->nodes = new_nodes;
1424	new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1425	new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1426	new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1427	new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1428	if (BE (new_nexts == NULL \|\| new_indices == NULL
1429	\|\| new_edests == NULL \|\| new_eclosures == NULL, `0`))
1430	{
1431	re_free (new_nexts);
1432	re_free (new_indices);
1433	re_free (new_edests);
1434	re_free (new_eclosures);
1435	return -`1`;
1436	}
1437	dfa->nexts = new_nexts;
1438	dfa->org_indices = new_indices;
1439	dfa->edests = new_edests;
1440	dfa->eclosures = new_eclosures;
1441	dfa->nodes_alloc = new_nodes_alloc;
1442	}
1443	dfa->nodes[dfa->nodes_len] = token;
1444	dfa->nodes[dfa->nodes_len].constraint = `0`;
1445	#ifdef RE_ENABLE_I18N
1446	dfa->nodes[dfa->nodes_len].accept_mb =
1447	((token.type == OP_PERIOD && dfa->mb_cur_max > `1`)
1448	\|\| token.type == COMPLEX_BRACKET);
1449	#endif
1450	dfa->nexts[dfa->nodes_len] = -`1`;
1451	re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1452	re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1453	return dfa->nodes_len++;
1454	}
1455
1456	static re_hashval_t
1457	calc_state_hash (const re_node_set nodes, unsigned* int context)
1458	{
1459	re_hashval_t hash = nodes->nelem + context;
1460	Idx i;
1461	for (i = `0` ; i < nodes->nelem ; i++)
1462	hash += nodes->elems[i];
1463	return hash;
1464	}
1465
1466	/ Search for the state whose node_set is equivalent to NODES.*
1467	Return the pointer to the state, if we found it in the DFA.
1468	Otherwise create the new one and return it. In case of an error
1469	return NULL and set the error code in ERR.
1470	Note: - We assume NULL as the invalid state, then it is possible that
1471	return value is NULL and ERR is REG_NOERROR.
1472	- We never return non-NULL value in case of any errors, it is for
1473	optimization. /*
1474
1475	static re_dfastate_t *
1476	__attribute_warn_unused_result__
1477	re_acquire_state (reg_errcode_t err, const* re_dfa_t *dfa,
1478	const re_node_set *nodes)
1479	{
1480	re_hashval_t hash;
1481	re_dfastate_t *new_state;
1482	struct re_state_table_entry *spot;
1483	Idx i;
1484	#if defined GCC_LINT \|\| defined lint
1485	/ Suppress bogus uninitialized-variable warnings. /
1486	*err = REG_NOERROR;
1487	#endif
1488	if (BE (nodes->nelem == `0`, `0`))
1489	{
1490	*err = REG_NOERROR;
1491	return NULL;
1492	}
1493	hash = calc_state_hash (nodes, `0`);
1494	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1495
1496	for (i = `0` ; i < spot->num ; i++)
1497	{
1498	re_dfastate_t *state = spot->array[i];
1499	if (hash != state->hash)
1500	continue;
1501	if (re_node_set_compare (&state->nodes, nodes))
1502	return state;
1503	}
1504
1505	/ There are no appropriate state in the dfa, create the new one. /
1506	new_state = create_ci_newstate (dfa, nodes, hash);
1507	if (BE (new_state == NULL, `0`))
1508	*err = REG_ESPACE;
1509
1510	return new_state;
1511	}
1512
1513	/ Search for the state whose node_set is equivalent to NODES and*
1514	whose context is equivalent to CONTEXT.
1515	Return the pointer to the state, if we found it in the DFA.
1516	Otherwise create the new one and return it. In case of an error
1517	return NULL and set the error code in ERR.
1518	Note: - We assume NULL as the invalid state, then it is possible that
1519	return value is NULL and ERR is REG_NOERROR.
1520	- We never return non-NULL value in case of any errors, it is for
1521	optimization. /*
1522
1523	static re_dfastate_t *
1524	__attribute_warn_unused_result__
1525	re_acquire_state_context (reg_errcode_t err, const* re_dfa_t *dfa,
1526	const re_node_set nodes, unsigned* int context)
1527	{
1528	re_hashval_t hash;
1529	re_dfastate_t *new_state;
1530	struct re_state_table_entry *spot;
1531	Idx i;
1532	#if defined GCC_LINT \|\| defined lint
1533	/ Suppress bogus uninitialized-variable warnings. /
1534	*err = REG_NOERROR;
1535	#endif
1536	if (nodes->nelem == `0`)
1537	{
1538	*err = REG_NOERROR;
1539	return NULL;
1540	}
1541	hash = calc_state_hash (nodes, context);
1542	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1543
1544	for (i = `0` ; i < spot->num ; i++)
1545	{
1546	re_dfastate_t *state = spot->array[i];
1547	if (state->hash == hash
1548	&& state->context == context
1549	&& re_node_set_compare (state->entrance_nodes, nodes))
1550	return state;
1551	}
1552	/ There are no appropriate state in 'dfa', create the new one. /
1553	new_state = create_cd_newstate (dfa, nodes, context, hash);
1554	if (BE (new_state == NULL, `0`))
1555	*err = REG_ESPACE;
1556
1557	return new_state;
1558	}
1559
1560	/ Finish initialization of the new state NEWSTATE, and using its hash value*
1561	HASH put in the appropriate bucket of DFA's state table. Return value
1562	indicates the error code if failed. /*
1563
1564	static reg_errcode_t
1565	__attribute_warn_unused_result__
1566	register_state (const re_dfa_t dfa, re_dfastate_t newstate,
1567	re_hashval_t hash)
1568	{
1569	struct re_state_table_entry *spot;
1570	reg_errcode_t err;
1571	Idx i;
1572
1573	newstate->hash = hash;
1574	err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1575	if (BE (err != REG_NOERROR, `0`))
1576	return REG_ESPACE;
1577	for (i = `0`; i < newstate->nodes.nelem; i++)
1578	{
1579	Idx elem = newstate->nodes.elems[i];
1580	if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1581	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1582	return REG_ESPACE;
1583	}
1584
1585	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1586	if (BE (spot->alloc <= spot->num, `0`))
1587	{
1588	Idx new_alloc = `2` * spot->num + `2`;
1589	re_dfastate_t *new_array = re_realloc (spot->array, re_dfastate_t ,
1590	new_alloc);
1591	if (BE (new_array == NULL, `0`))
1592	return REG_ESPACE;
1593	spot->array = new_array;
1594	spot->alloc = new_alloc;
1595	}
1596	spot->array[spot->num++] = newstate;
1597	return REG_NOERROR;
1598	}
1599
1600	static void
1601	free_state (re_dfastate_t *state)
1602	{
1603	re_node_set_free (&state->non_eps_nodes);
1604	re_node_set_free (&state->inveclosure);
1605	if (state->entrance_nodes != &state->nodes)
1606	{
1607	re_node_set_free (state->entrance_nodes);
1608	re_free (state->entrance_nodes);
1609	}
1610	re_node_set_free (&state->nodes);
1611	re_free (state->word_trtable);
1612	re_free (state->trtable);
1613	re_free (state);
1614	}
1615
1616	/ Create the new state which is independent of contexts.*
1617	Return the new state if succeeded, otherwise return NULL. /*
1618
1619	static re_dfastate_t *
1620	__attribute_warn_unused_result__
1621	create_ci_newstate (const re_dfa_t dfa, const* re_node_set *nodes,
1622	re_hashval_t hash)
1623	{
1624	Idx i;
1625	reg_errcode_t err;
1626	re_dfastate_t *newstate;
1627
1628	newstate = (re_dfastate_t ) calloc (sizeof* (re_dfastate_t), `1`);
1629	if (BE (newstate == NULL, `0`))
1630	return NULL;
1631	err = re_node_set_init_copy (&newstate->nodes, nodes);
1632	if (BE (err != REG_NOERROR, `0`))
1633	{
1634	re_free (newstate);
1635	return NULL;
1636	}
1637
1638	newstate->entrance_nodes = &newstate->nodes;
1639	for (i = `0` ; i < nodes->nelem ; i++)
1640	{
1641	re_token_t *node = dfa->nodes + nodes->elems[i];
1642	re_token_type_t type = node->type;
1643	if (type == CHARACTER && !node->constraint)
1644	continue;
1645	#ifdef RE_ENABLE_I18N
1646	newstate->accept_mb \|= node->accept_mb;
1647	#endif /* RE_ENABLE_I18N */
1648
1649	/ If the state has the halt node, the state is a halt state. /
1650	if (type == END_OF_RE)
1651	newstate->halt = `1`;
1652	else if (type == OP_BACK_REF)
1653	newstate->has_backref = `1`;
1654	else if (type == ANCHOR \|\| node->constraint)
1655	newstate->has_constraint = `1`;
1656	}
1657	err = register_state (dfa, newstate, hash);
1658	if (BE (err != REG_NOERROR, `0`))
1659	{
1660	free_state (newstate);
1661	newstate = NULL;
1662	}
1663	return newstate;
1664	}
1665
1666	/ Create the new state which is depend on the context CONTEXT.*
1667	Return the new state if succeeded, otherwise return NULL. /*
1668
1669	static re_dfastate_t *
1670	__attribute_warn_unused_result__
1671	create_cd_newstate (const re_dfa_t dfa, const* re_node_set *nodes,
1672	unsigned int context, re_hashval_t hash)
1673	{
1674	Idx i, nctx_nodes = `0`;
1675	reg_errcode_t err;
1676	re_dfastate_t *newstate;
1677
1678	newstate = (re_dfastate_t ) calloc (sizeof* (re_dfastate_t), `1`);
1679	if (BE (newstate == NULL, `0`))
1680	return NULL;
1681	err = re_node_set_init_copy (&newstate->nodes, nodes);
1682	if (BE (err != REG_NOERROR, `0`))
1683	{
1684	re_free (newstate);
1685	return NULL;
1686	}
1687
1688	newstate->context = context;
1689	newstate->entrance_nodes = &newstate->nodes;
1690
1691	for (i = `0` ; i < nodes->nelem ; i++)
1692	{
1693	re_token_t *node = dfa->nodes + nodes->elems[i];
1694	re_token_type_t type = node->type;
1695	unsigned int constraint = node->constraint;
1696
1697	if (type == CHARACTER && !constraint)
1698	continue;
1699	#ifdef RE_ENABLE_I18N
1700	newstate->accept_mb \|= node->accept_mb;
1701	#endif /* RE_ENABLE_I18N */
1702
1703	/ If the state has the halt node, the state is a halt state. /
1704	if (type == END_OF_RE)
1705	newstate->halt = `1`;
1706	else if (type == OP_BACK_REF)
1707	newstate->has_backref = `1`;
1708
1709	if (constraint)
1710	{
1711	if (newstate->entrance_nodes == &newstate->nodes)
1712	{
1713	newstate->entrance_nodes = re_malloc (re_node_set, `1`);
1714	if (BE (newstate->entrance_nodes == NULL, `0`))
1715	{
1716	free_state (newstate);
1717	return NULL;
1718	}
1719	if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1720	!= REG_NOERROR)
1721	return NULL;
1722	nctx_nodes = `0`;
1723	newstate->has_constraint = `1`;
1724	}
1725
1726	if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1727	{
1728	re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1729	++nctx_nodes;
1730	}
1731	}
1732	}
1733	err = register_state (dfa, newstate, hash);
1734	if (BE (err != REG_NOERROR, `0`))
1735	{
1736	free_state (newstate);
1737	newstate = NULL;
1738	}
1739	return newstate;
1740	}
1741

Browse the source code of glibc/posix/regex_internal.c