regex_internal.c source code [glibc/posix/regex_internal.c]

1	/ Extended regular expression matching and search library.*
2	Copyright (C) 2002-2018 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, see
18	<http://www.gnu.org/licenses/>. /*
19
20	static void re_string_construct_common (const char str, int* len,
21	re_string_t *pstr,
22	RE_TRANSLATE_TYPE trans, int icase,
23	const re_dfa_t *dfa);
24	static re_dfastate_t create_ci_newstate (const* re_dfa_t *dfa,
25	const re_node_set *nodes,
26	unsigned int hash);
27	static re_dfastate_t create_cd_newstate (const* re_dfa_t *dfa,
28	const re_node_set *nodes,
29	unsigned int context,
30	unsigned int hash);
31
32	/ Functions for string operation. /
33
34	/ This function allocate the buffers. It is necessary to call*
35	re_string_reconstruct before using the object. /*
36
37	static reg_errcode_t
38	__attribute_warn_unused_result__
39	re_string_allocate (re_string_t pstr, const* char str, int* len, int init_len,
40	RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
41	{
42	reg_errcode_t ret;
43	int init_buf_len;
44
45	/ Ensure at least one character fits into the buffers. /
46	if (init_len < dfa->mb_cur_max)
47	init_len = dfa->mb_cur_max;
48	init_buf_len = (len + `1` < init_len) ? len + `1`: init_len;
49	re_string_construct_common (str, len, pstr, trans, icase, dfa);
50
51	ret = re_string_realloc_buffers (pstr, init_buf_len);
52	if (BE (ret != REG_NOERROR, `0`))
53	return ret;
54
55	pstr->word_char = dfa->word_char;
56	pstr->word_ops_used = dfa->word_ops_used;
57	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58	pstr->valid_len = (pstr->mbs_allocated \|\| dfa->mb_cur_max > `1`) ? `0` : len;
59	pstr->valid_raw_len = pstr->valid_len;
60	return REG_NOERROR;
61	}
62
63	/ This function allocate the buffers, and initialize them. /
64
65	static reg_errcode_t
66	__attribute_warn_unused_result__
67	re_string_construct (re_string_t pstr, const* char str, int* len,
68	RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
69	{
70	reg_errcode_t ret;
71	memset (pstr, `'\0'`, sizeof (re_string_t));
72	re_string_construct_common (str, len, pstr, trans, icase, dfa);
73
74	if (len > `0`)
75	{
76	ret = re_string_realloc_buffers (pstr, len + `1`);
77	if (BE (ret != REG_NOERROR, `0`))
78	return ret;
79	}
80	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
81
82	if (icase)
83	{
84	#ifdef RE_ENABLE_I18N
85	if (dfa->mb_cur_max > `1`)
86	{
87	while (`1`)
88	{
89	ret = build_wcs_upper_buffer (pstr);
90	if (BE (ret != REG_NOERROR, `0`))
91	return ret;
92	if (pstr->valid_raw_len >= len)
93	break;
94	if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
95	break;
96	ret = re_string_realloc_buffers (pstr, pstr->bufs_len * `2`);
97	if (BE (ret != REG_NOERROR, `0`))
98	return ret;
99	}
100	}
101	else
102	#endif /* RE_ENABLE_I18N */
103	build_upper_buffer (pstr);
104	}
105	else
106	{
107	#ifdef RE_ENABLE_I18N
108	if (dfa->mb_cur_max > `1`)
109	build_wcs_buffer (pstr);
110	else
111	#endif /* RE_ENABLE_I18N */
112	{
113	if (trans != NULL)
114	re_string_translate_buffer (pstr);
115	else
116	{
117	pstr->valid_len = pstr->bufs_len;
118	pstr->valid_raw_len = pstr->bufs_len;
119	}
120	}
121	}
122
123	return REG_NOERROR;
124	}
125
126	/ Helper functions for re_string_allocate, and re_string_construct. /
127
128	static reg_errcode_t
129	__attribute_warn_unused_result__
130	re_string_realloc_buffers (re_string_t pstr, int* new_buf_len)
131	{
132	#ifdef RE_ENABLE_I18N
133	if (pstr->mb_cur_max > `1`)
134	{
135	wint_t *new_wcs;
136
137	/ Avoid overflow in realloc. /
138	const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
139	if (BE (SIZE_MAX / max_object_size < new_buf_len, `0`))
140	return REG_ESPACE;
141
142	new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
143	if (BE (new_wcs == NULL, `0`))
144	return REG_ESPACE;
145	pstr->wcs = new_wcs;
146	if (pstr->offsets != NULL)
147	{
148	int new_offsets = re_realloc (pstr->offsets, int*, new_buf_len);
149	if (BE (new_offsets == NULL, `0`))
150	return REG_ESPACE;
151	pstr->offsets = new_offsets;
152	}
153	}
154	#endif /* RE_ENABLE_I18N */
155	if (pstr->mbs_allocated)
156	{
157	unsigned char new_mbs = re_realloc (pstr->mbs, unsigned* char,
158	new_buf_len);
159	if (BE (new_mbs == NULL, `0`))
160	return REG_ESPACE;
161	pstr->mbs = new_mbs;
162	}
163	pstr->bufs_len = new_buf_len;
164	return REG_NOERROR;
165	}
166
167
168	static void
169	re_string_construct_common (const char str, int* len, re_string_t *pstr,
170	RE_TRANSLATE_TYPE trans, int icase,
171	const re_dfa_t *dfa)
172	{
173	pstr->raw_mbs = (const unsigned char *) str;
174	pstr->len = len;
175	pstr->raw_len = len;
176	pstr->trans = trans;
177	pstr->icase = icase ? `1` : `0`;
178	pstr->mbs_allocated = (trans != NULL \|\| icase);
179	pstr->mb_cur_max = dfa->mb_cur_max;
180	pstr->is_utf8 = dfa->is_utf8;
181	pstr->map_notascii = dfa->map_notascii;
182	pstr->stop = pstr->len;
183	pstr->raw_stop = pstr->stop;
184	}
185
186	#ifdef RE_ENABLE_I18N
187
188	/ Build wide character buffer PSTR->WCS.*
189	If the byte sequence of the string are:
190	<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
191	Then wide character buffer will be:
192	<wc1> , WEOF , <wc2> , WEOF , <wc3>
193	We use WEOF for padding, they indicate that the position isn't
194	a first byte of a multibyte character.
195
196	Note that this function assumes PSTR->VALID_LEN elements are already
197	built and starts from PSTR->VALID_LEN. /*
198
199	static void
200	build_wcs_buffer (re_string_t *pstr)
201	{
202	#ifdef _LIBC
203	unsigned char buf[MB_LEN_MAX];
204	assert (MB_LEN_MAX >= pstr->mb_cur_max);
205	#else
206	unsigned char buf[`64`];
207	#endif
208	mbstate_t prev_st;
209	int byte_idx, end_idx, remain_len;
210	size_t mbclen;
211
212	/ Build the buffers from pstr->valid_len to either pstr->len or*
213	pstr->bufs_len. /*
214	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
215	for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
216	{
217	wchar_t wc;
218	const char *p;
219
220	remain_len = end_idx - byte_idx;
221	prev_st = pstr->cur_state;
222	/ Apply the translation if we need. /
223	if (BE (pstr->trans != NULL, `0`))
224	{
225	int i, ch;
226
227	for (i = `0`; i < pstr->mb_cur_max && i < remain_len; ++i)
228	{
229	ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
230	buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
231	}
232	p = (const char *) buf;
233	}
234	else
235	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
236	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
237	if (BE (mbclen == (size_t) -`1` \|\| mbclen == `0`
238	\|\| (mbclen == (size_t) -`2` && pstr->bufs_len >= pstr->len), `0`))
239	{
240	/ We treat these cases as a singlebyte character. /
241	mbclen = `1`;
242	wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
243	if (BE (pstr->trans != NULL, `0`))
244	wc = pstr->trans[wc];
245	pstr->cur_state = prev_st;
246	}
247	else if (BE (mbclen == (size_t) -`2`, `0`))
248	{
249	/ The buffer doesn't have enough space, finish to build. /
250	pstr->cur_state = prev_st;
251	break;
252	}
253
254	/ Write wide character and padding. /
255	pstr->wcs[byte_idx++] = wc;
256	/ Write paddings. /
257	for (remain_len = byte_idx + mbclen - `1`; byte_idx < remain_len ;)
258	pstr->wcs[byte_idx++] = WEOF;
259	}
260	pstr->valid_len = byte_idx;
261	pstr->valid_raw_len = byte_idx;
262	}
263
264	/ Build wide character buffer PSTR->WCS like build_wcs_buffer,*
265	but for REG_ICASE. /*
266
267	static reg_errcode_t
268	__attribute_warn_unused_result__
269	build_wcs_upper_buffer (re_string_t *pstr)
270	{
271	mbstate_t prev_st;
272	int src_idx, byte_idx, end_idx, remain_len;
273	size_t mbclen;
274	#ifdef _LIBC
275	char buf[MB_LEN_MAX];
276	assert (MB_LEN_MAX >= pstr->mb_cur_max);
277	#else
278	char buf[`64`];
279	#endif
280
281	byte_idx = pstr->valid_len;
282	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
283
284	/ The following optimization assumes that ASCII characters can be*
285	mapped to wide characters with a simple cast. /*
286	if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
287	{
288	while (byte_idx < end_idx)
289	{
290	wchar_t wc;
291
292	if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
293	&& mbsinit (&pstr->cur_state))
294	{
295	/ In case of a singlebyte character. /
296	pstr->mbs[byte_idx]
297	= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
298	/ The next step uses the assumption that wchar_t is encoded*
299	ASCII-safe: all ASCII values can be converted like this. /*
300	pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
301	++byte_idx;
302	continue;
303	}
304
305	remain_len = end_idx - byte_idx;
306	prev_st = pstr->cur_state;
307	mbclen = __mbrtowc (&wc,
308	((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
309	+ byte_idx), remain_len, &pstr->cur_state);
310	if (BE (mbclen + `2` > `2`, `1`))
311	{
312	wchar_t wcu = wc;
313	if (__iswlower (wc))
314	{
315	size_t mbcdlen;
316
317	wcu = __towupper (wc);
318	mbcdlen = __wcrtomb (buf, wcu, &prev_st);
319	if (BE (mbclen == mbcdlen, `1`))
320	memcpy (pstr->mbs + byte_idx, buf, mbclen);
321	else
322	{
323	src_idx = byte_idx;
324	goto offsets_needed;
325	}
326	}
327	else
328	memcpy (pstr->mbs + byte_idx,
329	pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
330	pstr->wcs[byte_idx++] = wcu;
331	/ Write paddings. /
332	for (remain_len = byte_idx + mbclen - `1`; byte_idx < remain_len ;)
333	pstr->wcs[byte_idx++] = WEOF;
334	}
335	else if (mbclen == (size_t) -`1` \|\| mbclen == `0`
336	\|\| (mbclen == (size_t) -`2` && pstr->bufs_len >= pstr->len))
337	{
338	/ It is an invalid character, an incomplete character*
339	at the end of the string, or '\0'. Just use the byte. /*
340	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
341	pstr->mbs[byte_idx] = ch;
342	/ And also cast it to wide char. /
343	pstr->wcs[byte_idx++] = (wchar_t) ch;
344	if (BE (mbclen == (size_t) -`1`, `0`))
345	pstr->cur_state = prev_st;
346	}
347	else
348	{
349	/ The buffer doesn't have enough space, finish to build. /
350	pstr->cur_state = prev_st;
351	break;
352	}
353	}
354	pstr->valid_len = byte_idx;
355	pstr->valid_raw_len = byte_idx;
356	return REG_NOERROR;
357	}
358	else
359	for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
360	{
361	wchar_t wc;
362	const char *p;
363	offsets_needed:
364	remain_len = end_idx - byte_idx;
365	prev_st = pstr->cur_state;
366	if (BE (pstr->trans != NULL, `0`))
367	{
368	int i, ch;
369
370	for (i = `0`; i < pstr->mb_cur_max && i < remain_len; ++i)
371	{
372	ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
373	buf[i] = pstr->trans[ch];
374	}
375	p = (const char *) buf;
376	}
377	else
378	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
379	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
380	if (BE (mbclen + `2` > `2`, `1`))
381	{
382	wchar_t wcu = wc;
383	if (__iswlower (wc))
384	{
385	size_t mbcdlen;
386
387	wcu = __towupper (wc);
388	mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
389	if (BE (mbclen == mbcdlen, `1`))
390	memcpy (pstr->mbs + byte_idx, buf, mbclen);
391	else if (mbcdlen != (size_t) -`1`)
392	{
393	size_t i;
394
395	if (byte_idx + mbcdlen > pstr->bufs_len)
396	{
397	pstr->cur_state = prev_st;
398	break;
399	}
400
401	if (pstr->offsets == NULL)
402	{
403	pstr->offsets = re_malloc (int, pstr->bufs_len);
404
405	if (pstr->offsets == NULL)
406	return REG_ESPACE;
407	}
408	if (!pstr->offsets_needed)
409	{
410	for (i = `0`; i < (size_t) byte_idx; ++i)
411	pstr->offsets[i] = i;
412	pstr->offsets_needed = `1`;
413	}
414
415	memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
416	pstr->wcs[byte_idx] = wcu;
417	pstr->offsets[byte_idx] = src_idx;
418	for (i = `1`; i < mbcdlen; ++i)
419	{
420	pstr->offsets[byte_idx + i]
421	= src_idx + (i < mbclen ? i : mbclen - `1`);
422	pstr->wcs[byte_idx + i] = WEOF;
423	}
424	pstr->len += mbcdlen - mbclen;
425	if (pstr->raw_stop > src_idx)
426	pstr->stop += mbcdlen - mbclen;
427	end_idx = (pstr->bufs_len > pstr->len)
428	? pstr->len : pstr->bufs_len;
429	byte_idx += mbcdlen;
430	src_idx += mbclen;
431	continue;
432	}
433	else
434	memcpy (pstr->mbs + byte_idx, p, mbclen);
435	}
436	else
437	memcpy (pstr->mbs + byte_idx, p, mbclen);
438
439	if (BE (pstr->offsets_needed != `0`, `0`))
440	{
441	size_t i;
442	for (i = `0`; i < mbclen; ++i)
443	pstr->offsets[byte_idx + i] = src_idx + i;
444	}
445	src_idx += mbclen;
446
447	pstr->wcs[byte_idx++] = wcu;
448	/ Write paddings. /
449	for (remain_len = byte_idx + mbclen - `1`; byte_idx < remain_len ;)
450	pstr->wcs[byte_idx++] = WEOF;
451	}
452	else if (mbclen == (size_t) -`1` \|\| mbclen == `0`
453	\|\| (mbclen == (size_t) -`2` && pstr->bufs_len >= pstr->len))
454	{
455	/ It is an invalid character or '\0'. Just use the byte. /
456	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
457
458	if (BE (pstr->trans != NULL, `0`))
459	ch = pstr->trans [ch];
460	pstr->mbs[byte_idx] = ch;
461
462	if (BE (pstr->offsets_needed != `0`, `0`))
463	pstr->offsets[byte_idx] = src_idx;
464	++src_idx;
465
466	/ And also cast it to wide char. /
467	pstr->wcs[byte_idx++] = (wchar_t) ch;
468	if (BE (mbclen == (size_t) -`1`, `0`))
469	pstr->cur_state = prev_st;
470	}
471	else
472	{
473	/ The buffer doesn't have enough space, finish to build. /
474	pstr->cur_state = prev_st;
475	break;
476	}
477	}
478	pstr->valid_len = byte_idx;
479	pstr->valid_raw_len = src_idx;
480	return REG_NOERROR;
481	}
482
483	/ Skip characters until the index becomes greater than NEW_RAW_IDX.*
484	Return the index. /*
485
486	static int
487	re_string_skip_chars (re_string_t pstr, int* new_raw_idx, wint_t *last_wc)
488	{
489	mbstate_t prev_st;
490	int rawbuf_idx;
491	size_t mbclen;
492	wint_t wc = WEOF;
493
494	/ Skip the characters which are not necessary to check. /
495	for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
496	rawbuf_idx < new_raw_idx;)
497	{
498	wchar_t wc2;
499	int remain_len = pstr->raw_len - rawbuf_idx;
500	prev_st = pstr->cur_state;
501	mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
502	remain_len, &pstr->cur_state);
503	if (BE ((ssize_t) mbclen <= `0`, `0`))
504	{
505	/ We treat these cases as a single byte character. /
506	if (mbclen == `0` \|\| remain_len == `0`)
507	wc = L`'\0'`;
508	else
509	wc = (unsigned* char *) (pstr->raw_mbs + rawbuf_idx);
510	mbclen = `1`;
511	pstr->cur_state = prev_st;
512	}
513	else
514	wc = (wint_t) wc2;
515	/ Then proceed the next character. /
516	rawbuf_idx += mbclen;
517	}
518	*last_wc = wc;
519	return rawbuf_idx;
520	}
521	#endif /* RE_ENABLE_I18N */
522
523	/ Build the buffer PSTR->MBS, and apply the translation if we need.*
524	This function is used in case of REG_ICASE. /*
525
526	static void
527	build_upper_buffer (re_string_t *pstr)
528	{
529	int char_idx, end_idx;
530	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
531
532	for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
533	{
534	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
535	if (BE (pstr->trans != NULL, `0`))
536	ch = pstr->trans[ch];
537	if (islower (ch))
538	pstr->mbs[char_idx] = toupper (ch);
539	else
540	pstr->mbs[char_idx] = ch;
541	}
542	pstr->valid_len = char_idx;
543	pstr->valid_raw_len = char_idx;
544	}
545
546	/ Apply TRANS to the buffer in PSTR. /
547
548	static void
549	re_string_translate_buffer (re_string_t *pstr)
550	{
551	int buf_idx, end_idx;
552	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
553
554	for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
555	{
556	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
557	pstr->mbs[buf_idx] = pstr->trans[ch];
558	}
559
560	pstr->valid_len = buf_idx;
561	pstr->valid_raw_len = buf_idx;
562	}
563
564	/ This function re-construct the buffers.*
565	Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
566	convert to upper case in case of REG_ICASE, apply translation. /*
567
568	static reg_errcode_t
569	__attribute_warn_unused_result__
570	re_string_reconstruct (re_string_t pstr, int* idx, int eflags)
571	{
572	int offset = idx - pstr->raw_mbs_idx;
573	if (BE (offset < `0`, `0`))
574	{
575	/ Reset buffer. /
576	#ifdef RE_ENABLE_I18N
577	if (pstr->mb_cur_max > `1`)
578	memset (&pstr->cur_state, `'\0'`, sizeof (mbstate_t));
579	#endif /* RE_ENABLE_I18N */
580	pstr->len = pstr->raw_len;
581	pstr->stop = pstr->raw_stop;
582	pstr->valid_len = `0`;
583	pstr->raw_mbs_idx = `0`;
584	pstr->valid_raw_len = `0`;
585	pstr->offsets_needed = `0`;
586	pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
587	: CONTEXT_NEWLINE \| CONTEXT_BEGBUF);
588	if (!pstr->mbs_allocated)
589	pstr->mbs = (unsigned char *) pstr->raw_mbs;
590	offset = idx;
591	}
592
593	if (BE (offset != `0`, `1`))
594	{
595	/ Should the already checked characters be kept? /
596	if (BE (offset < pstr->valid_raw_len, `1`))
597	{
598	/ Yes, move them to the front of the buffer. /
599	#ifdef RE_ENABLE_I18N
600	if (BE (pstr->offsets_needed, `0`))
601	{
602	int low = `0`, high = pstr->valid_len, mid;
603	do
604	{
605	mid = (high + low) / `2`;
606	if (pstr->offsets[mid] > offset)
607	high = mid;
608	else if (pstr->offsets[mid] < offset)
609	low = mid + `1`;
610	else
611	break;
612	}
613	while (low < high);
614	if (pstr->offsets[mid] < offset)
615	++mid;
616	pstr->tip_context = re_string_context_at (pstr, mid - `1`,
617	eflags);
618	/ This can be quite complicated, so handle specially*
619	only the common and easy case where the character with
620	different length representation of lower and upper
621	case is present at or after offset. /*
622	if (pstr->valid_len > offset
623	&& mid == offset && pstr->offsets[mid] == offset)
624	{
625	memmove (pstr->wcs, pstr->wcs + offset,
626	(pstr->valid_len - offset) * sizeof (wint_t));
627	memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
628	pstr->valid_len -= offset;
629	pstr->valid_raw_len -= offset;
630	for (low = `0`; low < pstr->valid_len; low++)
631	pstr->offsets[low] = pstr->offsets[low + offset] - offset;
632	}
633	else
634	{
635	/ Otherwise, just find out how long the partial multibyte*
636	character at offset is and fill it with WEOF/255. /*
637	pstr->len = pstr->raw_len - idx + offset;
638	pstr->stop = pstr->raw_stop - idx + offset;
639	pstr->offsets_needed = `0`;
640	while (mid > `0` && pstr->offsets[mid - `1`] == offset)
641	--mid;
642	while (mid < pstr->valid_len)
643	if (pstr->wcs[mid] != WEOF)
644	break;
645	else
646	++mid;
647	if (mid == pstr->valid_len)
648	pstr->valid_len = `0`;
649	else
650	{
651	pstr->valid_len = pstr->offsets[mid] - offset;
652	if (pstr->valid_len)
653	{
654	for (low = `0`; low < pstr->valid_len; ++low)
655	pstr->wcs[low] = WEOF;
656	memset (pstr->mbs, `255`, pstr->valid_len);
657	}
658	}
659	pstr->valid_raw_len = pstr->valid_len;
660	}
661	}
662	else
663	#endif
664	{
665	pstr->tip_context = re_string_context_at (pstr, offset - `1`,
666	eflags);
667	#ifdef RE_ENABLE_I18N
668	if (pstr->mb_cur_max > `1`)
669	memmove (pstr->wcs, pstr->wcs + offset,
670	(pstr->valid_len - offset) * sizeof (wint_t));
671	#endif /* RE_ENABLE_I18N */
672	if (BE (pstr->mbs_allocated, `0`))
673	memmove (pstr->mbs, pstr->mbs + offset,
674	pstr->valid_len - offset);
675	pstr->valid_len -= offset;
676	pstr->valid_raw_len -= offset;
677	#if defined DEBUG && DEBUG
678	assert (pstr->valid_len > `0`);
679	#endif
680	}
681	}
682	else
683	{
684	#ifdef RE_ENABLE_I18N
685	/ No, skip all characters until IDX. /
686	int prev_valid_len = pstr->valid_len;
687
688	if (BE (pstr->offsets_needed, `0`))
689	{
690	pstr->len = pstr->raw_len - idx + offset;
691	pstr->stop = pstr->raw_stop - idx + offset;
692	pstr->offsets_needed = `0`;
693	}
694	#endif
695	pstr->valid_len = `0`;
696	#ifdef RE_ENABLE_I18N
697	if (pstr->mb_cur_max > `1`)
698	{
699	int wcs_idx;
700	wint_t wc = WEOF;
701
702	if (pstr->is_utf8)
703	{
704	const unsigned char raw, p, *end;
705
706	/ Special case UTF-8. Multi-byte chars start with any*
707	byte other than 0x80 - 0xbf. /*
708	raw = pstr->raw_mbs + pstr->raw_mbs_idx;
709	end = raw + (offset - pstr->mb_cur_max);
710	if (end < pstr->raw_mbs)
711	end = pstr->raw_mbs;
712	p = raw + offset - `1`;
713	#ifdef _LIBC
714	/ We know the wchar_t encoding is UCS4, so for the simple*
715	case, ASCII characters, skip the conversion step. /*
716	if (isascii (*p) && BE (pstr->trans == NULL, `1`))
717	{
718	memset (&pstr->cur_state, `'\0'`, sizeof (mbstate_t));
719	/ pstr->valid_len = 0; /
720	wc = (wchar_t) *p;
721	}
722	else
723	#endif
724	for (; p >= end; --p)
725	if ((*p & `0xc0`) != `0x80`)
726	{
727	mbstate_t cur_state;
728	wchar_t wc2;
729	int mlen = raw + pstr->len - p;
730	unsigned char buf[`6`];
731	size_t mbclen;
732
733	const unsigned char *pp = p;
734	if (BE (pstr->trans != NULL, `0`))
735	{
736	int i = mlen < `6` ? mlen : `6`;
737	while (--i >= `0`)
738	buf[i] = pstr->trans[p[i]];
739	pp = buf;
740	}
741	/ XXX Don't use mbrtowc, we know which conversion*
742	to use (UTF-8 -> UCS4). /*
743	memset (&cur_state, `0`, sizeof (cur_state));
744	mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
745	&cur_state);
746	if (raw + offset - p <= mbclen
747	&& mbclen < (size_t) -`2`)
748	{
749	memset (&pstr->cur_state, `'\0'`,
750	sizeof (mbstate_t));
751	pstr->valid_len = mbclen - (raw + offset - p);
752	wc = wc2;
753	}
754	break;
755	}
756	}
757
758	if (wc == WEOF)
759	pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
760	if (wc == WEOF)
761	pstr->tip_context
762	= re_string_context_at (pstr, prev_valid_len - `1`, eflags);
763	else
764	pstr->tip_context = ((BE (pstr->word_ops_used != `0`, `0`)
765	&& IS_WIDE_WORD_CHAR (wc))
766	? CONTEXT_WORD
767	: ((IS_WIDE_NEWLINE (wc)
768	&& pstr->newline_anchor)
769	? CONTEXT_NEWLINE : `0`));
770	if (BE (pstr->valid_len, `0`))
771	{
772	for (wcs_idx = `0`; wcs_idx < pstr->valid_len; ++wcs_idx)
773	pstr->wcs[wcs_idx] = WEOF;
774	if (pstr->mbs_allocated)
775	memset (pstr->mbs, `255`, pstr->valid_len);
776	}
777	pstr->valid_raw_len = pstr->valid_len;
778	}
779	else
780	#endif /* RE_ENABLE_I18N */
781	{
782	int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - `1`];
783	pstr->valid_raw_len = `0`;
784	if (pstr->trans)
785	c = pstr->trans[c];
786	pstr->tip_context = (bitset_contain (pstr->word_char, c)
787	? CONTEXT_WORD
788	: ((IS_NEWLINE (c) && pstr->newline_anchor)
789	? CONTEXT_NEWLINE : `0`));
790	}
791	}
792	if (!BE (pstr->mbs_allocated, `0`))
793	pstr->mbs += offset;
794	}
795	pstr->raw_mbs_idx = idx;
796	pstr->len -= offset;
797	pstr->stop -= offset;
798
799	/ Then build the buffers. /
800	#ifdef RE_ENABLE_I18N
801	if (pstr->mb_cur_max > `1`)
802	{
803	if (pstr->icase)
804	{
805	reg_errcode_t ret = build_wcs_upper_buffer (pstr);
806	if (BE (ret != REG_NOERROR, `0`))
807	return ret;
808	}
809	else
810	build_wcs_buffer (pstr);
811	}
812	else
813	#endif /* RE_ENABLE_I18N */
814	if (BE (pstr->mbs_allocated, `0`))
815	{
816	if (pstr->icase)
817	build_upper_buffer (pstr);
818	else if (pstr->trans != NULL)
819	re_string_translate_buffer (pstr);
820	}
821	else
822	pstr->valid_len = pstr->len;
823
824	pstr->cur_idx = `0`;
825	return REG_NOERROR;
826	}
827
828	static unsigned char
829	__attribute ((pure))
830	re_string_peek_byte_case (const re_string_t pstr, int* idx)
831	{
832	int ch, off;
833
834	/ Handle the common (easiest) cases first. /
835	if (BE (!pstr->mbs_allocated, `1`))
836	return re_string_peek_byte (pstr, idx);
837
838	#ifdef RE_ENABLE_I18N
839	if (pstr->mb_cur_max > `1`
840	&& ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
841	return re_string_peek_byte (pstr, idx);
842	#endif
843
844	off = pstr->cur_idx + idx;
845	#ifdef RE_ENABLE_I18N
846	if (pstr->offsets_needed)
847	off = pstr->offsets[off];
848	#endif
849
850	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
851
852	#ifdef RE_ENABLE_I18N
853	/ Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I*
854	this function returns CAPITAL LETTER I instead of first byte of
855	DOTLESS SMALL LETTER I. The latter would confuse the parser,
856	since peek_byte_case doesn't advance cur_idx in any way. /*
857	if (pstr->offsets_needed && !isascii (ch))
858	return re_string_peek_byte (pstr, idx);
859	#endif
860
861	return ch;
862	}
863
864	static unsigned char
865	re_string_fetch_byte_case (re_string_t *pstr)
866	{
867	if (BE (!pstr->mbs_allocated, `1`))
868	return re_string_fetch_byte (pstr);
869
870	#ifdef RE_ENABLE_I18N
871	if (pstr->offsets_needed)
872	{
873	int off, ch;
874
875	/ For tr_TR.UTF-8 [[:islower:]] there is*
876	[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
877	in that case the whole multi-byte character and return
878	the original letter. On the other side, with
879	[[: DOTLESS SMALL LETTER I return [[:I, as doing
880	anything else would complicate things too much. /*
881
882	if (!re_string_first_byte (pstr, pstr->cur_idx))
883	return re_string_fetch_byte (pstr);
884
885	off = pstr->offsets[pstr->cur_idx];
886	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
887
888	if (! isascii (ch))
889	return re_string_fetch_byte (pstr);
890
891	re_string_skip_bytes (pstr,
892	re_string_char_size_at (pstr, pstr->cur_idx));
893	return ch;
894	}
895	#endif
896
897	return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
898	}
899
900	static void
901	re_string_destruct (re_string_t *pstr)
902	{
903	#ifdef RE_ENABLE_I18N
904	re_free (pstr->wcs);
905	re_free (pstr->offsets);
906	#endif /* RE_ENABLE_I18N */
907	if (pstr->mbs_allocated)
908	re_free (pstr->mbs);
909	}
910
911	/ Return the context at IDX in INPUT. /
912
913	static unsigned int
914	re_string_context_at (const re_string_t input, int* idx, int eflags)
915	{
916	int c;
917	if (BE (idx < `0`, `0`))
918	/ In this case, we use the value stored in input->tip_context,*
919	since we can't know the character in input->mbs[-1] here. /*
920	return input->tip_context;
921	if (BE (idx == input->len, `0`))
922	return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
923	: CONTEXT_NEWLINE \| CONTEXT_ENDBUF);
924	#ifdef RE_ENABLE_I18N
925	if (input->mb_cur_max > `1`)
926	{
927	wint_t wc;
928	int wc_idx = idx;
929	while(input->wcs[wc_idx] == WEOF)
930	{
931	#if defined DEBUG && DEBUG
932	/ It must not happen. /
933	assert (wc_idx >= `0`);
934	#endif
935	--wc_idx;
936	if (wc_idx < `0`)
937	return input->tip_context;
938	}
939	wc = input->wcs[wc_idx];
940	if (BE (input->word_ops_used != `0`, `0`) && IS_WIDE_WORD_CHAR (wc))
941	return CONTEXT_WORD;
942	return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
943	? CONTEXT_NEWLINE : `0`);
944	}
945	else
946	#endif
947	{
948	c = re_string_byte_at (input, idx);
949	if (bitset_contain (input->word_char, c))
950	return CONTEXT_WORD;
951	return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : `0`;
952	}
953	}
954
955	/ Functions for set operation. /
956
957	static reg_errcode_t
958	__attribute_warn_unused_result__
959	re_node_set_alloc (re_node_set set, int* size)
960	{
961	set->alloc = size;
962	set->nelem = `0`;
963	set->elems = re_malloc (int, size);
964	if (BE (set->elems == NULL, `0`))
965	return REG_ESPACE;
966	return REG_NOERROR;
967	}
968
969	static reg_errcode_t
970	__attribute_warn_unused_result__
971	re_node_set_init_1 (re_node_set set, int* elem)
972	{
973	set->alloc = `1`;
974	set->nelem = `1`;
975	set->elems = re_malloc (int, `1`);
976	if (BE (set->elems == NULL, `0`))
977	{
978	set->alloc = set->nelem = `0`;
979	return REG_ESPACE;
980	}
981	set->elems[`0`] = elem;
982	return REG_NOERROR;
983	}
984
985	static reg_errcode_t
986	__attribute_warn_unused_result__
987	re_node_set_init_2 (re_node_set set, int* elem1, int elem2)
988	{
989	set->alloc = `2`;
990	set->elems = re_malloc (int, `2`);
991	if (BE (set->elems == NULL, `0`))
992	return REG_ESPACE;
993	if (elem1 == elem2)
994	{
995	set->nelem = `1`;
996	set->elems[`0`] = elem1;
997	}
998	else
999	{
1000	set->nelem = `2`;
1001	if (elem1 < elem2)
1002	{
1003	set->elems[`0`] = elem1;
1004	set->elems[`1`] = elem2;
1005	}
1006	else
1007	{
1008	set->elems[`0`] = elem2;
1009	set->elems[`1`] = elem1;
1010	}
1011	}
1012	return REG_NOERROR;
1013	}
1014
1015	static reg_errcode_t
1016	__attribute_warn_unused_result__
1017	re_node_set_init_copy (re_node_set dest, const* re_node_set *src)
1018	{
1019	dest->nelem = src->nelem;
1020	if (src->nelem > `0`)
1021	{
1022	dest->alloc = dest->nelem;
1023	dest->elems = re_malloc (int, dest->alloc);
1024	if (BE (dest->elems == NULL, `0`))
1025	{
1026	dest->alloc = dest->nelem = `0`;
1027	return REG_ESPACE;
1028	}
1029	memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1030	}
1031	else
1032	re_node_set_init_empty (dest);
1033	return REG_NOERROR;
1034	}
1035
1036	/ Calculate the intersection of the sets SRC1 and SRC2. And merge it to*
1037	DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1038	Note: We assume dest->elems is NULL, when dest->alloc is 0. /*
1039
1040	static reg_errcode_t
1041	__attribute_warn_unused_result__
1042	re_node_set_add_intersect (re_node_set dest, const* re_node_set *src1,
1043	const re_node_set *src2)
1044	{
1045	int i1, i2, is, id, delta, sbase;
1046	if (src1->nelem == `0` \|\| src2->nelem == `0`)
1047	return REG_NOERROR;
1048
1049	/ We need dest->nelem + 2 * elems_in_intersection; this is a*
1050	conservative estimate. /*
1051	if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1052	{
1053	int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1054	int new_elems = re_realloc (dest->elems, int*, new_alloc);
1055	if (BE (new_elems == NULL, `0`))
1056	return REG_ESPACE;
1057	dest->elems = new_elems;
1058	dest->alloc = new_alloc;
1059	}
1060
1061	/ Find the items in the intersection of SRC1 and SRC2, and copy*
1062	into the top of DEST those that are not already in DEST itself. /*
1063	sbase = dest->nelem + src1->nelem + src2->nelem;
1064	i1 = src1->nelem - `1`;
1065	i2 = src2->nelem - `1`;
1066	id = dest->nelem - `1`;
1067	for (;;)
1068	{
1069	if (src1->elems[i1] == src2->elems[i2])
1070	{
1071	/ Try to find the item in DEST. Maybe we could binary search? /
1072	while (id >= `0` && dest->elems[id] > src1->elems[i1])
1073	--id;
1074
1075	if (id < `0` \|\| dest->elems[id] != src1->elems[i1])
1076	dest->elems[--sbase] = src1->elems[i1];
1077
1078	if (--i1 < `0` \|\| --i2 < `0`)
1079	break;
1080	}
1081
1082	/ Lower the highest of the two items. /
1083	else if (src1->elems[i1] < src2->elems[i2])
1084	{
1085	if (--i2 < `0`)
1086	break;
1087	}
1088	else
1089	{
1090	if (--i1 < `0`)
1091	break;
1092	}
1093	}
1094
1095	id = dest->nelem - `1`;
1096	is = dest->nelem + src1->nelem + src2->nelem - `1`;
1097	delta = is - sbase + `1`;
1098
1099	/ Now copy. When DELTA becomes zero, the remaining*
1100	DEST elements are already in place; this is more or
1101	less the same loop that is in re_node_set_merge. /*
1102	dest->nelem += delta;
1103	if (delta > `0` && id >= `0`)
1104	for (;;)
1105	{
1106	if (dest->elems[is] > dest->elems[id])
1107	{
1108	/ Copy from the top. /
1109	dest->elems[id + delta--] = dest->elems[is--];
1110	if (delta == `0`)
1111	break;
1112	}
1113	else
1114	{
1115	/ Slide from the bottom. /
1116	dest->elems[id + delta] = dest->elems[id];
1117	if (--id < `0`)
1118	break;
1119	}
1120	}
1121
1122	/ Copy remaining SRC elements. /
1123	memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1124
1125	return REG_NOERROR;
1126	}
1127
1128	/ Calculate the union set of the sets SRC1 and SRC2. And store it to*
1129	DEST. Return value indicate the error code or REG_NOERROR if succeeded. /*
1130
1131	static reg_errcode_t
1132	__attribute_warn_unused_result__
1133	re_node_set_init_union (re_node_set dest, const* re_node_set *src1,
1134	const re_node_set *src2)
1135	{
1136	int i1, i2, id;
1137	if (src1 != NULL && src1->nelem > `0` && src2 != NULL && src2->nelem > `0`)
1138	{
1139	dest->alloc = src1->nelem + src2->nelem;
1140	dest->elems = re_malloc (int, dest->alloc);
1141	if (BE (dest->elems == NULL, `0`))
1142	return REG_ESPACE;
1143	}
1144	else
1145	{
1146	if (src1 != NULL && src1->nelem > `0`)
1147	return re_node_set_init_copy (dest, src1);
1148	else if (src2 != NULL && src2->nelem > `0`)
1149	return re_node_set_init_copy (dest, src2);
1150	else
1151	re_node_set_init_empty (dest);
1152	return REG_NOERROR;
1153	}
1154	for (i1 = i2 = id = `0` ; i1 < src1->nelem && i2 < src2->nelem ;)
1155	{
1156	if (src1->elems[i1] > src2->elems[i2])
1157	{
1158	dest->elems[id++] = src2->elems[i2++];
1159	continue;
1160	}
1161	if (src1->elems[i1] == src2->elems[i2])
1162	++i2;
1163	dest->elems[id++] = src1->elems[i1++];
1164	}
1165	if (i1 < src1->nelem)
1166	{
1167	memcpy (dest->elems + id, src1->elems + i1,
1168	(src1->nelem - i1) * sizeof (int));
1169	id += src1->nelem - i1;
1170	}
1171	else if (i2 < src2->nelem)
1172	{
1173	memcpy (dest->elems + id, src2->elems + i2,
1174	(src2->nelem - i2) * sizeof (int));
1175	id += src2->nelem - i2;
1176	}
1177	dest->nelem = id;
1178	return REG_NOERROR;
1179	}
1180
1181	/ Calculate the union set of the sets DEST and SRC. And store it to*
1182	DEST. Return value indicate the error code or REG_NOERROR if succeeded. /*
1183
1184	static reg_errcode_t
1185	__attribute_warn_unused_result__
1186	re_node_set_merge (re_node_set dest, const* re_node_set *src)
1187	{
1188	int is, id, sbase, delta;
1189	if (src == NULL \|\| src->nelem == `0`)
1190	return REG_NOERROR;
1191	if (dest->alloc < `2` * src->nelem + dest->nelem)
1192	{
1193	int new_alloc = `2` * (src->nelem + dest->alloc);
1194	int new_buffer = re_realloc (dest->elems, int*, new_alloc);
1195	if (BE (new_buffer == NULL, `0`))
1196	return REG_ESPACE;
1197	dest->elems = new_buffer;
1198	dest->alloc = new_alloc;
1199	}
1200
1201	if (BE (dest->nelem == `0`, `0`))
1202	{
1203	dest->nelem = src->nelem;
1204	memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1205	return REG_NOERROR;
1206	}
1207
1208	/ Copy into the top of DEST the items of SRC that are not*
1209	found in DEST. Maybe we could binary search in DEST? /*
1210	for (sbase = dest->nelem + `2` * src->nelem,
1211	is = src->nelem - `1`, id = dest->nelem - `1`; is >= `0` && id >= `0`; )
1212	{
1213	if (dest->elems[id] == src->elems[is])
1214	is--, id--;
1215	else if (dest->elems[id] < src->elems[is])
1216	dest->elems[--sbase] = src->elems[is--];
1217	else / if (dest->elems[id] > src->elems[is]) /
1218	--id;
1219	}
1220
1221	if (is >= `0`)
1222	{
1223	/ If DEST is exhausted, the remaining items of SRC must be unique. /
1224	sbase -= is + `1`;
1225	memcpy (dest->elems + sbase, src->elems, (is + `1`) * sizeof (int));
1226	}
1227
1228	id = dest->nelem - `1`;
1229	is = dest->nelem + `2` * src->nelem - `1`;
1230	delta = is - sbase + `1`;
1231	if (delta == `0`)
1232	return REG_NOERROR;
1233
1234	/ Now copy. When DELTA becomes zero, the remaining*
1235	DEST elements are already in place. /*
1236	dest->nelem += delta;
1237	for (;;)
1238	{
1239	if (dest->elems[is] > dest->elems[id])
1240	{
1241	/ Copy from the top. /
1242	dest->elems[id + delta--] = dest->elems[is--];
1243	if (delta == `0`)
1244	break;
1245	}
1246	else
1247	{
1248	/ Slide from the bottom. /
1249	dest->elems[id + delta] = dest->elems[id];
1250	if (--id < `0`)
1251	{
1252	/ Copy remaining SRC elements. /
1253	memcpy (dest->elems, dest->elems + sbase,
1254	delta * sizeof (int));
1255	break;
1256	}
1257	}
1258	}
1259
1260	return REG_NOERROR;
1261	}
1262
1263	/ Insert the new element ELEM to the re_node_set* SET.*
1264	SET should not already have ELEM.
1265	return -1 if an error is occured, return 1 otherwise. /*
1266
1267	static int
1268	__attribute_warn_unused_result__
1269	re_node_set_insert (re_node_set set, int* elem)
1270	{
1271	int idx;
1272	/ In case the set is empty. /
1273	if (set->alloc == `0`)
1274	{
1275	if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, `1`))
1276	return `1`;
1277	else
1278	return -`1`;
1279	}
1280
1281	if (BE (set->nelem, `0`) == `0`)
1282	{
1283	/ We already guaranteed above that set->alloc != 0. /
1284	set->elems[`0`] = elem;
1285	++set->nelem;
1286	return `1`;
1287	}
1288
1289	/ Realloc if we need. /
1290	if (set->alloc == set->nelem)
1291	{
1292	int *new_elems;
1293	set->alloc = set->alloc * `2`;
1294	new_elems = re_realloc (set->elems, int, set->alloc);
1295	if (BE (new_elems == NULL, `0`))
1296	return -`1`;
1297	set->elems = new_elems;
1298	}
1299
1300	/ Move the elements which follows the new element. Test the*
1301	first element separately to skip a check in the inner loop. /*
1302	if (elem < set->elems[`0`])
1303	{
1304	idx = `0`;
1305	for (idx = set->nelem; idx > `0`; idx--)
1306	set->elems[idx] = set->elems[idx - `1`];
1307	}
1308	else
1309	{
1310	for (idx = set->nelem; set->elems[idx - `1`] > elem; idx--)
1311	set->elems[idx] = set->elems[idx - `1`];
1312	}
1313
1314	/ Insert the new element. /
1315	set->elems[idx] = elem;
1316	++set->nelem;
1317	return `1`;
1318	}
1319
1320	/ Insert the new element ELEM to the re_node_set* SET.*
1321	SET should not already have any element greater than or equal to ELEM.
1322	Return -1 if an error is occured, return 1 otherwise. /*
1323
1324	static int
1325	__attribute_warn_unused_result__
1326	re_node_set_insert_last (re_node_set set, int* elem)
1327	{
1328	/ Realloc if we need. /
1329	if (set->alloc == set->nelem)
1330	{
1331	int *new_elems;
1332	set->alloc = (set->alloc + `1`) * `2`;
1333	new_elems = re_realloc (set->elems, int, set->alloc);
1334	if (BE (new_elems == NULL, `0`))
1335	return -`1`;
1336	set->elems = new_elems;
1337	}
1338
1339	/ Insert the new element. /
1340	set->elems[set->nelem++] = elem;
1341	return `1`;
1342	}
1343
1344	/ Compare two node sets SET1 and SET2.*
1345	return 1 if SET1 and SET2 are equivalent, return 0 otherwise. /*
1346
1347	static int
1348	__attribute ((pure))
1349	re_node_set_compare (const re_node_set set1, const* re_node_set *set2)
1350	{
1351	int i;
1352	if (set1 == NULL \|\| set2 == NULL \|\| set1->nelem != set2->nelem)
1353	return `0`;
1354	for (i = set1->nelem ; --i >= `0` ; )
1355	if (set1->elems[i] != set2->elems[i])
1356	return `0`;
1357	return `1`;
1358	}
1359
1360	/ Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. /
1361
1362	static int
1363	__attribute ((pure))
1364	re_node_set_contains (const re_node_set set, int* elem)
1365	{
1366	unsigned int idx, right, mid;
1367	if (set->nelem <= `0`)
1368	return `0`;
1369
1370	/ Binary search the element. /
1371	idx = `0`;
1372	right = set->nelem - `1`;
1373	while (idx < right)
1374	{
1375	mid = (idx + right) / `2`;
1376	if (set->elems[mid] < elem)
1377	idx = mid + `1`;
1378	else
1379	right = mid;
1380	}
1381	return set->elems[idx] == elem ? idx + `1` : `0`;
1382	}
1383
1384	static void
1385	re_node_set_remove_at (re_node_set set, int* idx)
1386	{
1387	if (idx < `0` \|\| idx >= set->nelem)
1388	return;
1389	--set->nelem;
1390	for (; idx < set->nelem; idx++)
1391	set->elems[idx] = set->elems[idx + `1`];
1392	}
1393
1394
1395	/ Add the token TOKEN to dfa->nodes, and return the index of the token.*
1396	Or return -1, if an error will be occured. /*
1397
1398	static int
1399	re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1400	{
1401	int type = token.type;
1402	if (BE (dfa->nodes_len >= dfa->nodes_alloc, `0`))
1403	{
1404	size_t new_nodes_alloc = dfa->nodes_alloc * `2`;
1405	int new_nexts, new_indices;
1406	re_node_set new_edests, new_eclosures;
1407	re_token_t *new_nodes;
1408
1409	/ Avoid overflows in realloc. /
1410	const size_t max_object_size = MAX (sizeof (re_token_t),
1411	MAX (sizeof (re_node_set),
1412	sizeof (int)));
1413	if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, `0`))
1414	return -`1`;
1415
1416	new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1417	if (BE (new_nodes == NULL, `0`))
1418	return -`1`;
1419	dfa->nodes = new_nodes;
1420	new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1421	new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1422	new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1423	new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1424	if (BE (new_nexts == NULL \|\| new_indices == NULL
1425	\|\| new_edests == NULL \|\| new_eclosures == NULL, `0`))
1426	return -`1`;
1427	dfa->nexts = new_nexts;
1428	dfa->org_indices = new_indices;
1429	dfa->edests = new_edests;
1430	dfa->eclosures = new_eclosures;
1431	dfa->nodes_alloc = new_nodes_alloc;
1432	}
1433	dfa->nodes[dfa->nodes_len] = token;
1434	dfa->nodes[dfa->nodes_len].constraint = `0`;
1435	#ifdef RE_ENABLE_I18N
1436	dfa->nodes[dfa->nodes_len].accept_mb =
1437	(type == OP_PERIOD && dfa->mb_cur_max > `1`) \|\| type == COMPLEX_BRACKET;
1438	#endif
1439	dfa->nexts[dfa->nodes_len] = -`1`;
1440	re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1441	re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1442	return dfa->nodes_len++;
1443	}
1444
1445	static inline unsigned int
1446	calc_state_hash (const re_node_set nodes, unsigned* int context)
1447	{
1448	unsigned int hash = nodes->nelem + context;
1449	int i;
1450	for (i = `0` ; i < nodes->nelem ; i++)
1451	hash += nodes->elems[i];
1452	return hash;
1453	}
1454
1455	/ Search for the state whose node_set is equivalent to NODES.*
1456	Return the pointer to the state, if we found it in the DFA.
1457	Otherwise create the new one and return it. In case of an error
1458	return NULL and set the error code in ERR.
1459	Note: - We assume NULL as the invalid state, then it is possible that
1460	return value is NULL and ERR is REG_NOERROR.
1461	- We never return non-NULL value in case of any errors, it is for
1462	optimization. /*
1463
1464	static re_dfastate_t *
1465	__attribute_warn_unused_result__
1466	re_acquire_state (reg_errcode_t err, const* re_dfa_t *dfa,
1467	const re_node_set *nodes)
1468	{
1469	unsigned int hash;
1470	re_dfastate_t *new_state;
1471	struct re_state_table_entry *spot;
1472	int i;
1473	if (BE (nodes->nelem == `0`, `0`))
1474	{
1475	*err = REG_NOERROR;
1476	return NULL;
1477	}
1478	hash = calc_state_hash (nodes, `0`);
1479	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1480
1481	for (i = `0` ; i < spot->num ; i++)
1482	{
1483	re_dfastate_t *state = spot->array[i];
1484	if (hash != state->hash)
1485	continue;
1486	if (re_node_set_compare (&state->nodes, nodes))
1487	return state;
1488	}
1489
1490	/ There are no appropriate state in the dfa, create the new one. /
1491	new_state = create_ci_newstate (dfa, nodes, hash);
1492	if (BE (new_state == NULL, `0`))
1493	*err = REG_ESPACE;
1494
1495	return new_state;
1496	}
1497
1498	/ Search for the state whose node_set is equivalent to NODES and*
1499	whose context is equivalent to CONTEXT.
1500	Return the pointer to the state, if we found it in the DFA.
1501	Otherwise create the new one and return it. In case of an error
1502	return NULL and set the error code in ERR.
1503	Note: - We assume NULL as the invalid state, then it is possible that
1504	return value is NULL and ERR is REG_NOERROR.
1505	- We never return non-NULL value in case of any errors, it is for
1506	optimization. /*
1507
1508	static re_dfastate_t *
1509	__attribute_warn_unused_result__
1510	re_acquire_state_context (reg_errcode_t err, const* re_dfa_t *dfa,
1511	const re_node_set nodes, unsigned* int context)
1512	{
1513	unsigned int hash;
1514	re_dfastate_t *new_state;
1515	struct re_state_table_entry *spot;
1516	int i;
1517	if (nodes->nelem == `0`)
1518	{
1519	*err = REG_NOERROR;
1520	return NULL;
1521	}
1522	hash = calc_state_hash (nodes, context);
1523	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1524
1525	for (i = `0` ; i < spot->num ; i++)
1526	{
1527	re_dfastate_t *state = spot->array[i];
1528	if (state->hash == hash
1529	&& state->context == context
1530	&& re_node_set_compare (state->entrance_nodes, nodes))
1531	return state;
1532	}
1533	/ There are no appropriate state in `dfa', create the new one. /
1534	new_state = create_cd_newstate (dfa, nodes, context, hash);
1535	if (BE (new_state == NULL, `0`))
1536	*err = REG_ESPACE;
1537
1538	return new_state;
1539	}
1540
1541	/ Finish initialization of the new state NEWSTATE, and using its hash value*
1542	HASH put in the appropriate bucket of DFA's state table. Return value
1543	indicates the error code if failed. /*
1544
1545	static reg_errcode_t
1546	__attribute_warn_unused_result__
1547	register_state (const re_dfa_t dfa, re_dfastate_t newstate,
1548	unsigned int hash)
1549	{
1550	struct re_state_table_entry *spot;
1551	reg_errcode_t err;
1552	int i;
1553
1554	newstate->hash = hash;
1555	err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1556	if (BE (err != REG_NOERROR, `0`))
1557	return REG_ESPACE;
1558	for (i = `0`; i < newstate->nodes.nelem; i++)
1559	{
1560	int elem = newstate->nodes.elems[i];
1561	if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1562	if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < `0`)
1563	return REG_ESPACE;
1564	}
1565
1566	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1567	if (BE (spot->alloc <= spot->num, `0`))
1568	{
1569	int new_alloc = `2` * spot->num + `2`;
1570	re_dfastate_t *new_array = re_realloc (spot->array, re_dfastate_t ,
1571	new_alloc);
1572	if (BE (new_array == NULL, `0`))
1573	return REG_ESPACE;
1574	spot->array = new_array;
1575	spot->alloc = new_alloc;
1576	}
1577	spot->array[spot->num++] = newstate;
1578	return REG_NOERROR;
1579	}
1580
1581	static void
1582	free_state (re_dfastate_t *state)
1583	{
1584	re_node_set_free (&state->non_eps_nodes);
1585	re_node_set_free (&state->inveclosure);
1586	if (state->entrance_nodes != &state->nodes)
1587	{
1588	re_node_set_free (state->entrance_nodes);
1589	re_free (state->entrance_nodes);
1590	}
1591	re_node_set_free (&state->nodes);
1592	re_free (state->word_trtable);
1593	re_free (state->trtable);
1594	re_free (state);
1595	}
1596
1597	/ Create the new state which is independent of contexts.*
1598	Return the new state if succeeded, otherwise return NULL. /*
1599
1600	static re_dfastate_t *
1601	__attribute_warn_unused_result__
1602	create_ci_newstate (const re_dfa_t dfa, const* re_node_set *nodes,
1603	unsigned int hash)
1604	{
1605	int i;
1606	reg_errcode_t err;
1607	re_dfastate_t *newstate;
1608
1609	newstate = (re_dfastate_t ) calloc (sizeof* (re_dfastate_t), `1`);
1610	if (BE (newstate == NULL, `0`))
1611	return NULL;
1612	err = re_node_set_init_copy (&newstate->nodes, nodes);
1613	if (BE (err != REG_NOERROR, `0`))
1614	{
1615	re_free (newstate);
1616	return NULL;
1617	}
1618
1619	newstate->entrance_nodes = &newstate->nodes;
1620	for (i = `0` ; i < nodes->nelem ; i++)
1621	{
1622	re_token_t *node = dfa->nodes + nodes->elems[i];
1623	re_token_type_t type = node->type;
1624	if (type == CHARACTER && !node->constraint)
1625	continue;
1626	#ifdef RE_ENABLE_I18N
1627	newstate->accept_mb \|= node->accept_mb;
1628	#endif /* RE_ENABLE_I18N */
1629
1630	/ If the state has the halt node, the state is a halt state. /
1631	if (type == END_OF_RE)
1632	newstate->halt = `1`;
1633	else if (type == OP_BACK_REF)
1634	newstate->has_backref = `1`;
1635	else if (type == ANCHOR \|\| node->constraint)
1636	newstate->has_constraint = `1`;
1637	}
1638	err = register_state (dfa, newstate, hash);
1639	if (BE (err != REG_NOERROR, `0`))
1640	{
1641	free_state (newstate);
1642	newstate = NULL;
1643	}
1644	return newstate;
1645	}
1646
1647	/ Create the new state which is depend on the context CONTEXT.*
1648	Return the new state if succeeded, otherwise return NULL. /*
1649
1650	static re_dfastate_t *
1651	__attribute_warn_unused_result__
1652	create_cd_newstate (const re_dfa_t dfa, const* re_node_set *nodes,
1653	unsigned int context, unsigned int hash)
1654	{
1655	int i, nctx_nodes = `0`;
1656	reg_errcode_t err;
1657	re_dfastate_t *newstate;
1658
1659	newstate = (re_dfastate_t ) calloc (sizeof* (re_dfastate_t), `1`);
1660	if (BE (newstate == NULL, `0`))
1661	return NULL;
1662	err = re_node_set_init_copy (&newstate->nodes, nodes);
1663	if (BE (err != REG_NOERROR, `0`))
1664	{
1665	re_free (newstate);
1666	return NULL;
1667	}
1668
1669	newstate->context = context;
1670	newstate->entrance_nodes = &newstate->nodes;
1671
1672	for (i = `0` ; i < nodes->nelem ; i++)
1673	{
1674	re_token_t *node = dfa->nodes + nodes->elems[i];
1675	re_token_type_t type = node->type;
1676	unsigned int constraint = node->constraint;
1677
1678	if (type == CHARACTER && !constraint)
1679	continue;
1680	#ifdef RE_ENABLE_I18N
1681	newstate->accept_mb \|= node->accept_mb;
1682	#endif /* RE_ENABLE_I18N */
1683
1684	/ If the state has the halt node, the state is a halt state. /
1685	if (type == END_OF_RE)
1686	newstate->halt = `1`;
1687	else if (type == OP_BACK_REF)
1688	newstate->has_backref = `1`;
1689
1690	if (constraint)
1691	{
1692	if (newstate->entrance_nodes == &newstate->nodes)
1693	{
1694	newstate->entrance_nodes = re_malloc (re_node_set, `1`);
1695	if (BE (newstate->entrance_nodes == NULL, `0`))
1696	{
1697	free_state (newstate);
1698	return NULL;
1699	}
1700	if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1701	!= REG_NOERROR)
1702	return NULL;
1703	nctx_nodes = `0`;
1704	newstate->has_constraint = `1`;
1705	}
1706
1707	if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1708	{
1709	re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1710	++nctx_nodes;
1711	}
1712	}
1713	}
1714	err = register_state (dfa, newstate, hash);
1715	if (BE (err != REG_NOERROR, `0`))
1716	{
1717	free_state (newstate);
1718	newstate = NULL;
1719	}
1720	return newstate;
1721	}
1722

Browse the source code of glibc/posix/regex_internal.c