linereader.c source code [glibc/locale/programs/linereader.c]

/* Copyright (C) 1996-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published
   by the Free Software Foundation; version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
16
17	#ifdef HAVE_CONFIG_H
18	# include <config.h>
19	#endif
20
21	#include <assert.h>
22	#include <ctype.h>
23	#include <errno.h>
24	#include <libintl.h>
25	#include <stdarg.h>
26	#include <stdlib.h>
27	#include <string.h>
28	#include <stdint.h>
29
30	#include "localedef.h"
31	#include "charmap.h"
32	#include "error.h"
33	#include "linereader.h"
34	#include "locfile.h"
35
/* Forward declarations of the tokenizer helpers that are private to
   this file.  Each get_* helper returns a pointer to a token.  */
static struct token *get_toplvl_escape (struct linereader *lr);
static struct token *get_symname (struct linereader *lr);
static struct token *get_ident (struct linereader *lr);
static struct token *get_string (struct linereader *lr,
                                 const struct charmap_t *charmap,
                                 struct localedef_t *locale,
                                 const struct repertoire_t *repertoire,
                                 int verbose);
static bool utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch);
46
47
48	struct linereader *
49	lr_open (const char *fname, kw_hash_fct_t hf)
50	{
51	FILE *fp;
52
53	if (fname == NULL \|\| strcmp (fname, "-") == `0`
54	\|\| strcmp (fname, "/dev/stdin") == `0`)
55	return lr_create (stdin, "<stdin>", hf);
56	else
57	{
58	fp = fopen (fname, "rm");
59	if (fp == NULL)
60	return NULL;
61	return lr_create (fp, fname, hf);
62	}
63	}
64
65	struct linereader *
66	lr_create (FILE fp, const* char *fname, kw_hash_fct_t hf)
67	{
68	struct linereader *result;
69	int n;
70
71	result = (struct linereader ) xmalloc (sizeof* (*result));
72
73	result->fp = fp;
74	result->fname = xstrdup (fname);
75	result->buf = NULL;
76	result->bufsize = `0`;
77	result->lineno = `1`;
78	result->idx = `0`;
79	result->comment_char = `'#'`;
80	result->escape_char = `'\\'`;
81	result->translate_strings = `1`;
82	result->return_widestr = `0`;
83
84	n = getdelim (&result->buf, &result->bufsize, `'\n'`, result->fp);
85	if (n < `0`)
86	{
87	int save = errno;
88	fclose (result->fp);
89	free ((char *) result->fname);
90	free (result);
91	errno = save;
92	return NULL;
93	}
94
95	if (n > `1` && result->buf[n - `2`] == `'\\'` && result->buf[n - `1`] == `'\n'`)
96	n -= `2`;
97
98	result->buf[n] = `'\0'`;
99	result->bufact = n;
100	result->hash_fct = hf;
101
102	return result;
103	}
104
105
106	int
107	lr_eof (struct linereader *lr)
108	{
109	return lr->bufact = `0`;
110	}
111
112
113	void
114	lr_ignore_rest (struct linereader lr, int* verbose)
115	{
116	if (verbose)
117	{
118	while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != `'\n'`
119	&& lr->buf[lr->idx] != lr->comment_char)
120	if (lr->buf[lr->idx] == `'\0'`)
121	{
122	if (lr_next (lr) < `0`)
123	return;
124	}
125	else
126	++lr->idx;
127
128	if (lr->buf[lr->idx] != `'\n'` && ! feof (lr->fp)
129	&& lr->buf[lr->idx] != lr->comment_char)
130	lr_error (lr, _("trailing garbage at end of line"));
131	}
132
133	/ Ignore continued line. /
134	while (lr->bufact > `0` && lr->buf[lr->bufact - `1`] != `'\n'`)
135	if (lr_next (lr) < `0`)
136	break;
137
138	lr->idx = lr->bufact;
139	}
140
141
142	void
143	lr_close (struct linereader *lr)
144	{
145	fclose (lr->fp);
146	free (lr->buf);
147	free (lr);
148	}
149
150
151	int
152	lr_next (struct linereader *lr)
153	{
154	int n;
155
156	n = getdelim (&lr->buf, &lr->bufsize, `'\n'`, lr->fp);
157	if (n < `0`)
158	return -`1`;
159
160	++lr->lineno;
161
162	if (n > `1` && lr->buf[n - `2`] == lr->escape_char && lr->buf[n - `1`] == `'\n'`)
163	{
164	#if 0
165	/ XXX Is this correct? /
166	/ An escaped newline character is substituted with a single <SP>. /
167	--n;
168	lr->buf[n - `1`] = `' '`;
169	#else
170	n -= `2`;
171	#endif
172	}
173
174	lr->buf[n] = `'\0'`;
175	lr->bufact = n;
176	lr->idx = `0`;
177
178	return `0`;
179	}
180
181
/* The calling program should define program_name and set it to the
   name of the executing program.  */
extern char *program_name;

/* Defined in error.c.  This variable is incremented each time `error'
   is called.  */
extern unsigned int error_message_count;
189
190
191	struct token *
192	lr_token (struct linereader lr, const* struct charmap_t *charmap,
193	struct localedef_t locale, const* struct repertoire_t *repertoire,
194	int verbose)
195	{
196	int ch;
197
198	while (`1`)
199	{
200	do
201	{
202	ch = lr_getc (lr);
203
204	if (ch == EOF)
205	{
206	lr->token.tok = tok_eof;
207	return &lr->token;
208	};
209
210	if (ch == `'\n'`)
211	{
212	lr->token.tok = tok_eol;
213	return &lr->token;
214	}
215	}
216	while (isspace (ch));
217
218	if (ch != lr->comment_char)
219	break;
220
221	/ Is there an newline at the end of the buffer? /
222	if (lr->buf[lr->bufact - `1`] != `'\n'`)
223	{
224	/ No. Some people want this to mean that only the line in*
225	the file not the logical, concatenated line is ignored.
226	Let's try this. /*
227	lr->idx = lr->bufact;
228	continue;
229	}
230
231	/ Ignore rest of line. /
232	lr_ignore_rest (lr, `0`);
233	lr->token.tok = tok_eol;
234	return &lr->token;
235	}
236
237	/ Match escape sequences. /
238	if (ch == lr->escape_char)
239	return get_toplvl_escape (lr);
240
241	/ Match ellipsis. /
242	if (ch == `'.'`)
243	{
244	if (strncmp (&lr->buf[lr->idx], "...(2)....", `10`) == `0`)
245	{
246	int cnt;
247	for (cnt = `0`; cnt < `10`; ++cnt)
248	lr_getc (lr);
249	lr->token.tok = tok_ellipsis4_2;
250	return &lr->token;
251	}
252	if (strncmp (&lr->buf[lr->idx], "...", `3`) == `0`)
253	{
254	lr_getc (lr);
255	lr_getc (lr);
256	lr_getc (lr);
257	lr->token.tok = tok_ellipsis4;
258	return &lr->token;
259	}
260	if (strncmp (&lr->buf[lr->idx], "..", `2`) == `0`)
261	{
262	lr_getc (lr);
263	lr_getc (lr);
264	lr->token.tok = tok_ellipsis3;
265	return &lr->token;
266	}
267	if (strncmp (&lr->buf[lr->idx], ".(2)..", `6`) == `0`)
268	{
269	int cnt;
270	for (cnt = `0`; cnt < `6`; ++cnt)
271	lr_getc (lr);
272	lr->token.tok = tok_ellipsis2_2;
273	return &lr->token;
274	}
275	if (lr->buf[lr->idx] == `'.'`)
276	{
277	lr_getc (lr);
278	lr->token.tok = tok_ellipsis2;
279	return &lr->token;
280	}
281	}
282
283	switch (ch)
284	{
285	case `'<'`:
286	return get_symname (lr);
287
288	case `'0'` ... `'9'`:
289	lr->token.tok = tok_number;
290	lr->token.val.num = ch - `'0'`;
291
292	while (isdigit (ch = lr_getc (lr)))
293	{
294	lr->token.val.num *= `10`;
295	lr->token.val.num += ch - `'0'`;
296	}
297	if (isalpha (ch))
298	lr_error (lr, _("garbage at end of number"));
299	lr_ungetn (lr, `1`);
300
301	return &lr->token;
302
303	case `';'`:
304	lr->token.tok = tok_semicolon;
305	return &lr->token;
306
307	case `','`:
308	lr->token.tok = tok_comma;
309	return &lr->token;
310
311	case `'('`:
312	lr->token.tok = tok_open_brace;
313	return &lr->token;
314
315	case `')'`:
316	lr->token.tok = tok_close_brace;
317	return &lr->token;
318
319	case `'"'`:
320	return get_string (lr, charmap, locale, repertoire, verbose);
321
322	case `'-'`:
323	ch = lr_getc (lr);
324	if (ch == `'1'`)
325	{
326	lr->token.tok = tok_minus1;
327	return &lr->token;
328	}
329	lr_ungetn (lr, `2`);
330	break;
331
332	case `0x80` ... `0xff`: / UTF-8 sequence. /
333	{
334	uint32_t wch;
335	if (!utf8_decode (lr, ch, &wch))
336	{
337	lr->token.tok = tok_error;
338	return &lr->token;
339	}
340	lr->token.tok = tok_ucs4;
341	lr->token.val.ucs4 = wch;
342	return &lr->token;
343	}
344	}
345
346	return get_ident (lr);
347	}
348
349
350	static struct token *
351	get_toplvl_escape (struct linereader *lr)
352	{
353	/ This is supposed to be a numeric value. We return the*
354	numerical value and the number of bytes. /*
355	size_t start_idx = lr->idx - `1`;
356	unsigned char *bytes = lr->token.val.charcode.bytes;
357	size_t nbytes = `0`;
358	int ch;
359
360	do
361	{
362	unsigned int byte = `0`;
363	unsigned int base = `8`;
364
365	ch = lr_getc (lr);
366
367	if (ch == `'d'`)
368	{
369	base = `10`;
370	ch = lr_getc (lr);
371	}
372	else if (ch == `'x'`)
373	{
374	base = `16`;
375	ch = lr_getc (lr);
376	}
377
378	if ((base == `16` && !isxdigit (ch))
379	\|\| (base != `16` && (ch < `'0'` \|\| ch >= (int) (`'0'` + base))))
380	{
381	esc_error:
382	lr->token.val.str.startmb = &lr->buf[start_idx];
383
384	while (ch != EOF && !isspace (ch))
385	ch = lr_getc (lr);
386	lr->token.val.str.lenmb = lr->idx - start_idx;
387
388	lr->token.tok = tok_error;
389	return &lr->token;
390	}
391
392	if (isdigit (ch))
393	byte = ch - `'0'`;
394	else
395	byte = tolower (ch) - `'a'` + `10`;
396
397	ch = lr_getc (lr);
398	if ((base == `16` && !isxdigit (ch))
399	\|\| (base != `16` && (ch < `'0'` \|\| ch >= (int) (`'0'` + base))))
400	goto esc_error;
401
402	byte *= base;
403	if (isdigit (ch))
404	byte += ch - `'0'`;
405	else
406	byte += tolower (ch) - `'a'` + `10`;
407
408	ch = lr_getc (lr);
409	if (base != `16` && isdigit (ch))
410	{
411	byte *= base;
412	byte += ch - `'0'`;
413
414	ch = lr_getc (lr);
415	}
416
417	bytes[nbytes++] = byte;
418	}
419	while (ch == lr->escape_char
420	&& nbytes < (int) sizeof (lr->token.val.charcode.bytes));
421
422	if (!isspace (ch))
423	lr_error (lr, _("garbage at end of character code specification"));
424
425	lr_ungetn (lr, `1`);
426
427	lr->token.tok = tok_charcode;
428	lr->token.val.charcode.nbytes = nbytes;
429
430	return &lr->token;
431	}
432
/* Growable buffer used to collect the multibyte text of a token.  */
struct lr_buffer
{
  size_t act;                   /* Number of bytes currently stored.  */
  size_t max;                   /* Number of bytes allocated for BUF.  */
  char *buf;                    /* The storage itself.  */
};
440
441	/ Initialize LRB with a default-sized buffer. /*
442	static void
443	lr_buffer_init (struct lr_buffer *lrb)
444	{
445	lrb->act = `0`;
446	lrb->max = `56`;
447	lrb->buf = xmalloc (lrb->max);
448	}
449
450	/ Transfers the buffer string from LRB to LR->token.mbstr. /*
451	static void
452	lr_buffer_to_token (struct lr_buffer lrb, struct* linereader *lr)
453	{
454	lr->token.val.str.startmb = xrealloc (lrb->buf, lrb->act + `1`);
455	lr->token.val.str.startmb[lrb->act] = `'\0'`;
456	lr->token.val.str.lenmb = lrb->act;
457	}
458
459	/ Adds CH to LRB. /*
460	static void
461	addc (struct lr_buffer lrb, char* ch)
462	{
463	if (lrb->act == lrb->max)
464	{
465	lrb->max *= `2`;
466	lrb->buf = xrealloc (lrb->buf, lrb->max);
467	}
468	lrb->buf[lrb->act++] = ch;
469	}
470
471	/ Adds L bytes at S to LRB. /*
472	static void
473	adds (struct lr_buffer lrb, const* unsigned char *s, size_t l)
474	{
475	if (lrb->max - lrb->act < l)
476	{
477	size_t required_size = lrb->act + l;
478	size_t new_max = `2` * lrb->max;
479	if (new_max < required_size)
480	new_max = required_size;
481	lrb->buf = xrealloc (lrb->buf, new_max);
482	lrb->max = new_max;
483	}
484	memcpy (lrb->buf + lrb->act, s, l);
485	lrb->act += l;
486	}
487
/* Append the wide character CH to the array BUF2 of the enclosing
   scope.  BUF2ACT is the number of elements stored and BUF2MAX the
   number of elements allocated; the array is doubled when full.  The
   element size is taken from BUF2 itself instead of a hard-coded 4.  */
#define ADDWC(ch) \
  do                                                                          \
    {                                                                         \
      if (buf2act == buf2max)                                                 \
        {                                                                     \
          buf2max *= 2;                                                       \
          buf2 = xrealloc (buf2, buf2max * sizeof (*buf2));                   \
        }                                                                     \
      buf2[buf2act++] = (ch);                                                 \
    }                                                                         \
  while (0)
499
500
501	static struct token *
502	get_symname (struct linereader *lr)
503	{
504	/ Symbol in brackets. We must distinguish three kinds:*
505	1. reserved words
506	2. ISO 10646 position values
507	3. all other. /*
508	const struct keyword_t *kw;
509	int ch;
510	struct lr_buffer lrb;
511
512	lr_buffer_init (&lrb);
513
514	do
515	{
516	ch = lr_getc (lr);
517	if (ch == lr->escape_char)
518	{
519	int c2 = lr_getc (lr);
520	addc (&lrb, c2);
521
522	if (c2 == `'\n'`)
523	ch = `'\n'`;
524	}
525	else
526	addc (&lrb, ch);
527	}
528	while (ch != `'>'` && ch != `'\n'`);
529
530	if (ch == `'\n'`)
531	lr_error (lr, _("unterminated symbolic name"));
532
533	/ Test for ISO 10646 position value. /
534	if (lrb.buf[`0`] == `'U'` && (lrb.act == `6` \|\| lrb.act == `10`))
535	{
536	char *cp = lrb.buf + `1`;
537	while (cp < &lrb.buf[lrb.act - `1`] && isxdigit (*cp))
538	++cp;
539
540	if (cp == &lrb.buf[lrb.act - `1`])
541	{
542	/ Yes, it is. /
543	lr->token.tok = tok_ucs4;
544	lr->token.val.ucs4 = strtoul (lrb.buf + `1`, NULL, `16`);
545
546	return &lr->token;
547	}
548	}
549
550	/ It is a symbolic name. Test for reserved words. /
551	kw = lr->hash_fct (lrb.buf, lrb.act - `1`);
552
553	if (kw != NULL && kw->symname_or_ident == `1`)
554	{
555	lr->token.tok = kw->token;
556	free (lrb.buf);
557	}
558	else
559	{
560	lr->token.tok = tok_bsymbol;
561	lr_buffer_to_token (&lrb, lr);
562	--lr->token.val.str.lenmb; / Hide the training '>'. /
563	}
564
565	return &lr->token;
566	}
567
568
569	static struct token *
570	get_ident (struct linereader *lr)
571	{
572	const struct keyword_t *kw;
573	int ch;
574	struct lr_buffer lrb;
575
576	lr_buffer_init (&lrb);
577
578	addc (&lrb, lr->buf[lr->idx - `1`]);
579
580	while (!isspace ((ch = lr_getc (lr))) && ch != `'"'` && ch != `';'`
581	&& ch != `'<'` && ch != `','` && ch != EOF)
582	{
583	if (ch == lr->escape_char)
584	{
585	ch = lr_getc (lr);
586	if (ch == `'\n'` \|\| ch == EOF)
587	{
588	lr_error (lr, _("invalid escape sequence"));
589	break;
590	}
591	}
592	addc (&lrb, ch);
593	}
594
595	lr_ungetc (lr, ch);
596
597	kw = lr->hash_fct (lrb.buf, lrb.act);
598
599	if (kw != NULL && kw->symname_or_ident == `0`)
600	{
601	lr->token.tok = kw->token;
602	free (lrb.buf);
603	}
604	else
605	{
606	lr->token.tok = tok_ident;
607	lr_buffer_to_token (&lrb, lr);
608	}
609
610	return &lr->token;
611	}
612
613	/ Process a decoded Unicode codepoint WCH in a string, placing the*
614	multibyte sequence into LRB. Return false if the character is not
615	found in CHARMAP/REPERTOIRE. /*
616	static bool
617	translate_unicode_codepoint (struct localedef_t *locale,
618	const struct charmap_t *charmap,
619	const struct repertoire_t *repertoire,
620	uint32_t wch, struct lr_buffer *lrb)
621	{
622	/ See whether the charmap contains the Uxxxxxxxx names. /
623	char utmp[`10`];
624	snprintf (utmp, sizeof (utmp), "U%08X", wch);
625	struct charseq *seq = charmap_find_value (charmap, utmp, `9`);
626
627	if (seq == NULL)
628	{
629	/ No, this isn't the case. Now determine from*
630	the repertoire the name of the character and
631	find it in the charmap. /*
632	if (repertoire != NULL)
633	{
634	const char *symbol = repertoire_find_symbol (repertoire, wch);
635	if (symbol != NULL)
636	seq = charmap_find_value (charmap, symbol, strlen (symbol));
637	}
638
639	if (seq == NULL)
640	{
641	#ifndef NO_TRANSLITERATION
642	/ Transliterate if possible. /
643	if (locale != NULL)
644	{
645	if ((locale->avail & CTYPE_LOCALE) == `0`)
646	{
647	/ Load the CTYPE data now. /
648	int old_needed = locale->needed;
649
650	locale->needed = `0`;
651	locale = load_locale (LC_CTYPE, locale->name,
652	locale->repertoire_name,
653	charmap, locale);
654	locale->needed = old_needed;
655	}
656
657	uint32_t *translit;
658	if ((locale->avail & CTYPE_LOCALE) != `0`
659	&& ((translit = find_translit (locale, charmap, wch))
660	!= NULL))
661	/ The CTYPE data contains a matching*
662	transliteration. /*
663	{
664	for (int i = `0`; translit[i] != `0`; ++i)
665	{
666	snprintf (utmp, sizeof (utmp), "U%08X", translit[i]);
667	seq = charmap_find_value (charmap, utmp, `9`);
668	assert (seq != NULL);
669	adds (lrb, seq->bytes, seq->nbytes);
670	}
671	return true;
672	}
673	}
674	#endif /* NO_TRANSLITERATION */
675
676	/ Not a known name. /
677	return false;
678	}
679	}
680
681	if (seq != NULL)
682	{
683	adds (lrb, seq->bytes, seq->nbytes);
684	return true;
685	}
686	else
687	return false;
688	}
689
/* Tell whether CH can be a UTF-8 continuation byte: it must not be
   negative (as EOF is) and its two top bits within the low byte must
   be binary 10.  */
static bool
utf8_valid_trailing (int ch)
{
  if (ch < 0)
    return false;

  return (ch & 0xc0) == 0x80;
}
697
/* Report a broken UTF-8 sequence on LR.  CH1 is the leading byte; CH2
   to CH4 are the following bytes, where the first negative one (for
   instance EOF) ends the part that is shown in the message.  Always
   returns false so that callers can return the result directly.  */
static bool
utf8_sequence_error (struct linereader *lr, uint8_t ch1, int ch2, int ch3,
                     int ch4)
{
  char buf[38];
  int shown;

  /* Number of bytes to display.  */
  if (ch2 < 0)
    shown = 1;
  else if (ch3 < 0)
    shown = 2;
  else if (ch4 < 0)
    shown = 3;
  else
    shown = 4;

  switch (shown)
    {
    case 1:
      snprintf (buf, sizeof (buf), "0x%02x", ch1);
      break;

    case 2:
      snprintf (buf, sizeof (buf), "0x%02x 0x%02x", ch1, ch2);
      break;

    case 3:
      snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x", ch1, ch2, ch3);
      break;

    default:
      snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x 0x%02x",
                ch1, ch2, ch3, ch4);
      break;
    }

  lr_error (lr, _("invalid UTF-8 sequence %s"), buf);
  return false;
}
719
/* Read the rest of a UTF-8 sequence with the leading byte CH1 from LR
   and store the decoded code point in *WCH.  Returns false and
   reports an error for malformed input: invalid leading or trailing
   bytes, overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and
   values beyond U+10FFFF.  *WCH is only written on success.  */
static bool
utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch)
{
  /* See RFC 3629 section 4 and __gconv_transform_utf8_internal.
     Bytes below 0xc2 are ASCII, continuation bytes or the start of an
     overlong two-byte sequence.  */
  if (ch1 < 0xc2)
    return utf8_sequence_error (lr, ch1, -1, -1, -1);

  int ch2 = lr_getc (lr);
  if (!utf8_valid_trailing (ch2))
    return utf8_sequence_error (lr, ch1, ch2, -1, -1);

  if (ch1 <= 0xdf)
    {
      /* Two-byte sequence.  */
      uint32_t result = ((ch1 & 0x1f) << 6) | (ch2 & 0x3f);
      if (result < 0x80)
        return utf8_sequence_error (lr, ch1, ch2, -1, -1);
      *wch = result;
      return true;
    }

  int ch3 = lr_getc (lr);
  if (!utf8_valid_trailing (ch3) || ch1 < 0xe0)
    return utf8_sequence_error (lr, ch1, ch2, ch3, -1);

  if (ch1 <= 0xef)
    {
      /* Three-byte sequence.  Reject overlong forms and the surrogate
         range, which RFC 3629 excludes from UTF-8.  */
      uint32_t result = (((ch1 & 0x0f) << 12)
                         | ((ch2 & 0x3f) << 6)
                         | (ch3 & 0x3f));
      if (result < 0x800 || (result >= 0xd800 && result <= 0xdfff))
        return utf8_sequence_error (lr, ch1, ch2, ch3, -1);
      *wch = result;
      return true;
    }

  int ch4 = lr_getc (lr);
  if (!utf8_valid_trailing (ch4) || ch1 < 0xf0 || ch1 > 0xf4)
    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);

  /* Four-byte sequence.  A leading 0xf4 can still encode values above
     U+10FFFF, so the upper bound is checked explicitly.  */
  uint32_t result = (((ch1 & 0x07) << 18)
                     | ((ch2 & 0x3f) << 12)
                     | ((ch3 & 0x3f) << 6)
                     | (ch4 & 0x3f));
  if (result < 0x10000 || result > 0x10ffff)
    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);
  *wch = result;
  return true;
}
771
772	static struct token *
773	get_string (struct linereader lr, const* struct charmap_t *charmap,
774	struct localedef_t locale, const* struct repertoire_t *repertoire,
775	int verbose)
776	{
777	int return_widestr = lr->return_widestr;
778	struct lr_buffer lrb;
779	wchar_t *buf2 = NULL;
780
781	lr_buffer_init (&lrb);
782
783	/ We know it'll be a string. /
784	lr->token.tok = tok_string;
785
786	/ If we need not translate the strings (i.e., expand <...> parts)*
787	we can run a simple loop. /*
788	if (!lr->translate_strings)
789	{
790	int ch;
791
792	buf2 = NULL;
793	while ((ch = lr_getc (lr)) != `'"'` && ch != `'\n'` && ch != EOF)
794	{
795	if (ch >= `0x80`)
796	lr_error (lr, _("illegal 8-bit character in untranslated string"));
797	addc (&lrb, ch);
798	}
799
800	/ Catch errors with trailing escape character. /
801	if (lrb.act > `0` && lrb.buf[lrb.act - `1`] == lr->escape_char
802	&& (lrb.act == `1` \|\| lrb.buf[lrb.act - `2`] != lr->escape_char))
803	{
804	lr_error (lr, _("illegal escape sequence at end of string"));
805	--lrb.act;
806	}
807	else if (ch == `'\n'` \|\| ch == EOF)
808	lr_error (lr, _("unterminated string"));
809
810	addc (&lrb, `'\0'`);
811	}
812	else
813	{
814	bool illegal_string = false;
815	size_t buf2act = `0`;
816	size_t buf2max = `56` * sizeof (uint32_t);
817	int ch;
818
819	/ We have to provide the wide character result as well. /
820	if (return_widestr)
821	buf2 = xmalloc (buf2max);
822
823	/ Read until the end of the string (or end of the line or file). /
824	while ((ch = lr_getc (lr)) != `'"'` && ch != `'\n'` && ch != EOF)
825	{
826	size_t startidx;
827	uint32_t wch;
828	struct charseq *seq;
829
830	if (ch != `'<'`)
831	{
832	/ The standards leave it up to the implementation to*
833	decide what to do with characters which stand for
834	themselves. This implementation treats the input
835	file as encoded in UTF-8. /*
836	if (ch == lr->escape_char)
837	{
838	ch = lr_getc (lr);
839	if (ch >= `0x80`)
840	{
841	lr_error (lr, _("illegal 8-bit escape sequence"));
842	illegal_string = true;
843	break;
844	}
845	if (ch == `'\n'` \|\| ch == EOF)
846	break;
847	addc (&lrb, ch);
848	wch = ch;
849	}
850	else if (ch < `0x80`)
851	{
852	wch = ch;
853	addc (&lrb, ch);
854	}
855	else / UTF-8 sequence. /
856	{
857	if (!utf8_decode (lr, ch, &wch))
858	{
859	illegal_string = true;
860	break;
861	}
862	if (!translate_unicode_codepoint (locale, charmap,
863	repertoire, wch, &lrb))
864	{
865	/ Ignore the rest of the string. Callers may*
866	skip this string because it cannot be encoded
867	in the output character set. /*
868	illegal_string = true;
869	continue;
870	}
871	}
872
873	if (return_widestr)
874	ADDWC (wch);
875
876	continue;
877	}
878
879	/ Now we have to search for the end of the symbolic name, i.e.,*
880	the closing '>'. /*
881	startidx = lrb.act;
882	while ((ch = lr_getc (lr)) != `'>'` && ch != `'\n'` && ch != EOF)
883	{
884	if (ch == lr->escape_char)
885	{
886	ch = lr_getc (lr);
887	if (ch == `'\n'` \|\| ch == EOF)
888	break;
889	}
890	addc (&lrb, ch);
891	}
892	if (ch == `'\n'` \|\| ch == EOF)
893	/ Not a correct string. /
894	break;
895	if (lrb.act == startidx)
896	{
897	/ <> is no correct name. Ignore it and also signal an*
898	error. /*
899	illegal_string = true;
900	continue;
901	}
902
903	/ It might be a Uxxxx symbol. /
904	if (lrb.buf[startidx] == `'U'`
905	&& (lrb.act - startidx == `5` \|\| lrb.act - startidx == `9`))
906	{
907	char *cp = lrb.buf + startidx + `1`;
908	while (cp < &lrb.buf[lrb.act] && isxdigit (*cp))
909	++cp;
910
911	if (cp == &lrb.buf[lrb.act])
912	{
913	/ Yes, it is. /
914	addc (&lrb, `'\0'`);
915	wch = strtoul (lrb.buf + startidx + `1`, NULL, `16`);
916
917	/ Now forget about the name we just added. /
918	lrb.act = startidx;
919
920	if (return_widestr)
921	ADDWC (wch);
922
923	if (!translate_unicode_codepoint (locale, charmap,
924	repertoire, wch, &lrb))
925	illegal_string = true;
926	continue;
927	}
928	}
929
930	/ We now have the symbolic name in lrb.buf[startidx] to*
931	lrb.buf[lrb.act-1]. Now find out the value for this character
932	in the charmap as well as in the repertoire map (in this
933	order). /*
934	seq = charmap_find_value (charmap, &lrb.buf[startidx],
935	lrb.act - startidx);
936
937	if (seq == NULL)
938	{
939	/ This name is not in the charmap. /
940	lr_error (lr, _("symbol `%.*s' not in charmap"),
941	(int) (lrb.act - startidx), &lrb.buf[startidx]);
942	illegal_string = true;
943	}
944
945	if (return_widestr)
946	{
947	/ Now the same for the multibyte representation. /
948	if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
949	wch = seq->ucs4;
950	else
951	{
952	wch = repertoire_find_value (repertoire, &lrb.buf[startidx],
953	lrb.act - startidx);
954	if (seq != NULL)
955	seq->ucs4 = wch;
956	}
957
958	if (wch == ILLEGAL_CHAR_VALUE)
959	{
960	/ This name is not in the repertoire map. /
961	lr_error (lr, _("symbol `%.*s' not in repertoire map"),
962	(int) (lrb.act - startidx), &lrb.buf[startidx]);
963	illegal_string = true;
964	}
965	else
966	ADDWC (wch);
967	}
968
969	/ Now forget about the name we just added. /
970	lrb.act = startidx;
971
972	/ And copy the bytes. /
973	if (seq != NULL)
974	adds (&lrb, seq->bytes, seq->nbytes);
975	}
976
977	if (ch == `'\n'` \|\| ch == EOF)
978	{
979	lr_error (lr, _("unterminated string"));
980	illegal_string = true;
981	}
982
983	if (illegal_string)
984	{
985	free (lrb.buf);
986	free (buf2);
987	lr->token.val.str.startmb = NULL;
988	lr->token.val.str.lenmb = `0`;
989	lr->token.val.str.startwc = NULL;
990	lr->token.val.str.lenwc = `0`;
991
992	return &lr->token;
993	}
994
995	addc (&lrb, `'\0'`);
996
997	if (return_widestr)
998	{
999	ADDWC (`0`);
1000	lr->token.val.str.startwc = xrealloc (buf2,
1001	buf2act * sizeof (uint32_t));
1002	lr->token.val.str.lenwc = buf2act;
1003	}
1004	}
1005
1006	lr_buffer_to_token (&lrb, lr);
1007
1008	return &lr->token;
1009	}
1010

Browse the source code of glibc/locale/programs/linereader.c