1 | /* Copyright (C) 1996-2023 Free Software Foundation, Inc. |
2 | This file is part of the GNU C Library. |
3 | |
4 | This program is free software; you can redistribute it and/or modify |
5 | it under the terms of the GNU General Public License as published |
6 | by the Free Software Foundation; version 2 of the License, or |
7 | (at your option) any later version. |
8 | |
9 | This program is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | GNU General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU General Public License |
15 | along with this program; if not, see <https://www.gnu.org/licenses/>. */ |
16 | |
17 | #ifdef HAVE_CONFIG_H |
18 | # include <config.h> |
19 | #endif |
20 | |
21 | #include <assert.h> |
22 | #include <ctype.h> |
23 | #include <errno.h> |
24 | #include <libintl.h> |
25 | #include <stdarg.h> |
26 | #include <stdlib.h> |
27 | #include <string.h> |
28 | #include <stdint.h> |
29 | |
30 | #include "localedef.h" |
31 | #include "charmap.h" |
32 | #include "error.h" |
33 | #include "linereader.h" |
34 | #include "locfile.h" |
35 | |
36 | /* Prototypes for local functions. */ |
37 | static struct token *get_toplvl_escape (struct linereader *lr); |
38 | static struct token *get_symname (struct linereader *lr); |
39 | static struct token *get_ident (struct linereader *lr); |
40 | static struct token *get_string (struct linereader *lr, |
41 | const struct charmap_t *charmap, |
42 | struct localedef_t *locale, |
43 | const struct repertoire_t *repertoire, |
44 | int verbose); |
45 | static bool utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch); |
46 | |
47 | |
48 | struct linereader * |
49 | lr_open (const char *fname, kw_hash_fct_t hf) |
50 | { |
51 | FILE *fp; |
52 | |
53 | if (fname == NULL || strcmp (fname, "-" ) == 0 |
54 | || strcmp (fname, "/dev/stdin" ) == 0) |
55 | return lr_create (stdin, "<stdin>" , hf); |
56 | else |
57 | { |
58 | fp = fopen (fname, "rm" ); |
59 | if (fp == NULL) |
60 | return NULL; |
61 | return lr_create (fp, fname, hf); |
62 | } |
63 | } |
64 | |
65 | struct linereader * |
66 | lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf) |
67 | { |
68 | struct linereader *result; |
69 | int n; |
70 | |
71 | result = (struct linereader *) xmalloc (sizeof (*result)); |
72 | |
73 | result->fp = fp; |
74 | result->fname = xstrdup (fname); |
75 | result->buf = NULL; |
76 | result->bufsize = 0; |
77 | result->lineno = 1; |
78 | result->idx = 0; |
79 | result->comment_char = '#'; |
80 | result->escape_char = '\\'; |
81 | result->translate_strings = 1; |
82 | result->return_widestr = 0; |
83 | |
84 | n = getdelim (&result->buf, &result->bufsize, '\n', result->fp); |
85 | if (n < 0) |
86 | { |
87 | int save = errno; |
88 | fclose (result->fp); |
89 | free ((char *) result->fname); |
90 | free (result); |
91 | errno = save; |
92 | return NULL; |
93 | } |
94 | |
95 | if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n') |
96 | n -= 2; |
97 | |
98 | result->buf[n] = '\0'; |
99 | result->bufact = n; |
100 | result->hash_fct = hf; |
101 | |
102 | return result; |
103 | } |
104 | |
105 | |
106 | int |
107 | lr_eof (struct linereader *lr) |
108 | { |
109 | return lr->bufact = 0; |
110 | } |
111 | |
112 | |
113 | void |
114 | lr_ignore_rest (struct linereader *lr, int verbose) |
115 | { |
116 | if (verbose) |
117 | { |
118 | while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n' |
119 | && lr->buf[lr->idx] != lr->comment_char) |
120 | if (lr->buf[lr->idx] == '\0') |
121 | { |
122 | if (lr_next (lr) < 0) |
123 | return; |
124 | } |
125 | else |
126 | ++lr->idx; |
127 | |
128 | if (lr->buf[lr->idx] != '\n' && ! feof (lr->fp) |
129 | && lr->buf[lr->idx] != lr->comment_char) |
130 | lr_error (lr, _("trailing garbage at end of line" )); |
131 | } |
132 | |
133 | /* Ignore continued line. */ |
134 | while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n') |
135 | if (lr_next (lr) < 0) |
136 | break; |
137 | |
138 | lr->idx = lr->bufact; |
139 | } |
140 | |
141 | |
142 | void |
143 | lr_close (struct linereader *lr) |
144 | { |
145 | fclose (lr->fp); |
146 | free (lr->buf); |
147 | free (lr); |
148 | } |
149 | |
150 | |
151 | int |
152 | lr_next (struct linereader *lr) |
153 | { |
154 | int n; |
155 | |
156 | n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp); |
157 | if (n < 0) |
158 | return -1; |
159 | |
160 | ++lr->lineno; |
161 | |
162 | if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n') |
163 | { |
164 | #if 0 |
165 | /* XXX Is this correct? */ |
166 | /* An escaped newline character is substituted with a single <SP>. */ |
167 | --n; |
168 | lr->buf[n - 1] = ' '; |
169 | #else |
170 | n -= 2; |
171 | #endif |
172 | } |
173 | |
174 | lr->buf[n] = '\0'; |
175 | lr->bufact = n; |
176 | lr->idx = 0; |
177 | |
178 | return 0; |
179 | } |
180 | |
181 | |
182 | /* Defined in error.c. */ |
183 | /* This variable is incremented each time `error' is called. */ |
184 | extern unsigned int error_message_count; |
185 | |
186 | /* The calling program should define program_name and set it to the |
187 | name of the executing program. */ |
188 | extern char *program_name; |
189 | |
190 | |
191 | struct token * |
192 | lr_token (struct linereader *lr, const struct charmap_t *charmap, |
193 | struct localedef_t *locale, const struct repertoire_t *repertoire, |
194 | int verbose) |
195 | { |
196 | int ch; |
197 | |
198 | while (1) |
199 | { |
200 | do |
201 | { |
202 | ch = lr_getc (lr); |
203 | |
204 | if (ch == EOF) |
205 | { |
206 | lr->token.tok = tok_eof; |
207 | return &lr->token; |
208 | }; |
209 | |
210 | if (ch == '\n') |
211 | { |
212 | lr->token.tok = tok_eol; |
213 | return &lr->token; |
214 | } |
215 | } |
216 | while (isspace (ch)); |
217 | |
218 | if (ch != lr->comment_char) |
219 | break; |
220 | |
221 | /* Is there an newline at the end of the buffer? */ |
222 | if (lr->buf[lr->bufact - 1] != '\n') |
223 | { |
224 | /* No. Some people want this to mean that only the line in |
225 | the file not the logical, concatenated line is ignored. |
226 | Let's try this. */ |
227 | lr->idx = lr->bufact; |
228 | continue; |
229 | } |
230 | |
231 | /* Ignore rest of line. */ |
232 | lr_ignore_rest (lr, 0); |
233 | lr->token.tok = tok_eol; |
234 | return &lr->token; |
235 | } |
236 | |
237 | /* Match escape sequences. */ |
238 | if (ch == lr->escape_char) |
239 | return get_toplvl_escape (lr); |
240 | |
241 | /* Match ellipsis. */ |
242 | if (ch == '.') |
243 | { |
244 | if (strncmp (&lr->buf[lr->idx], "...(2)...." , 10) == 0) |
245 | { |
246 | int cnt; |
247 | for (cnt = 0; cnt < 10; ++cnt) |
248 | lr_getc (lr); |
249 | lr->token.tok = tok_ellipsis4_2; |
250 | return &lr->token; |
251 | } |
252 | if (strncmp (&lr->buf[lr->idx], "..." , 3) == 0) |
253 | { |
254 | lr_getc (lr); |
255 | lr_getc (lr); |
256 | lr_getc (lr); |
257 | lr->token.tok = tok_ellipsis4; |
258 | return &lr->token; |
259 | } |
260 | if (strncmp (&lr->buf[lr->idx], ".." , 2) == 0) |
261 | { |
262 | lr_getc (lr); |
263 | lr_getc (lr); |
264 | lr->token.tok = tok_ellipsis3; |
265 | return &lr->token; |
266 | } |
267 | if (strncmp (&lr->buf[lr->idx], ".(2).." , 6) == 0) |
268 | { |
269 | int cnt; |
270 | for (cnt = 0; cnt < 6; ++cnt) |
271 | lr_getc (lr); |
272 | lr->token.tok = tok_ellipsis2_2; |
273 | return &lr->token; |
274 | } |
275 | if (lr->buf[lr->idx] == '.') |
276 | { |
277 | lr_getc (lr); |
278 | lr->token.tok = tok_ellipsis2; |
279 | return &lr->token; |
280 | } |
281 | } |
282 | |
283 | switch (ch) |
284 | { |
285 | case '<': |
286 | return get_symname (lr); |
287 | |
288 | case '0' ... '9': |
289 | lr->token.tok = tok_number; |
290 | lr->token.val.num = ch - '0'; |
291 | |
292 | while (isdigit (ch = lr_getc (lr))) |
293 | { |
294 | lr->token.val.num *= 10; |
295 | lr->token.val.num += ch - '0'; |
296 | } |
297 | if (isalpha (ch)) |
298 | lr_error (lr, _("garbage at end of number" )); |
299 | lr_ungetn (lr, 1); |
300 | |
301 | return &lr->token; |
302 | |
303 | case ';': |
304 | lr->token.tok = tok_semicolon; |
305 | return &lr->token; |
306 | |
307 | case ',': |
308 | lr->token.tok = tok_comma; |
309 | return &lr->token; |
310 | |
311 | case '(': |
312 | lr->token.tok = tok_open_brace; |
313 | return &lr->token; |
314 | |
315 | case ')': |
316 | lr->token.tok = tok_close_brace; |
317 | return &lr->token; |
318 | |
319 | case '"': |
320 | return get_string (lr, charmap, locale, repertoire, verbose); |
321 | |
322 | case '-': |
323 | ch = lr_getc (lr); |
324 | if (ch == '1') |
325 | { |
326 | lr->token.tok = tok_minus1; |
327 | return &lr->token; |
328 | } |
329 | lr_ungetn (lr, 2); |
330 | break; |
331 | |
332 | case 0x80 ... 0xff: /* UTF-8 sequence. */ |
333 | { |
334 | uint32_t wch; |
335 | if (!utf8_decode (lr, ch, &wch)) |
336 | { |
337 | lr->token.tok = tok_error; |
338 | return &lr->token; |
339 | } |
340 | lr->token.tok = tok_ucs4; |
341 | lr->token.val.ucs4 = wch; |
342 | return &lr->token; |
343 | } |
344 | } |
345 | |
346 | return get_ident (lr); |
347 | } |
348 | |
349 | |
350 | static struct token * |
351 | get_toplvl_escape (struct linereader *lr) |
352 | { |
353 | /* This is supposed to be a numeric value. We return the |
354 | numerical value and the number of bytes. */ |
355 | size_t start_idx = lr->idx - 1; |
356 | unsigned char *bytes = lr->token.val.charcode.bytes; |
357 | size_t nbytes = 0; |
358 | int ch; |
359 | |
360 | do |
361 | { |
362 | unsigned int byte = 0; |
363 | unsigned int base = 8; |
364 | |
365 | ch = lr_getc (lr); |
366 | |
367 | if (ch == 'd') |
368 | { |
369 | base = 10; |
370 | ch = lr_getc (lr); |
371 | } |
372 | else if (ch == 'x') |
373 | { |
374 | base = 16; |
375 | ch = lr_getc (lr); |
376 | } |
377 | |
378 | if ((base == 16 && !isxdigit (ch)) |
379 | || (base != 16 && (ch < '0' || ch >= (int) ('0' + base)))) |
380 | { |
381 | esc_error: |
382 | lr->token.val.str.startmb = &lr->buf[start_idx]; |
383 | |
384 | while (ch != EOF && !isspace (ch)) |
385 | ch = lr_getc (lr); |
386 | lr->token.val.str.lenmb = lr->idx - start_idx; |
387 | |
388 | lr->token.tok = tok_error; |
389 | return &lr->token; |
390 | } |
391 | |
392 | if (isdigit (ch)) |
393 | byte = ch - '0'; |
394 | else |
395 | byte = tolower (ch) - 'a' + 10; |
396 | |
397 | ch = lr_getc (lr); |
398 | if ((base == 16 && !isxdigit (ch)) |
399 | || (base != 16 && (ch < '0' || ch >= (int) ('0' + base)))) |
400 | goto esc_error; |
401 | |
402 | byte *= base; |
403 | if (isdigit (ch)) |
404 | byte += ch - '0'; |
405 | else |
406 | byte += tolower (ch) - 'a' + 10; |
407 | |
408 | ch = lr_getc (lr); |
409 | if (base != 16 && isdigit (ch)) |
410 | { |
411 | byte *= base; |
412 | byte += ch - '0'; |
413 | |
414 | ch = lr_getc (lr); |
415 | } |
416 | |
417 | bytes[nbytes++] = byte; |
418 | } |
419 | while (ch == lr->escape_char |
420 | && nbytes < (int) sizeof (lr->token.val.charcode.bytes)); |
421 | |
422 | if (!isspace (ch)) |
423 | lr_error (lr, _("garbage at end of character code specification" )); |
424 | |
425 | lr_ungetn (lr, 1); |
426 | |
427 | lr->token.tok = tok_charcode; |
428 | lr->token.val.charcode.nbytes = nbytes; |
429 | |
430 | return &lr->token; |
431 | } |
432 | |
/* Multibyte string buffer.  A growable byte array used while a token
   is being assembled.  */
struct lr_buffer
{
  size_t act;   /* Number of bytes currently stored in BUF.  */
  size_t max;   /* Number of bytes allocated for BUF.  */
  char *buf;    /* The storage itself, obtained from xmalloc/xrealloc.  */
};
440 | |
441 | /* Initialize *LRB with a default-sized buffer. */ |
442 | static void |
443 | lr_buffer_init (struct lr_buffer *lrb) |
444 | { |
445 | lrb->act = 0; |
446 | lrb->max = 56; |
447 | lrb->buf = xmalloc (lrb->max); |
448 | } |
449 | |
450 | /* Transfers the buffer string from *LRB to LR->token.mbstr. */ |
451 | static void |
452 | lr_buffer_to_token (struct lr_buffer *lrb, struct linereader *lr) |
453 | { |
454 | lr->token.val.str.startmb = xrealloc (lrb->buf, lrb->act + 1); |
455 | lr->token.val.str.startmb[lrb->act] = '\0'; |
456 | lr->token.val.str.lenmb = lrb->act; |
457 | } |
458 | |
459 | /* Adds CH to *LRB. */ |
460 | static void |
461 | addc (struct lr_buffer *lrb, char ch) |
462 | { |
463 | if (lrb->act == lrb->max) |
464 | { |
465 | lrb->max *= 2; |
466 | lrb->buf = xrealloc (lrb->buf, lrb->max); |
467 | } |
468 | lrb->buf[lrb->act++] = ch; |
469 | } |
470 | |
471 | /* Adds L bytes at S to *LRB. */ |
472 | static void |
473 | adds (struct lr_buffer *lrb, const unsigned char *s, size_t l) |
474 | { |
475 | if (lrb->max - lrb->act < l) |
476 | { |
477 | size_t required_size = lrb->act + l; |
478 | size_t new_max = 2 * lrb->max; |
479 | if (new_max < required_size) |
480 | new_max = required_size; |
481 | lrb->buf = xrealloc (lrb->buf, new_max); |
482 | lrb->max = new_max; |
483 | } |
484 | memcpy (lrb->buf + lrb->act, s, l); |
485 | lrb->act += l; |
486 | } |
487 | |
/* Appends the wide character CH to the array BUF2 of the enclosing
   scope.  The enclosing scope must also provide BUF2ACT, the number of
   elements stored so far, and BUF2MAX, the capacity counted in
   elements (not bytes).  When the array is full the capacity is
   doubled and BUF2 is reallocated to BUF2MAX * 4 bytes.  */
#define ADDWC(ch) \
  do \
    { \
      if (buf2act == buf2max) \
        { \
          buf2max *= 2; \
          buf2 = xrealloc (buf2, buf2max * 4); \
        } \
      buf2[buf2act++] = (ch); \
    } \
  while (0)
499 | |
500 | |
501 | static struct token * |
502 | get_symname (struct linereader *lr) |
503 | { |
504 | /* Symbol in brackets. We must distinguish three kinds: |
505 | 1. reserved words |
506 | 2. ISO 10646 position values |
507 | 3. all other. */ |
508 | const struct keyword_t *kw; |
509 | int ch; |
510 | struct lr_buffer lrb; |
511 | |
512 | lr_buffer_init (&lrb); |
513 | |
514 | do |
515 | { |
516 | ch = lr_getc (lr); |
517 | if (ch == lr->escape_char) |
518 | { |
519 | int c2 = lr_getc (lr); |
520 | addc (&lrb, c2); |
521 | |
522 | if (c2 == '\n') |
523 | ch = '\n'; |
524 | } |
525 | else |
526 | addc (&lrb, ch); |
527 | } |
528 | while (ch != '>' && ch != '\n'); |
529 | |
530 | if (ch == '\n') |
531 | lr_error (lr, _("unterminated symbolic name" )); |
532 | |
533 | /* Test for ISO 10646 position value. */ |
534 | if (lrb.buf[0] == 'U' && (lrb.act == 6 || lrb.act == 10)) |
535 | { |
536 | char *cp = lrb.buf + 1; |
537 | while (cp < &lrb.buf[lrb.act - 1] && isxdigit (*cp)) |
538 | ++cp; |
539 | |
540 | if (cp == &lrb.buf[lrb.act - 1]) |
541 | { |
542 | /* Yes, it is. */ |
543 | lr->token.tok = tok_ucs4; |
544 | lr->token.val.ucs4 = strtoul (lrb.buf + 1, NULL, 16); |
545 | |
546 | return &lr->token; |
547 | } |
548 | } |
549 | |
550 | /* It is a symbolic name. Test for reserved words. */ |
551 | kw = lr->hash_fct (lrb.buf, lrb.act - 1); |
552 | |
553 | if (kw != NULL && kw->symname_or_ident == 1) |
554 | { |
555 | lr->token.tok = kw->token; |
556 | free (lrb.buf); |
557 | } |
558 | else |
559 | { |
560 | lr->token.tok = tok_bsymbol; |
561 | lr_buffer_to_token (&lrb, lr); |
562 | --lr->token.val.str.lenmb; /* Hide the training '>'. */ |
563 | } |
564 | |
565 | return &lr->token; |
566 | } |
567 | |
568 | |
569 | static struct token * |
570 | get_ident (struct linereader *lr) |
571 | { |
572 | const struct keyword_t *kw; |
573 | int ch; |
574 | struct lr_buffer lrb; |
575 | |
576 | lr_buffer_init (&lrb); |
577 | |
578 | addc (&lrb, lr->buf[lr->idx - 1]); |
579 | |
580 | while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';' |
581 | && ch != '<' && ch != ',' && ch != EOF) |
582 | { |
583 | if (ch == lr->escape_char) |
584 | { |
585 | ch = lr_getc (lr); |
586 | if (ch == '\n' || ch == EOF) |
587 | { |
588 | lr_error (lr, _("invalid escape sequence" )); |
589 | break; |
590 | } |
591 | } |
592 | addc (&lrb, ch); |
593 | } |
594 | |
595 | lr_ungetc (lr, ch); |
596 | |
597 | kw = lr->hash_fct (lrb.buf, lrb.act); |
598 | |
599 | if (kw != NULL && kw->symname_or_ident == 0) |
600 | { |
601 | lr->token.tok = kw->token; |
602 | free (lrb.buf); |
603 | } |
604 | else |
605 | { |
606 | lr->token.tok = tok_ident; |
607 | lr_buffer_to_token (&lrb, lr); |
608 | } |
609 | |
610 | return &lr->token; |
611 | } |
612 | |
613 | /* Process a decoded Unicode codepoint WCH in a string, placing the |
614 | multibyte sequence into LRB. Return false if the character is not |
615 | found in CHARMAP/REPERTOIRE. */ |
616 | static bool |
617 | translate_unicode_codepoint (struct localedef_t *locale, |
618 | const struct charmap_t *charmap, |
619 | const struct repertoire_t *repertoire, |
620 | uint32_t wch, struct lr_buffer *lrb) |
621 | { |
622 | /* See whether the charmap contains the Uxxxxxxxx names. */ |
623 | char utmp[10]; |
624 | snprintf (utmp, sizeof (utmp), "U%08X" , wch); |
625 | struct charseq *seq = charmap_find_value (charmap, utmp, 9); |
626 | |
627 | if (seq == NULL) |
628 | { |
629 | /* No, this isn't the case. Now determine from |
630 | the repertoire the name of the character and |
631 | find it in the charmap. */ |
632 | if (repertoire != NULL) |
633 | { |
634 | const char *symbol = repertoire_find_symbol (repertoire, wch); |
635 | if (symbol != NULL) |
636 | seq = charmap_find_value (charmap, symbol, strlen (symbol)); |
637 | } |
638 | |
639 | if (seq == NULL) |
640 | { |
641 | #ifndef NO_TRANSLITERATION |
642 | /* Transliterate if possible. */ |
643 | if (locale != NULL) |
644 | { |
645 | if ((locale->avail & CTYPE_LOCALE) == 0) |
646 | { |
647 | /* Load the CTYPE data now. */ |
648 | int old_needed = locale->needed; |
649 | |
650 | locale->needed = 0; |
651 | locale = load_locale (LC_CTYPE, locale->name, |
652 | locale->repertoire_name, |
653 | charmap, locale); |
654 | locale->needed = old_needed; |
655 | } |
656 | |
657 | uint32_t *translit; |
658 | if ((locale->avail & CTYPE_LOCALE) != 0 |
659 | && ((translit = find_translit (locale, charmap, wch)) |
660 | != NULL)) |
661 | /* The CTYPE data contains a matching |
662 | transliteration. */ |
663 | { |
664 | for (int i = 0; translit[i] != 0; ++i) |
665 | { |
666 | snprintf (utmp, sizeof (utmp), "U%08X" , translit[i]); |
667 | seq = charmap_find_value (charmap, utmp, 9); |
668 | assert (seq != NULL); |
669 | adds (lrb, seq->bytes, seq->nbytes); |
670 | } |
671 | return true; |
672 | } |
673 | } |
674 | #endif /* NO_TRANSLITERATION */ |
675 | |
676 | /* Not a known name. */ |
677 | return false; |
678 | } |
679 | } |
680 | |
681 | if (seq != NULL) |
682 | { |
683 | adds (lrb, seq->bytes, seq->nbytes); |
684 | return true; |
685 | } |
686 | else |
687 | return false; |
688 | } |
689 | |
/* Tells whether CH can continue a UTF-8 sequence: it must not be
   negative (which excludes EOF) and bits 7 and 6 must be 1 and 0.  */
static bool
utf8_valid_trailing (int ch)
{
  if (ch < 0)
    return false;

  return ((ch >> 6) & 0x3) == 0x2;
}
697 | |
/* Report a broken UTF-8 sequence through lr_error, listing the bytes
   read so far in hexadecimal.  CH1 is always shown; CH2, CH3 and CH4
   are shown up to, but not including, the first negative one (EOF or
   "not read").  Always returns false so that callers can return the
   result directly.  */
static bool
utf8_sequence_error (struct linereader *lr, uint8_t ch1, int ch2, int ch3,
                     int ch4)
{
  char buf[38];
  int shown = 1;

  /* Count the leading run of bytes that are actually present.  */
  if (ch2 >= 0)
    {
      shown = 2;
      if (ch3 >= 0)
        shown = ch4 >= 0 ? 4 : 3;
    }

  switch (shown)
    {
    case 1:
      snprintf (buf, sizeof (buf), "0x%02x", ch1);
      break;
    case 2:
      snprintf (buf, sizeof (buf), "0x%02x 0x%02x", ch1, ch2);
      break;
    case 3:
      snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x", ch1, ch2, ch3);
      break;
    default:
      snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x 0x%02x",
                ch1, ch2, ch3, ch4);
      break;
    }

  lr_error (lr, _("invalid UTF-8 sequence %s"), buf);
  return false;
}
719 | |
/* Reads a UTF-8 sequence from LR, with the leading byte CH1 already
   consumed, and stores the decoded codepoint in *WCH.  Returns false
   on failure and reports an error; *WCH is not written in that case.
   Rejected are stray trailing bytes, invalid leading bytes, missing or
   malformed trailing bytes, overlong encodings, UTF-16 surrogates and
   codepoints above U+10FFFF.  */
static bool
utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch)
{
  /* See RFC 3629 section 4 and __gconv_transform_utf8_internal.  */

  /* 0x80..0xbf are trailing bytes; 0xc0 and 0xc1 could only start
     overlong two-byte sequences.  */
  if (ch1 < 0xc2)
    return utf8_sequence_error (lr, ch1, -1, -1, -1);

  int ch2 = lr_getc (lr);
  if (!utf8_valid_trailing (ch2))
    return utf8_sequence_error (lr, ch1, ch2, -1, -1);

  if (ch1 <= 0xdf)
    {
      /* Two-byte sequence.  */
      uint32_t result = ((ch1 & 0x1f) << 6) | (ch2 & 0x3f);
      if (result < 0x80)
        return utf8_sequence_error (lr, ch1, ch2, -1, -1);
      *wch = result;
      return true;
    }

  int ch3 = lr_getc (lr);
  if (!utf8_valid_trailing (ch3))
    return utf8_sequence_error (lr, ch1, ch2, ch3, -1);

  if (ch1 <= 0xef)
    {
      /* Three-byte sequence.  */
      uint32_t result = (((ch1 & 0x0f) << 12)
                         | ((ch2 & 0x3f) << 6)
                         | (ch3 & 0x3f));
      /* Reject overlong encodings and the surrogate range, which must
         not appear in UTF-8.  */
      if (result < 0x800 || (result >= 0xd800 && result <= 0xdfff))
        return utf8_sequence_error (lr, ch1, ch2, ch3, -1);
      *wch = result;
      return true;
    }

  /* Four-byte sequence; leading bytes above 0xf4 are never valid.  */
  int ch4 = lr_getc (lr);
  if (!utf8_valid_trailing (ch4) || ch1 > 0xf4)
    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);

  uint32_t result = (((ch1 & 0x07) << 18)
                     | ((ch2 & 0x3f) << 12)
                     | ((ch3 & 0x3f) << 6)
                     | (ch4 & 0x3f));
  /* Reject overlong encodings and values beyond the Unicode range (a
     leading byte of 0xf4 can still encode up to 0x13ffff).  */
  if (result < 0x10000 || result > 0x10ffff)
    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);
  *wch = result;
  return true;
}
771 | |
772 | static struct token * |
773 | get_string (struct linereader *lr, const struct charmap_t *charmap, |
774 | struct localedef_t *locale, const struct repertoire_t *repertoire, |
775 | int verbose) |
776 | { |
777 | int return_widestr = lr->return_widestr; |
778 | struct lr_buffer lrb; |
779 | wchar_t *buf2 = NULL; |
780 | |
781 | lr_buffer_init (&lrb); |
782 | |
783 | /* We know it'll be a string. */ |
784 | lr->token.tok = tok_string; |
785 | |
786 | /* If we need not translate the strings (i.e., expand <...> parts) |
787 | we can run a simple loop. */ |
788 | if (!lr->translate_strings) |
789 | { |
790 | int ch; |
791 | |
792 | buf2 = NULL; |
793 | while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) |
794 | { |
795 | if (ch >= 0x80) |
796 | lr_error (lr, _("illegal 8-bit character in untranslated string" )); |
797 | addc (&lrb, ch); |
798 | } |
799 | |
800 | /* Catch errors with trailing escape character. */ |
801 | if (lrb.act > 0 && lrb.buf[lrb.act - 1] == lr->escape_char |
802 | && (lrb.act == 1 || lrb.buf[lrb.act - 2] != lr->escape_char)) |
803 | { |
804 | lr_error (lr, _("illegal escape sequence at end of string" )); |
805 | --lrb.act; |
806 | } |
807 | else if (ch == '\n' || ch == EOF) |
808 | lr_error (lr, _("unterminated string" )); |
809 | |
810 | addc (&lrb, '\0'); |
811 | } |
812 | else |
813 | { |
814 | bool illegal_string = false; |
815 | size_t buf2act = 0; |
816 | size_t buf2max = 56 * sizeof (uint32_t); |
817 | int ch; |
818 | |
819 | /* We have to provide the wide character result as well. */ |
820 | if (return_widestr) |
821 | buf2 = xmalloc (buf2max); |
822 | |
823 | /* Read until the end of the string (or end of the line or file). */ |
824 | while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) |
825 | { |
826 | size_t startidx; |
827 | uint32_t wch; |
828 | struct charseq *seq; |
829 | |
830 | if (ch != '<') |
831 | { |
832 | /* The standards leave it up to the implementation to |
833 | decide what to do with characters which stand for |
834 | themselves. This implementation treats the input |
835 | file as encoded in UTF-8. */ |
836 | if (ch == lr->escape_char) |
837 | { |
838 | ch = lr_getc (lr); |
839 | if (ch >= 0x80) |
840 | { |
841 | lr_error (lr, _("illegal 8-bit escape sequence" )); |
842 | illegal_string = true; |
843 | break; |
844 | } |
845 | if (ch == '\n' || ch == EOF) |
846 | break; |
847 | addc (&lrb, ch); |
848 | wch = ch; |
849 | } |
850 | else if (ch < 0x80) |
851 | { |
852 | wch = ch; |
853 | addc (&lrb, ch); |
854 | } |
855 | else /* UTF-8 sequence. */ |
856 | { |
857 | if (!utf8_decode (lr, ch, &wch)) |
858 | { |
859 | illegal_string = true; |
860 | break; |
861 | } |
862 | if (!translate_unicode_codepoint (locale, charmap, |
863 | repertoire, wch, &lrb)) |
864 | { |
865 | /* Ignore the rest of the string. Callers may |
866 | skip this string because it cannot be encoded |
867 | in the output character set. */ |
868 | illegal_string = true; |
869 | continue; |
870 | } |
871 | } |
872 | |
873 | if (return_widestr) |
874 | ADDWC (wch); |
875 | |
876 | continue; |
877 | } |
878 | |
879 | /* Now we have to search for the end of the symbolic name, i.e., |
880 | the closing '>'. */ |
881 | startidx = lrb.act; |
882 | while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF) |
883 | { |
884 | if (ch == lr->escape_char) |
885 | { |
886 | ch = lr_getc (lr); |
887 | if (ch == '\n' || ch == EOF) |
888 | break; |
889 | } |
890 | addc (&lrb, ch); |
891 | } |
892 | if (ch == '\n' || ch == EOF) |
893 | /* Not a correct string. */ |
894 | break; |
895 | if (lrb.act == startidx) |
896 | { |
897 | /* <> is no correct name. Ignore it and also signal an |
898 | error. */ |
899 | illegal_string = true; |
900 | continue; |
901 | } |
902 | |
903 | /* It might be a Uxxxx symbol. */ |
904 | if (lrb.buf[startidx] == 'U' |
905 | && (lrb.act - startidx == 5 || lrb.act - startidx == 9)) |
906 | { |
907 | char *cp = lrb.buf + startidx + 1; |
908 | while (cp < &lrb.buf[lrb.act] && isxdigit (*cp)) |
909 | ++cp; |
910 | |
911 | if (cp == &lrb.buf[lrb.act]) |
912 | { |
913 | /* Yes, it is. */ |
914 | addc (&lrb, '\0'); |
915 | wch = strtoul (lrb.buf + startidx + 1, NULL, 16); |
916 | |
917 | /* Now forget about the name we just added. */ |
918 | lrb.act = startidx; |
919 | |
920 | if (return_widestr) |
921 | ADDWC (wch); |
922 | |
923 | if (!translate_unicode_codepoint (locale, charmap, |
924 | repertoire, wch, &lrb)) |
925 | illegal_string = true; |
926 | continue; |
927 | } |
928 | } |
929 | |
930 | /* We now have the symbolic name in lrb.buf[startidx] to |
931 | lrb.buf[lrb.act-1]. Now find out the value for this character |
932 | in the charmap as well as in the repertoire map (in this |
933 | order). */ |
934 | seq = charmap_find_value (charmap, &lrb.buf[startidx], |
935 | lrb.act - startidx); |
936 | |
937 | if (seq == NULL) |
938 | { |
939 | /* This name is not in the charmap. */ |
940 | lr_error (lr, _("symbol `%.*s' not in charmap" ), |
941 | (int) (lrb.act - startidx), &lrb.buf[startidx]); |
942 | illegal_string = true; |
943 | } |
944 | |
945 | if (return_widestr) |
946 | { |
947 | /* Now the same for the multibyte representation. */ |
948 | if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE) |
949 | wch = seq->ucs4; |
950 | else |
951 | { |
952 | wch = repertoire_find_value (repertoire, &lrb.buf[startidx], |
953 | lrb.act - startidx); |
954 | if (seq != NULL) |
955 | seq->ucs4 = wch; |
956 | } |
957 | |
958 | if (wch == ILLEGAL_CHAR_VALUE) |
959 | { |
960 | /* This name is not in the repertoire map. */ |
961 | lr_error (lr, _("symbol `%.*s' not in repertoire map" ), |
962 | (int) (lrb.act - startidx), &lrb.buf[startidx]); |
963 | illegal_string = true; |
964 | } |
965 | else |
966 | ADDWC (wch); |
967 | } |
968 | |
969 | /* Now forget about the name we just added. */ |
970 | lrb.act = startidx; |
971 | |
972 | /* And copy the bytes. */ |
973 | if (seq != NULL) |
974 | adds (&lrb, seq->bytes, seq->nbytes); |
975 | } |
976 | |
977 | if (ch == '\n' || ch == EOF) |
978 | { |
979 | lr_error (lr, _("unterminated string" )); |
980 | illegal_string = true; |
981 | } |
982 | |
983 | if (illegal_string) |
984 | { |
985 | free (lrb.buf); |
986 | free (buf2); |
987 | lr->token.val.str.startmb = NULL; |
988 | lr->token.val.str.lenmb = 0; |
989 | lr->token.val.str.startwc = NULL; |
990 | lr->token.val.str.lenwc = 0; |
991 | |
992 | return &lr->token; |
993 | } |
994 | |
995 | addc (&lrb, '\0'); |
996 | |
997 | if (return_widestr) |
998 | { |
999 | ADDWC (0); |
1000 | lr->token.val.str.startwc = xrealloc (buf2, |
1001 | buf2act * sizeof (uint32_t)); |
1002 | lr->token.val.str.lenwc = buf2act; |
1003 | } |
1004 | } |
1005 | |
1006 | lr_buffer_to_token (&lrb, lr); |
1007 | |
1008 | return &lr->token; |
1009 | } |
1010 | |